/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "compile/Pseudolocalizer.h"
#include "util/Util.h"

#include <cctype>
#include <string>

namespace aapt {

// Word list used as filler when a pseudolocalized string is lengthened.
// NOTE: the spelling "fiveteen" and the absence of "eighteen" are kept as they
// are; this text is emitted verbatim, so editing it would change the output.
static const std::string k_expansion_string = "one two three "
        "four five six seven eight nine ten eleven twelve thirteen "
        "fourteen fiveteen sixteen seventeen nineteen twenty";

// Unicode directional formatting characters used to override the direction of
// words: U+200F (right-to-left mark), U+202E (right-to-left override) and
// U+202C (pop directional formatting).
static const std::string k_rlm = "\u200f";
static const std::string k_rlo = "\u202e";
static const std::string k_pdf = "\u202c";

// Marks placed before and after a placeholder in accented output:
// U+00BB opens the placeholder and U+00AB closes it.
static const std::string k_placeholder_open = "\u00bb";
static const std::string k_placeholder_close = "\u00ab";

// Characters that open and close a brace-delimited argument in a message.
static const char k_arg_start = '{';
static const char k_arg_end = '}';

39class PseudoMethodNone : public PseudoMethodImpl {
40public:
Adam Lesinskid0f116b2016-07-08 15:00:32 -070041 std::string text(const StringPiece& text) override { return text.toString(); }
42 std::string placeholder(const StringPiece& text) override { return text.toString(); }
Adam Lesinski393b5f02015-12-17 13:03:11 -080043};
44
45class PseudoMethodBidi : public PseudoMethodImpl {
46public:
Adam Lesinskid0f116b2016-07-08 15:00:32 -070047 std::string text(const StringPiece& text) override;
48 std::string placeholder(const StringPiece& text) override;
Adam Lesinski393b5f02015-12-17 13:03:11 -080049};
50
51class PseudoMethodAccent : public PseudoMethodImpl {
52public:
53 PseudoMethodAccent() : mDepth(0), mWordCount(0), mLength(0) {}
Adam Lesinskid0f116b2016-07-08 15:00:32 -070054 std::string start() override;
55 std::string end() override;
56 std::string text(const StringPiece& text) override;
57 std::string placeholder(const StringPiece& text) override;
Adam Lesinski393b5f02015-12-17 13:03:11 -080058private:
59 size_t mDepth;
60 size_t mWordCount;
61 size_t mLength;
62};
63
64Pseudolocalizer::Pseudolocalizer(Method method) : mLastDepth(0) {
65 setMethod(method);
66}
67
68void Pseudolocalizer::setMethod(Method method) {
69 switch (method) {
70 case Method::kNone:
71 mImpl = util::make_unique<PseudoMethodNone>();
72 break;
73 case Method::kAccent:
74 mImpl = util::make_unique<PseudoMethodAccent>();
75 break;
76 case Method::kBidi:
77 mImpl = util::make_unique<PseudoMethodBidi>();
78 break;
79 }
80}
81
Adam Lesinskid0f116b2016-07-08 15:00:32 -070082std::string Pseudolocalizer::text(const StringPiece& text) {
83 std::string out;
Adam Lesinski393b5f02015-12-17 13:03:11 -080084 size_t depth = mLastDepth;
85 size_t lastpos, pos;
86 const size_t length = text.size();
Adam Lesinskid0f116b2016-07-08 15:00:32 -070087 const char* str = text.data();
Adam Lesinski393b5f02015-12-17 13:03:11 -080088 bool escaped = false;
89 for (lastpos = pos = 0; pos < length; pos++) {
90 char16_t c = str[pos];
91 if (escaped) {
92 escaped = false;
93 continue;
94 }
95 if (c == '\'') {
96 escaped = true;
97 continue;
98 }
99
100 if (c == k_arg_start) {
101 depth++;
102 } else if (c == k_arg_end && depth) {
103 depth--;
104 }
105
106 if (mLastDepth != depth || pos == length - 1) {
107 bool pseudo = ((mLastDepth % 2) == 0);
108 size_t nextpos = pos;
109 if (!pseudo || depth == mLastDepth) {
110 nextpos++;
111 }
112 size_t size = nextpos - lastpos;
113 if (size) {
Adam Lesinskid0f116b2016-07-08 15:00:32 -0700114 std::string chunk = text.substr(lastpos, size).toString();
Adam Lesinski393b5f02015-12-17 13:03:11 -0800115 if (pseudo) {
116 chunk = mImpl->text(chunk);
117 } else if (str[lastpos] == k_arg_start && str[nextpos - 1] == k_arg_end) {
118 chunk = mImpl->placeholder(chunk);
119 }
120 out.append(chunk);
121 }
122 if (pseudo && depth < mLastDepth) { // End of message
123 out.append(mImpl->end());
124 } else if (!pseudo && depth > mLastDepth) { // Start of message
125 out.append(mImpl->start());
126 }
127 lastpos = nextpos;
128 mLastDepth = depth;
129 }
130 }
131 return out;
132}
133
/**
 * Returns the replacement string for a character, or nullptr when the character
 * has no replacement and should be emitted unchanged.
 *
 * Replacements exist for 'a'-'z', for 'A'-'Z' except 'F', and for '!', '?'
 * and '$'.
 */
static const char* pseudolocalizeChar(const char c) {
    switch (c) {
        case 'a': return "\u00e5";
        case 'b': return "\u0253";
        case 'c': return "\u00e7";
        case 'd': return "\u00f0";
        case 'e': return "\u00e9";
        case 'f': return "\u0192";
        case 'g': return "\u011d";
        case 'h': return "\u0125";
        case 'i': return "\u00ee";
        case 'j': return "\u0135";
        case 'k': return "\u0137";
        case 'l': return "\u013c";
        case 'm': return "\u1e3f";
        case 'n': return "\u00f1";
        case 'o': return "\u00f6";
        case 'p': return "\u00fe";
        case 'q': return "\u0051";
        case 'r': return "\u0155";
        case 's': return "\u0161";
        case 't': return "\u0163";
        case 'u': return "\u00fb";
        case 'v': return "\u0056";
        case 'w': return "\u0175";
        case 'x': return "\u0445";
        case 'y': return "\u00fd";
        case 'z': return "\u017e";
        case 'A': return "\u00c5";
        case 'B': return "\u03b2";
        case 'C': return "\u00c7";
        case 'D': return "\u00d0";
        case 'E': return "\u00c9";
        case 'G': return "\u011c";
        case 'H': return "\u0124";
        case 'I': return "\u00ce";
        case 'J': return "\u0134";
        case 'K': return "\u0136";
        case 'L': return "\u013b";
        case 'M': return "\u1e3e";
        case 'N': return "\u00d1";
        case 'O': return "\u00d6";
        case 'P': return "\u00de";
        case 'Q': return "\u0071";
        case 'R': return "\u0154";
        case 'S': return "\u0160";
        case 'T': return "\u0162";
        case 'U': return "\u00db";
        case 'V': return "\u03bd";
        case 'W': return "\u0174";
        case 'X': return "\u00d7";
        case 'Y': return "\u00dd";
        case 'Z': return "\u017d";
        case '!': return "\u00a1";
        case '?': return "\u00bf";
        case '$': return "\u20ac";
        default: return nullptr;
    }
}

/**
 * Returns true when the character is one of those that end a '%' placeholder
 * in PseudoMethodAccent::text(): one of "sScCdoxXfeEgGaAbBhH", '%' or 'n'.
 */
static bool isPossibleNormalPlaceholderEnd(const char c) {
    switch (c) {
        case 's': case 'S':
        case 'c': case 'C':
        case 'd':
        case 'o':
        case 'x': case 'X':
        case 'f':
        case 'e': case 'E':
        case 'g': case 'G':
        case 'a': case 'A':
        case 'b': case 'B':
        case 'h': case 'H':
        case '%':
        case 'n':
            return true;
        default:
            return false;
    }
}

Adam Lesinskid0f116b2016-07-08 15:00:32 -0700221static std::string pseudoGenerateExpansion(const unsigned int length) {
222 std::string result = k_expansion_string;
223 const char* s = result.data();
Adam Lesinski393b5f02015-12-17 13:03:11 -0800224 if (result.size() < length) {
Adam Lesinskid0f116b2016-07-08 15:00:32 -0700225 result += " ";
Adam Lesinski393b5f02015-12-17 13:03:11 -0800226 result += pseudoGenerateExpansion(length - result.size());
227 } else {
228 int ext = 0;
229 // Should contain only whole words, so looking for a space
230 for (unsigned int i = length + 1; i < result.size(); ++i) {
231 ++ext;
232 if (s[i] == ' ') {
233 break;
234 }
235 }
236 result = result.substr(0, length + ext);
237 }
238 return result;
239}
240
Adam Lesinskid0f116b2016-07-08 15:00:32 -0700241std::string PseudoMethodAccent::start() {
242 std::string result;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800243 if (mDepth == 0) {
Adam Lesinskid0f116b2016-07-08 15:00:32 -0700244 result = "[";
Adam Lesinski393b5f02015-12-17 13:03:11 -0800245 }
246 mWordCount = mLength = 0;
247 mDepth++;
248 return result;
249}
250
Adam Lesinskid0f116b2016-07-08 15:00:32 -0700251std::string PseudoMethodAccent::end() {
252 std::string result;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800253 if (mLength) {
Adam Lesinskid0f116b2016-07-08 15:00:32 -0700254 result += " ";
Adam Lesinski393b5f02015-12-17 13:03:11 -0800255 result += pseudoGenerateExpansion(mWordCount > 3 ? mLength : mLength / 2);
256 }
257 mWordCount = mLength = 0;
258 mDepth--;
259 if (mDepth == 0) {
Adam Lesinskid0f116b2016-07-08 15:00:32 -0700260 result += "]";
Adam Lesinski393b5f02015-12-17 13:03:11 -0800261 }
262 return result;
263}
264
265/**
266 * Converts characters so they look like they've been localized.
267 *
268 * Note: This leaves placeholder syntax untouched.
269 */
Adam Lesinskid0f116b2016-07-08 15:00:32 -0700270std::string PseudoMethodAccent::text(const StringPiece& source)
Adam Lesinski393b5f02015-12-17 13:03:11 -0800271{
Adam Lesinskid0f116b2016-07-08 15:00:32 -0700272 const char* s = source.data();
273 std::string result;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800274 const size_t I = source.size();
275 bool lastspace = true;
276 for (size_t i = 0; i < I; i++) {
Adam Lesinskid0f116b2016-07-08 15:00:32 -0700277 char c = s[i];
Adam Lesinski393b5f02015-12-17 13:03:11 -0800278 if (c == '%') {
279 // Placeholder syntax, no need to pseudolocalize
Adam Lesinskid0f116b2016-07-08 15:00:32 -0700280 std::string chunk;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800281 bool end = false;
282 chunk.append(&c, 1);
Adam Lesinskib2106682016-06-23 13:03:15 -0700283 while (!end && i + 1 < I) {
Adam Lesinski393b5f02015-12-17 13:03:11 -0800284 ++i;
285 c = s[i];
286 chunk.append(&c, 1);
287 if (isPossibleNormalPlaceholderEnd(c)) {
288 end = true;
Adam Lesinskib2106682016-06-23 13:03:15 -0700289 } else if (i + 1 < I && c == 't') {
Adam Lesinski393b5f02015-12-17 13:03:11 -0800290 ++i;
291 c = s[i];
292 chunk.append(&c, 1);
293 end = true;
294 }
295 }
296 // Treat chunk as a placeholder unless it ends with %.
297 result += ((c == '%') ? chunk : placeholder(chunk));
298 } else if (c == '<' || c == '&') {
299 // html syntax, no need to pseudolocalize
300 bool tag_closed = false;
301 while (!tag_closed && i < I) {
302 if (c == '&') {
Adam Lesinskid0f116b2016-07-08 15:00:32 -0700303 std::string escapeText;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800304 escapeText.append(&c, 1);
305 bool end = false;
306 size_t htmlCodePos = i;
307 while (!end && htmlCodePos < I) {
308 ++htmlCodePos;
309 c = s[htmlCodePos];
310 escapeText.append(&c, 1);
311 // Valid html code
312 if (c == ';') {
313 end = true;
314 i = htmlCodePos;
315 }
316 // Wrong html code
317 else if (!((c == '#' ||
318 (c >= 'a' && c <= 'z') ||
319 (c >= 'A' && c <= 'Z') ||
320 (c >= '0' && c <= '9')))) {
321 end = true;
322 }
323 }
324 result += escapeText;
Adam Lesinskid0f116b2016-07-08 15:00:32 -0700325 if (escapeText != "&lt;") {
Adam Lesinski393b5f02015-12-17 13:03:11 -0800326 tag_closed = true;
327 }
328 continue;
329 }
330 if (c == '>') {
331 tag_closed = true;
332 result.append(&c, 1);
333 continue;
334 }
335 result.append(&c, 1);
336 i++;
337 c = s[i];
338 }
339 } else {
340 // This is a pure text that should be pseudolocalized
Adam Lesinskid0f116b2016-07-08 15:00:32 -0700341 const char* p = pseudolocalizeChar(c);
Adam Lesinski393b5f02015-12-17 13:03:11 -0800342 if (p != nullptr) {
343 result += p;
344 } else {
Adam Lesinskid0f116b2016-07-08 15:00:32 -0700345 bool space = isspace(c);
Adam Lesinski393b5f02015-12-17 13:03:11 -0800346 if (lastspace && !space) {
347 mWordCount++;
348 }
349 lastspace = space;
350 result.append(&c, 1);
351 }
352 // Count only pseudolocalizable chars and delimiters
353 mLength++;
354 }
355 }
356 return result;
357}
358
Adam Lesinskid0f116b2016-07-08 15:00:32 -0700359std::string PseudoMethodAccent::placeholder(const StringPiece& source) {
Adam Lesinski393b5f02015-12-17 13:03:11 -0800360 // Surround a placeholder with brackets
361 return k_placeholder_open + source.toString() + k_placeholder_close;
362}
363
Adam Lesinskid0f116b2016-07-08 15:00:32 -0700364std::string PseudoMethodBidi::text(const StringPiece& source) {
365 const char* s = source.data();
366 std::string result;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800367 bool lastspace = true;
368 bool space = true;
369 for (size_t i = 0; i < source.size(); i++) {
Adam Lesinskid0f116b2016-07-08 15:00:32 -0700370 char c = s[i];
371 space = isspace(c);
Adam Lesinski393b5f02015-12-17 13:03:11 -0800372 if (lastspace && !space) {
373 // Word start
374 result += k_rlm + k_rlo;
375 } else if (!lastspace && space) {
376 // Word end
377 result += k_pdf + k_rlm;
378 }
379 lastspace = space;
380 result.append(&c, 1);
381 }
382 if (!lastspace) {
383 // End of last word
384 result += k_pdf + k_rlm;
385 }
386 return result;
387}
388
Adam Lesinskid0f116b2016-07-08 15:00:32 -0700389std::string PseudoMethodBidi::placeholder(const StringPiece& source) {
Adam Lesinski393b5f02015-12-17 13:03:11 -0800390 // Surround a placeholder with directionality change sequence
391 return k_rlm + k_rlo + source.toString() + k_pdf + k_rlm;
392}
393
} // namespace aapt