blob: c3aec98d89e0abda42447f21fdbb1d3cff41f236 [file] [log] [blame]
Adam Lesinski393b5f02015-12-17 13:03:11 -08001/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "compile/Pseudolocalizer.h"
18#include "util/Util.h"
19
20namespace aapt {
21
namespace {

// Filler words appended after accented text to simulate the growth of a
// translated string.
// NOTE(review): the "fiveteen" spelling and the absence of "eighteen" are
// kept on purpose; this literal is copied verbatim into generated output.
const std::string k_expansion_string =
    "one two three four five six seven eight nine ten "
    "eleven twelve thirteen fourteen fiveteen sixteen "
    "seventeen nineteen twenty";

// Unicode directionality controls used by the bidi method:
// U+200F RIGHT-TO-LEFT MARK, U+202E RIGHT-TO-LEFT OVERRIDE and
// U+202C POP DIRECTIONAL FORMATTING.
const std::string k_rlm = "\u200f";
const std::string k_rlo = "\u202e";
const std::string k_pdf = "\u202c";

// Marks placed around a placeholder by the accent method.
const std::string k_placeholder_open = "\u00bb";
const std::string k_placeholder_close = "\u00ab";

// Delimiters that open and close a message argument.
constexpr char k_arg_start = '{';
constexpr char k_arg_end = '}';

}  // namespace
Adam Lesinski393b5f02015-12-17 13:03:11 -080039
40class PseudoMethodNone : public PseudoMethodImpl {
Adam Lesinskicacb28f2016-10-19 12:18:14 -070041 public:
42 std::string text(const StringPiece& text) override { return text.toString(); }
43 std::string placeholder(const StringPiece& text) override {
44 return text.toString();
45 }
Adam Lesinski393b5f02015-12-17 13:03:11 -080046};
47
48class PseudoMethodBidi : public PseudoMethodImpl {
Adam Lesinskicacb28f2016-10-19 12:18:14 -070049 public:
50 std::string text(const StringPiece& text) override;
51 std::string placeholder(const StringPiece& text) override;
Adam Lesinski393b5f02015-12-17 13:03:11 -080052};
53
54class PseudoMethodAccent : public PseudoMethodImpl {
Adam Lesinskicacb28f2016-10-19 12:18:14 -070055 public:
56 PseudoMethodAccent() : mDepth(0), mWordCount(0), mLength(0) {}
57 std::string start() override;
58 std::string end() override;
59 std::string text(const StringPiece& text) override;
60 std::string placeholder(const StringPiece& text) override;
61
62 private:
63 size_t mDepth;
64 size_t mWordCount;
65 size_t mLength;
Adam Lesinski393b5f02015-12-17 13:03:11 -080066};
67
68Pseudolocalizer::Pseudolocalizer(Method method) : mLastDepth(0) {
Adam Lesinskicacb28f2016-10-19 12:18:14 -070069 setMethod(method);
Adam Lesinski393b5f02015-12-17 13:03:11 -080070}
71
72void Pseudolocalizer::setMethod(Method method) {
Adam Lesinskicacb28f2016-10-19 12:18:14 -070073 switch (method) {
Adam Lesinski393b5f02015-12-17 13:03:11 -080074 case Method::kNone:
Adam Lesinskicacb28f2016-10-19 12:18:14 -070075 mImpl = util::make_unique<PseudoMethodNone>();
76 break;
Adam Lesinski393b5f02015-12-17 13:03:11 -080077 case Method::kAccent:
Adam Lesinskicacb28f2016-10-19 12:18:14 -070078 mImpl = util::make_unique<PseudoMethodAccent>();
79 break;
Adam Lesinski393b5f02015-12-17 13:03:11 -080080 case Method::kBidi:
Adam Lesinskicacb28f2016-10-19 12:18:14 -070081 mImpl = util::make_unique<PseudoMethodBidi>();
82 break;
83 }
Adam Lesinski393b5f02015-12-17 13:03:11 -080084}
85
Adam Lesinskid0f116b2016-07-08 15:00:32 -070086std::string Pseudolocalizer::text(const StringPiece& text) {
Adam Lesinskicacb28f2016-10-19 12:18:14 -070087 std::string out;
88 size_t depth = mLastDepth;
89 size_t lastpos, pos;
90 const size_t length = text.size();
91 const char* str = text.data();
92 bool escaped = false;
93 for (lastpos = pos = 0; pos < length; pos++) {
94 char16_t c = str[pos];
95 if (escaped) {
96 escaped = false;
97 continue;
Adam Lesinski393b5f02015-12-17 13:03:11 -080098 }
Adam Lesinskicacb28f2016-10-19 12:18:14 -070099 if (c == '\'') {
100 escaped = true;
101 continue;
102 }
103
104 if (c == k_arg_start) {
105 depth++;
106 } else if (c == k_arg_end && depth) {
107 depth--;
108 }
109
110 if (mLastDepth != depth || pos == length - 1) {
111 bool pseudo = ((mLastDepth % 2) == 0);
112 size_t nextpos = pos;
113 if (!pseudo || depth == mLastDepth) {
114 nextpos++;
115 }
116 size_t size = nextpos - lastpos;
117 if (size) {
118 std::string chunk = text.substr(lastpos, size).toString();
119 if (pseudo) {
120 chunk = mImpl->text(chunk);
121 } else if (str[lastpos] == k_arg_start &&
122 str[nextpos - 1] == k_arg_end) {
123 chunk = mImpl->placeholder(chunk);
124 }
125 out.append(chunk);
126 }
127 if (pseudo && depth < mLastDepth) { // End of message
128 out.append(mImpl->end());
129 } else if (!pseudo && depth > mLastDepth) { // Start of message
130 out.append(mImpl->start());
131 }
132 lastpos = nextpos;
133 mLastDepth = depth;
134 }
135 }
136 return out;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800137}
138
/**
 * Looks up the look-alike replacement for an ASCII character.
 *
 * Returns a string literal holding the replacement, or nullptr when |c| has
 * no replacement (everything other than the letters below, '!', '?' and '$';
 * note that 'F' has no entry).
 */
static const char* pseudolocalizeChar(const char c) {
  // Replacements for 'a'..'z', indexed by (c - 'a').
  static const char* const kLowercase[26] = {
      "\u00e5", "\u0253", "\u00e7", "\u00f0", "\u00e9", "\u0192", "\u011d",
      "\u0125", "\u00ee", "\u0135", "\u0137", "\u013c", "\u1e3f", "\u00f1",
      "\u00f6", "\u00fe", "\u0051", "\u0155", "\u0161", "\u0163", "\u00fb",
      "\u0056", "\u0175", "\u0445", "\u00fd", "\u017e",
  };
  // Replacements for 'A'..'Z', indexed by (c - 'A').
  static const char* const kUppercase[26] = {
      "\u00c5", "\u03b2", "\u00c7", "\u00d0", "\u00c9", nullptr,  "\u011c",
      "\u0124", "\u00ce", "\u0134", "\u0136", "\u013b", "\u1e3e", "\u00d1",
      "\u00d6", "\u00de", "\u0071", "\u0154", "\u0160", "\u0162", "\u00db",
      "\u03bd", "\u0174", "\u00d7", "\u00dd", "\u017d",
  };

  if (c >= 'a' && c <= 'z') {
    return kLowercase[c - 'a'];
  }
  if (c >= 'A' && c <= 'Z') {
    return kUppercase[c - 'A'];
  }
  if (c == '!') {
    return "\u00a1";
  }
  if (c == '?') {
    return "\u00bf";
  }
  if (c == '$') {
    return "\u20ac";
  }
  return nullptr;
}
253
/**
 * Returns true when |c| is one of the characters that terminates a
 * '%' placeholder scan: s S c C d o x X f e E g G a A b B h H % n.
 */
static bool isPossibleNormalPlaceholderEnd(const char c) {
  switch (c) {
    case 's': case 'S':
    case 'c': case 'C':
    case 'd':
    case 'o':
    case 'x': case 'X':
    case 'f':
    case 'e': case 'E':
    case 'g': case 'G':
    case 'a': case 'A':
    case 'b': case 'B':
    case 'h': case 'H':
    case '%':
    case 'n':
      return true;
    default:
      return false;
  }
}
302
Adam Lesinskid0f116b2016-07-08 15:00:32 -0700303static std::string pseudoGenerateExpansion(const unsigned int length) {
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700304 std::string result = k_expansion_string;
305 const char* s = result.data();
306 if (result.size() < length) {
307 result += " ";
308 result += pseudoGenerateExpansion(length - result.size());
309 } else {
310 int ext = 0;
311 // Should contain only whole words, so looking for a space
312 for (unsigned int i = length + 1; i < result.size(); ++i) {
313 ++ext;
314 if (s[i] == ' ') {
315 break;
316 }
Adam Lesinski393b5f02015-12-17 13:03:11 -0800317 }
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700318 result = result.substr(0, length + ext);
319 }
320 return result;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800321}
322
Adam Lesinskid0f116b2016-07-08 15:00:32 -0700323std::string PseudoMethodAccent::start() {
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700324 std::string result;
325 if (mDepth == 0) {
326 result = "[";
327 }
328 mWordCount = mLength = 0;
329 mDepth++;
330 return result;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800331}
332
Adam Lesinskid0f116b2016-07-08 15:00:32 -0700333std::string PseudoMethodAccent::end() {
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700334 std::string result;
335 if (mLength) {
336 result += " ";
337 result += pseudoGenerateExpansion(mWordCount > 3 ? mLength : mLength / 2);
338 }
339 mWordCount = mLength = 0;
340 mDepth--;
341 if (mDepth == 0) {
342 result += "]";
343 }
344 return result;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800345}
346
347/**
348 * Converts characters so they look like they've been localized.
349 *
350 * Note: This leaves placeholder syntax untouched.
351 */
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700352std::string PseudoMethodAccent::text(const StringPiece& source) {
353 const char* s = source.data();
354 std::string result;
355 const size_t I = source.size();
356 bool lastspace = true;
357 for (size_t i = 0; i < I; i++) {
358 char c = s[i];
359 if (c == '%') {
360 // Placeholder syntax, no need to pseudolocalize
361 std::string chunk;
362 bool end = false;
363 chunk.append(&c, 1);
364 while (!end && i + 1 < I) {
365 ++i;
366 c = s[i];
367 chunk.append(&c, 1);
368 if (isPossibleNormalPlaceholderEnd(c)) {
369 end = true;
370 } else if (i + 1 < I && c == 't') {
371 ++i;
372 c = s[i];
373 chunk.append(&c, 1);
374 end = true;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800375 }
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700376 }
377 // Treat chunk as a placeholder unless it ends with %.
378 result += ((c == '%') ? chunk : placeholder(chunk));
379 } else if (c == '<' || c == '&') {
380 // html syntax, no need to pseudolocalize
381 bool tag_closed = false;
382 while (!tag_closed && i < I) {
383 if (c == '&') {
384 std::string escapeText;
385 escapeText.append(&c, 1);
386 bool end = false;
387 size_t htmlCodePos = i;
388 while (!end && htmlCodePos < I) {
389 ++htmlCodePos;
390 c = s[htmlCodePos];
391 escapeText.append(&c, 1);
392 // Valid html code
393 if (c == ';') {
394 end = true;
395 i = htmlCodePos;
396 }
397 // Wrong html code
398 else if (!((c == '#' || (c >= 'a' && c <= 'z') ||
399 (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')))) {
400 end = true;
401 }
402 }
403 result += escapeText;
404 if (escapeText != "&lt;") {
405 tag_closed = true;
406 }
407 continue;
408 }
409 if (c == '>') {
410 tag_closed = true;
411 result.append(&c, 1);
412 continue;
413 }
414 result.append(&c, 1);
415 i++;
416 c = s[i];
417 }
418 } else {
419 // This is a pure text that should be pseudolocalized
420 const char* p = pseudolocalizeChar(c);
421 if (p != nullptr) {
422 result += p;
423 } else {
424 bool space = isspace(c);
Adam Lesinski393b5f02015-12-17 13:03:11 -0800425 if (lastspace && !space) {
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700426 mWordCount++;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800427 }
428 lastspace = space;
429 result.append(&c, 1);
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700430 }
431 // Count only pseudolocalizable chars and delimiters
432 mLength++;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800433 }
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700434 }
435 return result;
436}
437
438std::string PseudoMethodAccent::placeholder(const StringPiece& source) {
439 // Surround a placeholder with brackets
440 return k_placeholder_open + source.toString() + k_placeholder_close;
441}
442
443std::string PseudoMethodBidi::text(const StringPiece& source) {
444 const char* s = source.data();
445 std::string result;
446 bool lastspace = true;
447 bool space = true;
448 for (size_t i = 0; i < source.size(); i++) {
449 char c = s[i];
450 space = isspace(c);
451 if (lastspace && !space) {
452 // Word start
453 result += k_rlm + k_rlo;
454 } else if (!lastspace && space) {
455 // Word end
456 result += k_pdf + k_rlm;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800457 }
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700458 lastspace = space;
459 result.append(&c, 1);
460 }
461 if (!lastspace) {
462 // End of last word
463 result += k_pdf + k_rlm;
464 }
465 return result;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800466}
467
Adam Lesinskid0f116b2016-07-08 15:00:32 -0700468std::string PseudoMethodBidi::placeholder(const StringPiece& source) {
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700469 // Surround a placeholder with directionality change sequence
470 return k_rlm + k_rlo + source.toString() + k_pdf + k_rlm;
Adam Lesinski393b5f02015-12-17 13:03:11 -0800471}
472
Adam Lesinskicacb28f2016-10-19 12:18:14 -0700473} // namespace aapt