AAPT2: Port AAPT pseudolocalization to AAPT2
Pseudolocalization happens at the compile phase. Pseudolocalized
values are weak, such that manually specified values will take precedence.
Change-Id: I5e064ce0d270c9f4f9022f75aecedab9d45bc980
diff --git a/tools/aapt2/compile/Pseudolocalizer.cpp b/tools/aapt2/compile/Pseudolocalizer.cpp
new file mode 100644
index 0000000..eae52d7
--- /dev/null
+++ b/tools/aapt2/compile/Pseudolocalizer.cpp
@@ -0,0 +1,394 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compile/Pseudolocalizer.h"
+#include "util/Util.h"
+
+namespace aapt {
+
+// String basis to generate expansion
+static const std::u16string k_expansion_string = u"one two three "
+ "four five six seven eight nine ten eleven twelve thirteen "
+ "fourteen fiveteen sixteen seventeen nineteen twenty";
+
+// Special unicode characters to override directionality of the words
+static const std::u16string k_rlm = u"\u200f";
+static const std::u16string k_rlo = u"\u202e";
+static const std::u16string k_pdf = u"\u202c";
+
+// Placeholder marks
+static const std::u16string k_placeholder_open = u"\u00bb";
+static const std::u16string k_placeholder_close = u"\u00ab";
+
+static const char16_t k_arg_start = u'{';
+static const char16_t k_arg_end = u'}';
+
+class PseudoMethodNone : public PseudoMethodImpl {
+public:
+ std::u16string text(const StringPiece16& text) override { return text.toString(); }
+ std::u16string placeholder(const StringPiece16& text) override { return text.toString(); }
+};
+
+class PseudoMethodBidi : public PseudoMethodImpl {
+public:
+ std::u16string text(const StringPiece16& text) override;
+ std::u16string placeholder(const StringPiece16& text) override;
+};
+
+class PseudoMethodAccent : public PseudoMethodImpl {
+public:
+ PseudoMethodAccent() : mDepth(0), mWordCount(0), mLength(0) {}
+ std::u16string start() override;
+ std::u16string end() override;
+ std::u16string text(const StringPiece16& text) override;
+ std::u16string placeholder(const StringPiece16& text) override;
+private:
+ size_t mDepth;
+ size_t mWordCount;
+ size_t mLength;
+};
+
+Pseudolocalizer::Pseudolocalizer(Method method) : mLastDepth(0) {
+ setMethod(method);
+}
+
+void Pseudolocalizer::setMethod(Method method) {
+ switch (method) {
+ case Method::kNone:
+ mImpl = util::make_unique<PseudoMethodNone>();
+ break;
+ case Method::kAccent:
+ mImpl = util::make_unique<PseudoMethodAccent>();
+ break;
+ case Method::kBidi:
+ mImpl = util::make_unique<PseudoMethodBidi>();
+ break;
+ }
+}
+
+std::u16string Pseudolocalizer::text(const StringPiece16& text) {
+ std::u16string out;
+ size_t depth = mLastDepth;
+ size_t lastpos, pos;
+ const size_t length = text.size();
+ const char16_t* str = text.data();
+ bool escaped = false;
+ for (lastpos = pos = 0; pos < length; pos++) {
+ char16_t c = str[pos];
+ if (escaped) {
+ escaped = false;
+ continue;
+ }
+ if (c == '\'') {
+ escaped = true;
+ continue;
+ }
+
+ if (c == k_arg_start) {
+ depth++;
+ } else if (c == k_arg_end && depth) {
+ depth--;
+ }
+
+ if (mLastDepth != depth || pos == length - 1) {
+ bool pseudo = ((mLastDepth % 2) == 0);
+ size_t nextpos = pos;
+ if (!pseudo || depth == mLastDepth) {
+ nextpos++;
+ }
+ size_t size = nextpos - lastpos;
+ if (size) {
+ std::u16string chunk = text.substr(lastpos, size).toString();
+ if (pseudo) {
+ chunk = mImpl->text(chunk);
+ } else if (str[lastpos] == k_arg_start && str[nextpos - 1] == k_arg_end) {
+ chunk = mImpl->placeholder(chunk);
+ }
+ out.append(chunk);
+ }
+ if (pseudo && depth < mLastDepth) { // End of message
+ out.append(mImpl->end());
+ } else if (!pseudo && depth > mLastDepth) { // Start of message
+ out.append(mImpl->start());
+ }
+ lastpos = nextpos;
+ mLastDepth = depth;
+ }
+ }
+ return out;
+}
+
+static const char16_t* pseudolocalizeChar(const char16_t c) {
+ switch (c) {
+ case 'a': return u"\u00e5";
+ case 'b': return u"\u0253";
+ case 'c': return u"\u00e7";
+ case 'd': return u"\u00f0";
+ case 'e': return u"\u00e9";
+ case 'f': return u"\u0192";
+ case 'g': return u"\u011d";
+ case 'h': return u"\u0125";
+ case 'i': return u"\u00ee";
+ case 'j': return u"\u0135";
+ case 'k': return u"\u0137";
+ case 'l': return u"\u013c";
+ case 'm': return u"\u1e3f";
+ case 'n': return u"\u00f1";
+ case 'o': return u"\u00f6";
+ case 'p': return u"\u00fe";
+ case 'q': return u"\u0051";
+ case 'r': return u"\u0155";
+ case 's': return u"\u0161";
+ case 't': return u"\u0163";
+ case 'u': return u"\u00fb";
+ case 'v': return u"\u0056";
+ case 'w': return u"\u0175";
+ case 'x': return u"\u0445";
+ case 'y': return u"\u00fd";
+ case 'z': return u"\u017e";
+ case 'A': return u"\u00c5";
+ case 'B': return u"\u03b2";
+ case 'C': return u"\u00c7";
+ case 'D': return u"\u00d0";
+ case 'E': return u"\u00c9";
+ case 'G': return u"\u011c";
+ case 'H': return u"\u0124";
+ case 'I': return u"\u00ce";
+ case 'J': return u"\u0134";
+ case 'K': return u"\u0136";
+ case 'L': return u"\u013b";
+ case 'M': return u"\u1e3e";
+ case 'N': return u"\u00d1";
+ case 'O': return u"\u00d6";
+ case 'P': return u"\u00de";
+ case 'Q': return u"\u0071";
+ case 'R': return u"\u0154";
+ case 'S': return u"\u0160";
+ case 'T': return u"\u0162";
+ case 'U': return u"\u00db";
+ case 'V': return u"\u03bd";
+ case 'W': return u"\u0174";
+ case 'X': return u"\u00d7";
+ case 'Y': return u"\u00dd";
+ case 'Z': return u"\u017d";
+ case '!': return u"\u00a1";
+ case '?': return u"\u00bf";
+ case '$': return u"\u20ac";
+ default: return NULL;
+ }
+}
+
+static bool isPossibleNormalPlaceholderEnd(const char16_t c) {
+ switch (c) {
+ case 's': return true;
+ case 'S': return true;
+ case 'c': return true;
+ case 'C': return true;
+ case 'd': return true;
+ case 'o': return true;
+ case 'x': return true;
+ case 'X': return true;
+ case 'f': return true;
+ case 'e': return true;
+ case 'E': return true;
+ case 'g': return true;
+ case 'G': return true;
+ case 'a': return true;
+ case 'A': return true;
+ case 'b': return true;
+ case 'B': return true;
+ case 'h': return true;
+ case 'H': return true;
+ case '%': return true;
+ case 'n': return true;
+ default: return false;
+ }
+}
+
+static std::u16string pseudoGenerateExpansion(const unsigned int length) {
+ std::u16string result = k_expansion_string;
+ const char16_t* s = result.data();
+ if (result.size() < length) {
+ result += u" ";
+ result += pseudoGenerateExpansion(length - result.size());
+ } else {
+ int ext = 0;
+ // Should contain only whole words, so looking for a space
+ for (unsigned int i = length + 1; i < result.size(); ++i) {
+ ++ext;
+ if (s[i] == ' ') {
+ break;
+ }
+ }
+ result = result.substr(0, length + ext);
+ }
+ return result;
+}
+
+std::u16string PseudoMethodAccent::start() {
+ std::u16string result;
+ if (mDepth == 0) {
+ result = u"[";
+ }
+ mWordCount = mLength = 0;
+ mDepth++;
+ return result;
+}
+
+std::u16string PseudoMethodAccent::end() {
+ std::u16string result;
+ if (mLength) {
+ result += u" ";
+ result += pseudoGenerateExpansion(mWordCount > 3 ? mLength : mLength / 2);
+ }
+ mWordCount = mLength = 0;
+ mDepth--;
+ if (mDepth == 0) {
+ result += u"]";
+ }
+ return result;
+}
+
+/**
+ * Converts characters so they look like they've been localized.
+ *
+ * Note: This leaves placeholder syntax untouched.
+ */
+std::u16string PseudoMethodAccent::text(const StringPiece16& source)
+{
+ const char16_t* s = source.data();
+ std::u16string result;
+ const size_t I = source.size();
+ bool lastspace = true;
+ for (size_t i = 0; i < I; i++) {
+ char16_t c = s[i];
+ if (c == '%') {
+ // Placeholder syntax, no need to pseudolocalize
+ std::u16string chunk;
+ bool end = false;
+ chunk.append(&c, 1);
+ while (!end && i < I) {
+ ++i;
+ c = s[i];
+ chunk.append(&c, 1);
+ if (isPossibleNormalPlaceholderEnd(c)) {
+ end = true;
+ } else if (c == 't') {
+ ++i;
+ c = s[i];
+ chunk.append(&c, 1);
+ end = true;
+ }
+ }
+ // Treat chunk as a placeholder unless it ends with %.
+ result += ((c == '%') ? chunk : placeholder(chunk));
+ } else if (c == '<' || c == '&') {
+ // html syntax, no need to pseudolocalize
+ bool tag_closed = false;
+ while (!tag_closed && i < I) {
+ if (c == '&') {
+ std::u16string escapeText;
+ escapeText.append(&c, 1);
+ bool end = false;
+ size_t htmlCodePos = i;
+ while (!end && htmlCodePos < I) {
+ ++htmlCodePos;
+ c = s[htmlCodePos];
+ escapeText.append(&c, 1);
+ // Valid html code
+ if (c == ';') {
+ end = true;
+ i = htmlCodePos;
+ }
+ // Wrong html code
+ else if (!((c == '#' ||
+ (c >= 'a' && c <= 'z') ||
+ (c >= 'A' && c <= 'Z') ||
+ (c >= '0' && c <= '9')))) {
+ end = true;
+ }
+ }
+ result += escapeText;
+ if (escapeText != u"<") {
+ tag_closed = true;
+ }
+ continue;
+ }
+ if (c == '>') {
+ tag_closed = true;
+ result.append(&c, 1);
+ continue;
+ }
+ result.append(&c, 1);
+ i++;
+ c = s[i];
+ }
+ } else {
+ // This is a pure text that should be pseudolocalized
+ const char16_t* p = pseudolocalizeChar(c);
+ if (p != nullptr) {
+ result += p;
+ } else {
+ bool space = util::isspace16(c);
+ if (lastspace && !space) {
+ mWordCount++;
+ }
+ lastspace = space;
+ result.append(&c, 1);
+ }
+ // Count only pseudolocalizable chars and delimiters
+ mLength++;
+ }
+ }
+ return result;
+}
+
+std::u16string PseudoMethodAccent::placeholder(const StringPiece16& source) {
+ // Surround a placeholder with brackets
+ return k_placeholder_open + source.toString() + k_placeholder_close;
+}
+
+std::u16string PseudoMethodBidi::text(const StringPiece16& source) {
+ const char16_t* s = source.data();
+ std::u16string result;
+ bool lastspace = true;
+ bool space = true;
+ for (size_t i = 0; i < source.size(); i++) {
+ char16_t c = s[i];
+ space = util::isspace16(c);
+ if (lastspace && !space) {
+ // Word start
+ result += k_rlm + k_rlo;
+ } else if (!lastspace && space) {
+ // Word end
+ result += k_pdf + k_rlm;
+ }
+ lastspace = space;
+ result.append(&c, 1);
+ }
+ if (!lastspace) {
+ // End of last word
+ result += k_pdf + k_rlm;
+ }
+ return result;
+}
+
+std::u16string PseudoMethodBidi::placeholder(const StringPiece16& source) {
+ // Surround a placeholder with directionality change sequence
+ return k_rlm + k_rlo + source.toString() + k_pdf + k_rlm;
+}
+
+} // namespace aapt