Pseudolocalizer improvements. Fixes accented pseudolocalization and adds an RTL pseudolocale. This change contains the following modifications to the pseudolocalization logic: 1) the zz_ZZ pseudolocale was removed; 2) the en_XA pseudolocale was added for pseudo-accented text; 3) the ar_XB pseudolocale was added for pseudo-RTL text; 4) pseudo-RTL localization functionality was implemented; 5) text expansion functionality was implemented; 6) text bracketing was implemented; 7) a couple of issues in the previous implementation were fixed. Change-Id: I9f7f27bed717e39e82717d15c398decffc8bec3c Signed-off-by: Anton Krumin <antkrumin@google.com>

commit: a2ef5c0d4fb863c0382e77ae00f986a019b11cbe [log] [tgz]
author: Anton Krumin <antkrumin@google.com> Wed Mar 12 14:46:44 2014 -0700
committer: Anton Krumin <antkrumin@google.com> Wed Apr 09 16:59:48 2014 -0700
tree: d6c10bedf5d762e0fc0bd10b38f381ccefefa248
parent: ecdf9b199ac9659c37c34c0b23084199acea80bf [diff] [blame]
diff --git a/tools/aapt/pseudolocalize.cpp b/tools/aapt/pseudolocalize.cpp
index 9e50c5a..c02327a 100644
--- a/tools/aapt/pseudolocalize.cpp
+++ b/tools/aapt/pseudolocalize.cpp

@@ -2,89 +2,155 @@
 
 using namespace std;
 
+// Base string from which the expansion text is generated
+static const String16 k_expansion_string = String16("one two three "
+    "four five six seven eight nine ten eleven twelve thirteen "
+    "fourteen fiveteen sixteen seventeen nineteen twenty");
+
+// Special unicode characters to override directionality of the words
+static const String16 k_rlm = String16("\xe2\x80\x8f");
+static const String16 k_rlo = String16("\xE2\x80\xae");
+static const String16 k_pdf = String16("\xE2\x80\xac");
+
+// Placeholder marks
+static const String16 k_placeholder_open = String16("\xc2\xbb");
+static const String16 k_placeholder_close = String16("\xc2\xab");
+
 static const char*
-pseudolocalize_char(char c)
+pseudolocalize_char(const char16_t c)
 {
     switch (c) {
-        case 'a':   return "\xc4\x83";
-        case 'b':   return "\xcf\x84";
-        case 'c':   return "\xc4\x8b";
-        case 'd':   return "\xc4\x8f";
-        case 'e':   return "\xc4\x99";
+        case 'a':   return "\xc3\xa5";
+        case 'b':   return "\xc9\x93";
+        case 'c':   return "\xc3\xa7";
+        case 'd':   return "\xc3\xb0";
+        case 'e':   return "\xc3\xa9";
         case 'f':   return "\xc6\x92";
         case 'g':   return "\xc4\x9d";
-        case 'h':   return "\xd1\x9b";
-        case 'i':   return "\xcf\x8a";
+        case 'h':   return "\xc4\xa5";
+        case 'i':   return "\xc3\xae";
         case 'j':   return "\xc4\xb5";
-        case 'k':   return "\xc4\xb8";
-        case 'l':   return "\xc4\xba";
+        case 'k':   return "\xc4\xb7";
+        case 'l':   return "\xc4\xbc";
         case 'm':   return "\xe1\xb8\xbf";
-        case 'n':   return "\xd0\xb8";
-        case 'o':   return "\xcf\x8c";
-        case 'p':   return "\xcf\x81";
+        case 'n':   return "\xc3\xb1";
+        case 'o':   return "\xc3\xb6";
+        case 'p':   return "\xc3\xbe";
         case 'q':   return "\x51";
-        case 'r':   return "\xd2\x91";
+        case 'r':   return "\xc5\x95";
         case 's':   return "\xc5\xa1";
-        case 't':   return "\xd1\x82";
-        case 'u':   return "\xce\xb0";
+        case 't':   return "\xc5\xa3";
+        case 'u':   return "\xc3\xbb";
         case 'v':   return "\x56";
-        case 'w':   return "\xe1\xba\x85";
+        case 'w':   return "\xc5\xb5";
         case 'x':   return "\xd1\x85";
-        case 'y':   return "\xe1\xbb\xb3";
-        case 'z':   return "\xc5\xba";
+        case 'y':   return "\xc3\xbd";
+        case 'z':   return "\xc5\xbe";
         case 'A':   return "\xc3\x85";
         case 'B':   return "\xce\xb2";
-        case 'C':   return "\xc4\x88";
-        case 'D':   return "\xc4\x90";
-        case 'E':   return "\xd0\x84";
-        case 'F':   return "\xce\x93";
-        case 'G':   return "\xc4\x9e";
-        case 'H':   return "\xc4\xa6";
-        case 'I':   return "\xd0\x87";
-        case 'J':   return "\xc4\xb5";
+        case 'C':   return "\xc3\x87";
+        case 'D':   return "\xc3\x90";
+        case 'E':   return "\xc3\x89";
+        case 'G':   return "\xc4\x9c";
+        case 'H':   return "\xc4\xa4";
+        case 'I':   return "\xc3\x8e";
+        case 'J':   return "\xc4\xb4";
         case 'K':   return "\xc4\xb6";
-        case 'L':   return "\xc5\x81";
+        case 'L':   return "\xc4\xbb";
         case 'M':   return "\xe1\xb8\xbe";
-        case 'N':   return "\xc5\x83";
-        case 'O':   return "\xce\x98";
-        case 'P':   return "\xcf\x81";
+        case 'N':   return "\xc3\x91";
+        case 'O':   return "\xc3\x96";
+        case 'P':   return "\xc3\x9e";
         case 'Q':   return "\x71";
-        case 'R':   return "\xd0\xaf";
-        case 'S':   return "\xc8\x98";
-        case 'T':   return "\xc5\xa6";
-        case 'U':   return "\xc5\xa8";
+        case 'R':   return "\xc5\x94";
+        case 'S':   return "\xc5\xa0";
+        case 'T':   return "\xc5\xa2";
+        case 'U':   return "\xc3\x9b";
         case 'V':   return "\xce\xbd";
-        case 'W':   return "\xe1\xba\x84";
+        case 'W':   return "\xc5\xb4";
         case 'X':   return "\xc3\x97";
-        case 'Y':   return "\xc2\xa5";
+        case 'Y':   return "\xc3\x9d";
         case 'Z':   return "\xc5\xbd";
+        case '!':   return "\xc2\xa1";
+        case '?':   return "\xc2\xbf";
+        case '$':   return "\xe2\x82\xac";
         default:    return NULL;
     }
 }
 
+static const bool
+is_possible_normal_placeholder_end(const char16_t c) {
+    switch (c) {
+        case 's': return true;
+        case 'S': return true;
+        case 'c': return true;
+        case 'C': return true;
+        case 'd': return true;
+        case 'o': return true;
+        case 'x': return true;
+        case 'X': return true;
+        case 'f': return true;
+        case 'e': return true;
+        case 'E': return true;
+        case 'g': return true;
+        case 'G': return true;
+        case 'a': return true;
+        case 'A': return true;
+        case 'b': return true;
+        case 'B': return true;
+        case 'h': return true;
+        case 'H': return true;
+        case '%': return true;
+        case 'n': return true;
+        default:  return false;
+    }
+}
+
+String16
+pseudo_generate_expansion(const unsigned int length) {
+    String16 result = k_expansion_string;
+    const char16_t* s = result.string();
+    if (result.size() < length) {
+        result += String16(" ");
+        result += pseudo_generate_expansion(length - result.size());
+    } else {
+        int ext = 0;
+        // The result should contain only whole words, so extend the cut to the next space
+        for (unsigned int i = length + 1; i < result.size(); ++i) {
+          ++ext;
+          if (s[i] == ' ') {
+            break;
+          }
+        }
+        result.remove(length + ext, 0);
+    }
+    return result;
+}
+
 /**
  * Converts characters so they look like they've been localized.
  *
  * Note: This leaves escape sequences untouched so they can later be
  * processed by ResTable::collectString in the normal way.
  */
-string
-pseudolocalize_string(const string& source)
+String16
+pseudolocalize_string(const String16& source)
 {
-    const char* s = source.c_str();
-    string result;
-    const size_t I = source.length();
+    const char16_t* s = source.string();
+    String16 result;
+    const size_t I = source.size();
     for (size_t i=0; i<I; i++) {
-        char c = s[i];
+        char16_t c = s[i];
         if (c == '\\') {
+            // Escape syntax, no need to pseudolocalize
             if (i<I-1) {
-                result += '\\';
+                result += String16("\\");
                 i++;
                 c = s[i];
                 switch (c) {
                     case 'u':
                         // this one takes up 5 chars
-                        result += string(s+i, 5);
+                        result += String16(s+i, 5);
                         i += 4;
                         break;
                     case 't':
@@ -96,24 +162,107 @@
                     case '\'':
                     case '\\':
                     default:
-                        result += c;
+                        result.append(&c, 1);
                         break;
                 }
             } else {
-                result += c;
+                result.append(&c, 1);
+            }
+        } else if (c == '%') {
+            // Placeholder syntax, no need to pseudolocalize
+            result += k_placeholder_open;
+            bool end = false;
+            result.append(&c, 1);
+            while (!end && i < I) {
+                ++i;
+                c = s[i];
+                result.append(&c, 1);
+                if (is_possible_normal_placeholder_end(c)) {
+                    end = true;
+                } else if (c == 't') {
+                    ++i;
+                    c = s[i];
+                    result.append(&c, 1);
+                    end = true;
+                }
+            }
+            result += k_placeholder_close;
+        } else if (c == '<' || c == '&') {
+            // html syntax, no need to pseudolocalize
+            bool tag_closed = false;
+            while (!tag_closed && i < I) {
+                if (c == '&') {
+                    String16 escape_text;
+                    escape_text.append(&c, 1);
+                    bool end = false;
+                    size_t htmlCodePos = i;
+                    while (!end && htmlCodePos < I) {
+                        ++htmlCodePos;
+                        c = s[htmlCodePos];
+                        escape_text.append(&c, 1);
+                        // Valid html code
+                        if (c == ';') {
+                            end = true;
+                            i = htmlCodePos;
+                        }
+                        // Wrong html code
+                        else if (!((c == '#' ||
+                                 (c >= 'a' && c <= 'z') ||
+                                 (c >= 'A' && c <= 'Z') ||
+                                 (c >= '0' && c <= '9')))) {
+                            end = true;
+                        }
+                    }
+                    result += escape_text;
+                    if (escape_text != String16("&lt;")) {
+                        tag_closed = true;
+                    }
+                    continue;
+                }
+                if (c == '>') {
+                    tag_closed = true;
+                    result.append(&c, 1);
+                    continue;
+                }
+                result.append(&c, 1);
+                i++;
+                c = s[i];
             }
         } else {
+            // This is plain text that should be pseudolocalized
             const char* p = pseudolocalize_char(c);
             if (p != NULL) {
-                result += p;
+                result += String16(p);
             } else {
-                result += c;
+                result.append(&c, 1);
             }
         }
     }
-
-    //printf("result=\'%s\'\n", result.c_str());
     return result;
 }
 
+String16
+pseudobidi_string(const String16& source)
+{
+    const char16_t* s = source.string();
+    String16 result;
+    result += k_rlm;
+    result += k_rlo;
+    for (size_t i=0; i<source.size(); i++) {
+        char16_t c = s[i];
+        switch(c) {
+            case ' ': result += k_pdf;
+                      result += k_rlm;
+                      result.append(&c, 1);
+                      result += k_rlm;
+                      result += k_rlo;
+                      break;
+            default: result.append(&c, 1);
+                     break;
+        }
+    }
+    result += k_pdf;
+    result += k_rlm;
+    return result;
+}
commit	a2ef5c0d4fb863c0382e77ae00f986a019b11cbe	[log] [tgz]
author	Anton Krumin <antkrumin@google.com>	Wed Mar 12 14:46:44 2014 -0700
committer	Anton Krumin <antkrumin@google.com>	Wed Apr 09 16:59:48 2014 -0700
tree	d6c10bedf5d762e0fc0bd10b38f381ccefefa248
parent	ecdf9b199ac9659c37c34c0b23084199acea80bf [diff] [blame]