AAPT2: Fix styled string whitespace processing

Change styled string whitespace processing to be like AAPT's was.

Main changes:
- whitespace around tags is preserved.
- tags start exactly where they are supposed to, not off by one.

Bug: 72406283
Test: make aapt2_tests
Change-Id: I4d12728c493efd8c978e2e3d2718b56534ff52ef

commit: 2eed52ecc0c2fa3e96530e4b5556eaa82f7c2dfc [log] [tgz]
author: Adam Lesinski <adamlesinski@google.com> Wed Feb 21 15:55:58 2018 -0800
committer: Adam Lesinski <adamlesinski@google.com> Tue Feb 27 11:39:10 2018 -0800
tree: 4e0a49770f684a2ca823d958c0f1a2b3adabcab9
parent: e1094a2e232277a719025aa5c97c492502c34f5b [diff]
diff --git a/tools/aapt2/util/Util.cpp b/tools/aapt2/util/Util.cpp
index e42145d..d1c9ca1 100644
--- a/tools/aapt2/util/Util.cpp
+++ b/tools/aapt2/util/Util.cpp

@@ -76,6 +76,34 @@
   return str.substr(str.size() - suffix.size(), suffix.size()) == suffix;
 }
 
+// Returns a view of |str| with its leading whitespace (as classified by isspace) removed.
+// An empty or null-backed |str| is returned unchanged. The result points into the same storage
+// as |str|; no characters are copied.
+StringPiece TrimLeadingWhitespace(const StringPiece& str) {
+  if (str.size() == 0 || str.data() == nullptr) {
+    return str;
+  }
+
+  const char* start = str.data();
+  const char* end = start + str.length();
+
+  // isspace() has undefined behavior for negative arguments other than EOF, which is what a
+  // signed plain char yields for any byte >= 0x80 (e.g. UTF-8 multi-byte sequences), so convert
+  // through unsigned char before classifying.
+  while (start != end && isspace(static_cast<unsigned char>(*start))) {
+    start++;
+  }
+  return StringPiece(start, end - start);
+}
+
+// Returns a view of |str| with its trailing whitespace (as classified by isspace) removed.
+// An empty or null-backed |str| is returned unchanged. The result points into the same storage
+// as |str|; no characters are copied.
+StringPiece TrimTrailingWhitespace(const StringPiece& str) {
+  if (str.size() == 0 || str.data() == nullptr) {
+    return str;
+  }
+
+  const char* start = str.data();
+  const char* end = start + str.length();
+
+  // isspace() has undefined behavior for negative arguments other than EOF, which is what a
+  // signed plain char yields for any byte >= 0x80 (e.g. UTF-8 multi-byte sequences), so convert
+  // through unsigned char before classifying.
+  while (end != start && isspace(static_cast<unsigned char>(*(end - 1)))) {
+    end--;
+  }
+  return StringPiece(start, end - start);
+}
+
 StringPiece TrimWhitespace(const StringPiece& str) {
   if (str.size() == 0 || str.data() == nullptr) {
     return str;
@@ -269,162 +297,6 @@
   return true;
 }
 
-static bool AppendCodepointToUtf8String(char32_t codepoint, std::string* output) {
-  ssize_t len = utf32_to_utf8_length(&codepoint, 1);
-  if (len < 0) {
-    return false;
-  }
-
-  const size_t start_append_pos = output->size();
-
-  // Make room for the next character.
-  output->resize(output->size() + len);
-
-  char* dst = &*(output->begin() + start_append_pos);
-  utf32_to_utf8(&codepoint, 1, dst, len + 1);
-  return true;
-}
-
-static bool AppendUnicodeCodepoint(Utf8Iterator* iter, std::string* output) {
-  char32_t code = 0;
-  for (size_t i = 0; i < 4 && iter->HasNext(); i++) {
-    char32_t codepoint = iter->Next();
-    char32_t a;
-    if (codepoint >= U'0' && codepoint <= U'9') {
-      a = codepoint - U'0';
-    } else if (codepoint >= U'a' && codepoint <= U'f') {
-      a = codepoint - U'a' + 10;
-    } else if (codepoint >= U'A' && codepoint <= U'F') {
-      a = codepoint - U'A' + 10;
-    } else {
-      return {};
-    }
-    code = (code << 4) | a;
-  }
-  return AppendCodepointToUtf8String(code, output);
-}
-
-static bool IsCodepointSpace(char32_t codepoint) {
-  if (static_cast<uint32_t>(codepoint) & 0xffffff00u) {
-    return false;
-  }
-  return isspace(static_cast<char>(codepoint));
-}
-
-StringBuilder::StringBuilder(bool preserve_spaces) : preserve_spaces_(preserve_spaces) {
-}
-
-StringBuilder& StringBuilder::Append(const StringPiece& str) {
-  if (!error_.empty()) {
-    return *this;
-  }
-
-  // Where the new data will be appended to.
-  const size_t new_data_index = str_.size();
-
-  Utf8Iterator iter(str);
-  while (iter.HasNext()) {
-    const char32_t codepoint = iter.Next();
-
-    if (last_char_was_escape_) {
-      switch (codepoint) {
-        case U't':
-          str_ += '\t';
-          break;
-
-        case U'n':
-          str_ += '\n';
-          break;
-
-        case U'#':
-        case U'@':
-        case U'?':
-        case U'"':
-        case U'\'':
-        case U'\\':
-          str_ += static_cast<char>(codepoint);
-          break;
-
-        case U'u':
-          if (!AppendUnicodeCodepoint(&iter, &str_)) {
-            error_ = "invalid unicode escape sequence";
-            return *this;
-          }
-          break;
-
-        default:
-          // Ignore the escape character and just include the codepoint.
-          AppendCodepointToUtf8String(codepoint, &str_);
-          break;
-      }
-      last_char_was_escape_ = false;
-
-    } else if (!preserve_spaces_ && codepoint == U'"') {
-      if (!quote_ && trailing_space_) {
-        // We found an opening quote, and we have trailing space, so we should append that
-        // space now.
-        if (trailing_space_) {
-          // We had trailing whitespace, so replace with a single space.
-          if (!str_.empty()) {
-            str_ += ' ';
-          }
-          trailing_space_ = false;
-        }
-      }
-      quote_ = !quote_;
-
-    } else if (!preserve_spaces_ && codepoint == U'\'' && !quote_) {
-      // This should be escaped.
-      error_ = "unescaped apostrophe";
-      return *this;
-
-    } else if (codepoint == U'\\') {
-      // This is an escape sequence, convert to the real value.
-      if (!quote_ && trailing_space_) {
-        // We had trailing whitespace, so
-        // replace with a single space.
-        if (!str_.empty()) {
-          str_ += ' ';
-        }
-        trailing_space_ = false;
-      }
-      last_char_was_escape_ = true;
-    } else {
-      if (preserve_spaces_ || quote_) {
-        // Quotes mean everything is taken, including whitespace.
-        AppendCodepointToUtf8String(codepoint, &str_);
-      } else {
-        // This is not quoted text, so we will accumulate whitespace and only emit a single
-        // character of whitespace if it is followed by a non-whitespace character.
-        if (IsCodepointSpace(codepoint)) {
-          // We found whitespace.
-          trailing_space_ = true;
-        } else {
-          if (trailing_space_) {
-            // We saw trailing space before, so replace all
-            // that trailing space with one space.
-            if (!str_.empty()) {
-              str_ += ' ';
-            }
-            trailing_space_ = false;
-          }
-          AppendCodepointToUtf8String(codepoint, &str_);
-        }
-      }
-    }
-  }
-
-  // Accumulate the added string's UTF-16 length.
-  ssize_t len = utf8_to_utf16_length(reinterpret_cast<const uint8_t*>(str_.data()) + new_data_index,
-                                     str_.size() - new_data_index);
-  if (len < 0) {
-    error_ = "invalid unicode code point";
-    return *this;
-  }
-  utf16_len_ += len;
-  return *this;
-}
-
 std::u16string Utf8ToUtf16(const StringPiece& utf8) {
   ssize_t utf16_length = utf8_to_utf16_length(
       reinterpret_cast<const uint8_t*>(utf8.data()), utf8.length());

diff --git a/tools/aapt2/util/Util.h b/tools/aapt2/util/Util.h
index 7c949b90..0eb35d1 100644
--- a/tools/aapt2/util/Util.h
+++ b/tools/aapt2/util/Util.h

@@ -59,7 +59,15 @@
 // Returns true if the string ends with suffix.
 bool EndsWith(const android::StringPiece& str, const android::StringPiece& suffix);
 
-// Creates a new StringPiece16 that points to a substring of the original string without leading or
+// Creates a new StringPiece that points to a substring of the original string without leading
+// whitespace.
+android::StringPiece TrimLeadingWhitespace(const android::StringPiece& str);
+
+// Creates a new StringPiece that points to a substring of the original string without trailing
+// whitespace.
+android::StringPiece TrimTrailingWhitespace(const android::StringPiece& str);
+
+// Creates a new StringPiece that points to a substring of the original string without leading or
 // trailing whitespace.
 android::StringPiece TrimWhitespace(const android::StringPiece& str);
 
@@ -141,9 +149,12 @@
 // break the string interpolation.
 bool VerifyJavaStringFormat(const android::StringPiece& str);
 
+// NOTE(review): this new public function has no doc comment, unlike its neighbors.
+// Presumably it processes |input| as one segment of a styled string (honoring
+// |preserve_spaces|), appends the result to |out_str|, and returns false with a message in
+// |out_error| on failure -- TODO confirm against the definition, which is not shown in this
+// diff, and replace this note with the real contract.
+bool AppendStyledString(const android::StringPiece& input, bool preserve_spaces,
+                        std::string* out_str, std::string* out_error);
+
 class StringBuilder {
  public:
-  explicit StringBuilder(bool preserve_spaces = false);
+  StringBuilder() = default;
 
   StringBuilder& Append(const android::StringPiece& str);
   const std::string& ToString() const;
@@ -158,7 +169,6 @@
   explicit operator bool() const;
 
  private:
-  bool preserve_spaces_;
   std::string str_;
   size_t utf16_len_ = 0;
   bool quote_ = false;

diff --git a/tools/aapt2/util/Util_test.cpp b/tools/aapt2/util/Util_test.cpp
index 2d1242a..d4e3bec 100644
--- a/tools/aapt2/util/Util_test.cpp
+++ b/tools/aapt2/util/Util_test.cpp

@@ -41,45 +41,6 @@
   EXPECT_TRUE(util::StartsWith("hello.xml", "he"));
 }
 
-TEST(UtilTest, StringBuilderSplitEscapeSequence) {
-  EXPECT_THAT(util::StringBuilder().Append("this is a new\\").Append("nline.").ToString(),
-              Eq("this is a new\nline."));
-}
-
-TEST(UtilTest, StringBuilderWhitespaceRemoval) {
-  EXPECT_THAT(util::StringBuilder().Append("    hey guys ").Append(" this is so cool ").ToString(),
-              Eq("hey guys this is so cool"));
-  EXPECT_THAT(
-      util::StringBuilder().Append(" \" wow,  so many \t ").Append("spaces. \"what? ").ToString(),
-      Eq(" wow,  so many \t spaces. what?"));
-  EXPECT_THAT(util::StringBuilder().Append("  where \t ").Append(" \nis the pie?").ToString(),
-              Eq("where is the pie?"));
-}
-
-TEST(UtilTest, StringBuilderEscaping) {
-  EXPECT_THAT(util::StringBuilder()
-                  .Append("    hey guys\\n ")
-                  .Append(" this \\t is so\\\\ cool ")
-                  .ToString(),
-              Eq("hey guys\n this \t is so\\ cool"));
-  EXPECT_THAT(util::StringBuilder().Append("\\@\\?\\#\\\\\\'").ToString(), Eq("@?#\\\'"));
-}
-
-TEST(UtilTest, StringBuilderMisplacedQuote) {
-  util::StringBuilder builder;
-  EXPECT_FALSE(builder.Append("they're coming!"));
-}
-
-TEST(UtilTest, StringBuilderUnicodeCodes) {
-  EXPECT_THAT(util::StringBuilder().Append("\\u00AF\\u0AF0 woah").ToString(),
-              Eq("\u00AF\u0AF0 woah"));
-  EXPECT_FALSE(util::StringBuilder().Append("\\u00 yo"));
-}
-
-TEST(UtilTest, StringBuilderPreserveSpaces) {
-  EXPECT_THAT(util::StringBuilder(true /*preserve_spaces*/).Append("\"").ToString(), Eq("\""));
-}
-
 TEST(UtilTest, TokenizeInput) {
   auto tokenizer = util::Tokenize(StringPiece("this| is|the|end"), '|');
   auto iter = tokenizer.begin();
commit	2eed52ecc0c2fa3e96530e4b5556eaa82f7c2dfc	[log] [tgz]
author	Adam Lesinski <adamlesinski@google.com>	Wed Feb 21 15:55:58 2018 -0800
committer	Adam Lesinski <adamlesinski@google.com>	Tue Feb 27 11:39:10 2018 -0800
tree	4e0a49770f684a2ca823d958c0f1a2b3adabcab9
parent	e1094a2e232277a719025aa5c97c492502c34f5b [diff]