Be more lenient with 4 byte UTF-8 sequences.

Accept 4-byte sequences and convert them into surrogate
pairs instead of expecting two separate 3-byte sequences,
each encoding one half of a surrogate pair.

Note that in addition to supporting 4 byte sequences in
strings from JNI, we also tolerate them in dex files. This
is mainly for consistency, and there's no need to claim any
sort of official support.

bug: 18848397
bug: https://code.google.com/p/android/issues/detail?id=81341
Change-Id: Ibc98d29e59d98803e640f2489ea4c56912a59b29
diff --git a/runtime/utf-inl.h b/runtime/utf-inl.h
index 1373d17..b2d6765 100644
--- a/runtime/utf-inl.h
+++ b/runtime/utf-inl.h
@@ -21,26 +21,57 @@
 
 namespace art {
 
-inline uint16_t GetUtf16FromUtf8(const char** utf8_data_in) {
-  uint8_t one = *(*utf8_data_in)++;
+inline uint16_t GetTrailingUtf16Char(uint32_t maybe_pair) {
+  return static_cast<uint16_t>(maybe_pair >> 16);
+}
+
+inline uint16_t GetLeadingUtf16Char(uint32_t maybe_pair) {
+  return static_cast<uint16_t>(maybe_pair & 0x0000FFFF);
+}
+
+inline uint32_t GetUtf16FromUtf8(const char** utf8_data_in) {
+  const uint8_t one = *(*utf8_data_in)++;
   if ((one & 0x80) == 0) {
     // one-byte encoding
     return one;
   }
-  // two- or three-byte encoding
-  uint8_t two = *(*utf8_data_in)++;
+
+  const uint8_t two = *(*utf8_data_in)++;
   if ((one & 0x20) == 0) {
     // two-byte encoding
     return ((one & 0x1f) << 6) | (two & 0x3f);
   }
-  // three-byte encoding
-  uint8_t three = *(*utf8_data_in)++;
-  return ((one & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
+
+  const uint8_t three = *(*utf8_data_in)++;
+  if ((one & 0x10) == 0) {
+    return ((one & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
+  }
+
+  // Four byte encodings need special handling. We'll have
+  // to convert them into a surrogate pair.
+  const uint8_t four = *(*utf8_data_in)++;
+
+  // Since this is a 4 byte UTF-8 sequence, it will lie between
+  // U+10000 and U+1FFFFF.
+  //
+  // TODO: What do we do about values in (U+10FFFF, U+1FFFFF) ? The
+  // spec says they're invalid but nobody appears to check for them.
+  const uint32_t code_point = ((one & 0x0f) << 18) | ((two & 0x3f) << 12)
+      | ((three & 0x3f) << 6) | (four & 0x3f);
+
+  uint32_t surrogate_pair = 0;
+  // Step two: Write out the high (leading) surrogate to the bottom 16 bits
+  // of the 32 bit type.
+  surrogate_pair |= ((code_point >> 10) + 0xd7c0) & 0xffff;
+  // Step three: Write out the low (trailing) surrogate to the top 16 bits.
+  surrogate_pair |= ((code_point & 0x03ff) + 0xdc00) << 16;
+
+  return surrogate_pair;
 }
 
 inline int CompareModifiedUtf8ToModifiedUtf8AsUtf16CodePointValues(const char* utf8_1,
                                                                    const char* utf8_2) {
-  uint16_t c1, c2;
+  uint32_t c1, c2;
   do {
     c1 = *utf8_1;
     c2 = *utf8_2;
@@ -50,50 +81,17 @@
     } else if (c2 == 0) {
       return 1;
     }
-    // Assume 1-byte value and handle all cases first.
-    utf8_1++;
-    utf8_2++;
-    if ((c1 & 0x80) == 0) {
-      if (c1 == c2) {
-        // Matching 1-byte values.
-        continue;
-      } else {
-        // Non-matching values.
-        if ((c2 & 0x80) == 0) {
-          // 1-byte value, do nothing.
-        } else if ((c2 & 0x20) == 0) {
-          // 2-byte value.
-          c2 = ((c2 & 0x1f) << 6) | (*utf8_2 & 0x3f);
-        } else {
-          // 3-byte value.
-          c2 = ((c2 & 0x0f) << 12) | ((utf8_2[0] & 0x3f) << 6) | (utf8_2[1] & 0x3f);
-        }
-        return static_cast<int>(c1) - static_cast<int>(c2);
-      }
-    }
-    // Non-matching or multi-byte values.
-    if ((c1 & 0x20) == 0) {
-      // 2-byte value.
-      c1 = ((c1 & 0x1f) << 6) | (*utf8_1 & 0x3f);
-      utf8_1++;
-    } else {
-      // 3-byte value.
-      c1 = ((c1 & 0x0f) << 12) | ((utf8_1[0] & 0x3f) << 6) | (utf8_1[1] & 0x3f);
-      utf8_1 += 2;
-    }
-    if ((c2 & 0x80) == 0) {
-      // 1-byte value, do nothing.
-    } else if ((c2 & 0x20) == 0) {
-      // 2-byte value.
-      c2 = ((c2 & 0x1f) << 6) | (*utf8_2 & 0x3f);
-      utf8_2++;
-    } else {
-      // 3-byte value.
-      c2 = ((c2 & 0x0f) << 12) | ((utf8_2[0] & 0x3f) << 6) | (utf8_2[1] & 0x3f);
-      utf8_2 += 2;
-    }
+
+    c1 = GetUtf16FromUtf8(&utf8_1);
+    c2 = GetUtf16FromUtf8(&utf8_2);
   } while (c1 == c2);
-  return static_cast<int>(c1) - static_cast<int>(c2);
+
+  const uint32_t leading_surrogate_diff = GetLeadingUtf16Char(c1) - GetLeadingUtf16Char(c2);
+  if (leading_surrogate_diff != 0) {
+      return static_cast<int>(leading_surrogate_diff);
+  }
+
+  return GetTrailingUtf16Char(c1) - GetTrailingUtf16Char(c2);
 }
 
 }  // namespace art