Optimize some commonly used utf8 functions by:
- using counted loops instead of searching for terminating null. In
the important cases the caller already knows the length: change
the API to pass it in. Keep the old API version as well to avoid
extensive changes to non-critical debug and test code.
- ensure the common cases are at the start of if/then/else chains.
Usually 99+% of characters are ASCII even in mixed strings.
- for the "convert" functions, when both utf8 and utf16 lengths are
passed, and are equal, it means the entire string is ASCII, and a
specialized loop can be used. The compiler might then unroll or
even vectorize this.
The functions improved are (tested on Nexus 5 with a 44 character
ASCII string):
CountModifiedUtf8Chars : 20% faster
ConvertUtf16ToModifiedUtf8: 80% faster
ConvertModifiedUtf8ToUtf16: 200% faster
Also for completeness CountUtf8Bytes has been cleaned up a little, but
the speed is unchanged. Unlike CountModifiedUtf8Chars, it was already
passed the length, rather than searching for null.
Change-Id: I1c9b7dea3eda869fc9f5f6b4dd6be8cdd5bc3ac0
diff --git a/runtime/utf_test.cc b/runtime/utf_test.cc
index 94a6ea5..64e4eb7 100644
--- a/runtime/utf_test.cc
+++ b/runtime/utf_test.cc
@@ -48,7 +48,7 @@
};
// A test string that contains a UTF-8 encoding of a surrogate pair
-// (code point = U+10400)
+// (code point = U+10400).
static const uint8_t kSurrogateEncoding[] = {
0xed, 0xa0, 0x81,
0xed, 0xb0, 0x80,
@@ -66,13 +66,13 @@
EXPECT_EQ(0, GetTrailingUtf16Char(pair));
EXPECT_ARRAY_POSITION(1, ptr, start);
- // Two byte sequence
+ // Two byte sequence.
pair = GetUtf16FromUtf8(&ptr);
EXPECT_EQ(0xa2, GetLeadingUtf16Char(pair));
EXPECT_EQ(0, GetTrailingUtf16Char(pair));
EXPECT_ARRAY_POSITION(3, ptr, start);
- // Three byte sequence
+ // Three byte sequence.
pair = GetUtf16FromUtf8(&ptr);
EXPECT_EQ(0x20ac, GetLeadingUtf16Char(pair));
EXPECT_EQ(0, GetTrailingUtf16Char(pair));
@@ -84,7 +84,7 @@
EXPECT_EQ(0xdfe0, GetTrailingUtf16Char(pair));
EXPECT_ARRAY_POSITION(10, ptr, start);
- // Null terminator
+ // Null terminator.
pair = GetUtf16FromUtf8(&ptr);
EXPECT_EQ(0, GetLeadingUtf16Char(pair));
EXPECT_EQ(0, GetTrailingUtf16Char(pair));
@@ -117,7 +117,8 @@
ASSERT_EQ(expected.size(), CountUtf8Bytes(&input[0], input.size()));
std::vector<uint8_t> output(expected.size());
- ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), &input[0], input.size());
+ ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), expected.size(),
+ &input[0], input.size());
EXPECT_EQ(expected, output);
}
@@ -139,10 +140,10 @@
AssertConversion({ 'h', 'e', 'l', 'l', 'o' }, { 0x68, 0x65, 0x6c, 0x6c, 0x6f });
AssertConversion({
- 0xd802, 0xdc02, // Surrogate pair
- 0xdef0, 0xdcff, // Three byte encodings
- 0x0101, 0x0000, // Two byte encodings
- 'p' , 'p' // One byte encoding
+ 0xd802, 0xdc02, // Surrogate pair.
+ 0xdef0, 0xdcff, // Three byte encodings.
+ 0x0101, 0x0000, // Two byte encodings.
+ 'p' , 'p' // One byte encoding.
}, {
0xf0, 0x90, 0xa0, 0x82,
0xed, 0xbb, 0xb0, 0xed, 0xb3, 0xbf,
@@ -160,4 +161,186 @@
AssertConversion({ 'h', 0xdc00, 0xdc00, 'e' }, { 'h', 0xed, 0xb0, 0x80, 0xed, 0xb0, 0x80, 'e' });
}
+// Old versions of functions, here to compare answers with optimized versions.
+
+size_t CountModifiedUtf8Chars_reference(const char* utf8) {
+ size_t len = 0;
+ int ic;
+ while ((ic = *utf8++) != '\0') {
+ len++;
+ if ((ic & 0x80) == 0) {
+ // one-byte encoding
+ continue;
+ }
+ // two- or three-byte encoding
+ utf8++;
+ if ((ic & 0x20) == 0) {
+ // two-byte encoding
+ continue;
+ }
+ utf8++;
+ if ((ic & 0x10) == 0) {
+ // three-byte encoding
+ continue;
+ }
+
+ // four-byte encoding: needs to be converted into a surrogate
+ // pair.
+ utf8++;
+ len++;
+ }
+ return len;
+}
+
+static size_t CountUtf8Bytes_reference(const uint16_t* chars, size_t char_count) {
+ size_t result = 0;
+ while (char_count--) {
+ const uint16_t ch = *chars++;
+ if (ch > 0 && ch <= 0x7f) {
+ ++result;
+ } else if (ch >= 0xd800 && ch <= 0xdbff) {
+ if (char_count > 0) {
+ const uint16_t ch2 = *chars;
+ // If we find a properly paired surrogate, we emit it as a 4 byte
+ // UTF sequence. If we find an unpaired leading or trailing surrogate,
+ // we emit it as a 3 byte sequence like would have done earlier.
+ if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
+ chars++;
+ char_count--;
+
+ result += 4;
+ } else {
+ result += 3;
+ }
+ } else {
+ // This implies we found an unpaired trailing surrogate at the end
+ // of a string.
+ result += 3;
+ }
+ } else if (ch > 0x7ff) {
+ result += 3;
+ } else {
+ result += 2;
+ }
+ }
+ return result;
+}
+
+static void ConvertUtf16ToModifiedUtf8_reference(char* utf8_out, const uint16_t* utf16_in,
+ size_t char_count) {
+ while (char_count--) {
+ const uint16_t ch = *utf16_in++;
+ if (ch > 0 && ch <= 0x7f) {
+ *utf8_out++ = ch;
+ } else {
+ // Char_count == 0 here implies we've encountered an unpaired
+ // surrogate and we have no choice but to encode it as 3-byte UTF
+ // sequence. Note that unpaired surrogates can occur as a part of
+ // "normal" operation.
+ if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
+ const uint16_t ch2 = *utf16_in;
+
+ // Check if the other half of the pair is within the expected
+ // range. If it isn't, we will have to emit both "halves" as
+ // separate 3 byte sequences.
+ if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
+ utf16_in++;
+ char_count--;
+ const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
+ *utf8_out++ = (code_point >> 18) | 0xf0;
+ *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80;
+ *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80;
+ *utf8_out++ = (code_point & 0x3f) | 0x80;
+ continue;
+ }
+ }
+
+ if (ch > 0x07ff) {
+ // Three byte encoding.
+ *utf8_out++ = (ch >> 12) | 0xe0;
+ *utf8_out++ = ((ch >> 6) & 0x3f) | 0x80;
+ *utf8_out++ = (ch & 0x3f) | 0x80;
+ } else /*(ch > 0x7f || ch == 0)*/ {
+ // Two byte encoding.
+ *utf8_out++ = (ch >> 6) | 0xc0;
+ *utf8_out++ = (ch & 0x3f) | 0x80;
+ }
+ }
+ }
+}
+
+// Exhaustive test of converting a single code point to UTF-16, then UTF-8, and back again.
+
+static void codePointToSurrogatePair(uint32_t code_point, uint16_t &first, uint16_t &second) {
+ first = (code_point >> 10) + 0xd7c0;
+ second = (code_point & 0x03ff) + 0xdc00;
+}
+
+static void testConversions(uint16_t *buf, int char_count) {
+ char bytes_test[8], bytes_reference[8];
+ uint16_t out_buf_test[4], out_buf_reference[4];
+ int byte_count_test, byte_count_reference;
+ int char_count_test, char_count_reference;
+
+ // Calculate the number of utf-8 bytes for the utf-16 chars.
+ byte_count_reference = CountUtf8Bytes_reference(buf, char_count);
+ byte_count_test = CountUtf8Bytes(buf, char_count);
+ EXPECT_EQ(byte_count_reference, byte_count_test);
+
+ // Convert the utf-16 string to utf-8 bytes.
+ ConvertUtf16ToModifiedUtf8_reference(bytes_reference, buf, char_count);
+ ConvertUtf16ToModifiedUtf8(bytes_test, byte_count_test, buf, char_count);
+ for (int i = 0; i < byte_count_test; ++i) {
+ EXPECT_EQ(bytes_reference[i], bytes_test[i]);
+ }
+
+ // Calculate the number of utf-16 chars from the utf-8 bytes.
+ bytes_reference[byte_count_reference] = 0; // Reference function needs null termination.
+ char_count_reference = CountModifiedUtf8Chars_reference(bytes_reference);
+ char_count_test = CountModifiedUtf8Chars(bytes_test, byte_count_test);
+ EXPECT_EQ(char_count, char_count_reference);
+ EXPECT_EQ(char_count, char_count_test);
+
+ // Convert the utf-8 bytes back to utf-16 chars.
+ // Does not need copied _reference version of the function because the original
+ // function with the old API is retained for debug/testing code.
+ ConvertModifiedUtf8ToUtf16(out_buf_reference, bytes_reference);
+ ConvertModifiedUtf8ToUtf16(out_buf_test, char_count_test, bytes_test, byte_count_test);
+ for (int i = 0; i < char_count_test; ++i) {
+ EXPECT_EQ(buf[i], out_buf_reference[i]);
+ EXPECT_EQ(buf[i], out_buf_test[i]);
+ }
+}
+
+TEST_F(UtfTest, ExhaustiveBidirectionalCodePointCheck) {
+ for (int codePoint = 0; codePoint <= 0x10ffff; ++codePoint) {
+ uint16_t buf[4];
+ if (codePoint <= 0xffff) {
+ if (codePoint >= 0xd800 && codePoint <= 0xdfff) {
+ // According to the Unicode standard, no character will ever
+ // be assigned to these code points, and they can not be encoded
+ // into either utf-16 or utf-8.
+ continue;
+ }
+ buf[0] = 'h';
+ buf[1] = codePoint;
+ buf[2] = 'e';
+ testConversions(buf, 2);
+ testConversions(buf, 3);
+ testConversions(buf + 1, 1);
+ testConversions(buf + 1, 2);
+ } else {
+ buf[0] = 'h';
+ codePointToSurrogatePair(codePoint, buf[1], buf[2]);
+ buf[3] = 'e';
+ testConversions(buf, 2);
+ testConversions(buf, 3);
+ testConversions(buf, 4);
+ testConversions(buf + 1, 1);
+ testConversions(buf + 1, 2);
+ testConversions(buf + 1, 3);
+ }
+ }
+}
+
} // namespace art