Blame - runtime/utf_test.cc - platform_art

blob: 64e4eb798b194608efb7076bba8b5f7747e09e38 [file] [log] [blame]

Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	1	/*
				2	* Copyright (C) 2015 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	#include "utf.h"
				18
				19	#include "common_runtime_test.h"
				20	#include "utf-inl.h"
				21
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	22	#include <vector>
				23
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	24	namespace art {
				25
				26	class UtfTest : public CommonRuntimeTest {};
				27
				28	TEST_F(UtfTest, GetLeadingUtf16Char) {
				29	EXPECT_EQ(0xffff, GetLeadingUtf16Char(0xeeeeffff));
				30	}
				31
				32	TEST_F(UtfTest, GetTrailingUtf16Char) {
				33	EXPECT_EQ(0xffff, GetTrailingUtf16Char(0xffffeeee));
				34	EXPECT_EQ(0, GetTrailingUtf16Char(0x0000aaaa));
				35	}
				36
				37	#define EXPECT_ARRAY_POSITION(expected, end, start) \
				38	EXPECT_EQ(static_cast<uintptr_t>(expected), \
				39	reinterpret_cast<uintptr_t>(end) - reinterpret_cast<uintptr_t>(start));
				40
				41	// A test string containing one, two, three and four byte UTF-8 sequences.
				42	static const uint8_t kAllSequences[] = {
				43	0x24,
				44	0xc2, 0xa2,
				45	0xe2, 0x82, 0xac,
				46	0xf0, 0x9f, 0x8f, 0xa0,
				47	0x00
				48	};
				49
				50	// A test string that contains a UTF-8 encoding of a surrogate pair
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame^]	51	// (code point = U+10400).
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	52	static const uint8_t kSurrogateEncoding[] = {
				53	0xed, 0xa0, 0x81,
				54	0xed, 0xb0, 0x80,
				55	0x00
				56	};
				57
				58	TEST_F(UtfTest, GetUtf16FromUtf8) {
				59	const char* const start = reinterpret_cast<const char*>(kAllSequences);
				60	const char* ptr = start;
				61	uint32_t pair = 0;
				62
				63	// Single byte sequence.
				64	pair = GetUtf16FromUtf8(&ptr);
				65	EXPECT_EQ(0x24, GetLeadingUtf16Char(pair));
				66	EXPECT_EQ(0, GetTrailingUtf16Char(pair));
				67	EXPECT_ARRAY_POSITION(1, ptr, start);
				68
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame^]	69	// Two byte sequence.
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	70	pair = GetUtf16FromUtf8(&ptr);
				71	EXPECT_EQ(0xa2, GetLeadingUtf16Char(pair));
				72	EXPECT_EQ(0, GetTrailingUtf16Char(pair));
				73	EXPECT_ARRAY_POSITION(3, ptr, start);
				74
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame^]	75	// Three byte sequence.
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	76	pair = GetUtf16FromUtf8(&ptr);
				77	EXPECT_EQ(0x20ac, GetLeadingUtf16Char(pair));
				78	EXPECT_EQ(0, GetTrailingUtf16Char(pair));
				79	EXPECT_ARRAY_POSITION(6, ptr, start);
				80
				81	// Four byte sequence
				82	pair = GetUtf16FromUtf8(&ptr);
				83	EXPECT_EQ(0xd83c, GetLeadingUtf16Char(pair));
				84	EXPECT_EQ(0xdfe0, GetTrailingUtf16Char(pair));
				85	EXPECT_ARRAY_POSITION(10, ptr, start);
				86
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame^]	87	// Null terminator.
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	88	pair = GetUtf16FromUtf8(&ptr);
				89	EXPECT_EQ(0, GetLeadingUtf16Char(pair));
				90	EXPECT_EQ(0, GetTrailingUtf16Char(pair));
				91	EXPECT_ARRAY_POSITION(11, ptr, start);
				92	}
				93
				94	TEST_F(UtfTest, GetUtf16FromUtf8_SurrogatesPassThrough) {
				95	const char* const start = reinterpret_cast<const char *>(kSurrogateEncoding);
				96	const char* ptr = start;
				97	uint32_t pair = 0;
				98
				99	pair = GetUtf16FromUtf8(&ptr);
				100	EXPECT_EQ(0xd801, GetLeadingUtf16Char(pair));
				101	EXPECT_EQ(0, GetTrailingUtf16Char(pair));
				102	EXPECT_ARRAY_POSITION(3, ptr, start);
				103
				104	pair = GetUtf16FromUtf8(&ptr);
				105	EXPECT_EQ(0xdc00, GetLeadingUtf16Char(pair));
				106	EXPECT_EQ(0, GetTrailingUtf16Char(pair));
				107	EXPECT_ARRAY_POSITION(6, ptr, start);
				108	}
				109
				110	TEST_F(UtfTest, CountModifiedUtf8Chars) {
				111	EXPECT_EQ(5u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kAllSequences)));
				112	EXPECT_EQ(2u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kSurrogateEncoding)));
				113	}
				114
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	115	static void AssertConversion(const std::vector<uint16_t> input,
				116	const std::vector<uint8_t> expected) {
				117	ASSERT_EQ(expected.size(), CountUtf8Bytes(&input[0], input.size()));
				118
				119	std::vector<uint8_t> output(expected.size());
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame^]	120	ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), expected.size(),
				121	&input[0], input.size());
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	122	EXPECT_EQ(expected, output);
				123	}
				124
				125	TEST_F(UtfTest, CountAndConvertUtf8Bytes) {
				126	// Surrogate pairs will be converted into 4 byte sequences.
				127	AssertConversion({ 0xd801, 0xdc00 }, { 0xf0, 0x90, 0x90, 0x80 });
				128
				129	// Three byte encodings that are below & above the leading surrogate
				130	// range respectively.
				131	AssertConversion({ 0xdef0 }, { 0xed, 0xbb, 0xb0 });
				132	AssertConversion({ 0xdcff }, { 0xed, 0xb3, 0xbf });
				133	// Two byte encoding.
				134	AssertConversion({ 0x0101 }, { 0xc4, 0x81 });
				135
				136	// Two byte special case : 0 must use an overlong encoding.
				137	AssertConversion({ 0x0101, 0x0000 }, { 0xc4, 0x81, 0xc0, 0x80 });
				138
				139	// One byte encoding.
				140	AssertConversion({ 'h', 'e', 'l', 'l', 'o' }, { 0x68, 0x65, 0x6c, 0x6c, 0x6f });
				141
				142	AssertConversion({
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame^]	143	0xd802, 0xdc02, // Surrogate pair.
				144	0xdef0, 0xdcff, // Three byte encodings.
				145	0x0101, 0x0000, // Two byte encodings.
				146	'p' , 'p' // One byte encoding.
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	147	}, {
				148	0xf0, 0x90, 0xa0, 0x82,
				149	0xed, 0xbb, 0xb0, 0xed, 0xb3, 0xbf,
				150	0xc4, 0x81, 0xc0, 0x80,
				151	0x70, 0x70
				152	});
				153	}
				154
				155	TEST_F(UtfTest, CountAndConvertUtf8Bytes_UnpairedSurrogate) {
				156	// Unpaired trailing surrogate at the end of input.
				157	AssertConversion({ 'h', 'e', 0xd801 }, { 'h', 'e', 0xed, 0xa0, 0x81 });
				158	// Unpaired (or incorrectly paired) surrogates in the middle of the input.
				159	AssertConversion({ 'h', 0xd801, 'e' }, { 'h', 0xed, 0xa0, 0x81, 'e' });
				160	AssertConversion({ 'h', 0xd801, 0xd801, 'e' }, { 'h', 0xed, 0xa0, 0x81, 0xed, 0xa0, 0x81, 'e' });
				161	AssertConversion({ 'h', 0xdc00, 0xdc00, 'e' }, { 'h', 0xed, 0xb0, 0x80, 0xed, 0xb0, 0x80, 'e' });
				162	}
				163
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame^]	164	// Old versions of functions, here to compare answers with optimized versions.
				165
				166	size_t CountModifiedUtf8Chars_reference(const char* utf8) {
				167	size_t len = 0;
				168	int ic;
				169	while ((ic = *utf8++) != '\0') {
				170	len++;
				171	if ((ic & 0x80) == 0) {
				172	// one-byte encoding
				173	continue;
				174	}
				175	// two- or three-byte encoding
				176	utf8++;
				177	if ((ic & 0x20) == 0) {
				178	// two-byte encoding
				179	continue;
				180	}
				181	utf8++;
				182	if ((ic & 0x10) == 0) {
				183	// three-byte encoding
				184	continue;
				185	}
				186
				187	// four-byte encoding: needs to be converted into a surrogate
				188	// pair.
				189	utf8++;
				190	len++;
				191	}
				192	return len;
				193	}
				194
				195	static size_t CountUtf8Bytes_reference(const uint16_t* chars, size_t char_count) {
				196	size_t result = 0;
				197	while (char_count--) {
				198	const uint16_t ch = *chars++;
				199	if (ch > 0 && ch <= 0x7f) {
				200	++result;
				201	} else if (ch >= 0xd800 && ch <= 0xdbff) {
				202	if (char_count > 0) {
				203	const uint16_t ch2 = *chars;
				204	// If we find a properly paired surrogate, we emit it as a 4 byte
				205	// UTF sequence. If we find an unpaired leading or trailing surrogate,
				206	// we emit it as a 3 byte sequence like would have done earlier.
				207	if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
				208	chars++;
				209	char_count--;
				210
				211	result += 4;
				212	} else {
				213	result += 3;
				214	}
				215	} else {
				216	// This implies we found an unpaired trailing surrogate at the end
				217	// of a string.
				218	result += 3;
				219	}
				220	} else if (ch > 0x7ff) {
				221	result += 3;
				222	} else {
				223	result += 2;
				224	}
				225	}
				226	return result;
				227	}
				228
				229	static void ConvertUtf16ToModifiedUtf8_reference(char* utf8_out, const uint16_t* utf16_in,
				230	size_t char_count) {
				231	while (char_count--) {
				232	const uint16_t ch = *utf16_in++;
				233	if (ch > 0 && ch <= 0x7f) {
				234	*utf8_out++ = ch;
				235	} else {
				236	// Char_count == 0 here implies we've encountered an unpaired
				237	// surrogate and we have no choice but to encode it as 3-byte UTF
				238	// sequence. Note that unpaired surrogates can occur as a part of
				239	// "normal" operation.
				240	if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
				241	const uint16_t ch2 = *utf16_in;
				242
				243	// Check if the other half of the pair is within the expected
				244	// range. If it isn't, we will have to emit both "halves" as
				245	// separate 3 byte sequences.
				246	if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
				247	utf16_in++;
				248	char_count--;
				249	const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
				250	*utf8_out++ = (code_point >> 18) \| 0xf0;
				251	*utf8_out++ = ((code_point >> 12) & 0x3f) \| 0x80;
				252	*utf8_out++ = ((code_point >> 6) & 0x3f) \| 0x80;
				253	*utf8_out++ = (code_point & 0x3f) \| 0x80;
				254	continue;
				255	}
				256	}
				257
				258	if (ch > 0x07ff) {
				259	// Three byte encoding.
				260	*utf8_out++ = (ch >> 12) \| 0xe0;
				261	*utf8_out++ = ((ch >> 6) & 0x3f) \| 0x80;
				262	*utf8_out++ = (ch & 0x3f) \| 0x80;
				263	} else /(ch > 0x7f \|\| ch == 0)/ {
				264	// Two byte encoding.
				265	*utf8_out++ = (ch >> 6) \| 0xc0;
				266	*utf8_out++ = (ch & 0x3f) \| 0x80;
				267	}
				268	}
				269	}
				270	}
				271
				272	// Exhaustive test of converting a single code point to UTF-16, then UTF-8, and back again.
				273
				274	static void codePointToSurrogatePair(uint32_t code_point, uint16_t &first, uint16_t &second) {
				275	first = (code_point >> 10) + 0xd7c0;
				276	second = (code_point & 0x03ff) + 0xdc00;
				277	}
				278
				279	static void testConversions(uint16_t *buf, int char_count) {
				280	char bytes_test[8], bytes_reference[8];
				281	uint16_t out_buf_test[4], out_buf_reference[4];
				282	int byte_count_test, byte_count_reference;
				283	int char_count_test, char_count_reference;
				284
				285	// Calculate the number of utf-8 bytes for the utf-16 chars.
				286	byte_count_reference = CountUtf8Bytes_reference(buf, char_count);
				287	byte_count_test = CountUtf8Bytes(buf, char_count);
				288	EXPECT_EQ(byte_count_reference, byte_count_test);
				289
				290	// Convert the utf-16 string to utf-8 bytes.
				291	ConvertUtf16ToModifiedUtf8_reference(bytes_reference, buf, char_count);
				292	ConvertUtf16ToModifiedUtf8(bytes_test, byte_count_test, buf, char_count);
				293	for (int i = 0; i < byte_count_test; ++i) {
				294	EXPECT_EQ(bytes_reference[i], bytes_test[i]);
				295	}
				296
				297	// Calculate the number of utf-16 chars from the utf-8 bytes.
				298	bytes_reference[byte_count_reference] = 0; // Reference function needs null termination.
				299	char_count_reference = CountModifiedUtf8Chars_reference(bytes_reference);
				300	char_count_test = CountModifiedUtf8Chars(bytes_test, byte_count_test);
				301	EXPECT_EQ(char_count, char_count_reference);
				302	EXPECT_EQ(char_count, char_count_test);
				303
				304	// Convert the utf-8 bytes back to utf-16 chars.
				305	// Does not need copied _reference version of the function because the original
				306	// function with the old API is retained for debug/testing code.
				307	ConvertModifiedUtf8ToUtf16(out_buf_reference, bytes_reference);
				308	ConvertModifiedUtf8ToUtf16(out_buf_test, char_count_test, bytes_test, byte_count_test);
				309	for (int i = 0; i < char_count_test; ++i) {
				310	EXPECT_EQ(buf[i], out_buf_reference[i]);
				311	EXPECT_EQ(buf[i], out_buf_test[i]);
				312	}
				313	}
				314
				315	TEST_F(UtfTest, ExhaustiveBidirectionalCodePointCheck) {
				316	for (int codePoint = 0; codePoint <= 0x10ffff; ++codePoint) {
				317	uint16_t buf[4];
				318	if (codePoint <= 0xffff) {
				319	if (codePoint >= 0xd800 && codePoint <= 0xdfff) {
				320	// According to the Unicode standard, no character will ever
				321	// be assigned to these code points, and they can not be encoded
				322	// into either utf-16 or utf-8.
				323	continue;
				324	}
				325	buf[0] = 'h';
				326	buf[1] = codePoint;
				327	buf[2] = 'e';
				328	testConversions(buf, 2);
				329	testConversions(buf, 3);
				330	testConversions(buf + 1, 1);
				331	testConversions(buf + 1, 2);
				332	} else {
				333	buf[0] = 'h';
				334	codePointToSurrogatePair(codePoint, buf[1], buf[2]);
				335	buf[3] = 'e';
				336	testConversions(buf, 2);
				337	testConversions(buf, 3);
				338	testConversions(buf, 4);
				339	testConversions(buf + 1, 1);
				340	testConversions(buf + 1, 2);
				341	testConversions(buf + 1, 3);
				342	}
				343	}
				344	}
				345
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	346	} // namespace art