Blame - libdex/DexUtf.cpp - platform_dalvik

blob: df49d18313ce247849f14798e9598aa89bef3fce [file] [log] [blame]

Aart Bik	1a65052	2015-07-08 21:20:41 +0000	[diff] [blame^]	1	/*
				2	* Copyright (C) 2011 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	/*
				18	* Validate and manipulate MUTF-8 encoded string data.
				19	*/
				20
				21	#include "DexUtf.h"
				22
				23	/* Compare two '\0'-terminated modified UTF-8 strings, using Unicode
				24	* code point values for comparison. This treats different encodings
				25	* for the same code point as equivalent, except that only a real '\0'
				26	* byte is considered the string terminator. The return value is as
				27	* for strcmp(). */
				28	int dexUtf8Cmp(const char* s1, const char* s2) {
				29	for (;;) {
				30	if (*s1 == '\0') {
				31	if (*s2 == '\0') {
				32	return 0;
				33	}
				34	return -1;
				35	} else if (*s2 == '\0') {
				36	return 1;
				37	}
				38
				39	int utf1 = dexGetUtf16FromUtf8(&s1);
				40	int utf2 = dexGetUtf16FromUtf8(&s2);
				41	int diff = utf1 - utf2;
				42
				43	if (diff != 0) {
				44	return diff;
				45	}
				46	}
				47	}
				48
				49	/* for dexIsValidMemberNameUtf8(), a bit vector indicating valid low ascii */
				50	u4 DEX_MEMBER_VALID_LOW_ASCII[4] = {
				51	0x00000000, // 00..1f low control characters; nothing valid
				52	0x03ff2010, // 20..3f digits and symbols; valid: '0'..'9', '$', '-'
				53	0x87fffffe, // 40..5f uppercase etc.; valid: 'A'..'Z', '_'
				54	0x07fffffe // 60..7f lowercase etc.; valid: 'a'..'z'
				55	};
				56
				57	/* Helper for dexIsValidMemberNameUtf8(); do not call directly. */
				58	bool dexIsValidMemberNameUtf8_0(const char** pUtf8Ptr) {
				59	/*
				60	* It's a multibyte encoded character. Decode it and analyze. We
				61	* accept anything that isn't (a) an improperly encoded low value,
				62	* (b) an improper surrogate pair, (c) an encoded '\0', (d) a high
				63	* control character, or (e) a high space, layout, or special
				64	* character (U+00a0, U+2000..U+200f, U+2028..U+202f,
				65	* U+fff0..U+ffff). This is all specified in the dex format
				66	* document.
				67	*/
				68
				69	u2 utf16 = dexGetUtf16FromUtf8(pUtf8Ptr);
				70
				71	// Perform follow-up tests based on the high 8 bits.
				72	switch (utf16 >> 8) {
				73	case 0x00: {
				74	// It's only valid if it's above the ISO-8859-1 high space (0xa0).
				75	return (utf16 > 0x00a0);
				76	}
				77	case 0xd8:
				78	case 0xd9:
				79	case 0xda:
				80	case 0xdb: {
				81	/*
				82	* It's a leading surrogate. Check to see that a trailing
				83	* surrogate follows.
				84	*/
				85	utf16 = dexGetUtf16FromUtf8(pUtf8Ptr);
				86	return (utf16 >= 0xdc00) && (utf16 <= 0xdfff);
				87	}
				88	case 0xdc:
				89	case 0xdd:
				90	case 0xde:
				91	case 0xdf: {
				92	// It's a trailing surrogate, which is not valid at this point.
				93	return false;
				94	}
				95	case 0x20:
				96	case 0xff: {
				97	// It's in the range that has spaces, controls, and specials.
				98	switch (utf16 & 0xfff8) {
				99	case 0x2000:
				100	case 0x2008:
				101	case 0x2028:
				102	case 0xfff0:
				103	case 0xfff8: {
				104	return false;
				105	}
				106	}
				107	break;
				108	}
				109	}
				110
				111	return true;
				112	}
				113
				114	/* Return whether the given string is a valid field or method name. */
				115	bool dexIsValidMemberName(const char* s) {
				116	bool angleName = false;
				117
				118	switch (*s) {
				119	case '\0': {
				120	// The empty string is not a valid name.
				121	return false;
				122	}
				123	case '<': {
				124	/*
				125	* '<' is allowed only at the start of a name, and if present,
				126	* means that the name must end with '>'.
				127	*/
				128	angleName = true;
				129	s++;
				130	break;
				131	}
				132	}
				133
				134	for (;;) {
				135	switch (*s) {
				136	case '\0': {
				137	return !angleName;
				138	}
				139	case '>': {
				140	return angleName && s[1] == '\0';
				141	}
				142	}
				143	if (!dexIsValidMemberNameUtf8(&s)) {
				144	return false;
				145	}
				146	}
				147	}
				148
				149	/* Helper for validating type descriptors and class names, which is parametric
				150	* with respect to type vs. class and dot vs. slash. */
				151	static bool isValidTypeDescriptorOrClassName(const char* s, bool isClassName,
				152	bool dotSeparator) {
				153	int arrayCount = 0;
				154
				155	while (*s == '[') {
				156	arrayCount++;
				157	s++;
				158	}
				159
				160	if (arrayCount > 255) {
				161	// Arrays may have no more than 255 dimensions.
				162	return false;
				163	}
				164
				165	if (arrayCount != 0) {
				166	/*
				167	* If we're looking at an array of some sort, then it doesn't
				168	* matter if what is being asked for is a class name; the
				169	* format looks the same as a type descriptor in that case, so
				170	* treat it as such.
				171	*/
				172	isClassName = false;
				173	}
				174
				175	if (!isClassName) {
				176	/*
				177	* We are looking for a descriptor. Either validate it as a
				178	* single-character primitive type, or continue on to check the
				179	* embedded class name (bracketed by "L" and ";").
				180	*/
				181	switch (*(s++)) {
				182	case 'B':
				183	case 'C':
				184	case 'D':
				185	case 'F':
				186	case 'I':
				187	case 'J':
				188	case 'S':
				189	case 'Z': {
				190	// These are all single-character descriptors for primitive types.
				191	return (*s == '\0');
				192	}
				193	case 'V': {
				194	// Non-array void is valid, but you can't have an array of void.
				195	return (arrayCount == 0) && (*s == '\0');
				196	}
				197	case 'L': {
				198	// Class name: Break out and continue below.
				199	break;
				200	}
				201	default: {
				202	// Oddball descriptor character.
				203	return false;
				204	}
				205	}
				206	}
				207
				208	/*
				209	* We just consumed the 'L' that introduces a class name as part
				210	* of a type descriptor, or we are looking for an unadorned class
				211	* name.
				212	*/
				213
				214	bool sepOrFirst = true; // first character or just encountered a separator.
				215	for (;;) {
				216	u1 c = (u1) *s;
				217	switch (c) {
				218	case '\0': {
				219	/*
				220	* Premature end for a type descriptor, but valid for
				221	* a class name as long as we haven't encountered an
				222	* empty component (including the degenerate case of
				223	* the empty string "").
				224	*/
				225	return isClassName && !sepOrFirst;
				226	}
				227	case ';': {
				228	/*
				229	* Invalid character for a class name, but the
				230	* legitimate end of a type descriptor. In the latter
				231	* case, make sure that this is the end of the string
				232	* and that it doesn't end with an empty component
				233	* (including the degenerate case of "L;").
				234	*/
				235	return !isClassName && !sepOrFirst && (s[1] == '\0');
				236	}
				237	case '/':
				238	case '.': {
				239	if (dotSeparator != (c == '.')) {
				240	// The wrong separator character.
				241	return false;
				242	}
				243	if (sepOrFirst) {
				244	// Separator at start or two separators in a row.
				245	return false;
				246	}
				247	sepOrFirst = true;
				248	s++;
				249	break;
				250	}
				251	default: {
				252	if (!dexIsValidMemberNameUtf8(&s)) {
				253	return false;
				254	}
				255	sepOrFirst = false;
				256	break;
				257	}
				258	}
				259	}
				260	}
				261
				262	/* Return whether the given string is a valid type descriptor. */
				263	bool dexIsValidTypeDescriptor(const char* s) {
				264	return isValidTypeDescriptorOrClassName(s, false, false);
				265	}
				266
				267	/* (documented in header) */
				268	bool dexIsValidClassName(const char* s, bool dotSeparator) {
				269	return isValidTypeDescriptorOrClassName(s, true, dotSeparator);
				270	}
				271
				272	/* Return whether the given string is a valid reference descriptor. This
				273	* is true if dexIsValidTypeDescriptor() returns true and the descriptor
				274	* is for a class or array and not a primitive type. */
				275	bool dexIsReferenceDescriptor(const char* s) {
				276	if (!dexIsValidTypeDescriptor(s)) {
				277	return false;
				278	}
				279
				280	return (s[0] == 'L') \|\| (s[0] == '[');
				281	}
				282
				283	/* Return whether the given string is a valid class descriptor. This
				284	* is true if dexIsValidTypeDescriptor() returns true and the descriptor
				285	* is for a class and not an array or primitive type. */
				286	bool dexIsClassDescriptor(const char* s) {
				287	if (!dexIsValidTypeDescriptor(s)) {
				288	return false;
				289	}
				290
				291	return s[0] == 'L';
				292	}
				293
				294	/* Return whether the given string is a valid field type descriptor. This
				295	* is true if dexIsValidTypeDescriptor() returns true and the descriptor
				296	* is for anything but "void". */
				297	bool dexIsFieldDescriptor(const char* s) {
				298	if (!dexIsValidTypeDescriptor(s)) {
				299	return false;
				300	}
				301
				302	return s[0] != 'V';
				303	}
				304