/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
/*
 * Validate and manipulate MUTF-8 encoded string data.
 */
20
21#include "DexUtf.h"
22
/*
 * Order two '\0'-terminated modified UTF-8 strings.
 *
 * Both strings are decoded in lock step with dexGetUtf16FromUtf8() and the
 * decoded values are compared, so the ordering is by decoded value rather
 * than by raw byte. Only a literal '\0' byte ends a string.
 *
 * Returns 0 when the strings are equal, a negative value when s1 sorts
 * first (or is a proper prefix of s2), and a positive value otherwise,
 * in the manner of strcmp().
 */
int dexUtf8Cmp(const char* s1, const char* s2) {
    // Walk both strings for as long as neither has hit its terminator.
    while (*s1 != '\0' && *s2 != '\0') {
        const int unit1 = dexGetUtf16FromUtf8(&s1);
        const int unit2 = dexGetUtf16FromUtf8(&s2);

        if (unit1 != unit2) {
            return unit1 - unit2;
        }
    }

    // At least one string is exhausted; the longer one sorts last.
    if (*s1 != '\0') {
        return 1;
    }
    if (*s2 != '\0') {
        return -1;
    }
    return 0;
}
48
49/* for dexIsValidMemberNameUtf8(), a bit vector indicating valid low ascii */
50u4 DEX_MEMBER_VALID_LOW_ASCII[4] = {
51 0x00000000, // 00..1f low control characters; nothing valid
52 0x03ff2010, // 20..3f digits and symbols; valid: '0'..'9', '$', '-'
53 0x87fffffe, // 40..5f uppercase etc.; valid: 'A'..'Z', '_'
54 0x07fffffe // 60..7f lowercase etc.; valid: 'a'..'z'
55};
56
57/* Helper for dexIsValidMemberNameUtf8(); do not call directly. */
58bool dexIsValidMemberNameUtf8_0(const char** pUtf8Ptr) {
59 /*
60 * It's a multibyte encoded character. Decode it and analyze. We
61 * accept anything that isn't (a) an improperly encoded low value,
62 * (b) an improper surrogate pair, (c) an encoded '\0', (d) a high
63 * control character, or (e) a high space, layout, or special
64 * character (U+00a0, U+2000..U+200f, U+2028..U+202f,
65 * U+fff0..U+ffff). This is all specified in the dex format
66 * document.
67 */
68
69 u2 utf16 = dexGetUtf16FromUtf8(pUtf8Ptr);
70
71 // Perform follow-up tests based on the high 8 bits.
72 switch (utf16 >> 8) {
73 case 0x00: {
74 // It's only valid if it's above the ISO-8859-1 high space (0xa0).
75 return (utf16 > 0x00a0);
76 }
77 case 0xd8:
78 case 0xd9:
79 case 0xda:
80 case 0xdb: {
81 /*
82 * It's a leading surrogate. Check to see that a trailing
83 * surrogate follows.
84 */
85 utf16 = dexGetUtf16FromUtf8(pUtf8Ptr);
86 return (utf16 >= 0xdc00) && (utf16 <= 0xdfff);
87 }
88 case 0xdc:
89 case 0xdd:
90 case 0xde:
91 case 0xdf: {
92 // It's a trailing surrogate, which is not valid at this point.
93 return false;
94 }
95 case 0x20:
96 case 0xff: {
97 // It's in the range that has spaces, controls, and specials.
98 switch (utf16 & 0xfff8) {
99 case 0x2000:
100 case 0x2008:
101 case 0x2028:
102 case 0xfff0:
103 case 0xfff8: {
104 return false;
105 }
106 }
107 break;
108 }
109 }
110
111 return true;
112}
113
/*
 * Return whether the given string is a valid field or method name.
 *
 * A name is either a plain run of one or more valid name characters, or
 * such a run wrapped in angle brackets ("<init>", "<clinit>"). '<' is only
 * permitted as the very first character and then requires a closing '>'
 * as the very last character. The empty string and the bare brackets "<>"
 * are both rejected, since the dex format grammar requires at least one
 * name character in either form.
 *
 * Individual characters are checked with dexIsValidMemberNameUtf8(), which
 * also advances the scan position.
 */
bool dexIsValidMemberName(const char* s) {
    bool angleName = false;

    if (*s == '\0') {
        // The empty string is not a valid name.
        return false;
    }

    if (*s == '<') {
        // Bracketed form: the name must end with '>'.
        angleName = true;
        s++;
    }

    // Whether at least one name character has been consumed so far; this
    // is what distinguishes "<x>" from the invalid "<>".
    bool sawNameChar = false;

    for (;;) {
        switch (*s) {
            case '\0': {
                // A bracketed name that never closed is invalid.
                return !angleName;
            }
            case '>': {
                // Only valid as the final character of a non-empty
                // bracketed name.
                return angleName && sawNameChar && s[1] == '\0';
            }
        }

        if (!dexIsValidMemberNameUtf8(&s)) {
            return false;
        }
        sawNameChar = true;
    }
}
148
149/* Helper for validating type descriptors and class names, which is parametric
150 * with respect to type vs. class and dot vs. slash. */
151static bool isValidTypeDescriptorOrClassName(const char* s, bool isClassName,
152 bool dotSeparator) {
153 int arrayCount = 0;
154
155 while (*s == '[') {
156 arrayCount++;
157 s++;
158 }
159
160 if (arrayCount > 255) {
161 // Arrays may have no more than 255 dimensions.
162 return false;
163 }
164
165 if (arrayCount != 0) {
166 /*
167 * If we're looking at an array of some sort, then it doesn't
168 * matter if what is being asked for is a class name; the
169 * format looks the same as a type descriptor in that case, so
170 * treat it as such.
171 */
172 isClassName = false;
173 }
174
175 if (!isClassName) {
176 /*
177 * We are looking for a descriptor. Either validate it as a
178 * single-character primitive type, or continue on to check the
179 * embedded class name (bracketed by "L" and ";").
180 */
181 switch (*(s++)) {
182 case 'B':
183 case 'C':
184 case 'D':
185 case 'F':
186 case 'I':
187 case 'J':
188 case 'S':
189 case 'Z': {
190 // These are all single-character descriptors for primitive types.
191 return (*s == '\0');
192 }
193 case 'V': {
194 // Non-array void is valid, but you can't have an array of void.
195 return (arrayCount == 0) && (*s == '\0');
196 }
197 case 'L': {
198 // Class name: Break out and continue below.
199 break;
200 }
201 default: {
202 // Oddball descriptor character.
203 return false;
204 }
205 }
206 }
207
208 /*
209 * We just consumed the 'L' that introduces a class name as part
210 * of a type descriptor, or we are looking for an unadorned class
211 * name.
212 */
213
214 bool sepOrFirst = true; // first character or just encountered a separator.
215 for (;;) {
216 u1 c = (u1) *s;
217 switch (c) {
218 case '\0': {
219 /*
220 * Premature end for a type descriptor, but valid for
221 * a class name as long as we haven't encountered an
222 * empty component (including the degenerate case of
223 * the empty string "").
224 */
225 return isClassName && !sepOrFirst;
226 }
227 case ';': {
228 /*
229 * Invalid character for a class name, but the
230 * legitimate end of a type descriptor. In the latter
231 * case, make sure that this is the end of the string
232 * and that it doesn't end with an empty component
233 * (including the degenerate case of "L;").
234 */
235 return !isClassName && !sepOrFirst && (s[1] == '\0');
236 }
237 case '/':
238 case '.': {
239 if (dotSeparator != (c == '.')) {
240 // The wrong separator character.
241 return false;
242 }
243 if (sepOrFirst) {
244 // Separator at start or two separators in a row.
245 return false;
246 }
247 sepOrFirst = true;
248 s++;
249 break;
250 }
251 default: {
252 if (!dexIsValidMemberNameUtf8(&s)) {
253 return false;
254 }
255 sepOrFirst = false;
256 break;
257 }
258 }
259 }
260}
261
/*
 * Return whether the given string is a valid type descriptor, i.e. a
 * primitive type character, "V", an "L...;" class reference using '/'
 * separators, or an array of a non-void type.
 */
bool dexIsValidTypeDescriptor(const char* s) {
    const bool isClassName = false;
    const bool dotSeparator = false;    // descriptors separate with '/'

    return isValidTypeDescriptorOrClassName(s, isClassName, dotSeparator);
}
266
/*
 * (documented in header)
 *
 * Validates s as a class name, with dotSeparator selecting '.' (true) or
 * '/' (false) as the required package separator.
 */
bool dexIsValidClassName(const char* s, bool dotSeparator) {
    const bool isClassName = true;

    return isValidTypeDescriptorOrClassName(s, isClassName, dotSeparator);
}
271
/*
 * Return whether the given string is a valid reference descriptor: it must
 * pass dexIsValidTypeDescriptor() and describe a class ('L') or an array
 * ('[') rather than a primitive type.
 */
bool dexIsReferenceDescriptor(const char* s) {
    // The first character is only inspected once the string is known to
    // be a well-formed descriptor.
    return dexIsValidTypeDescriptor(s) && (s[0] == 'L' || s[0] == '[');
}
282
/*
 * Return whether the given string is a valid class descriptor: it must
 * pass dexIsValidTypeDescriptor() and start with 'L', which excludes both
 * arrays and primitive types.
 */
bool dexIsClassDescriptor(const char* s) {
    return dexIsValidTypeDescriptor(s) && s[0] == 'L';
}
293
/*
 * Return whether the given string is a valid field type descriptor: it
 * must pass dexIsValidTypeDescriptor() and must not be "void" (that is,
 * it must not start with 'V').
 */
bool dexIsFieldDescriptor(const char* s) {
    return dexIsValidTypeDescriptor(s) && s[0] != 'V';
}
304