blob: 8ffdb9c53d89c7dc5aab8c6c7092631079d243bb [file] [log] [blame]
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -07001/* $OpenBSD: citrus_utf8.c,v 1.6 2012/12/05 23:19:59 deraadt Exp $ */
2
3/*-
4 * Copyright (c) 2002-2004 Tim J. Robbins
Elliott Hughes29c7f0b2012-10-22 17:05:27 -07005 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070010 * 1. Redistributions of source code must retain the above copyright
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070011 * notice, this list of conditions and the following disclaimer.
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070012 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070015 *
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070016 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070026 * SUCH DAMAGE.
27 */
28
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070029#include <errno.h>
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070030#include <string.h>
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070031#include <sys/param.h>
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070032#include <wchar.h>
33
//
// This file is basically OpenBSD's citrus_utf8.c but rewritten to not require a 12-byte mbstate_t
// so we're backwards-compatible with our LP32 ABI where mbstate_t was only 4 bytes. An additional
// advantage of this is that callers who don't supply their own mbstate_t won't be accessing shared
// state.
//
// We also implement the POSIX interface directly rather than being accessed via function pointers.
//
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070042
// Sentinel return values used by the conversion functions in this file.
// They are typed, file-local constants rather than preprocessor macros so that
// they obey normal scoping rules and always have type size_t.
//
// ERR_ILLEGAL_SEQUENCE    == (size_t)-1: the input is not a valid encoding.
// ERR_INCOMPLETE_SEQUENCE == (size_t)-2: the input ended in the middle of a character.
static const size_t ERR_ILLEGAL_SEQUENCE = static_cast<size_t>(-1);
static const size_t ERR_INCOMPLETE_SEQUENCE = static_cast<size_t>(-2);
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070045
// Reports whether the given conversion state describes the initial state.
// The argument is ignored: this implementation keeps no conversion state, so
// every state is the initial state and the result is always nonzero (1).
int mbsinit(const mbstate_t*) {
  const int in_initial_state = 1;
  return in_initial_state;
}
50
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070051size_t mbrtowc(wchar_t* pwc, const char* s, size_t n, mbstate_t*) {
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070052 if (s == NULL) {
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070053 s = "";
54 n = 1;
55 pwc = NULL;
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070056 }
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070057
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070058 if (n == 0) {
Elliott Hughes0a5e26d2014-04-28 17:51:13 -070059 return 0;
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070060 }
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070061
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070062 int ch;
63 if (((ch = static_cast<uint8_t>(*s)) & ~0x7f) == 0) {
64 // Fast path for plain ASCII characters.
65 if (pwc != NULL) {
66 *pwc = ch;
Elliott Hughes3d7a0d92014-04-29 14:46:56 -070067 }
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070068 return (ch != '\0' ? 1 : 0);
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070069 }
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070070
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070071 // Determine the number of octets that make up this character
72 // from the first octet, and a mask that extracts the
73 // interesting bits of the first octet. We already know
74 // the character is at least two bytes long.
Elliott Hughes568c86a2014-05-01 16:49:55 -070075 size_t length;
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070076 int mask;
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070077
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070078 // We also specify a lower bound for the character code to
79 // detect redundant, non-"shortest form" encodings. For
80 // example, the sequence C0 80 is _not_ a legal representation
81 // of the null character. This enforces a 1-to-1 mapping
82 // between character codes and their multibyte representations.
83 wchar_t lower_bound;
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070084
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070085 ch = static_cast<uint8_t>(*s);
86 if ((ch & 0x80) == 0) {
87 mask = 0x7f;
88 length = 1;
89 lower_bound = 0;
90 } else if ((ch & 0xe0) == 0xc0) {
91 mask = 0x1f;
92 length = 2;
93 lower_bound = 0x80;
94 } else if ((ch & 0xf0) == 0xe0) {
95 mask = 0x0f;
96 length = 3;
97 lower_bound = 0x800;
98 } else if ((ch & 0xf8) == 0xf0) {
99 mask = 0x07;
100 length = 4;
101 lower_bound = 0x10000;
Elliott Hughes77e944f2014-04-04 17:34:51 -0700102 } else {
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700103 // Malformed input; input is not UTF-8. See RFC 3629.
104 errno = EILSEQ;
105 return ERR_ILLEGAL_SEQUENCE;
Elliott Hughes29c7f0b2012-10-22 17:05:27 -0700106 }
Elliott Hughes29c7f0b2012-10-22 17:05:27 -0700107
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700108 // Decode the octet sequence representing the character in chunks
109 // of 6 bits, most significant first.
110 wchar_t wch = static_cast<uint8_t>(*s++) & mask;
Elliott Hughes568c86a2014-05-01 16:49:55 -0700111 size_t i;
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700112 for (i = 1; i < MIN(length, n); i++) {
113 if ((*s & 0xc0) != 0x80) {
114 // Malformed input; bad characters in the middle of a character.
Elliott Hughes05493712014-04-17 17:30:03 -0700115 errno = EILSEQ;
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700116 return ERR_ILLEGAL_SEQUENCE;
117 }
118 wch <<= 6;
119 wch |= *s++ & 0x3f;
120 }
121 if (i < length) {
122 return ERR_INCOMPLETE_SEQUENCE;
123 }
124 if (wch < lower_bound) {
125 // Malformed input; redundant encoding.
126 errno = EILSEQ;
127 return ERR_ILLEGAL_SEQUENCE;
128 }
129 if ((wch >= 0xd800 && wch <= 0xdfff) || wch == 0xfffe || wch == 0xffff) {
130 // Malformed input; invalid code points.
131 errno = EILSEQ;
132 return ERR_ILLEGAL_SEQUENCE;
133 }
134 if (pwc != NULL) {
135 *pwc = wch;
136 }
137 return (wch == L'\0' ? 0 : length);
138}
139
140size_t mbsnrtowcs(wchar_t* dst, const char** src, size_t nmc, size_t len, mbstate_t* ps) {
141 size_t i, o, r;
142
143 if (dst == NULL) {
144 for (i = o = 0; i < nmc; i += r, o++) {
145 if (static_cast<uint8_t>((*src)[i]) < 0x80) {
146 // Fast path for plain ASCII characters.
147 if ((*src)[i] == '\0') {
148 return o;
149 }
150 r = 1;
151 } else {
152 r = mbrtowc(NULL, *src + i, nmc - i, ps);
153 if (r == ERR_ILLEGAL_SEQUENCE) {
154 return r;
155 }
156 if (r == ERR_INCOMPLETE_SEQUENCE) {
157 return o;
158 }
159 if (r == 0) {
160 return o;
161 }
Elliott Hughes1b836ee2014-04-18 13:32:33 -0700162 }
Elliott Hughes05493712014-04-17 17:30:03 -0700163 }
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700164 return o;
165 }
166
167 for (i = o = 0; i < nmc && o < len; i += r, o++) {
168 if (static_cast<uint8_t>((*src)[i]) < 0x80) {
169 // Fast path for plain ASCII characters.
170 dst[o] = (*src)[i];
171 if ((*src)[i] == '\0') {
172 *src = NULL;
173 return o;
Elliott Hughes05493712014-04-17 17:30:03 -0700174 }
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700175 r = 1;
Elliott Hughes05493712014-04-17 17:30:03 -0700176 } else {
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700177 r = mbrtowc(dst + o, *src + i, nmc - i, ps);
178 if (r == ERR_ILLEGAL_SEQUENCE) {
179 *src += i;
180 return r;
181 }
182 if (r == ERR_INCOMPLETE_SEQUENCE) {
183 *src += nmc;
184 return o;
185 }
186 if (r == 0) {
187 *src = NULL;
188 return o;
189 }
Elliott Hughes05493712014-04-17 17:30:03 -0700190 }
Elliott Hughes29c7f0b2012-10-22 17:05:27 -0700191 }
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700192 *src += i;
Elliott Hughes05493712014-04-17 17:30:03 -0700193 return o;
Elliott Hughes29c7f0b2012-10-22 17:05:27 -0700194}
195
// Converts a '\0'-terminated multibyte string to wide characters.
// This is mbsnrtowcs() with no limit on the number of input bytes: all
// arguments are forwarded unchanged and SIZE_MAX is passed as the byte limit.
size_t mbsrtowcs(wchar_t* dst, const char** src, size_t len, mbstate_t* ps) {
  const size_t no_input_limit = SIZE_MAX;
  return mbsnrtowcs(dst, src, no_input_limit, len, ps);
}
199
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700200size_t wcrtomb(char* s, wchar_t wc, mbstate_t*) {
201 unsigned char lead;
202 int i, len;
203
204 if (s == NULL) {
205 // Reset to initial shift state (no-op).
206 return 1;
207 }
208
209 if ((wc & ~0x7f) == 0) {
210 // Fast path for plain ASCII characters.
211 *s = wc;
212 return 1;
213 }
214
215 // Determine the number of octets needed to represent this character.
216 // We always output the shortest sequence possible. Also specify the
217 // first few bits of the first octet, which contains the information
218 // about the sequence length.
219 if ((wc & ~0x7f) == 0) {
220 lead = 0;
221 len = 1;
222 } else if ((wc & ~0x7ff) == 0) {
223 lead = 0xc0;
224 len = 2;
225 } else if ((wc & ~0xffff) == 0) {
226 lead = 0xe0;
227 len = 3;
228 } else if ((wc & ~0x1fffff) == 0) {
229 lead = 0xf0;
230 len = 4;
231 } else {
232 errno = EILSEQ;
233 return ERR_ILLEGAL_SEQUENCE;
234 }
235
236 // Output the octets representing the character in chunks
237 // of 6 bits, least significant last. The first octet is
238 // a special case because it contains the sequence length
239 // information.
240 for (i = len - 1; i > 0; i--) {
241 s[i] = (wc & 0x3f) | 0x80;
242 wc >>= 6;
243 }
244 *s = (wc & 0xff) | lead;
245
246 return len;
247}
248
249size_t wcsnrtombs(char* dst, const wchar_t** src, size_t nwc, size_t len, mbstate_t* ps) {
250 char buf[MB_LEN_MAX];
251 size_t i, o, r;
252 if (dst == NULL) {
253 for (i = o = 0; i < nwc; i++, o += r) {
254 wchar_t wc = (*src)[i];
255 if (wc < 0x80) {
256 // Fast path for plain ASCII characters.
257 if (wc == 0) {
258 return o;
259 }
260 r = 1;
261 } else {
262 r = wcrtomb(buf, wc, ps);
263 if (r == ERR_ILLEGAL_SEQUENCE) {
264 return r;
265 }
266 }
267 }
268 return o;
269 }
270
271 for (i = o = 0; i < nwc && o < len; i++, o += r) {
272 wchar_t wc = (*src)[i];
273 if (wc < 0x80) {
274 // Fast path for plain ASCII characters.
275 dst[o] = wc;
276 if (wc == 0) {
277 *src = NULL;
278 return o;
279 }
280 r = 1;
281 } else if (len - o >= sizeof(buf)) {
282 // Enough space to translate in-place.
283 r = wcrtomb(dst + o, wc, ps);
284 if (r == ERR_ILLEGAL_SEQUENCE) {
285 *src += i;
286 return r;
287 }
288 } else {
289 // May not be enough space; use temp buffer.
290 r = wcrtomb(buf, wc, ps);
291 if (r == ERR_ILLEGAL_SEQUENCE) {
292 *src += i;
293 return r;
294 }
295 if (r > len - o) {
296 break;
297 }
298 memcpy(dst + o, buf, r);
Elliott Hughes29c7f0b2012-10-22 17:05:27 -0700299 }
300 }
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700301 *src += i;
302 return o;
Elliott Hughes29c7f0b2012-10-22 17:05:27 -0700303}
304
// Converts a L'\0'-terminated wide string to its multibyte representation.
// This is wcsnrtombs() with no limit on the number of input characters: all
// arguments are forwarded unchanged and SIZE_MAX is passed as the character limit.
size_t wcsrtombs(char* dst, const wchar_t** src, size_t len, mbstate_t* ps) {
  const size_t no_input_limit = SIZE_MAX;
  return wcsnrtombs(dst, src, no_input_limit, len, ps);
}