/*	$OpenBSD: citrus_utf8.c,v 1.6 2012/12/05 23:19:59 deraadt Exp $ */

/*-
 * Copyright (c) 2002-2004 Tim J. Robbins
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
28
#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <string.h>
#include <sys/param.h>
#include <wchar.h>
33
//
// This file is basically OpenBSD's citrus_utf8.c but rewritten to not require a 12-byte mbstate_t
// so we're backwards-compatible with our LP32 ABI where mbstate_t was only 4 bytes. An additional
// advantage of this is that callers who don't supply their own mbstate_t won't be accessing shared
// state.
//
// We also implement the POSIX interface directly rather than being accessed via function pointers.
//
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070042
// Sentinel return values used by the conversion functions in this file.
//   (size_t)-1: the input is not a valid encoding (returned together with errno = EILSEQ).
//   (size_t)-2: the input ended part-way through a multibyte character.
#define ERR_ILLEGAL_SEQUENCE (static_cast<size_t>(-1))
#define ERR_INCOMPLETE_SEQUENCE (static_cast<size_t>(-2))
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070045
// Reports whether the given conversion state is the initial state.
// The argument is never inspected: this implementation keeps no conversion
// state, so the answer is always "yes" (non-zero).
int mbsinit(const mbstate_t*) {
  return 1;
}
50
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070051size_t mbrtowc(wchar_t* pwc, const char* s, size_t n, mbstate_t*) {
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070052 if (s == NULL) {
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070053 s = "";
54 n = 1;
55 pwc = NULL;
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070056 }
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070057
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070058 if (n == 0) {
Elliott Hughes0a5e26d2014-04-28 17:51:13 -070059 return 0;
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070060 }
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070061
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070062 int ch;
63 if (((ch = static_cast<uint8_t>(*s)) & ~0x7f) == 0) {
64 // Fast path for plain ASCII characters.
65 if (pwc != NULL) {
66 *pwc = ch;
Elliott Hughes3d7a0d92014-04-29 14:46:56 -070067 }
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070068 return (ch != '\0' ? 1 : 0);
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070069 }
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070070
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070071 // Determine the number of octets that make up this character
72 // from the first octet, and a mask that extracts the
73 // interesting bits of the first octet. We already know
74 // the character is at least two bytes long.
Elliott Hughes568c86a2014-05-01 16:49:55 -070075 size_t length;
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070076 int mask;
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070077
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070078 // We also specify a lower bound for the character code to
79 // detect redundant, non-"shortest form" encodings. For
80 // example, the sequence C0 80 is _not_ a legal representation
81 // of the null character. This enforces a 1-to-1 mapping
82 // between character codes and their multibyte representations.
83 wchar_t lower_bound;
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070084
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070085 ch = static_cast<uint8_t>(*s);
86 if ((ch & 0x80) == 0) {
87 mask = 0x7f;
88 length = 1;
89 lower_bound = 0;
90 } else if ((ch & 0xe0) == 0xc0) {
91 mask = 0x1f;
92 length = 2;
93 lower_bound = 0x80;
94 } else if ((ch & 0xf0) == 0xe0) {
95 mask = 0x0f;
96 length = 3;
97 lower_bound = 0x800;
98 } else if ((ch & 0xf8) == 0xf0) {
99 mask = 0x07;
100 length = 4;
101 lower_bound = 0x10000;
Elliott Hughes77e944f2014-04-04 17:34:51 -0700102 } else {
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700103 // Malformed input; input is not UTF-8. See RFC 3629.
104 errno = EILSEQ;
105 return ERR_ILLEGAL_SEQUENCE;
Elliott Hughes29c7f0b2012-10-22 17:05:27 -0700106 }
Elliott Hughes29c7f0b2012-10-22 17:05:27 -0700107
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700108 // Decode the octet sequence representing the character in chunks
109 // of 6 bits, most significant first.
110 wchar_t wch = static_cast<uint8_t>(*s++) & mask;
Elliott Hughes568c86a2014-05-01 16:49:55 -0700111 size_t i;
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700112 for (i = 1; i < MIN(length, n); i++) {
113 if ((*s & 0xc0) != 0x80) {
114 // Malformed input; bad characters in the middle of a character.
Elliott Hughes05493712014-04-17 17:30:03 -0700115 errno = EILSEQ;
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700116 return ERR_ILLEGAL_SEQUENCE;
117 }
118 wch <<= 6;
119 wch |= *s++ & 0x3f;
120 }
121 if (i < length) {
122 return ERR_INCOMPLETE_SEQUENCE;
123 }
124 if (wch < lower_bound) {
125 // Malformed input; redundant encoding.
126 errno = EILSEQ;
127 return ERR_ILLEGAL_SEQUENCE;
128 }
129 if ((wch >= 0xd800 && wch <= 0xdfff) || wch == 0xfffe || wch == 0xffff) {
130 // Malformed input; invalid code points.
131 errno = EILSEQ;
132 return ERR_ILLEGAL_SEQUENCE;
133 }
134 if (pwc != NULL) {
135 *pwc = wch;
136 }
137 return (wch == L'\0' ? 0 : length);
138}
139
140size_t mbsnrtowcs(wchar_t* dst, const char** src, size_t nmc, size_t len, mbstate_t* ps) {
141 size_t i, o, r;
142
143 if (dst == NULL) {
144 for (i = o = 0; i < nmc; i += r, o++) {
145 if (static_cast<uint8_t>((*src)[i]) < 0x80) {
146 // Fast path for plain ASCII characters.
147 if ((*src)[i] == '\0') {
148 return o;
149 }
150 r = 1;
151 } else {
152 r = mbrtowc(NULL, *src + i, nmc - i, ps);
153 if (r == ERR_ILLEGAL_SEQUENCE) {
154 return r;
155 }
156 if (r == ERR_INCOMPLETE_SEQUENCE) {
157 return o;
158 }
159 if (r == 0) {
160 return o;
161 }
Elliott Hughes1b836ee2014-04-18 13:32:33 -0700162 }
Elliott Hughes05493712014-04-17 17:30:03 -0700163 }
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700164 return o;
165 }
166
167 for (i = o = 0; i < nmc && o < len; i += r, o++) {
168 if (static_cast<uint8_t>((*src)[i]) < 0x80) {
169 // Fast path for plain ASCII characters.
170 dst[o] = (*src)[i];
171 if ((*src)[i] == '\0') {
172 *src = NULL;
173 return o;
Elliott Hughes05493712014-04-17 17:30:03 -0700174 }
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700175 r = 1;
Elliott Hughes05493712014-04-17 17:30:03 -0700176 } else {
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700177 r = mbrtowc(dst + o, *src + i, nmc - i, ps);
178 if (r == ERR_ILLEGAL_SEQUENCE) {
179 *src += i;
180 return r;
181 }
182 if (r == ERR_INCOMPLETE_SEQUENCE) {
183 *src += nmc;
184 return o;
185 }
186 if (r == 0) {
187 *src = NULL;
188 return o;
189 }
Elliott Hughes05493712014-04-17 17:30:03 -0700190 }
Elliott Hughes29c7f0b2012-10-22 17:05:27 -0700191 }
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700192 *src += i;
Elliott Hughes05493712014-04-17 17:30:03 -0700193 return o;
Elliott Hughes29c7f0b2012-10-22 17:05:27 -0700194}
195
// Converts the multibyte string `*src` to wide characters. This is mbsnrtowcs() with the
// input byte limit set to SIZE_MAX, i.e. effectively no limit on how much input is read.
size_t mbsrtowcs(wchar_t* dst, const char** src, size_t len, mbstate_t* ps) {
  const size_t no_input_limit = SIZE_MAX;
  return mbsnrtowcs(dst, src, no_input_limit, len, ps);
}
199
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700200size_t wcrtomb(char* s, wchar_t wc, mbstate_t*) {
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700201 if (s == NULL) {
202 // Reset to initial shift state (no-op).
203 return 1;
204 }
205
206 if ((wc & ~0x7f) == 0) {
207 // Fast path for plain ASCII characters.
208 *s = wc;
209 return 1;
210 }
211
212 // Determine the number of octets needed to represent this character.
213 // We always output the shortest sequence possible. Also specify the
214 // first few bits of the first octet, which contains the information
215 // about the sequence length.
Elliott Hughes0d0ccfe2014-05-01 19:03:18 -0700216 uint8_t lead;
217 size_t length;
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700218 if ((wc & ~0x7f) == 0) {
219 lead = 0;
Elliott Hughes0d0ccfe2014-05-01 19:03:18 -0700220 length = 1;
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700221 } else if ((wc & ~0x7ff) == 0) {
222 lead = 0xc0;
Elliott Hughes0d0ccfe2014-05-01 19:03:18 -0700223 length = 2;
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700224 } else if ((wc & ~0xffff) == 0) {
225 lead = 0xe0;
Elliott Hughes0d0ccfe2014-05-01 19:03:18 -0700226 length = 3;
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700227 } else if ((wc & ~0x1fffff) == 0) {
228 lead = 0xf0;
Elliott Hughes0d0ccfe2014-05-01 19:03:18 -0700229 length = 4;
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700230 } else {
231 errno = EILSEQ;
232 return ERR_ILLEGAL_SEQUENCE;
233 }
234
235 // Output the octets representing the character in chunks
236 // of 6 bits, least significant last. The first octet is
237 // a special case because it contains the sequence length
238 // information.
Elliott Hughes0d0ccfe2014-05-01 19:03:18 -0700239 for (size_t i = length - 1; i > 0; i--) {
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700240 s[i] = (wc & 0x3f) | 0x80;
241 wc >>= 6;
242 }
243 *s = (wc & 0xff) | lead;
244
Elliott Hughes0d0ccfe2014-05-01 19:03:18 -0700245 return length;
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700246}
247
248size_t wcsnrtombs(char* dst, const wchar_t** src, size_t nwc, size_t len, mbstate_t* ps) {
249 char buf[MB_LEN_MAX];
250 size_t i, o, r;
251 if (dst == NULL) {
252 for (i = o = 0; i < nwc; i++, o += r) {
253 wchar_t wc = (*src)[i];
Elliott Hughes0d0ccfe2014-05-01 19:03:18 -0700254 if (static_cast<uint32_t>(wc) < 0x80) {
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700255 // Fast path for plain ASCII characters.
256 if (wc == 0) {
257 return o;
258 }
259 r = 1;
260 } else {
261 r = wcrtomb(buf, wc, ps);
262 if (r == ERR_ILLEGAL_SEQUENCE) {
263 return r;
264 }
265 }
266 }
267 return o;
268 }
269
270 for (i = o = 0; i < nwc && o < len; i++, o += r) {
271 wchar_t wc = (*src)[i];
Elliott Hughes0d0ccfe2014-05-01 19:03:18 -0700272 if (static_cast<uint32_t>(wc) < 0x80) {
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700273 // Fast path for plain ASCII characters.
274 dst[o] = wc;
275 if (wc == 0) {
276 *src = NULL;
277 return o;
278 }
279 r = 1;
280 } else if (len - o >= sizeof(buf)) {
281 // Enough space to translate in-place.
282 r = wcrtomb(dst + o, wc, ps);
283 if (r == ERR_ILLEGAL_SEQUENCE) {
284 *src += i;
285 return r;
286 }
287 } else {
288 // May not be enough space; use temp buffer.
289 r = wcrtomb(buf, wc, ps);
290 if (r == ERR_ILLEGAL_SEQUENCE) {
291 *src += i;
292 return r;
293 }
294 if (r > len - o) {
295 break;
296 }
297 memcpy(dst + o, buf, r);
Elliott Hughes29c7f0b2012-10-22 17:05:27 -0700298 }
299 }
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700300 *src += i;
301 return o;
Elliott Hughes29c7f0b2012-10-22 17:05:27 -0700302}
303
// Converts the wide string `*src` to a multibyte string. This is wcsnrtombs() with the
// input limit set to SIZE_MAX, i.e. effectively no limit on how many wide characters are read.
size_t wcsrtombs(char* dst, const wchar_t** src, size_t len, mbstate_t* ps) {
  const size_t no_input_limit = SIZE_MAX;
  return wcsnrtombs(dst, src, no_input_limit, len, ps);
}