Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 1 | /* $OpenBSD: citrus_utf8.c,v 1.6 2012/12/05 23:19:59 deraadt Exp $ */ |
| 2 | |
| 3 | /*- |
| 4 | * Copyright (c) 2002-2004 Tim J. Robbins |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 5 | * All rights reserved. |
| 6 | * |
| 7 | * Redistribution and use in source and binary forms, with or without |
| 8 | * modification, are permitted provided that the following conditions |
| 9 | * are met: |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 10 | * 1. Redistributions of source code must retain the above copyright |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 11 | * notice, this list of conditions and the following disclaimer. |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 12 | * 2. Redistributions in binary form must reproduce the above copyright |
| 13 | * notice, this list of conditions and the following disclaimer in the |
| 14 | * documentation and/or other materials provided with the distribution. |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 15 | * |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
| 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
| 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 26 | * SUCH DAMAGE. |
| 27 | */ |
| 28 | |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 29 | #include <errno.h> |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 30 | #include <string.h> |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 31 | #include <sys/param.h> |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 32 | #include <wchar.h> |
| 33 | |
//
// This file is basically OpenBSD's citrus_utf8.c but rewritten to not require a 12-byte mbstate_t
// so we're backwards-compatible with our LP32 ABI where mbstate_t was only 4 bytes. An additional
// advantage of this is that callers who don't supply their own mbstate_t won't be accessing shared
// state.
//
// We also implement the POSIX interface directly rather than being accessed via function pointers.
//
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 42 | |
// Sentinel return values used by the conversion functions in this file.
//
// (size_t)-1: the input is not a valid sequence (reported together with errno = EILSEQ).
#define ERR_ILLEGAL_SEQUENCE (static_cast<size_t>(-1))
// (size_t)-2: the available bytes ended part-way through a multibyte character.
#define ERR_INCOMPLETE_SEQUENCE (static_cast<size_t>(-2))
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 45 | |
// Reports whether a conversion state describes the initial state.
//
// The argument is never inspected: the converters in this file keep nothing in mbstate_t, so
// every state counts as initial and the result is always nonzero.
int mbsinit(const mbstate_t* /* ps */) {
  return 1;
}
| 50 | |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 51 | size_t mbrtowc(wchar_t* pwc, const char* s, size_t n, mbstate_t*) { |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 52 | if (s == NULL) { |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 53 | s = ""; |
| 54 | n = 1; |
| 55 | pwc = NULL; |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 56 | } |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 57 | |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 58 | if (n == 0) { |
Elliott Hughes | 0a5e26d | 2014-04-28 17:51:13 -0700 | [diff] [blame] | 59 | return 0; |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 60 | } |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 61 | |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 62 | int ch; |
| 63 | if (((ch = static_cast<uint8_t>(*s)) & ~0x7f) == 0) { |
| 64 | // Fast path for plain ASCII characters. |
| 65 | if (pwc != NULL) { |
| 66 | *pwc = ch; |
Elliott Hughes | 3d7a0d9 | 2014-04-29 14:46:56 -0700 | [diff] [blame] | 67 | } |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 68 | return (ch != '\0' ? 1 : 0); |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 69 | } |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 70 | |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 71 | // Determine the number of octets that make up this character |
| 72 | // from the first octet, and a mask that extracts the |
| 73 | // interesting bits of the first octet. We already know |
| 74 | // the character is at least two bytes long. |
| 75 | int length; |
| 76 | int mask; |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 77 | |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 78 | // We also specify a lower bound for the character code to |
| 79 | // detect redundant, non-"shortest form" encodings. For |
| 80 | // example, the sequence C0 80 is _not_ a legal representation |
| 81 | // of the null character. This enforces a 1-to-1 mapping |
| 82 | // between character codes and their multibyte representations. |
| 83 | wchar_t lower_bound; |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 84 | |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 85 | ch = static_cast<uint8_t>(*s); |
| 86 | if ((ch & 0x80) == 0) { |
| 87 | mask = 0x7f; |
| 88 | length = 1; |
| 89 | lower_bound = 0; |
| 90 | } else if ((ch & 0xe0) == 0xc0) { |
| 91 | mask = 0x1f; |
| 92 | length = 2; |
| 93 | lower_bound = 0x80; |
| 94 | } else if ((ch & 0xf0) == 0xe0) { |
| 95 | mask = 0x0f; |
| 96 | length = 3; |
| 97 | lower_bound = 0x800; |
| 98 | } else if ((ch & 0xf8) == 0xf0) { |
| 99 | mask = 0x07; |
| 100 | length = 4; |
| 101 | lower_bound = 0x10000; |
Elliott Hughes | 77e944f | 2014-04-04 17:34:51 -0700 | [diff] [blame] | 102 | } else { |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 103 | // Malformed input; input is not UTF-8. See RFC 3629. |
| 104 | errno = EILSEQ; |
| 105 | return ERR_ILLEGAL_SEQUENCE; |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 106 | } |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 107 | |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 108 | // Decode the octet sequence representing the character in chunks |
| 109 | // of 6 bits, most significant first. |
| 110 | wchar_t wch = static_cast<uint8_t>(*s++) & mask; |
| 111 | int i; |
| 112 | for (i = 1; i < MIN(length, n); i++) { |
| 113 | if ((*s & 0xc0) != 0x80) { |
| 114 | // Malformed input; bad characters in the middle of a character. |
Elliott Hughes | 0549371 | 2014-04-17 17:30:03 -0700 | [diff] [blame] | 115 | errno = EILSEQ; |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 116 | return ERR_ILLEGAL_SEQUENCE; |
| 117 | } |
| 118 | wch <<= 6; |
| 119 | wch |= *s++ & 0x3f; |
| 120 | } |
| 121 | if (i < length) { |
| 122 | return ERR_INCOMPLETE_SEQUENCE; |
| 123 | } |
| 124 | if (wch < lower_bound) { |
| 125 | // Malformed input; redundant encoding. |
| 126 | errno = EILSEQ; |
| 127 | return ERR_ILLEGAL_SEQUENCE; |
| 128 | } |
| 129 | if ((wch >= 0xd800 && wch <= 0xdfff) || wch == 0xfffe || wch == 0xffff) { |
| 130 | // Malformed input; invalid code points. |
| 131 | errno = EILSEQ; |
| 132 | return ERR_ILLEGAL_SEQUENCE; |
| 133 | } |
| 134 | if (pwc != NULL) { |
| 135 | *pwc = wch; |
| 136 | } |
| 137 | return (wch == L'\0' ? 0 : length); |
| 138 | } |
| 139 | |
| 140 | size_t mbsnrtowcs(wchar_t* dst, const char** src, size_t nmc, size_t len, mbstate_t* ps) { |
| 141 | size_t i, o, r; |
| 142 | |
| 143 | if (dst == NULL) { |
| 144 | for (i = o = 0; i < nmc; i += r, o++) { |
| 145 | if (static_cast<uint8_t>((*src)[i]) < 0x80) { |
| 146 | // Fast path for plain ASCII characters. |
| 147 | if ((*src)[i] == '\0') { |
| 148 | return o; |
| 149 | } |
| 150 | r = 1; |
| 151 | } else { |
| 152 | r = mbrtowc(NULL, *src + i, nmc - i, ps); |
| 153 | if (r == ERR_ILLEGAL_SEQUENCE) { |
| 154 | return r; |
| 155 | } |
| 156 | if (r == ERR_INCOMPLETE_SEQUENCE) { |
| 157 | return o; |
| 158 | } |
| 159 | if (r == 0) { |
| 160 | return o; |
| 161 | } |
Elliott Hughes | 1b836ee | 2014-04-18 13:32:33 -0700 | [diff] [blame] | 162 | } |
Elliott Hughes | 0549371 | 2014-04-17 17:30:03 -0700 | [diff] [blame] | 163 | } |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 164 | return o; |
| 165 | } |
| 166 | |
| 167 | for (i = o = 0; i < nmc && o < len; i += r, o++) { |
| 168 | if (static_cast<uint8_t>((*src)[i]) < 0x80) { |
| 169 | // Fast path for plain ASCII characters. |
| 170 | dst[o] = (*src)[i]; |
| 171 | if ((*src)[i] == '\0') { |
| 172 | *src = NULL; |
| 173 | return o; |
Elliott Hughes | 0549371 | 2014-04-17 17:30:03 -0700 | [diff] [blame] | 174 | } |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 175 | r = 1; |
Elliott Hughes | 0549371 | 2014-04-17 17:30:03 -0700 | [diff] [blame] | 176 | } else { |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 177 | r = mbrtowc(dst + o, *src + i, nmc - i, ps); |
| 178 | if (r == ERR_ILLEGAL_SEQUENCE) { |
| 179 | *src += i; |
| 180 | return r; |
| 181 | } |
| 182 | if (r == ERR_INCOMPLETE_SEQUENCE) { |
| 183 | *src += nmc; |
| 184 | return o; |
| 185 | } |
| 186 | if (r == 0) { |
| 187 | *src = NULL; |
| 188 | return o; |
| 189 | } |
Elliott Hughes | 0549371 | 2014-04-17 17:30:03 -0700 | [diff] [blame] | 190 | } |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 191 | } |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 192 | *src += i; |
Elliott Hughes | 0549371 | 2014-04-17 17:30:03 -0700 | [diff] [blame] | 193 | return o; |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 194 | } |
| 195 | |
// Converts a null-terminated multibyte string into wide characters.
//
// Delegates to mbsnrtowcs() with the input byte limit set to SIZE_MAX, so the only limits are
// the string's terminator and the `len` output slots.
size_t mbsrtowcs(wchar_t* dst, const char** src, size_t len, mbstate_t* ps) {
  const size_t no_input_limit = SIZE_MAX;
  return mbsnrtowcs(dst, src, no_input_limit, len, ps);
}
| 199 | |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 200 | size_t wcrtomb(char* s, wchar_t wc, mbstate_t*) { |
| 201 | unsigned char lead; |
| 202 | int i, len; |
| 203 | |
| 204 | if (s == NULL) { |
| 205 | // Reset to initial shift state (no-op). |
| 206 | return 1; |
| 207 | } |
| 208 | |
| 209 | if ((wc & ~0x7f) == 0) { |
| 210 | // Fast path for plain ASCII characters. |
| 211 | *s = wc; |
| 212 | return 1; |
| 213 | } |
| 214 | |
| 215 | // Determine the number of octets needed to represent this character. |
| 216 | // We always output the shortest sequence possible. Also specify the |
| 217 | // first few bits of the first octet, which contains the information |
| 218 | // about the sequence length. |
| 219 | if ((wc & ~0x7f) == 0) { |
| 220 | lead = 0; |
| 221 | len = 1; |
| 222 | } else if ((wc & ~0x7ff) == 0) { |
| 223 | lead = 0xc0; |
| 224 | len = 2; |
| 225 | } else if ((wc & ~0xffff) == 0) { |
| 226 | lead = 0xe0; |
| 227 | len = 3; |
| 228 | } else if ((wc & ~0x1fffff) == 0) { |
| 229 | lead = 0xf0; |
| 230 | len = 4; |
| 231 | } else { |
| 232 | errno = EILSEQ; |
| 233 | return ERR_ILLEGAL_SEQUENCE; |
| 234 | } |
| 235 | |
| 236 | // Output the octets representing the character in chunks |
| 237 | // of 6 bits, least significant last. The first octet is |
| 238 | // a special case because it contains the sequence length |
| 239 | // information. |
| 240 | for (i = len - 1; i > 0; i--) { |
| 241 | s[i] = (wc & 0x3f) | 0x80; |
| 242 | wc >>= 6; |
| 243 | } |
| 244 | *s = (wc & 0xff) | lead; |
| 245 | |
| 246 | return len; |
| 247 | } |
| 248 | |
| 249 | size_t wcsnrtombs(char* dst, const wchar_t** src, size_t nwc, size_t len, mbstate_t* ps) { |
| 250 | char buf[MB_LEN_MAX]; |
| 251 | size_t i, o, r; |
| 252 | if (dst == NULL) { |
| 253 | for (i = o = 0; i < nwc; i++, o += r) { |
| 254 | wchar_t wc = (*src)[i]; |
| 255 | if (wc < 0x80) { |
| 256 | // Fast path for plain ASCII characters. |
| 257 | if (wc == 0) { |
| 258 | return o; |
| 259 | } |
| 260 | r = 1; |
| 261 | } else { |
| 262 | r = wcrtomb(buf, wc, ps); |
| 263 | if (r == ERR_ILLEGAL_SEQUENCE) { |
| 264 | return r; |
| 265 | } |
| 266 | } |
| 267 | } |
| 268 | return o; |
| 269 | } |
| 270 | |
| 271 | for (i = o = 0; i < nwc && o < len; i++, o += r) { |
| 272 | wchar_t wc = (*src)[i]; |
| 273 | if (wc < 0x80) { |
| 274 | // Fast path for plain ASCII characters. |
| 275 | dst[o] = wc; |
| 276 | if (wc == 0) { |
| 277 | *src = NULL; |
| 278 | return o; |
| 279 | } |
| 280 | r = 1; |
| 281 | } else if (len - o >= sizeof(buf)) { |
| 282 | // Enough space to translate in-place. |
| 283 | r = wcrtomb(dst + o, wc, ps); |
| 284 | if (r == ERR_ILLEGAL_SEQUENCE) { |
| 285 | *src += i; |
| 286 | return r; |
| 287 | } |
| 288 | } else { |
| 289 | // May not be enough space; use temp buffer. |
| 290 | r = wcrtomb(buf, wc, ps); |
| 291 | if (r == ERR_ILLEGAL_SEQUENCE) { |
| 292 | *src += i; |
| 293 | return r; |
| 294 | } |
| 295 | if (r > len - o) { |
| 296 | break; |
| 297 | } |
| 298 | memcpy(dst + o, buf, r); |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 299 | } |
| 300 | } |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 301 | *src += i; |
| 302 | return o; |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 303 | } |
| 304 | |
// Converts a null-terminated wide string into a multibyte string.
//
// Delegates to wcsnrtombs() with the input character limit set to SIZE_MAX, so the only limits
// are the string's terminator and the `len` output bytes.
size_t wcsrtombs(char* dst, const wchar_t** src, size_t len, mbstate_t* ps) {
  const size_t no_input_limit = SIZE_MAX;
  return wcsnrtombs(dst, src, no_input_limit, len, ps);
}