Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 1 | /* $OpenBSD: citrus_utf8.c,v 1.6 2012/12/05 23:19:59 deraadt Exp $ */ |
| 2 | |
| 3 | /*- |
| 4 | * Copyright (c) 2002-2004 Tim J. Robbins |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 5 | * All rights reserved. |
| 6 | * |
| 7 | * Redistribution and use in source and binary forms, with or without |
| 8 | * modification, are permitted provided that the following conditions |
| 9 | * are met: |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 10 | * 1. Redistributions of source code must retain the above copyright |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 11 | * notice, this list of conditions and the following disclaimer. |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 12 | * 2. Redistributions in binary form must reproduce the above copyright |
| 13 | * notice, this list of conditions and the following disclaimer in the |
| 14 | * documentation and/or other materials provided with the distribution. |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 15 | * |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
| 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
| 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 26 | * SUCH DAMAGE. |
| 27 | */ |
| 28 | |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 29 | #include <errno.h> |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 30 | #include <string.h> |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 31 | #include <sys/param.h> |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 32 | #include <wchar.h> |
| 33 | |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 34 | // |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 35 | // This file is basically OpenBSD's citrus_utf8.c but rewritten to not require a |
| 36 | // 12-byte mbstate_t so we're backwards-compatible with our LP32 ABI where |
| 37 | // mbstate_t was only 4 bytes. |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 38 | // |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 39 | // The state is the UTF-8 sequence. We only support <= 4-bytes sequences so LP32 |
| 40 | // mbstate_t already has enough space (out of the 4 available bytes we only |
| 41 | // need 3 since we should never need to store the entire sequence in the |
| 42 | // intermediary state). |
| 43 | // |
| 44 | // The C standard leaves the conversion state undefined after a bad conversion. |
| 45 | // To avoid unexpected failures due to the possible use of the internal private |
| 46 | // state we always reset the conversion state when encountering illegal |
| 47 | // sequences. |
| 48 | // |
| 49 | // We also implement the POSIX interface directly rather than being accessed via |
| 50 | // function pointers. |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 51 | // |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 52 | |
// Sentinel results shared by the conversion routines in this file.
// (size_t)-1: the input is not a valid sequence.
#define ERR_ILLEGAL_SEQUENCE (static_cast<size_t>(-1))
// (size_t)-2: the input ended before the sequence was complete.
#define ERR_INCOMPLETE_SEQUENCE (static_cast<size_t>(-2))
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 55 | |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 56 | static size_t mbstate_bytes_so_far(const mbstate_t* ps) { |
| 57 | return |
| 58 | (ps->__seq[2] != 0) ? 3 : |
| 59 | (ps->__seq[1] != 0) ? 2 : |
| 60 | (ps->__seq[0] != 0) ? 1 : 0; |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 61 | } |
| 62 | |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 63 | static void mbstate_set_byte(mbstate_t* ps, int i, char byte) { |
| 64 | ps->__seq[i] = static_cast<uint8_t>(byte); |
| 65 | } |
| 66 | |
| 67 | static uint8_t mbstate_get_byte(const mbstate_t* ps, int n) { |
| 68 | return ps->__seq[n]; |
| 69 | } |
| 70 | |
| 71 | static size_t reset_and_return_illegal(int _errno, mbstate_t* ps) { |
| 72 | errno = _errno; |
| 73 | *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0; |
| 74 | return ERR_ILLEGAL_SEQUENCE; |
| 75 | } |
| 76 | |
| 77 | static size_t reset_and_return(int _return, mbstate_t* ps) { |
| 78 | *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0; |
| 79 | return _return; |
| 80 | } |
| 81 | |
| 82 | |
| 83 | int mbsinit(const mbstate_t* ps) { |
| 84 | return (ps == NULL || (*(reinterpret_cast<const uint32_t*>(ps->__seq)) == 0)); |
| 85 | } |
| 86 | |
| 87 | size_t mbrtowc(wchar_t* pwc, const char* s, size_t n, mbstate_t* ps) { |
| 88 | static mbstate_t __private_state; |
| 89 | mbstate_t* state = (ps == NULL) ? &__private_state : ps; |
| 90 | |
| 91 | // We should never get to a state which has all 4 bytes of the sequence set. |
| 92 | // Full state verification is done when decoding the sequence (after we have |
| 93 | // all the bytes). |
| 94 | if (mbstate_get_byte(state, 3) != 0) { |
| 95 | return reset_and_return_illegal(EINVAL, state); |
| 96 | } |
| 97 | |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 98 | if (s == NULL) { |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 99 | s = ""; |
| 100 | n = 1; |
| 101 | pwc = NULL; |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 102 | } |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 103 | |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 104 | if (n == 0) { |
Elliott Hughes | 0a5e26d | 2014-04-28 17:51:13 -0700 | [diff] [blame] | 105 | return 0; |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 106 | } |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 107 | |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 108 | uint8_t ch; |
| 109 | if (mbsinit(state) && (((ch = static_cast<uint8_t>(*s)) & ~0x7f) == 0)) { |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 110 | // Fast path for plain ASCII characters. |
| 111 | if (pwc != NULL) { |
| 112 | *pwc = ch; |
Elliott Hughes | 3d7a0d9 | 2014-04-29 14:46:56 -0700 | [diff] [blame] | 113 | } |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 114 | return (ch != '\0' ? 1 : 0); |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 115 | } |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 116 | |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 117 | // Determine the number of octets that make up this character |
| 118 | // from the first octet, and a mask that extracts the |
| 119 | // interesting bits of the first octet. We already know |
| 120 | // the character is at least two bytes long. |
Elliott Hughes | 568c86a | 2014-05-01 16:49:55 -0700 | [diff] [blame] | 121 | size_t length; |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 122 | int mask; |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 123 | |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 124 | // We also specify a lower bound for the character code to |
| 125 | // detect redundant, non-"shortest form" encodings. For |
| 126 | // example, the sequence C0 80 is _not_ a legal representation |
| 127 | // of the null character. This enforces a 1-to-1 mapping |
| 128 | // between character codes and their multibyte representations. |
| 129 | wchar_t lower_bound; |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 130 | |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 131 | // The first byte in the state (if any) tells the length. |
| 132 | size_t bytes_so_far = mbstate_bytes_so_far(state); |
| 133 | ch = bytes_so_far > 0 ? mbstate_get_byte(state, 0) : static_cast<uint8_t>(*s); |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 134 | if ((ch & 0x80) == 0) { |
| 135 | mask = 0x7f; |
| 136 | length = 1; |
| 137 | lower_bound = 0; |
| 138 | } else if ((ch & 0xe0) == 0xc0) { |
| 139 | mask = 0x1f; |
| 140 | length = 2; |
| 141 | lower_bound = 0x80; |
| 142 | } else if ((ch & 0xf0) == 0xe0) { |
| 143 | mask = 0x0f; |
| 144 | length = 3; |
| 145 | lower_bound = 0x800; |
| 146 | } else if ((ch & 0xf8) == 0xf0) { |
| 147 | mask = 0x07; |
| 148 | length = 4; |
| 149 | lower_bound = 0x10000; |
Elliott Hughes | 77e944f | 2014-04-04 17:34:51 -0700 | [diff] [blame] | 150 | } else { |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 151 | // Malformed input; input is not UTF-8. See RFC 3629. |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 152 | return reset_and_return_illegal(EILSEQ, state); |
| 153 | } |
| 154 | |
| 155 | // Fill in the state. |
| 156 | size_t bytes_wanted = length - bytes_so_far; |
| 157 | size_t i; |
| 158 | for (i = 0; i < MIN(bytes_wanted, n); i++) { |
| 159 | if (!mbsinit(state) && ((*s & 0xc0) != 0x80)) { |
| 160 | // Malformed input; bad characters in the middle of a character. |
| 161 | return reset_and_return_illegal(EILSEQ, state); |
| 162 | } |
| 163 | mbstate_set_byte(state, bytes_so_far + i, *s++); |
| 164 | } |
| 165 | if (i < bytes_wanted) { |
| 166 | return ERR_INCOMPLETE_SEQUENCE; |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 167 | } |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 168 | |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 169 | // Decode the octet sequence representing the character in chunks |
| 170 | // of 6 bits, most significant first. |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 171 | wchar_t wch = mbstate_get_byte(state, 0) & mask; |
| 172 | for (i = 1; i < length; i++) { |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 173 | wch <<= 6; |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 174 | wch |= mbstate_get_byte(state, i) & 0x3f; |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 175 | } |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 176 | |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 177 | if (wch < lower_bound) { |
| 178 | // Malformed input; redundant encoding. |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 179 | return reset_and_return_illegal(EILSEQ, state); |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 180 | } |
| 181 | if ((wch >= 0xd800 && wch <= 0xdfff) || wch == 0xfffe || wch == 0xffff) { |
| 182 | // Malformed input; invalid code points. |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 183 | return reset_and_return_illegal(EILSEQ, state); |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 184 | } |
| 185 | if (pwc != NULL) { |
| 186 | *pwc = wch; |
| 187 | } |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 188 | return reset_and_return(wch == L'\0' ? 0 : bytes_wanted, state); |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 189 | } |
| 190 | |
| 191 | size_t mbsnrtowcs(wchar_t* dst, const char** src, size_t nmc, size_t len, mbstate_t* ps) { |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 192 | static mbstate_t __private_state; |
| 193 | mbstate_t* state = (ps == NULL) ? &__private_state : ps; |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 194 | size_t i, o, r; |
| 195 | |
| 196 | if (dst == NULL) { |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 197 | /* |
| 198 | * The fast path in the loop below is not safe if an ASCII |
| 199 | * character appears as anything but the first byte of a |
| 200 | * multibyte sequence. Check now to avoid doing it in the loop. |
| 201 | */ |
| 202 | if ((nmc > 0) && (mbstate_bytes_so_far(state) > 0) |
| 203 | && (static_cast<uint8_t>((*src)[0]) < 0x80)) { |
| 204 | return reset_and_return_illegal(EILSEQ, state); |
| 205 | } |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 206 | for (i = o = 0; i < nmc; i += r, o++) { |
| 207 | if (static_cast<uint8_t>((*src)[i]) < 0x80) { |
| 208 | // Fast path for plain ASCII characters. |
| 209 | if ((*src)[i] == '\0') { |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 210 | return reset_and_return(o, state); |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 211 | } |
| 212 | r = 1; |
| 213 | } else { |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 214 | r = mbrtowc(NULL, *src + i, nmc - i, state); |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 215 | if (r == ERR_ILLEGAL_SEQUENCE) { |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 216 | return reset_and_return_illegal(EILSEQ, state); |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 217 | } |
| 218 | if (r == ERR_INCOMPLETE_SEQUENCE) { |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 219 | return reset_and_return_illegal(EILSEQ, state); |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 220 | } |
| 221 | if (r == 0) { |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 222 | return reset_and_return(o, state); |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 223 | } |
Elliott Hughes | 1b836ee | 2014-04-18 13:32:33 -0700 | [diff] [blame] | 224 | } |
Elliott Hughes | 0549371 | 2014-04-17 17:30:03 -0700 | [diff] [blame] | 225 | } |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 226 | return reset_and_return(o, state); |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 227 | } |
| 228 | |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 229 | /* |
| 230 | * The fast path in the loop below is not safe if an ASCII |
| 231 | * character appears as anything but the first byte of a |
| 232 | * multibyte sequence. Check now to avoid doing it in the loop. |
| 233 | */ |
| 234 | if ((nmc > 0) && (mbstate_bytes_so_far(state) > 0) |
| 235 | && (static_cast<uint8_t>((*src)[0]) < 0x80)) { |
| 236 | return reset_and_return_illegal(EILSEQ, state); |
| 237 | } |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 238 | for (i = o = 0; i < nmc && o < len; i += r, o++) { |
| 239 | if (static_cast<uint8_t>((*src)[i]) < 0x80) { |
| 240 | // Fast path for plain ASCII characters. |
| 241 | dst[o] = (*src)[i]; |
| 242 | if ((*src)[i] == '\0') { |
| 243 | *src = NULL; |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 244 | return reset_and_return_illegal(EILSEQ, state); |
Elliott Hughes | 0549371 | 2014-04-17 17:30:03 -0700 | [diff] [blame] | 245 | } |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 246 | r = 1; |
Elliott Hughes | 0549371 | 2014-04-17 17:30:03 -0700 | [diff] [blame] | 247 | } else { |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 248 | r = mbrtowc(dst + o, *src + i, nmc - i, state); |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 249 | if (r == ERR_ILLEGAL_SEQUENCE) { |
| 250 | *src += i; |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 251 | return reset_and_return_illegal(EILSEQ, state); |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 252 | } |
| 253 | if (r == ERR_INCOMPLETE_SEQUENCE) { |
| 254 | *src += nmc; |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 255 | return reset_and_return(EILSEQ, state); |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 256 | } |
| 257 | if (r == 0) { |
| 258 | *src = NULL; |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 259 | return reset_and_return(o, state); |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 260 | } |
Elliott Hughes | 0549371 | 2014-04-17 17:30:03 -0700 | [diff] [blame] | 261 | } |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 262 | } |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 263 | *src += i; |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 264 | return reset_and_return(o, state); |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 265 | } |
| 266 | |
// Same conversion as mbsnrtowcs(), but with no limit on the number of input
// bytes: the byte limit is passed as SIZE_MAX.
size_t mbsrtowcs(wchar_t* dst, const char** src, size_t len, mbstate_t* ps) {
  const size_t no_byte_limit = SIZE_MAX;
  return mbsnrtowcs(dst, src, no_byte_limit, len, ps);
}
| 270 | |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 271 | size_t wcrtomb(char* s, wchar_t wc, mbstate_t* ps) { |
| 272 | static mbstate_t __private_state; |
| 273 | mbstate_t* state = (ps == NULL) ? &__private_state : ps; |
| 274 | |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 275 | if (s == NULL) { |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 276 | // Equivalent to wcrtomb(buf, L'\0', ps). |
| 277 | return reset_and_return(1, state); |
| 278 | } |
| 279 | |
| 280 | // POSIX states that if wc is a null wide character, a null byte shall be |
| 281 | // stored, preceded by any shift sequence needed to restore the initial shift |
| 282 | // state. Since shift states are not supported, only the null byte is stored. |
| 283 | if (wc == L'\0') { |
| 284 | *s = '\0'; |
| 285 | reset_and_return(1, state); |
| 286 | } |
| 287 | |
| 288 | if (!mbsinit(state)) { |
| 289 | return reset_and_return_illegal(EILSEQ, state); |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 290 | } |
| 291 | |
| 292 | if ((wc & ~0x7f) == 0) { |
| 293 | // Fast path for plain ASCII characters. |
| 294 | *s = wc; |
| 295 | return 1; |
| 296 | } |
| 297 | |
| 298 | // Determine the number of octets needed to represent this character. |
| 299 | // We always output the shortest sequence possible. Also specify the |
| 300 | // first few bits of the first octet, which contains the information |
| 301 | // about the sequence length. |
Elliott Hughes | 0d0ccfe | 2014-05-01 19:03:18 -0700 | [diff] [blame] | 302 | uint8_t lead; |
| 303 | size_t length; |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 304 | if ((wc & ~0x7f) == 0) { |
| 305 | lead = 0; |
Elliott Hughes | 0d0ccfe | 2014-05-01 19:03:18 -0700 | [diff] [blame] | 306 | length = 1; |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 307 | } else if ((wc & ~0x7ff) == 0) { |
| 308 | lead = 0xc0; |
Elliott Hughes | 0d0ccfe | 2014-05-01 19:03:18 -0700 | [diff] [blame] | 309 | length = 2; |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 310 | } else if ((wc & ~0xffff) == 0) { |
| 311 | lead = 0xe0; |
Elliott Hughes | 0d0ccfe | 2014-05-01 19:03:18 -0700 | [diff] [blame] | 312 | length = 3; |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 313 | } else if ((wc & ~0x1fffff) == 0) { |
| 314 | lead = 0xf0; |
Elliott Hughes | 0d0ccfe | 2014-05-01 19:03:18 -0700 | [diff] [blame] | 315 | length = 4; |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 316 | } else { |
| 317 | errno = EILSEQ; |
| 318 | return ERR_ILLEGAL_SEQUENCE; |
| 319 | } |
| 320 | |
| 321 | // Output the octets representing the character in chunks |
| 322 | // of 6 bits, least significant last. The first octet is |
| 323 | // a special case because it contains the sequence length |
| 324 | // information. |
Elliott Hughes | 0d0ccfe | 2014-05-01 19:03:18 -0700 | [diff] [blame] | 325 | for (size_t i = length - 1; i > 0; i--) { |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 326 | s[i] = (wc & 0x3f) | 0x80; |
| 327 | wc >>= 6; |
| 328 | } |
| 329 | *s = (wc & 0xff) | lead; |
| 330 | |
Elliott Hughes | 0d0ccfe | 2014-05-01 19:03:18 -0700 | [diff] [blame] | 331 | return length; |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 332 | } |
| 333 | |
| 334 | size_t wcsnrtombs(char* dst, const wchar_t** src, size_t nwc, size_t len, mbstate_t* ps) { |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 335 | static mbstate_t __private_state; |
| 336 | mbstate_t* state = (ps == NULL) ? &__private_state : ps; |
| 337 | |
| 338 | if (!mbsinit(state)) { |
| 339 | return reset_and_return_illegal(EILSEQ, state); |
| 340 | } |
| 341 | |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 342 | char buf[MB_LEN_MAX]; |
| 343 | size_t i, o, r; |
| 344 | if (dst == NULL) { |
| 345 | for (i = o = 0; i < nwc; i++, o += r) { |
| 346 | wchar_t wc = (*src)[i]; |
Elliott Hughes | 0d0ccfe | 2014-05-01 19:03:18 -0700 | [diff] [blame] | 347 | if (static_cast<uint32_t>(wc) < 0x80) { |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 348 | // Fast path for plain ASCII characters. |
| 349 | if (wc == 0) { |
| 350 | return o; |
| 351 | } |
| 352 | r = 1; |
| 353 | } else { |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 354 | r = wcrtomb(buf, wc, state); |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 355 | if (r == ERR_ILLEGAL_SEQUENCE) { |
| 356 | return r; |
| 357 | } |
| 358 | } |
| 359 | } |
| 360 | return o; |
| 361 | } |
| 362 | |
| 363 | for (i = o = 0; i < nwc && o < len; i++, o += r) { |
| 364 | wchar_t wc = (*src)[i]; |
Elliott Hughes | 0d0ccfe | 2014-05-01 19:03:18 -0700 | [diff] [blame] | 365 | if (static_cast<uint32_t>(wc) < 0x80) { |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 366 | // Fast path for plain ASCII characters. |
| 367 | dst[o] = wc; |
| 368 | if (wc == 0) { |
| 369 | *src = NULL; |
| 370 | return o; |
| 371 | } |
| 372 | r = 1; |
| 373 | } else if (len - o >= sizeof(buf)) { |
| 374 | // Enough space to translate in-place. |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 375 | r = wcrtomb(dst + o, wc, state); |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 376 | if (r == ERR_ILLEGAL_SEQUENCE) { |
| 377 | *src += i; |
| 378 | return r; |
| 379 | } |
| 380 | } else { |
| 381 | // May not be enough space; use temp buffer. |
Calin Juravle | 15a6310 | 2014-05-08 14:38:35 +0100 | [diff] [blame] | 382 | r = wcrtomb(buf, wc, state); |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 383 | if (r == ERR_ILLEGAL_SEQUENCE) { |
| 384 | *src += i; |
| 385 | return r; |
| 386 | } |
| 387 | if (r > len - o) { |
| 388 | break; |
| 389 | } |
| 390 | memcpy(dst + o, buf, r); |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 391 | } |
| 392 | } |
Elliott Hughes | 5a0aa3d | 2014-04-30 22:03:12 -0700 | [diff] [blame] | 393 | *src += i; |
| 394 | return o; |
Elliott Hughes | 29c7f0b | 2012-10-22 17:05:27 -0700 | [diff] [blame] | 395 | } |
| 396 | |
// Same conversion as wcsnrtombs(), but with no limit on the number of wide
// characters read: the character limit is passed as SIZE_MAX.
size_t wcsrtombs(char* dst, const wchar_t** src, size_t len, mbstate_t* ps) {
  const size_t no_char_limit = SIZE_MAX;
  return wcsnrtombs(dst, src, no_char_limit, len, ps);
}