blob: ecb8b3391fe643c91c3d864676c0ee147e1bfcd8 [file] [log] [blame]
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -07001/* $OpenBSD: citrus_utf8.c,v 1.6 2012/12/05 23:19:59 deraadt Exp $ */
2
3/*-
4 * Copyright (c) 2002-2004 Tim J. Robbins
Elliott Hughes29c7f0b2012-10-22 17:05:27 -07005 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070010 * 1. Redistributions of source code must retain the above copyright
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070011 * notice, this list of conditions and the following disclaimer.
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070012 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070015 *
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070016 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070026 * SUCH DAMAGE.
27 */
28
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070029#include <errno.h>
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070030#include <sys/param.h>
Dan Albert7a7f9952014-06-02 11:33:04 -070031#include <string.h>
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070032#include <wchar.h>
Dan Albert7a7f9952014-06-02 11:33:04 -070033#include <uchar.h>
34
35#include "private/bionic_mbstate.h"
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070036
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070037//
Calin Juravle15a63102014-05-08 14:38:35 +010038// This file is basically OpenBSD's citrus_utf8.c but rewritten to not require a
39// 12-byte mbstate_t so we're backwards-compatible with our LP32 ABI where
40// mbstate_t was only 4 bytes.
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070041//
// The state is the UTF-8 sequence. We only support sequences of at most 4 bytes,
// so the LP32 mbstate_t already has enough space (of the 4 available bytes we
// only need 3, since we never need to store the entire sequence in the
// intermediate state).
46//
47// The C standard leaves the conversion state undefined after a bad conversion.
48// To avoid unexpected failures due to the possible use of the internal private
49// state we always reset the conversion state when encountering illegal
50// sequences.
51//
52// We also implement the POSIX interface directly rather than being accessed via
53// function pointers.
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070054//
Elliott Hughes29c7f0b2012-10-22 17:05:27 -070055
Calin Juravle15a63102014-05-08 14:38:35 +010056int mbsinit(const mbstate_t* ps) {
57 return (ps == NULL || (*(reinterpret_cast<const uint32_t*>(ps->__seq)) == 0));
58}
59
// Converts the next multibyte character in `s` (examining at most `n` bytes)
// into a wide character stored through `pwc`.
//
// When `ps` is NULL a function-local static conversion state is used instead.
// The work is delegated to mbrtoc32 and its result is returned unchanged.
size_t mbrtowc(wchar_t* pwc, const char* s, size_t n, mbstate_t* ps) {
  static mbstate_t __private_state;
  if (ps == NULL) {
    ps = &__private_state;
  }

  // Our wchar_t is UTF-32, so the output slot can be handed to the char32_t
  // converter directly.
  char32_t* const out = reinterpret_cast<char32_t*>(pwc);
  return mbrtoc32(out, s, n, ps);
}
67
68size_t mbsnrtowcs(wchar_t* dst, const char** src, size_t nmc, size_t len, mbstate_t* ps) {
Calin Juravle15a63102014-05-08 14:38:35 +010069 static mbstate_t __private_state;
70 mbstate_t* state = (ps == NULL) ? &__private_state : ps;
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070071 size_t i, o, r;
72
73 if (dst == NULL) {
Calin Juravle15a63102014-05-08 14:38:35 +010074 /*
75 * The fast path in the loop below is not safe if an ASCII
76 * character appears as anything but the first byte of a
77 * multibyte sequence. Check now to avoid doing it in the loop.
78 */
79 if ((nmc > 0) && (mbstate_bytes_so_far(state) > 0)
80 && (static_cast<uint8_t>((*src)[0]) < 0x80)) {
81 return reset_and_return_illegal(EILSEQ, state);
82 }
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070083 for (i = o = 0; i < nmc; i += r, o++) {
84 if (static_cast<uint8_t>((*src)[i]) < 0x80) {
85 // Fast path for plain ASCII characters.
86 if ((*src)[i] == '\0') {
Calin Juravle15a63102014-05-08 14:38:35 +010087 return reset_and_return(o, state);
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070088 }
89 r = 1;
90 } else {
Calin Juravle15a63102014-05-08 14:38:35 +010091 r = mbrtowc(NULL, *src + i, nmc - i, state);
Dan Albert7a7f9952014-06-02 11:33:04 -070092 if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
Calin Juravle15a63102014-05-08 14:38:35 +010093 return reset_and_return_illegal(EILSEQ, state);
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070094 }
Dan Albert7a7f9952014-06-02 11:33:04 -070095 if (r == __MB_ERR_INCOMPLETE_SEQUENCE) {
Calin Juravle15a63102014-05-08 14:38:35 +010096 return reset_and_return_illegal(EILSEQ, state);
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -070097 }
98 if (r == 0) {
Calin Juravle15a63102014-05-08 14:38:35 +010099 return reset_and_return(o, state);
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700100 }
Elliott Hughes1b836ee2014-04-18 13:32:33 -0700101 }
Elliott Hughes05493712014-04-17 17:30:03 -0700102 }
Calin Juravle15a63102014-05-08 14:38:35 +0100103 return reset_and_return(o, state);
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700104 }
105
Calin Juravle15a63102014-05-08 14:38:35 +0100106 /*
107 * The fast path in the loop below is not safe if an ASCII
108 * character appears as anything but the first byte of a
109 * multibyte sequence. Check now to avoid doing it in the loop.
110 */
111 if ((nmc > 0) && (mbstate_bytes_so_far(state) > 0)
112 && (static_cast<uint8_t>((*src)[0]) < 0x80)) {
113 return reset_and_return_illegal(EILSEQ, state);
114 }
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700115 for (i = o = 0; i < nmc && o < len; i += r, o++) {
116 if (static_cast<uint8_t>((*src)[i]) < 0x80) {
117 // Fast path for plain ASCII characters.
118 dst[o] = (*src)[i];
119 if ((*src)[i] == '\0') {
120 *src = NULL;
Calin Juravle15a63102014-05-08 14:38:35 +0100121 return reset_and_return_illegal(EILSEQ, state);
Elliott Hughes05493712014-04-17 17:30:03 -0700122 }
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700123 r = 1;
Elliott Hughes05493712014-04-17 17:30:03 -0700124 } else {
Calin Juravle15a63102014-05-08 14:38:35 +0100125 r = mbrtowc(dst + o, *src + i, nmc - i, state);
Dan Albert7a7f9952014-06-02 11:33:04 -0700126 if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700127 *src += i;
Calin Juravle15a63102014-05-08 14:38:35 +0100128 return reset_and_return_illegal(EILSEQ, state);
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700129 }
Dan Albert7a7f9952014-06-02 11:33:04 -0700130 if (r == __MB_ERR_INCOMPLETE_SEQUENCE) {
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700131 *src += nmc;
Calin Juravle15a63102014-05-08 14:38:35 +0100132 return reset_and_return(EILSEQ, state);
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700133 }
134 if (r == 0) {
135 *src = NULL;
Calin Juravle15a63102014-05-08 14:38:35 +0100136 return reset_and_return(o, state);
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700137 }
Elliott Hughes05493712014-04-17 17:30:03 -0700138 }
Elliott Hughes29c7f0b2012-10-22 17:05:27 -0700139 }
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700140 *src += i;
Calin Juravle15a63102014-05-08 14:38:35 +0100141 return reset_and_return(o, state);
Elliott Hughes29c7f0b2012-10-22 17:05:27 -0700142}
143
// Converts the NUL-terminated multibyte string `*src` into at most `len` wide
// characters in `dst`.
//
// Implemented as mbsnrtowcs with the input byte limit set to SIZE_MAX; all
// other arguments and the return value are passed through unchanged.
size_t mbsrtowcs(wchar_t* dst, const char** src, size_t len, mbstate_t* ps) {
  const size_t no_byte_limit = SIZE_MAX;
  return mbsnrtowcs(dst, src, no_byte_limit, len, ps);
}
147
// Converts the wide character `wc` to its multibyte form, written to `s`.
//
// When `ps` is NULL a function-local static conversion state is used instead.
// The work is delegated to c32rtomb and its result is returned unchanged.
size_t wcrtomb(char* s, wchar_t wc, mbstate_t* ps) {
  static mbstate_t __private_state;
  if (ps == NULL) {
    ps = &__private_state;
  }

  // Our wchar_t is UTF-32, so the value maps straight onto char32_t.
  const char32_t code_point = static_cast<char32_t>(wc);
  return c32rtomb(s, code_point, ps);
}
155
156size_t wcsnrtombs(char* dst, const wchar_t** src, size_t nwc, size_t len, mbstate_t* ps) {
Calin Juravle15a63102014-05-08 14:38:35 +0100157 static mbstate_t __private_state;
158 mbstate_t* state = (ps == NULL) ? &__private_state : ps;
159
160 if (!mbsinit(state)) {
161 return reset_and_return_illegal(EILSEQ, state);
162 }
163
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700164 char buf[MB_LEN_MAX];
165 size_t i, o, r;
166 if (dst == NULL) {
167 for (i = o = 0; i < nwc; i++, o += r) {
168 wchar_t wc = (*src)[i];
Elliott Hughes0d0ccfe2014-05-01 19:03:18 -0700169 if (static_cast<uint32_t>(wc) < 0x80) {
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700170 // Fast path for plain ASCII characters.
171 if (wc == 0) {
172 return o;
173 }
174 r = 1;
175 } else {
Calin Juravle15a63102014-05-08 14:38:35 +0100176 r = wcrtomb(buf, wc, state);
Dan Albert7a7f9952014-06-02 11:33:04 -0700177 if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700178 return r;
179 }
180 }
181 }
182 return o;
183 }
184
185 for (i = o = 0; i < nwc && o < len; i++, o += r) {
186 wchar_t wc = (*src)[i];
Elliott Hughes0d0ccfe2014-05-01 19:03:18 -0700187 if (static_cast<uint32_t>(wc) < 0x80) {
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700188 // Fast path for plain ASCII characters.
189 dst[o] = wc;
190 if (wc == 0) {
191 *src = NULL;
192 return o;
193 }
194 r = 1;
195 } else if (len - o >= sizeof(buf)) {
196 // Enough space to translate in-place.
Calin Juravle15a63102014-05-08 14:38:35 +0100197 r = wcrtomb(dst + o, wc, state);
Dan Albert7a7f9952014-06-02 11:33:04 -0700198 if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700199 *src += i;
200 return r;
201 }
202 } else {
203 // May not be enough space; use temp buffer.
Calin Juravle15a63102014-05-08 14:38:35 +0100204 r = wcrtomb(buf, wc, state);
Dan Albert7a7f9952014-06-02 11:33:04 -0700205 if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700206 *src += i;
207 return r;
208 }
209 if (r > len - o) {
210 break;
211 }
212 memcpy(dst + o, buf, r);
Elliott Hughes29c7f0b2012-10-22 17:05:27 -0700213 }
214 }
Elliott Hughes5a0aa3d2014-04-30 22:03:12 -0700215 *src += i;
216 return o;
Elliott Hughes29c7f0b2012-10-22 17:05:27 -0700217}
218
// Converts the NUL-terminated wide string `*src` into at most `len` bytes of
// multibyte output in `dst`.
//
// Implemented as wcsnrtombs with the input character limit set to SIZE_MAX;
// all other arguments and the return value are passed through unchanged.
size_t wcsrtombs(char* dst, const wchar_t** src, size_t len, mbstate_t* ps) {
  const size_t no_char_limit = SIZE_MAX;
  return wcsnrtombs(dst, src, no_char_limit, len, ps);
}
Dan Albertdfb5ce42014-07-09 22:51:34 +0000222
// Locale-taking variant of wcscoll. The locale argument is ignored; the
// comparison is forwarded to wcscoll and its result returned.
int wcscoll_l(const wchar_t* ws1, const wchar_t* ws2, locale_t /* ignored */) {
  const int ordering = wcscoll(ws1, ws2);
  return ordering;
}
226
// Locale-taking variant of wcsxfrm. The locale argument is ignored; the
// transformation is forwarded to wcsxfrm and its result returned.
size_t wcsxfrm_l(wchar_t* dest, const wchar_t* src, size_t n, locale_t /* ignored */) {
  const size_t needed = wcsxfrm(dest, src, n);
  return needed;
}
230
// Locale-taking variant of wcstoll. The locale argument is ignored; parsing
// is forwarded to wcstoll and its result returned.
//
// NOTE(review): `base` is declared size_t here while wcstoll takes an int;
// the narrowing below is what the implicit conversion already did. Confirm
// the intended parameter type against the public header.
long long wcstoll_l(const wchar_t* nptr, wchar_t** endptr, size_t base,
                    locale_t /* ignored */) {
  const long long parsed = wcstoll(nptr, endptr, static_cast<int>(base));
  return parsed;
}
235
// Locale-taking variant of wcstoull. The locale argument is ignored; parsing
// is forwarded to wcstoull and its result returned.
//
// NOTE(review): `base` is declared size_t here while wcstoull takes an int;
// the narrowing below is what the implicit conversion already did. Confirm
// the intended parameter type against the public header.
unsigned long long wcstoull_l(const wchar_t* nptr, wchar_t** endptr,
                              size_t base, locale_t /* ignored */) {
  const unsigned long long parsed = wcstoull(nptr, endptr, static_cast<int>(base));
  return parsed;
}
240
// Locale-taking variant of wcstold. The locale argument is ignored; parsing
// is forwarded to wcstold and its result returned.
long double wcstold_l(const wchar_t* nptr, wchar_t** endptr, locale_t /* ignored */) {
  const long double parsed = wcstold(nptr, endptr);
  return parsed;
}