blob: 798260ecdeae03259db3a5eed3ba6294237e5c7b [file] [log] [blame]
/* unicode.c - functions to convert unicode characters */

/* Copyright (C) 2010-2012 Free Software Foundation, Inc.

   This file is part of GNU Bash, the Bourne Again SHell.

   Bash is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   Bash is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with Bash.  If not, see <http://www.gnu.org/licenses/>.
*/
20
21#include <config.h>
22
23#if defined (HANDLE_MULTIBYTE)
24
#include <stdc.h>
#include <wchar.h>
#include <bashansi.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <stdio.h>
#include <limits.h>

#if HAVE_ICONV
#  include <iconv.h>
#endif

#if HAVE_NL_LANGINFO
#  include <langinfo.h>
#endif

#include <xmalloc.h>
39
/* Largest value an unsigned short can hold.  Use the <limits.h> spelling
   when it exists, otherwise compute it by complementing zero. */
#if !defined (USHORT_MAX)
#  if defined (USHRT_MAX)
#    define USHORT_MAX USHRT_MAX
#  else
#    define USHORT_MAX ((unsigned short) ~(unsigned short)0)
#  endif
#endif /* !USHORT_MAX */

/* Nonzero iff the strings A and B are equal.  The first characters are
   compared inline so strcmp is only called when they already match. */
#ifndef STREQ
#  define STREQ(a, b) ((a)[0] == (b)[0] && strcmp ((a), (b)) == 0)
#endif /* !STREQ */
51
/* Where the locale's character set name comes from: locale_charset() when
   HAVE_LOCALE_CHARSET is defined, otherwise the value of a locale variable
   fetched with get_locale_var(). */
#if !defined (HAVE_LOCALE_CHARSET)
extern char *get_locale_var __P((char *));
#else
extern const char *locale_charset __P((void));
#endif

/* Cached conversion state, filled in by u32cconv() and cleared by u32reset():
   localconv   iconv descriptor converting from UTF-8 to the locale's charset
   utf8locale  non-zero once the locale's charset has been found to be UTF-8
   u32init     non-zero once the two values above have been determined */
#if defined (HAVE_ICONV)
static iconv_t localconv;
#endif
static int utf8locale = 0;
static int u32init = 0;
63
#if !defined (HAVE_LOCALE_CHARSET)
/* Storage for the name returned by stub_charset(); overwritten on each call. */
static char charsetbuf[40];

/* Derive a character set name from the LC_CTYPE locale value and return it
   in charsetbuf.  An unset or empty value yields "ASCII".  If the value
   contains a `.', the text after the last one is used, with anything from
   the first `@' onwards removed; otherwise the whole value is returned.
   Names too long for charsetbuf are truncated. */
static char *
stub_charset ()
{
  char *lc_ctype, *dot, *modifier;
  char *name;

  lc_ctype = get_locale_var ("LC_CTYPE");
  if (lc_ctype == 0 || *lc_ctype == 0)
    {
      strcpy (charsetbuf, "ASCII");
      return charsetbuf;
    }

  /* The codeset, when present, follows the last `.' in the locale name. */
  dot = strrchr (lc_ctype, '.');
  name = dot ? dot + 1 : lc_ctype;

  strncpy (charsetbuf, name, sizeof (charsetbuf) - 1);
  charsetbuf[sizeof (charsetbuf) - 1] = '\0';

  /* Only a codeset taken from after the `.' has its @modifier removed. */
  if (dot)
    {
      modifier = strchr (charsetbuf, '@');
      if (modifier)
	*modifier = 0;
    }

  return charsetbuf;
}
#endif /* !HAVE_LOCALE_CHARSET */
93
Chet Rameyac50fba2014-02-26 09:36:43 -050094void
95u32reset ()
96{
97#if defined (HAVE_ICONV)
98 if (u32init && localconv != (iconv_t)-1)
99 {
100 iconv_close (localconv);
101 localconv = (iconv_t)-1;
102 }
103#endif
104 u32init = 0;
105 utf8locale = 0;
106}
107
Chet Ramey495aee42011-11-22 19:11:26 -0500108/* u32toascii ? */
109int
Chet Rameyac50fba2014-02-26 09:36:43 -0500110u32tochar (x, s)
111 unsigned long x;
Chet Ramey495aee42011-11-22 19:11:26 -0500112 char *s;
113{
Chet Ramey495aee42011-11-22 19:11:26 -0500114 int l;
115
Chet Ramey495aee42011-11-22 19:11:26 -0500116 l = (x <= UCHAR_MAX) ? 1 : ((x <= USHORT_MAX) ? 2 : 4);
117
118 if (x <= UCHAR_MAX)
119 s[0] = x & 0xFF;
120 else if (x <= USHORT_MAX) /* assume unsigned short = 16 bits */
121 {
122 s[0] = (x >> 8) & 0xFF;
123 s[1] = x & 0xFF;
124 }
125 else
126 {
127 s[0] = (x >> 24) & 0xFF;
128 s[1] = (x >> 16) & 0xFF;
129 s[2] = (x >> 8) & 0xFF;
130 s[3] = x & 0xFF;
131 }
132 s[l] = '\0';
133 return l;
134}
135
136int
Chet Rameyac50fba2014-02-26 09:36:43 -0500137u32tocesc (wc, s)
138 u_bits32_t wc;
Chet Ramey495aee42011-11-22 19:11:26 -0500139 char *s;
140{
141 int l;
142
Chet Rameyac50fba2014-02-26 09:36:43 -0500143 if (wc < 0x10000)
144 l = sprintf (s, "\\u%04X", wc);
145 else
146 l = sprintf (s, "\\u%08X", wc);
147 return l;
148}
149
150/* Convert unsigned 32-bit int to utf-8 character string */
151int
152u32toutf8 (wc, s)
153 u_bits32_t wc;
154 char *s;
155{
156 int l;
Chet Ramey495aee42011-11-22 19:11:26 -0500157
158 if (wc < 0x0080)
Chet Rameyac50fba2014-02-26 09:36:43 -0500159 {
160 s[0] = (char)wc;
161 l = 1;
162 }
Chet Ramey495aee42011-11-22 19:11:26 -0500163 else if (wc < 0x0800)
164 {
165 s[0] = (wc >> 6) | 0xc0;
166 s[1] = (wc & 0x3f) | 0x80;
Chet Rameyac50fba2014-02-26 09:36:43 -0500167 l = 2;
Chet Ramey495aee42011-11-22 19:11:26 -0500168 }
Chet Rameyac50fba2014-02-26 09:36:43 -0500169 else if (wc < 0x10000)
Chet Ramey495aee42011-11-22 19:11:26 -0500170 {
Chet Rameyac50fba2014-02-26 09:36:43 -0500171 /* Technically, we could return 0 here if 0xd800 <= wc <= 0x0dfff */
Chet Ramey495aee42011-11-22 19:11:26 -0500172 s[0] = (wc >> 12) | 0xe0;
173 s[1] = ((wc >> 6) & 0x3f) | 0x80;
174 s[2] = (wc & 0x3f) | 0x80;
Chet Rameyac50fba2014-02-26 09:36:43 -0500175 l = 3;
Chet Ramey495aee42011-11-22 19:11:26 -0500176 }
Chet Rameyac50fba2014-02-26 09:36:43 -0500177 else if (wc < 0x200000)
178 {
179 s[0] = (wc >> 18) | 0xf0;
180 s[1] = ((wc >> 12) & 0x3f) | 0x80;
181 s[2] = ((wc >> 6) & 0x3f) | 0x80;
182 s[3] = (wc & 0x3f) | 0x80;
183 l = 4;
184 }
185 /* Strictly speaking, UTF-8 doesn't have characters longer than 4 bytes */
186 else if (wc < 0x04000000)
187 {
188 s[0] = (wc >> 24) | 0xf8;
189 s[1] = ((wc >> 18) & 0x3f) | 0x80;
190 s[2] = ((wc >> 12) & 0x3f) | 0x80;
191 s[3] = ((wc >> 6) & 0x3f) | 0x80;
192 s[4] = (wc & 0x3f) | 0x80;
193 l = 5;
194 }
195 else if (wc < 0x080000000)
196 {
197 s[0] = (wc >> 30) | 0xf8;
198 s[1] = ((wc >> 24) & 0x3f) | 0x80;
199 s[2] = ((wc >> 18) & 0x3f) | 0x80;
200 s[3] = ((wc >> 12) & 0x3f) | 0x80;
201 s[4] = ((wc >> 6) & 0x3f) | 0x80;
202 s[5] = (wc & 0x3f) | 0x80;
203 l = 6;
204 }
205 else
206 l = 0;
207
Chet Ramey495aee42011-11-22 19:11:26 -0500208 s[l] = '\0';
209 return l;
210}
211
Chet Rameyac50fba2014-02-26 09:36:43 -0500212/* Convert a 32-bit unsigned int (unicode) to a UTF-16 string. Rarely used,
213 only if sizeof(wchar_t) == 2. */
214int
215u32toutf16 (c, s)
216 u_bits32_t c;
217 unsigned short *s;
218{
219 int l;
220
221 l = 0;
222 if (c < 0x0d800)
223 {
224 s[0] = (unsigned short) (c & 0xFFFF);
225 l = 1;
226 }
227 else if (c >= 0x0e000 && c <= 0x010ffff)
228 {
229 c -= 0x010000;
230 s[0] = (unsigned short)((c >> 10) + 0xd800);
231 s[1] = (unsigned short)((c & 0x3ff) + 0xdc00);
232 l = 2;
233 }
234 s[l] = 0;
235 return l;
236}
237
/* Convert a single unicode-32 character C into a multibyte string and put
   the result in S, which must be large enough (at least MB_LEN_MAX bytes).
   Returns the number of bytes stored in S.

   The methods are tried in this order, each one only if it is compiled in:
     1. if wchar_t holds ISO 10646 values, wctomb (4-byte wchar_t) or
	u32toutf16 + wcstombs (2-byte wchar_t);
     2. if nl_langinfo reports a UTF-8 codeset, u32toutf8;
     3. with iconv, u32toutf8 followed by a conversion from UTF-8 to the
	locale's character set, falling back to a \u/\U escape sequence if
	that conversion fails;
     4. without iconv, a \u/\U escape sequence from u32tocesc. */
int
u32cconv (c, s)
     unsigned long c;
     char *s;
{
  wchar_t wc;
  wchar_t ws[3];
  int n;
#if HAVE_NL_LANGINFO
  const char *codeset;
#endif
#if HAVE_ICONV
  const char *charset;
  char obuf[25], *optr;
  size_t obytesleft;
  const char *iptr;
  size_t sn;
#endif

#if __STDC_ISO_10646__
  wc = c;
  if (sizeof (wchar_t) == 4 && c <= 0x7fffffff)
    n = wctomb (s, wc);
  /* This branch is only taken when wchar_t is two bytes wide, so WS can
     stand in for the array of 16-bit units that u32toutf16 fills in. */
  else if (sizeof (wchar_t) == 2 && c <= 0x10ffff && u32toutf16 (c, (unsigned short *)ws))
    n = wcstombs (s, ws, MB_LEN_MAX);
  else
    n = -1;
  if (n != -1)
    return n;
#endif

#if HAVE_NL_LANGINFO
  codeset = nl_langinfo (CODESET);
  if (STREQ (codeset, "UTF-8"))
    {
      n = u32toutf8 (c, s);
      return n;
    }
#endif

#if HAVE_ICONV
  /* this is mostly from coreutils-8.5/lib/unicodeio.c */
  if (u32init == 0)
    {
      /* Start from a known "no descriptor" value, so that a UTF-8 locale,
	 which never calls iconv_open, does not leave localconv looking like
	 an open descriptor. */
      localconv = (iconv_t)-1;
# if HAVE_LOCALE_CHARSET
      charset = locale_charset ();	/* XXX - fix later */
# else
      charset = stub_charset ();
# endif
      if (STREQ (charset, "UTF-8"))
	utf8locale = 1;
      else
	{
	  localconv = iconv_open (charset, "UTF-8");
	  if (localconv == (iconv_t)-1)
	    /* We assume ASCII when presented with an unknown encoding. */
	    localconv = iconv_open ("ASCII", "UTF-8");
	}
      u32init = 1;
    }

  /* If we have a UTF-8 locale, convert to UTF-8 and return converted value. */
  n = u32toutf8 (c, s);
  if (utf8locale)
    return n;

  /* If the conversion is not supported, even the ASCII requested above, we
     bail now.  Currently we return the UTF-8 conversion.  We could return
     u32tocesc(). */
  if (localconv == (iconv_t)-1)
    return n;

  optr = obuf;
  /* Keep one byte of OBUF back for the NUL stored after the conversion. */
  obytesleft = sizeof (obuf) - 1;
  iptr = s;
  sn = n;

  /* Put the descriptor back into its initial conversion state. */
  iconv (localconv, NULL, NULL, NULL, NULL);

  if (iconv (localconv, (ICONV_CONST char **)&iptr, &sn, &optr, &obytesleft) == (size_t)-1)
    {
      /* You get ISO C99 escape sequences if iconv fails (returning N
	 unchanged here would give UTF-8 instead). */
      n = u32tocesc (c, s);
      return n;
    }

  *optr = '\0';

  /* number of chars to be copied is optr - obuf if we want to do bounds
     checking */
  strcpy (s, obuf);
  return (optr - obuf);
#else /* !HAVE_ICONV */
  n = u32tocesc (c, s);	/* fallback is ISO C99 escape sequences */
  return n;
#endif /* !HAVE_ICONV */
}
Chet Rameyac50fba2014-02-26 09:36:43 -0500338#else
/* Version used when HANDLE_MULTIBYTE is not defined: there is no cached
   conversion state in that configuration, so this does nothing. */
void
u32reset ()
{
  return;
}
Chet Ramey495aee42011-11-22 19:11:26 -0500343#endif /* HANDLE_MULTIBYTE */