Chet Ramey | 495aee4 | 2011-11-22 19:11:26 -0500 | [diff] [blame] | 1 | /* unicode.c - functions to convert unicode characters */ |
| 2 | |
Chet Ramey | ac50fba | 2014-02-26 09:36:43 -0500 | [diff] [blame] | 3 | /* Copyright (C) 2010-2012 Free Software Foundation, Inc. |
Chet Ramey | 495aee4 | 2011-11-22 19:11:26 -0500 | [diff] [blame] | 4 | |
| 5 | This file is part of GNU Bash, the Bourne Again SHell. |
| 6 | |
| 7 | Bash is free software: you can redistribute it and/or modify |
| 8 | it under the terms of the GNU General Public License as published by |
| 9 | the Free Software Foundation, either version 3 of the License, or |
| 10 | (at your option) any later version. |
| 11 | |
| 12 | Bash is distributed in the hope that it will be useful, |
| 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 15 | GNU General Public License for more details. |
| 16 | |
| 17 | You should have received a copy of the GNU General Public License |
| 18 | along with Bash. If not, see <http://www.gnu.org/licenses/>. |
| 19 | */ |
| 20 | |
| 21 | #include <config.h> |
| 22 | |
| 23 | #if defined (HANDLE_MULTIBYTE) |
| 24 | |
#include <stdc.h>
#include <wchar.h>
#include <bashansi.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <stdio.h>
#include <limits.h>

#if HAVE_ICONV
#  include <iconv.h>
#endif

#if HAVE_NL_LANGINFO
#  include <langinfo.h>
#endif

#include <xmalloc.h>
| 39 | |
/* Largest value representable in an unsigned short.  Use the <limits.h>
   constant when it exists; otherwise compute it by complementing zero. */
#if !defined (USHORT_MAX)
#  if defined (USHRT_MAX)
#    define USHORT_MAX USHRT_MAX
#  else
#    define USHORT_MAX ((unsigned short) ~(unsigned short)0)
#  endif
#endif /* !USHORT_MAX */

/* String equality test that compares the first characters before paying for
   a strcmp call.  Both arguments are evaluated more than once, so they must
   not have side effects. */
#ifndef STREQ
#  define STREQ(a, b) ((a)[0] == (b)[0] && strcmp ((a), (b)) == 0)
#endif /* !STREQ */
| 51 | |
| 52 | #if defined (HAVE_LOCALE_CHARSET) |
| 53 | extern const char *locale_charset __P((void)); |
| 54 | #else |
| 55 | extern char *get_locale_var __P((char *)); |
| 56 | #endif |
| 57 | |
| 58 | static int u32init = 0; |
| 59 | static int utf8locale = 0; |
| 60 | #if defined (HAVE_ICONV) |
| 61 | static iconv_t localconv; |
| 62 | #endif |
| 63 | |
| 64 | #ifndef HAVE_LOCALE_CHARSET |
/* Holds the character set name handed back by stub_charset(). */
static char charsetbuf[40];

/* Replacement for locale_charset(): derive a character set name from the
   value of LC_CTYPE.  An unset or empty value yields "ASCII".  If the value
   contains a `.', the name is whatever follows the last one, with any
   `@modifier' removed; otherwise the whole value is used.  The name is
   truncated to fit charsetbuf, and a pointer to that static buffer is
   returned. */
static char *
stub_charset ()
{
  char *ctype, *dot, *modifier;
  const size_t last = sizeof (charsetbuf) - 1;

  ctype = get_locale_var ("LC_CTYPE");
  if (ctype == 0 || *ctype == '\0')
    return strcpy (charsetbuf, "ASCII");

  dot = strrchr (ctype, '.');

  /* strncpy does not terminate on truncation, so always store the NUL. */
  strncpy (charsetbuf, dot ? dot + 1 : ctype, last);
  charsetbuf[last] = '\0';

  /* A modifier is only stripped from an explicit codeset part. */
  if (dot)
    {
      modifier = strchr (charsetbuf, '@');
      if (modifier)
	*modifier = '\0';
    }

  return charsetbuf;
}
| 92 | #endif |
| 93 | |
Chet Ramey | ac50fba | 2014-02-26 09:36:43 -0500 | [diff] [blame] | 94 | void |
| 95 | u32reset () |
| 96 | { |
| 97 | #if defined (HAVE_ICONV) |
| 98 | if (u32init && localconv != (iconv_t)-1) |
| 99 | { |
| 100 | iconv_close (localconv); |
| 101 | localconv = (iconv_t)-1; |
| 102 | } |
| 103 | #endif |
| 104 | u32init = 0; |
| 105 | utf8locale = 0; |
| 106 | } |
| 107 | |
Chet Ramey | 495aee4 | 2011-11-22 19:11:26 -0500 | [diff] [blame] | 108 | /* u32toascii ? */ |
| 109 | int |
Chet Ramey | ac50fba | 2014-02-26 09:36:43 -0500 | [diff] [blame] | 110 | u32tochar (x, s) |
| 111 | unsigned long x; |
Chet Ramey | 495aee4 | 2011-11-22 19:11:26 -0500 | [diff] [blame] | 112 | char *s; |
| 113 | { |
Chet Ramey | 495aee4 | 2011-11-22 19:11:26 -0500 | [diff] [blame] | 114 | int l; |
| 115 | |
Chet Ramey | 495aee4 | 2011-11-22 19:11:26 -0500 | [diff] [blame] | 116 | l = (x <= UCHAR_MAX) ? 1 : ((x <= USHORT_MAX) ? 2 : 4); |
| 117 | |
| 118 | if (x <= UCHAR_MAX) |
| 119 | s[0] = x & 0xFF; |
| 120 | else if (x <= USHORT_MAX) /* assume unsigned short = 16 bits */ |
| 121 | { |
| 122 | s[0] = (x >> 8) & 0xFF; |
| 123 | s[1] = x & 0xFF; |
| 124 | } |
| 125 | else |
| 126 | { |
| 127 | s[0] = (x >> 24) & 0xFF; |
| 128 | s[1] = (x >> 16) & 0xFF; |
| 129 | s[2] = (x >> 8) & 0xFF; |
| 130 | s[3] = x & 0xFF; |
| 131 | } |
| 132 | s[l] = '\0'; |
| 133 | return l; |
| 134 | } |
| 135 | |
| 136 | int |
Chet Ramey | ac50fba | 2014-02-26 09:36:43 -0500 | [diff] [blame] | 137 | u32tocesc (wc, s) |
| 138 | u_bits32_t wc; |
Chet Ramey | 495aee4 | 2011-11-22 19:11:26 -0500 | [diff] [blame] | 139 | char *s; |
| 140 | { |
| 141 | int l; |
| 142 | |
Chet Ramey | ac50fba | 2014-02-26 09:36:43 -0500 | [diff] [blame] | 143 | if (wc < 0x10000) |
| 144 | l = sprintf (s, "\\u%04X", wc); |
| 145 | else |
| 146 | l = sprintf (s, "\\u%08X", wc); |
| 147 | return l; |
| 148 | } |
| 149 | |
| 150 | /* Convert unsigned 32-bit int to utf-8 character string */ |
| 151 | int |
| 152 | u32toutf8 (wc, s) |
| 153 | u_bits32_t wc; |
| 154 | char *s; |
| 155 | { |
| 156 | int l; |
Chet Ramey | 495aee4 | 2011-11-22 19:11:26 -0500 | [diff] [blame] | 157 | |
| 158 | if (wc < 0x0080) |
Chet Ramey | ac50fba | 2014-02-26 09:36:43 -0500 | [diff] [blame] | 159 | { |
| 160 | s[0] = (char)wc; |
| 161 | l = 1; |
| 162 | } |
Chet Ramey | 495aee4 | 2011-11-22 19:11:26 -0500 | [diff] [blame] | 163 | else if (wc < 0x0800) |
| 164 | { |
| 165 | s[0] = (wc >> 6) | 0xc0; |
| 166 | s[1] = (wc & 0x3f) | 0x80; |
Chet Ramey | ac50fba | 2014-02-26 09:36:43 -0500 | [diff] [blame] | 167 | l = 2; |
Chet Ramey | 495aee4 | 2011-11-22 19:11:26 -0500 | [diff] [blame] | 168 | } |
Chet Ramey | ac50fba | 2014-02-26 09:36:43 -0500 | [diff] [blame] | 169 | else if (wc < 0x10000) |
Chet Ramey | 495aee4 | 2011-11-22 19:11:26 -0500 | [diff] [blame] | 170 | { |
Chet Ramey | ac50fba | 2014-02-26 09:36:43 -0500 | [diff] [blame] | 171 | /* Technically, we could return 0 here if 0xd800 <= wc <= 0x0dfff */ |
Chet Ramey | 495aee4 | 2011-11-22 19:11:26 -0500 | [diff] [blame] | 172 | s[0] = (wc >> 12) | 0xe0; |
| 173 | s[1] = ((wc >> 6) & 0x3f) | 0x80; |
| 174 | s[2] = (wc & 0x3f) | 0x80; |
Chet Ramey | ac50fba | 2014-02-26 09:36:43 -0500 | [diff] [blame] | 175 | l = 3; |
Chet Ramey | 495aee4 | 2011-11-22 19:11:26 -0500 | [diff] [blame] | 176 | } |
Chet Ramey | ac50fba | 2014-02-26 09:36:43 -0500 | [diff] [blame] | 177 | else if (wc < 0x200000) |
| 178 | { |
| 179 | s[0] = (wc >> 18) | 0xf0; |
| 180 | s[1] = ((wc >> 12) & 0x3f) | 0x80; |
| 181 | s[2] = ((wc >> 6) & 0x3f) | 0x80; |
| 182 | s[3] = (wc & 0x3f) | 0x80; |
| 183 | l = 4; |
| 184 | } |
| 185 | /* Strictly speaking, UTF-8 doesn't have characters longer than 4 bytes */ |
| 186 | else if (wc < 0x04000000) |
| 187 | { |
| 188 | s[0] = (wc >> 24) | 0xf8; |
| 189 | s[1] = ((wc >> 18) & 0x3f) | 0x80; |
| 190 | s[2] = ((wc >> 12) & 0x3f) | 0x80; |
| 191 | s[3] = ((wc >> 6) & 0x3f) | 0x80; |
| 192 | s[4] = (wc & 0x3f) | 0x80; |
| 193 | l = 5; |
| 194 | } |
| 195 | else if (wc < 0x080000000) |
| 196 | { |
| 197 | s[0] = (wc >> 30) | 0xf8; |
| 198 | s[1] = ((wc >> 24) & 0x3f) | 0x80; |
| 199 | s[2] = ((wc >> 18) & 0x3f) | 0x80; |
| 200 | s[3] = ((wc >> 12) & 0x3f) | 0x80; |
| 201 | s[4] = ((wc >> 6) & 0x3f) | 0x80; |
| 202 | s[5] = (wc & 0x3f) | 0x80; |
| 203 | l = 6; |
| 204 | } |
| 205 | else |
| 206 | l = 0; |
| 207 | |
Chet Ramey | 495aee4 | 2011-11-22 19:11:26 -0500 | [diff] [blame] | 208 | s[l] = '\0'; |
| 209 | return l; |
| 210 | } |
| 211 | |
Chet Ramey | ac50fba | 2014-02-26 09:36:43 -0500 | [diff] [blame] | 212 | /* Convert a 32-bit unsigned int (unicode) to a UTF-16 string. Rarely used, |
| 213 | only if sizeof(wchar_t) == 2. */ |
| 214 | int |
| 215 | u32toutf16 (c, s) |
| 216 | u_bits32_t c; |
| 217 | unsigned short *s; |
| 218 | { |
| 219 | int l; |
| 220 | |
| 221 | l = 0; |
| 222 | if (c < 0x0d800) |
| 223 | { |
| 224 | s[0] = (unsigned short) (c & 0xFFFF); |
| 225 | l = 1; |
| 226 | } |
| 227 | else if (c >= 0x0e000 && c <= 0x010ffff) |
| 228 | { |
| 229 | c -= 0x010000; |
| 230 | s[0] = (unsigned short)((c >> 10) + 0xd800); |
| 231 | s[1] = (unsigned short)((c & 0x3ff) + 0xdc00); |
| 232 | l = 2; |
| 233 | } |
| 234 | s[l] = 0; |
| 235 | return l; |
| 236 | } |
| 237 | |
/* convert a single unicode-32 character into a multibyte string and put the
   result in S, which must be large enough (at least MB_LEN_MAX bytes).
   Returns the number of bytes stored in S.

   The conversion methods are tried in this order, depending on what the
   build provides:
     1. If wchar_t holds ISO 10646 values (__STDC_ISO_10646__), use wctomb,
	or u32toutf16 + wcstombs when wchar_t is 16 bits wide.  Note that
	wctomb does not store a terminating NUL, so callers should use the
	return value on this path.
     2. If nl_langinfo reports a UTF-8 codeset, encode with u32toutf8.
     3. With iconv, encode to UTF-8 and convert that to the locale's
	character set using a descriptor that is opened once and cached
	until u32reset() is called.  If no descriptor could be opened the
	UTF-8 encoding is returned; if the conversion itself fails, the
	result is the ISO C99 escape sequence from u32tocesc.
     4. Otherwise fall back to the ISO C99 escape sequence. */
int
u32cconv (c, s)
     unsigned long c;
     char *s;
{
  wchar_t wc;
  wchar_t ws[3];
  int n;
#if HAVE_NL_LANGINFO
  const char *codeset;
#endif
#if HAVE_ICONV
  const char *charset;
  char obuf[25], *optr;
  size_t obytesleft;
  const char *iptr;
  size_t sn;
#endif

#if __STDC_ISO_10646__
  wc = c;
  if (sizeof (wchar_t) == 4 && c <= 0x7fffffff)
    n = wctomb (s, wc);
  else if (sizeof (wchar_t) == 2 && c <= 0x10ffff && u32toutf16 (c, ws))
    n = wcstombs (s, ws, MB_LEN_MAX);
  else
    n = -1;
  if (n != -1)
    return n;
#endif

#if HAVE_NL_LANGINFO
  codeset = nl_langinfo (CODESET);
  if (STREQ (codeset, "UTF-8"))
    {
      n = u32toutf8 (c, s);
      return n;
    }
#endif

#if HAVE_ICONV
  /* this is mostly from coreutils-8.5/lib/unicodeio.c */
  if (u32init == 0)
    {
#  if HAVE_LOCALE_CHARSET
      charset = locale_charset ();	/* XXX - fix later */
#  else
      charset = stub_charset ();
#  endif
      if (STREQ (charset, "UTF-8"))
	utf8locale = 1;
      else
	{
	  localconv = iconv_open (charset, "UTF-8");
	  if (localconv == (iconv_t)-1)
	    /* We assume ASCII when presented with an unknown encoding. */
	    localconv = iconv_open ("ASCII", "UTF-8");
	}
      u32init = 1;
    }

  /* If we have a UTF-8 locale, convert to UTF-8 and return converted value. */
  n = u32toutf8 (c, s);
  if (utf8locale)
    return n;

  /* If the conversion is not supported, even the ASCII requested above, we
     bail now.  Currently we return the UTF-8 conversion.  We could return
     u32tocesc(). */
  if (localconv == (iconv_t)-1)
    return n;

  optr = obuf;
  /* Keep one byte in reserve so the NUL stored after the conversion cannot
     land past the end of obuf. */
  obytesleft = sizeof (obuf) - 1;
  iptr = s;
  sn = n;

  /* Put the descriptor back into its initial shift state. */
  iconv (localconv, NULL, NULL, NULL, NULL);

  if (iconv (localconv, (ICONV_CONST char **)&iptr, &sn, &optr, &obytesleft) == (size_t)-1)
    {
      /* You get ISO C99 escape sequences if iconv fails */
      n = u32tocesc (c, s);
      return n;
    }

  *optr = '\0';

  /* number of chars to be copied is optr - obuf if we want to do bounds
     checking */
  strcpy (s, obuf);
  return (optr - obuf);
#endif /* HAVE_ICONV */

  n = u32tocesc (c, s);	/* fallback is ISO C99 escape sequences */
  return n;
}
Chet Ramey | ac50fba | 2014-02-26 09:36:43 -0500 | [diff] [blame] | 338 | #else |
/* Built without HANDLE_MULTIBYTE: there is no conversion state to discard,
   so this does nothing. */
void
u32reset ()
{
  return;
}
Chet Ramey | 495aee4 | 2011-11-22 19:11:26 -0500 | [diff] [blame] | 343 | #endif /* HANDLE_MULTIBYTE */ |