| /* unicode.c - functions to convert unicode characters */ |
| |
| /* Copyright (C) 2010-2012 Free Software Foundation, Inc. |
| |
| This file is part of GNU Bash, the Bourne Again SHell. |
| |
| Bash is free software: you can redistribute it and/or modify |
| it under the terms of the GNU General Public License as published by |
| the Free Software Foundation, either version 3 of the License, or |
| (at your option) any later version. |
| |
| Bash is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| GNU General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with Bash. If not, see <http://www.gnu.org/licenses/>. |
| */ |
| |
| #include <config.h> |
| |
| #if defined (HANDLE_MULTIBYTE) |
| |
#include <stdc.h>
#include <wchar.h>
#include <bashansi.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <stdio.h>
#include <limits.h>

#if HAVE_NL_LANGINFO
# include <langinfo.h>
#endif

#if HAVE_ICONV
# include <iconv.h>
#endif

#include <xmalloc.h>
| |
/* Largest value representable in an unsigned short.  Use the <limits.h>
   USHRT_MAX when it is available; otherwise compute it by complementing
   zero in an unsigned short. */
#ifndef USHORT_MAX
# ifdef USHRT_MAX
# define USHORT_MAX USHRT_MAX
# else
# define USHORT_MAX ((unsigned short) ~(unsigned short)0)
# endif
#endif
| |
/* True if the strings A and B are equal.  The first characters are compared
   inline so that strcmp is only called when they already match.  Both
   arguments are evaluated more than once, so they must be free of side
   effects. */
#if !defined (STREQ)
# define STREQ(a, b) ((a)[0] == (b)[0] && strcmp ((a), (b)) == 0)
#endif /* !STREQ */
| |
| #if defined (HAVE_LOCALE_CHARSET) |
| extern const char *locale_charset __P((void)); |
| #else |
| extern char *get_locale_var __P((char *)); |
| #endif |
| |
/* Nonzero once u32cconv has looked up the locale's charset and set up the
   conversion state below; cleared again by u32reset. */
static int u32init = 0;
/* Nonzero if the charset found during that setup was "UTF-8". */
static int utf8locale = 0;
#if defined (HAVE_ICONV)
/* iconv descriptor converting from UTF-8 to the locale's charset.  It is
   only assigned by u32cconv for a non-UTF-8 charset; (iconv_t)-1 means no
   converter is available. */
static iconv_t localconv;
#endif
| |
#ifndef HAVE_LOCALE_CHARSET
/* Static storage for the name returned by stub_charset; longer names are
   truncated to fit. */
static char charsetbuf[40];

/* Replacement for locale_charset() when that function is not available.
   Derive a charset name from the string get_locale_var returns for
   "LC_CTYPE":
     - null or empty value: "ASCII";
     - value containing a '.': the text after the last '.', cut at the
       first '@' if there is one;
     - anything else: the value itself, unchanged.
   The result lives in charsetbuf and is overwritten by the next call. */
static char *
stub_charset ()
{
  char *ctype, *dot, *at;
  const char *name;

  ctype = get_locale_var ("LC_CTYPE");
  if (ctype == 0 || *ctype == '\0')
    return strcpy (charsetbuf, "ASCII");

  /* The charset, when present, follows the last '.' in the locale name. */
  dot = strrchr (ctype, '.');
  name = dot ? dot + 1 : ctype;

  strncpy (charsetbuf, name, sizeof (charsetbuf) - 1);
  charsetbuf[sizeof (charsetbuf) - 1] = '\0';

  /* A trailing "@modifier" is dropped only from a charset taken after a
     '.'; a bare locale name is returned as-is. */
  if (dot)
    {
      at = strchr (charsetbuf, '@');
      if (at)
	*at = '\0';
    }

  return charsetbuf;
}
#endif
| |
| void |
| u32reset () |
| { |
| #if defined (HAVE_ICONV) |
| if (u32init && localconv != (iconv_t)-1) |
| { |
| iconv_close (localconv); |
| localconv = (iconv_t)-1; |
| } |
| #endif |
| u32init = 0; |
| utf8locale = 0; |
| } |
| |
/* Store the value X in S as raw bytes, most significant byte first, followed
   by a terminating NUL.  One byte is stored if X fits in 8 bits, two bytes
   if it fits in 16 bits, and four bytes otherwise (only the low-order 32
   bits of X are stored).  S must have room for at least five bytes.  Returns
   the number of bytes stored, not counting the terminating NUL.  Note that
   the stored bytes can themselves be zero. */
int
u32tochar (x, s)
     unsigned long x;
     char *s;
{
  int l, i;

  /* Each case stores a fixed number of octets, so test against fixed 8- and
     16-bit limits rather than the platform's UCHAR_MAX/USHRT_MAX, which
     need not be 0xFF and 0xFFFF. */
  if (x <= 0xFFUL)
    l = 1;
  else if (x <= 0xFFFFUL)
    l = 2;
  else
    l = 4;

  for (i = 0; i < l; i++)
    s[i] = (char) ((x >> (8 * (l - 1 - i))) & 0xFF);

  s[l] = '\0';
  return l;
}
| |
| int |
| u32tocesc (wc, s) |
| u_bits32_t wc; |
| char *s; |
| { |
| int l; |
| |
| if (wc < 0x10000) |
| l = sprintf (s, "\\u%04X", wc); |
| else |
| l = sprintf (s, "\\u%08X", wc); |
| return l; |
| } |
| |
| /* Convert unsigned 32-bit int to utf-8 character string */ |
| int |
| u32toutf8 (wc, s) |
| u_bits32_t wc; |
| char *s; |
| { |
| int l; |
| |
| if (wc < 0x0080) |
| { |
| s[0] = (char)wc; |
| l = 1; |
| } |
| else if (wc < 0x0800) |
| { |
| s[0] = (wc >> 6) | 0xc0; |
| s[1] = (wc & 0x3f) | 0x80; |
| l = 2; |
| } |
| else if (wc < 0x10000) |
| { |
| /* Technically, we could return 0 here if 0xd800 <= wc <= 0x0dfff */ |
| s[0] = (wc >> 12) | 0xe0; |
| s[1] = ((wc >> 6) & 0x3f) | 0x80; |
| s[2] = (wc & 0x3f) | 0x80; |
| l = 3; |
| } |
| else if (wc < 0x200000) |
| { |
| s[0] = (wc >> 18) | 0xf0; |
| s[1] = ((wc >> 12) & 0x3f) | 0x80; |
| s[2] = ((wc >> 6) & 0x3f) | 0x80; |
| s[3] = (wc & 0x3f) | 0x80; |
| l = 4; |
| } |
| /* Strictly speaking, UTF-8 doesn't have characters longer than 4 bytes */ |
| else if (wc < 0x04000000) |
| { |
| s[0] = (wc >> 24) | 0xf8; |
| s[1] = ((wc >> 18) & 0x3f) | 0x80; |
| s[2] = ((wc >> 12) & 0x3f) | 0x80; |
| s[3] = ((wc >> 6) & 0x3f) | 0x80; |
| s[4] = (wc & 0x3f) | 0x80; |
| l = 5; |
| } |
| else if (wc < 0x080000000) |
| { |
| s[0] = (wc >> 30) | 0xf8; |
| s[1] = ((wc >> 24) & 0x3f) | 0x80; |
| s[2] = ((wc >> 18) & 0x3f) | 0x80; |
| s[3] = ((wc >> 12) & 0x3f) | 0x80; |
| s[4] = ((wc >> 6) & 0x3f) | 0x80; |
| s[5] = (wc & 0x3f) | 0x80; |
| l = 6; |
| } |
| else |
| l = 0; |
| |
| s[l] = '\0'; |
| return l; |
| } |
| |
| /* Convert a 32-bit unsigned int (unicode) to a UTF-16 string. Rarely used, |
| only if sizeof(wchar_t) == 2. */ |
| int |
| u32toutf16 (c, s) |
| u_bits32_t c; |
| unsigned short *s; |
| { |
| int l; |
| |
| l = 0; |
| if (c < 0x0d800) |
| { |
| s[0] = (unsigned short) (c & 0xFFFF); |
| l = 1; |
| } |
| else if (c >= 0x0e000 && c <= 0x010ffff) |
| { |
| c -= 0x010000; |
| s[0] = (unsigned short)((c >> 10) + 0xd800); |
| s[1] = (unsigned short)((c & 0x3ff) + 0xdc00); |
| l = 2; |
| } |
| s[l] = 0; |
| return l; |
| } |
| |
/* Convert the single Unicode (UCS-4) character C into a multibyte string in
   the current locale's encoding and put the result in S.  Returns the number
   of bytes stored.

   The conversion methods are tried in this order, depending on what the
   build provides:
     1. wctomb/wcstombs, if wchar_t holds ISO 10646 values
	(__STDC_ISO_10646__);
     2. direct UTF-8 encoding, if nl_langinfo reports a "UTF-8" codeset;
     3. iconv from UTF-8 to the locale's charset.  The UTF-8 encoding is
	returned unchanged if the charset is UTF-8 or no converter can be
	opened; if the conversion itself fails, the result is the escape
	sequence produced by u32tocesc;
     4. the u32tocesc escape sequence, if iconv is not available.

   S must be large enough for a converted character (at least MB_LEN_MAX
   bytes) and also for the output of u32tocesc and u32toutf8, which are
   NUL-terminated.  The wctomb path does not add a terminating NUL. */
int
u32cconv (c, s)
     unsigned long c;
     char *s;
{
  int n;
#if __STDC_ISO_10646__
  wchar_t wc;
  wchar_t ws[3];
#endif
#if HAVE_NL_LANGINFO
  const char *codeset;
#endif
#if HAVE_ICONV
  const char *charset;
  char obuf[25], *optr;
  size_t obytesleft;
  const char *iptr;
  size_t sn;
#endif

#if __STDC_ISO_10646__
  wc = c;
  if (sizeof (wchar_t) == 4 && c <= 0x7fffffff)
    n = wctomb (s, wc);
  /* The UTF-16 route is only taken when wchar_t is two bytes wide, so WS can
     be filled in as an array of 16-bit code units. */
  else if (sizeof (wchar_t) == 2 && c <= 0x10ffff && u32toutf16 (c, (unsigned short *)ws))
    n = wcstombs (s, ws, MB_LEN_MAX);
  else
    n = -1;
  if (n != -1)
    return n;
  /* Otherwise fall through and try the remaining methods. */
#endif

#if HAVE_NL_LANGINFO
  codeset = nl_langinfo (CODESET);
  if (STREQ (codeset, "UTF-8"))
    {
      n = u32toutf8 (c, s);
      return n;
    }
#endif

#if HAVE_ICONV
  /* this is mostly from coreutils-8.5/lib/unicodeio.c */
  if (u32init == 0)
    {
      /* Start from "no converter" so that localconv is well defined even
	 when the UTF-8 branch below never calls iconv_open. */
      localconv = (iconv_t)-1;
# if HAVE_LOCALE_CHARSET
      charset = locale_charset ();	/* XXX - fix later */
# else
      charset = stub_charset ();
# endif
      if (STREQ (charset, "UTF-8"))
	utf8locale = 1;
      else
	{
	  localconv = iconv_open (charset, "UTF-8");
	  if (localconv == (iconv_t)-1)
	    /* We assume ASCII when presented with an unknown encoding. */
	    localconv = iconv_open ("ASCII", "UTF-8");
	}
      u32init = 1;
    }

  /* If we have a UTF-8 locale, convert to UTF-8 and return converted value. */
  n = u32toutf8 (c, s);
  if (utf8locale)
    return n;

  /* If the conversion is not supported, even the ASCII requested above, we
     bail now.  Currently we return the UTF-8 conversion.  We could return
     u32tocesc(). */
  if (localconv == (iconv_t)-1)
    return n;

  optr = obuf;
  obytesleft = sizeof (obuf) - 1;	/* keep one byte for the NUL below */
  iptr = s;
  sn = n;

  /* Put the converter back into its initial shift state. */
  iconv (localconv, NULL, NULL, NULL, NULL);

  /* You get ISO C99 escape sequences if iconv fails */
  if (iconv (localconv, (ICONV_CONST char **)&iptr, &sn, &optr, &obytesleft) == (size_t)-1)
    return u32tocesc (c, s);

  *optr = '\0';

  /* number of chars to be copied is optr - obuf if we want to do bounds
     checking */
  strcpy (s, obuf);
  return (int)(optr - obuf);
#else /* !HAVE_ICONV */
  /* fallback is ISO C99 escape sequences */
  return u32tocesc (c, s);
#endif /* !HAVE_ICONV */
}
| #else |
/* Version of u32reset compiled when HANDLE_MULTIBYTE is not defined: this
   build keeps no conversion state, so there is nothing to reset. */
void
u32reset ()
{
}
| #endif /* HANDLE_MULTIBYTE */ |