src/chars.c - platform_external_nano - Gitiles

 /**************************************************************************
  *   chars.c  --  This file is part of GNU nano.                          *
  *                                                                        *
  *   Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,  *
  *   2010, 2011, 2013, 2014 Free Software Foundation, Inc.                *
  *   Copyright (C) 2016 Benno Schulenberg                                 *
  *                                                                        *
  *   GNU nano is free software: you can redistribute it and/or modify     *
  *   it under the terms of the GNU General Public License as published    *
  *   by the Free Software Foundation, either version 3 of the License,    *
  *   or (at your option) any later version.                               *
  *                                                                        *
  *   GNU nano is distributed in the hope that it will be useful,          *
  *   but WITHOUT ANY WARRANTY; without even the implied warranty          *
  *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.              *
  *   See the GNU General Public License for more details.                 *
  *                                                                        *
  *   You should have received a copy of the GNU General Public License    *
  *   along with this program.  If not, see http://www.gnu.org/licenses/.  *
  *                                                                        *
  **************************************************************************/

 #include "proto.h"

 #include <string.h>
 #include <ctype.h>

 #ifdef ENABLE_UTF8
 #ifdef HAVE_WCHAR_H
 #include <wchar.h>
 #endif
 #ifdef HAVE_WCTYPE_H
 #include <wctype.h>
 #endif

 /* Tracks whether UTF-8 handling has been switched on; it starts out off. */
 static bool use_utf8 = FALSE;

 /* Switch on UTF-8 handling.  The helpers in this file consult the flag
  * to choose between their multibyte and their single-byte code path. */
 void utf8_init(void)
 {
     use_utf8 = TRUE;
 }

 /* Report whether UTF-8 handling is currently switched on. */
 bool using_utf8(void)
 {
     return use_utf8;
 }
 #endif /* ENABLE_UTF8 */

 /* Append at most len2 bytes of str2 (fewer when str2 ends sooner) right
  * after the first len1 bytes of str1.  The first string is resized with
  * charealloc() to hold the result plus a terminating NUL, and the second
  * string is freed.  Return the resized first string. */
 char *addstrings(char* str1, size_t len1, char* str2, size_t len2)
 {
     char *joined = charealloc(str1, len1 + len2 + 1);

     /* Terminate at len1, so that strncat() appends exactly there. */
     joined[len1] = '\0';
     strncat(joined + len1, str2, len2);

     free(str2);

     return joined;
 }

 #ifndef HAVE_ISBLANK
 /* Stand-in for isblank(): accept whitespace, but reject any whitespace
  * other than a tab that is_cntrl_char() classifies as a control code. */
 bool nisblank(int c)
 {
     return isspace(c) && !(c != '\t' && is_cntrl_char(c));
 }
 #endif

 #if !defined(HAVE_ISWBLANK) && defined(ENABLE_UTF8)
 /* Stand-in for iswblank(): accept wide whitespace, but reject any
  * whitespace other than a tab that is_cntrl_wchar() classifies as a
  * control code. */
 bool niswblank(wchar_t wc)
 {
     return iswspace(wc) && !(wc != '\t' && is_cntrl_wchar(wc));
 }
 #endif

 /* Report whether c fits in a single byte, that is: whether converting
  * it to unsigned char leaves its value unchanged. */
 bool is_byte(int c)
 {
     const unsigned char low_byte = (unsigned char)c;

     return (low_byte == (unsigned int)c);
 }

 /* Put the conversion state of mbtowc() back to its initial state by
  * calling it with a null string; the call's result is ignored. */
 void mbtowc_reset(void)
 {
     IGNORE_CALL_RESULT(mbtowc(NULL, NULL, 0));
 }

 /* Multibyte-aware isalpha(): classify the character that starts at c.
  * In UTF-8 mode a sequence that cannot be decoded is not alphabetic. */
 bool is_alpha_mbchar(const char *c)
 {
     assert(c != NULL);

 #ifdef ENABLE_UTF8
     if (use_utf8) {
         wchar_t wide;
         const int decoded = mbtowc(&wide, c, MB_CUR_MAX);

         if (decoded >= 0)
             return iswalpha(wide);

         /* Decoding failed: clear the conversion state and say no. */
         mbtowc_reset();
         return 0;
     }
 #endif
     return isalpha((unsigned char)*c);
 }

 /* Multibyte-aware isalnum(): classify the character that starts at c.
  * In UTF-8 mode a sequence that cannot be decoded is not alphanumeric. */
 bool is_alnum_mbchar(const char *c)
 {
     assert(c != NULL);

 #ifdef ENABLE_UTF8
     if (use_utf8) {
         wchar_t wide;
         const int decoded = mbtowc(&wide, c, MB_CUR_MAX);

         if (decoded >= 0)
             return iswalnum(wide);

         /* Decoding failed: clear the conversion state and say no. */
         mbtowc_reset();
         return 0;
     }
 #endif
     return isalnum((unsigned char)*c);
 }

 /* Multibyte-aware isblank(): classify the character that starts at c.
  * In UTF-8 mode a sequence that cannot be decoded is not a blank. */
 bool is_blank_mbchar(const char *c)
 {
     assert(c != NULL);

 #ifdef ENABLE_UTF8
     if (use_utf8) {
         wchar_t wide;
         const int decoded = mbtowc(&wide, c, MB_CUR_MAX);

         if (decoded >= 0)
             return iswblank(wide);

         /* Decoding failed: clear the conversion state and say no. */
         mbtowc_reset();
         return 0;
     }
 #endif
     return isblank((unsigned char)*c);
 }

 /* A narrower iscntrl(): accept only the codes 0 through 31, so neither
  * DEL (127) nor any high-bit value counts as a control character. */
 bool is_ascii_cntrl_char(int c)
 {
     return (c >= 0 && c <= 31);
 }

 /* A wider iscntrl(): accept DEL (127) plus every value whose bits 5 and
  * 6 are both clear.  For byte values that is 0..31 and 128..159. */
 bool is_cntrl_char(int c)
 {
     return (c == 127 || (c & 0x60) == 0);
 }

 /* Multibyte-aware control-character test.  In UTF-8 mode accept a first
  * byte in 0..31, a first byte of 127, or the two-byte form whose first
  * byte is 0xC2 and whose second byte lies below 0xA0.  Otherwise defer
  * to is_cntrl_char() on the single byte. */
 bool is_cntrl_mbchar(const char *c)
 {
 #ifdef ENABLE_UTF8
     if (use_utf8) {
         /* Only with a 0xC2 lead byte does the second byte matter. */
         if ((signed char)c[0] == -62)
             return ((signed char)c[1] < -96);

         return ((c[0] & 0xE0) == 0 || c[0] == 127);
     }
 #endif
     return is_cntrl_char((unsigned char)*c);
 }

 /* Multibyte-aware ispunct(): classify the character that starts at c.
  * In UTF-8 mode a sequence that cannot be decoded is not punctuation. */
 bool is_punct_mbchar(const char *c)
 {
     assert(c != NULL);

 #ifdef ENABLE_UTF8
     if (use_utf8) {
         wchar_t wide;
         const int decoded = mbtowc(&wide, c, MB_CUR_MAX);

         if (decoded >= 0)
             return iswpunct(wide);

         /* Decoding failed: clear the conversion state and say no. */
         mbtowc_reset();
         return 0;
     }
 #endif
     return ispunct((unsigned char)*c);
 }

 /* Decide whether the multibyte character at c can be part of a word.
  * A NUL never can; an alphanumeric character always can.  For anything
  * else: when word_chars is a non-empty string, the answer is whether
  * the character's bytes occur in it; otherwise the character qualifies
  * only when it is punctuation and allow_punct is TRUE. */
 bool is_word_mbchar(const char *c, bool allow_punct)
 {
     if (*c == '\0')
         return FALSE;

     if (is_alnum_mbchar(c))
         return TRUE;

     if (word_chars == NULL || *word_chars == '\0')
         return (allow_punct && is_punct_mbchar(c));
     else {
         /* Copy the character out and terminate it, to get a string
          * that strstr() can look for. */
         char symbol[mb_cur_max() + 1];
         const int symlen = parse_mbchar(c, symbol, NULL);

         symbol[symlen] = '\0';

         return (strstr(word_chars, symbol) != NULL);
     }
 }

 /* Map control code c to the printable character that stands for it:
  * '?' for DEL_CODE, '=' for -97 (where adding 224 would yield 127),
  * c + 224 for the other negative values, and c + 64 for the rest. */
 char control_rep(const signed char c)
 {
     if (c == DEL_CODE)
         return '?';

     if (c == -97)
         return '=';

     return c + ((c < 0) ? 224 : 64);
 }

 /* Return the printable stand-in for the (multibyte) control character
  * at c.  A newline is shown as '@' when isdata or as_an_at is set,
  * since in data it encodes a NUL. */
 char control_mbrep(const char *c, bool isdata)
 {
     if (*c == '\n' && (isdata || as_an_at))
         return '@';

 #ifdef ENABLE_UTF8
     /* For a two-byte control character the second byte is what counts. */
     if (use_utf8 && (unsigned char)c[0] >= 128)
         return control_rep(c[1]);
 #endif
     return control_rep(c[0]);
 }

 /* Measure the (multibyte) character at c.  In UTF-8 mode: return -1 for
  * an undecodable byte sequence, the byte count minus 8 for a sequence
  * that decodes to an invalid codepoint, and otherwise the byte count,
  * after storing the column width in *width (1 when wcwidth() gives a
  * negative answer).  Outside UTF-8 mode: return 1.  Note that *width
  * is written only on the valid-codepoint path. */
 int length_of_char(const char *c, int *width)
 {
     assert(c != NULL);

 #ifdef ENABLE_UTF8
     if (use_utf8) {
         wchar_t wide;
         const int nbytes = mbtowc(&wide, c, MB_CUR_MAX);

         /* An invalid sequence... */
         if (nbytes < 0) {
             mbtowc_reset();
             return -1;
         }

         /* A well-formed sequence that encodes an invalid codepoint... */
         if (!is_valid_unicode(wide))
             return nbytes - 8;

         *width = wcwidth(wide);

         /* Assume that an unassigned codepoint is one column wide. */
         if (*width < 0)
             *width = 1;

         return nbytes;
     }
 #endif
     return 1;
 }

 /* Return the number of columns that the multibyte character at c takes
  * up.  The answer is 1 outside UTF-8 mode, for an undecodable sequence,
  * and whenever wcwidth() reports -1. */
 int mbwidth(const char *c)
 {
     assert(c != NULL);

 #ifdef ENABLE_UTF8
     if (use_utf8) {
         wchar_t wide;
         int columns;

         if (mbtowc(&wide, c, MB_CUR_MAX) < 0) {
             mbtowc_reset();
             return 1;
         }

         columns = wcwidth(wide);

         return (columns == -1) ? 1 : columns;
     }
 #endif
     return 1;
 }

 /* Return the largest number of bytes that one character can occupy:
  * MB_CUR_MAX in UTF-8 mode, and 1 otherwise. */
 int mb_cur_max(void)
 {
 #ifdef ENABLE_UTF8
     if (use_utf8)
         return MB_CUR_MAX;
 #endif
     return 1;
 }

 /* Convert the Unicode value in chr to a multibyte character, if possible.
  * In UTF-8 mode the result is a freshly allocated buffer of MB_CUR_MAX
  * bytes (not NUL-terminated) and *chr_mb_len is the number of bytes that
  * wctomb() produced, or zero when the conversion failed or the value is
  * not valid Unicode (the buffer contents are then undefined).  Outside
  * UTF-8 mode the result is a one-byte buffer holding the low-order byte
  * of chr, and *chr_mb_len is 1.  The caller owns the returned buffer. */
 char *make_mbchar(long chr, int *chr_mb_len)
 {
     char *chr_mb;

 #ifdef ENABLE_UTF8
     if (use_utf8) {
         chr_mb = charalloc(MB_CUR_MAX);
         *chr_mb_len = wctomb(chr_mb, (wchar_t)chr);

         /* Reject invalid Unicode characters. */
         if (*chr_mb_len < 0 || !is_valid_unicode((wchar_t)chr)) {
             /* Put wctomb() back into its initial state. */
             IGNORE_CALL_RESULT(wctomb(NULL, 0));
             *chr_mb_len = 0;
         }
     } else
 #endif
     {
         /* Convert by value instead of reading the first byte of the
          * long in memory: that byte is the low-order one only on
          * little-endian machines. */
         char byte = (char)chr;

         *chr_mb_len = 1;
         chr_mb = mallocstrncpy(NULL, &byte, 1);
     }

     return chr_mb;
 }

 /* Take one (multibyte) character from the start of buf and return how
  * many bytes it occupies.  When chr is not NULL, copy those bytes into
  * it (without adding a terminator).  When col is not NULL, advance it by
  * the number of columns the character takes up on screen. */
 int parse_mbchar(const char *buf, char *chr, size_t *col)
 {
     int length = 1;

     assert(buf != NULL);

 #ifdef ENABLE_UTF8
     if (use_utf8) {
         int i;

         length = mblen(buf, MB_CUR_MAX);

         /* Whenever mblen() does not report a positive length, reset
          * its state and consume just a single byte. */
         if (length <= 0) {
             IGNORE_CALL_RESULT(mblen(NULL, 0));
             length = 1;
         }

         /* Hand back the bytes of the character, if asked for. */
         for (i = 0; chr != NULL && i < length; i++)
             chr[i] = buf[i];

         if (col == NULL)
             return length;

         if (*buf == '\t')
             /* A tab runs up to the next tab stop. */
             *col += tabsize - *col % tabsize;
         else if (is_cntrl_mbchar(buf))
             /* A control character is shown as a caret plus a symbol. */
             *col += 2;
         else
             *col += mbwidth(buf);

         return length;
     }
 #endif

     /* Byte mode: every character is exactly one byte long. */
     if (chr != NULL)
         *chr = *buf;

     if (col != NULL) {
         if (*buf == '\t')
             /* A tab runs up to the next tab stop. */
             *col += tabsize - *col % tabsize;
         else if (is_cntrl_char((unsigned char)*buf))
             /* A control character is shown as a caret plus a symbol. */
             *col += 2;
         else
             (*col)++;
     }

     return length;
 }

 /* Return the index in buf where the multibyte character that precedes
  * position pos begins. */
 size_t move_mbleft(const char *buf, size_t pos)
 {
     size_t reach, scan, last_len = 0;

     assert(buf != NULL && pos <= strlen(buf));

     /* Stepping backward over a multibyte character cannot be done
      * directly.  So begin as far back as one character can reach (or
      * at the start of buf) and walk forward until pos is reached. */
     reach = mb_cur_max();
     scan = (reach > pos) ? 0 : pos - reach;

     while (scan < pos) {
         last_len = parse_mbchar(buf + scan, NULL, NULL);
         scan += last_len;
     }

     /* Undo the last step, to land on the start of that character. */
     return scan - last_len;
 }

 /* Return the index in buf just past the multibyte character that
  * starts at position pos. */
 size_t move_mbright(const char *buf, size_t pos)
 {
     const int char_len = parse_mbchar(buf + pos, NULL, NULL);

     return pos + char_len;
 }

 #ifndef HAVE_STRCASECMP
 /* Stand-in for strcasecmp(): a case-insensitive comparison with a
  * length limit of HIGHEST_POSITIVE. */
 int nstrcasecmp(const char *s1, const char *s2)
 {
     return strncasecmp(s1, s2, HIGHEST_POSITIVE);
 }
 #endif

 /* Multibyte-aware strcasecmp(): a case-insensitive comparison of at
  * most HIGHEST_POSITIVE characters, done by mbstrncasecmp(). */
 int mbstrcasecmp(const char *s1, const char *s2)
 {
     return mbstrncasecmp(s1, s2, HIGHEST_POSITIVE);
 }

 #ifndef HAVE_STRNCASECMP
 /* Stand-in for strncasecmp(): compare at most n bytes of s1 and s2
  * while ignoring case.  Return zero when they match, and otherwise the
  * difference between the first pair of lowercased bytes that differ.
  * Bytes are compared as unsigned char values, as POSIX prescribes. */
 int nstrncasecmp(const char *s1, const char *s2, size_t n)
 {
     if (s1 == s2)
         return 0;

     for (; n > 0; s1++, s2++, n--) {
         /* The <ctype.h> functions are only defined for EOF and for
          * values representable as unsigned char, so convert first. */
         const int c1 = tolower((unsigned char)*s1);
         const int c2 = tolower((unsigned char)*s2);

         /* Stop at the first difference, or where both strings end. */
         if (c1 != c2 || c1 == '\0')
             return c1 - c2;
     }

     return 0;
 }
 #endif

 /* Multibyte-aware strncasecmp(): compare at most n characters of s1 and
  * s2 while ignoring case.  In UTF-8 mode, characters are decoded and
  * compared via towlower(); when either one cannot be decoded, the raw
  * first bytes decide, and if those are equal, the string with the
  * undecodable sequence sorts last. */
 int mbstrncasecmp(const char *s1, const char *s2, size_t n)
 {
 #ifdef ENABLE_UTF8
     if (use_utf8) {
         wchar_t wc1, wc2;

         for (; *s1 != '\0' && *s2 != '\0' && n > 0; n--) {
             bool bad1 = FALSE, bad2 = FALSE;

             if (mbtowc(&wc1, s1, MB_CUR_MAX) < 0) {
                 mbtowc_reset();
                 bad1 = TRUE;
             }

             if (mbtowc(&wc2, s2, MB_CUR_MAX) < 0) {
                 mbtowc_reset();
                 bad2 = TRUE;
             }

             if (!bad1 && !bad2) {
                 int difference = towlower(wc1) - towlower(wc2);

                 if (difference != 0)
                     return difference;
             } else if (*s1 != *s2)
                 return (unsigned char)*s1 - (unsigned char)*s2;
             else if (bad1 != bad2)
                 return (bad1 ? 1 : -1);

             /* No difference yet: step to the next character in both. */
             s1 += move_mbright(s1, 0);
             s2 += move_mbright(s2, 0);
         }

         /* If the limit was not reached, at least one string has ended,
          * so the current bytes decide. */
         return (n > 0) ? ((unsigned char)*s1 - (unsigned char)*s2) : 0;
     }
 #endif
     return strncasecmp(s1, s2, n);
 }

 #ifndef HAVE_STRCASESTR
 /* Stand-in for strcasestr(): return a pointer to the first place in
  * haystack where needle occurs when case is ignored, or NULL when it
  * occurs nowhere.  An empty needle matches at the start. */
 char *nstrcasestr(const char *haystack, const char *needle)
 {
     const size_t needle_len = strlen(needle);
     const char *scan;

     if (needle_len == 0)
         return (char *)haystack;

     for (scan = haystack; *scan != '\0'; scan++) {
         if (strncasecmp(scan, needle, needle_len) == 0)
             return (char *)scan;
     }

     return NULL;
 }
 #endif

 /* Multibyte-aware strcasestr(): return a pointer to the first place in
  * haystack where needle occurs when case is ignored, or NULL when it
  * occurs nowhere.  In UTF-8 mode the search advances one whole
  * character at a time. */
 char *mbstrcasestr(const char *haystack, const char *needle)
 {
 #ifdef ENABLE_UTF8
     if (use_utf8) {
         size_t needle_len;

         if (*needle == '\0')
             return (char *)haystack;

         needle_len = mbstrlen(needle);

         for (; *haystack != '\0'; haystack += move_mbright(haystack, 0)) {
             if (mbstrncasecmp(haystack, needle, needle_len) == 0)
                 return (char *)haystack;
         }

         return NULL;
     }
 #endif
     return (char *) strcasestr(haystack, needle);
 }

 /* This function is equivalent to strstr(), except in that it scans the
  * string in reverse, starting at pointer (which must point into
  * haystack).  Return the last match that begins at or before pointer,
  * or NULL when there is none.  An empty needle matches at pointer.  When
  * fewer bytes than the needle is long follow pointer, the scan starts at
  * the last position where the needle could still fit. */
 char *revstrstr(const char *haystack, const char *needle,
 	const char *pointer)
 {
     size_t needle_len = strlen(needle);
     size_t tail_len = strlen(pointer);
     size_t offset;

     if (needle_len == 0)
         return (char *)pointer;

     if (strlen(haystack) < needle_len)
         return NULL;

     /* Scan with an index instead of a pointer, so that nothing ever
      * points before the start of haystack. */
     offset = pointer - haystack;

     if (tail_len < needle_len) {
         const size_t shortfall = needle_len - tail_len;

         if (offset < shortfall)
             return NULL;

         offset -= shortfall;
     }

     for (;;) {
         if (strncmp(haystack + offset, needle, needle_len) == 0)
             return (char *)(haystack + offset);

         /* Having tried the very start, there is nothing left. */
         if (offset == 0)
             return NULL;

         offset--;
     }
 }

 /* This function is equivalent to strcasestr(), except in that it scans
  * the string in reverse, starting at rev_start (which must point into
  * haystack).  Return the last case-insensitive match that begins at or
  * before rev_start, or NULL when there is none.  An empty needle
  * matches at rev_start. */
 char *revstrcasestr(const char *haystack, const char *needle, const char
 	*rev_start)
 {
     size_t rev_start_len, needle_len, offset;

     if (*needle == '\0')
         return (char *)rev_start;

     needle_len = strlen(needle);

     if (strlen(haystack) < needle_len)
         return NULL;

     rev_start_len = strlen(rev_start);

     /* Scan with an index instead of a pointer, so that nothing ever
      * points before the start of haystack. */
     offset = rev_start - haystack;

     for (;;) {
         /* Compare only where the needle fits in what remains. */
         if (rev_start_len >= needle_len && strncasecmp(haystack + offset,
                 needle, needle_len) == 0)
             return (char *)(haystack + offset);

         /* Having tried the very start, there is nothing left. */
         if (offset == 0)
             return NULL;

         offset--;
         rev_start_len++;
     }
 }

 /* Multibyte-aware reverse strcasestr(): look for needle in haystack
  * while ignoring case, trying first at rev_start and then at each
  * earlier character, and return the first match found that way, or
  * NULL when there is none.  An empty needle matches at rev_start. */
 char *mbrevstrcasestr(const char *haystack, const char *needle, const
 	char *rev_start)
 {
 #ifdef ENABLE_UTF8
     if (use_utf8) {
         size_t tail_len, needle_len;

         if (*needle == '\0')
             return (char *)rev_start;

         needle_len = mbstrlen(needle);

         if (mbstrlen(haystack) < needle_len)
             return NULL;

         /* The number of characters from the current spot to the end;
          * it grows by one with every step to the left. */
         for (tail_len = mbstrlen(rev_start); ; tail_len++) {
             if (tail_len >= needle_len &&
                     mbstrncasecmp(rev_start, needle, needle_len) == 0)
                 return (char *)rev_start;

             /* Having tried the head of the haystack, give up. */
             if (rev_start == haystack)
                 return NULL;

             rev_start = haystack + move_mbleft(haystack, rev_start - haystack);
         }
     }
 #endif
     return revstrcasestr(haystack, needle, rev_start);
 }

 /* Multibyte-aware strlen(): count the characters in s, by letting
  * mbstrnlen() count with the largest possible limit. */
 size_t mbstrlen(const char *s)
 {
     return mbstrnlen(s, (size_t)-1);
 }

 #ifndef HAVE_STRNLEN
 /* Stand-in for strnlen(): return the length of s, but never more than
  * maxlen. */
 size_t nstrnlen(const char *s, size_t maxlen)
 {
     const char *end = s;

     while (*end != '\0' && maxlen > 0) {
         end++;
         maxlen--;
     }

     return (size_t)(end - s);
 }
 #endif

 /* Multibyte-aware strnlen(): count the characters (not the bytes) in s,
  * but stop counting once maxlen is reached. */
 size_t mbstrnlen(const char *s, size_t maxlen)
 {
 #ifdef ENABLE_UTF8
     if (use_utf8) {
         size_t count = 0;

         while (*s != '\0' && maxlen > 0) {
             s += move_mbright(s, 0);
             maxlen--;
             count++;
         }

         return count;
     }
 #endif
     return strnlen(s, maxlen);
 }

 #if !defined(NANO_TINY) || !defined(DISABLE_JUSTIFY)
 /* This function is equivalent to strchr() for multibyte strings: return
  * a pointer to the first occurrence in s of the multibyte character
  * that c starts with, or NULL when it does not occur.  An undecodable
  * sequence is treated as its first byte, and such a byte can only match
  * another undecodable byte, never a validly encoded character. */
 char *mbstrchr(const char *s, const char *c)
 {
     assert(s != NULL && c != NULL);

 #ifdef ENABLE_UTF8
     if (use_utf8) {
         bool bad_c_mb = FALSE;
         wchar_t ws, wc;

         if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
             mbtowc_reset();
             wc = (unsigned char)*c;
             bad_c_mb = TRUE;
         }

         while (*s != '\0') {
             /* Validity is judged afresh for each character: one bad
              * sequence must not taint the characters that follow it. */
             bool bad_s_mb = FALSE;
             int sym_len = parse_mbchar(s, NULL, NULL);

             if (mbtowc(&ws, s, sym_len) < 0) {
                 mbtowc_reset();
                 ws = (unsigned char)*s;
                 bad_s_mb = TRUE;
             }

             if (bad_s_mb == bad_c_mb && ws == wc)
                 return (char *)s;

             s += sym_len;
         }

         return NULL;
     }
 #endif
     return (char *) strchr(s, *c);
 }
 #endif /* !NANO_TINY || !DISABLE_JUSTIFY */

 #ifndef NANO_TINY
 /* Multibyte-aware strpbrk(): return a pointer to the first character
  * in s that also occurs in accept, or NULL when there is none. */
 char *mbstrpbrk(const char *s, const char *accept)
 {
 #ifdef ENABLE_UTF8
     if (use_utf8) {
         while (*s != '\0') {
             if (mbstrchr(accept, s) != NULL)
                 return (char *)s;

             s += move_mbright(s, 0);
         }

         return NULL;
     }
 #endif
     return (char *) strpbrk(s, accept);
 }

 /* This function is equivalent to strpbrk(), except in that it scans the
  * string in reverse, starting at rev_start (which must point into s).
  * Return the last byte at or before rev_start that occurs in accept, or
  * NULL when there is none.  When rev_start is at the terminating NUL,
  * the scan begins one byte earlier. */
 char *revstrpbrk(const char *s, const char *accept, const char
 	*rev_start)
 {
     size_t offset;

     assert(s != NULL && accept != NULL && rev_start != NULL);

     /* Scan with an index instead of a pointer, so that nothing ever
      * points before the start of s. */
     offset = rev_start - s;

     if (*rev_start == '\0') {
         if (offset == 0)
             return NULL;
         offset--;
     }

     for (;;) {
         if (strchr(accept, s[offset]) != NULL)
             return (char *)(s + offset);

         /* Having tried the very start, there is nothing left. */
         if (offset == 0)
             return NULL;

         offset--;
     }
 }

 /* Multibyte-aware reverse strpbrk(): starting at rev_start and moving
  * toward the start of s, return a pointer to the first character met
  * that occurs in accept, or NULL when there is none.  When rev_start
  * is at the terminating NUL, the scan begins one character earlier. */
 char *mbrevstrpbrk(const char *s, const char *accept, const char
 	*rev_start)
 {
     assert(s != NULL && accept != NULL && rev_start != NULL);

 #ifdef ENABLE_UTF8
     if (use_utf8) {
         if (*rev_start == '\0') {
             if (rev_start == s)
                 return NULL;
             rev_start = s + move_mbleft(s, rev_start - s);
         }

         for (;;) {
             if (mbstrchr(accept, rev_start) != NULL)
                 return (char *)rev_start;

             /* Having tried the head of the string, give up. */
             if (rev_start == s)
                 return NULL;

             rev_start = s + move_mbleft(s, rev_start - s);
         }
     }
 #endif
     return revstrpbrk(s, accept, rev_start);
 }
 #endif /* !NANO_TINY */

 #if !defined(DISABLE_NANORC) && (!defined(NANO_TINY) || !defined(DISABLE_JUSTIFY))
 /* Return TRUE if the string s contains one or more blank characters,
  * and FALSE otherwise.  An empty string contains none. */
 bool has_blank_chars(const char *s)
 {
     /* The <ctype.h> functions are only defined for EOF and for values
      * representable as unsigned char, so convert each byte first. */
     while (*s != '\0' && !isblank((unsigned char)*s))
         s++;

     /* Stopping before the terminator means that a blank was found. */
     return (*s != '\0');
 }

 /* Return TRUE if the multibyte string s contains one or more blank
  * multibyte characters, and FALSE otherwise. */
 bool has_blank_mbchars(const char *s)
 {
 #ifdef ENABLE_UTF8
     if (use_utf8) {
         /* Classify each character in place.  The string itself is
          * NUL-terminated, whereas a bare copy of one character would
          * let the classifier read bytes beyond the copy. */
         for (; *s != '\0'; s += move_mbright(s, 0)) {
             if (is_blank_mbchar(s))
                 return TRUE;
         }

         return FALSE;
     }
 #endif
     return has_blank_chars(s);
 }
 #endif /* !DISABLE_NANORC && (!NANO_TINY || !DISABLE_JUSTIFY) */

 #ifdef ENABLE_UTF8
 /* Report whether wc is acceptable as a Unicode codepoint.  Rejected are
  * negative values, D800..DFFF, FDD0..FDEF, FFFE, FFFF, anything above
  * 10FFFF, and higher values whose low sixteen bits are FFFE or FFFF. */
 bool is_valid_unicode(wchar_t wc)
 {
     const bool below_gap = (wc >= 0 && wc <= 0xD7FF);
     const bool above_gap = (wc >= 0xE000 && wc <= 0xFDCF);
     const bool rest_of_plane = (wc >= 0xFDF0 && wc <= 0xFFFD);
     const bool higher_plane = (wc > 0xFFFF && wc <= 0x10FFFF &&
                                 (wc & 0xFFFF) <= 0xFFFD);

     return (below_gap || above_gap || rest_of_plane || higher_plane);
 }
 #endif

 #ifndef DISABLE_NANORC
 /* Report whether s is a valid multibyte string.  In UTF-8 mode that is
  * so when mbstowcs() can measure it without reporting an error; in
  * byte mode every string is valid. */
 bool is_valid_mbstring(const char *s)
 {
 #ifdef ENABLE_UTF8
     if (use_utf8) {
         const size_t wide_len = mbstowcs(NULL, s, 0);

         return (wide_len != (size_t)-1);
     }
 #endif
     return TRUE;
 }
 #endif /* !DISABLE_NANORC */
	/**************************************************************************
	* chars.c -- This file is part of GNU nano. *
	* *
	* Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, *
	* 2010, 2011, 2013, 2014 Free Software Foundation, Inc. *
	* Copyright (C) 2016 Benno Schulenberg *
	* *
	* GNU nano is free software: you can redistribute it and/or modify *
	* it under the terms of the GNU General Public License as published *
	* by the Free Software Foundation, either version 3 of the License, *
	* or (at your option) any later version. *
	* *
	* GNU nano is distributed in the hope that it will be useful, *
	* but WITHOUT ANY WARRANTY; without even the implied warranty *
	* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
	* See the GNU General Public License for more details. *
	* *
	* You should have received a copy of the GNU General Public License *
	* along with this program. If not, see http://www.gnu.org/licenses/. *
	* *
	**************************************************************************/

	#include "proto.h"

	#include <string.h>
	#include <ctype.h>

	#ifdef ENABLE_UTF8
	#ifdef HAVE_WCHAR_H
	#include <wchar.h>
	#endif
	#ifdef HAVE_WCTYPE_H
	#include <wctype.h>
	#endif

	/* Tracks whether UTF-8 handling has been switched on; it starts out off. */
	static bool use_utf8 = FALSE;

	/* Switch on UTF-8 handling.  The helpers in this file consult the flag
	 * to choose between their multibyte and their single-byte code path. */
	void utf8_init(void)
	{
	    use_utf8 = TRUE;
	}

	/* Report whether UTF-8 handling is currently switched on. */
	bool using_utf8(void)
	{
	    return use_utf8;
	}
	#endif /* ENABLE_UTF8 */

	/* Append at most len2 bytes of str2 (fewer when str2 ends sooner) right
	 * after the first len1 bytes of str1.  The first string is resized with
	 * charealloc() to hold the result plus a terminating NUL, and the second
	 * string is freed.  Return the resized first string. */
	char *addstrings(char *str1, size_t len1, char *str2, size_t len2)
	{
	    str1 = charealloc(str1, len1 + len2 + 1);

	    /* Terminate at len1, so that strncat() appends exactly there. */
	    str1[len1] = '\0';
	    strncat(&str1[len1], str2, len2);

	    free(str2);

	    return str1;
	}

	#ifndef HAVE_ISBLANK
	/* Stand-in for isblank(): accept a tab, and any other whitespace that
	 * is_cntrl_char() does not classify as a control code. */
	bool nisblank(int c)
	{
	    return isspace(c) && (c == '\t' || !is_cntrl_char(c));
	}
	#endif

	#if !defined(HAVE_ISWBLANK) && defined(ENABLE_UTF8)
	/* Stand-in for iswblank(): accept a tab, and any other wide whitespace
	 * that is_cntrl_wchar() does not classify as a control code. */
	bool niswblank(wchar_t wc)
	{
	    return iswspace(wc) && (wc == '\t' || !is_cntrl_wchar(wc));
	}
	#endif

	/* Report whether c fits in a single byte, that is: whether converting
	 * it to unsigned char leaves its value unchanged. */
	bool is_byte(int c)
	{
	    const unsigned char low_byte = (unsigned char)c;

	    return (low_byte == (unsigned int)c);
	}

	/* Put the conversion state of mbtowc() back to its initial state by
	 * calling it with a null string; the call's result is ignored. */
	void mbtowc_reset(void)
	{
	    IGNORE_CALL_RESULT(mbtowc(NULL, NULL, 0));
	}

	/* Multibyte-aware isalpha(): classify the character that starts at c.
	 * In UTF-8 mode a sequence that cannot be decoded is not alphabetic. */
	bool is_alpha_mbchar(const char *c)
	{
	    assert(c != NULL);

	#ifdef ENABLE_UTF8
	    if (use_utf8) {
	        wchar_t wide;
	        const int decoded = mbtowc(&wide, c, MB_CUR_MAX);

	        if (decoded >= 0)
	            return iswalpha(wide);

	        /* Decoding failed: clear the conversion state and say no. */
	        mbtowc_reset();
	        return 0;
	    }
	#endif
	    return isalpha((unsigned char)*c);
	}

	/* Multibyte-aware isalnum(): classify the character that starts at c.
	 * In UTF-8 mode a sequence that cannot be decoded is not alphanumeric. */
	bool is_alnum_mbchar(const char *c)
	{
	    assert(c != NULL);

	#ifdef ENABLE_UTF8
	    if (use_utf8) {
	        wchar_t wide;
	        const int decoded = mbtowc(&wide, c, MB_CUR_MAX);

	        if (decoded >= 0)
	            return iswalnum(wide);

	        /* Decoding failed: clear the conversion state and say no. */
	        mbtowc_reset();
	        return 0;
	    }
	#endif
	    return isalnum((unsigned char)*c);
	}

	/* Multibyte-aware isblank(): classify the character that starts at c.
	 * In UTF-8 mode a sequence that cannot be decoded is not a blank. */
	bool is_blank_mbchar(const char *c)
	{
	    assert(c != NULL);

	#ifdef ENABLE_UTF8
	    if (use_utf8) {
	        wchar_t wide;
	        const int decoded = mbtowc(&wide, c, MB_CUR_MAX);

	        if (decoded >= 0)
	            return iswblank(wide);

	        /* Decoding failed: clear the conversion state and say no. */
	        mbtowc_reset();
	        return 0;
	    }
	#endif
	    return isblank((unsigned char)*c);
	}

	/* A narrower iscntrl(): accept only the codes 0 through 31, so neither
	 * DEL (127) nor any high-bit value counts as a control character. */
	bool is_ascii_cntrl_char(int c)
	{
	    return (c >= 0 && c <= 31);
	}

	/* A wider iscntrl(): accept DEL (127) plus every value whose bits 5 and
	 * 6 are both clear.  For byte values that is 0..31 and 128..159. */
	bool is_cntrl_char(int c)
	{
	    return ((c & 0x60) == 0 || c == 127);
	}

	/* Multibyte-aware control-character test.  In UTF-8 mode accept a first
	 * byte in 0..31, a first byte of 127, or the two-byte form whose first
	 * byte is 0xC2 and whose second byte lies below 0xA0.  Otherwise defer
	 * to is_cntrl_char() on the single byte. */
	bool is_cntrl_mbchar(const char *c)
	{
	#ifdef ENABLE_UTF8
	    if (use_utf8) {
	        return ((c[0] & 0xE0) == 0 || c[0] == 127 ||
	                ((signed char)c[0] == -62 && (signed char)c[1] < -96));
	    } else
	#endif
	        return is_cntrl_char((unsigned char)*c);
	}

	/* Multibyte-aware ispunct(): classify the character that starts at c.
	 * In UTF-8 mode a sequence that cannot be decoded is not punctuation. */
	bool is_punct_mbchar(const char *c)
	{
	    assert(c != NULL);

	#ifdef ENABLE_UTF8
	    if (use_utf8) {
	        wchar_t wide;
	        const int decoded = mbtowc(&wide, c, MB_CUR_MAX);

	        if (decoded >= 0)
	            return iswpunct(wide);

	        /* Decoding failed: clear the conversion state and say no. */
	        mbtowc_reset();
	        return 0;
	    }
	#endif
	    return ispunct((unsigned char)*c);
	}

	/* Decide whether the multibyte character at c can be part of a word.
	 * A NUL never can; an alphanumeric character always can.  For anything
	 * else: when word_chars is a non-empty string, the answer is whether
	 * the character's bytes occur in it; otherwise the character qualifies
	 * only when it is punctuation and allow_punct is TRUE. */
	bool is_word_mbchar(const char *c, bool allow_punct)
	{
	    if (*c == '\0')
	        return FALSE;

	    if (is_alnum_mbchar(c))
	        return TRUE;

	    if (word_chars == NULL || *word_chars == '\0')
	        return (allow_punct && is_punct_mbchar(c));
	    else {
	        /* Copy the character out and terminate it, to get a string
	         * that strstr() can look for. */
	        char symbol[mb_cur_max() + 1];
	        const int symlen = parse_mbchar(c, symbol, NULL);

	        symbol[symlen] = '\0';

	        return (strstr(word_chars, symbol) != NULL);
	    }
	}

	/* Map control code c to the printable character that stands for it:
	 * '?' for DEL_CODE, '=' for -97 (where adding 224 would yield 127),
	 * c + 224 for the other negative values, and c + 64 for the rest. */
	char control_rep(const signed char c)
	{
	    if (c == DEL_CODE)
	        return '?';

	    if (c == -97)
	        return '=';

	    return c + ((c < 0) ? 224 : 64);
	}

	/* Return the visible representation of multibyte control character c. */
	char control_mbrep(const char *c, bool isdata)
	{
	/* An embedded newline is an encoded NUL if it is data. */
	if (*c == '\n' && (isdata \|\| as_an_at))
	return '@';

	#ifdef ENABLE_UTF8
	if (use_utf8) {
	if ((unsigned char)c[0] < 128)
	return control_rep(c[0]);
	else
	return control_rep(c[1]);
	} else
	#endif
	return control_rep(*c);
	}

	/* Measure the (multibyte) character at c.  In UTF-8 mode: return -1 for
	 * an undecodable byte sequence, the byte count minus 8 for a sequence
	 * that decodes to an invalid codepoint, and otherwise the byte count,
	 * after storing the column width in *width (1 when wcwidth() gives a
	 * negative answer).  Outside UTF-8 mode: return 1.  Note that *width
	 * is written only on the valid-codepoint path. */
	int length_of_char(const char *c, int *width)
	{
	    assert(c != NULL);

	#ifdef ENABLE_UTF8
	    if (use_utf8) {
	        wchar_t wc;
	        int charlen = mbtowc(&wc, c, MB_CUR_MAX);

	        /* If the sequence is invalid... */
	        if (charlen < 0) {
	            mbtowc_reset();
	            return -1;
	        }

	        /* If the codepoint is invalid... */
	        if (!is_valid_unicode(wc))
	            return charlen - 8;
	        else {
	            *width = wcwidth(wc);
	            /* If the codepoint is unassigned, assume a width of one. */
	            if (*width < 0)
	                *width = 1;
	            return charlen;
	        }
	    } else
	#endif
	        return 1;
	}

	/* Return the number of columns that the multibyte character at c takes
	 * up.  The answer is 1 outside UTF-8 mode, for an undecodable sequence,
	 * and whenever wcwidth() reports -1. */
	int mbwidth(const char *c)
	{
	    assert(c != NULL);

	#ifdef ENABLE_UTF8
	    if (use_utf8) {
	        wchar_t wide;
	        int columns;

	        if (mbtowc(&wide, c, MB_CUR_MAX) < 0) {
	            mbtowc_reset();
	            return 1;
	        }

	        columns = wcwidth(wide);

	        return (columns == -1) ? 1 : columns;
	    }
	#endif
	    return 1;
	}

	/* Return the largest number of bytes that one character can occupy:
	 * MB_CUR_MAX in UTF-8 mode, and 1 otherwise. */
	int mb_cur_max(void)
	{
	#ifdef ENABLE_UTF8
	    if (use_utf8)
	        return MB_CUR_MAX;
	#endif
	    return 1;
	}

	/* Convert the Unicode value in chr to a multibyte character, if possible.
	 * In UTF-8 mode the result is a freshly allocated buffer of MB_CUR_MAX
	 * bytes (not NUL-terminated) and *chr_mb_len is the number of bytes that
	 * wctomb() produced, or zero when the conversion failed or the value is
	 * not valid Unicode (the buffer contents are then undefined).  Outside
	 * UTF-8 mode the result is a one-byte buffer holding the low-order byte
	 * of chr, and *chr_mb_len is 1.  The caller owns the returned buffer. */
	char *make_mbchar(long chr, int *chr_mb_len)
	{
	    char *chr_mb;

	#ifdef ENABLE_UTF8
	    if (use_utf8) {
	        chr_mb = charalloc(MB_CUR_MAX);
	        *chr_mb_len = wctomb(chr_mb, (wchar_t)chr);

	        /* Reject invalid Unicode characters. */
	        if (*chr_mb_len < 0 || !is_valid_unicode((wchar_t)chr)) {
	            /* Put wctomb() back into its initial state. */
	            IGNORE_CALL_RESULT(wctomb(NULL, 0));
	            *chr_mb_len = 0;
	        }
	    } else
	#endif
	    {
	        /* Convert by value instead of reading the first byte of the
	         * long in memory: that byte is the low-order one only on
	         * little-endian machines. */
	        char byte = (char)chr;

	        *chr_mb_len = 1;
	        chr_mb = mallocstrncpy(NULL, &byte, 1);
	    }

	    return chr_mb;
	}

	/* Take one (multibyte) character from the start of buf and return how
	 * many bytes it occupies.  When chr is not NULL, copy those bytes into
	 * it (without adding a terminator).  When col is not NULL, advance it by
	 * the number of columns the character takes up on screen. */
	int parse_mbchar(const char *buf, char *chr, size_t *col)
	{
	    int length;

	    assert(buf != NULL);

	#ifdef ENABLE_UTF8
	    if (use_utf8) {
	        /* Get the number of bytes in the multibyte character. */
	        length = mblen(buf, MB_CUR_MAX);

	        /* Whenever mblen() does not report a positive length, reset
	         * its state and consume just a single byte. */
	        if (length <= 0) {
	            IGNORE_CALL_RESULT(mblen(NULL, 0));
	            length = 1;
	        }

	        /* When requested, store the multibyte character in chr. */
	        if (chr != NULL) {
	            int i;

	            for (i = 0; i < length; i++)
	                chr[i] = buf[i];
	        }

	        /* When requested, add the width of the character to col. */
	        if (col != NULL) {
	            if (*buf == '\t')
	                /* A tab runs up to the next tab stop. */
	                *col += tabsize - *col % tabsize;
	            else if (is_cntrl_mbchar(buf))
	                /* A control character is shown as a caret plus a symbol. */
	                *col += 2;
	            else
	                *col += mbwidth(buf);
	        }
	    } else
	#endif
	    {
	        /* A byte character is one byte long. */
	        length = 1;

	        /* When requested, store the byte character in chr. */
	        if (chr != NULL)
	            *chr = *buf;

	        /* When requested, add the width of the character to col. */
	        if (col != NULL) {
	            if (*buf == '\t')
	                /* A tab runs up to the next tab stop. */
	                *col += tabsize - *col % tabsize;
	            else if (is_cntrl_char((unsigned char)*buf))
	                /* A control character is shown as a caret plus a symbol. */
	                *col += 2;
	            else
	                (*col)++;
	        }
	    }

	    return length;
	}

	/* Return the index in buf where the multibyte character that precedes
	 * position pos begins. */
	size_t move_mbleft(const char *buf, size_t pos)
	{
	    size_t reach, scan, last_len = 0;

	    assert(buf != NULL && pos <= strlen(buf));

	    /* Stepping backward over a multibyte character cannot be done
	     * directly.  So begin as far back as one character can reach (or
	     * at the start of buf) and walk forward until pos is reached. */
	    reach = mb_cur_max();
	    scan = (reach > pos) ? 0 : pos - reach;

	    while (scan < pos) {
	        last_len = parse_mbchar(buf + scan, NULL, NULL);
	        scan += last_len;
	    }

	    /* Undo the last step, to land on the start of that character. */
	    return scan - last_len;
	}

	/* Return the index in buf just past the multibyte character that
	 * starts at position pos. */
	size_t move_mbright(const char *buf, size_t pos)
	{
	    const int char_len = parse_mbchar(buf + pos, NULL, NULL);

	    return pos + char_len;
	}

	#ifndef HAVE_STRCASECMP
	/* Stand-in for strcasecmp(): a case-insensitive comparison with a
	 * length limit of HIGHEST_POSITIVE. */
	int nstrcasecmp(const char *s1, const char *s2)
	{
	    return strncasecmp(s1, s2, HIGHEST_POSITIVE);
	}
	#endif

	/* Multibyte counterpart of strcasecmp(): compare the strings s1 and s2
	 * while ignoring case.  The work is delegated to mbstrncasecmp() with a
	 * length limit of HIGHEST_POSITIVE. */
	int mbstrcasecmp(const char *s1, const char *s2)
	{
	    return mbstrncasecmp(s1, s2, HIGHEST_POSITIVE);
	}

	#ifndef HAVE_STRNCASECMP
	/* Replacement for strncasecmp(): compare at most n bytes of s1 and s2
	 * while ignoring case.  Return zero when they match, and otherwise the
	 * difference between the lowercased bytes at the first mismatch. */
	int nstrncasecmp(const char *s1, const char *s2, size_t n)
	{
	    if (s1 == s2)
	        return 0;

	    /* Test the remaining count first, so that no byte beyond the first
	     * n of either string is ever read. */
	    for (; n > 0 && *s1 != '\0' && *s2 != '\0'; s1++, s2++, n--) {
	        /* The <ctype.h> functions require a value that fits in an
	         * unsigned char, so convert before lowercasing. */
	        if (tolower((unsigned char)*s1) != tolower((unsigned char)*s2))
	            break;
	    }

	    if (n == 0)
	        return 0;

	    return tolower((unsigned char)*s1) - tolower((unsigned char)*s2);
	}
	#endif

	/* Multibyte counterpart of strncasecmp(): compare at most n characters
	 * of s1 and s2 while ignoring case.  In UTF-8 mode every character is
	 * decoded with mbtowc() and compared after towlower(); a sequence that
	 * cannot be decoded is compared by its first byte instead.  Outside
	 * UTF-8 mode the comparison is left to strncasecmp(). */
	int mbstrncasecmp(const char *s1, const char *s2, size_t n)
	{
	#ifdef ENABLE_UTF8
	    if (use_utf8) {
	        wchar_t wc1, wc2;

	        while (*s1 != '\0' && *s2 != '\0' && n > 0) {
	            bool bad1 = FALSE, bad2 = FALSE;

	            if (mbtowc(&wc1, s1, MB_CUR_MAX) < 0) {
	                mbtowc_reset();
	                bad1 = TRUE;
	            }

	            if (mbtowc(&wc2, s2, MB_CUR_MAX) < 0) {
	                mbtowc_reset();
	                bad2 = TRUE;
	            }

	            if (bad1 || bad2) {
	                /* At least one side is undecodable: compare raw bytes. */
	                if (*s1 != *s2)
	                    return (unsigned char)*s1 - (unsigned char)*s2;

	                /* Equal first bytes, but only one side failed to
	                 * decode: the undecodable side counts as greater. */
	                if (bad1 != bad2)
	                    return (bad1 ? 1 : -1);
	            } else {
	                int difference = towlower(wc1) - towlower(wc2);

	                if (difference != 0)
	                    return difference;
	            }

	            s1 += move_mbright(s1, 0);
	            s2 += move_mbright(s2, 0);
	            n--;
	        }

	        /* If the limit was not exhausted, one string ended first (or
	         * both did); the byte difference at that point decides. */
	        return (n > 0) ? ((unsigned char)*s1 - (unsigned char)*s2) : 0;
	    } else
	#endif
	        return strncasecmp(s1, s2, n);
	}

	#ifndef HAVE_STRCASESTR
	/* Replacement for strcasestr(): return a pointer to the first place in
	 * haystack where needle occurs when case is ignored, or NULL when it
	 * does not occur.  An empty needle matches at the start of haystack. */
	char *nstrcasestr(const char *haystack, const char *needle)
	{
	    size_t needle_len;

	    if (*needle == '\0')
	        return (char *)haystack;

	    needle_len = strlen(needle);

	    /* Try every byte position of haystack as a possible match start. */
	    while (*haystack != '\0') {
	        if (strncasecmp(haystack, needle, needle_len) == 0)
	            return (char *)haystack;

	        haystack++;
	    }

	    return NULL;
	}
	#endif

	/* Multibyte counterpart of strcasestr(): return a pointer to the first
	 * place in haystack where needle occurs when case is ignored, or NULL
	 * when it does not occur.  An empty needle matches at the start of
	 * haystack.  Outside UTF-8 mode the search is left to strcasestr(). */
	char *mbstrcasestr(const char *haystack, const char *needle)
	{
	#ifdef ENABLE_UTF8
	    if (use_utf8) {
	        size_t needle_len;

	        if (*needle == '\0')
	            return (char *)haystack;

	        /* The needle's length is counted in characters, as that is
	         * the unit mbstrncasecmp() limits its comparison by. */
	        needle_len = mbstrlen(needle);

	        /* Try each character boundary of haystack as a match start. */
	        while (*haystack != '\0') {
	            if (mbstrncasecmp(haystack, needle, needle_len) == 0)
	                return (char *)haystack;

	            haystack += move_mbright(haystack, 0);
	        }

	        return NULL;
	    } else
	#endif
	        return (char *) strcasestr(haystack, needle);
	}

	/* This function is equivalent to strstr(), except in that it scans the
	 * string in reverse: it returns the last occurrence of needle in
	 * haystack that starts at or before pointer, or NULL when there is
	 * none.  pointer must point into haystack (its terminating NUL
	 * included).  An empty needle matches at pointer itself. */
	char *revstrstr(const char *haystack, const char *needle,
	        const char *pointer)
	{
	    size_t needle_len = strlen(needle);
	    size_t tail_len = strlen(pointer);

	    if (needle_len == 0)
	        return (char *)pointer;

	    if (strlen(haystack) < needle_len)
	        return NULL;

	    /* When fewer bytes than the needle's length remain after pointer,
	     * no match can start there: move back to the last position where
	     * the needle still fits.  The subtraction is written so that it
	     * cannot wrap around. */
	    if (tail_len < needle_len)
	        pointer -= needle_len - tail_len;

	    while (pointer >= haystack) {
	        if (strncmp(pointer, needle, needle_len) == 0)
	            return (char *)pointer;

	        /* Stop at the head, without stepping to before haystack. */
	        if (pointer == haystack)
	            break;

	        pointer--;
	    }

	    return NULL;
	}

	/* This function is equivalent to strcasestr(), except in that it scans
	 * the string in reverse, starting at rev_start: it returns the last
	 * case-insensitive occurrence of needle in haystack that starts at or
	 * before rev_start, or NULL when there is none.  An empty needle
	 * matches at rev_start itself. */
	char *revstrcasestr(const char *haystack, const char *needle, const char
	        *rev_start)
	{
	    size_t rev_start_len, needle_len;

	    if (*needle == '\0')
	        return (char *)rev_start;

	    needle_len = strlen(needle);

	    if (strlen(haystack) < needle_len)
	        return NULL;

	    /* The number of bytes from rev_start to the end of the string; it
	     * grows by one with every step backward. */
	    rev_start_len = strlen(rev_start);

	    for (; rev_start >= haystack; rev_start--, rev_start_len++) {
	        /* Only compare where the needle fits in what remains. */
	        if (rev_start_len >= needle_len && strncasecmp(rev_start,
	                needle, needle_len) == 0)
	            return (char *)rev_start;

	        /* Stop at the head, without stepping to before haystack. */
	        if (rev_start == haystack)
	            break;
	    }

	    return NULL;
	}

	/* This function is equivalent to strcasestr() for multibyte strings,
	 * except in that it scans the string in reverse, starting at rev_start:
	 * it returns the last case-insensitive occurrence of needle in haystack
	 * that starts at or before rev_start, or NULL when there is none.  An
	 * empty needle matches at rev_start itself.  Outside UTF-8 mode the
	 * search is left to revstrcasestr(). */
	char *mbrevstrcasestr(const char *haystack, const char *needle, const
	        char *rev_start)
	{
	#ifdef ENABLE_UTF8
	    if (use_utf8) {
	        size_t rev_start_len, needle_len;

	        if (*needle == '\0')
	            return (char *)rev_start;

	        needle_len = mbstrlen(needle);

	        if (mbstrlen(haystack) < needle_len)
	            return NULL;

	        /* The number of characters from rev_start to the end of the
	         * string; it grows by one with every step backward. */
	        rev_start_len = mbstrlen(rev_start);

	        while (TRUE) {
	            /* Only compare where the needle fits in what remains. */
	            if (rev_start_len >= needle_len &&
	                    mbstrncasecmp(rev_start, needle, needle_len) == 0)
	                return (char *)rev_start;

	            /* If we've reached the head of the haystack, we found nothing. */
	            if (rev_start == haystack)
	                return NULL;

	            rev_start = haystack + move_mbleft(haystack, rev_start - haystack);
	            rev_start_len++;
	        }
	    } else
	#endif
	        return revstrcasestr(haystack, needle, rev_start);
	}

	/* Multibyte counterpart of strlen(): the length of s as computed by
	 * mbstrnlen() when given the largest possible limit. */
	size_t mbstrlen(const char *s)
	{
	    const size_t no_limit = (size_t)-1;

	    return mbstrnlen(s, no_limit);
	}

	#ifndef HAVE_STRNLEN
	/* Replacement for strnlen(): return the number of bytes in s before
	 * the terminating NUL, but at most maxlen.  No more than the first
	 * maxlen bytes of s are examined, so s need not be NUL-terminated when
	 * it holds at least maxlen bytes. */
	size_t nstrnlen(const char *s, size_t maxlen)
	{
	    size_t n = 0;

	    /* Check the limit before reading, so that s[maxlen] is never
	     * touched. */
	    while (n < maxlen && s[n] != '\0')
	        n++;

	    return n;
	}
	#endif

	/* Multibyte counterpart of strnlen(): in UTF-8 mode, count the
	 * characters in s, stopping after maxlen of them; otherwise hand the
	 * job to strnlen(). */
	size_t mbstrnlen(const char *s, size_t maxlen)
	{
	#ifdef ENABLE_UTF8
	    if (use_utf8) {
	        size_t count = 0;

	        /* Hop from character to character until the string or the
	         * allowance runs out. */
	        while (*s != '\0' && maxlen > 0) {
	            s += move_mbright(s, 0);
	            maxlen--;
	            count++;
	        }

	        return count;
	    }
	#endif
	    return strnlen(s, maxlen);
	}

	#if !defined(NANO_TINY) || !defined(DISABLE_JUSTIFY)
	/* This function is equivalent to strchr() for multibyte strings: return
	 * a pointer to the first occurrence in s of the (possibly multibyte)
	 * character that c points to, or NULL when it does not occur.  In UTF-8
	 * mode a sequence that cannot be decoded is represented by its first
	 * byte, and it only matches another undecodable sequence.  Outside
	 * UTF-8 mode the search is left to strchr(). */
	char *mbstrchr(const char *s, const char *c)
	{
	    assert(s != NULL && c != NULL);

	#ifdef ENABLE_UTF8
	    if (use_utf8) {
	        bool bad_c_mb = FALSE;
	        char symbol[MB_CUR_MAX];
	        wchar_t ws, wc;

	        if (mbtowc(&wc, c, MB_CUR_MAX) < 0) {
	            mbtowc_reset();
	            wc = (unsigned char)*c;
	            bad_c_mb = TRUE;
	        }

	        while (*s != '\0') {
	            /* Validity is tracked per character: an undecodable
	             * sequence earlier in s must not affect how the later
	             * characters are compared. */
	            bool bad_s_mb = FALSE;
	            int sym_len = parse_mbchar(s, symbol, NULL);

	            if (mbtowc(&ws, symbol, sym_len) < 0) {
	                mbtowc_reset();
	                ws = (unsigned char)*s;
	                bad_s_mb = TRUE;
	            }

	            if (bad_s_mb == bad_c_mb && ws == wc)
	                return (char *)s;

	            s += sym_len;
	        }

	        return NULL;
	    } else
	#endif
	        return (char *) strchr(s, *c);
	}
	#endif /* !NANO_TINY || !DISABLE_JUSTIFY */

	#ifndef NANO_TINY
	/* This function is equivalent to strpbrk() for multibyte strings:
	 * return a pointer to the first character in s that also occurs in
	 * accept, or NULL when there is no such character.  Outside UTF-8 mode
	 * the search is left to strpbrk(). */
	char *mbstrpbrk(const char *s, const char *accept)
	{
	#ifdef ENABLE_UTF8
	    if (use_utf8) {
	        /* Visit s one character at a time, looking each one up in
	         * accept. */
	        for (; *s != '\0'; s += move_mbright(s, 0)) {
	            if (mbstrchr(accept, s) != NULL)
	                return (char *)s;
	        }

	        return NULL;
	    } else
	#endif
	        return (char *) strpbrk(s, accept);
	}

	/* This function is equivalent to strpbrk(), except in that it scans the
	 * string in reverse, starting at rev_start: it returns a pointer to the
	 * last byte in s at or before rev_start that occurs in accept, or NULL
	 * when there is none.  When rev_start is at the terminating NUL, the
	 * scan begins at the byte before it. */
	char *revstrpbrk(const char *s, const char *accept, const char
	        *rev_start)
	{
	    assert(s != NULL && accept != NULL && rev_start != NULL);

	    /* The terminating NUL itself is never a candidate. */
	    if (*rev_start == '\0') {
	        if (rev_start == s)
	            return NULL;
	        rev_start--;
	    }

	    for (; rev_start >= s; rev_start--) {
	        if (strchr(accept, *rev_start) != NULL)
	            return (char *)rev_start;

	        /* Stop at the head, without stepping to before s. */
	        if (rev_start == s)
	            break;
	    }

	    return NULL;
	}

	/* This function is equivalent to strpbrk() for multibyte strings,
	 * except in that it scans the string in reverse, starting at rev_start:
	 * it returns a pointer to the last character in s at or before
	 * rev_start that occurs in accept, or NULL when there is none.  When
	 * rev_start is at the terminating NUL, the scan begins at the character
	 * before it.  Outside UTF-8 mode the search is left to revstrpbrk(). */
	char *mbrevstrpbrk(const char *s, const char *accept, const char
	        *rev_start)
	{
	    assert(s != NULL && accept != NULL && rev_start != NULL);

	#ifdef ENABLE_UTF8
	    if (use_utf8) {
	        /* The terminating NUL itself is never a candidate. */
	        if (*rev_start == '\0') {
	            if (rev_start == s)
	                return NULL;
	            rev_start = s + move_mbleft(s, rev_start - s);
	        }

	        while (TRUE) {
	            if (mbstrchr(accept, rev_start) != NULL)
	                return (char *)rev_start;

	            /* If we've reached the head of the string, we found nothing. */
	            if (rev_start == s)
	                return NULL;

	            rev_start = s + move_mbleft(s, rev_start - s);
	        }
	    } else
	#endif
	        return revstrpbrk(s, accept, rev_start);
	}
	#endif /* !NANO_TINY */

	#if !defined(DISABLE_NANORC) && (!defined(NANO_TINY) || !defined(DISABLE_JUSTIFY))
	/* Return TRUE if the string s contains one or more blank characters,
	 * and FALSE otherwise. */
	bool has_blank_chars(const char *s)
	{
	    /* isblank() requires a value that fits in an unsigned char, so
	     * convert each byte before testing it. */
	    while (*s != '\0' && !isblank((unsigned char)*s))
	        s++;

	    /* Stopping before the terminating NUL means a blank was found. */
	    return (*s != '\0');
	}

	/* Return TRUE if the multibyte string s contains one or more blank
	 * multibyte characters, and FALSE otherwise.  Outside UTF-8 mode the
	 * check is left to has_blank_chars(). */
	bool has_blank_mbchars(const char *s)
	{
	#ifdef ENABLE_UTF8
	    if (use_utf8) {
	        char symbol[MB_CUR_MAX];

	        /* Extract each character of s in turn and test it. */
	        for (; *s != '\0'; s += move_mbright(s, 0)) {
	            parse_mbchar(s, symbol, NULL);

	            if (is_blank_mbchar(symbol))
	                return TRUE;
	        }

	        return FALSE;
	    } else
	#endif
	        return has_blank_chars(s);
	}
	#endif /* !DISABLE_NANORC && (!NANO_TINY || !DISABLE_JUSTIFY) */

	#ifdef ENABLE_UTF8
	/* Return TRUE if wc is valid Unicode, and FALSE otherwise.  Accepted
	 * are the values from 0 up to 0x10FFFF, minus the range 0xD800-0xDFFF,
	 * the range 0xFDD0-0xFDEF, and every value whose low sixteen bits are
	 * 0xFFFE or 0xFFFF. */
	bool is_valid_unicode(wchar_t wc)
	{
	    const bool in_range = (0 <= wc && wc <= 0x10FFFF);
	    const bool in_d800_block = (0xD800 <= wc && wc <= 0xDFFF);
	    const bool in_fdd0_block = (0xFDD0 <= wc && wc <= 0xFDEF);
	    const bool ends_plane = ((wc & 0xFFFF) >= 0xFFFE);

	    return (in_range && !in_d800_block && !in_fdd0_block && !ends_plane);
	}
	#endif

	#ifndef DISABLE_NANORC
	/* Report whether s is a valid multibyte string.  In UTF-8 mode this is
	 * the case when mbstowcs() can convert all of s; outside UTF-8 mode
	 * every string is accepted. */
	bool is_valid_mbstring(const char *s)
	{
	#ifdef ENABLE_UTF8
	    if (use_utf8) {
	        /* With a NULL destination, mbstowcs() only measures the
	         * string, yielding (size_t)-1 when it cannot be converted. */
	        const size_t converted = mbstowcs(NULL, s, 0);

	        return (converted != (size_t)-1);
	    }
	#endif
	    return TRUE;
	}
	#endif /* !DISABLE_NANORC */