Blame - util/mkutf8data.c - platform_external_e2fsprogs

blob: 49bb0e16669b835093a390d1955b04565619bc20 [file] [log] [blame]

Gabriel Krisman Bertazi	8cbcf15	2018-12-03 18:04:28 -0500	[diff] [blame]	1	/*
				2	* Copyright (c) 2014 SGI.
				3	* All rights reserved.
				4	*
				5	* This program is free software; you can redistribute it and/or
				6	* modify it under the terms of the GNU General Public License as
				7	* published by the Free Software Foundation.
				8	*
				9	* This program is distributed in the hope that it would be useful,
				10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				12	* GNU General Public License for more details.
				13	*
				14	* You should have received a copy of the GNU General Public License
				15	* along with this program; if not, write the Free Software Foundation,
				16	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
				17	*/
				18
				19	/* Generator for a compact trie for unicode normalization */
				20
				21	#include <sys/types.h>
				22	#include <stddef.h>
				23	#include <stdlib.h>
				24	#include <stdio.h>
				25	#include <assert.h>
				26	#include <string.h>
				27	#include <unistd.h>
				28	#include <errno.h>
				29
				30	/* Default names of the in- and output files. */
				31
				/*
				 * Compiled-in default file names.  Each *_name pointer below starts
				 * out pointing at the matching *_NAME default.
				 * NOTE(review): presumably option parsing elsewhere in the file
				 * repoints these -- confirm against main().
				 */
				#define AGE_NAME	"DerivedAge.txt"
				#define CCC_NAME	"DerivedCombiningClass.txt"
				#define PROP_NAME	"DerivedCoreProperties.txt"
				#define DATA_NAME	"UnicodeData.txt"
				#define FOLD_NAME	"CaseFolding.txt"
				#define NORM_NAME	"NormalizationCorrections.txt"
				#define TEST_NAME	"NormalizationTest.txt"
				#define UTF8_NAME	"utf8data.h"

				const char *age_name = AGE_NAME;
				const char *ccc_name = CCC_NAME;
				const char *prop_name = PROP_NAME;
				const char *data_name = DATA_NAME;
				const char *fold_name = FOLD_NAME;
				const char *norm_name = NORM_NAME;
				const char *test_name = TEST_NAME;
				const char *utf8_name = UTF8_NAME;

				/* Verbosity level; the tree passes print progress when it is > 0. */
				int verbose = 0;

				/* An arbitrary line size limit on input lines. */

				#define LINESIZE	1024
				/* Shared scratch buffers, each LINESIZE bytes. */
				char line[LINESIZE];
				char buf0[LINESIZE];
				char buf1[LINESIZE];
				char buf2[LINESIZE];
				char buf3[LINESIZE];

				/* NOTE(review): presumably the program name for diagnostics -- confirm. */
				const char *argv0;
				62
				63	/* ------------------------------------------------------------------ */
				64
				65	/*
				66	* Unicode version numbers consist of three parts: major, minor, and a
				67	* revision. These numbers are packed into an unsigned int to obtain
				68	* a single version number.
				69	*
				70	* To save space in the generated trie, the unicode version is not
				71	* stored directly, instead we calculate a generation number from the
				72	* unicode versions seen in the DerivedAge file, and use that as an
				73	* index into a table of unicode versions.
				74	*/
				/* Bit positions of the major and minor fields in a packed version. */
				#define UNICODE_MAJ_SHIFT		(16)
				#define UNICODE_MIN_SHIFT		(8)

				/* Largest value each field may take (see age_valid()). */
				#define UNICODE_MAJ_MAX			((unsigned short)-1)
				#define UNICODE_MIN_MAX			((unsigned char)-1)
				#define UNICODE_REV_MAX			((unsigned char)-1)

				/*
				 * Pack major/minor/revision into one unsigned int:
				 * major above bit 16, minor in bits 8-15, revision in bits 0-7.
				 */
				#define UNICODE_AGE(MAJ,MIN,REV)			\
					(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |	\
					 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\
					 ((unsigned int)(REV)))

				/* Table of packed unicode versions and its element count. */
				unsigned int *ages;
				int ages_count;

				/* NOTE(review): presumably the newest packed version seen -- confirm. */
				unsigned int unicode_maxage;
				91
				92	static int age_valid(unsigned int major, unsigned int minor,
				93	unsigned int revision)
				94	{
				95	if (major > UNICODE_MAJ_MAX)
				96	return 0;
				97	if (minor > UNICODE_MIN_MAX)
				98	return 0;
				99	if (revision > UNICODE_REV_MAX)
				100	return 0;
				101	return 1;
				102	}
				103
				104	/* ------------------------------------------------------------------ */
				105
				106	/*
				107	* utf8trie_t
				108	*
				109	* A compact binary tree, used to decode UTF-8 characters.
				110	*
				111	* Internal nodes are one byte for the node itself, and up to three
				112	* bytes for an offset into the tree. The first byte contains the
				113	* following information:
				114	* NEXTBYTE - flag - advance to next byte if set
				115	* BITNUM - 3 bit field - the bit number to be tested
				116	* OFFLEN - 2 bit field - number of bytes in the offset
				117	* if offlen == 0 (non-branching node)
				118	* RIGHTPATH - 1 bit field - set if the following node is for the
				119	* right-hand path (tested bit is set)
				120	* TRIENODE - 1 bit field - set if the following node is an internal
				121	* node, otherwise it is a leaf node
				122	* if offlen != 0 (branching node)
				123	* LEFTNODE - 1 bit field - set if the left-hand node is internal
				124	* RIGHTNODE - 1 bit field - set if the right-hand node is internal
				125	*
				126	* Due to the way utf8 works, there cannot be branching nodes with
				127	* NEXTBYTE set, and moreover those nodes always have a righthand
				128	* descendant.
				129	*/
				/* One byte of the emitted trie. */
				typedef unsigned char utf8trie_t;
				/* Fields of the first byte of an internal node. */
				#define BITNUM		0x07
				#define NEXTBYTE	0x08
				#define OFFLEN		0x30
				#define OFFLEN_SHIFT	4
				/* Meaning of the top two bits when offlen == 0 (non-branching node). */
				#define RIGHTPATH	0x40
				#define TRIENODE	0x80
				/* Meaning of the same two bits when offlen != 0 (branching node). */
				#define RIGHTNODE	0x40
				#define LEFTNODE	0x80
				139
				140	/*
				141	* utf8leaf_t
				142	*
				143	* The leaves of the trie are embedded in the trie, and so the same
				144	* underlying datatype, unsigned char.
				145	*
				146	* leaf[0]: The unicode version, stored as a generation number that is
				147	* an index into utf8agetab[]. With this we can filter code
				148	* points based on the unicode version in which they were
				149	* defined. The CCC of a non-defined code point is 0.
				150	* leaf[1]: Canonical Combining Class. During normalization, we need
				151	* to do a stable sort into ascending order of all characters
				152	* with a non-zero CCC that occur between two characters with
				153	* a CCC of 0, or at the begin or end of a string.
				154	* The unicode standard guarantees that all CCC values are
				155	* between 0 and 254 inclusive, which leaves 255 available as
				156	* a special value.
				157	* Code points with CCC 0 are known as stoppers.
				158	* leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
				159	* start of a NUL-terminated string that is the decomposition
				160	* of the character.
				161	* The CCC of a decomposable character is the same as the CCC
				162	* of the first character of its decomposition.
				163	* Some characters decompose as the empty string: these are
				164	* characters with the Default_Ignorable_Code_Point property.
				165	* These do affect normalization, as they all have CCC 0.
				166	*
				167	* The decompositions in the trie have been fully expanded.
				168	*
				169	* Casefolding, if applicable, is also done using decompositions.
				170	*/
				/* One byte of a leaf; leaves live inside the trie byte array. */
				typedef unsigned char utf8leaf_t;

				/* Accessors for a leaf given a pointer to its first byte. */
				#define LEAF_GEN(LEAF)	((LEAF)[0])	/* generation number */
				#define LEAF_CCC(LEAF)	((LEAF)[1])	/* combining class or DECOMPOSE */
				#define LEAF_STR(LEAF)	((const char*)((LEAF) + 2))	/* decomposition */

				/* Largest generation number that fits in the one-byte leaf field. */
				#define MAXGEN		(255)

				/* CCC range, plus the special values stored in the CCC byte. */
				#define MINCCC		(0)
				#define MAXCCC		(254)
				#define STOPPER		(0)
				#define DECOMPOSE	(255)
				#define HANGUL		((char)(255))

				#define UTF8HANGULLEAF	(12)
				186
				187	struct tree;
				188	static utf8leaf_t utf8nlookup(struct tree , unsigned char *,
				189	const char *, size_t);
				190	static utf8leaf_t utf8lookup(struct tree , unsigned char , const char );
				191
				192	unsigned char *utf8data;
				193	size_t utf8data_size;
				194
				195	utf8trie_t *nfkdi;
				196	utf8trie_t *nfkdicf;
				197
				198	/* ------------------------------------------------------------------ */
				199
				200	/*
				201	* UTF8 valid ranges.
				202	*
				203	* The UTF-8 encoding spreads the bits of a 32bit word over several
				204	* bytes. This table gives the ranges that can be held and how they'd
				205	* be represented.
				206	*
				207	* 0x00000000 0x0000007F: 0xxxxxxx
				208	* 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
				209	* 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
				210	* 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
				211	* 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				212	* 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				213	*
				214	* There is an additional requirement on UTF-8, in that only the
				215	* shortest representation of a 32bit value is to be used. A decoder
				216	* must not decode sequences that do not satisfy this requirement.
				217	* Thus the allowed ranges have a lower bound.
				218	*
				219	* 0x00000000 0x0000007F: 0xxxxxxx
				220	* 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
				221	* 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
				222	* 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
				223	* 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				224	* 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
				225	*
				226	* Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
				227	* 17 planes of 65536 values. This limits the sequences actually seen
				228	* even more, to just the following.
				229	*
				230	* 0 - 0x7f: 0 0x7f
				231	* 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf
				232	* 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf
				233	* 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf
				234	*
				235	* Even within those ranges not all values are allowed: the surrogates
				236	* 0xd800 - 0xdfff should never be seen.
				237	*
				238	* Note that the longest sequence seen with valid usage is 4 bytes,
				239	* the same as a single UTF-32 character. This makes the UTF-8
				240	* representation of Unicode strictly smaller than UTF-32.
				241	*
				242	* The shortest sequence requirement was introduced by:
				243	* Corrigendum #1: UTF-8 Shortest Form
				244	* It can be found here:
				245	* http://www.unicode.org/versions/corrigendum1.html
				246	*
				247	*/
				248
				/* Lead-byte tag bits for 2-, 3- and 4-byte sequences, and the continuation tag. */
				#define UTF8_2_BITS	0xC0
				#define UTF8_3_BITS	0xE0
				#define UTF8_4_BITS	0xF0
				#define UTF8_N_BITS	0x80
				/* Masks selecting the tag bits of each byte kind. */
				#define UTF8_2_MASK	0xE0
				#define UTF8_3_MASK	0xF0
				#define UTF8_4_MASK	0xF8
				#define UTF8_N_MASK	0xC0
				/* Payload of a continuation byte: mask and width in bits. */
				#define UTF8_V_MASK	0x3F
				#define UTF8_V_SHIFT	6
				259
				260	static int utf8encode(char *str, unsigned int val)
				261	{
				262	int len;
				263
				264	if (val < 0x80) {
				265	str[0] = val;
				266	len = 1;
				267	} else if (val < 0x800) {
				268	str[1] = val & UTF8_V_MASK;
				269	str[1] \|= UTF8_N_BITS;
				270	val >>= UTF8_V_SHIFT;
				271	str[0] = val;
				272	str[0] \|= UTF8_2_BITS;
				273	len = 2;
				274	} else if (val < 0x10000) {
				275	str[2] = val & UTF8_V_MASK;
				276	str[2] \|= UTF8_N_BITS;
				277	val >>= UTF8_V_SHIFT;
				278	str[1] = val & UTF8_V_MASK;
				279	str[1] \|= UTF8_N_BITS;
				280	val >>= UTF8_V_SHIFT;
				281	str[0] = val;
				282	str[0] \|= UTF8_3_BITS;
				283	len = 3;
				284	} else if (val < 0x110000) {
				285	str[3] = val & UTF8_V_MASK;
				286	str[3] \|= UTF8_N_BITS;
				287	val >>= UTF8_V_SHIFT;
				288	str[2] = val & UTF8_V_MASK;
				289	str[2] \|= UTF8_N_BITS;
				290	val >>= UTF8_V_SHIFT;
				291	str[1] = val & UTF8_V_MASK;
				292	str[1] \|= UTF8_N_BITS;
				293	val >>= UTF8_V_SHIFT;
				294	str[0] = val;
				295	str[0] \|= UTF8_4_BITS;
				296	len = 4;
				297	} else {
				298	printf("%#x: illegal val\n", val);
				299	len = 0;
				300	}
				301	return len;
				302	}
				303
				304	static unsigned int utf8decode(const char *str)
				305	{
				306	const unsigned char s = (const unsigned char)str;
				307	unsigned int unichar = 0;
				308
				309	if (*s < 0x80) {
				310	unichar = *s;
				311	} else if (*s < UTF8_3_BITS) {
				312	unichar = *s++ & 0x1F;
				313	unichar <<= UTF8_V_SHIFT;
				314	unichar \|= *s & 0x3F;
				315	} else if (*s < UTF8_4_BITS) {
				316	unichar = *s++ & 0x0F;
				317	unichar <<= UTF8_V_SHIFT;
				318	unichar \|= *s++ & 0x3F;
				319	unichar <<= UTF8_V_SHIFT;
				320	unichar \|= *s & 0x3F;
				321	} else {
				322	unichar = *s++ & 0x0F;
				323	unichar <<= UTF8_V_SHIFT;
				324	unichar \|= *s++ & 0x3F;
				325	unichar <<= UTF8_V_SHIFT;
				326	unichar \|= *s++ & 0x3F;
				327	unichar <<= UTF8_V_SHIFT;
				328	unichar \|= *s & 0x3F;
				329	}
				330	return unichar;
				331	}
				332
				/*
				 * Return non-zero when unichar lies within the unicode code space,
				 * i.e. below 0x110000.  Surrogates are not excluded by this check.
				 */
				static int utf32valid(unsigned int unichar)
				{
					return unichar < 0x110000;
				}
				337
				/* True when U is in the inclusive range 0xAC00 - 0xD7A3. */
				#define HANGUL_SYLLABLE(U)	((U) >= 0xAC00 && (U) <= 0xD7A3)

				/* Child kinds, stored in tree.childnode and node.leftnode/rightnode. */
				#define NODE 1
				#define LEAF 0
				342
				/*
				 * One trie under construction, together with the callbacks that
				 * know how to handle its leaves.
				 */
				struct tree {
					void *root;		/* struct node or leaf, per childnode */
					int childnode;		/* NODE or LEAF: what root points at */
					const char *type;	/* name printed in progress messages */
					unsigned int maxage;	/* printed in hex next to type */
					struct tree *next;
					/* Leaf callbacks; the tree code never inspects a leaf itself. */
					int (*leaf_equal)(void *, void *);
					void (*leaf_print)(void *, int);
					int (*leaf_mark)(void *);
					int (*leaf_size)(void *);
					/* Returns the slot where a leaf's emitted offset is stored. */
					int *(*leaf_index)(struct tree *, void *);
					unsigned char *(*leaf_emit)(void *, unsigned char *);
					int leafindex[0x110000];	/* one entry per code point */
					int index;		/* offset assigned by index_nodes() */
				};
				358
				/* Internal node of the binary trie built by insert(). */
				struct node {
					int index;		/* offset in the emitted trie */
					int offset;
					int mark;		/* non-zero once selected by a marking pass */
					int size;		/* bytes this node occupies when emitted */
					struct node *parent;	/* NULL for the root */
					void *left;		/* child taken when the tested bit is clear */
					void *right;		/* child taken when the tested bit is set */
					unsigned char bitnum;	/* bit tested here; low 3 bits index the byte */
					unsigned char nextbyte;	/* advance to the next key byte first */
					unsigned char leftnode;	/* NODE or LEAF: kind of left */
					unsigned char rightnode;	/* NODE or LEAF: kind of right */
					/* Choices made along a singleton chain below this node. */
					unsigned int keybits;
					unsigned int keymask;
				};
				374
				375	/*
				376	* Example lookup function for a tree.
				377	*/
				378	static void lookup(struct tree tree, const char *key)
				379	{
				380	struct node *node;
				381	void *leaf = NULL;
				382
				383	node = tree->root;
				384	while (!leaf && node) {
				385	if (node->nextbyte)
				386	key++;
				387	if (*key & (1 << (node->bitnum & 7))) {
				388	/* Right leg */
				389	if (node->rightnode == NODE) {
				390	node = node->right;
				391	} else if (node->rightnode == LEAF) {
				392	leaf = node->right;
				393	} else {
				394	node = NULL;
				395	}
				396	} else {
				397	/* Left leg */
				398	if (node->leftnode == NODE) {
				399	node = node->left;
				400	} else if (node->leftnode == LEAF) {
				401	leaf = node->left;
				402	} else {
				403	node = NULL;
				404	}
				405	}
				406	}
				407
				408	return leaf;
				409	}
				410
				411	/*
				412	* A simple non-recursive tree walker: keep track of visits to the
				413	* left and right branches in the leftmask and rightmask.
				414	*/
				415	static void tree_walk(struct tree *tree)
				416	{
				417	struct node *node;
				418	unsigned int leftmask;
				419	unsigned int rightmask;
				420	unsigned int bitmask;
				421	int indent = 1;
				422	int nodes, singletons, leaves;
				423
				424	nodes = singletons = leaves = 0;
				425
				426	printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
				427	if (tree->childnode == LEAF) {
				428	assert(tree->root);
				429	tree->leaf_print(tree->root, indent);
				430	leaves = 1;
				431	} else {
				432	assert(tree->childnode == NODE);
				433	node = tree->root;
				434	leftmask = rightmask = 0;
				435	while (node) {
				436	printf("%*snode @ %p bitnum %d nextbyte %d"
				437	" left %p right %p mask %x bits %x\n",
				438	indent, "", node,
				439	node->bitnum, node->nextbyte,
				440	node->left, node->right,
				441	node->keymask, node->keybits);
				442	nodes += 1;
				443	if (!(node->left && node->right))
				444	singletons += 1;
				445
				446	while (node) {
				447	bitmask = 1 << node->bitnum;
				448	if ((leftmask & bitmask) == 0) {
				449	leftmask \|= bitmask;
				450	if (node->leftnode == LEAF) {
				451	assert(node->left);
				452	tree->leaf_print(node->left,
				453	indent+1);
				454	leaves += 1;
				455	} else if (node->left) {
				456	assert(node->leftnode == NODE);
				457	indent += 1;
				458	node = node->left;
				459	break;
				460	}
				461	}
				462	if ((rightmask & bitmask) == 0) {
				463	rightmask \|= bitmask;
				464	if (node->rightnode == LEAF) {
				465	assert(node->right);
				466	tree->leaf_print(node->right,
				467	indent+1);
				468	leaves += 1;
				469	} else if (node->right) {
				470	assert(node->rightnode == NODE);
				471	indent += 1;
				472	node = node->right;
				473	break;
				474	}
				475	}
				476	leftmask &= ~bitmask;
				477	rightmask &= ~bitmask;
				478	node = node->parent;
				479	indent -= 1;
				480	}
				481	}
				482	}
				483	printf("nodes %d leaves %d singletons %d\n",
				484	nodes, leaves, singletons);
				485	}
				486
				487	/*
				488	* Allocate and initialize a new internal node.
				489	*/
				490	static struct node alloc_node(struct node parent)
				491	{
				492	struct node *node;
				493	int bitnum;
				494
				495	node = malloc(sizeof(*node));
				496	node->left = node->right = NULL;
				497	node->parent = parent;
				498	node->leftnode = NODE;
				499	node->rightnode = NODE;
				500	node->keybits = 0;
				501	node->keymask = 0;
				502	node->mark = 0;
				503	node->index = 0;
				504	node->offset = -1;
				505	node->size = 4;
				506
				507	if (node->parent) {
				508	bitnum = parent->bitnum;
				509	if ((bitnum & 7) == 0) {
				510	node->bitnum = bitnum + 7 + 8;
				511	node->nextbyte = 1;
				512	} else {
				513	node->bitnum = bitnum - 1;
				514	node->nextbyte = 0;
				515	}
				516	} else {
				517	node->bitnum = 7;
				518	node->nextbyte = 0;
				519	}
				520
				521	return node;
				522	}
				523
				524	/*
				525	* Insert a new leaf into the tree, and collapse any subtrees that are
				526	* fully populated and end in identical leaves. A nextbyte tagged
				527	* internal node will not be removed to preserve the tree's integrity.
				528	* Note that due to the structure of utf8, no nextbyte tagged node
				529	* will be a candidate for removal.
				530	*/
				531	static int insert(struct tree tree, char key, int keylen, void *leaf)
				532	{
				533	struct node *node;
				534	struct node *parent;
				535	void **cursor;
				536	int keybits;
				537
				538	assert(keylen >= 1 && keylen <= 4);
				539
				540	node = NULL;
				541	cursor = &tree->root;
				542	keybits = 8 * keylen;
				543
				544	/* Insert, creating path along the way. */
				545	while (keybits) {
				546	if (!*cursor)
				547	*cursor = alloc_node(node);
				548	node = *cursor;
				549	if (node->nextbyte)
				550	key++;
				551	if (*key & (1 << (node->bitnum & 7)))
				552	cursor = &node->right;
				553	else
				554	cursor = &node->left;
				555	keybits--;
				556	}
				557	*cursor = leaf;
				558
				559	/* Merge subtrees if possible. */
				560	while (node) {
				561	if (*key & (1 << (node->bitnum & 7)))
				562	node->rightnode = LEAF;
				563	else
				564	node->leftnode = LEAF;
				565	if (node->nextbyte)
				566	break;
				567	if (node->leftnode == NODE \|\| node->rightnode == NODE)
				568	break;
				569	assert(node->left);
				570	assert(node->right);
				571	/* Compare */
				572	if (! tree->leaf_equal(node->left, node->right))
				573	break;
				574	/* Keep left, drop right leaf. */
				575	leaf = node->left;
				576	/* Check in parent */
				577	parent = node->parent;
				578	if (!parent) {
				579	/* root of tree! */
				580	tree->root = leaf;
				581	tree->childnode = LEAF;
				582	} else if (parent->left == node) {
				583	parent->left = leaf;
				584	parent->leftnode = LEAF;
				585	if (parent->right) {
				586	parent->keymask = 0;
				587	parent->keybits = 0;
				588	} else {
				589	parent->keymask \|= (1 << node->bitnum);
				590	}
				591	} else if (parent->right == node) {
				592	parent->right = leaf;
				593	parent->rightnode = LEAF;
				594	if (parent->left) {
				595	parent->keymask = 0;
				596	parent->keybits = 0;
				597	} else {
				598	parent->keymask \|= (1 << node->bitnum);
				599	parent->keybits \|= (1 << node->bitnum);
				600	}
				601	} else {
				602	/* internal tree error */
				603	assert(0);
				604	}
				605	free(node);
				606	node = parent;
				607	}
				608
				609	/* Propagate keymasks up along singleton chains. */
				610	while (node) {
				611	parent = node->parent;
				612	if (!parent)
				613	break;
				614	/* Nix the mask for parents with two children. */
				615	if (node->keymask == 0) {
				616	parent->keymask = 0;
				617	parent->keybits = 0;
				618	} else if (parent->left && parent->right) {
				619	parent->keymask = 0;
				620	parent->keybits = 0;
				621	} else {
				622	assert((parent->keymask & node->keymask) == 0);
				623	parent->keymask \|= node->keymask;
				624	parent->keymask \|= (1 << parent->bitnum);
				625	parent->keybits \|= node->keybits;
				626	if (parent->right)
				627	parent->keybits \|= (1 << parent->bitnum);
				628	}
				629	node = parent;
				630	}
				631
				632	return 0;
				633	}
				634
				635	/*
				636	* Prune internal nodes.
				637	*
				638	* Fully populated subtrees that end at the same leaf have already
				639	* been collapsed. There are still internal nodes that have for both
				640	* their left and right branches a sequence of singletons that make
				641	* identical choices and end in identical leaves. The keymask and
				642	* keybits collected in the nodes describe the choices made in these
				643	* singleton chains. When they are identical for the left and right
				644	* branch of a node, and the two leaves compare identical, the node in
				645	* question can be removed.
				646	*
				647	* Note that nodes with the nextbyte tag set will not be removed by
				648	* this to ensure tree integrity. Note as well that the structure of
				649	* utf8 ensures that these nodes would not have been candidates for
				650	* removal in any case.
				651	*/
				652	static void prune(struct tree *tree)
				653	{
				654	struct node *node;
				655	struct node *left;
				656	struct node *right;
				657	struct node *parent;
				658	void *leftleaf;
				659	void *rightleaf;
				660	unsigned int leftmask;
				661	unsigned int rightmask;
				662	unsigned int bitmask;
				663	int count;
				664
				665	if (verbose > 0)
				666	printf("Pruning %s_%x\n", tree->type, tree->maxage);
				667
				668	count = 0;
				669	if (tree->childnode == LEAF)
				670	return;
				671	if (!tree->root)
				672	return;
				673
				674	leftmask = rightmask = 0;
				675	node = tree->root;
				676	while (node) {
				677	if (node->nextbyte)
				678	goto advance;
				679	if (node->leftnode == LEAF)
				680	goto advance;
				681	if (node->rightnode == LEAF)
				682	goto advance;
				683	if (!node->left)
				684	goto advance;
				685	if (!node->right)
				686	goto advance;
				687	left = node->left;
				688	right = node->right;
				689	if (left->keymask == 0)
				690	goto advance;
				691	if (right->keymask == 0)
				692	goto advance;
				693	if (left->keymask != right->keymask)
				694	goto advance;
				695	if (left->keybits != right->keybits)
				696	goto advance;
				697	leftleaf = NULL;
				698	while (!leftleaf) {
				699	assert(left->left \|\| left->right);
				700	if (left->leftnode == LEAF)
				701	leftleaf = left->left;
				702	else if (left->rightnode == LEAF)
				703	leftleaf = left->right;
				704	else if (left->left)
				705	left = left->left;
				706	else if (left->right)
				707	left = left->right;
				708	else
				709	assert(0);
				710	}
				711	rightleaf = NULL;
				712	while (!rightleaf) {
				713	assert(right->left \|\| right->right);
				714	if (right->leftnode == LEAF)
				715	rightleaf = right->left;
				716	else if (right->rightnode == LEAF)
				717	rightleaf = right->right;
				718	else if (right->left)
				719	right = right->left;
				720	else if (right->right)
				721	right = right->right;
				722	else
				723	assert(0);
				724	}
				725	if (! tree->leaf_equal(leftleaf, rightleaf))
				726	goto advance;
				727	/*
				728	* This node has identical singleton-only subtrees.
				729	* Remove it.
				730	*/
				731	parent = node->parent;
				732	left = node->left;
				733	right = node->right;
				734	if (parent->left == node)
				735	parent->left = left;
				736	else if (parent->right == node)
				737	parent->right = left;
				738	else
				739	assert(0);
				740	left->parent = parent;
				741	left->keymask \|= (1 << node->bitnum);
				742	node->left = NULL;
				743	while (node) {
				744	bitmask = 1 << node->bitnum;
				745	leftmask &= ~bitmask;
				746	rightmask &= ~bitmask;
				747	if (node->leftnode == NODE && node->left) {
				748	left = node->left;
				749	free(node);
				750	count++;
				751	node = left;
				752	} else if (node->rightnode == NODE && node->right) {
				753	right = node->right;
				754	free(node);
				755	count++;
				756	node = right;
				757	} else {
				758	node = NULL;
				759	}
				760	}
				761	/* Propagate keymasks up along singleton chains. */
				762	node = parent;
				763	/* Force re-check */
				764	bitmask = 1 << node->bitnum;
				765	leftmask &= ~bitmask;
				766	rightmask &= ~bitmask;
				767	for (;;) {
				768	if (node->left && node->right)
				769	break;
				770	if (node->left) {
				771	left = node->left;
				772	node->keymask \|= left->keymask;
				773	node->keybits \|= left->keybits;
				774	}
				775	if (node->right) {
				776	right = node->right;
				777	node->keymask \|= right->keymask;
				778	node->keybits \|= right->keybits;
				779	}
				780	node->keymask \|= (1 << node->bitnum);
				781	node = node->parent;
				782	/* Force re-check */
				783	bitmask = 1 << node->bitnum;
				784	leftmask &= ~bitmask;
				785	rightmask &= ~bitmask;
				786	}
				787	advance:
				788	bitmask = 1 << node->bitnum;
				789	if ((leftmask & bitmask) == 0 &&
				790	node->leftnode == NODE &&
				791	node->left) {
				792	leftmask \|= bitmask;
				793	node = node->left;
				794	} else if ((rightmask & bitmask) == 0 &&
				795	node->rightnode == NODE &&
				796	node->right) {
				797	rightmask \|= bitmask;
				798	node = node->right;
				799	} else {
				800	leftmask &= ~bitmask;
				801	rightmask &= ~bitmask;
				802	node = node->parent;
				803	}
				804	}
				805	if (verbose > 0)
				806	printf("Pruned %d nodes\n", count);
				807	}
				808
				809	/*
				810	* Mark the nodes in the tree that lead to leaves that must be
				811	* emitted.
				812	*/
				813	static void mark_nodes(struct tree *tree)
				814	{
				815	struct node *node;
				816	struct node *n;
				817	unsigned int leftmask;
				818	unsigned int rightmask;
				819	unsigned int bitmask;
				820	int marked;
				821
				822	marked = 0;
				823	if (verbose > 0)
				824	printf("Marking %s_%x\n", tree->type, tree->maxage);
				825	if (tree->childnode == LEAF)
				826	goto done;
				827
				828	assert(tree->childnode == NODE);
				829	node = tree->root;
				830	leftmask = rightmask = 0;
				831	while (node) {
				832	bitmask = 1 << node->bitnum;
				833	if ((leftmask & bitmask) == 0) {
				834	leftmask \|= bitmask;
				835	if (node->leftnode == LEAF) {
				836	assert(node->left);
				837	if (tree->leaf_mark(node->left)) {
				838	n = node;
				839	while (n && !n->mark) {
				840	marked++;
				841	n->mark = 1;
				842	n = n->parent;
				843	}
				844	}
				845	} else if (node->left) {
				846	assert(node->leftnode == NODE);
				847	node = node->left;
				848	continue;
				849	}
				850	}
				851	if ((rightmask & bitmask) == 0) {
				852	rightmask \|= bitmask;
				853	if (node->rightnode == LEAF) {
				854	assert(node->right);
				855	if (tree->leaf_mark(node->right)) {
				856	n = node;
				857	while (n && !n->mark) {
				858	marked++;
				859	n->mark = 1;
				860	n = n->parent;
				861	}
				862	}
				863	} else if (node->right) {
				864	assert(node->rightnode == NODE);
				865	node = node->right;
				866	continue;
				867	}
				868	}
				869	leftmask &= ~bitmask;
				870	rightmask &= ~bitmask;
				871	node = node->parent;
				872	}
				873
				874	/* second pass: left siblings and singletons */
				875
				876	assert(tree->childnode == NODE);
				877	node = tree->root;
				878	leftmask = rightmask = 0;
				879	while (node) {
				880	bitmask = 1 << node->bitnum;
				881	if ((leftmask & bitmask) == 0) {
				882	leftmask \|= bitmask;
				883	if (node->leftnode == LEAF) {
				884	assert(node->left);
				885	if (tree->leaf_mark(node->left)) {
				886	n = node;
				887	while (n && !n->mark) {
				888	marked++;
				889	n->mark = 1;
				890	n = n->parent;
				891	}
				892	}
				893	} else if (node->left) {
				894	assert(node->leftnode == NODE);
				895	node = node->left;
				896	if (!node->mark && node->parent->mark) {
				897	marked++;
				898	node->mark = 1;
				899	}
				900	continue;
				901	}
				902	}
				903	if ((rightmask & bitmask) == 0) {
				904	rightmask \|= bitmask;
				905	if (node->rightnode == LEAF) {
				906	assert(node->right);
				907	if (tree->leaf_mark(node->right)) {
				908	n = node;
				909	while (n && !n->mark) {
				910	marked++;
				911	n->mark = 1;
				912	n = n->parent;
				913	}
				914	}
				915	} else if (node->right) {
				916	assert(node->rightnode == NODE);
				917	node = node->right;
				918	if (!node->mark && node->parent->mark &&
				919	!node->parent->left) {
				920	marked++;
				921	node->mark = 1;
				922	}
				923	continue;
				924	}
				925	}
				926	leftmask &= ~bitmask;
				927	rightmask &= ~bitmask;
				928	node = node->parent;
				929	}
				930	done:
				931	if (verbose > 0)
				932	printf("Marked %d nodes\n", marked);
				933	}
				934
				935	/*
				936	* Compute the index of each node and leaf, which is the offset in the
				937	* emitted trie. These values must be pre-computed because relative
				938	* offsets between nodes are used to navigate the tree.
				939	*/
				940	static int index_nodes(struct tree *tree, int index)
				941	{
				942	struct node *node;
				943	unsigned int leftmask;
				944	unsigned int rightmask;
				945	unsigned int bitmask;
				946	int count;
				947	int indent;
				948
				949	/* Align to a cache line (or half a cache line?). */
				950	while (index % 64)
				951	index++;
				952	tree->index = index;
				953	indent = 1;
				954	count = 0;
				955
				956	if (verbose > 0)
				957	printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
				958	if (tree->childnode == LEAF) {
				959	index += tree->leaf_size(tree->root);
				960	goto done;
				961	}
				962
				963	assert(tree->childnode == NODE);
				964	node = tree->root;
				965	leftmask = rightmask = 0;
				966	while (node) {
				967	if (!node->mark)
				968	goto skip;
				969	count++;
				970	if (node->index != index)
				971	node->index = index;
				972	index += node->size;
				973	skip:
				974	while (node) {
				975	bitmask = 1 << node->bitnum;
				976	if (node->mark && (leftmask & bitmask) == 0) {
				977	leftmask \|= bitmask;
				978	if (node->leftnode == LEAF) {
				979	assert(node->left);
				980	*tree->leaf_index(tree, node->left) =
				981	index;
				982	index += tree->leaf_size(node->left);
				983	count++;
				984	} else if (node->left) {
				985	assert(node->leftnode == NODE);
				986	indent += 1;
				987	node = node->left;
				988	break;
				989	}
				990	}
				991	if (node->mark && (rightmask & bitmask) == 0) {
				992	rightmask \|= bitmask;
				993	if (node->rightnode == LEAF) {
				994	assert(node->right);
				995	*tree->leaf_index(tree, node->right) = index;
				996	index += tree->leaf_size(node->right);
				997	count++;
				998	} else if (node->right) {
				999	assert(node->rightnode == NODE);
				1000	indent += 1;
				1001	node = node->right;
				1002	break;
				1003	}
				1004	}
				1005	leftmask &= ~bitmask;
				1006	rightmask &= ~bitmask;
				1007	node = node->parent;
				1008	indent -= 1;
				1009	}
				1010	}
				1011	done:
				1012	/* Round up to a multiple of 16 */
				1013	while (index % 16)
				1014	index++;
				1015	if (verbose > 0)
				1016	printf("Final index %d\n", index);
				1017	return index;
				1018	}
				1019
				1020	/*
				1021	* Mark the nodes in a subtree, helper for size_nodes().
				1022	*/
				1023	static int mark_subtree(struct node *node)
				1024	{
				1025	int changed;
				1026
				1027	if (!node \|\| node->mark)
				1028	return 0;
				1029	node->mark = 1;
				1030	node->index = node->parent->index;
				1031	changed = 1;
				1032	if (node->leftnode == NODE)
				1033	changed += mark_subtree(node->left);
				1034	if (node->rightnode == NODE)
				1035	changed += mark_subtree(node->right);
				1036	return changed;
				1037	}
				1038
				1039	/*
				1040	* Compute the size of nodes and leaves. We start by assuming that
				1041	* each node needs to store a three-byte offset. The indexes of the
				1042	* nodes are calculated based on that, and then this function is
				1043	* called to see if the sizes of some nodes can be reduced. This is
				1044	* repeated until no more changes are seen.
				1045	*/
				1046	static int size_nodes(struct tree *tree)
				1047	{
				1048	struct tree *next;
				1049	struct node *node;
				1050	struct node *right;
				1051	struct node *n;
				1052	unsigned int leftmask;
				1053	unsigned int rightmask;
				1054	unsigned int bitmask;
				1055	unsigned int pathbits;
				1056	unsigned int pathmask;
				1057	unsigned int nbit;
				1058	int changed;
				1059	int offset;
				1060	int size;
				1061	int indent;
				1062
				1063	indent = 1;
				1064	changed = 0;
				1065	size = 0;
				1066
				1067	if (verbose > 0)
				1068	printf("Sizing %s_%x\n", tree->type, tree->maxage);
				1069	if (tree->childnode == LEAF)
				1070	goto done;
				1071
				1072	assert(tree->childnode == NODE);
				1073	pathbits = 0;
				1074	pathmask = 0;
				1075	node = tree->root;
				1076	leftmask = rightmask = 0;
				1077	while (node) {
				1078	if (!node->mark)
				1079	goto skip;
				1080	offset = 0;
				1081	if (!node->left \|\| !node->right) {
				1082	size = 1;
				1083	} else {
				1084	if (node->rightnode == NODE) {
				1085	/*
				1086	* If the right node is not marked,
				1087	* look for a corresponding node in
				1088	* the next tree. Such a node need
				1089	* not exist.
				1090	*/
				1091	right = node->right;
				1092	next = tree->next;
				1093	while (!right->mark) {
				1094	assert(next);
				1095	n = next->root;
				1096	while (n->bitnum != node->bitnum) {
				1097	nbit = 1 << n->bitnum;
				1098	if (!(pathmask & nbit))
				1099	break;
				1100	if (pathbits & nbit) {
				1101	if (n->rightnode == LEAF)
				1102	break;
				1103	n = n->right;
				1104	} else {
				1105	if (n->leftnode == LEAF)
				1106	break;
				1107	n = n->left;
				1108	}
				1109	}
				1110	if (n->bitnum != node->bitnum)
				1111	break;
				1112	n = n->right;
				1113	right = n;
				1114	next = next->next;
				1115	}
				1116	/* Make sure the right node is marked. */
				1117	if (!right->mark)
				1118	changed += mark_subtree(right);
				1119	offset = right->index - node->index;
				1120	} else {
				1121	offset = *tree->leaf_index(tree, node->right);
				1122	offset -= node->index;
				1123	}
				1124	assert(offset >= 0);
				1125	assert(offset <= 0xffffff);
				1126	if (offset <= 0xff) {
				1127	size = 2;
				1128	} else if (offset <= 0xffff) {
				1129	size = 3;
				1130	} else { /* offset <= 0xffffff */
				1131	size = 4;
				1132	}
				1133	}
				1134	if (node->size != size \|\| node->offset != offset) {
				1135	node->size = size;
				1136	node->offset = offset;
				1137	changed++;
				1138	}
				1139	skip:
				1140	while (node) {
				1141	bitmask = 1 << node->bitnum;
				1142	pathmask \|= bitmask;
				1143	if (node->mark && (leftmask & bitmask) == 0) {
				1144	leftmask \|= bitmask;
				1145	if (node->leftnode == LEAF) {
				1146	assert(node->left);
				1147	} else if (node->left) {
				1148	assert(node->leftnode == NODE);
				1149	indent += 1;
				1150	node = node->left;
				1151	break;
				1152	}
				1153	}
				1154	if (node->mark && (rightmask & bitmask) == 0) {
				1155	rightmask \|= bitmask;
				1156	pathbits \|= bitmask;
				1157	if (node->rightnode == LEAF) {
				1158	assert(node->right);
				1159	} else if (node->right) {
				1160	assert(node->rightnode == NODE);
				1161	indent += 1;
				1162	node = node->right;
				1163	break;
				1164	}
				1165	}
				1166	leftmask &= ~bitmask;
				1167	rightmask &= ~bitmask;
				1168	pathmask &= ~bitmask;
				1169	pathbits &= ~bitmask;
				1170	node = node->parent;
				1171	indent -= 1;
				1172	}
				1173	}
				1174	done:
				1175	if (verbose > 0)
				1176	printf("Found %d changes\n", changed);
				1177	return changed;
				1178	}
				1179
				1180	/*
				1181	* Emit a trie for the given tree into the data array.
				1182	*/
				1183	static void emit(struct tree tree, unsigned char data)
				1184	{
				1185	struct node *node;
				1186	unsigned int leftmask;
				1187	unsigned int rightmask;
				1188	unsigned int bitmask;
				1189	int offlen;
				1190	int offset;
				1191	int index;
				1192	int indent;
				1193	int size;
				1194	int bytes;
				1195	int leaves;
				1196	int nodes[4];
				1197	unsigned char byte;
				1198
				1199	nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
				1200	leaves = 0;
				1201	bytes = 0;
				1202	index = tree->index;
				1203	data += index;
				1204	indent = 1;
				1205	if (verbose > 0)
				1206	printf("Emitting %s_%x\n", tree->type, tree->maxage);
				1207	if (tree->childnode == LEAF) {
				1208	assert(tree->root);
				1209	tree->leaf_emit(tree->root, data);
				1210	size = tree->leaf_size(tree->root);
				1211	index += size;
				1212	leaves++;
				1213	goto done;
				1214	}
				1215
				1216	assert(tree->childnode == NODE);
				1217	node = tree->root;
				1218	leftmask = rightmask = 0;
				1219	while (node) {
				1220	if (!node->mark)
				1221	goto skip;
				1222	assert(node->offset != -1);
				1223	assert(node->index == index);
				1224
				1225	byte = 0;
				1226	if (node->nextbyte)
				1227	byte \|= NEXTBYTE;
				1228	byte \|= (node->bitnum & BITNUM);
				1229	if (node->left && node->right) {
				1230	if (node->leftnode == NODE)
				1231	byte \|= LEFTNODE;
				1232	if (node->rightnode == NODE)
				1233	byte \|= RIGHTNODE;
				1234	if (node->offset <= 0xff)
				1235	offlen = 1;
				1236	else if (node->offset <= 0xffff)
				1237	offlen = 2;
				1238	else
				1239	offlen = 3;
				1240	nodes[offlen]++;
				1241	offset = node->offset;
				1242	byte \|= offlen << OFFLEN_SHIFT;
				1243	*data++ = byte;
				1244	index++;
				1245	while (offlen--) {
				1246	*data++ = offset & 0xff;
				1247	index++;
				1248	offset >>= 8;
				1249	}
				1250	} else if (node->left) {
				1251	if (node->leftnode == NODE)
				1252	byte \|= TRIENODE;
				1253	nodes[0]++;
				1254	*data++ = byte;
				1255	index++;
				1256	} else if (node->right) {
				1257	byte \|= RIGHTNODE;
				1258	if (node->rightnode == NODE)
				1259	byte \|= TRIENODE;
				1260	nodes[0]++;
				1261	*data++ = byte;
				1262	index++;
				1263	} else {
				1264	assert(0);
				1265	}
				1266	skip:
				1267	while (node) {
				1268	bitmask = 1 << node->bitnum;
				1269	if (node->mark && (leftmask & bitmask) == 0) {
				1270	leftmask \|= bitmask;
				1271	if (node->leftnode == LEAF) {
				1272	assert(node->left);
				1273	data = tree->leaf_emit(node->left,
				1274	data);
				1275	size = tree->leaf_size(node->left);
				1276	index += size;
				1277	bytes += size;
				1278	leaves++;
				1279	} else if (node->left) {
				1280	assert(node->leftnode == NODE);
				1281	indent += 1;
				1282	node = node->left;
				1283	break;
				1284	}
				1285	}
				1286	if (node->mark && (rightmask & bitmask) == 0) {
				1287	rightmask \|= bitmask;
				1288	if (node->rightnode == LEAF) {
				1289	assert(node->right);
				1290	data = tree->leaf_emit(node->right,
				1291	data);
				1292	size = tree->leaf_size(node->right);
				1293	index += size;
				1294	bytes += size;
				1295	leaves++;
				1296	} else if (node->right) {
				1297	assert(node->rightnode == NODE);
				1298	indent += 1;
				1299	node = node->right;
				1300	break;
				1301	}
				1302	}
				1303	leftmask &= ~bitmask;
				1304	rightmask &= ~bitmask;
				1305	node = node->parent;
				1306	indent -= 1;
				1307	}
				1308	}
				1309	done:
				1310	if (verbose > 0) {
				1311	printf("Emitted %d (%d) leaves",
				1312	leaves, bytes);
				1313	printf(" %d (%d+%d+%d+%d) nodes",
				1314	nodes[0] + nodes[1] + nodes[2] + nodes[3],
				1315	nodes[0], nodes[1], nodes[2], nodes[3]);
				1316	printf(" %d total\n", index - tree->index);
				1317	}
				1318	}
				1319
				1320	/* ------------------------------------------------------------------ */
				1321
				1322	/*
				1323	* Unicode data.
				1324	*
				1325	* We need to keep track of the Canonical Combining Class, the Age,
				1326	* and decompositions for a code point.
				1327	*
				1328	* For the Age, we store the index into the ages table. Effectively
				1329	* this is a generation number that the table maps to a unicode
				1330	* version.
				1331	*
				1332	* The correction field is used to indicate that this entry is in the
				1333	* corrections array, which contains decompositions that were
				1334	* corrected in later revisions. The value of the correction field is
				1335	* the Unicode version in which the mapping was corrected.
				1336	*/
				1337	struct unicode_data {
				1338	unsigned int code;
				1339	int ccc;
				1340	int gen;
				1341	int correction;
				1342	unsigned int *utf32nfkdi;
				1343	unsigned int *utf32nfkdicf;
				1344	char *utf8nfkdi;
				1345	char *utf8nfkdicf;
				1346	};
				1347
				1348	struct unicode_data unicode_data[0x110000];
				1349	struct unicode_data *corrections;
				1350	int corrections_count;
				1351
				1352	struct tree *nfkdi_tree;
				1353	struct tree *nfkdicf_tree;
				1354
				1355	struct tree *trees;
				1356	int trees_count;
				1357
				1358	/*
				1359	* Check the corrections array to see if this entry was corrected at
				1360	* some point.
				1361	*/
				1362	static struct unicode_data corrections_lookup(struct unicode_data u)
				1363	{
				1364	int i;
				1365
				1366	for (i = 0; i != corrections_count; i++)
				1367	if (u->code == corrections[i].code)
				1368	return &corrections[i];
				1369	return u;
				1370	}
				1371
				1372	static int nfkdi_equal(void l, void r)
				1373	{
				1374	struct unicode_data *left = l;
				1375	struct unicode_data *right = r;
				1376
				1377	if (left->gen != right->gen)
				1378	return 0;
				1379	if (left->ccc != right->ccc)
				1380	return 0;
				1381	if (left->utf8nfkdi && right->utf8nfkdi &&
				1382	strcmp(left->utf8nfkdi, right->utf8nfkdi) == 0)
				1383	return 1;
				1384	if (left->utf8nfkdi \|\| right->utf8nfkdi)
				1385	return 0;
				1386	return 1;
				1387	}
				1388
				1389	static int nfkdicf_equal(void l, void r)
				1390	{
				1391	struct unicode_data *left = l;
				1392	struct unicode_data *right = r;
				1393
				1394	if (left->gen != right->gen)
				1395	return 0;
				1396	if (left->ccc != right->ccc)
				1397	return 0;
				1398	if (left->utf8nfkdicf && right->utf8nfkdicf &&
				1399	strcmp(left->utf8nfkdicf, right->utf8nfkdicf) == 0)
				1400	return 1;
				1401	if (left->utf8nfkdicf && right->utf8nfkdicf)
				1402	return 0;
				1403	if (left->utf8nfkdicf \|\| right->utf8nfkdicf)
				1404	return 0;
				1405	if (left->utf8nfkdi && right->utf8nfkdi &&
				1406	strcmp(left->utf8nfkdi, right->utf8nfkdi) == 0)
				1407	return 1;
				1408	if (left->utf8nfkdi \|\| right->utf8nfkdi)
				1409	return 0;
				1410	return 1;
				1411	}
				1412
				1413	static void nfkdi_print(void *l, int indent)
				1414	{
				1415	struct unicode_data *leaf = l;
				1416
				1417	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
				1418	leaf->code, leaf->ccc, leaf->gen);
				1419	if (leaf->utf8nfkdi && leaf->utf8nfkdi[0] == HANGUL)
				1420	printf(" nfkdi \"%s\"", "HANGUL SYLLABLE");
				1421	else if (leaf->utf8nfkdi)
				1422	printf(" nfkdi \"%s\"", (const char*)leaf->utf8nfkdi);
				1423	printf("\n");
				1424	}
				1425
				1426	static void nfkdicf_print(void *l, int indent)
				1427	{
				1428	struct unicode_data *leaf = l;
				1429
				1430	printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
				1431	leaf->code, leaf->ccc, leaf->gen);
				1432	if (leaf->utf8nfkdicf)
				1433	printf(" nfkdicf \"%s\"", (const char*)leaf->utf8nfkdicf);
				1434	else if (leaf->utf8nfkdi && leaf->utf8nfkdi[0] == HANGUL)
				1435	printf(" nfkdi \"%s\"", "HANGUL SYLLABLE");
				1436	else if (leaf->utf8nfkdi)
				1437	printf(" nfkdi \"%s\"", (const char*)leaf->utf8nfkdi);
				1438	printf("\n");
				1439	}
				1440
				1441	static int nfkdi_mark(void *l)
				1442	{
				1443	return 1;
				1444	}
				1445
				1446	static int nfkdicf_mark(void *l)
				1447	{
				1448	struct unicode_data *leaf = l;
				1449
				1450	if (leaf->utf8nfkdicf)
				1451	return 1;
				1452	return 0;
				1453	}
				1454
				1455	static int correction_mark(void *l)
				1456	{
				1457	struct unicode_data *leaf = l;
				1458
				1459	return leaf->correction;
				1460	}
				1461
				1462	static int nfkdi_size(void *l)
				1463	{
				1464	struct unicode_data *leaf = l;
				1465
				1466	int size = 2;
				1467	if (HANGUL_SYLLABLE(leaf->code))
				1468	size += 1;
				1469	else if (leaf->utf8nfkdi)
				1470	size += strlen(leaf->utf8nfkdi) + 1;
				1471	return size;
				1472	}
				1473
				1474	static int nfkdicf_size(void *l)
				1475	{
				1476	struct unicode_data *leaf = l;
				1477
				1478	int size = 2;
				1479	if (HANGUL_SYLLABLE(leaf->code))
				1480	size += 1;
				1481	else if (leaf->utf8nfkdicf)
				1482	size += strlen(leaf->utf8nfkdicf) + 1;
				1483	else if (leaf->utf8nfkdi)
				1484	size += strlen(leaf->utf8nfkdi) + 1;
				1485	return size;
				1486	}
				1487
				1488	static int nfkdi_index(struct tree tree, void *l)
				1489	{
				1490	struct unicode_data *leaf = l;
				1491
				1492	return &tree->leafindex[leaf->code];
				1493	}
				1494
				1495	static int nfkdicf_index(struct tree tree, void *l)
				1496	{
				1497	struct unicode_data *leaf = l;
				1498
				1499	return &tree->leafindex[leaf->code];
				1500	}
				1501
				1502	static unsigned char nfkdi_emit(void l, unsigned char *data)
				1503	{
				1504	struct unicode_data *leaf = l;
				1505	unsigned char *s;
				1506
				1507	*data++ = leaf->gen;
				1508	if (HANGUL_SYLLABLE(leaf->code)) {
				1509	*data++ = DECOMPOSE;
				1510	*data++ = HANGUL;
				1511	} else if (leaf->utf8nfkdi) {
				1512	*data++ = DECOMPOSE;
				1513	s = (unsigned char*)leaf->utf8nfkdi;
				1514	while ((data++ = s++) != 0)
				1515	;
				1516	} else {
				1517	*data++ = leaf->ccc;
				1518	}
				1519	return data;
				1520	}
				1521
				1522	static unsigned char nfkdicf_emit(void l, unsigned char *data)
				1523	{
				1524	struct unicode_data *leaf = l;
				1525	unsigned char *s;
				1526
				1527	*data++ = leaf->gen;
				1528	if (HANGUL_SYLLABLE(leaf->code)) {
				1529	*data++ = DECOMPOSE;
				1530	*data++ = HANGUL;
				1531	} else if (leaf->utf8nfkdicf) {
				1532	*data++ = DECOMPOSE;
				1533	s = (unsigned char*)leaf->utf8nfkdicf;
				1534	while ((data++ = s++) != 0)
				1535	;
				1536	} else if (leaf->utf8nfkdi) {
				1537	*data++ = DECOMPOSE;
				1538	s = (unsigned char*)leaf->utf8nfkdi;
				1539	while ((data++ = s++) != 0)
				1540	;
				1541	} else {
				1542	*data++ = leaf->ccc;
				1543	}
				1544	return data;
				1545	}
				1546
				1547	static void utf8_create(struct unicode_data *data)
				1548	{
				1549	char utf[18*4+1];
				1550	char *u;
				1551	unsigned int *um;
				1552	int i;
				1553
				1554	if (data->utf8nfkdi) {
				1555	assert(data->utf8nfkdi[0] == HANGUL);
				1556	return;
				1557	}
				1558
				1559	u = utf;
				1560	um = data->utf32nfkdi;
				1561	if (um) {
				1562	for (i = 0; um[i]; i++)
				1563	u += utf8encode(u, um[i]);
				1564	*u = '\0';
				1565	data->utf8nfkdi = strdup(utf);
				1566	}
				1567	u = utf;
				1568	um = data->utf32nfkdicf;
				1569	if (um) {
				1570	for (i = 0; um[i]; i++)
				1571	u += utf8encode(u, um[i]);
				1572	*u = '\0';
				1573	if (!data->utf8nfkdi \|\| strcmp(data->utf8nfkdi, utf))
				1574	data->utf8nfkdicf = strdup(utf);
				1575	}
				1576	}
				1577
				1578	static void utf8_init(void)
				1579	{
				1580	unsigned int unichar;
				1581	int i;
				1582
				1583	for (unichar = 0; unichar != 0x110000; unichar++)
				1584	utf8_create(&unicode_data[unichar]);
				1585
				1586	for (i = 0; i != corrections_count; i++)
				1587	utf8_create(&corrections[i]);
				1588	}
				1589
				1590	static void trees_init(void)
				1591	{
				1592	struct unicode_data *data;
				1593	unsigned int maxage;
				1594	unsigned int nextage;
				1595	int count;
				1596	int i;
				1597	int j;
				1598
				1599	/* Count the number of different ages. */
				1600	count = 0;
				1601	nextage = (unsigned int)-1;
				1602	do {
				1603	maxage = nextage;
				1604	nextage = 0;
				1605	for (i = 0; i <= corrections_count; i++) {
				1606	data = &corrections[i];
				1607	if (nextage < data->correction &&
				1608	data->correction < maxage)
				1609	nextage = data->correction;
				1610	}
				1611	count++;
				1612	} while (nextage);
				1613
				1614	/* Two trees per age: nfkdi and nfkdicf */
				1615	trees_count = count * 2;
				1616	trees = calloc(trees_count, sizeof(struct tree));
				1617
				1618	/* Assign ages to the trees. */
				1619	count = trees_count;
				1620	nextage = (unsigned int)-1;
				1621	do {
				1622	maxage = nextage;
				1623	trees[--count].maxage = maxage;
				1624	trees[--count].maxage = maxage;
				1625	nextage = 0;
				1626	for (i = 0; i <= corrections_count; i++) {
				1627	data = &corrections[i];
				1628	if (nextage < data->correction &&
				1629	data->correction < maxage)
				1630	nextage = data->correction;
				1631	}
				1632	} while (nextage);
				1633
				1634	/* The ages assigned above are off by one. */
				1635	for (i = 0; i != trees_count; i++) {
				1636	j = 0;
				1637	while (ages[j] < trees[i].maxage)
				1638	j++;
				1639	trees[i].maxage = ages[j-1];
				1640	}
				1641
				1642	/* Set up the forwarding between trees. */
				1643	trees[trees_count-2].next = &trees[trees_count-1];
				1644	trees[trees_count-1].leaf_mark = nfkdi_mark;
				1645	trees[trees_count-2].leaf_mark = nfkdicf_mark;
				1646	for (i = 0; i != trees_count-2; i += 2) {
				1647	trees[i].next = &trees[trees_count-2];
				1648	trees[i].leaf_mark = correction_mark;
				1649	trees[i+1].next = &trees[trees_count-1];
				1650	trees[i+1].leaf_mark = correction_mark;
				1651	}
				1652
				1653	/* Assign the callouts. */
				1654	for (i = 0; i != trees_count; i += 2) {
				1655	trees[i].type = "nfkdicf";
				1656	trees[i].leaf_equal = nfkdicf_equal;
				1657	trees[i].leaf_print = nfkdicf_print;
				1658	trees[i].leaf_size = nfkdicf_size;
				1659	trees[i].leaf_index = nfkdicf_index;
				1660	trees[i].leaf_emit = nfkdicf_emit;
				1661
				1662	trees[i+1].type = "nfkdi";
				1663	trees[i+1].leaf_equal = nfkdi_equal;
				1664	trees[i+1].leaf_print = nfkdi_print;
				1665	trees[i+1].leaf_size = nfkdi_size;
				1666	trees[i+1].leaf_index = nfkdi_index;
				1667	trees[i+1].leaf_emit = nfkdi_emit;
				1668	}
				1669
				1670	/* Finish init. */
				1671	for (i = 0; i != trees_count; i++)
				1672	trees[i].childnode = NODE;
				1673	}
				1674
				1675	static void trees_populate(void)
				1676	{
				1677	struct unicode_data *data;
				1678	unsigned int unichar;
				1679	char keyval[4];
				1680	int keylen;
				1681	int i;
				1682
				1683	for (i = 0; i != trees_count; i++) {
				1684	if (verbose > 0) {
				1685	printf("Populating %s_%x\n",
				1686	trees[i].type, trees[i].maxage);
				1687	}
				1688	for (unichar = 0; unichar != 0x110000; unichar++) {
				1689	if (unicode_data[unichar].gen < 0)
				1690	continue;
				1691	keylen = utf8encode(keyval, unichar);
				1692	data = corrections_lookup(&unicode_data[unichar]);
				1693	if (data->correction <= trees[i].maxage)
				1694	data = &unicode_data[unichar];
				1695	insert(&trees[i], keyval, keylen, data);
				1696	}
				1697	}
				1698	}
				1699
				1700	static void trees_reduce(void)
				1701	{
				1702	int i;
				1703	int size;
				1704	int changed;
				1705
				1706	for (i = 0; i != trees_count; i++)
				1707	prune(&trees[i]);
				1708	for (i = 0; i != trees_count; i++)
				1709	mark_nodes(&trees[i]);
				1710	do {
				1711	size = 0;
				1712	for (i = 0; i != trees_count; i++)
				1713	size = index_nodes(&trees[i], size);
				1714	changed = 0;
				1715	for (i = 0; i != trees_count; i++)
				1716	changed += size_nodes(&trees[i]);
				1717	} while (changed);
				1718
				1719	utf8data = calloc(size, 1);
				1720	utf8data_size = size;
				1721	for (i = 0; i != trees_count; i++)
				1722	emit(&trees[i], utf8data);
				1723
				1724	if (verbose > 0) {
				1725	for (i = 0; i != trees_count; i++) {
				1726	printf("%s_%x idx %d\n",
				1727	trees[i].type, trees[i].maxage, trees[i].index);
				1728	}
				1729	}
				1730
				1731	nfkdi = utf8data + trees[trees_count-1].index;
				1732	nfkdicf = utf8data + trees[trees_count-2].index;
				1733
				1734	nfkdi_tree = &trees[trees_count-1];
				1735	nfkdicf_tree = &trees[trees_count-2];
				1736	}
				1737
				1738	static void verify(struct tree *tree)
				1739	{
				1740	struct unicode_data *data;
				1741	utf8leaf_t *leaf;
				1742	unsigned int unichar;
				1743	char key[4];
				1744	unsigned char hangul[UTF8HANGULLEAF];
				1745	int report;
				1746	int nocf;
				1747
				1748	if (verbose > 0)
				1749	printf("Verifying %s_%x\n", tree->type, tree->maxage);
				1750	nocf = strcmp(tree->type, "nfkdicf");
				1751
				1752	for (unichar = 0; unichar != 0x110000; unichar++) {
				1753	report = 0;
				1754	data = corrections_lookup(&unicode_data[unichar]);
				1755	if (data->correction <= tree->maxage)
				1756	data = &unicode_data[unichar];
				1757	utf8encode(key,unichar);
				1758	leaf = utf8lookup(tree, hangul, key);
				1759
				1760	if (!leaf) {
				1761	if (data->gen != -1)
				1762	report++;
				1763	if (unichar < 0xd800 \|\| unichar > 0xdfff)
				1764	report++;
				1765	} else {
				1766	if (unichar >= 0xd800 && unichar <= 0xdfff)
				1767	report++;
				1768	if (data->gen == -1)
				1769	report++;
				1770	if (data->gen != LEAF_GEN(leaf))
				1771	report++;
				1772	if (LEAF_CCC(leaf) == DECOMPOSE) {
				1773	if (HANGUL_SYLLABLE(data->code)) {
				1774	if (data->utf8nfkdi[0] != HANGUL)
				1775	report++;
				1776	} else if (nocf) {
				1777	if (!data->utf8nfkdi) {
				1778	report++;
				1779	} else if (strcmp(data->utf8nfkdi,
				1780	LEAF_STR(leaf))) {
				1781	report++;
				1782	}
				1783	} else {
				1784	if (!data->utf8nfkdicf &&
				1785	!data->utf8nfkdi) {
				1786	report++;
				1787	} else if (data->utf8nfkdicf) {
				1788	if (strcmp(data->utf8nfkdicf,
				1789	LEAF_STR(leaf)))
				1790	report++;
				1791	} else if (strcmp(data->utf8nfkdi,
				1792	LEAF_STR(leaf))) {
				1793	report++;
				1794	}
				1795	}
				1796	} else if (data->ccc != LEAF_CCC(leaf)) {
				1797	report++;
				1798	}
				1799	}
				1800	if (report) {
				1801	printf("%X code %X gen %d ccc %d"
				1802	" nfkdi -> \"%s\"",
				1803	unichar, data->code, data->gen,
				1804	data->ccc,
				1805	data->utf8nfkdi);
				1806	if (leaf) {
				1807	printf(" gen %d ccc %d"
				1808	" nfkdi -> \"%s\"",
				1809	LEAF_GEN(leaf),
				1810	LEAF_CCC(leaf),
				1811	LEAF_CCC(leaf) == DECOMPOSE ?
				1812	LEAF_STR(leaf) : "");
				1813	}
				1814	printf("\n");
				1815	}
				1816	}
				1817	}
				1818
				1819	static void trees_verify(void)
				1820	{
				1821	int i;
				1822
				1823	for (i = 0; i != trees_count; i++)
				1824	verify(&trees[i]);
				1825	}
				1826
				1827	/* ------------------------------------------------------------------ */
				1828
				1829	static void help(void)
				1830	{
				1831	printf("Usage: %s [options]\n", argv0);
				1832	printf("\n");
				1833	printf("This program creates an a data trie used for parsing and\n");
				1834	printf("normalization of UTF-8 strings. The trie is derived from\n");
				1835	printf("a set of input files from the Unicode character database\n");
				1836	printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
				1837	printf("\n");
				1838	printf("The generated tree supports two normalization forms:\n");
				1839	printf("\n");
				1840	printf("\tnfkdi:\n");
				1841	printf("\t- Apply unicode normalization form NFKD.\n");
				1842	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
				1843	printf("\n");
				1844	printf("\tnfkdicf:\n");
				1845	printf("\t- Apply unicode normalization form NFKD.\n");
				1846	printf("\t- Remove any Default_Ignorable_Code_Point.\n");
				1847	printf("\t- Apply a full casefold (C + F).\n");
				1848	printf("\n");
				1849	printf("These forms were chosen as being most useful when dealing\n");
				1850	printf("with file names: NFKD catches most cases where characters\n");
				1851	printf("should be considered equivalent. The ignorables are mostly\n");
				1852	printf("invisible, making names hard to type.\n");
				1853	printf("\n");
				1854	printf("The options to specify the files to be used are listed\n");
				1855	printf("below with their default values, which are the names used\n");
				1856	printf("by version 11.0.0 of the Unicode Character Database.\n");
				1857	printf("\n");
				1858	printf("The input files:\n");
				1859	printf("\t-a %s\n", AGE_NAME);
				1860	printf("\t-c %s\n", CCC_NAME);
				1861	printf("\t-p %s\n", PROP_NAME);
				1862	printf("\t-d %s\n", DATA_NAME);
				1863	printf("\t-f %s\n", FOLD_NAME);
				1864	printf("\t-n %s\n", NORM_NAME);
				1865	printf("\n");
				1866	printf("Additionally, the generated tables are tested using:\n");
				1867	printf("\t-t %s\n", TEST_NAME);
				1868	printf("\n");
				1869	printf("Finally, the output file:\n");
				1870	printf("\t-o %s\n", UTF8_NAME);
				1871	printf("\n");
				1872	}
				1873
				1874	static void usage(void)
				1875	{
				1876	help();
				1877	exit(1);
				1878	}
				1879
				1880	static void open_fail(const char *name, int error)
				1881	{
				1882	printf("Error %d opening %s: %s\n", error, name, strerror(error));
				1883	exit(1);
				1884	}
				1885
				1886	static void file_fail(const char *filename)
				1887	{
				1888	printf("Error parsing %s\n", filename);
				1889	exit(1);
				1890	}
				1891
				1892	static void line_fail(const char filename, const char line)
				1893	{
				1894	printf("Error parsing %s:%s\n", filename, line);
				1895	exit(1);
				1896	}
				1897
				1898	/* ------------------------------------------------------------------ */
				1899
				1900	static void print_utf32(unsigned int *utf32str)
				1901	{
				1902	int i;
				1903
				1904	for (i = 0; utf32str[i]; i++)
				1905	printf(" %X", utf32str[i]);
				1906	}
				1907
				1908	static void print_utf32nfkdi(unsigned int unichar)
				1909	{
				1910	printf(" %X ->", unichar);
				1911	print_utf32(unicode_data[unichar].utf32nfkdi);
				1912	printf("\n");
				1913	}
				1914
				1915	static void print_utf32nfkdicf(unsigned int unichar)
				1916	{
				1917	printf(" %X ->", unichar);
				1918	print_utf32(unicode_data[unichar].utf32nfkdicf);
				1919	printf("\n");
				1920	}
				1921
				1922	/* ------------------------------------------------------------------ */
				1923
				1924	static void age_init(void)
				1925	{
				1926	FILE *file;
				1927	unsigned int first;
				1928	unsigned int last;
				1929	unsigned int unichar;
				1930	unsigned int major;
				1931	unsigned int minor;
				1932	unsigned int revision;
				1933	int gen;
				1934	int count;
				1935	int ret;
				1936
				1937	if (verbose > 0)
				1938	printf("Parsing %s\n", age_name);
				1939
				1940	file = fopen(age_name, "r");
				1941	if (!file)
				1942	open_fail(age_name, errno);
				1943	count = 0;
				1944
				1945	gen = 0;
				1946	while (fgets(line, LINESIZE, file)) {
				1947	ret = sscanf(line, "# Age=V%d_%d_%d",
				1948	&major, &minor, &revision);
				1949	if (ret == 3) {
				1950	ages_count++;
				1951	if (verbose > 1)
				1952	printf(" Age V%d_%d_%d\n",
				1953	major, minor, revision);
				1954	if (!age_valid(major, minor, revision))
				1955	line_fail(age_name, line);
				1956	continue;
				1957	}
				1958	ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
				1959	if (ret == 2) {
				1960	ages_count++;
				1961	if (verbose > 1)
				1962	printf(" Age V%d_%d\n", major, minor);
				1963	if (!age_valid(major, minor, 0))
				1964	line_fail(age_name, line);
				1965	continue;
				1966	}
				1967	}
				1968
				1969	/* We must have found something above. */
				1970	if (verbose > 1)
				1971	printf("%d age entries\n", ages_count);
				1972	if (ages_count == 0 \|\| ages_count > MAXGEN)
				1973	file_fail(age_name);
				1974
				1975	/* There is a 0 entry. */
				1976	ages_count++;
				1977	ages = calloc(ages_count + 1, sizeof(*ages));
				1978	/* And a guard entry. */
				1979	ages[ages_count] = (unsigned int)-1;
				1980
				1981	rewind(file);
				1982	count = 0;
				1983	gen = 0;
				1984	while (fgets(line, LINESIZE, file)) {
				1985	ret = sscanf(line, "# Age=V%d_%d_%d",
				1986	&major, &minor, &revision);
				1987	if (ret == 3) {
				1988	ages[++gen] =
				1989	UNICODE_AGE(major, minor, revision);
				1990	if (verbose > 1)
				1991	printf(" Age V%d_%d_%d = gen %d\n",
				1992	major, minor, revision, gen);
				1993	if (!age_valid(major, minor, revision))
				1994	line_fail(age_name, line);
				1995	continue;
				1996	}
				1997	ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
				1998	if (ret == 2) {
				1999	ages[++gen] = UNICODE_AGE(major, minor, 0);
				2000	if (verbose > 1)
				2001	printf(" Age V%d_%d = %d\n",
				2002	major, minor, gen);
				2003	if (!age_valid(major, minor, 0))
				2004	line_fail(age_name, line);
				2005	continue;
				2006	}
				2007	ret = sscanf(line, "%X..%X ; %d.%d #",
				2008	&first, &last, &major, &minor);
				2009	if (ret == 4) {
				2010	for (unichar = first; unichar <= last; unichar++)
				2011	unicode_data[unichar].gen = gen;
				2012	count += 1 + last - first;
				2013	if (verbose > 1)
				2014	printf(" %X..%X gen %d\n", first, last, gen);
				2015	if (!utf32valid(first) \|\| !utf32valid(last))
				2016	line_fail(age_name, line);
				2017	continue;
				2018	}
				2019	ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
				2020	if (ret == 3) {
				2021	unicode_data[unichar].gen = gen;
				2022	count++;
				2023	if (verbose > 1)
				2024	printf(" %X gen %d\n", unichar, gen);
				2025	if (!utf32valid(unichar))
				2026	line_fail(age_name, line);
				2027	continue;
				2028	}
				2029	}
				2030	unicode_maxage = ages[gen];
				2031	fclose(file);
				2032
				2033	/* Nix surrogate block */
				2034	if (verbose > 1)
				2035	printf(" Removing surrogate block D800..DFFF\n");
				2036	for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
				2037	unicode_data[unichar].gen = -1;
				2038
				2039	if (verbose > 0)
				2040	printf("Found %d entries\n", count);
				2041	if (count == 0)
				2042	file_fail(age_name);
				2043	}
				2044
				2045	static void ccc_init(void)
				2046	{
				2047	FILE *file;
				2048	unsigned int first;
				2049	unsigned int last;
				2050	unsigned int unichar;
				2051	unsigned int value;
				2052	int count;
				2053	int ret;
				2054
				2055	if (verbose > 0)
				2056	printf("Parsing %s\n", ccc_name);
				2057
				2058	file = fopen(ccc_name, "r");
				2059	if (!file)
				2060	open_fail(ccc_name, errno);
				2061
				2062	count = 0;
				2063	while (fgets(line, LINESIZE, file)) {
				2064	ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
				2065	if (ret == 3) {
				2066	for (unichar = first; unichar <= last; unichar++) {
				2067	unicode_data[unichar].ccc = value;
				2068	count++;
				2069	}
				2070	if (verbose > 1)
				2071	printf(" %X..%X ccc %d\n", first, last, value);
				2072	if (!utf32valid(first) \|\| !utf32valid(last))
				2073	line_fail(ccc_name, line);
				2074	continue;
				2075	}
				2076	ret = sscanf(line, "%X ; %d #", &unichar, &value);
				2077	if (ret == 2) {
				2078	unicode_data[unichar].ccc = value;
				2079	count++;
				2080	if (verbose > 1)
				2081	printf(" %X ccc %d\n", unichar, value);
				2082	if (!utf32valid(unichar))
				2083	line_fail(ccc_name, line);
				2084	continue;
				2085	}
				2086	}
				2087	fclose(file);
				2088
				2089	if (verbose > 0)
				2090	printf("Found %d entries\n", count);
				2091	if (count == 0)
				2092	file_fail(ccc_name);
				2093	}
				2094
				2095	static void nfkdi_init(void)
				2096	{
				2097	FILE *file;
				2098	unsigned int unichar;
				2099	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
				2100	char *s;
				2101	unsigned int *um;
				2102	int count;
				2103	int i;
				2104	int ret;
				2105
				2106	if (verbose > 0)
				2107	printf("Parsing %s\n", data_name);
				2108	file = fopen(data_name, "r");
				2109	if (!file)
				2110	open_fail(data_name, errno);
				2111
				2112	count = 0;
				2113	while (fgets(line, LINESIZE, file)) {
				2114	ret = sscanf(line, "%X;%[^;];%[^;];%[^;];%[^;];%[^;];",
				2115	&unichar, buf0);
				2116	if (ret != 2)
				2117	continue;
				2118	if (!utf32valid(unichar))
				2119	line_fail(data_name, line);
				2120
				2121	s = buf0;
				2122	/* skip over <tag> */
				2123	if (*s == '<')
				2124	while (*s++ != ' ')
				2125	;
				2126	/* decode the decomposition into UTF-32 */
				2127	i = 0;
				2128	while (*s) {
				2129	mapping[i] = strtoul(s, &s, 16);
				2130	if (!utf32valid(mapping[i]))
				2131	line_fail(data_name, line);
				2132	i++;
				2133	}
				2134	mapping[i++] = 0;
				2135
				2136	um = malloc(i * sizeof(unsigned int));
				2137	memcpy(um, mapping, i * sizeof(unsigned int));
				2138	unicode_data[unichar].utf32nfkdi = um;
				2139
				2140	if (verbose > 1)
				2141	print_utf32nfkdi(unichar);
				2142	count++;
				2143	}
				2144	fclose(file);
				2145	if (verbose > 0)
				2146	printf("Found %d entries\n", count);
				2147	if (count == 0)
				2148	file_fail(data_name);
				2149	}
				2150
				2151	static void nfkdicf_init(void)
				2152	{
				2153	FILE *file;
				2154	unsigned int unichar;
				2155	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
				2156	char status;
				2157	char *s;
				2158	unsigned int *um;
				2159	int i;
				2160	int count;
				2161	int ret;
				2162
				2163	if (verbose > 0)
				2164	printf("Parsing %s\n", fold_name);
				2165	file = fopen(fold_name, "r");
				2166	if (!file)
				2167	open_fail(fold_name, errno);
				2168
				2169	count = 0;
				2170	while (fgets(line, LINESIZE, file)) {
				2171	ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
				2172	if (ret != 3)
				2173	continue;
				2174	if (!utf32valid(unichar))
				2175	line_fail(fold_name, line);
				2176	/* Use the C+F casefold. */
				2177	if (status != 'C' && status != 'F')
				2178	continue;
				2179	s = buf0;
				2180	if (*s == '<')
				2181	while (*s++ != ' ')
				2182	;
				2183	i = 0;
				2184	while (*s) {
				2185	mapping[i] = strtoul(s, &s, 16);
				2186	if (!utf32valid(mapping[i]))
				2187	line_fail(fold_name, line);
				2188	i++;
				2189	}
				2190	mapping[i++] = 0;
				2191
				2192	um = malloc(i * sizeof(unsigned int));
				2193	memcpy(um, mapping, i * sizeof(unsigned int));
				2194	unicode_data[unichar].utf32nfkdicf = um;
				2195
				2196	if (verbose > 1)
				2197	print_utf32nfkdicf(unichar);
				2198	count++;
				2199	}
				2200	fclose(file);
				2201	if (verbose > 0)
				2202	printf("Found %d entries\n", count);
				2203	if (count == 0)
				2204	file_fail(fold_name);
				2205	}
				2206
				2207	static void ignore_init(void)
				2208	{
				2209	FILE *file;
				2210	unsigned int unichar;
				2211	unsigned int first;
				2212	unsigned int last;
				2213	unsigned int *um;
				2214	int count;
				2215	int ret;
				2216
				2217	if (verbose > 0)
				2218	printf("Parsing %s\n", prop_name);
				2219	file = fopen(prop_name, "r");
				2220	if (!file)
				2221	open_fail(prop_name, errno);
				2222	assert(file);
				2223	count = 0;
				2224	while (fgets(line, LINESIZE, file)) {
				2225	ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
				2226	if (ret == 3) {
				2227	if (strcmp(buf0, "Default_Ignorable_Code_Point"))
				2228	continue;
				2229	if (!utf32valid(first) \|\| !utf32valid(last))
				2230	line_fail(prop_name, line);
				2231	for (unichar = first; unichar <= last; unichar++) {
				2232	free(unicode_data[unichar].utf32nfkdi);
				2233	um = malloc(sizeof(unsigned int));
				2234	*um = 0;
				2235	unicode_data[unichar].utf32nfkdi = um;
				2236	free(unicode_data[unichar].utf32nfkdicf);
				2237	um = malloc(sizeof(unsigned int));
				2238	*um = 0;
				2239	unicode_data[unichar].utf32nfkdicf = um;
				2240	count++;
				2241	}
				2242	if (verbose > 1)
				2243	printf(" %X..%X Default_Ignorable_Code_Point\n",
				2244	first, last);
				2245	continue;
				2246	}
				2247	ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
				2248	if (ret == 2) {
				2249	if (strcmp(buf0, "Default_Ignorable_Code_Point"))
				2250	continue;
				2251	if (!utf32valid(unichar))
				2252	line_fail(prop_name, line);
				2253	free(unicode_data[unichar].utf32nfkdi);
				2254	um = malloc(sizeof(unsigned int));
				2255	*um = 0;
				2256	unicode_data[unichar].utf32nfkdi = um;
				2257	free(unicode_data[unichar].utf32nfkdicf);
				2258	um = malloc(sizeof(unsigned int));
				2259	*um = 0;
				2260	unicode_data[unichar].utf32nfkdicf = um;
				2261	if (verbose > 1)
				2262	printf(" %X Default_Ignorable_Code_Point\n",
				2263	unichar);
				2264	count++;
				2265	continue;
				2266	}
				2267	}
				2268	fclose(file);
				2269
				2270	if (verbose > 0)
				2271	printf("Found %d entries\n", count);
				2272	if (count == 0)
				2273	file_fail(prop_name);
				2274	}
				2275
				2276	static void corrections_init(void)
				2277	{
				2278	FILE *file;
				2279	unsigned int unichar;
				2280	unsigned int major;
				2281	unsigned int minor;
				2282	unsigned int revision;
				2283	unsigned int age;
				2284	unsigned int *um;
				2285	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
				2286	char *s;
				2287	int i;
				2288	int count;
				2289	int ret;
				2290
				2291	if (verbose > 0)
				2292	printf("Parsing %s\n", norm_name);
				2293	file = fopen(norm_name, "r");
				2294	if (!file)
				2295	open_fail(norm_name, errno);
				2296
				2297	count = 0;
				2298	while (fgets(line, LINESIZE, file)) {
				2299	ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
				2300	&unichar, buf0, buf1,
				2301	&major, &minor, &revision);
				2302	if (ret != 6)
				2303	continue;
				2304	if (!utf32valid(unichar) \|\| !age_valid(major, minor, revision))
				2305	line_fail(norm_name, line);
				2306	count++;
				2307	}
				2308	corrections = calloc(count, sizeof(struct unicode_data));
				2309	corrections_count = count;
				2310	rewind(file);
				2311
				2312	count = 0;
				2313	while (fgets(line, LINESIZE, file)) {
				2314	ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
				2315	&unichar, buf0, buf1,
				2316	&major, &minor, &revision);
				2317	if (ret != 6)
				2318	continue;
				2319	if (!utf32valid(unichar) \|\| !age_valid(major, minor, revision))
				2320	line_fail(norm_name, line);
				2321	corrections[count] = unicode_data[unichar];
				2322	assert(corrections[count].code == unichar);
				2323	age = UNICODE_AGE(major, minor, revision);
				2324	corrections[count].correction = age;
				2325
				2326	i = 0;
				2327	s = buf0;
				2328	while (*s) {
				2329	mapping[i] = strtoul(s, &s, 16);
				2330	if (!utf32valid(mapping[i]))
				2331	line_fail(norm_name, line);
				2332	i++;
				2333	}
				2334	mapping[i++] = 0;
				2335
				2336	um = malloc(i * sizeof(unsigned int));
				2337	memcpy(um, mapping, i * sizeof(unsigned int));
				2338	corrections[count].utf32nfkdi = um;
				2339
				2340	if (verbose > 1)
				2341	printf(" %X -> %s -> %s V%d_%d_%d\n",
				2342	unichar, buf0, buf1, major, minor, revision);
				2343	count++;
				2344	}
				2345	fclose(file);
				2346
				2347	if (verbose > 0)
				2348	printf("Found %d entries\n", count);
				2349	if (count == 0)
				2350	file_fail(norm_name);
				2351	}
				2352
				2353	/* ------------------------------------------------------------------ */
				2354
				2355	/*
				2356	* Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
				2357	*
				2358	* AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
				2359	* D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
				2360	*
				2361	* SBase = 0xAC00
				2362	* LBase = 0x1100
				2363	* VBase = 0x1161
				2364	* TBase = 0x11A7
				2365	* LCount = 19
				2366	* VCount = 21
				2367	* TCount = 28
				2368	* NCount = 588 (VCount * TCount)
				2369	* SCount = 11172 (LCount * NCount)
				2370	*
				2371	* Decomposition:
				2372	* SIndex = s - SBase
				2373	*
				2374	* LV (Canonical/Full)
				2375	* LIndex = SIndex / NCount
				2376	* VIndex = (SIndex % NCount) / TCount
				2377	* LPart = LBase + LIndex
				2378	* VPart = VBase + VIndex
				2379	*
				2380	* LVT (Canonical)
				2381	* LVIndex = (SIndex / TCount) * TCount
				2382	* TIndex = (SIndex % TCount)
				2383	* LVPart = SBase + LVIndex
				2384	* TPart = TBase + TIndex
				2385	*
				2386	* LVT (Full)
				2387	* LIndex = SIndex / NCount
				2388	* VIndex = (SIndex % NCount) / TCount
				2389	* TIndex = (SIndex % TCount)
				2390	* LPart = LBase + LIndex
				2391	* VPart = VBase + VIndex
				2392	* if (TIndex == 0) {
				2393	* d = <LPart, VPart>
				2394	* } else {
				2395	* TPart = TBase + TIndex
				2396	* d = <LPart, VPart, TPart>
				2397	* }
				2398	*
				2399	*/
				2400
				2401	static void hangul_decompose(void)
				2402	{
				2403	unsigned int sb = 0xAC00;
				2404	unsigned int lb = 0x1100;
				2405	unsigned int vb = 0x1161;
				2406	unsigned int tb = 0x11a7;
				2407	/* unsigned int lc = 19; */
				2408	unsigned int vc = 21;
				2409	unsigned int tc = 28;
				2410	unsigned int nc = (vc * tc);
				2411	/* unsigned int sc = (lc * nc); */
				2412	unsigned int unichar;
				2413	unsigned int mapping[4];
				2414	unsigned int *um;
				2415	int count;
				2416	int i;
				2417
				2418	if (verbose > 0)
				2419	printf("Decomposing hangul\n");
				2420	/* Hangul */
				2421	count = 0;
				2422	for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
				2423	unsigned int si = unichar - sb;
				2424	unsigned int li = si / nc;
				2425	unsigned int vi = (si % nc) / tc;
				2426	unsigned int ti = si % tc;
				2427
				2428	i = 0;
				2429	mapping[i++] = lb + li;
				2430	mapping[i++] = vb + vi;
				2431	if (ti)
				2432	mapping[i++] = tb + ti;
				2433	mapping[i++] = 0;
				2434
				2435	assert(!unicode_data[unichar].utf32nfkdi);
				2436	um = malloc(i * sizeof(unsigned int));
				2437	memcpy(um, mapping, i * sizeof(unsigned int));
				2438	unicode_data[unichar].utf32nfkdi = um;
				2439
				2440	assert(!unicode_data[unichar].utf32nfkdicf);
				2441	um = malloc(i * sizeof(unsigned int));
				2442	memcpy(um, mapping, i * sizeof(unsigned int));
				2443	unicode_data[unichar].utf32nfkdicf = um;
				2444
				2445	/*
				2446	* Add a cookie as a reminder that the hangul syllable
				2447	* decompositions must not be stored in the generated
				2448	* trie.
				2449	*/
				2450	unicode_data[unichar].utf8nfkdi = malloc(2);
				2451	unicode_data[unichar].utf8nfkdi[0] = HANGUL;
				2452	unicode_data[unichar].utf8nfkdi[1] = '\0';
				2453
				2454	if (verbose > 1)
				2455	print_utf32nfkdi(unichar);
				2456
				2457	count++;
				2458	}
				2459	if (verbose > 0)
				2460	printf("Created %d entries\n", count);
				2461	}
				2462
				2463	static void nfkdi_decompose(void)
				2464	{
				2465	unsigned int unichar;
				2466	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
				2467	unsigned int *um;
				2468	unsigned int *dc;
				2469	int count;
				2470	int i;
				2471	int j;
				2472	int ret;
				2473
				2474	if (verbose > 0)
				2475	printf("Decomposing nfkdi\n");
				2476
				2477	count = 0;
				2478	for (unichar = 0; unichar != 0x110000; unichar++) {
				2479	if (!unicode_data[unichar].utf32nfkdi)
				2480	continue;
				2481	for (;;) {
				2482	ret = 1;
				2483	i = 0;
				2484	um = unicode_data[unichar].utf32nfkdi;
				2485	while (*um) {
				2486	dc = unicode_data[*um].utf32nfkdi;
				2487	if (dc) {
				2488	for (j = 0; dc[j]; j++)
				2489	mapping[i++] = dc[j];
				2490	ret = 0;
				2491	} else {
				2492	mapping[i++] = *um;
				2493	}
				2494	um++;
				2495	}
				2496	mapping[i++] = 0;
				2497	if (ret)
				2498	break;
				2499	free(unicode_data[unichar].utf32nfkdi);
				2500	um = malloc(i * sizeof(unsigned int));
				2501	memcpy(um, mapping, i * sizeof(unsigned int));
				2502	unicode_data[unichar].utf32nfkdi = um;
				2503	}
				2504	/* Add this decomposition to nfkdicf if there is no entry. */
				2505	if (!unicode_data[unichar].utf32nfkdicf) {
				2506	um = malloc(i * sizeof(unsigned int));
				2507	memcpy(um, mapping, i * sizeof(unsigned int));
				2508	unicode_data[unichar].utf32nfkdicf = um;
				2509	}
				2510	if (verbose > 1)
				2511	print_utf32nfkdi(unichar);
				2512	count++;
				2513	}
				2514	if (verbose > 0)
				2515	printf("Processed %d entries\n", count);
				2516	}
				2517
				2518	static void nfkdicf_decompose(void)
				2519	{
				2520	unsigned int unichar;
				2521	unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
				2522	unsigned int *um;
				2523	unsigned int *dc;
				2524	int count;
				2525	int i;
				2526	int j;
				2527	int ret;
				2528
				2529	if (verbose > 0)
				2530	printf("Decomposing nfkdicf\n");
				2531	count = 0;
				2532	for (unichar = 0; unichar != 0x110000; unichar++) {
				2533	if (!unicode_data[unichar].utf32nfkdicf)
				2534	continue;
				2535	for (;;) {
				2536	ret = 1;
				2537	i = 0;
				2538	um = unicode_data[unichar].utf32nfkdicf;
				2539	while (*um) {
				2540	dc = unicode_data[*um].utf32nfkdicf;
				2541	if (dc) {
				2542	for (j = 0; dc[j]; j++)
				2543	mapping[i++] = dc[j];
				2544	ret = 0;
				2545	} else {
				2546	mapping[i++] = *um;
				2547	}
				2548	um++;
				2549	}
				2550	mapping[i++] = 0;
				2551	if (ret)
				2552	break;
				2553	free(unicode_data[unichar].utf32nfkdicf);
				2554	um = malloc(i * sizeof(unsigned int));
				2555	memcpy(um, mapping, i * sizeof(unsigned int));
				2556	unicode_data[unichar].utf32nfkdicf = um;
				2557	}
				2558	if (verbose > 1)
				2559	print_utf32nfkdicf(unichar);
				2560	count++;
				2561	}
				2562	if (verbose > 0)
				2563	printf("Processed %d entries\n", count);
				2564	}
				2565
				2566	/* ------------------------------------------------------------------ */
				2567
				/*
				 * Forward declarations of the normalization API defined below.
				 * Trees, strings and cursors are all passed by pointer.
				 */
				int utf8agemax(struct tree *tree, const char *s);
				int utf8nagemax(struct tree *tree, const char *s, size_t len);
				int utf8agemin(struct tree *tree, const char *s);
				int utf8nagemin(struct tree *tree, const char *s, size_t len);
				ssize_t utf8len(struct tree *tree, const char *s);
				ssize_t utf8nlen(struct tree *tree, const char *s, size_t len);
				struct utf8cursor;
				int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s);
				int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
						size_t len);
				int utf8byte(struct utf8cursor *u8c);
				2578
				2579	/*
				2580	* Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
				2581	*
				2582	* AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
				2583	* D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
				2584	*
				2585	* SBase = 0xAC00
				2586	* LBase = 0x1100
				2587	* VBase = 0x1161
				2588	* TBase = 0x11A7
				2589	* LCount = 19
				2590	* VCount = 21
				2591	* TCount = 28
				2592	* NCount = 588 (VCount * TCount)
				2593	* SCount = 11172 (LCount * NCount)
				2594	*
				2595	* Decomposition:
				2596	* SIndex = s - SBase
				2597	*
				2598	* LV (Canonical/Full)
				2599	* LIndex = SIndex / NCount
				2600	* VIndex = (SIndex % NCount) / TCount
				2601	* LPart = LBase + LIndex
				2602	* VPart = VBase + VIndex
				2603	*
				2604	* LVT (Canonical)
				2605	* LVIndex = (SIndex / TCount) * TCount
				2606	* TIndex = (SIndex % TCount)
				2607	* LVPart = SBase + LVIndex
				2608	* TPart = TBase + TIndex
				2609	*
				2610	* LVT (Full)
				2611	* LIndex = SIndex / NCount
				2612	* VIndex = (SIndex % NCount) / TCount
				2613	* TIndex = (SIndex % TCount)
				2614	* LPart = LBase + LIndex
				2615	* VPart = VBase + VIndex
				2616	* if (TIndex == 0) {
				2617	* d = <LPart, VPart>
				2618	* } else {
				2619	* TPart = TBase + TIndex
				2620	* d = <LPart, VPart, TPart>
				2621	* }
				2622	*/
				2623
				/* Constants of the hangul syllable arithmetic: bases and counts. */
				enum {
					SB = 0xAC00,	/* first syllable */
					LB = 0x1100,	/* base of the leading parts */
					VB = 0x1161,	/* base of the vowel parts */
					TB = 0x11A7,	/* base of the trailing parts */
					LC = 19,	/* number of leading parts */
					VC = 21,	/* number of vowel parts */
					TC = 28,	/* number of trailing parts */
					NC = VC * TC,	/* 588 */
					SC = LC * NC	/* 11172 */
				};
				2634
				2635	/* Algorithmic decomposition of hangul syllable. */
				2636	static utf8leaf_t utf8hangul(const char str, unsigned char *hangul)
				2637	{
				2638	unsigned int si;
				2639	unsigned int li;
				2640	unsigned int vi;
				2641	unsigned int ti;
				2642	unsigned char *h;
				2643
				2644	/* Calculate the SI, LI, VI, and TI values. */
				2645	si = utf8decode(str) - SB;
				2646	li = si / NC;
				2647	vi = (si % NC) / TC;
				2648	ti = si % TC;
				2649
				2650	/* Fill in base of leaf. */
				2651	h = hangul;
				2652	LEAF_GEN(h) = 2;
				2653	LEAF_CCC(h) = DECOMPOSE;
				2654	h += 2;
				2655
				2656	/* Add LPart, a 3-byte UTF-8 sequence. */
				2657	h += utf8encode((char *)h, li + LB);
				2658
				2659	/* Add VPart, a 3-byte UTF-8 sequence. */
				2660	h += utf8encode((char *)h, vi + VB);
				2661
				2662	/* Add TPart if required, also a 3-byte UTF-8 sequence. */
				2663	if (ti)
				2664	h += utf8encode((char *)h, ti + TB);
				2665
				2666	/* Terminate string. */
				2667	h[0] = '\0';
				2668
				2669	return hangul;
				2670	}
				2671
				2672	/*
				2673	* Use trie to scan s, touching at most len bytes.
				2674	* Returns the leaf if one exists, NULL otherwise.
				2675	*
				2676	* A non-NULL return guarantees that the UTF-8 sequence starting at s
				2677	* is well-formed and corresponds to a known unicode code point. The
				2678	* shorthand for this will be "is valid UTF-8 unicode".
				2679	*/
				2680	static utf8leaf_t utf8nlookup(struct tree tree, unsigned char *hangul,
				2681	const char *s, size_t len)
				2682	{
				2683	utf8trie_t *trie = utf8data + tree->index;
				2684	int offlen;
				2685	int offset;
				2686	int mask;
				2687	int node;
				2688
				2689	if (!tree)
				2690	return NULL;
				2691	if (len == 0)
				2692	return NULL;
				2693	node = 1;
				2694	while (node) {
				2695	offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
				2696	if (*trie & NEXTBYTE) {
				2697	if (--len == 0)
				2698	return NULL;
				2699	s++;
				2700	}
				2701	mask = 1 << (*trie & BITNUM);
				2702	if (*s & mask) {
				2703	/* Right leg */
				2704	if (offlen) {
				2705	/* Right node at offset of trie */
				2706	node = (*trie & RIGHTNODE);
				2707	offset = trie[offlen];
				2708	while (--offlen) {
				2709	offset <<= 8;
				2710	offset \|= trie[offlen];
				2711	}
				2712	trie += offset;
				2713	} else if (*trie & RIGHTPATH) {
				2714	/* Right node after this node */
				2715	node = (*trie & TRIENODE);
				2716	trie++;
				2717	} else {
				2718	/* No right node. */
				2719	return NULL;
				2720	}
				2721	} else {
				2722	/* Left leg */
				2723	if (offlen) {
				2724	/* Left node after this node. */
				2725	node = (*trie & LEFTNODE);
				2726	trie += offlen + 1;
				2727	} else if (*trie & RIGHTPATH) {
				2728	/* No left node. */
				2729	return NULL;
				2730	} else {
				2731	/* Left node after this node */
				2732	node = (*trie & TRIENODE);
				2733	trie++;
				2734	}
				2735	}
				2736	}
				2737	/*
				2738	* Hangul decomposition is done algorithmically. These are the
				2739	* codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
				2740	* always 3 bytes long, so s has been advanced twice, and the
				2741	* start of the sequence is at s-2.
				2742	*/
				2743	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
				2744	trie = utf8hangul(s - 2, hangul);
				2745	return trie;
				2746	}
				2747
				2748	/*
				2749	* Use trie to scan s.
				2750	* Returns the leaf if one exists, NULL otherwise.
				2751	*
				2752	* Forwards to trie_nlookup().
				2753	*/
				2754	static utf8leaf_t utf8lookup(struct tree tree, unsigned char *hangul,
				2755	const char *s)
				2756	{
				2757	return utf8nlookup(tree, hangul, s, (size_t)-1);
				2758	}
				2759
				/*
				 * Byte length of the UTF-8 sequence that starts at s, derived from
				 * the lead byte alone.  The caller must pass a pointer to the first
				 * byte of a valid sequence.
				 */
				static inline int utf8clen(const char *s)
				{
					const unsigned char lead = *s;
					int nbytes = 1;

					if (lead >= 0xC0)
						nbytes++;
					if (lead >= 0xE0)
						nbytes++;
					if (lead >= 0xF0)
						nbytes++;
					return nbytes;
				}
				2770
				2771	/*
				2772	* Maximum age of any character in s.
				2773	* Return -1 if s is not valid UTF-8 unicode.
				2774	* Return 0 if only non-assigned code points are used.
				2775	*/
				2776	int utf8agemax(struct tree tree, const char s)
				2777	{
				2778	utf8leaf_t *leaf;
				2779	int age = 0;
				2780	int leaf_age;
				2781	unsigned char hangul[UTF8HANGULLEAF];
				2782
				2783	if (!tree)
				2784	return -1;
				2785
				2786	while (*s) {
				2787	leaf = utf8lookup(tree, hangul, s);
				2788	if (!leaf)
				2789	return -1;
				2790	leaf_age = ages[LEAF_GEN(leaf)];
				2791	if (leaf_age <= tree->maxage && leaf_age > age)
				2792	age = leaf_age;
				2793	s += utf8clen(s);
				2794	}
				2795	return age;
				2796	}
				2797
				2798	/*
				2799	* Minimum age of any character in s.
				2800	* Return -1 if s is not valid UTF-8 unicode.
				2801	* Return 0 if non-assigned code points are used.
				2802	*/
				2803	int utf8agemin(struct tree tree, const char s)
				2804	{
				2805	utf8leaf_t *leaf;
				2806	int age;
				2807	int leaf_age;
				2808	unsigned char hangul[UTF8HANGULLEAF];
				2809
				2810	if (!tree)
				2811	return -1;
				2812	age = tree->maxage;
				2813	while (*s) {
				2814	leaf = utf8lookup(tree, hangul, s);
				2815	if (!leaf)
				2816	return -1;
				2817	leaf_age = ages[LEAF_GEN(leaf)];
				2818	if (leaf_age <= tree->maxage && leaf_age < age)
				2819	age = leaf_age;
				2820	s += utf8clen(s);
				2821	}
				2822	return age;
				2823	}
				2824
				2825	/*
				2826	* Maximum age of any character in s, touch at most len bytes.
				2827	* Return -1 if s is not valid UTF-8 unicode.
				2828	*/
				2829	int utf8nagemax(struct tree tree, const char s, size_t len)
				2830	{
				2831	utf8leaf_t *leaf;
				2832	int age = 0;
				2833	int leaf_age;
				2834	unsigned char hangul[UTF8HANGULLEAF];
				2835
				2836	if (!tree)
				2837	return -1;
				2838
				2839	while (len && *s) {
				2840	leaf = utf8nlookup(tree, hangul, s, len);
				2841	if (!leaf)
				2842	return -1;
				2843	leaf_age = ages[LEAF_GEN(leaf)];
				2844	if (leaf_age <= tree->maxage && leaf_age > age)
				2845	age = leaf_age;
				2846	len -= utf8clen(s);
				2847	s += utf8clen(s);
				2848	}
				2849	return age;
				2850	}
				2851
				2852	/*
				2853	* Maximum age of any character in s, touch at most len bytes.
				2854	* Return -1 if s is not valid UTF-8 unicode.
				2855	*/
				2856	int utf8nagemin(struct tree tree, const char s, size_t len)
				2857	{
				2858	utf8leaf_t *leaf;
				2859	int leaf_age;
				2860	int age;
				2861	unsigned char hangul[UTF8HANGULLEAF];
				2862
				2863	if (!tree)
				2864	return -1;
				2865	age = tree->maxage;
				2866	while (len && *s) {
				2867	leaf = utf8nlookup(tree, hangul, s, len);
				2868	if (!leaf)
				2869	return -1;
				2870	leaf_age = ages[LEAF_GEN(leaf)];
				2871	if (leaf_age <= tree->maxage && leaf_age < age)
				2872	age = leaf_age;
				2873	len -= utf8clen(s);
				2874	s += utf8clen(s);
				2875	}
				2876	return age;
				2877	}
				2878
				2879	/*
				2880	* Length of the normalization of s.
				2881	* Return -1 if s is not valid UTF-8 unicode.
				2882	*
				2883	* A string of Default_Ignorable_Code_Point has length 0.
				2884	*/
				2885	ssize_t utf8len(struct tree tree, const char s)
				2886	{
				2887	utf8leaf_t *leaf;
				2888	size_t ret = 0;
				2889	unsigned char hangul[UTF8HANGULLEAF];
				2890
				2891	if (!tree)
				2892	return -1;
				2893	while (*s) {
				2894	leaf = utf8lookup(tree, hangul, s);
				2895	if (!leaf)
				2896	return -1;
				2897	if (ages[LEAF_GEN(leaf)] > tree->maxage)
				2898	ret += utf8clen(s);
				2899	else if (LEAF_CCC(leaf) == DECOMPOSE)
				2900	ret += strlen(LEAF_STR(leaf));
				2901	else
				2902	ret += utf8clen(s);
				2903	s += utf8clen(s);
				2904	}
				2905	return ret;
				2906	}
				2907
				2908	/*
				2909	* Length of the normalization of s, touch at most len bytes.
				2910	* Return -1 if s is not valid UTF-8 unicode.
				2911	*/
				2912	ssize_t utf8nlen(struct tree tree, const char s, size_t len)
				2913	{
				2914	utf8leaf_t *leaf;
				2915	size_t ret = 0;
				2916	unsigned char hangul[UTF8HANGULLEAF];
				2917
				2918	if (!tree)
				2919	return -1;
				2920	while (len && *s) {
				2921	leaf = utf8nlookup(tree, hangul, s, len);
				2922	if (!leaf)
				2923	return -1;
				2924	if (ages[LEAF_GEN(leaf)] > tree->maxage)
				2925	ret += utf8clen(s);
				2926	else if (LEAF_CCC(leaf) == DECOMPOSE)
				2927	ret += strlen(LEAF_STR(leaf));
				2928	else
				2929	ret += utf8clen(s);
				2930	len -= utf8clen(s);
				2931	s += utf8clen(s);
				2932	}
				2933	return ret;
				2934	}
				2935
				2936	/*
				2937	* Cursor structure used by the normalizer.
				2938	*/
				2939	struct utf8cursor {
				2940	struct tree *tree;
				2941	const char *s;
				2942	const char *p;
				2943	const char *ss;
				2944	const char *sp;
				2945	unsigned int len;
				2946	unsigned int slen;
				2947	short int ccc;
				2948	short int nccc;
				2949	unsigned int unichar;
				2950	unsigned char hangul[UTF8HANGULLEAF];
				2951	};
				2952
				2953	/*
				2954	* Set up an utf8cursor for use by utf8byte().
				2955	*
				2956	* s : string.
				2957	* len : length of s.
				2958	* u8c : pointer to cursor.
				2959	* trie : utf8trie_t to use for normalization.
				2960	*
				2961	* Returns -1 on error, 0 on success.
				2962	*/
				2963	int utf8ncursor(struct utf8cursor u8c, struct tree tree, const char *s,
				2964	size_t len)
				2965	{
				2966	if (!tree)
				2967	return -1;
				2968	if (!s)
				2969	return -1;
				2970	u8c->tree = tree;
				2971	u8c->s = s;
				2972	u8c->p = NULL;
				2973	u8c->ss = NULL;
				2974	u8c->sp = NULL;
				2975	u8c->len = len;
				2976	u8c->slen = 0;
				2977	u8c->ccc = STOPPER;
				2978	u8c->nccc = STOPPER;
				2979	u8c->unichar = 0;
				2980	/* Check we didn't clobber the maximum length. */
				2981	if (u8c->len != len)
				2982	return -1;
				2983	/* The first byte of s may not be an utf8 continuation. */
				2984	if (len > 0 && (*s & 0xC0) == 0x80)
				2985	return -1;
				2986	return 0;
				2987	}
				2988
				/*
				 * Set up an utf8cursor for use by utf8byte().
				 *
				 * u8c  : pointer to cursor.
				 * tree : tree to use for normalization.
				 * s    : NUL-terminated string.
				 *
				 * Forwards to utf8ncursor() with the largest length the cursor can
				 * hold.  Returns -1 on error, 0 on success.
				 */
				int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
				{
					return utf8ncursor(u8c, tree, s, (unsigned int)-1);
				}
				3002
				3003	/*
				3004	* Get one byte from the normalized form of the string described by u8c.
				3005	*
				3006	* Returns the byte cast to an unsigned char on succes, and -1 on failure.
				3007	*
				3008	* The cursor keeps track of the location in the string in u8c->s.
				3009	* When a character is decomposed, the current location is stored in
				3010	* u8c->p, and u8c->s is set to the start of the decomposition. Note
				3011	* that bytes from a decomposition do not count against u8c->len.
				3012	*
				3013	* Characters are emitted if they match the current CCC in u8c->ccc.
				3014	* Hitting end-of-string while u8c->ccc == STOPPER means we're done,
				3015	* and the function returns 0 in that case.
				3016	*
				3017	* Sorting by CCC is done by repeatedly scanning the string. The
				3018	* values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
				3019	* the start of the scan. The first pass finds the lowest CCC to be
				3020	* emitted and stores it in u8c->nccc, the second pass emits the
				3021	* characters with this CCC and finds the next lowest CCC. This limits
				3022	* the number of passes to 1 + the number of different CCCs in the
				3023	* sequence being scanned.
				3024	*
				3025	* Therefore:
				3026	* u8c->p != NULL -> a decomposition is being scanned.
				3027	* u8c->ss != NULL -> this is a repeating scan.
				3028	* u8c->ccc == -1 -> this is the first scan of a repeating scan.
				3029	*/
				3030	int utf8byte(struct utf8cursor *u8c)
				3031	{
				3032	utf8leaf_t *leaf;
				3033	int ccc;
				3034
				3035	for (;;) {
				3036	/* Check for the end of a decomposed character. */
				3037	if (u8c->p && *u8c->s == '\0') {
				3038	u8c->s = u8c->p;
				3039	u8c->p = NULL;
				3040	}
				3041
				3042	/* Check for end-of-string. */
				3043	if (!u8c->p && (u8c->len == 0 \|\| *u8c->s == '\0')) {
				3044	/* There is no next byte. */
				3045	if (u8c->ccc == STOPPER)
				3046	return 0;
				3047	/* End-of-string during a scan counts as a stopper. */
				3048	ccc = STOPPER;
				3049	goto ccc_mismatch;
				3050	} else if ((*u8c->s & 0xC0) == 0x80) {
				3051	/* This is a continuation of the current character. */
				3052	if (!u8c->p)
				3053	u8c->len--;
				3054	return (unsigned char)*u8c->s++;
				3055	}
				3056
				3057	/* Look up the data for the current character. */
				3058	if (u8c->p) {
				3059	leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
				3060	} else {
				3061	leaf = utf8nlookup(u8c->tree, u8c->hangul,
				3062	u8c->s, u8c->len);
				3063	}
				3064
				3065	/* No leaf found implies that the input is a binary blob. */
				3066	if (!leaf)
				3067	return -1;
				3068
				3069	/* Characters that are too new have CCC 0. */
				3070	if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
				3071	ccc = STOPPER;
				3072	} else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
				3073	u8c->len -= utf8clen(u8c->s);
				3074	u8c->p = u8c->s + utf8clen(u8c->s);
				3075	u8c->s = LEAF_STR(leaf);
				3076	/* Empty decomposition implies CCC 0. */
				3077	if (*u8c->s == '\0') {
				3078	if (u8c->ccc == STOPPER)
				3079	continue;
				3080	ccc = STOPPER;
				3081	goto ccc_mismatch;
				3082	}
				3083	leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
				3084	ccc = LEAF_CCC(leaf);
				3085	}
				3086	u8c->unichar = utf8decode(u8c->s);
				3087
				3088	/*
				3089	* If this is not a stopper, then see if it updates
				3090	* the next canonical class to be emitted.
				3091	*/
				3092	if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
				3093	u8c->nccc = ccc;
				3094
				3095	/*
				3096	* Return the current byte if this is the current
				3097	* combining class.
				3098	*/
				3099	if (ccc == u8c->ccc) {
				3100	if (!u8c->p)
				3101	u8c->len--;
				3102	return (unsigned char)*u8c->s++;
				3103	}
				3104
				3105	/* Current combining class mismatch. */
				3106	ccc_mismatch:
				3107	if (u8c->nccc == STOPPER) {
				3108	/*
				3109	* Scan forward for the first canonical class
				3110	* to be emitted. Save the position from
				3111	* which to restart.
				3112	*/
				3113	assert(u8c->ccc == STOPPER);
				3114	u8c->ccc = MINCCC - 1;
				3115	u8c->nccc = ccc;
				3116	u8c->sp = u8c->p;
				3117	u8c->ss = u8c->s;
				3118	u8c->slen = u8c->len;
				3119	if (!u8c->p)
				3120	u8c->len -= utf8clen(u8c->s);
				3121	u8c->s += utf8clen(u8c->s);
				3122	} else if (ccc != STOPPER) {
				3123	/* Not a stopper, and not the ccc we're emitting. */
				3124	if (!u8c->p)
				3125	u8c->len -= utf8clen(u8c->s);
				3126	u8c->s += utf8clen(u8c->s);
				3127	} else if (u8c->nccc != MAXCCC + 1) {
				3128	/* At a stopper, restart for next ccc. */
				3129	u8c->ccc = u8c->nccc;
				3130	u8c->nccc = MAXCCC + 1;
				3131	u8c->s = u8c->ss;
				3132	u8c->p = u8c->sp;
				3133	u8c->len = u8c->slen;
				3134	} else {
				3135	/* All done, proceed from here. */
				3136	u8c->ccc = STOPPER;
				3137	u8c->nccc = STOPPER;
				3138	u8c->sp = NULL;
				3139	u8c->ss = NULL;
				3140	u8c->slen = 0;
				3141	}
				3142	}
				3143	}
				3144
				3145	/* ------------------------------------------------------------------ */
				3146
				3147	static int normalize_line(struct tree *tree)
				3148	{
				3149	char *s;
				3150	char *t;
				3151	int c;
				3152	struct utf8cursor u8c;
				3153
				3154	/* First test: null-terminated string. */
				3155	s = buf2;
				3156	t = buf3;
				3157	if (utf8cursor(&u8c, tree, s))
				3158	return -1;
				3159	while ((c = utf8byte(&u8c)) > 0)
				3160	if (c != (unsigned char)*t++)
				3161	return -1;
				3162	if (c < 0)
				3163	return -1;
				3164	if (*t != 0)
				3165	return -1;
				3166
				3167	/* Second test: length-limited string. */
				3168	s = buf2;
				3169	/* Replace NUL with a value that will cause an error if seen. */
				3170	s[strlen(s) + 1] = -1;
				3171	t = buf3;
				3172	if (utf8cursor(&u8c, tree, s))
				3173	return -1;
				3174	while ((c = utf8byte(&u8c)) > 0)
				3175	if (c != (unsigned char)*t++)
				3176	return -1;
				3177	if (c < 0)
				3178	return -1;
				3179	if (*t != 0)
				3180	return -1;
				3181
				3182	return 0;
				3183	}
				3184
				3185	static void normalization_test(void)
				3186	{
				3187	FILE *file;
				3188	unsigned int unichar;
				3189	struct unicode_data *data;
				3190	char *s;
				3191	char *t;
				3192	int ret;
				3193	int ignorables;
				3194	int tests = 0;
				3195	int failures = 0;
				3196
				3197	if (verbose > 0)
				3198	printf("Parsing %s\n", test_name);
				3199	/* Step one, read data from file. */
				3200	file = fopen(test_name, "r");
				3201	if (!file)
				3202	open_fail(test_name, errno);
				3203
				3204	while (fgets(line, LINESIZE, file)) {
				3205	ret = sscanf(line, "%[^;];%[^;];%[^;];%*[^;];%[^;];",
				3206	buf0, buf1);
				3207	if (ret != 2 \|\| *line == '#')
				3208	continue;
				3209	s = buf0;
				3210	t = buf2;
				3211	while (*s) {
				3212	unichar = strtoul(s, &s, 16);
				3213	t += utf8encode(t, unichar);
				3214	}
				3215	*t = '\0';
				3216
				3217	ignorables = 0;
				3218	s = buf1;
				3219	t = buf3;
				3220	while (*s) {
				3221	unichar = strtoul(s, &s, 16);
				3222	data = &unicode_data[unichar];
				3223	if (data->utf8nfkdi && !*data->utf8nfkdi)
				3224	ignorables = 1;
				3225	else
				3226	t += utf8encode(t, unichar);
				3227	}
				3228	*t = '\0';
				3229
				3230	tests++;
				3231	if (normalize_line(nfkdi_tree) < 0) {
				3232	printf("Line %s -> %s", buf0, buf1);
				3233	if (ignorables)
				3234	printf(" (ignorables removed)");
				3235	printf(" failure\n");
				3236	failures++;
				3237	}
				3238	}
				3239	fclose(file);
				3240	if (verbose > 0)
				3241	printf("Ran %d tests with %d failures\n", tests, failures);
				3242	if (failures)
				3243	file_fail(test_name);
				3244	}
				3245
				3246	/* ------------------------------------------------------------------ */
				3247
				3248	static void write_file(void)
				3249	{
				3250	FILE *file;
				3251	int i;
				3252	int j;
				3253	int t;
				3254	int gen;
				3255
				3256	if (verbose > 0)
				3257	printf("Writing %s\n", utf8_name);
				3258	file = fopen(utf8_name, "w");
				3259	if (!file)
				3260	open_fail(utf8_name, errno);
				3261
				3262	fprintf(file, "/* This file is generated code, do not edit. */\n");
				3263	fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
				3264	fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
				3265	fprintf(file, "#endif\n");
				3266	fprintf(file, "\n");
				3267	fprintf(file, "static const unsigned int utf8vers = %#x;\n",
				3268	unicode_maxage);
				3269	fprintf(file, "\n");
				3270	fprintf(file, "static const unsigned int utf8agetab[] = {\n");
				3271	for (i = 0; i != ages_count; i++)
				3272	fprintf(file, "\t%#x%s\n", ages[i],
				3273	ages[i] == unicode_maxage ? "" : ",");
				3274	fprintf(file, "};\n");
				3275	fprintf(file, "\n");
				3276	fprintf(file, "static const struct utf8data utf8nfkdicfdata[] = {\n");
				3277	t = 0;
				3278	for (gen = 0; gen < ages_count; gen++) {
				3279	fprintf(file, "\t{ %#x, %d }%s\n",
				3280	ages[gen], trees[t].index,
				3281	ages[gen] == unicode_maxage ? "" : ",");
				3282	if (trees[t].maxage == ages[gen])
				3283	t += 2;
				3284	}
				3285	fprintf(file, "};\n");
				3286	fprintf(file, "\n");
				3287	fprintf(file, "static const struct utf8data utf8nfkdidata[] = {\n");
				3288	t = 1;
				3289	for (gen = 0; gen < ages_count; gen++) {
				3290	fprintf(file, "\t{ %#x, %d }%s\n",
				3291	ages[gen], trees[t].index,
				3292	ages[gen] == unicode_maxage ? "" : ",");
				3293	if (trees[t].maxage == ages[gen])
				3294	t += 2;
				3295	}
				3296	fprintf(file, "};\n");
				3297	fprintf(file, "\n");
				3298	fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
				3299	utf8data_size);
				3300	t = 0;
				3301	for (i = 0; i != utf8data_size; i += 16) {
				3302	if (i == trees[t].index) {
				3303	fprintf(file, "\t/* %s_%x */\n",
				3304	trees[t].type, trees[t].maxage);
				3305	if (t < trees_count-1)
				3306	t++;
				3307	}
				3308	fprintf(file, "\t");
				3309	for (j = i; j != i + 16; j++)
				3310	fprintf(file, "0x%.2x%s", utf8data[j],
				3311	(j < utf8data_size -1 ? "," : ""));
				3312	fprintf(file, "\n");
				3313	}
				3314	fprintf(file, "};\n");
				3315	fclose(file);
				3316	}
				3317
				3318	/* ------------------------------------------------------------------ */
				3319
				3320	int main(int argc, char *argv[])
				3321	{
				3322	unsigned int unichar;
				3323	int opt;
				3324
				3325	argv0 = argv[0];
				3326
				3327	while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
				3328	switch (opt) {
				3329	case 'a':
				3330	age_name = optarg;
				3331	break;
				3332	case 'c':
				3333	ccc_name = optarg;
				3334	break;
				3335	case 'd':
				3336	data_name = optarg;
				3337	break;
				3338	case 'f':
				3339	fold_name = optarg;
				3340	break;
				3341	case 'n':
				3342	norm_name = optarg;
				3343	break;
				3344	case 'o':
				3345	utf8_name = optarg;
				3346	break;
				3347	case 'p':
				3348	prop_name = optarg;
				3349	break;
				3350	case 't':
				3351	test_name = optarg;
				3352	break;
				3353	case 'v':
				3354	verbose++;
				3355	break;
				3356	case 'h':
				3357	help();
				3358	exit(0);
				3359	default:
				3360	usage();
				3361	}
				3362	}
				3363
				3364	if (verbose > 1)
				3365	help();
				3366	for (unichar = 0; unichar != 0x110000; unichar++)
				3367	unicode_data[unichar].code = unichar;
				3368	age_init();
				3369	ccc_init();
				3370	nfkdi_init();
				3371	nfkdicf_init();
				3372	ignore_init();
				3373	corrections_init();
				3374	hangul_decompose();
				3375	nfkdi_decompose();
				3376	nfkdicf_decompose();
				3377	utf8_init();
				3378	trees_init();
				3379	trees_populate();
				3380	trees_reduce();
				3381	trees_verify();
				3382	/* Prevent "unused function" warning. */
				3383	(void)lookup(nfkdi_tree, " ");
				3384	if (verbose > 2)
				3385	tree_walk(nfkdi_tree);
				3386	if (verbose > 2)
				3387	tree_walk(nfkdicf_tree);
				3388	normalization_test();
				3389	write_file();
				3390
				3391	return 0;
				3392	}