git.subgeniuskitty.com - screensavers/.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* xscreensaver, Copyright (c) 2014-2016 Jamie Zawinski <jwz@jwz.org>
	2	*
	3	* Permission to use, copy, modify, distribute, and sell this software and its
	4	* documentation for any purpose is hereby granted without fee, provided that
	5	* the above copyright notice appear in all copies and that both that
	6	* copyright notice and this permission notice appear in supporting
	7	* documentation. No representations are made about the suitability of this
	8	* software for any purpose. It is provided "as is" without express or
	9	* implied warranty.
	10	*/
	11
	12	#ifdef HAVE_CONFIG_H
	13	# include "config.h"
	14	#endif
	15
	16	#include <stdlib.h>
	17	#include <stdio.h>
	18	#include <string.h>
	19
	20	#ifdef HAVE_JWXYZ
	21	# include "jwxyz.h"
	22	#else /* !HAVE_JWXYZ */
	23	# include <X11/Xlib.h>
	24	#endif
	25
	26	#include "utf8wc.h"
	27
	28
	/* "Unicode Replacement Character", displayed in lieu of invalid characters. */
	# define INVALID 0xFFFD


	/* Clamp a code point to something that is legal to emit as UTF-8.

	   The value is first masked to 31 bits.  Anything that is then still
	   above U+10FFFF, equal to zero (NUL is never passed through), or inside
	   the UTF-16 surrogate range U+D800..U+DFFF is replaced by INVALID.
	   Every other value is returned unchanged.
	 */
	static unsigned long
	uc_truncate (unsigned long uc)
	{
	  uc &= 0x7FFFFFFFL;                    /* Unicode is 31 bits */
	  if (uc > 0x10FFFF) uc = INVALID;      /* But UTF-8 is 4 bytes */
	  if (uc == 0)       uc = INVALID;      /* no nulls */

	  if (uc >= 0xD800 && uc <= 0xDFFF)
	    /* Reserved for use with UTF-16: not a real character. */
	    uc = INVALID;

	  return uc;
	}
	48
	49
	50	/* Parse the first UTF8 character at the front of the string.
	51	Return the Unicode character, and the number of bytes read.
	52	*/
	53	long
	54	utf8_decode (const unsigned char in, long length, unsigned long unicode_ret)
	55	{
	56	const unsigned char *start = in;
	57	const unsigned char *end = in + length;
	58	unsigned long uc = INVALID;
	59	unsigned long min = 0;
	60	unsigned char c;
	61
	62	if (length <= 0) goto DONE;
	63
	64	c = *in++;
	65
	66	# define PREMATURE_EOF { in = end; goto DONE; }
	67
	68	if ((c & 0xC0) == 0x80) { /* 10xxxxxx - lonely continuation byte */
	69	uc = INVALID;
	70
	71	} else if ((c & 0x80) == 0) { /* 0xxxxxxx - 7 bits in 1 byte */
	72	uc = (c & 0x7F); /* 01111111 */
	73
	74	} else if ((c & 0xE0) == 0xC0) { /* 110xxxxx - 11 bits in 2 bytes */
	75	if (in+1 > end) PREMATURE_EOF;
	76	min = 1 << 7;
	77	uc = (((c & 0x1F) << 6) \| /* 00011111------ */
	78	(in[0] & 0x3F)); /* 00111111 */
	79	in += 1;
	80
	81	} else if ((c & 0xF0) == 0xE0) { /* 1110xxxx - 16 bits in 3 bytes */
	82	if (in+2 > end) PREMATURE_EOF;
	83	min = 1 << 11;
	84	uc = (((c & 0x0F) << 12) \| /* 00001111----+------- */
	85	((in[0] & 0x3F) << 6) \| /* 00111111------ */
	86	((in[1] & 0x3F))); /* 00111111 */
	87	in += 2;
	88
	89	} else if ((c & 0xF8) == 0xF0) { /* 11110xxx - 21 bits in 4 bytes */
	90	if (in+3 > end) PREMATURE_EOF;
	91	min = 1 << 16;
	92	uc = (((c & 0x07) << 18) \| /* 00000111--+-------+------- */
	93	((in[0] & 0x3F) << 12) \| /* 01111111----+------- */
	94	((in[1] & 0x3F) << 6) \| /* 00111111------ */
	95	((in[2] & 0x3F))); /* 00111111 */
	96	in += 3;
	97
	98	} else if ((c & 0xFC) == 0xF8) { /* 111110xx - 26 bits in 5 bytes */
	99	if (in+4 > end) PREMATURE_EOF;
	100	min = 1 << 21;
	101	uc = (((c & 0x03) << 24) \| /* 00000011--------+-------+------- */
	102	((in[0] & 0x3F) << 18) \| /* 00111111--+-------+------- */
	103	((in[1] & 0x3F) << 12) \| /* 00111111----+------- */
	104	((in[2] & 0x3F) << 6) \| /* 00111111------ */
	105	((in[3] & 0x3F))); /* 00111111 */
	106	in += 4;
	107
	108	} else if ((c & 0xFE) == 0xFC) { /* 1111110x - 31 bits in 6 bytes */
	109	if (in+5 > end) PREMATURE_EOF;
	110	min = 1 << 26;
	111	uc = (((c & 0x01) << 30) \| /* 00000001------+-------+-------+------- */
	112	((in[0] & 0x3F) << 24) \| /* 00111111+-------+-------+------- */
	113	((in[1] & 0x3F) << 18) \| /* 00111111--+-------+------- */
	114	((in[2] & 0x3F) << 12) \| /* 00111111----+------- */
	115	((in[3] & 0x3F) << 6) \| /* 00111111------ */
	116	((in[4] & 0x3F))); /* 00111111 */
	117	in += 5;
	118	} else {
	119	uc = INVALID; /* Unparsable sequence. */
	120	}
	121
	122	DONE:
	123
	124	length = in - start;
	125
	126	/* If any of the continuation bytes didn't begin with the continuation tag,
	127	the sequence is invalid; stop at the bad byte, not consuming later ones.
	128	(It's easier to check this after the fact than up above.) */
	129	{
	130	int i;
	131	for (i = 1; i < length; i++)
	132	if ((start[i] & 0xC0) != 0x80) {
	133	uc = INVALID;
	134	length = i+1;
	135	break;
	136	}
	137	}
	138
	139	if (uc < min)
	140	/* A multi-byte sequence encoded a character that could have been
	141	encoded with a shorter sequence, e.g., hiding ASCII inside a
	142	multi-byte sequence. Something hinky's going on. Reject it. */
	143	uc = INVALID;
	144
	145	uc = uc_truncate (uc);
	146
	147	if (unicode_ret)
	148	*unicode_ret = uc;
	149
	150	return length;
	151	}
	152
	153
	/* Converts a Unicode character to a multi-byte UTF8 sequence.

	   UC is first passed through uc_truncate().  At most LENGTH bytes are
	   written to OUT and no terminating NUL is added.
	   Returns the number of bytes written, which is 0 when LENGTH is too
	   small to hold the encoding.
	 */
	int
	utf8_encode (unsigned long uc, char *out, long length)
	{
	  const char *old = out;

	  uc = uc_truncate (uc);

	  if (uc < 0x80 && length >= 1)                 /* 7 bits in 1 byte */
	    {
	      *out++ = (char) uc;                       /* 0xxxxxxx */
	    }
	  else if (uc < 0x800 && length >= 2)           /* 11 bits in 2 bytes */
	    {
	      *out++ = (char) (0xC0 | ((uc >> 6)  & 0x1F));     /* 110xxxxx */
	      *out++ = (char) (0x80 |  (uc        & 0x3F));     /* 10xxxxxx */
	    }
	  else if (uc < 0x10000L && length >= 3)        /* 16 bits in 3 bytes */
	    {
	      *out++ = (char) (0xE0 | ((uc >> 12) & 0x0F));     /* 1110xxxx */
	      *out++ = (char) (0x80 | ((uc >>  6) & 0x3F));     /* 10xxxxxx */
	      *out++ = (char) (0x80 |  (uc        & 0x3F));     /* 10xxxxxx */
	    }
	  else if (uc < 0x200000L && length >= 4)       /* 21 bits in 4 bytes */
	    {
	      *out++ = (char) (0xF0 | ((uc >> 18) & 0x07));     /* 11110xxx */
	      *out++ = (char) (0x80 | ((uc >> 12) & 0x3F));     /* 10xxxxxx */
	      *out++ = (char) (0x80 | ((uc >>  6) & 0x3F));     /* 10xxxxxx */
	      *out++ = (char) (0x80 |  (uc        & 0x3F));     /* 10xxxxxx */
	    }
	  /* The two branches below mirror the decoder's 5- and 6-byte forms.
	     With uc_truncate() capping values at 0x10FFFF they are not reached. */
	  else if (uc < 0x4000000L && length >= 5)      /* 26 bits in 5 bytes */
	    {
	      *out++ = (char) (0xF8 | ((uc >> 24) & 0x03));     /* 111110xx */
	      *out++ = (char) (0x80 | ((uc >> 18) & 0x3F));     /* 10xxxxxx */
	      *out++ = (char) (0x80 | ((uc >> 12) & 0x3F));     /* 10xxxxxx */
	      *out++ = (char) (0x80 | ((uc >>  6) & 0x3F));     /* 10xxxxxx */
	      *out++ = (char) (0x80 |  (uc        & 0x3F));     /* 10xxxxxx */
	    }
	  else if (length >= 6)                         /* 31 bits in 6 bytes */
	    {
	      *out++ = (char) (0xFC | ((uc >> 30) & 0x01));     /* 1111110x */
	      *out++ = (char) (0x80 | ((uc >> 24) & 0x3F));     /* 10xxxxxx */
	      *out++ = (char) (0x80 | ((uc >> 18) & 0x3F));     /* 10xxxxxx */
	      *out++ = (char) (0x80 | ((uc >> 12) & 0x3F));     /* 10xxxxxx */
	      *out++ = (char) (0x80 | ((uc >>  6) & 0x3F));     /* 10xxxxxx */
	      *out++ = (char) (0x80 |  (uc        & 0x3F));     /* 10xxxxxx */
	    }

	  return (int) (out - old);
	}
	206
	207
	208	/* Converts a null-terminated UTF8 string to a null-terminated XChar2b array.
	209	This only handles characters that can be represented in 16 bits, the
	210	Basic Multilingual Plane. (No hieroglyphics, Elvish, Klingon or Emoji.)
	211	*/
	212	XChar2b *
	213	utf8_to_XChar2b (const char string, int length_ret)
	214	{
	215	long in_len = strlen(string);
	216	const unsigned char in = (const unsigned char ) string;
	217	const unsigned char *in_end = in + in_len;
	218	XChar2b c2b = (XChar2b ) malloc ((in_len + 1) * sizeof(*c2b));
	219	XChar2b *out = c2b;
	220	if (! out) return 0;
	221
	222	while (in < in_end)
	223	{
	224	unsigned long uc = 0;
	225	long L = utf8_decode (in, in_end - in, &uc);
	226	in += L;
	227
	228	/* If it can't be represented in a 16-bit XChar2b,
	229	use "Unicode Replacement Character". */
	230	if (uc > 0xFFFF) uc = INVALID;
	231
	232	out->byte1 = (uc >> 8) & 0xFF;
	233	out->byte2 = uc & 0xFF;
	234	out++;
	235	}
	236
	237	out->byte1 = 0;
	238	out->byte2 = 0;
	239
	240	if (length_ret)
	241	*length_ret = (int) (out - c2b);
	242
	243	/* shrink */
	244	c2b = (XChar2b ) realloc (c2b, (out - c2b + 1) sizeof(*c2b));
	245
	246	return c2b;
	247	}
	248
	249
	/* Split a UTF8 string into an array of strings, one per character.
	   The sub-strings will be null terminated and may be multiple bytes.

	   Combining marks, emoji skin tone modifiers and Zero Width Joiner
	   sequences are appended to the preceding sub-string instead of getting
	   an entry of their own.

	   The returned array is terminated by a null pointer; if LENGTH_RET is
	   non-NULL it receives the number of sub-strings.  The array and every
	   sub-string are malloc'd and must be freed by the caller.  Returns 0 if
	   any allocation fails.
	 */
	char **
	utf8_split (const char *string, int *length_ret)
	{
	  const unsigned char *in = (const unsigned char *) string;
	  long len = strlen (string);
	  const unsigned char *end = in + len;
	  char **ret = (char **) malloc ((len+1) * sizeof(*ret));
	  char **shrunk;
	  int i = 0;
	  int zwjp = 0;         /* previous character was a Zero Width Joiner */
	  if (!ret) return 0;

	  while (in < end)
	    {
	      unsigned long uc = 0;
	      /* Only the bytes that remain may be examined, not the full length:
	         otherwise a truncated sequence at the end reads past the string. */
	      long len2 = utf8_decode (in, end - in, &uc);
	      char *piece = (char *) malloc (len2 + 1);
	      if (!piece) goto FAIL;
	      memcpy (piece, (const char *) in, len2);
	      piece[len2] = 0;
	      ret[i++] = piece;
	      in += len2;

	      /* If this is a Combining Diacritical, append it to the previous
	         character. E.g., "y\314\206\314\206" is one string, not three.

	         If this is ZWJ, Zero Width Joiner, then we append both this character
	         and the following character, e.g. "X ZWJ Y" is one string not three.

	         #### Hmmm, should this also include every character in the
	         "Symbol, Modifier" category, or does ZWJ get used for those?
	         https://www.fileformat.info/info/unicode/category/Sk/list.htm

	         Is it intended that "Latin small letter C, 0063" + "Cedilla, 00B8"
	         should be a single glyph? Or is that what "Combining Cedilla, 0327"
	         is for?  I'm confused by the fact that the skin tones (1F3FB-1F3FF)
	         do not seem to be in a readily-identifiable block the way the various
	         combining diacriticals are.
	       */
	      if (i > 1 &&
	          ((uc >=  0x300 && uc <=  0x36F) || /* Combining Diacritical */
	           (uc >= 0x1AB0 && uc <= 0x1AFF) || /* Combining Diacritical Ext. */
	           (uc >= 0x1DC0 && uc <= 0x1DFF) || /* Combining Diacritical Supp. */
	           (uc >= 0x20D0 && uc <= 0x20FF) || /* Combining Diacritical Sym. */
	           (uc >= 0xFE20 && uc <= 0xFE2F) || /* Combining Half Marks */
	           (uc >= 0x1F3FB && uc <= 0x1F3FF) || /* Emoji skin tone modifiers */
	           zwjp || uc == 0x200D))            /* Zero Width Joiner */
	        {
	          long L1 = strlen(ret[i-2]);
	          long L2 = strlen(ret[i-1]);
	          char *s2 = (char *) malloc (L1 + L2 + 1);
	          if (!s2) goto FAIL;
	          memcpy (s2,      ret[i-2], L1);
	          memcpy (s2 + L1, ret[i-1], L2);
	          s2[L1 + L2] = 0;
	          free (ret[i-2]);
	          free (ret[i-1]);      /* the merged piece is no longer needed */
	          ret[i-2] = s2;
	          i--;
	          zwjp = (uc == 0x200D);  /* Swallow the next character as well */
	        }
	    }
	  ret[i] = 0;

	  if (length_ret)
	    *length_ret = i;

	  /* shrink; if that fails the original (larger) array is still valid */
	  shrunk = (char **) realloc (ret, (i+1) * sizeof(*ret));

	  return shrunk ? shrunk : ret;

	 FAIL:
	  /* Entries 0 .. i-1 are the live allocations at this point. */
	  while (i > 0)
	    free (ret[--i]);
	  free (ret);
	  return 0;
	}
	321
	322
	323	/* Converts a null-terminated XChar2b array to a null-terminated UTF8 string.
	324	*/
	325	char *
	326	XChar2b_to_utf8 (const XChar2b in, int length_ret)
	327	{
	328	int in_len = 0;
	329	const XChar2b *in_end;
	330	int out_len;
	331	char utf8, out;
	332	const char *out_end;
	333
	334	/* Find the null termination on the XChar2b. */
	335	for (in_end = in; in_end->byte1 \|\| in_end->byte2; in_end++, in_len++)
	336	;
	337
	338	out_len = (in_len + 1) * 3; /* 16 bit chars = 3 bytes max */
	339	utf8 = out = (char *) malloc (out_len + 1);
	340	if (! out) return 0;
	341	out_end = out + out_len;
	342
	343	while (in < in_end)
	344	{
	345	unsigned long uc = (in->byte1 << 8) \| in->byte2;
	346	int wrote = utf8_encode (uc, out, out_end - out);
	347	if (wrote > 3) abort(); /* Can't happen with 16 bit input */
	348	out += wrote;
	349	in++;
	350	}
	351	*out = 0;
	352
	353	out_len = (int) (out - utf8 + 1);
	354
	355	if (length_ret)
	356	*length_ret = out_len;
	357
	358	/* shrink */
	359	utf8 = (char *) realloc (utf8, out_len);
	360
	361	return utf8;
	362	}
	363
	364
	365	/* Converts a UTF8 string to the closest Latin1 or ASCII equivalent.
	366	*/
	367	char *
	368	utf8_to_latin1 (const char *string, Bool ascii_p)
	369	{
	370	long in_len = strlen(string);
	371	const unsigned char in = (const unsigned char ) string;
	372	const unsigned char *in_end = in + in_len;
	373	unsigned char ret = (unsigned char ) malloc (in_len + 1);
	374	unsigned char *out = ret;
	375
	376	if (! ret) return 0;
	377
	378	while (in < in_end)
	379	{
	380	unsigned long uc = 0;
	381	long len2 = utf8_decode (in, in_end - in, &uc);
	382	in += len2;
	383
	384	if (uc == '\240') /*   */
	385	uc = ' ';
	386	else if (uc >= 0x300 && uc <= 0x36F)
	387	uc = 0; /* Discard "Combining Diacritical Marks" */
	388	else if (uc >= 0x1AB0 && uc <= 0x1AFF)
	389	uc = 0; /* Discard "Combining Diacritical Marks Extended" */
	390	else if (uc >= 0x1DC0 && uc <= 0x1DFF)
	391	uc = 0; /* Discard "Combining Diacritical Marks Supplement" */
	392	else if (uc >= 0x20D0 && uc <= 0x20FF)
	393	uc = 0; /* Discard "Combining Diacritical Marks for Symbols" */
	394	else if (uc >= 0xFE20 && uc <= 0xFE2F)
	395	uc = 0; /* Discard "Combining Half Marks" */
	396
	397	else if (uc > 0xFF)
	398	switch (uc) {
	399
	400	/* Map "Unicode General Punctuation Block" to Latin1 equivalents. */
	401
	402	case 0x2000: /* EN QUAD */
	403	case 0x2001: /* EM QUAD */
	404	case 0x2002: /* EN SPACE */
	405	case 0x2003: /* EM SPACE */
	406	case 0x2004: /* THREE-PER-EM SPACE */
	407	case 0x2005: /* FOUR-PER-EM SPACE */
	408	case 0x2006: /* SIX-PER-EM SPACE */
	409	case 0x2007: /* FIGURE SPACE */
	410	case 0x2008: /* PUNCTUATION SPACE */
	411	case 0x2009: /* THIN SPACE */
	412	case 0x200A: /* HAIR SPACE */
	413	uc = ' ';
	414	break;
	415
	416	case 0x2010: /* HYPHEN */
	417	case 0x2011: /* NON-BREAKING HYPHEN */
	418	case 0x2012: /* FIGURE DASH */
	419	case 0x2013: /* EN DASH */
	420	case 0x2014: /* EM DASH */
	421	case 0x2015: /* HORIZONTAL BAR */
	422	uc = '-';
	423	break;
	424
	425	case 0x2018: /* LEFT SINGLE QUOTATION MARK */
	426	case 0x2019: /* SINGLE LOW-9 QUOTATION MARK */
	427	case 0x201A: /* SINGLE LOW-9 QUOTATION MARK */
	428	case 0x201B: /* SINGLE HIGH-REVERSED-9 QUOTATION MARK */
	429	uc = '\'';
	430	break;
	431
	432	case 0x201C: /* LEFT DOUBLE QUOTATION MARK */
	433	case 0x201D: /* RIGHT DOUBLE QUOTATION MARK */
	434	case 0x201E: /* DOUBLE LOW-9 QUOTATION MARK */
	435	case 0x201F: /* DOUBLE HIGH-REVERSED-9 QUOTATION MARK */
	436	uc = '"';
	437	break;
	438
	439	case 0x2022: uc = '\267'; break; /* BULLET */
	440	case 0x2023: uc = '\273'; break; /* TRIANGULAR BULLET */
	441	case 0x2027: uc = '\267'; break; /* HYPHENATION POINT */
	442	case 0x202F: uc = ' '; break; /* NARROW NO-BREAK SPACE */
	443	case 0x2038: uc = '^'; break; /* CARET */
	444	case 0x2039: uc = '\253'; break; /* SINGLE LEFT ANGLE QUOTATION MARK */
	445	case 0x203A: uc = '\273'; break; /* SINGLE RIGHT ANGLE QUOTATION MARK*/
	446	case 0x2041: uc = '^'; break; /* CARET INSERTION POINT */
	447	case 0x2042: uc = ''; break; / ASTERISM */
	448	case 0x2043: uc = '='; break; /* HYPHEN BULLET */
	449	case 0x2044: uc = '/'; break; /* FRACTION SLASH */
	450	case 0x204B: uc = '\266'; break; /* REVERSED PILCROW SIGN */
	451	case 0x204C: uc = '\267'; break; /* BLACK LEFTWARDS BULLET */
	452	case 0x204D: uc = '\267'; break; /* BLACK RIGHTWARDS BULLET */
	453	case 0x204E: uc = ''; break; / LOW ASTERISK */
	454	case 0x204F: uc = ';'; break; /* REVERSED SEMICOLON */
	455	default:
	456	break;
	457	}
	458
	459	if (uc > 0xFF)
	460	/* "Inverted question mark" looks enough like 0xFFFD,
	461	the "Unicode Replacement Character". */
	462	uc = (ascii_p ? '#' : '\277');
	463
	464	if (ascii_p) /* Map Latin1 to the closest ASCII versions. */
	465	{
	466	const unsigned char latin1_to_ascii[96] =
	467	" !C##Y\|S_C#<=-R_##23'uP.,1o>###?"
	468	"AAAAAAECEEEEIIIIDNOOOOOx0UUUUYpS"
	469	"aaaaaaeceeeeiiiionooooo/ouuuuypy";
	470	if (uc >= 0xA0)
	471	uc = latin1_to_ascii[uc - 0xA0];
	472	}
	473
	474	if (uc > 0)
	475	*out++ = (unsigned char) uc;
	476	}
	477	*out = 0;
	478
	479	/* shrink */
	480	ret = (unsigned char ) realloc (ret, (out - ret + 1) sizeof(*ret));
	481
	482	return (char *) ret;
	483	}
	484
	485
	486	/*************************************************************************
	487
	488	cd ../hacks ; make test-utf8wc
	489
	490	*************************************************************************/
	491
	492	#ifdef SELFTEST
	493
	/* Convert a UTF8 string to Unicode and back again.
	   Each character is decoded with utf8_decode() and then re-encoded with
	   utf8_encode().  Returns a malloc'd, null-terminated string.
	   Aborts if memory cannot be allocated.
	 */
	static char *
	split_and_join (const char *string)
	{
	  const unsigned char *in = (const unsigned char *) string;
	  int len = strlen (string);
	  const unsigned char *end = in + len;
	  unsigned long *unicode = (unsigned long *)
	    malloc((len + 1) * sizeof(*unicode));
	  int i = 0;
	  char *ret, *out, *out_end;

	  if (!unicode) abort();

	  while (in < end)
	    {
	      /* Pass only the bytes that remain, so a truncated trailing
	         sequence cannot make the decoder read past the string. */
	      long len2 = utf8_decode (in, end - in, &unicode[i]);
	      i++;
	      in += len2;
	    }
	  unicode[i] = 0;       /* the decoder never yields 0, so 0 terminates */

	  i = i*6 + 1;          /* 6 bytes is the longest encoding, plus a NUL */
	  out = ret = (char *) malloc(i);
	  if (!ret) abort();
	  out_end = out + i;
	  i = 0;
	  while (unicode[i])
	    {
	      int len2 = utf8_encode (unicode[i], out, out_end - out);
	      out += len2;
	      i++;
	    }
	  *out = 0;
	  free (unicode);

	  return ret;
	}
	530
	531
	/* Print PREFIX (right-justified in 6 columns), then S in double quotes,
	   to OUT.  Double quotes and backslashes in S are backslash-escaped;
	   bytes below 32 or at or above 127 are written as 3-digit octal escapes.
	 */
	static void
	LOG (FILE *out, const char *prefix, const char *s)
	{
	  fprintf (out, "%6s: \"", prefix);
	  while (*s)
	    {
	      unsigned char c = *s;
	      if (c == '"' || c == '\\') fprintf(out, "\\%c", c);
	      else if (c < 32 || c >= 127) fprintf(out, "\\%03o", c);
	      else fprintf (out, "%c", c);
	      s++;
	    }
	  fprintf (out, "\"\n");
	}
	546
	547
	548	int
	549	main (int argc, char **argv)
	550	{
	551	/* Adapted from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
	552	*/
	553
	554	# define URC "\357\277\275" /* 0xFFFD, "Unicode Replacement Character" */
	555
	556	static const struct { const char name, in, target, target2; } tests[] = {
	557	/* 1 Some correct UTF-8 text */
	558
	559	/* The Greek word 'kosme': */
	560	{ "1", "\316\272\341\275\271\317\203\316\274\316\265" },
	561
	562
	563	/* 2 Boundary condition test cases */
	564
	565	/* 2.1 First possible sequence of a certain length */
	566
	567	{ "2.1.1", /* 1 byte (U-00000000): */ "\000" },
	568	{ "2.1.2", /* 2 bytes (U-00000080): */ "\302\200" },
	569	{ "2.1.3", /* 3 bytes (U-00000800): */ "\340\240\200" },
	570	{ "2.1.4", /* 4 bytes (U-00010000): */ "\360\220\200\200", 0, URC },
	571	{ "2.1.5", /* 5 bytes (U-00200000): */ "\370\210\200\200\200", URC },
	572	{ "2.1.6", /* 6 bytes (U-04000000): */ "\374\204\200\200\200\200", URC },
	573
	574	/* 2.2 Last possible sequence of a certain length */
	575
	576	{ "2.2.1", /* 1 byte (U-0000007F): */ "\177" },
	577	{ "2.2.2", /* 2 bytes (U-000007FF): */ "\337\277" },
	578	{ "2.2.3", /* 3 bytes (U-0000FFFF): */ "\357\277\277" },
	579	{ "2.2.4", /* 4 bytes (U-001FFFFF): */ "\367\277\277\277", URC },
	580	{ "2.2.5", /* 5 bytes (U-03FFFFFF): */ "\373\277\277\277\277", URC },
	581	{ "2.2.6", /* 6 bytes (U-7FFFFFFF): */ "\375\277\277\277\277\277", URC },
	582
	583	/* 2.3 Other boundary conditions */
	584
	585	{ "2.3.1", /* U-0000D7FF = ed 9f bf = */ "\355\237\277" },
	586	{ "2.3.2", /* U-0000E000 = ee 80 80 = */ "\356\200\200" },
	587	{ "2.3.3", /* U-0000FFFD = ef bf bd = */ URC },
	588	{ "2.3.4", /* U-0010FFFF = f4 8f bf bf = */ "\364\217\277\277", 0, URC },
	589	{ "2.3.5", /* U-00110000 = f4 90 80 80 = */ "\364\220\200\200", URC },
	590
	591
	592	/* 3 Malformed sequences */
	593
	594	/* 3.1 Unexpected continuation bytes */
	595
	596	/* Each unexpected continuation byte should be separately signalled as a
	597	malformed sequence of its own. */
	598
	599	{ "3.1.1", /* First continuation byte 0x80: */ "\200", URC },
	600	{ "3.1.2", /* Last continuation byte 0xbf: */ "\277", URC },
	601	{ "3.1.3", /* 2 continuation bytes: */ "\200\277", URC URC },
	602	{ "3.1.4", /* 3 continuation bytes: */ "\200\277\200", URC URC URC },
	603	{ "3.1.5", /* 4 continuation bytes: */ "\200\277\200\277",
	604	URC URC URC URC },
	605	{ "3.1.6", /* 5 continuation bytes: */ "\200\277\200\277\200",
	606	URC URC URC URC URC },
	607	{ "3.1.7", /* 6 continuation bytes: */ "\200\277\200\277\200\277",
	608	URC URC URC URC URC URC },
	609	{ "3.1.8", /* 7 continuation bytes: */ "\200\277\200\277\200\277\200",
	610	URC URC URC URC URC URC URC },
	611
	612	{ "3.1.9", /* Sequence of all 64 possible continuation bytes (0x80-0xbf):*/
	613
	614	"\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217"
	615	"\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237"
	616	"\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257"
	617	"\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277",
	618	URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
	619	URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
	620	URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
	621	URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
	622
	623	/* 3.2 Lonely start characters */
	624
	625	{ "3.2.1", /* All 32 first bytes of 2-byte sequences (0xc0-0xdf),
	626	each followed by a space character: */
	627
	628	"\300 \301 \302 \303 \304 \305 \306 \307 \310 \311 \312 \313 \314 "
	629	"\315 \316 \317 \320 \321 \322 \323 \324 \325 \326 \327 \330 \331 "
	630	"\332 \333 \334 \335 \336 \337 ",
	631	URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
	632	URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
	633
	634	{ "3.2.2", /* All 16 first bytes of 3-byte sequences (0xe0-0xef),
	635	each followed by a space character: */
	636	"\340 \341 \342 \343 \344 \345 \346 \347 "
	637	"\350 \351 \352 \353 \354 \355 \356 \357 ",
	638	URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
	639
	640	{ "3.2.3", /* All 8 first bytes of 4-byte sequences (0xf0-0xf7),
	641	each followed by a space character: */
	642	URC URC URC URC URC URC URC URC },
	643
	644	{ "3.2.4", /* All 4 first bytes of 5-byte sequences (0xf8-0xfb),
	645	each followed by a space character: */
	646	"\370 \371 \372 \373 ",
	647	URC URC URC URC },
	648
	649	{ "3.2.5", /* All 2 first bytes of 6-byte sequences (0xfc-0xfd),
	650	each followed by a space character: */
	651	"\374 \375 ", URC URC },
	652
	653	/* 3.3 Sequences with last continuation byte missing */
	654
	655	/* All bytes of an incomplete sequence should be signalled as a single
	656	malformed sequence, i.e., you should see only a single replacement
	657	character in each of the next 10 tests. (Characters as in section 2) */
	658
	659	{ "3.3.1", /* 2-byte sequence with last byte missing (U+0000): */
	660	"\300", URC },
	661	{ "3.3.2", /* 3-byte sequence with last byte missing (U+0000): */
	662	"\340\200", URC },
	663	{ "3.3.3", /* 4-byte sequence with last byte missing (U+0000): */
	664	"\360\200\200", URC },
	665	{ "3.3.4", /* 5-byte sequence with last byte missing (U+0000): */
	666	"\370\200\200\200", URC },
	667	{ "3.3.5", /* 6-byte sequence with last byte missing (U+0000): */
	668	"\374\200\200\200\200", URC },
	669	{ "3.3.6", /* 2-byte sequence with last byte missing (U-000007FF): */
	670	"\337", URC },
	671	{ "3.3.7", /* 3-byte sequence with last byte missing (U-0000FFFF): */
	672	"\357\277", URC },
	673	{ "3.3.8", /* 4-byte sequence with last byte missing (U-001FFFFF): */
	674	"\367\277\277", URC },
	675	{ "3.3.9", /* 5-byte sequence with last byte missing (U-03FFFFFF): */
	676	"\373\277\277\277", URC },
	677	{ "3.3.10", /* 6-byte sequence with last byte missing (U-7FFFFFFF): */
	678	"\375\277\277\277\277", URC },
	679
	680	/* 3.4 Concatenation of incomplete sequences */
	681
	682	/* All the 10 sequences of 3.3 concatenated, you should see 10 malformed
	683	sequences being signalled: */
	684
	685	{ "3.4", "\300\340\200\360\200\200\370\200\200\200\374\200\200\200\200"
	686	"\337\357\277\367\277\277\373\277\277\277\375\277\277\277\277",
	687	URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
	688
	689	/* 3.5 Impossible bytes */
	690
	691	/* The following two bytes cannot appear in a correct UTF-8 string */
	692
	693	{ "3.5.1", /* fe = */ "\376", URC },
	694	{ "3.5.2", /* ff = */ "\377", URC },
	695	{ "3.5.3", /* fe fe ff ff = */ "\376\376\377\377", URC URC URC URC },
	696
	697
	698	/* 4 Overlong sequences */
	699
	700	/* 4.1 Examples of an overlong ASCII character */
	701
	702	{ "4.1.1", /* U+002F = c0 af = */ "\300\257", URC },
	703	{ "4.1.2", /* U+002F = e0 80 af = */ "\340\200\257", URC },
	704	{ "4.1.3", /* U+002F = f0 80 80 af = */ "\360\200\200\257", URC },
	705	{ "4.1.4", /* U+002F = f8 80 80 80 af = */ "\370\200\200\200\257",
	706	URC },
	707	{ "4.1.5", /* U+002F = fc 80 80 80 80 af = */ "\374\200\200\200\200\257",
	708	URC },
	709
	710	/* 4.2 Maximum overlong sequences */
	711
	712	{ "4.2.1", /* U-0000007F = c1 bf = */ "\301\277", URC },
	713	{ "4.2.2", /* U-000007FF = e0 9f bf = */ "\340\237\277", URC },
	714	{ "4.2.3", /* U-0000FFFF = f0 8f bf bf = */ "\360\217\277\277",
	715	URC },
	716	{ "4.2.4", /* U-001FFFFF = f8 87 bf bf bf = */ "\370\207\277\277\277",
	717	URC },
	718	{ "4.2.5", /* U-03FFFFFF = fc 83 bf bf bf bf = */ URC },
	719
	720	/* 4.3 Overlong representation of the NUL character */
	721
	722	{ "4.3.1", /* U+0000 = c0 80 = */ "\300\200", URC },
	723	{ "4.3.2", /* U+0000 = e0 80 80 = */ "\340\200\200", URC },
	724	{ "4.3.3", /* U+0000 = f0 80 80 80 = */ "\360\200\200\200", URC },
	725	{ "4.3.4", /* U+0000 = f8 80 80 80 80 = */ "\370\200\200\200\200",
	726	URC },
	727	{ "4.3.5", /* U+0000 = fc 80 80 80 80 80 = */ "\374\200\200\200\200\200",
	728	URC },
	729
	730
	731	/* 5 Illegal code positions */
	732
	733	/* 5.1 Single UTF-16 surrogates */
	734
	735	{ "5.1.1", /* U+D800 = ed a0 80 = */ "\355\240\200", URC },
	736	{ "5.1.2", /* U+DB7F = ed ad bf = */ "\355\255\277", URC },
	737	{ "5.1.3", /* U+DB80 = ed ae 80 = */ "\355\256\200", URC },
	738	{ "5.1.4", /* U+DBFF = ed af bf = */ "\355\257\277", URC },
	739	{ "5.1.5", /* U+DC00 = ed b0 80 = */ "\355\260\200", URC },
	740	{ "5.1.6", /* U+DF80 = ed be 80 = */ "\355\276\200", URC },
	741	{ "5.1.7", /* U+DFFF = ed bf bf = */ "\355\277\277", URC },
	742
	743	/* 5.2 Paired UTF-16 surrogates */
	744
	745	{ "5.2.1", /* U+D800 U+DC00 = ed a0 80 ed b0 80 = */ URC URC },
	746	{ "5.2.2", /* U+D800 U+DFFF = ed a0 80 ed bf bf = */ URC URC },
	747	{ "5.2.3", /* U+DB7F U+DC00 = ed ad bf ed b0 80 = */ URC URC },
	748	{ "5.2.4", /* U+DB7F U+DFFF = ed ad bf ed bf bf = */ URC URC },
	749	{ "5.2.5", /* U+DB80 U+DC00 = ed ae 80 ed b0 80 = */ URC URC },
	750	{ "5.2.6", /* U+DB80 U+DFFF = ed ae 80 ed bf bf = */ URC URC },
	751	{ "5.2.7", /* U+DBFF U+DC00 = ed af bf ed b0 80 = */ URC URC },
	752	{ "5.2.8", /* U+DBFF U+DFFF = ed af bf ed bf bf = */ URC URC },
	753
	754	/* 5.3 Other illegal code positions */
	755
	756	{ "5.3.1", /* U+FFFE = ef bf be = */ "\357\277\276" },
	757	{ "5.3.2", /* U+FFFF = ef bf bf = */ "\357\277\277" },
	758
	759
	760	/* 6 Some other junk */
	761
	762	{ "6.0", "" },
	763	{ "6.1", "\001\002\003\004\005 ABC" },
	764	{ "6.2", /* every non-ASCII Latin1 character */
	765	"\302\241\302\242\302\243\302\244\302\245\302\246\302\247\302\250"
	766	"\302\251\302\252\302\253\302\254\302\255\302\256\302\257\302\260"
	767	"\302\261\302\262\302\263\302\264\302\265\302\266\302\267\302\270"
	768	"\302\271\302\272\302\273\302\274\302\275\302\276\302\277\303\200"
	769	"\303\201\303\202\303\203\303\204\303\205\303\206\303\207\303\210"
	770	"\303\211\303\212\303\213\303\214\303\215\303\216\303\217\303\220"
	771	"\303\221\303\222\303\223\303\224\303\225\303\226\303\227\303\230"
	772	"\303\231\303\232\303\233\303\234\303\235\303\236\303\237\303\240"
	773	"\303\241\303\242\303\243\303\244\303\245\303\246\303\247\303\250"
	774	"\303\251\303\252\303\253\303\254\303\255\303\256\303\257\303\260"
	775	"\303\261\303\262\303\263\303\264\303\265\303\266\303\267\303\270"
	776	"\303\271\303\272\303\273\303\274\303\275\303\276\303\277" },
	777
	778	{ "6.3", /* Christmas tree */
	779	"\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020"
	780	"\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037\040"
	781	"\041\042\043\044\045\046\047\050\051\052\053\054\055\056\057\060"
	782	"\061\062\063\064\065\066\067\070\071\072\073\074\075\076\077\100"
	783	"\101\102\103\104\105\106\107\110\111\112\113\114\115\116\117\120"
	784	"\121\122\123\124\125\126\127\130\131\132\133\134\135\136\137\140"
	785	"\141\142\143\144\145\146\147\150\151\152\153\154\155\156\157\160"
	786	"\161\162\163\164\165\166\167\170\171\172\173\174\175\176\177\200"
	787	"\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220"
	788	"\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240"
	789	"\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260"
	790	"\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300"
	791	"\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320"
	792	"\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340"
	793	"\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360"
	794	"\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377",
	795
	796	"\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020"
	797	"\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037"
	798	" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	799	"[\\]^_`abcdefghijklmnopqrstuvwxyz{\|}~\177"
	800	URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
	801	URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
	802	URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
	803	URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
	804	URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
	805	URC URC URC URC URC URC URC URC URC URC URC URC },
	806	};
	807
	808	int i;
	809	int ok = 1;
	810	for (i = 0; i < sizeof(tests)/sizeof(*tests); i++)
	811	{
	812	const char *name = tests[i].name;
	813	const char *in = tests[i].in;
	814	const char *target = (tests[i].target ? tests[i].target : in);
	815	const char *target2 = (tests[i].target2 ? tests[i].target2 : target);
	816	char *out = split_and_join (in);
	817	XChar2b *out16 = utf8_to_XChar2b (in, 0);
	818	char *out2 = XChar2b_to_utf8 (out16, 0);
	819	if (strcmp (out, target))
	820	{
	821	LOG (stderr, name, target);
	822	LOG (stderr, "FAIL", out);
	823	fprintf (stderr, "\n");
	824	ok = 0;
	825	}
	826	if (strcmp (out2, target2))
	827	{
	828	LOG (stderr, name, target2);
	829	LOG (stderr, "FAIL2", out2);
	830	fprintf (stderr, "\n");
	831	ok = 0;
	832	}
	833	free (out);
	834	free (out2);
	835	free (out16);
	836	}
	837
	838	/* Check conversion from UTF8 to Latin1 and ASCII. */
	839	{
	840	const char *utf8 = ("son \303\256le int\303\251rieure, \303\240 "
	841	"c\303\264t\303\251 de l'alc\303\264ve "
	842	"ovo\303\257de, o\303\271 les b\303\273ches "
	843	"se consument dans l'\303\242tre");
	844	const char *latin1 = ("son \356le int\351rieure, \340 "
	845	"c\364t\351 de l'alc\364ve ovo\357de, "
	846	"o\371 les b\373ches se consument dans "
	847	"l'\342tre");
	848	const char *ascii = ("son ile interieure, a cote de l'alcove "
	849	"ovoide, ou les buches se consument dans "
	850	"l'atre");
	851	char *latin1b = utf8_to_latin1 (utf8, False);
	852	char *ascii2 = utf8_to_latin1 (utf8, True);
	853	if (strcmp (latin1, latin1b))
	854	{
	855	LOG (stderr, "LATIN1", utf8);
	856	LOG (stderr, "FAIL3", latin1b);
	857	fprintf (stderr, "\n");
	858	ok = 0;
	859	}
	860	if (strcmp (ascii, ascii2))
	861	{
	862	LOG (stderr, "ASCII", utf8);
	863	LOG (stderr, "FAIL4", ascii2);
	864	fprintf (stderr, "\n");
	865	ok = 0;
	866	}
	867	free (latin1b);
	868	free (ascii2);
	869	}
	870
	871	/* Check de-composition of emoji that should all be treated as a unit
	872	for measurement and display purposes. */
	873	{
	874	static const char * const tests[] = {
	875
	876	/* 0: "Man" */
	877	" \360\237\221\250 ",
	878
	879	/* 1: "Blackula" = "Vampire, dark skin tone" = 1F9DB 1F3FF */
	880	" \360\237\247\233\360\237\217\277 ",
	881
	882	/* 2: "Black male teacher" = "Man, dark skin tone, ZWJ, school" =
	883	1F468 1F3FF 200D 1F3EB
	884	*/
	885	" \360\237\221\250\360\237\217\277\342\200\215\360\237\217\253 ",
	886
	887	/* 3: "Female runner" = "Runner, ZWJ, female sign" = 1F3C3 200D 2640 */
	888	" \360\237\217\203\342\200\215\342\231\200 ",
	889
	890	/* 4: "Woman astronaut" = "Woman, ZWJ, rocket ship" = 1F3C3 200D 1F680 */
	891	" \360\237\217\203\342\200\215\360\237\232\200 ",
	892
	893	/* 5:
	894	Group of people displayed as a single glyph:
	895	Woman, dark skin tone, ZWJ, 1F469 1F3FF 200D
	896	Man, light skin tone, ZWJ, 1F468 1F3FB 200D
	897	Boy, medium skin tone, ZWJ, 1F466 1F3FD 200D
	898	Girl, dark skin tone. 1F467 1F3FF
	899	*/
	900	" \360\237\221\251\360\237\217\277\342\200\215"
	901	"\360\237\221\250\360\237\217\273\342\200\215"
	902	"\360\237\221\246\360\237\217\275\342\200\215"
	903	"\360\237\221\247\360\237\217\277 ",
	904	};
	905	int i;
	906	for (i = 0; i < sizeof(tests)/sizeof(*tests); i++)
	907	{
	908	int L = 0;
	909	char **out = utf8_split (tests[i], &L);
	910	char name[100];
	911	int j;
	912	sprintf (name, "SPLIT %d: %d glyphs", i, L-2);
	913	if (L != 3)
	914	{
	915	LOG (stderr, name, tests[i]);
	916	ok = 0;
	917	}
	918	for (j = 0; j < L; j++)
	919	free (out[j]);
	920	free (out);
	921	}
	922	}
	923
	924	if (ok) fprintf (stderr, "OK\n");
	925	return (ok == 0);
	926	}
	927
	928	#endif /* SELFTEST */