Added missing newline in NEDsim error message.
[screensavers] / screenhack / utf8wc.c
CommitLineData
3144ee8a
AT
/* xscreensaver, Copyright (c) 2014-2016 Jamie Zawinski <jwz@jwz.org>
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation.  No representations are made about the suitability of this
 * software for any purpose.  It is provided "as is" without express or
 * implied warranty.
 */
11
12#ifdef HAVE_CONFIG_H
13# include "config.h"
14#endif
15
16#include <stdlib.h>
17#include <stdio.h>
18#include <string.h>
19
20#ifdef HAVE_JWXYZ
21# include "jwxyz.h"
22#else /* !HAVE_JWXYZ */
23# include <X11/Xlib.h>
24#endif
25
26#include "utf8wc.h"
27
28
/* "Unicode Replacement Character", displayed in lieu of invalid characters. */
# define INVALID 0xFFFD


/* Force a code point into the set of values this file is willing to emit.

   The value is first masked to 31 bits.  The result is then replaced with
   INVALID if it is:
     - zero (so that no NUL ever ends up inside a string);
     - above 0x10FFFF, the last code point reachable with 4 UTF-8 bytes;
     - in 0xD800 - 0xDFFF, which is reserved for UTF-16 surrogates.

   Returns the masked value, or INVALID.
 */
static unsigned long
uc_truncate (unsigned long uc)
{
  uc &= 0x7FFFFFFFL;			/* Unicode is 31 bits */

  if (uc == 0 ||			/* no nulls */
      uc > 0x10FFFF ||			/* but UTF-8 is 4 bytes */
      (uc >= 0xD800 && uc <= 0xDFFF))	/* UTF-16 surrogate: not a character */
    return INVALID;

  return uc;
}
48
49
50/* Parse the first UTF8 character at the front of the string.
51 Return the Unicode character, and the number of bytes read.
52 */
53long
54utf8_decode (const unsigned char *in, long length, unsigned long *unicode_ret)
55{
56 const unsigned char *start = in;
57 const unsigned char *end = in + length;
58 unsigned long uc = INVALID;
59 unsigned long min = 0;
60 unsigned char c;
61
62 if (length <= 0) goto DONE;
63
64 c = *in++;
65
66# define PREMATURE_EOF { in = end; goto DONE; }
67
68 if ((c & 0xC0) == 0x80) { /* 10xxxxxx - lonely continuation byte */
69 uc = INVALID;
70
71 } else if ((c & 0x80) == 0) { /* 0xxxxxxx - 7 bits in 1 byte */
72 uc = (c & 0x7F); /* 01111111 */
73
74 } else if ((c & 0xE0) == 0xC0) { /* 110xxxxx - 11 bits in 2 bytes */
75 if (in+1 > end) PREMATURE_EOF;
76 min = 1 << 7;
77 uc = (((c & 0x1F) << 6) | /* 00011111------ */
78 (in[0] & 0x3F)); /* 00111111 */
79 in += 1;
80
81 } else if ((c & 0xF0) == 0xE0) { /* 1110xxxx - 16 bits in 3 bytes */
82 if (in+2 > end) PREMATURE_EOF;
83 min = 1 << 11;
84 uc = (((c & 0x0F) << 12) | /* 00001111----+------- */
85 ((in[0] & 0x3F) << 6) | /* 00111111------ */
86 ((in[1] & 0x3F))); /* 00111111 */
87 in += 2;
88
89 } else if ((c & 0xF8) == 0xF0) { /* 11110xxx - 21 bits in 4 bytes */
90 if (in+3 > end) PREMATURE_EOF;
91 min = 1 << 16;
92 uc = (((c & 0x07) << 18) | /* 00000111--+-------+------- */
93 ((in[0] & 0x3F) << 12) | /* 01111111----+------- */
94 ((in[1] & 0x3F) << 6) | /* 00111111------ */
95 ((in[2] & 0x3F))); /* 00111111 */
96 in += 3;
97
98 } else if ((c & 0xFC) == 0xF8) { /* 111110xx - 26 bits in 5 bytes */
99 if (in+4 > end) PREMATURE_EOF;
100 min = 1 << 21;
101 uc = (((c & 0x03) << 24) | /* 00000011--------+-------+------- */
102 ((in[0] & 0x3F) << 18) | /* 00111111--+-------+------- */
103 ((in[1] & 0x3F) << 12) | /* 00111111----+------- */
104 ((in[2] & 0x3F) << 6) | /* 00111111------ */
105 ((in[3] & 0x3F))); /* 00111111 */
106 in += 4;
107
108 } else if ((c & 0xFE) == 0xFC) { /* 1111110x - 31 bits in 6 bytes */
109 if (in+5 > end) PREMATURE_EOF;
110 min = 1 << 26;
111 uc = (((c & 0x01) << 30) | /* 00000001------+-------+-------+------- */
112 ((in[0] & 0x3F) << 24) | /* 00111111+-------+-------+------- */
113 ((in[1] & 0x3F) << 18) | /* 00111111--+-------+------- */
114 ((in[2] & 0x3F) << 12) | /* 00111111----+------- */
115 ((in[3] & 0x3F) << 6) | /* 00111111------ */
116 ((in[4] & 0x3F))); /* 00111111 */
117 in += 5;
118 } else {
119 uc = INVALID; /* Unparsable sequence. */
120 }
121
122 DONE:
123
124 length = in - start;
125
126 /* If any of the continuation bytes didn't begin with the continuation tag,
127 the sequence is invalid; stop at the bad byte, not consuming later ones.
128 (It's easier to check this after the fact than up above.) */
129 {
130 int i;
131 for (i = 1; i < length; i++)
132 if ((start[i] & 0xC0) != 0x80) {
133 uc = INVALID;
134 length = i+1;
135 break;
136 }
137 }
138
139 if (uc < min)
140 /* A multi-byte sequence encoded a character that could have been
141 encoded with a shorter sequence, e.g., hiding ASCII inside a
142 multi-byte sequence. Something hinky's going on. Reject it. */
143 uc = INVALID;
144
145 uc = uc_truncate (uc);
146
147 if (unicode_ret)
148 *unicode_ret = uc;
149
150 return length;
151}
152
153
/* Converts a Unicode character to a multi-byte UTF8 sequence.

   uc      The character.  It is passed through uc_truncate() first, so
           zero, surrogates and anything above 0x10FFFF are written as
           INVALID (0xFFFD).
   out     Where to write the bytes.  No null terminator is written.
   length  Number of bytes available at `out'.

   Returns the number of bytes written: 1 to 4, or 0 if `length' is too
   small to hold the whole sequence (in which case nothing is written).

   Since uc_truncate() never returns more than 0x10FFFF, the 5- and 6-byte
   forms of UTF-8 can never be needed here.
 */
int
utf8_encode (unsigned long uc, char *out, long length)
{
  int need;

  uc = uc_truncate (uc);

  if      (uc < 0x80)     need = 1;	/*  7 bits in 1 byte  */
  else if (uc < 0x800)    need = 2;	/* 11 bits in 2 bytes */
  else if (uc < 0x10000L) need = 3;	/* 16 bits in 3 bytes */
  else                    need = 4;	/* 21 bits in 4 bytes */

  if (length < need)
    return 0;

  switch (need) {
  case 1:
    out[0] = (char) uc;				/* 0xxxxxxx */
    break;
  case 2:
    out[0] = (char) (0xC0 | ((uc >>  6) & 0x1F));	/* 110xxxxx */
    out[1] = (char) (0x80 | (uc & 0x3F));		/* 10xxxxxx */
    break;
  case 3:
    out[0] = (char) (0xE0 | ((uc >> 12) & 0x0F));	/* 1110xxxx */
    out[1] = (char) (0x80 | ((uc >>  6) & 0x3F));	/* 10xxxxxx */
    out[2] = (char) (0x80 | (uc & 0x3F));		/* 10xxxxxx */
    break;
  default:
    out[0] = (char) (0xF0 | ((uc >> 18) & 0x07));	/* 11110xxx */
    out[1] = (char) (0x80 | ((uc >> 12) & 0x3F));	/* 10xxxxxx */
    out[2] = (char) (0x80 | ((uc >>  6) & 0x3F));	/* 10xxxxxx */
    out[3] = (char) (0x80 | (uc & 0x3F));		/* 10xxxxxx */
    break;
  }

  return need;
}
206
207
208/* Converts a null-terminated UTF8 string to a null-terminated XChar2b array.
209 This only handles characters that can be represented in 16 bits, the
210 Basic Multilingual Plane. (No hieroglyphics, Elvish, Klingon or Emoji.)
211 */
212XChar2b *
213utf8_to_XChar2b (const char *string, int *length_ret)
214{
215 long in_len = strlen(string);
216 const unsigned char *in = (const unsigned char *) string;
217 const unsigned char *in_end = in + in_len;
218 XChar2b *c2b = (XChar2b *) malloc ((in_len + 1) * sizeof(*c2b));
219 XChar2b *out = c2b;
220 if (! out) return 0;
221
222 while (in < in_end)
223 {
224 unsigned long uc = 0;
225 long L = utf8_decode (in, in_end - in, &uc);
226 in += L;
227
228 /* If it can't be represented in a 16-bit XChar2b,
229 use "Unicode Replacement Character". */
230 if (uc > 0xFFFF) uc = INVALID;
231
232 out->byte1 = (uc >> 8) & 0xFF;
233 out->byte2 = uc & 0xFF;
234 out++;
235 }
236
237 out->byte1 = 0;
238 out->byte2 = 0;
239
240 if (length_ret)
241 *length_ret = (int) (out - c2b);
242
243 /* shrink */
244 c2b = (XChar2b *) realloc (c2b, (out - c2b + 1) * sizeof(*c2b));
245
246 return c2b;
247}
248
249
/* Split a UTF8 string into an array of strings, one per character.
   The sub-strings will be null terminated and may be multiple bytes.

   Combining marks, emoji skin tone modifiers and Zero Width Joiner
   sequences are appended to the preceding character, so that each
   sub-string is something meant to be displayed as a single glyph.

   string      The UTF8 text.  Must not be NULL.
   length_ret  If non-NULL, receives the number of sub-strings.

   Returns a malloc'd, null-terminated array of malloc'd strings; the
   caller must free each string and then the array.  Returns 0 if any
   allocation fails, in which case nothing is leaked and `length_ret'
   is left untouched.
 */
char **
utf8_split (const char *string, int *length_ret)
{
  const unsigned char *in = (const unsigned char *) string;
  long len = strlen (string);
  const unsigned char *end = in + len;
  char **ret = (char **) malloc ((len+1) * sizeof(*ret));
  char **shrunk;
  int i = 0;
  int zwjp = 0;		/* Previous character was a Zero Width Joiner */
  if (!ret) return 0;

  while (in < end)
    {
      unsigned long uc = 0;
      /* Only offer the decoder the bytes that are actually left, so that
         a truncated sequence at the end of the string cannot make it
         read beyond the null terminator. */
      long len2 = utf8_decode (in, end - in, &uc);
      char *s = (char *) malloc (len2 + 1);
      if (!s) goto FAIL;
      memcpy (s, in, len2);
      s[len2] = 0;
      ret[i++] = s;
      in += len2;

      /* If this is a Combining Diacritical, append it to the previous
         character. E.g., "y\314\206\314\206" is one string, not three.

         If this is ZWJ, Zero Width Joiner, then we append both this character
         and the following character, e.g. "X ZWJ Y" is one string not three.

         #### Open question: should this also include every character in
         the "Symbol, Modifier" category, or does ZWJ get used for those?
         https://www.fileformat.info/info/unicode/category/Sk/list.htm
         Note that the skin tones (1F3FB-1F3FF) are not part of any of the
         combining diacritical blocks, so they are listed separately.
       */
      if (i > 1 &&
          ((uc >=   0x300 && uc <=   0x36F) || /* Combining Diacritical */
           (uc >=  0x1AB0 && uc <=  0x1AFF) || /* Combining Diacritical Ext. */
           (uc >=  0x1DC0 && uc <=  0x1DFF) || /* Combining Diacritical Supp. */
           (uc >=  0x20D0 && uc <=  0x20FF) || /* Combining Diacritical Sym. */
           (uc >=  0xFE20 && uc <=  0xFE2F) || /* Combining Half Marks */
           (uc >= 0x1F3FB && uc <= 0x1F3FF) || /* Emoji skin tone modifiers */
           zwjp || uc == 0x200D))              /* Zero Width Joiner */
        {
          long L1 = strlen (ret[i-2]);
          long L2 = len2;
          char *s2 = (char *) malloc (L1 + L2 + 1);
          if (!s2) goto FAIL;
          memcpy (s2,      ret[i-2], L1);
          memcpy (s2 + L1, ret[i-1], L2);
          s2[L1 + L2] = 0;
          free (ret[i-2]);
          free (ret[i-1]);	/* Its bytes now live in s2. */
          ret[i-2] = s2;
          i--;
          zwjp = (uc == 0x200D);  /* Swallow the next character as well */
        }
    }
  ret[i] = 0;

  if (length_ret)
    *length_ret = i;

  /* Shrink.  If that fails the original block is still valid, so keep it. */
  shrunk = (char **) realloc (ret, (i+1) * sizeof(*ret));
  return (shrunk ? shrunk : ret);

 FAIL:
  while (i > 0)
    free (ret[--i]);
  free (ret);
  return 0;
}
321
322
323/* Converts a null-terminated XChar2b array to a null-terminated UTF8 string.
324 */
325char *
326XChar2b_to_utf8 (const XChar2b *in, int *length_ret)
327{
328 int in_len = 0;
329 const XChar2b *in_end;
330 int out_len;
331 char *utf8, *out;
332 const char *out_end;
333
334 /* Find the null termination on the XChar2b. */
335 for (in_end = in; in_end->byte1 || in_end->byte2; in_end++, in_len++)
336 ;
337
338 out_len = (in_len + 1) * 3; /* 16 bit chars = 3 bytes max */
339 utf8 = out = (char *) malloc (out_len + 1);
340 if (! out) return 0;
341 out_end = out + out_len;
342
343 while (in < in_end)
344 {
345 unsigned long uc = (in->byte1 << 8) | in->byte2;
346 int wrote = utf8_encode (uc, out, out_end - out);
347 if (wrote > 3) abort(); /* Can't happen with 16 bit input */
348 out += wrote;
349 in++;
350 }
351 *out = 0;
352
353 out_len = (int) (out - utf8 + 1);
354
355 if (length_ret)
356 *length_ret = out_len;
357
358 /* shrink */
359 utf8 = (char *) realloc (utf8, out_len);
360
361 return utf8;
362}
363
364
365/* Converts a UTF8 string to the closest Latin1 or ASCII equivalent.
366 */
367char *
368utf8_to_latin1 (const char *string, Bool ascii_p)
369{
370 long in_len = strlen(string);
371 const unsigned char *in = (const unsigned char *) string;
372 const unsigned char *in_end = in + in_len;
373 unsigned char *ret = (unsigned char *) malloc (in_len + 1);
374 unsigned char *out = ret;
375
376 if (! ret) return 0;
377
378 while (in < in_end)
379 {
380 unsigned long uc = 0;
381 long len2 = utf8_decode (in, in_end - in, &uc);
382 in += len2;
383
384 if (uc == '\240') /* &nbsp; */
385 uc = ' ';
386 else if (uc >= 0x300 && uc <= 0x36F)
387 uc = 0; /* Discard "Combining Diacritical Marks" */
388 else if (uc >= 0x1AB0 && uc <= 0x1AFF)
389 uc = 0; /* Discard "Combining Diacritical Marks Extended" */
390 else if (uc >= 0x1DC0 && uc <= 0x1DFF)
391 uc = 0; /* Discard "Combining Diacritical Marks Supplement" */
392 else if (uc >= 0x20D0 && uc <= 0x20FF)
393 uc = 0; /* Discard "Combining Diacritical Marks for Symbols" */
394 else if (uc >= 0xFE20 && uc <= 0xFE2F)
395 uc = 0; /* Discard "Combining Half Marks" */
396
397 else if (uc > 0xFF)
398 switch (uc) {
399
400 /* Map "Unicode General Punctuation Block" to Latin1 equivalents. */
401
402 case 0x2000: /* EN QUAD */
403 case 0x2001: /* EM QUAD */
404 case 0x2002: /* EN SPACE */
405 case 0x2003: /* EM SPACE */
406 case 0x2004: /* THREE-PER-EM SPACE */
407 case 0x2005: /* FOUR-PER-EM SPACE */
408 case 0x2006: /* SIX-PER-EM SPACE */
409 case 0x2007: /* FIGURE SPACE */
410 case 0x2008: /* PUNCTUATION SPACE */
411 case 0x2009: /* THIN SPACE */
412 case 0x200A: /* HAIR SPACE */
413 uc = ' ';
414 break;
415
416 case 0x2010: /* HYPHEN */
417 case 0x2011: /* NON-BREAKING HYPHEN */
418 case 0x2012: /* FIGURE DASH */
419 case 0x2013: /* EN DASH */
420 case 0x2014: /* EM DASH */
421 case 0x2015: /* HORIZONTAL BAR */
422 uc = '-';
423 break;
424
425 case 0x2018: /* LEFT SINGLE QUOTATION MARK */
426 case 0x2019: /* SINGLE LOW-9 QUOTATION MARK */
427 case 0x201A: /* SINGLE LOW-9 QUOTATION MARK */
428 case 0x201B: /* SINGLE HIGH-REVERSED-9 QUOTATION MARK */
429 uc = '\'';
430 break;
431
432 case 0x201C: /* LEFT DOUBLE QUOTATION MARK */
433 case 0x201D: /* RIGHT DOUBLE QUOTATION MARK */
434 case 0x201E: /* DOUBLE LOW-9 QUOTATION MARK */
435 case 0x201F: /* DOUBLE HIGH-REVERSED-9 QUOTATION MARK */
436 uc = '"';
437 break;
438
439 case 0x2022: uc = '\267'; break; /* BULLET */
440 case 0x2023: uc = '\273'; break; /* TRIANGULAR BULLET */
441 case 0x2027: uc = '\267'; break; /* HYPHENATION POINT */
442 case 0x202F: uc = ' '; break; /* NARROW NO-BREAK SPACE */
443 case 0x2038: uc = '^'; break; /* CARET */
444 case 0x2039: uc = '\253'; break; /* SINGLE LEFT ANGLE QUOTATION MARK */
445 case 0x203A: uc = '\273'; break; /* SINGLE RIGHT ANGLE QUOTATION MARK*/
446 case 0x2041: uc = '^'; break; /* CARET INSERTION POINT */
447 case 0x2042: uc = '*'; break; /* ASTERISM */
448 case 0x2043: uc = '='; break; /* HYPHEN BULLET */
449 case 0x2044: uc = '/'; break; /* FRACTION SLASH */
450 case 0x204B: uc = '\266'; break; /* REVERSED PILCROW SIGN */
451 case 0x204C: uc = '\267'; break; /* BLACK LEFTWARDS BULLET */
452 case 0x204D: uc = '\267'; break; /* BLACK RIGHTWARDS BULLET */
453 case 0x204E: uc = '*'; break; /* LOW ASTERISK */
454 case 0x204F: uc = ';'; break; /* REVERSED SEMICOLON */
455 default:
456 break;
457 }
458
459 if (uc > 0xFF)
460 /* "Inverted question mark" looks enough like 0xFFFD,
461 the "Unicode Replacement Character". */
462 uc = (ascii_p ? '#' : '\277');
463
464 if (ascii_p) /* Map Latin1 to the closest ASCII versions. */
465 {
466 const unsigned char latin1_to_ascii[96] =
467 " !C##Y|S_C#<=-R_##23'uP.,1o>###?"
468 "AAAAAAECEEEEIIIIDNOOOOOx0UUUUYpS"
469 "aaaaaaeceeeeiiiionooooo/ouuuuypy";
470 if (uc >= 0xA0)
471 uc = latin1_to_ascii[uc - 0xA0];
472 }
473
474 if (uc > 0)
475 *out++ = (unsigned char) uc;
476 }
477 *out = 0;
478
479 /* shrink */
480 ret = (unsigned char *) realloc (ret, (out - ret + 1) * sizeof(*ret));
481
482 return (char *) ret;
483}
484
485
/*************************************************************************

 cd ../hacks ; make test-utf8wc

 *************************************************************************/
491
492#ifdef SELFTEST
493
/* Convert a UTF8 string to Unicode and back again.

   Decodes `string' into an array of code points with utf8_decode(), then
   re-encodes that array with utf8_encode().  Invalid input therefore
   comes back as the UTF8 encoding of INVALID.

   Returns a malloc'd, null-terminated string that the caller must free.
   This is only test scaffolding, so running out of memory just exits.
 */
static char *
split_and_join (const char *string)
{
  const unsigned char *in = (const unsigned char *) string;
  int len = strlen (string);
  const unsigned char *end = in + len;
  unsigned long *unicode = (unsigned long *)
    malloc ((len + 1) * sizeof(*unicode));
  int count = 0;
  int i;
  char *ret, *out, *out_end;

  if (!unicode)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }

  while (in < end)
    {
      /* Pass the number of bytes remaining, not the length of the whole
         string, so the decoder can't run past the null terminator. */
      in += utf8_decode (in, end - in, &unicode[count]);
      count++;
    }
  unicode[count] = 0;	/* utf8_decode never yields 0, so this terminates. */

  i = count*6 + 1;	/* Room for the longest sequences, plus the null. */
  out = ret = (char *) malloc (i);
  if (!ret)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  out_end = out + i;

  for (i = 0; unicode[i]; i++)
    out += utf8_encode (unicode[i], out, out_end - out);
  *out = 0;
  free (unicode);

  return ret;
}
530
531
/* Print one line of the form
       PREFIX: "STRING"
   to `out', with the prefix right-aligned in a field of 6 columns.
   Inside the quotes, double-quote and backslash are backslash-escaped,
   and any byte below 32 or above 126 is written as a 3 digit octal escape,
   so that the output can be pasted back into C source.
 */
static void
LOG (FILE *out, const char *prefix, const char *s)
{
  const unsigned char *p = (const unsigned char *) s;

  fprintf (out, "%6s: \"", prefix);
  for (; *p; p++)
    {
      if (*p == '"' || *p == '\\')
        fprintf (out, "\\%c", *p);
      else if (*p < 32 || *p >= 127)
        fprintf (out, "\\%03o", *p);
      else
        fprintf (out, "%c", *p);
    }
  fprintf (out, "\"\n");
}
546
547
548int
549main (int argc, char **argv)
550{
551 /* Adapted from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
552 */
553
554# define URC "\357\277\275" /* 0xFFFD, "Unicode Replacement Character" */
555
556 static const struct { const char *name, *in, *target, *target2; } tests[] = {
557 /* 1 Some correct UTF-8 text */
558
559 /* The Greek word 'kosme': */
560 { "1", "\316\272\341\275\271\317\203\316\274\316\265" },
561
562
563 /* 2 Boundary condition test cases */
564
565 /* 2.1 First possible sequence of a certain length */
566
567 { "2.1.1", /* 1 byte (U-00000000): */ "\000" },
568 { "2.1.2", /* 2 bytes (U-00000080): */ "\302\200" },
569 { "2.1.3", /* 3 bytes (U-00000800): */ "\340\240\200" },
570 { "2.1.4", /* 4 bytes (U-00010000): */ "\360\220\200\200", 0, URC },
571 { "2.1.5", /* 5 bytes (U-00200000): */ "\370\210\200\200\200", URC },
572 { "2.1.6", /* 6 bytes (U-04000000): */ "\374\204\200\200\200\200", URC },
573
574 /* 2.2 Last possible sequence of a certain length */
575
576 { "2.2.1", /* 1 byte (U-0000007F): */ "\177" },
577 { "2.2.2", /* 2 bytes (U-000007FF): */ "\337\277" },
578 { "2.2.3", /* 3 bytes (U-0000FFFF): */ "\357\277\277" },
579 { "2.2.4", /* 4 bytes (U-001FFFFF): */ "\367\277\277\277", URC },
580 { "2.2.5", /* 5 bytes (U-03FFFFFF): */ "\373\277\277\277\277", URC },
581 { "2.2.6", /* 6 bytes (U-7FFFFFFF): */ "\375\277\277\277\277\277", URC },
582
583 /* 2.3 Other boundary conditions */
584
585 { "2.3.1", /* U-0000D7FF = ed 9f bf = */ "\355\237\277" },
586 { "2.3.2", /* U-0000E000 = ee 80 80 = */ "\356\200\200" },
587 { "2.3.3", /* U-0000FFFD = ef bf bd = */ URC },
588 { "2.3.4", /* U-0010FFFF = f4 8f bf bf = */ "\364\217\277\277", 0, URC },
589 { "2.3.5", /* U-00110000 = f4 90 80 80 = */ "\364\220\200\200", URC },
590
591
592 /* 3 Malformed sequences */
593
594 /* 3.1 Unexpected continuation bytes */
595
596 /* Each unexpected continuation byte should be separately signalled as a
597 malformed sequence of its own. */
598
599 { "3.1.1", /* First continuation byte 0x80: */ "\200", URC },
600 { "3.1.2", /* Last continuation byte 0xbf: */ "\277", URC },
601 { "3.1.3", /* 2 continuation bytes: */ "\200\277", URC URC },
602 { "3.1.4", /* 3 continuation bytes: */ "\200\277\200", URC URC URC },
603 { "3.1.5", /* 4 continuation bytes: */ "\200\277\200\277",
604 URC URC URC URC },
605 { "3.1.6", /* 5 continuation bytes: */ "\200\277\200\277\200",
606 URC URC URC URC URC },
607 { "3.1.7", /* 6 continuation bytes: */ "\200\277\200\277\200\277",
608 URC URC URC URC URC URC },
609 { "3.1.8", /* 7 continuation bytes: */ "\200\277\200\277\200\277\200",
610 URC URC URC URC URC URC URC },
611
612 { "3.1.9", /* Sequence of all 64 possible continuation bytes (0x80-0xbf):*/
613
614 "\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217"
615 "\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237"
616 "\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257"
617 "\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277",
618 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
619 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
620 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
621 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
622
623 /* 3.2 Lonely start characters */
624
625 { "3.2.1", /* All 32 first bytes of 2-byte sequences (0xc0-0xdf),
626 each followed by a space character: */
627
628 "\300 \301 \302 \303 \304 \305 \306 \307 \310 \311 \312 \313 \314 "
629 "\315 \316 \317 \320 \321 \322 \323 \324 \325 \326 \327 \330 \331 "
630 "\332 \333 \334 \335 \336 \337 ",
631 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
632 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
633
634 { "3.2.2", /* All 16 first bytes of 3-byte sequences (0xe0-0xef),
635 each followed by a space character: */
636 "\340 \341 \342 \343 \344 \345 \346 \347 "
637 "\350 \351 \352 \353 \354 \355 \356 \357 ",
638 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
639
640 { "3.2.3", /* All 8 first bytes of 4-byte sequences (0xf0-0xf7),
641 each followed by a space character: */
642 URC URC URC URC URC URC URC URC },
643
644 { "3.2.4", /* All 4 first bytes of 5-byte sequences (0xf8-0xfb),
645 each followed by a space character: */
646 "\370 \371 \372 \373 ",
647 URC URC URC URC },
648
649 { "3.2.5", /* All 2 first bytes of 6-byte sequences (0xfc-0xfd),
650 each followed by a space character: */
651 "\374 \375 ", URC URC },
652
653 /* 3.3 Sequences with last continuation byte missing */
654
655 /* All bytes of an incomplete sequence should be signalled as a single
656 malformed sequence, i.e., you should see only a single replacement
657 character in each of the next 10 tests. (Characters as in section 2) */
658
659 { "3.3.1", /* 2-byte sequence with last byte missing (U+0000): */
660 "\300", URC },
661 { "3.3.2", /* 3-byte sequence with last byte missing (U+0000): */
662 "\340\200", URC },
663 { "3.3.3", /* 4-byte sequence with last byte missing (U+0000): */
664 "\360\200\200", URC },
665 { "3.3.4", /* 5-byte sequence with last byte missing (U+0000): */
666 "\370\200\200\200", URC },
667 { "3.3.5", /* 6-byte sequence with last byte missing (U+0000): */
668 "\374\200\200\200\200", URC },
669 { "3.3.6", /* 2-byte sequence with last byte missing (U-000007FF): */
670 "\337", URC },
671 { "3.3.7", /* 3-byte sequence with last byte missing (U-0000FFFF): */
672 "\357\277", URC },
673 { "3.3.8", /* 4-byte sequence with last byte missing (U-001FFFFF): */
674 "\367\277\277", URC },
675 { "3.3.9", /* 5-byte sequence with last byte missing (U-03FFFFFF): */
676 "\373\277\277\277", URC },
677 { "3.3.10", /* 6-byte sequence with last byte missing (U-7FFFFFFF): */
678 "\375\277\277\277\277", URC },
679
680 /* 3.4 Concatenation of incomplete sequences */
681
682 /* All the 10 sequences of 3.3 concatenated, you should see 10 malformed
683 sequences being signalled: */
684
685 { "3.4", "\300\340\200\360\200\200\370\200\200\200\374\200\200\200\200"
686 "\337\357\277\367\277\277\373\277\277\277\375\277\277\277\277",
687 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC },
688
689 /* 3.5 Impossible bytes */
690
691 /* The following two bytes cannot appear in a correct UTF-8 string */
692
693 { "3.5.1", /* fe = */ "\376", URC },
694 { "3.5.2", /* ff = */ "\377", URC },
695 { "3.5.3", /* fe fe ff ff = */ "\376\376\377\377", URC URC URC URC },
696
697
698 /* 4 Overlong sequences */
699
700 /* 4.1 Examples of an overlong ASCII character */
701
702 { "4.1.1", /* U+002F = c0 af = */ "\300\257", URC },
703 { "4.1.2", /* U+002F = e0 80 af = */ "\340\200\257", URC },
704 { "4.1.3", /* U+002F = f0 80 80 af = */ "\360\200\200\257", URC },
705 { "4.1.4", /* U+002F = f8 80 80 80 af = */ "\370\200\200\200\257",
706 URC },
707 { "4.1.5", /* U+002F = fc 80 80 80 80 af = */ "\374\200\200\200\200\257",
708 URC },
709
710 /* 4.2 Maximum overlong sequences */
711
712 { "4.2.1", /* U-0000007F = c1 bf = */ "\301\277", URC },
713 { "4.2.2", /* U-000007FF = e0 9f bf = */ "\340\237\277", URC },
714 { "4.2.3", /* U-0000FFFF = f0 8f bf bf = */ "\360\217\277\277",
715 URC },
716 { "4.2.4", /* U-001FFFFF = f8 87 bf bf bf = */ "\370\207\277\277\277",
717 URC },
718 { "4.2.5", /* U-03FFFFFF = fc 83 bf bf bf bf = */ URC },
719
720 /* 4.3 Overlong representation of the NUL character */
721
722 { "4.3.1", /* U+0000 = c0 80 = */ "\300\200", URC },
723 { "4.3.2", /* U+0000 = e0 80 80 = */ "\340\200\200", URC },
724 { "4.3.3", /* U+0000 = f0 80 80 80 = */ "\360\200\200\200", URC },
725 { "4.3.4", /* U+0000 = f8 80 80 80 80 = */ "\370\200\200\200\200",
726 URC },
727 { "4.3.5", /* U+0000 = fc 80 80 80 80 80 = */ "\374\200\200\200\200\200",
728 URC },
729
730
731 /* 5 Illegal code positions */
732
733 /* 5.1 Single UTF-16 surrogates */
734
735 { "5.1.1", /* U+D800 = ed a0 80 = */ "\355\240\200", URC },
736 { "5.1.2", /* U+DB7F = ed ad bf = */ "\355\255\277", URC },
737 { "5.1.3", /* U+DB80 = ed ae 80 = */ "\355\256\200", URC },
738 { "5.1.4", /* U+DBFF = ed af bf = */ "\355\257\277", URC },
739 { "5.1.5", /* U+DC00 = ed b0 80 = */ "\355\260\200", URC },
740 { "5.1.6", /* U+DF80 = ed be 80 = */ "\355\276\200", URC },
741 { "5.1.7", /* U+DFFF = ed bf bf = */ "\355\277\277", URC },
742
743 /* 5.2 Paired UTF-16 surrogates */
744
745 { "5.2.1", /* U+D800 U+DC00 = ed a0 80 ed b0 80 = */ URC URC },
746 { "5.2.2", /* U+D800 U+DFFF = ed a0 80 ed bf bf = */ URC URC },
747 { "5.2.3", /* U+DB7F U+DC00 = ed ad bf ed b0 80 = */ URC URC },
748 { "5.2.4", /* U+DB7F U+DFFF = ed ad bf ed bf bf = */ URC URC },
749 { "5.2.5", /* U+DB80 U+DC00 = ed ae 80 ed b0 80 = */ URC URC },
750 { "5.2.6", /* U+DB80 U+DFFF = ed ae 80 ed bf bf = */ URC URC },
751 { "5.2.7", /* U+DBFF U+DC00 = ed af bf ed b0 80 = */ URC URC },
752 { "5.2.8", /* U+DBFF U+DFFF = ed af bf ed bf bf = */ URC URC },
753
754 /* 5.3 Other illegal code positions */
755
756 { "5.3.1", /* U+FFFE = ef bf be = */ "\357\277\276" },
757 { "5.3.2", /* U+FFFF = ef bf bf = */ "\357\277\277" },
758
759
760 /* 6 Some other junk */
761
762 { "6.0", "" },
763 { "6.1", "\001\002\003\004\005 ABC" },
764 { "6.2", /* every non-ASCII Latin1 character */
765 "\302\241\302\242\302\243\302\244\302\245\302\246\302\247\302\250"
766 "\302\251\302\252\302\253\302\254\302\255\302\256\302\257\302\260"
767 "\302\261\302\262\302\263\302\264\302\265\302\266\302\267\302\270"
768 "\302\271\302\272\302\273\302\274\302\275\302\276\302\277\303\200"
769 "\303\201\303\202\303\203\303\204\303\205\303\206\303\207\303\210"
770 "\303\211\303\212\303\213\303\214\303\215\303\216\303\217\303\220"
771 "\303\221\303\222\303\223\303\224\303\225\303\226\303\227\303\230"
772 "\303\231\303\232\303\233\303\234\303\235\303\236\303\237\303\240"
773 "\303\241\303\242\303\243\303\244\303\245\303\246\303\247\303\250"
774 "\303\251\303\252\303\253\303\254\303\255\303\256\303\257\303\260"
775 "\303\261\303\262\303\263\303\264\303\265\303\266\303\267\303\270"
776 "\303\271\303\272\303\273\303\274\303\275\303\276\303\277" },
777
778 { "6.3", /* Christmas tree */
779 "\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020"
780 "\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037\040"
781 "\041\042\043\044\045\046\047\050\051\052\053\054\055\056\057\060"
782 "\061\062\063\064\065\066\067\070\071\072\073\074\075\076\077\100"
783 "\101\102\103\104\105\106\107\110\111\112\113\114\115\116\117\120"
784 "\121\122\123\124\125\126\127\130\131\132\133\134\135\136\137\140"
785 "\141\142\143\144\145\146\147\150\151\152\153\154\155\156\157\160"
786 "\161\162\163\164\165\166\167\170\171\172\173\174\175\176\177\200"
787 "\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220"
788 "\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240"
789 "\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260"
790 "\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300"
791 "\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320"
792 "\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340"
793 "\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360"
794 "\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377",
795
796 "\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020"
797 "\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037"
798 " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
799 "[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\177"
800 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
801 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
802 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
803 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
804 URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC
805 URC URC URC URC URC URC URC URC URC URC URC URC },
806 };
807
808 int i;
809 int ok = 1;
810 for (i = 0; i < sizeof(tests)/sizeof(*tests); i++)
811 {
812 const char *name = tests[i].name;
813 const char *in = tests[i].in;
814 const char *target = (tests[i].target ? tests[i].target : in);
815 const char *target2 = (tests[i].target2 ? tests[i].target2 : target);
816 char *out = split_and_join (in);
817 XChar2b *out16 = utf8_to_XChar2b (in, 0);
818 char *out2 = XChar2b_to_utf8 (out16, 0);
819 if (strcmp (out, target))
820 {
821 LOG (stderr, name, target);
822 LOG (stderr, "FAIL", out);
823 fprintf (stderr, "\n");
824 ok = 0;
825 }
826 if (strcmp (out2, target2))
827 {
828 LOG (stderr, name, target2);
829 LOG (stderr, "FAIL2", out2);
830 fprintf (stderr, "\n");
831 ok = 0;
832 }
833 free (out);
834 free (out2);
835 free (out16);
836 }
837
838 /* Check conversion from UTF8 to Latin1 and ASCII. */
839 {
840 const char *utf8 = ("son \303\256le int\303\251rieure, \303\240 "
841 "c\303\264t\303\251 de l'alc\303\264ve "
842 "ovo\303\257de, o\303\271 les b\303\273ches "
843 "se consument dans l'\303\242tre");
844 const char *latin1 = ("son \356le int\351rieure, \340 "
845 "c\364t\351 de l'alc\364ve ovo\357de, "
846 "o\371 les b\373ches se consument dans "
847 "l'\342tre");
848 const char *ascii = ("son ile interieure, a cote de l'alcove "
849 "ovoide, ou les buches se consument dans "
850 "l'atre");
851 char *latin1b = utf8_to_latin1 (utf8, False);
852 char *ascii2 = utf8_to_latin1 (utf8, True);
853 if (strcmp (latin1, latin1b))
854 {
855 LOG (stderr, "LATIN1", utf8);
856 LOG (stderr, "FAIL3", latin1b);
857 fprintf (stderr, "\n");
858 ok = 0;
859 }
860 if (strcmp (ascii, ascii2))
861 {
862 LOG (stderr, "ASCII", utf8);
863 LOG (stderr, "FAIL4", ascii2);
864 fprintf (stderr, "\n");
865 ok = 0;
866 }
867 free (latin1b);
868 free (ascii2);
869 }
870
871 /* Check de-composition of emoji that should all be treated as a unit
872 for measurement and display purposes. */
873 {
874 static const char * const tests[] = {
875
876 /* 0: "Man" */
877 " \360\237\221\250 ",
878
879 /* 1: "Blackula" = "Vampire, dark skin tone" = 1F9DB 1F3FF */
880 " \360\237\247\233\360\237\217\277 ",
881
882 /* 2: "Black male teacher" = "Man, dark skin tone, ZWJ, school" =
883 1F468 1F3FF 200D 1F3EB
884 */
885 " \360\237\221\250\360\237\217\277\342\200\215\360\237\217\253 ",
886
887 /* 3: "Female runner" = "Runner, ZWJ, female sign" = 1F3C3 200D 2640 */
888 " \360\237\217\203\342\200\215\342\231\200 ",
889
890 /* 4: "Woman astronaut" = "Woman, ZWJ, rocket ship" = 1F3C3 200D 1F680 */
891 " \360\237\217\203\342\200\215\360\237\232\200 ",
892
893 /* 5:
894 Group of people displayed as a single glyph:
895 Woman, dark skin tone, ZWJ, 1F469 1F3FF 200D
896 Man, light skin tone, ZWJ, 1F468 1F3FB 200D
897 Boy, medium skin tone, ZWJ, 1F466 1F3FD 200D
898 Girl, dark skin tone. 1F467 1F3FF
899 */
900 " \360\237\221\251\360\237\217\277\342\200\215"
901 "\360\237\221\250\360\237\217\273\342\200\215"
902 "\360\237\221\246\360\237\217\275\342\200\215"
903 "\360\237\221\247\360\237\217\277 ",
904 };
905 int i;
906 for (i = 0; i < sizeof(tests)/sizeof(*tests); i++)
907 {
908 int L = 0;
909 char **out = utf8_split (tests[i], &L);
910 char name[100];
911 int j;
912 sprintf (name, "SPLIT %d: %d glyphs", i, L-2);
913 if (L != 3)
914 {
915 LOG (stderr, name, tests[i]);
916 ok = 0;
917 }
918 for (j = 0; j < L; j++)
919 free (out[j]);
920 free (out);
921 }
922 }
923
924 if (ok) fprintf (stderr, "OK\n");
925 return (ok == 0);
926}
927
928#endif /* SELFTEST */