| 1 | /* xscreensaver, Copyright (c) 2014-2016 Jamie Zawinski <jwz@jwz.org> |
| 2 | * |
| 3 | * Permission to use, copy, modify, distribute, and sell this software and its |
| 4 | * documentation for any purpose is hereby granted without fee, provided that |
| 5 | * the above copyright notice appear in all copies and that both that |
| 6 | * copyright notice and this permission notice appear in supporting |
| 7 | * documentation. No representations are made about the suitability of this |
| 8 | * software for any purpose. It is provided "as is" without express or |
| 9 | * implied warranty. |
| 10 | */ |
| 11 | |
| 12 | #ifdef HAVE_CONFIG_H |
| 13 | # include "config.h" |
| 14 | #endif |
| 15 | |
| 16 | #include <stdlib.h> |
| 17 | #include <stdio.h> |
| 18 | #include <string.h> |
| 19 | |
| 20 | #ifdef HAVE_JWXYZ |
| 21 | # include "jwxyz.h" |
| 22 | #else /* !HAVE_JWXYZ */ |
| 23 | # include <X11/Xlib.h> |
| 24 | #endif |
| 25 | |
| 26 | #include "utf8wc.h" |
| 27 | |
| 28 | |
/* U+FFFD, the "Unicode Replacement Character": shown in place of anything
   that is not a valid character. */
#define INVALID 0xFFFD


/* Clamp a value to a code point that is legal to emit.

   Only the low 31 bits are looked at.  NUL, anything above U+10FFFF, and
   the UTF-16 surrogate range U+D800..U+DFFF all collapse to INVALID;
   every other value is returned unchanged.
 */
static unsigned long
uc_truncate (unsigned long uc)
{
  const unsigned long code = uc & 0x7FFFFFFFL;

  /* No nulls, and nothing beyond the top of the Unicode range. */
  if (code == 0 || code > 0x10FFFF)
    return INVALID;

  /* Surrogates are reserved for UTF-16: not real characters. */
  if (code >= 0xD800 && code <= 0xDFFF)
    return INVALID;

  return code;
}
| 48 | |
| 49 | |
| 50 | /* Parse the first UTF8 character at the front of the string. |
| 51 | Return the Unicode character, and the number of bytes read. |
| 52 | */ |
| 53 | long |
| 54 | utf8_decode (const unsigned char *in, long length, unsigned long *unicode_ret) |
| 55 | { |
| 56 | const unsigned char *start = in; |
| 57 | const unsigned char *end = in + length; |
| 58 | unsigned long uc = INVALID; |
| 59 | unsigned long min = 0; |
| 60 | unsigned char c; |
| 61 | |
| 62 | if (length <= 0) goto DONE; |
| 63 | |
| 64 | c = *in++; |
| 65 | |
| 66 | # define PREMATURE_EOF { in = end; goto DONE; } |
| 67 | |
| 68 | if ((c & 0xC0) == 0x80) { /* 10xxxxxx - lonely continuation byte */ |
| 69 | uc = INVALID; |
| 70 | |
| 71 | } else if ((c & 0x80) == 0) { /* 0xxxxxxx - 7 bits in 1 byte */ |
| 72 | uc = (c & 0x7F); /* 01111111 */ |
| 73 | |
| 74 | } else if ((c & 0xE0) == 0xC0) { /* 110xxxxx - 11 bits in 2 bytes */ |
| 75 | if (in+1 > end) PREMATURE_EOF; |
| 76 | min = 1 << 7; |
| 77 | uc = (((c & 0x1F) << 6) | /* 00011111------ */ |
| 78 | (in[0] & 0x3F)); /* 00111111 */ |
| 79 | in += 1; |
| 80 | |
| 81 | } else if ((c & 0xF0) == 0xE0) { /* 1110xxxx - 16 bits in 3 bytes */ |
| 82 | if (in+2 > end) PREMATURE_EOF; |
| 83 | min = 1 << 11; |
| 84 | uc = (((c & 0x0F) << 12) | /* 00001111----+------- */ |
| 85 | ((in[0] & 0x3F) << 6) | /* 00111111------ */ |
| 86 | ((in[1] & 0x3F))); /* 00111111 */ |
| 87 | in += 2; |
| 88 | |
| 89 | } else if ((c & 0xF8) == 0xF0) { /* 11110xxx - 21 bits in 4 bytes */ |
| 90 | if (in+3 > end) PREMATURE_EOF; |
| 91 | min = 1 << 16; |
| 92 | uc = (((c & 0x07) << 18) | /* 00000111--+-------+------- */ |
| 93 | ((in[0] & 0x3F) << 12) | /* 01111111----+------- */ |
| 94 | ((in[1] & 0x3F) << 6) | /* 00111111------ */ |
| 95 | ((in[2] & 0x3F))); /* 00111111 */ |
| 96 | in += 3; |
| 97 | |
| 98 | } else if ((c & 0xFC) == 0xF8) { /* 111110xx - 26 bits in 5 bytes */ |
| 99 | if (in+4 > end) PREMATURE_EOF; |
| 100 | min = 1 << 21; |
| 101 | uc = (((c & 0x03) << 24) | /* 00000011--------+-------+------- */ |
| 102 | ((in[0] & 0x3F) << 18) | /* 00111111--+-------+------- */ |
| 103 | ((in[1] & 0x3F) << 12) | /* 00111111----+------- */ |
| 104 | ((in[2] & 0x3F) << 6) | /* 00111111------ */ |
| 105 | ((in[3] & 0x3F))); /* 00111111 */ |
| 106 | in += 4; |
| 107 | |
| 108 | } else if ((c & 0xFE) == 0xFC) { /* 1111110x - 31 bits in 6 bytes */ |
| 109 | if (in+5 > end) PREMATURE_EOF; |
| 110 | min = 1 << 26; |
| 111 | uc = (((c & 0x01) << 30) | /* 00000001------+-------+-------+------- */ |
| 112 | ((in[0] & 0x3F) << 24) | /* 00111111+-------+-------+------- */ |
| 113 | ((in[1] & 0x3F) << 18) | /* 00111111--+-------+------- */ |
| 114 | ((in[2] & 0x3F) << 12) | /* 00111111----+------- */ |
| 115 | ((in[3] & 0x3F) << 6) | /* 00111111------ */ |
| 116 | ((in[4] & 0x3F))); /* 00111111 */ |
| 117 | in += 5; |
| 118 | } else { |
| 119 | uc = INVALID; /* Unparsable sequence. */ |
| 120 | } |
| 121 | |
| 122 | DONE: |
| 123 | |
| 124 | length = in - start; |
| 125 | |
| 126 | /* If any of the continuation bytes didn't begin with the continuation tag, |
| 127 | the sequence is invalid; stop at the bad byte, not consuming later ones. |
| 128 | (It's easier to check this after the fact than up above.) */ |
| 129 | { |
| 130 | int i; |
| 131 | for (i = 1; i < length; i++) |
| 132 | if ((start[i] & 0xC0) != 0x80) { |
| 133 | uc = INVALID; |
| 134 | length = i+1; |
| 135 | break; |
| 136 | } |
| 137 | } |
| 138 | |
| 139 | if (uc < min) |
| 140 | /* A multi-byte sequence encoded a character that could have been |
| 141 | encoded with a shorter sequence, e.g., hiding ASCII inside a |
| 142 | multi-byte sequence. Something hinky's going on. Reject it. */ |
| 143 | uc = INVALID; |
| 144 | |
| 145 | uc = uc_truncate (uc); |
| 146 | |
| 147 | if (unicode_ret) |
| 148 | *unicode_ret = uc; |
| 149 | |
| 150 | return length; |
| 151 | } |
| 152 | |
| 153 | |
/* Encode one Unicode character as a multi-byte UTF8 sequence.

   UC is first passed through uc_truncate().  OUT receives the bytes (no
   null terminator is added) and LENGTH is the space available there.
   Returns the number of bytes written, which is 0 when the sequence does
   not fit in LENGTH.
 */
int
utf8_encode (unsigned long uc, char *out, long length)
{
  unsigned char tag;    /* the size-marker bits of the lead byte */
  unsigned char keep;   /* the payload bits that fit in the lead byte */
  int nbytes;
  int k;

  uc = uc_truncate (uc);

  if      (uc < 0x80)       { nbytes = 1; tag = 0x00; keep = 0x7F; }
  else if (uc < 0x800)      { nbytes = 2; tag = 0xC0; keep = 0x1F; }
  else if (uc < 0x10000L)   { nbytes = 3; tag = 0xE0; keep = 0x0F; }
  else if (uc < 0x200000L)  { nbytes = 4; tag = 0xF0; keep = 0x07; }
  else if (uc < 0x4000000L) { nbytes = 5; tag = 0xF8; keep = 0x03; }
  else                      { nbytes = 6; tag = 0xFC; keep = 0x01; }

  if (length < nbytes)
    return 0;

  /* Lead byte: marker bits plus the topmost payload bits. */
  out[0] = (char) (tag | ((uc >> (6 * (nbytes - 1))) & keep));

  /* Continuation bytes: 10xxxxxx, six payload bits each, high to low. */
  for (k = 1; k < nbytes; k++)
    out[k] = (char) (0x80 | ((uc >> (6 * (nbytes - 1 - k))) & 0x3F));

  return nbytes;
}
| 206 | |
| 207 | |
| 208 | /* Converts a null-terminated UTF8 string to a null-terminated XChar2b array. |
| 209 | This only handles characters that can be represented in 16 bits, the |
| 210 | Basic Multilingual Plane. (No hieroglyphics, Elvish, Klingon or Emoji.) |
| 211 | */ |
| 212 | XChar2b * |
| 213 | utf8_to_XChar2b (const char *string, int *length_ret) |
| 214 | { |
| 215 | long in_len = strlen(string); |
| 216 | const unsigned char *in = (const unsigned char *) string; |
| 217 | const unsigned char *in_end = in + in_len; |
| 218 | XChar2b *c2b = (XChar2b *) malloc ((in_len + 1) * sizeof(*c2b)); |
| 219 | XChar2b *out = c2b; |
| 220 | if (! out) return 0; |
| 221 | |
| 222 | while (in < in_end) |
| 223 | { |
| 224 | unsigned long uc = 0; |
| 225 | long L = utf8_decode (in, in_end - in, &uc); |
| 226 | in += L; |
| 227 | |
| 228 | /* If it can't be represented in a 16-bit XChar2b, |
| 229 | use "Unicode Replacement Character". */ |
| 230 | if (uc > 0xFFFF) uc = INVALID; |
| 231 | |
| 232 | out->byte1 = (uc >> 8) & 0xFF; |
| 233 | out->byte2 = uc & 0xFF; |
| 234 | out++; |
| 235 | } |
| 236 | |
| 237 | out->byte1 = 0; |
| 238 | out->byte2 = 0; |
| 239 | |
| 240 | if (length_ret) |
| 241 | *length_ret = (int) (out - c2b); |
| 242 | |
| 243 | /* shrink */ |
| 244 | c2b = (XChar2b *) realloc (c2b, (out - c2b + 1) * sizeof(*c2b)); |
| 245 | |
| 246 | return c2b; |
| 247 | } |
| 248 | |
| 249 | |
/* Split a UTF8 string into an array of strings, one per character.
   The sub-strings will be null terminated and may be multiple bytes.

   Returns a malloc'd, null-terminated array of malloc'd strings, or 0 if
   any allocation fails (in which case nothing is leaked).  If LENGTH_RET
   is non-null, it receives the number of strings, not counting the
   terminating null entry.
 */
char **
utf8_split (const char *string, int *length_ret)
{
  const unsigned char *in = (const unsigned char *) string;
  long len = strlen (string);
  const unsigned char *end = in + len;
  /* Every piece consumes at least one byte, so LEN entries plus the
     terminator is always enough. */
  char **ret = (char **) malloc ((len+1) * sizeof(*ret));
  char **shrunk;
  int i = 0;
  int zwjp = 0;
  if (!ret) return 0;

  while (in < end)
    {
      unsigned long uc;
      /* Bound the decoder by what is left of the string, not by the
         length of the whole string, or it reads past the terminator
         when the string ends in a truncated sequence. */
      long len2 = utf8_decode (in, end - in, &uc);
      char *piece = (char *) malloc (len2 + 1);
      if (!piece) goto FAIL;
      memcpy (piece, (const char *) in, len2);
      piece[len2] = 0;
      ret[i++] = piece;
      in += len2;

      /* If this is a Combining Diacritical, append it to the previous
         character.  E.g., "y\314\206\314\206" is one string, not three.

         If this is ZWJ, Zero Width Joiner, then we append both this character
         and the following character, e.g. "X ZWJ Y" is one string not three.

         #### Hmmm, should this also include every character in the
         "Symbol, Modifier" category, or does ZWJ get used for those?
         https://www.fileformat.info/info/unicode/category/Sk/list.htm

         Is it intended that "Latin small letter C, 0063" + "Cedilla, 00B8"
         should be a single glyph?  Or is that what "Combining Cedilla, 0327"
         is for?  I'm confused by the fact that the skin tones (1F3FB-1F3FF)
         do not seem to be in a readily-identifiable block the way the various
         combining diacriticals are.
       */
      if (i > 1 &&
          ((uc >=   0x300 && uc <=   0x36F) || /* Combining Diacritical */
           (uc >=  0x1AB0 && uc <=  0x1AFF) || /* Combining Diacritical Ext. */
           (uc >=  0x1DC0 && uc <=  0x1DFF) || /* Combining Diacritical Supp. */
           (uc >=  0x20D0 && uc <=  0x20FF) || /* Combining Diacritical Sym. */
           (uc >=  0xFE20 && uc <=  0xFE2F) || /* Combining Half Marks */
           (uc >= 0x1F3FB && uc <= 0x1F3FF) || /* Emoji skin tone modifiers */
           zwjp || uc == 0x200D))              /* Zero Width Joiner */
        {
          long L1 = strlen(ret[i-2]);
          long L2 = strlen(ret[i-1]);
          char *s2 = (char *) malloc (L1 + L2 + 1);
          if (!s2) goto FAIL;
          memcpy (s2,      ret[i-2], L1);
          memcpy (s2 + L1, ret[i-1], L2);
          s2[L1 + L2] = 0;
          free (ret[i-2]);
          free (ret[i-1]);        /* now part of s2; don't leak it */
          ret[i-2] = s2;
          i--;
          zwjp = (uc == 0x200D);  /* Swallow the next character as well */
        }
    }
  ret[i] = 0;

  if (length_ret)
    *length_ret = i;

  /* Shrink.  If realloc fails the original block is still valid, so keep
     it instead of leaking it and returning null. */
  shrunk = (char **) realloc (ret, (i+1) * sizeof(*ret));
  if (shrunk)
    ret = shrunk;

  return ret;

 FAIL:
  /* Out of memory part way through: release the pieces built so far. */
  while (i > 0)
    free (ret[--i]);
  free (ret);
  return 0;
}
| 321 | |
| 322 | |
| 323 | /* Converts a null-terminated XChar2b array to a null-terminated UTF8 string. |
| 324 | */ |
| 325 | char * |
| 326 | XChar2b_to_utf8 (const XChar2b *in, int *length_ret) |
| 327 | { |
| 328 | int in_len = 0; |
| 329 | const XChar2b *in_end; |
| 330 | int out_len; |
| 331 | char *utf8, *out; |
| 332 | const char *out_end; |
| 333 | |
| 334 | /* Find the null termination on the XChar2b. */ |
| 335 | for (in_end = in; in_end->byte1 || in_end->byte2; in_end++, in_len++) |
| 336 | ; |
| 337 | |
| 338 | out_len = (in_len + 1) * 3; /* 16 bit chars = 3 bytes max */ |
| 339 | utf8 = out = (char *) malloc (out_len + 1); |
| 340 | if (! out) return 0; |
| 341 | out_end = out + out_len; |
| 342 | |
| 343 | while (in < in_end) |
| 344 | { |
| 345 | unsigned long uc = (in->byte1 << 8) | in->byte2; |
| 346 | int wrote = utf8_encode (uc, out, out_end - out); |
| 347 | if (wrote > 3) abort(); /* Can't happen with 16 bit input */ |
| 348 | out += wrote; |
| 349 | in++; |
| 350 | } |
| 351 | *out = 0; |
| 352 | |
| 353 | out_len = (int) (out - utf8 + 1); |
| 354 | |
| 355 | if (length_ret) |
| 356 | *length_ret = out_len; |
| 357 | |
| 358 | /* shrink */ |
| 359 | utf8 = (char *) realloc (utf8, out_len); |
| 360 | |
| 361 | return utf8; |
| 362 | } |
| 363 | |
| 364 | |
| 365 | /* Converts a UTF8 string to the closest Latin1 or ASCII equivalent. |
| 366 | */ |
| 367 | char * |
| 368 | utf8_to_latin1 (const char *string, Bool ascii_p) |
| 369 | { |
| 370 | long in_len = strlen(string); |
| 371 | const unsigned char *in = (const unsigned char *) string; |
| 372 | const unsigned char *in_end = in + in_len; |
| 373 | unsigned char *ret = (unsigned char *) malloc (in_len + 1); |
| 374 | unsigned char *out = ret; |
| 375 | |
| 376 | if (! ret) return 0; |
| 377 | |
| 378 | while (in < in_end) |
| 379 | { |
| 380 | unsigned long uc = 0; |
| 381 | long len2 = utf8_decode (in, in_end - in, &uc); |
| 382 | in += len2; |
| 383 | |
| 384 | if (uc == '\240') /* */ |
| 385 | uc = ' '; |
| 386 | else if (uc >= 0x300 && uc <= 0x36F) |
| 387 | uc = 0; /* Discard "Combining Diacritical Marks" */ |
| 388 | else if (uc >= 0x1AB0 && uc <= 0x1AFF) |
| 389 | uc = 0; /* Discard "Combining Diacritical Marks Extended" */ |
| 390 | else if (uc >= 0x1DC0 && uc <= 0x1DFF) |
| 391 | uc = 0; /* Discard "Combining Diacritical Marks Supplement" */ |
| 392 | else if (uc >= 0x20D0 && uc <= 0x20FF) |
| 393 | uc = 0; /* Discard "Combining Diacritical Marks for Symbols" */ |
| 394 | else if (uc >= 0xFE20 && uc <= 0xFE2F) |
| 395 | uc = 0; /* Discard "Combining Half Marks" */ |
| 396 | |
| 397 | else if (uc > 0xFF) |
| 398 | switch (uc) { |
| 399 | |
| 400 | /* Map "Unicode General Punctuation Block" to Latin1 equivalents. */ |
| 401 | |
| 402 | case 0x2000: /* EN QUAD */ |
| 403 | case 0x2001: /* EM QUAD */ |
| 404 | case 0x2002: /* EN SPACE */ |
| 405 | case 0x2003: /* EM SPACE */ |
| 406 | case 0x2004: /* THREE-PER-EM SPACE */ |
| 407 | case 0x2005: /* FOUR-PER-EM SPACE */ |
| 408 | case 0x2006: /* SIX-PER-EM SPACE */ |
| 409 | case 0x2007: /* FIGURE SPACE */ |
| 410 | case 0x2008: /* PUNCTUATION SPACE */ |
| 411 | case 0x2009: /* THIN SPACE */ |
| 412 | case 0x200A: /* HAIR SPACE */ |
| 413 | uc = ' '; |
| 414 | break; |
| 415 | |
| 416 | case 0x2010: /* HYPHEN */ |
| 417 | case 0x2011: /* NON-BREAKING HYPHEN */ |
| 418 | case 0x2012: /* FIGURE DASH */ |
| 419 | case 0x2013: /* EN DASH */ |
| 420 | case 0x2014: /* EM DASH */ |
| 421 | case 0x2015: /* HORIZONTAL BAR */ |
| 422 | uc = '-'; |
| 423 | break; |
| 424 | |
| 425 | case 0x2018: /* LEFT SINGLE QUOTATION MARK */ |
| 426 | case 0x2019: /* SINGLE LOW-9 QUOTATION MARK */ |
| 427 | case 0x201A: /* SINGLE LOW-9 QUOTATION MARK */ |
| 428 | case 0x201B: /* SINGLE HIGH-REVERSED-9 QUOTATION MARK */ |
| 429 | uc = '\''; |
| 430 | break; |
| 431 | |
| 432 | case 0x201C: /* LEFT DOUBLE QUOTATION MARK */ |
| 433 | case 0x201D: /* RIGHT DOUBLE QUOTATION MARK */ |
| 434 | case 0x201E: /* DOUBLE LOW-9 QUOTATION MARK */ |
| 435 | case 0x201F: /* DOUBLE HIGH-REVERSED-9 QUOTATION MARK */ |
| 436 | uc = '"'; |
| 437 | break; |
| 438 | |
| 439 | case 0x2022: uc = '\267'; break; /* BULLET */ |
| 440 | case 0x2023: uc = '\273'; break; /* TRIANGULAR BULLET */ |
| 441 | case 0x2027: uc = '\267'; break; /* HYPHENATION POINT */ |
| 442 | case 0x202F: uc = ' '; break; /* NARROW NO-BREAK SPACE */ |
| 443 | case 0x2038: uc = '^'; break; /* CARET */ |
| 444 | case 0x2039: uc = '\253'; break; /* SINGLE LEFT ANGLE QUOTATION MARK */ |
| 445 | case 0x203A: uc = '\273'; break; /* SINGLE RIGHT ANGLE QUOTATION MARK*/ |
| 446 | case 0x2041: uc = '^'; break; /* CARET INSERTION POINT */ |
| 447 | case 0x2042: uc = '*'; break; /* ASTERISM */ |
| 448 | case 0x2043: uc = '='; break; /* HYPHEN BULLET */ |
| 449 | case 0x2044: uc = '/'; break; /* FRACTION SLASH */ |
| 450 | case 0x204B: uc = '\266'; break; /* REVERSED PILCROW SIGN */ |
| 451 | case 0x204C: uc = '\267'; break; /* BLACK LEFTWARDS BULLET */ |
| 452 | case 0x204D: uc = '\267'; break; /* BLACK RIGHTWARDS BULLET */ |
| 453 | case 0x204E: uc = '*'; break; /* LOW ASTERISK */ |
| 454 | case 0x204F: uc = ';'; break; /* REVERSED SEMICOLON */ |
| 455 | default: |
| 456 | break; |
| 457 | } |
| 458 | |
| 459 | if (uc > 0xFF) |
| 460 | /* "Inverted question mark" looks enough like 0xFFFD, |
| 461 | the "Unicode Replacement Character". */ |
| 462 | uc = (ascii_p ? '#' : '\277'); |
| 463 | |
| 464 | if (ascii_p) /* Map Latin1 to the closest ASCII versions. */ |
| 465 | { |
| 466 | const unsigned char latin1_to_ascii[96] = |
| 467 | " !C##Y|S_C#<=-R_##23'uP.,1o>###?" |
| 468 | "AAAAAAECEEEEIIIIDNOOOOOx0UUUUYpS" |
| 469 | "aaaaaaeceeeeiiiionooooo/ouuuuypy"; |
| 470 | if (uc >= 0xA0) |
| 471 | uc = latin1_to_ascii[uc - 0xA0]; |
| 472 | } |
| 473 | |
| 474 | if (uc > 0) |
| 475 | *out++ = (unsigned char) uc; |
| 476 | } |
| 477 | *out = 0; |
| 478 | |
| 479 | /* shrink */ |
| 480 | ret = (unsigned char *) realloc (ret, (out - ret + 1) * sizeof(*ret)); |
| 481 | |
| 482 | return (char *) ret; |
| 483 | } |
| 484 | |
| 485 | |
| 486 | /************************************************************************* |
| 487 | |
| 488 | cd ../hacks ; make test-utf8wc |
| 489 | |
| 490 | *************************************************************************/ |
| 491 | |
| 492 | #ifdef SELFTEST |
| 493 | |
/* Convert a UTF8 string to Unicode and back again.

   Decodes STRING into an array of characters with utf8_decode(), then
   re-encodes that array with utf8_encode().  Returns the result as a
   malloc'd, null-terminated string.  Aborts if memory runs out: this is
   only used by the self-test.
 */
static char *
split_and_join (const char *string)
{
  const unsigned char *in = (const unsigned char *) string;
  int len = strlen (string);
  const unsigned char *end = in + len;
  unsigned long *unicode = (unsigned long *)
    malloc((len + 1) * sizeof(*unicode));
  int i = 0;
  char *ret, *out, *out_end;

  if (!unicode) abort();

  while (in < end)
    {
      /* Bound the decoder by what is left of the string, not by the
         length of the whole string, or it reads past the terminator. */
      long len2 = utf8_decode (in, end - in, &unicode[i]);
      i++;
      in += len2;
    }
  unicode[i] = 0;   /* utf8_decode never yields 0, so 0 can terminate */

  i = i*6 + 1;      /* room for the longest sequence utf8_encode writes */
  out = ret = (char *) malloc(i);
  if (!ret) abort();
  out_end = out + i;
  i = 0;
  while (unicode[i])
    {
      int len2 = utf8_encode (unicode[i], out, out_end - out);
      out += len2;
      i++;
    }
  *out = 0;
  free (unicode);

  return ret;
}
| 530 | |
| 531 | |
/* Print one labelled line to OUT: PREFIX right-justified in six columns,
   then S inside double quotes.  Quotes and backslashes in S are
   backslash-escaped, and every byte outside printable ASCII is written
   as a three-digit octal escape.
 */
static void
LOG (FILE *out, const char *prefix, const char *s)
{
  const unsigned char *p;

  fprintf (out, "%6s: \"", prefix);

  for (p = (const unsigned char *) s; *p; p++)
    {
      if (*p == '"' || *p == '\\')
        fprintf (out, "\\%c", *p);
      else if (*p >= 32 && *p < 127)
        fputc (*p, out);
      else
        fprintf (out, "\\%03o", *p);
    }

  fprintf (out, "\"\n");
}
| 546 | |
| 547 | |
| 548 | int |
| 549 | main (int argc, char **argv) |
| 550 | { |
| 551 | /* Adapted from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt |
| 552 | */ |
| 553 | |
| 554 | # define URC "\357\277\275" /* 0xFFFD, "Unicode Replacement Character" */ |
| 555 | |
| 556 | static const struct { const char *name, *in, *target, *target2; } tests[] = { |
| 557 | /* 1 Some correct UTF-8 text */ |
| 558 | |
| 559 | /* The Greek word 'kosme': */ |
| 560 | { "1", "\316\272\341\275\271\317\203\316\274\316\265" }, |
| 561 | |
| 562 | |
| 563 | /* 2 Boundary condition test cases */ |
| 564 | |
| 565 | /* 2.1 First possible sequence of a certain length */ |
| 566 | |
| 567 | { "2.1.1", /* 1 byte (U-00000000): */ "\000" }, |
| 568 | { "2.1.2", /* 2 bytes (U-00000080): */ "\302\200" }, |
| 569 | { "2.1.3", /* 3 bytes (U-00000800): */ "\340\240\200" }, |
| 570 | { "2.1.4", /* 4 bytes (U-00010000): */ "\360\220\200\200", 0, URC }, |
| 571 | { "2.1.5", /* 5 bytes (U-00200000): */ "\370\210\200\200\200", URC }, |
| 572 | { "2.1.6", /* 6 bytes (U-04000000): */ "\374\204\200\200\200\200", URC }, |
| 573 | |
| 574 | /* 2.2 Last possible sequence of a certain length */ |
| 575 | |
| 576 | { "2.2.1", /* 1 byte (U-0000007F): */ "\177" }, |
| 577 | { "2.2.2", /* 2 bytes (U-000007FF): */ "\337\277" }, |
| 578 | { "2.2.3", /* 3 bytes (U-0000FFFF): */ "\357\277\277" }, |
| 579 | { "2.2.4", /* 4 bytes (U-001FFFFF): */ "\367\277\277\277", URC }, |
| 580 | { "2.2.5", /* 5 bytes (U-03FFFFFF): */ "\373\277\277\277\277", URC }, |
| 581 | { "2.2.6", /* 6 bytes (U-7FFFFFFF): */ "\375\277\277\277\277\277", URC }, |
| 582 | |
| 583 | /* 2.3 Other boundary conditions */ |
| 584 | |
| 585 | { "2.3.1", /* U-0000D7FF = ed 9f bf = */ "\355\237\277" }, |
| 586 | { "2.3.2", /* U-0000E000 = ee 80 80 = */ "\356\200\200" }, |
| 587 | { "2.3.3", /* U-0000FFFD = ef bf bd = */ URC }, |
| 588 | { "2.3.4", /* U-0010FFFF = f4 8f bf bf = */ "\364\217\277\277", 0, URC }, |
| 589 | { "2.3.5", /* U-00110000 = f4 90 80 80 = */ "\364\220\200\200", URC }, |
| 590 | |
| 591 | |
| 592 | /* 3 Malformed sequences */ |
| 593 | |
| 594 | /* 3.1 Unexpected continuation bytes */ |
| 595 | |
| 596 | /* Each unexpected continuation byte should be separately signalled as a |
| 597 | malformed sequence of its own. */ |
| 598 | |
| 599 | { "3.1.1", /* First continuation byte 0x80: */ "\200", URC }, |
| 600 | { "3.1.2", /* Last continuation byte 0xbf: */ "\277", URC }, |
| 601 | { "3.1.3", /* 2 continuation bytes: */ "\200\277", URC URC }, |
| 602 | { "3.1.4", /* 3 continuation bytes: */ "\200\277\200", URC URC URC }, |
| 603 | { "3.1.5", /* 4 continuation bytes: */ "\200\277\200\277", |
| 604 | URC URC URC URC }, |
| 605 | { "3.1.6", /* 5 continuation bytes: */ "\200\277\200\277\200", |
| 606 | URC URC URC URC URC }, |
| 607 | { "3.1.7", /* 6 continuation bytes: */ "\200\277\200\277\200\277", |
| 608 | URC URC URC URC URC URC }, |
| 609 | { "3.1.8", /* 7 continuation bytes: */ "\200\277\200\277\200\277\200", |
| 610 | URC URC URC URC URC URC URC }, |
| 611 | |
| 612 | { "3.1.9", /* Sequence of all 64 possible continuation bytes (0x80-0xbf):*/ |
| 613 | |
| 614 | "\200\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217" |
| 615 | "\220\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237" |
| 616 | "\240\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257" |
| 617 | "\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277", |
| 618 | URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC |
| 619 | URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC |
| 620 | URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC |
| 621 | URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC }, |
| 622 | |
| 623 | /* 3.2 Lonely start characters */ |
| 624 | |
| 625 | { "3.2.1", /* All 32 first bytes of 2-byte sequences (0xc0-0xdf), |
| 626 | each followed by a space character: */ |
| 627 | |
| 628 | "\300 \301 \302 \303 \304 \305 \306 \307 \310 \311 \312 \313 \314 " |
| 629 | "\315 \316 \317 \320 \321 \322 \323 \324 \325 \326 \327 \330 \331 " |
| 630 | "\332 \333 \334 \335 \336 \337 ", |
| 631 | URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC |
| 632 | URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC }, |
| 633 | |
| 634 | { "3.2.2", /* All 16 first bytes of 3-byte sequences (0xe0-0xef), |
| 635 | each followed by a space character: */ |
| 636 | "\340 \341 \342 \343 \344 \345 \346 \347 " |
| 637 | "\350 \351 \352 \353 \354 \355 \356 \357 ", |
| 638 | URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC }, |
| 639 | |
| 640 | { "3.2.3", /* All 8 first bytes of 4-byte sequences (0xf0-0xf7), |
| 641 | each followed by a space character: */ |
| 642 | URC URC URC URC URC URC URC URC }, |
| 643 | |
| 644 | { "3.2.4", /* All 4 first bytes of 5-byte sequences (0xf8-0xfb), |
| 645 | each followed by a space character: */ |
| 646 | "\370 \371 \372 \373 ", |
| 647 | URC URC URC URC }, |
| 648 | |
| 649 | { "3.2.5", /* All 2 first bytes of 6-byte sequences (0xfc-0xfd), |
| 650 | each followed by a space character: */ |
| 651 | "\374 \375 ", URC URC }, |
| 652 | |
| 653 | /* 3.3 Sequences with last continuation byte missing */ |
| 654 | |
| 655 | /* All bytes of an incomplete sequence should be signalled as a single |
| 656 | malformed sequence, i.e., you should see only a single replacement |
| 657 | character in each of the next 10 tests. (Characters as in section 2) */ |
| 658 | |
| 659 | { "3.3.1", /* 2-byte sequence with last byte missing (U+0000): */ |
| 660 | "\300", URC }, |
| 661 | { "3.3.2", /* 3-byte sequence with last byte missing (U+0000): */ |
| 662 | "\340\200", URC }, |
| 663 | { "3.3.3", /* 4-byte sequence with last byte missing (U+0000): */ |
| 664 | "\360\200\200", URC }, |
| 665 | { "3.3.4", /* 5-byte sequence with last byte missing (U+0000): */ |
| 666 | "\370\200\200\200", URC }, |
| 667 | { "3.3.5", /* 6-byte sequence with last byte missing (U+0000): */ |
| 668 | "\374\200\200\200\200", URC }, |
| 669 | { "3.3.6", /* 2-byte sequence with last byte missing (U-000007FF): */ |
| 670 | "\337", URC }, |
| 671 | { "3.3.7", /* 3-byte sequence with last byte missing (U-0000FFFF): */ |
| 672 | "\357\277", URC }, |
| 673 | { "3.3.8", /* 4-byte sequence with last byte missing (U-001FFFFF): */ |
| 674 | "\367\277\277", URC }, |
| 675 | { "3.3.9", /* 5-byte sequence with last byte missing (U-03FFFFFF): */ |
| 676 | "\373\277\277\277", URC }, |
| 677 | { "3.3.10", /* 6-byte sequence with last byte missing (U-7FFFFFFF): */ |
| 678 | "\375\277\277\277\277", URC }, |
| 679 | |
| 680 | /* 3.4 Concatenation of incomplete sequences */ |
| 681 | |
| 682 | /* All the 10 sequences of 3.3 concatenated, you should see 10 malformed |
| 683 | sequences being signalled: */ |
| 684 | |
| 685 | { "3.4", "\300\340\200\360\200\200\370\200\200\200\374\200\200\200\200" |
| 686 | "\337\357\277\367\277\277\373\277\277\277\375\277\277\277\277", |
| 687 | URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC }, |
| 688 | |
| 689 | /* 3.5 Impossible bytes */ |
| 690 | |
| 691 | /* The following two bytes cannot appear in a correct UTF-8 string */ |
| 692 | |
| 693 | { "3.5.1", /* fe = */ "\376", URC }, |
| 694 | { "3.5.2", /* ff = */ "\377", URC }, |
| 695 | { "3.5.3", /* fe fe ff ff = */ "\376\376\377\377", URC URC URC URC }, |
| 696 | |
| 697 | |
| 698 | /* 4 Overlong sequences */ |
| 699 | |
| 700 | /* 4.1 Examples of an overlong ASCII character */ |
| 701 | |
| 702 | { "4.1.1", /* U+002F = c0 af = */ "\300\257", URC }, |
| 703 | { "4.1.2", /* U+002F = e0 80 af = */ "\340\200\257", URC }, |
| 704 | { "4.1.3", /* U+002F = f0 80 80 af = */ "\360\200\200\257", URC }, |
| 705 | { "4.1.4", /* U+002F = f8 80 80 80 af = */ "\370\200\200\200\257", |
| 706 | URC }, |
| 707 | { "4.1.5", /* U+002F = fc 80 80 80 80 af = */ "\374\200\200\200\200\257", |
| 708 | URC }, |
| 709 | |
| 710 | /* 4.2 Maximum overlong sequences */ |
| 711 | |
| 712 | { "4.2.1", /* U-0000007F = c1 bf = */ "\301\277", URC }, |
| 713 | { "4.2.2", /* U-000007FF = e0 9f bf = */ "\340\237\277", URC }, |
| 714 | { "4.2.3", /* U-0000FFFF = f0 8f bf bf = */ "\360\217\277\277", |
| 715 | URC }, |
| 716 | { "4.2.4", /* U-001FFFFF = f8 87 bf bf bf = */ "\370\207\277\277\277", |
| 717 | URC }, |
| 718 | { "4.2.5", /* U-03FFFFFF = fc 83 bf bf bf bf = */ URC }, |
| 719 | |
| 720 | /* 4.3 Overlong representation of the NUL character */ |
| 721 | |
| 722 | { "4.3.1", /* U+0000 = c0 80 = */ "\300\200", URC }, |
| 723 | { "4.3.2", /* U+0000 = e0 80 80 = */ "\340\200\200", URC }, |
| 724 | { "4.3.3", /* U+0000 = f0 80 80 80 = */ "\360\200\200\200", URC }, |
| 725 | { "4.3.4", /* U+0000 = f8 80 80 80 80 = */ "\370\200\200\200\200", |
| 726 | URC }, |
| 727 | { "4.3.5", /* U+0000 = fc 80 80 80 80 80 = */ "\374\200\200\200\200\200", |
| 728 | URC }, |
| 729 | |
| 730 | |
| 731 | /* 5 Illegal code positions */ |
| 732 | |
| 733 | /* 5.1 Single UTF-16 surrogates */ |
| 734 | |
| 735 | { "5.1.1", /* U+D800 = ed a0 80 = */ "\355\240\200", URC }, |
| 736 | { "5.1.2", /* U+DB7F = ed ad bf = */ "\355\255\277", URC }, |
| 737 | { "5.1.3", /* U+DB80 = ed ae 80 = */ "\355\256\200", URC }, |
| 738 | { "5.1.4", /* U+DBFF = ed af bf = */ "\355\257\277", URC }, |
| 739 | { "5.1.5", /* U+DC00 = ed b0 80 = */ "\355\260\200", URC }, |
| 740 | { "5.1.6", /* U+DF80 = ed be 80 = */ "\355\276\200", URC }, |
| 741 | { "5.1.7", /* U+DFFF = ed bf bf = */ "\355\277\277", URC }, |
| 742 | |
| 743 | /* 5.2 Paired UTF-16 surrogates */ |
| 744 | |
| 745 | { "5.2.1", /* U+D800 U+DC00 = ed a0 80 ed b0 80 = */ URC URC }, |
| 746 | { "5.2.2", /* U+D800 U+DFFF = ed a0 80 ed bf bf = */ URC URC }, |
| 747 | { "5.2.3", /* U+DB7F U+DC00 = ed ad bf ed b0 80 = */ URC URC }, |
| 748 | { "5.2.4", /* U+DB7F U+DFFF = ed ad bf ed bf bf = */ URC URC }, |
| 749 | { "5.2.5", /* U+DB80 U+DC00 = ed ae 80 ed b0 80 = */ URC URC }, |
| 750 | { "5.2.6", /* U+DB80 U+DFFF = ed ae 80 ed bf bf = */ URC URC }, |
| 751 | { "5.2.7", /* U+DBFF U+DC00 = ed af bf ed b0 80 = */ URC URC }, |
| 752 | { "5.2.8", /* U+DBFF U+DFFF = ed af bf ed bf bf = */ URC URC }, |
| 753 | |
| 754 | /* 5.3 Other illegal code positions */ |
| 755 | |
| 756 | { "5.3.1", /* U+FFFE = ef bf be = */ "\357\277\276" }, |
| 757 | { "5.3.2", /* U+FFFF = ef bf bf = */ "\357\277\277" }, |
| 758 | |
| 759 | |
| 760 | /* 6 Some other junk */ |
| 761 | |
| 762 | { "6.0", "" }, |
| 763 | { "6.1", "\001\002\003\004\005 ABC" }, |
| 764 | { "6.2", /* every non-ASCII Latin1 character */ |
| 765 | "\302\241\302\242\302\243\302\244\302\245\302\246\302\247\302\250" |
| 766 | "\302\251\302\252\302\253\302\254\302\255\302\256\302\257\302\260" |
| 767 | "\302\261\302\262\302\263\302\264\302\265\302\266\302\267\302\270" |
| 768 | "\302\271\302\272\302\273\302\274\302\275\302\276\302\277\303\200" |
| 769 | "\303\201\303\202\303\203\303\204\303\205\303\206\303\207\303\210" |
| 770 | "\303\211\303\212\303\213\303\214\303\215\303\216\303\217\303\220" |
| 771 | "\303\221\303\222\303\223\303\224\303\225\303\226\303\227\303\230" |
| 772 | "\303\231\303\232\303\233\303\234\303\235\303\236\303\237\303\240" |
| 773 | "\303\241\303\242\303\243\303\244\303\245\303\246\303\247\303\250" |
| 774 | "\303\251\303\252\303\253\303\254\303\255\303\256\303\257\303\260" |
| 775 | "\303\261\303\262\303\263\303\264\303\265\303\266\303\267\303\270" |
| 776 | "\303\271\303\272\303\273\303\274\303\275\303\276\303\277" }, |
| 777 | |
| 778 | { "6.3", /* Christmas tree */ |
| 779 | "\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020" |
| 780 | "\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037\040" |
| 781 | "\041\042\043\044\045\046\047\050\051\052\053\054\055\056\057\060" |
| 782 | "\061\062\063\064\065\066\067\070\071\072\073\074\075\076\077\100" |
| 783 | "\101\102\103\104\105\106\107\110\111\112\113\114\115\116\117\120" |
| 784 | "\121\122\123\124\125\126\127\130\131\132\133\134\135\136\137\140" |
| 785 | "\141\142\143\144\145\146\147\150\151\152\153\154\155\156\157\160" |
| 786 | "\161\162\163\164\165\166\167\170\171\172\173\174\175\176\177\200" |
| 787 | "\201\202\203\204\205\206\207\210\211\212\213\214\215\216\217\220" |
| 788 | "\221\222\223\224\225\226\227\230\231\232\233\234\235\236\237\240" |
| 789 | "\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260" |
| 790 | "\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300" |
| 791 | "\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320" |
| 792 | "\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337\340" |
| 793 | "\341\342\343\344\345\346\347\350\351\352\353\354\355\356\357\360" |
| 794 | "\361\362\363\364\365\366\367\370\371\372\373\374\375\376\377", |
| 795 | |
| 796 | "\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020" |
| 797 | "\021\022\023\024\025\026\027\030\031\032\033\034\035\036\037" |
| 798 | " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ" |
| 799 | "[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\177" |
| 800 | URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC |
| 801 | URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC |
| 802 | URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC |
| 803 | URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC |
| 804 | URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC URC |
| 805 | URC URC URC URC URC URC URC URC URC URC URC URC }, |
| 806 | }; |
| 807 | |
| 808 | int i; |
| 809 | int ok = 1; |
| 810 | for (i = 0; i < sizeof(tests)/sizeof(*tests); i++) |
| 811 | { |
| 812 | const char *name = tests[i].name; |
| 813 | const char *in = tests[i].in; |
| 814 | const char *target = (tests[i].target ? tests[i].target : in); |
| 815 | const char *target2 = (tests[i].target2 ? tests[i].target2 : target); |
| 816 | char *out = split_and_join (in); |
| 817 | XChar2b *out16 = utf8_to_XChar2b (in, 0); |
| 818 | char *out2 = XChar2b_to_utf8 (out16, 0); |
| 819 | if (strcmp (out, target)) |
| 820 | { |
| 821 | LOG (stderr, name, target); |
| 822 | LOG (stderr, "FAIL", out); |
| 823 | fprintf (stderr, "\n"); |
| 824 | ok = 0; |
| 825 | } |
| 826 | if (strcmp (out2, target2)) |
| 827 | { |
| 828 | LOG (stderr, name, target2); |
| 829 | LOG (stderr, "FAIL2", out2); |
| 830 | fprintf (stderr, "\n"); |
| 831 | ok = 0; |
| 832 | } |
| 833 | free (out); |
| 834 | free (out2); |
| 835 | free (out16); |
| 836 | } |
| 837 | |
| 838 | /* Check conversion from UTF8 to Latin1 and ASCII. */ |
| 839 | { |
| 840 | const char *utf8 = ("son \303\256le int\303\251rieure, \303\240 " |
| 841 | "c\303\264t\303\251 de l'alc\303\264ve " |
| 842 | "ovo\303\257de, o\303\271 les b\303\273ches " |
| 843 | "se consument dans l'\303\242tre"); |
| 844 | const char *latin1 = ("son \356le int\351rieure, \340 " |
| 845 | "c\364t\351 de l'alc\364ve ovo\357de, " |
| 846 | "o\371 les b\373ches se consument dans " |
| 847 | "l'\342tre"); |
| 848 | const char *ascii = ("son ile interieure, a cote de l'alcove " |
| 849 | "ovoide, ou les buches se consument dans " |
| 850 | "l'atre"); |
| 851 | char *latin1b = utf8_to_latin1 (utf8, False); |
| 852 | char *ascii2 = utf8_to_latin1 (utf8, True); |
| 853 | if (strcmp (latin1, latin1b)) |
| 854 | { |
| 855 | LOG (stderr, "LATIN1", utf8); |
| 856 | LOG (stderr, "FAIL3", latin1b); |
| 857 | fprintf (stderr, "\n"); |
| 858 | ok = 0; |
| 859 | } |
| 860 | if (strcmp (ascii, ascii2)) |
| 861 | { |
| 862 | LOG (stderr, "ASCII", utf8); |
| 863 | LOG (stderr, "FAIL4", ascii2); |
| 864 | fprintf (stderr, "\n"); |
| 865 | ok = 0; |
| 866 | } |
| 867 | free (latin1b); |
| 868 | free (ascii2); |
| 869 | } |
| 870 | |
| 871 | /* Check de-composition of emoji that should all be treated as a unit |
| 872 | for measurement and display purposes. */ |
| 873 | { |
| 874 | static const char * const tests[] = { |
| 875 | |
| 876 | /* 0: "Man" */ |
| 877 | " \360\237\221\250 ", |
| 878 | |
| 879 | /* 1: "Blackula" = "Vampire, dark skin tone" = 1F9DB 1F3FF */ |
| 880 | " \360\237\247\233\360\237\217\277 ", |
| 881 | |
| 882 | /* 2: "Black male teacher" = "Man, dark skin tone, ZWJ, school" = |
| 883 | 1F468 1F3FF 200D 1F3EB |
| 884 | */ |
| 885 | " \360\237\221\250\360\237\217\277\342\200\215\360\237\217\253 ", |
| 886 | |
| 887 | /* 3: "Female runner" = "Runner, ZWJ, female sign" = 1F3C3 200D 2640 */ |
| 888 | " \360\237\217\203\342\200\215\342\231\200 ", |
| 889 | |
| 890 | /* 4: "Runner, ZWJ, rocket ship" = 1F3C3 200D 1F680 (a true "Woman astronaut" would be 1F469 200D 1F680) */ |
| 891 | " \360\237\217\203\342\200\215\360\237\232\200 ", |
| 892 | |
| 893 | /* 5: |
| 894 | Group of people displayed as a single glyph: |
| 895 | Woman, dark skin tone, ZWJ, 1F469 1F3FF 200D |
| 896 | Man, light skin tone, ZWJ, 1F468 1F3FB 200D |
| 897 | Boy, medium skin tone, ZWJ, 1F466 1F3FD 200D |
| 898 | Girl, dark skin tone. 1F467 1F3FF |
| 899 | */ |
| 900 | " \360\237\221\251\360\237\217\277\342\200\215" |
| 901 | "\360\237\221\250\360\237\217\273\342\200\215" |
| 902 | "\360\237\221\246\360\237\217\275\342\200\215" |
| 903 | "\360\237\221\247\360\237\217\277 ", |
| 904 | }; |
| 905 | int i; |
| 906 | for (i = 0; i < sizeof(tests)/sizeof(*tests); i++) |
| 907 | { |
| 908 | int L = 0; |
| 909 | char **out = utf8_split (tests[i], &L); |
| 910 | char name[100]; |
| 911 | int j; |
| 912 | sprintf (name, "SPLIT %d: %d glyphs", i, L-2); |
| 913 | if (L != 3) |
| 914 | { |
| 915 | LOG (stderr, name, tests[i]); |
| 916 | ok = 0; |
| 917 | } |
| 918 | for (j = 0; j < L; j++) |
| 919 | free (out[j]); |
| 920 | free (out); |
| 921 | } |
| 922 | } |
| 923 | |
| 924 | if (ok) fprintf (stderr, "OK\n"); |
| 925 | return (ok == 0); |
| 926 | } |
| 927 | |
| 928 | #endif /* SELFTEST */ |