branches/alto2/src/lib/util/unicode.c
| r26196 | r26197 | |
| 318 | 318 | return rc < count ? rc : count; |
| 319 | 319 | } |
| 320 | 320 | |
/**
 * @brief return the number of Unicode values encoded in an UTF-8 string
 *
 * The string is walked lead byte by lead byte up to the terminating 0.
 * It is rejected if a lead byte is not a valid sequence start (0x80-0xbf,
 * 0xfe or 0xff) or if any auxiliary byte is not of the form 10xxxxxx.
 * The latter also catches a string that ends in the middle of a sequence,
 * because the terminating 0 is not an auxiliary byte.
 *
 * @param utf8src pointer to the 0 terminated array of UTF-8 encoded characters
 * @param plen optional pointer to a size_t variable to receive the source
 *             string length in bytes (terminator not counted); it is left
 *             untouched when the string is rejected
 * @return number of Unicode values encoded in the string, or (size_t)-1 if
 *         the string is malformed
 */
size_t utf8_ucharlen(const char* utf8src, size_t * plen)
{
	size_t len = 0;
	size_t total = 0;

	while (*utf8src)
	{
		const unsigned char c = (unsigned char) *utf8src;
		size_t auxlen;
		size_t i;

		/* determine how many additional bytes we need */
		if (c < 0x80)
			auxlen = 0;		/* unicode char 0x00000000 - 0x0000007F */
		else if (c < 0xc0)
			return (size_t)-1;	/* auxiliary byte without a lead byte */
		else if (c < 0xe0)
			auxlen = 1;		/* unicode char 0x00000080 - 0x000007FF */
		else if (c < 0xf0)
			auxlen = 2;		/* unicode char 0x00000800 - 0x0000FFFF */
		else if (c < 0xf8)
			auxlen = 3;		/* unicode char 0x00010000 - 0x001FFFFF */
		else if (c < 0xfc)
			auxlen = 4;		/* unicode char 0x00200000 - 0x03FFFFFF */
		else if (c < 0xfe)
			auxlen = 5;		/* unicode char 0x04000000 - 0x7FFFFFFF */
		else
			return (size_t)-1;	/* 0xfe and 0xff never start a sequence */

		/*
		 * Check the auxiliary bytes in order and stop at the first bad one,
		 * so that nothing beyond the terminating 0 is ever read.
		 */
		for (i = 1; i <= auxlen; i++)
		{
			if (((unsigned char) utf8src[i] & 0xc0) != 0x80)
				return (size_t)-1;
		}

		total++;
		len += auxlen + 1;
		utf8src += auxlen + 1;
	}

	if (plen)
		*plen = len;
	return total;
}
| 389 | |
| 322 | 390 | /** |
| 391 | * @brief return the number of decoded Unicode values in UTF-16 encoded string |
| 392 | * @param src pointer to the array of UTF-16 encoded characters |
| 393 | * @param plen optional pointer to a size_t variable to receive the source string length |
| 394 | * @return number of unicode_char values decoded from the UTF-8 string |
| 395 | */ |
| 396 | size_t utf16_ucharlen(const utf16_char* utf16src, size_t * plen) |
| 397 | { |
| 398 | size_t len = 0; |
| 399 | size_t total = 0; |
| 400 | while (*utf16src) { |
| 401 | utf16_char c = (utf16_char) *utf16src; |
| 402 | size_t auxlen; |
| 403 | |
| 404 | if (c >= 0xd800 && c <= 0xdbff) |
| 405 | { |
| 406 | if (0 == utf16src[1]) |
| 407 | return -1; |
| 408 | auxlen = 1; |
| 409 | } |
| 410 | else if (utf16src[0] < 0xdc00 || utf16src[0] > 0xdfff) |
| 411 | { |
| 412 | auxlen = 0; |
| 413 | } |
| 414 | else |
| 415 | return -1; |
| 416 | total++; |
| 417 | len += auxlen + 1; |
| 418 | utf16src += auxlen + 1; |
| 419 | } |
| 420 | if (plen) |
| 421 | *plen = len; |
| 422 | return total; |
| 423 | } |
| 424 | |
| 425 | /** |
| 323 | 426 | * @brief return a pointer to the previous character in a string |
| 324 | 427 | * @param utf8string const pointer to the starting position in the string |
| 325 | 428 | * @return pointer to the character which is not an UTF-8 auxiliary character |
| r26196 | r26197 | |
| 359 | 462 | } |
| 360 | 463 | |
| 361 | 464 | /** |
| 362 | | * @brief return the number of decoded Unicode values in UTF-8 encoded string |
| 363 | | * @param src pointer to the array of UTF-8 encoded characters |
| 364 | | * @return number of unicode_char values decoded from the UTF-8 string |
| 365 | | */ |
| 366 | | size_t utf8_strlen(const char* src) |
| 367 | | { |
| 368 | | int total = 0; |
| 369 | | while (*src) { |
| 370 | | unicode_char uchar; |
| 371 | | int len = uchar_from_utf8(&uchar, src, strlen(src)); |
| 372 | | if (len < 0) |
| 373 | | break; // invalid UTF-8 |
| 374 | | total++; |
| 375 | | src += len; |
| 376 | | } |
| 377 | | return total; |
| 378 | | } |
| 379 | | |
| 380 | | /** |
| 381 | 465 | * @brief load a lookup table 8 bit codes to Unicode values |
| 382 | 466 | * |
| 383 | 467 | * This opens and reads a file %name which has to be in the |
| r26196 | r26197 | |
| 450 | 534 | } |
| 451 | 535 | |
| 452 | 536 | /** |
| 537 | * @brief return an unicode_char array allocated while converted from UTF-8 |
| 538 | * @param utf8char source string encoded in UTF-8 |
| 539 | * @return newly allocated unicode_char string |
| 540 | */ |
| 541 | unicode_char* uchar_strfrom_utf8(const char *utf8src) |
| 542 | { |
| 543 | size_t available; |
| 544 | size_t size = utf8_ucharlen(utf8src, &available); |
| 545 | if (-1 == size) |
| 546 | return NULL; |
| 547 | unicode_char* result = (unicode_char *)calloc(sizeof(unicode_char), size + 1); |
| 548 | unicode_char* dst = result; |
| 549 | while (*utf8src) { |
| 550 | unicode_char uchar; |
| 551 | int len = uchar_from_utf8(&uchar, utf8src, available); |
| 552 | utf8src += len; |
| 553 | available -= len; |
| 554 | *dst++ = uchar; |
| 555 | } |
| 556 | return result; |
| 557 | } |
| 558 | |
| 559 | /** |
| 560 | * @brief return an unicode_char array allocated while converted from UTF-16 |
| 561 | * @param utf16src source string encoded in UTF-16 |
| 562 | * @return newly allocated unicode_char string |
| 563 | */ |
| 564 | unicode_char* uchar_strfrom_utf16(const utf16_char *utf16src) |
| 565 | { |
| 566 | size_t available; |
| 567 | size_t size = utf16_ucharlen(utf16src, &available); |
| 568 | if (-1 == size) |
| 569 | return NULL; |
| 570 | unicode_char* result = (unicode_char *)calloc(sizeof(unicode_char), size + 1); |
| 571 | unicode_char* dst = result; |
| 572 | while (*utf16src) { |
| 573 | unicode_char uchar; |
| 574 | int len = uchar_from_utf16(&uchar, utf16src, available); |
| 575 | utf16src += len; |
| 576 | available -= len; |
| 577 | *dst++ = uchar; |
| 578 | } |
| 579 | return result; |
| 580 | } |
| 581 | |
| 582 | /** |
| 453 | 583 | * @brief return the unicode_char array length |
| 454 | 584 | * @param src pointer to an array of unicode_char |
| 455 | 585 | * @return length of the array until the first 0 |
| r26196 | r26197 | |
| 617 | 747 | |
| 618 | 748 | static unicode_data_t** unicode_data = NULL; |
| 619 | 749 | |
| 620 | | #if NEED_UNICODE_RANGES |
| 621 | | typedef struct { |
| 622 | | unicode_char first, last; |
| 623 | | const char *name; |
| 624 | | } unicode_range_t; |
| 625 | | |
| 626 | | static const unicode_range_t unicode_ranges[] = |
| 627 | | { |
| 628 | | {0x0000, 0x007f, "Basic Latin"}, |
| 629 | | {0x0080, 0x00ff, "Latin-1 Supplement"}, |
| 630 | | {0x0100, 0x017f, "Latin Extended-A"}, |
| 631 | | {0x0180, 0x024f, "Latin Extended-B"}, |
| 632 | | {0x0250, 0x02af, "IPA Extensions"}, |
| 633 | | {0x02b0, 0x02ff, "Spacing Modifier Letters"}, |
| 634 | | {0x0300, 0x036f, "Combining Diacritical Marks"}, |
| 635 | | {0x0370, 0x03ff, "Greek"}, |
| 636 | | {0x0400, 0x04ff, "Cyrillic"}, |
| 637 | | {0x0530, 0x058f, "Armenian"}, |
| 638 | | {0x0590, 0x05ff, "Hebrew"}, |
| 639 | | {0x0600, 0x06ff, "Arabic"}, |
| 640 | | {0x0700, 0x074f, "Syriac"}, |
| 641 | | {0x0780, 0x07bf, "Thaana"}, |
| 642 | | {0x0900, 0x097f, "Devanagari"}, |
| 643 | | {0x0980, 0x09ff, "Bengali"}, |
| 644 | | {0x0a00, 0x0a7f, "Gurmukhi"}, |
| 645 | | {0x0a80, 0x0aff, "Gujarati"}, |
| 646 | | {0x0b00, 0x0b7f, "Oriya"}, |
| 647 | | {0x0b80, 0x0bff, "Tamil"}, |
| 648 | | {0x0c00, 0x0c7f, "Telugu"}, |
| 649 | | {0x0c80, 0x0cff, "Kannada"}, |
| 650 | | {0x0d00, 0x0d7f, "Malayalam"}, |
| 651 | | {0x0d80, 0x0dff, "Sinhala"}, |
| 652 | | {0x0e00, 0x0e7f, "Thai"}, |
| 653 | | {0x0e80, 0x0eff, "Lao"}, |
| 654 | | {0x0f00, 0x0fff, "Tibetan"}, |
| 655 | | {0x1000, 0x109f, "Myanmar"}, |
| 656 | | {0x10a0, 0x10ff, "Georgian"}, |
| 657 | | {0x1100, 0x11ff, "Hangul Jamo"}, |
| 658 | | {0x1200, 0x137f, "Ethiopic"}, |
| 659 | | {0x13a0, 0x13ff, "Cherokee"}, |
| 660 | | {0x1400, 0x167f, "Unified Canadian Aboriginal Syllabic"}, |
| 661 | | {0x1680, 0x169f, "Ogham"}, |
| 662 | | {0x16a0, 0x16ff, "Runic"}, |
| 663 | | {0x1780, 0x17ff, "Khmer"}, |
| 664 | | {0x1800, 0x18af, "Mongolian"}, |
| 665 | | {0x1e00, 0x1eff, "Latin Extended Additional"}, |
| 666 | | {0x1f00, 0x1fff, "Greek Extended"}, |
| 667 | | {0x2000, 0x206f, "General Punctuation"}, |
| 668 | | {0x2070, 0x208f, "Superscripts and Subscripts"}, |
| 669 | | {0x20a0, 0x20cf, "Currency Symbols"}, |
| 670 | | {0x20d0, 0x20ff, "Combining Marks for Symbols"}, |
| 671 | | {0x2100, 0x214f, "Letterlike Symbols"}, |
| 672 | | {0x2150, 0x218f, "Number Forms"}, |
| 673 | | {0x2190, 0x21ff, "Arrows"}, |
| 674 | | {0x2200, 0x22ff, "Mathematical Operators"}, |
| 675 | | {0x2300, 0x23ff, "Miscellaneous Technical"}, |
| 676 | | {0x2400, 0x243f, "Control Pictures"}, |
| 677 | | {0x2440, 0x245f, "Optical Character Recognition"}, |
| 678 | | {0x2460, 0x24ff, "Enclosed Alphanumerics"}, |
| 679 | | {0x2500, 0x257f, "Box Drawing"}, |
| 680 | | {0x2580, 0x259f, "Block Elements"}, |
| 681 | | {0x25a0, 0x25ff, "Geometric Shapes"}, |
| 682 | | {0x2600, 0x26ff, "Miscellaneous Symbols"}, |
| 683 | | {0x2700, 0x27bf, "Dingbats"}, |
| 684 | | {0x2800, 0x28ff, "Braille Patterns"}, |
| 685 | | {0x2e80, 0x2eff, "CJK Radicals Supplement"}, |
| 686 | | {0x2f00, 0x2fdf, "Kangxi Radicals"}, |
| 687 | | {0x2ff0, 0x2fff, "Ideographic Description Characters"}, |
| 688 | | {0x3000, 0x303f, "CJK Symbols and Punctuation"}, |
| 689 | | {0x3040, 0x309f, "Hiragana"}, |
| 690 | | {0x30a0, 0x30ff, "Katakana"}, |
| 691 | | {0x3100, 0x312f, "Bopomofo"}, |
| 692 | | {0x3130, 0x318f, "Hangul Compatibility Jamo"}, |
| 693 | | {0x3190, 0x319f, "Kanbun"}, |
| 694 | | {0x31a0, 0x31bf, "Bopomofo Extended"}, |
| 695 | | {0x3200, 0x32ff, "Enclosed CJK Letters and Months"}, |
| 696 | | {0x3300, 0x33ff, "CJK Compatibility"}, |
| 697 | | {0x3400, 0x4dbf, "CJK Unified Ideographs Extension A"}, |
| 698 | | {0x4e00, 0x9faf, "CJK Unified Ideographs"}, |
| 699 | | {0xa000, 0xa48f, "Yi Syllables"}, |
| 700 | | {0xa490, 0xa4cf, "Yi Radicals"}, |
| 701 | | {0xac00, 0xd7af, "Hangul Syllables"}, |
| 702 | | {0xd800, 0xdb7f, "High Surrogates"}, |
| 703 | | {0xdb80, 0xdbff, "High Private Use Surrogates"}, |
| 704 | | {0xdc00, 0xdfff, "Low Surrogates"}, |
| 705 | | {0xe000, 0xf8ff, "Private Use"}, |
| 706 | | {0xf900, 0xfaff, "CJK Compatibility Ideographs"}, |
| 707 | | {0xfb00, 0xfb4f, "Alphabetic Presentation Forms"}, |
| 708 | | {0xfb50, 0xfdff, "Arabic Presentation Forms-A"}, |
| 709 | | {0xfe20, 0xfe2f, "Combining Half Marks"}, |
| 710 | | {0xfe30, 0xfe4f, "CJK Compatibility Forms"}, |
| 711 | | {0xfe50, 0xfe6f, "Small Form Variants"}, |
| 712 | | {0xfe70, 0xfeff, "Arabic Presentation Forms-B"}, |
| 713 | | {0xff00, 0xffef, "Halfwidth and Fullwidth Forms"}, |
| 714 | | {0xfff0, 0xffff, "Specials"} |
| 715 | | // FIXME: add ranges for the Unicode planes 1 to 16 |
| 716 | | }; |
| 717 | | #endif |
| 718 | | |
| 719 | | #if NEED_UNICODE_CCOM |
| 720 | | static const char *canonical_combining_str(UINT8 val) |
| 721 | | { |
| 722 | | switch (val) |
| 723 | | { |
| 724 | | case 0: return "Spacing, split, enclosing, reordrant, and Tibetan subjoined"; |
| 725 | | case 1: return "Overlays and interior"; |
| 726 | | case 7: return "Nuktas"; |
| 727 | | case 8: return "Hiragana/Katakana voicing marks"; |
| 728 | | case 9: return "Viramas"; |
| 729 | | case 10: return "Start of fixed position classes"; |
| 730 | | case 199: return "End of fixed position classes"; |
| 731 | | case 200: return "Below left attached"; |
| 732 | | case 202: return "Below attached"; |
| 733 | | case 204: return "Below right attached"; |
| 734 | | case 208: return "Left attached (reordrant around single base character)"; |
| 735 | | case 210: return "Right attached"; |
| 736 | | case 212: return "Above left attached"; |
| 737 | | case 214: return "Above attached"; |
| 738 | | case 216: return "Above right attached"; |
| 739 | | case 218: return "Below left"; |
| 740 | | case 220: return "Below"; |
| 741 | | case 222: return "Below right"; |
| 742 | | case 224: return "Left (reordrant around single base character)"; |
| 743 | | case 226: return "Right"; |
| 744 | | case 228: return "Above left"; |
| 745 | | case 230: return "Above"; |
| 746 | | case 232: return "Above right"; |
| 747 | | case 233: return "Double below"; |
| 748 | | case 234: return "Double above"; |
| 749 | | case 240: return "Below (iota subscript)"; |
| 750 | | } |
| 751 | | return "INVALID"; |
| 752 | | } |
| 753 | | #endif |
| 754 | | |
| 755 | 750 | #if NEED_UNICODE_NAME |
| 756 | 751 | const char * unicode_name(unicode_char uchar) |
| 757 | 752 | { |
| r26196 | r26197 | |
| 832 | 827 | { |
| 833 | 828 | if (!unicode_data || uchar >= UNICODE_PLANESIZE || !unicode_data[uchar]) |
| 834 | 829 | return ""; |
| 835 | | return canonical_combining_str(unicode_data[uchar]->canonical_comb); |
| 830 | switch (unicode_data[uchar]->canonical_comb) |
| 831 | { |
| 832 | case 0: return "Spacing, split, enclosing, reordrant, and Tibetan subjoined"; |
| 833 | case 1: return "Overlays and interior"; |
| 834 | case 7: return "Nuktas"; |
| 835 | case 8: return "Hiragana/Katakana voicing marks"; |
| 836 | case 9: return "Viramas"; |
| 837 | case 10: return "Start of fixed position classes"; |
| 838 | case 199: return "End of fixed position classes"; |
| 839 | case 200: return "Below left attached"; |
| 840 | case 202: return "Below attached"; |
| 841 | case 204: return "Below right attached"; |
| 842 | case 208: return "Left attached (reordrant around single base character)"; |
| 843 | case 210: return "Right attached"; |
| 844 | case 212: return "Above left attached"; |
| 845 | case 214: return "Above attached"; |
| 846 | case 216: return "Above right attached"; |
| 847 | case 218: return "Below left"; |
| 848 | case 220: return "Below"; |
| 849 | case 222: return "Below right"; |
| 850 | case 224: return "Left (reordrant around single base character)"; |
| 851 | case 226: return "Right"; |
| 852 | case 228: return "Above left"; |
| 853 | case 230: return "Above"; |
| 854 | case 232: return "Above right"; |
| 855 | case 233: return "Double below"; |
| 856 | case 234: return "Double above"; |
| 857 | case 240: return "Below (iota subscript)"; |
| 858 | } |
| 859 | return "INVALID"; |
| 836 | 860 | } |
| 837 | 861 | #endif |
| 838 | 862 | |
| r26196 | r26197 | |
| 1075 | 1099 | #endif |
| 1076 | 1100 | |
| 1077 | 1101 | #if NEED_UNICODE_RANGES |
| 1102 | typedef struct { |
| 1103 | unicode_char first, last; |
| 1104 | const char *name; |
| 1105 | } unicode_range_t; |
| 1106 | |
| 1107 | static const unicode_range_t unicode_ranges[] = |
| 1108 | { |
| 1109 | {0x0000, 0x007f, "Basic Latin"}, |
| 1110 | {0x0080, 0x00ff, "Latin-1 Supplement"}, |
| 1111 | {0x0100, 0x017f, "Latin Extended-A"}, |
| 1112 | {0x0180, 0x024f, "Latin Extended-B"}, |
| 1113 | {0x0250, 0x02af, "IPA Extensions"}, |
| 1114 | {0x02b0, 0x02ff, "Spacing Modifier Letters"}, |
| 1115 | {0x0300, 0x036f, "Combining Diacritical Marks"}, |
| 1116 | {0x0370, 0x03ff, "Greek"}, |
| 1117 | {0x0400, 0x04ff, "Cyrillic"}, |
| 1118 | {0x0530, 0x058f, "Armenian"}, |
| 1119 | {0x0590, 0x05ff, "Hebrew"}, |
| 1120 | {0x0600, 0x06ff, "Arabic"}, |
| 1121 | {0x0700, 0x074f, "Syriac"}, |
| 1122 | {0x0780, 0x07bf, "Thaana"}, |
| 1123 | {0x0900, 0x097f, "Devanagari"}, |
| 1124 | {0x0980, 0x09ff, "Bengali"}, |
| 1125 | {0x0a00, 0x0a7f, "Gurmukhi"}, |
| 1126 | {0x0a80, 0x0aff, "Gujarati"}, |
| 1127 | {0x0b00, 0x0b7f, "Oriya"}, |
| 1128 | {0x0b80, 0x0bff, "Tamil"}, |
| 1129 | {0x0c00, 0x0c7f, "Telugu"}, |
| 1130 | {0x0c80, 0x0cff, "Kannada"}, |
| 1131 | {0x0d00, 0x0d7f, "Malayalam"}, |
| 1132 | {0x0d80, 0x0dff, "Sinhala"}, |
| 1133 | {0x0e00, 0x0e7f, "Thai"}, |
| 1134 | {0x0e80, 0x0eff, "Lao"}, |
| 1135 | {0x0f00, 0x0fff, "Tibetan"}, |
| 1136 | {0x1000, 0x109f, "Myanmar"}, |
| 1137 | {0x10a0, 0x10ff, "Georgian"}, |
| 1138 | {0x1100, 0x11ff, "Hangul Jamo"}, |
| 1139 | {0x1200, 0x137f, "Ethiopic"}, |
| 1140 | {0x13a0, 0x13ff, "Cherokee"}, |
| 1141 | {0x1400, 0x167f, "Unified Canadian Aboriginal Syllabic"}, |
| 1142 | {0x1680, 0x169f, "Ogham"}, |
| 1143 | {0x16a0, 0x16ff, "Runic"}, |
| 1144 | {0x1780, 0x17ff, "Khmer"}, |
| 1145 | {0x1800, 0x18af, "Mongolian"}, |
| 1146 | {0x1e00, 0x1eff, "Latin Extended Additional"}, |
| 1147 | {0x1f00, 0x1fff, "Greek Extended"}, |
| 1148 | {0x2000, 0x206f, "General Punctuation"}, |
| 1149 | {0x2070, 0x208f, "Superscripts and Subscripts"}, |
| 1150 | {0x20a0, 0x20cf, "Currency Symbols"}, |
| 1151 | {0x20d0, 0x20ff, "Combining Marks for Symbols"}, |
| 1152 | {0x2100, 0x214f, "Letterlike Symbols"}, |
| 1153 | {0x2150, 0x218f, "Number Forms"}, |
| 1154 | {0x2190, 0x21ff, "Arrows"}, |
| 1155 | {0x2200, 0x22ff, "Mathematical Operators"}, |
| 1156 | {0x2300, 0x23ff, "Miscellaneous Technical"}, |
| 1157 | {0x2400, 0x243f, "Control Pictures"}, |
| 1158 | {0x2440, 0x245f, "Optical Character Recognition"}, |
| 1159 | {0x2460, 0x24ff, "Enclosed Alphanumerics"}, |
| 1160 | {0x2500, 0x257f, "Box Drawing"}, |
| 1161 | {0x2580, 0x259f, "Block Elements"}, |
| 1162 | {0x25a0, 0x25ff, "Geometric Shapes"}, |
| 1163 | {0x2600, 0x26ff, "Miscellaneous Symbols"}, |
| 1164 | {0x2700, 0x27bf, "Dingbats"}, |
| 1165 | {0x2800, 0x28ff, "Braille Patterns"}, |
| 1166 | {0x2e80, 0x2eff, "CJK Radicals Supplement"}, |
| 1167 | {0x2f00, 0x2fdf, "Kangxi Radicals"}, |
| 1168 | {0x2ff0, 0x2fff, "Ideographic Description Characters"}, |
| 1169 | {0x3000, 0x303f, "CJK Symbols and Punctuation"}, |
| 1170 | {0x3040, 0x309f, "Hiragana"}, |
| 1171 | {0x30a0, 0x30ff, "Katakana"}, |
| 1172 | {0x3100, 0x312f, "Bopomofo"}, |
| 1173 | {0x3130, 0x318f, "Hangul Compatibility Jamo"}, |
| 1174 | {0x3190, 0x319f, "Kanbun"}, |
| 1175 | {0x31a0, 0x31bf, "Bopomofo Extended"}, |
| 1176 | {0x3200, 0x32ff, "Enclosed CJK Letters and Months"}, |
| 1177 | {0x3300, 0x33ff, "CJK Compatibility"}, |
| 1178 | {0x3400, 0x4dbf, "CJK Unified Ideographs Extension A"}, |
| 1179 | {0x4e00, 0x9faf, "CJK Unified Ideographs"}, |
| 1180 | {0xa000, 0xa48f, "Yi Syllables"}, |
| 1181 | {0xa490, 0xa4cf, "Yi Radicals"}, |
| 1182 | {0xac00, 0xd7af, "Hangul Syllables"}, |
| 1183 | {0xd800, 0xdb7f, "High Surrogates"}, |
| 1184 | {0xdb80, 0xdbff, "High Private Use Surrogates"}, |
| 1185 | {0xdc00, 0xdfff, "Low Surrogates"}, |
| 1186 | {0xe000, 0xf8ff, "Private Use"}, |
| 1187 | {0xf900, 0xfaff, "CJK Compatibility Ideographs"}, |
| 1188 | {0xfb00, 0xfb4f, "Alphabetic Presentation Forms"}, |
| 1189 | {0xfb50, 0xfdff, "Arabic Presentation Forms-A"}, |
| 1190 | {0xfe20, 0xfe2f, "Combining Half Marks"}, |
| 1191 | {0xfe30, 0xfe4f, "CJK Compatibility Forms"}, |
| 1192 | {0xfe50, 0xfe6f, "Small Form Variants"}, |
| 1193 | {0xfe70, 0xfeff, "Arabic Presentation Forms-B"}, |
| 1194 | {0xff00, 0xffef, "Halfwidth and Fullwidth Forms"}, |
| 1195 | {0xfff0, 0xffff, "Specials"} |
| 1196 | // FIXME: add ranges for the Unicode planes 1 to 16 |
| 1197 | }; |
| 1198 | |
| 1078 | 1199 | const char * unicode_range_name(unicode_char uchar) |
| 1079 | 1200 | { |
| 1080 | | static UINT32 hit = 0; |
| 1081 | | UINT32 i; |
| 1201 | int _min = 0; |
| 1202 | int _max = sizeof(unicode_ranges) / sizeof(unicode_ranges[0]) - 1; |
| 1203 | int _mid; |
| 1082 | 1204 | |
| 1083 | | for (i = hit; i < sizeof(unicode_ranges)/sizeof(unicode_ranges[0]); i++) |
| 1205 | /* binary search in table of unicode ranges */ |
| 1206 | while (_max >= _min) |
| 1084 | 1207 | { |
| 1085 | | if (unicode_ranges[i].first <= uchar && uchar <= unicode_ranges[i].last) |
| 1086 | | { |
| 1087 | | hit = i; |
| 1088 | | return unicode_ranges[i].name; |
| 1089 | | } |
| 1208 | _mid = (_min + _max) / 2; |
| 1209 | if (unicode_ranges[_mid].last < uchar) |
| 1210 | _min = _mid + 1; |
| 1211 | else if (unicode_ranges[_mid].first > uchar) |
| 1212 | _max = _mid - 1; |
| 1213 | else if (unicode_ranges[_mid].first <= uchar && unicode_ranges[_mid].last >= uchar) |
| 1214 | return unicode_ranges[_mid].name; |
| 1090 | 1215 | } |
| 1091 | | for (i = 0; i < hit; i++) |
| 1092 | | { |
| 1093 | | if (unicode_ranges[i].first <= uchar && uchar <= unicode_ranges[i].last) |
| 1094 | | { |
| 1095 | | hit = i; |
| 1096 | | return unicode_ranges[i].name; |
| 1097 | | } |
| 1098 | | } |
| 1099 | | |
| 1100 | 1216 | return NULL; |
| 1101 | 1217 | } |
| 1102 | 1218 | |
| 1103 | 1219 | unicode_char unicode_range_first(unicode_char uchar) |
| 1104 | 1220 | { |
| 1105 | | static UINT32 hit = 0; |
| 1106 | | UINT32 i; |
| 1221 | int _min = 0; |
| 1222 | int _max = sizeof(unicode_ranges) / sizeof(unicode_ranges[0]) - 1; |
| 1223 | int _mid; |
| 1107 | 1224 | |
| 1108 | | for (i = hit; i < sizeof(unicode_ranges)/sizeof(unicode_ranges[0]); i++) |
| 1225 | /* binary search in table of unicode ranges */ |
| 1226 | while (_max >= _min) |
| 1109 | 1227 | { |
| 1110 | | if (unicode_ranges[i].first <= uchar && uchar <= unicode_ranges[i].last) |
| 1111 | | { |
| 1112 | | hit = i; |
| 1113 | | return unicode_ranges[i].first; |
| 1114 | | } |
| 1228 | _mid = (_min + _max) / 2; |
| 1229 | if (unicode_ranges[_mid].last < uchar) |
| 1230 | _min = _mid + 1; |
| 1231 | else if (unicode_ranges[_mid].first > uchar) |
| 1232 | _max = _mid - 1; |
| 1233 | else if (unicode_ranges[_mid].first <= uchar && unicode_ranges[_mid].last >= uchar) |
| 1234 | return unicode_ranges[_mid].first; |
| 1115 | 1235 | } |
| 1116 | | for (i = 0; i < hit; i++) |
| 1117 | | { |
| 1118 | | if (unicode_ranges[i].first <= uchar && uchar <= unicode_ranges[i].last) |
| 1119 | | { |
| 1120 | | hit = i; |
| 1121 | | return unicode_ranges[i].first; |
| 1122 | | } |
| 1123 | | } |
| 1124 | | |
| 1125 | 1236 | return uchar; |
| 1126 | 1237 | } |
| 1127 | 1238 | |
| 1128 | 1239 | unicode_char unicode_range_last(unicode_char uchar) |
| 1129 | 1240 | { |
| 1130 | | static UINT32 hit = 0; |
| 1131 | | UINT32 i; |
| 1241 | int _min = 0; |
| 1242 | int _max = sizeof(unicode_ranges) / sizeof(unicode_ranges[0]) - 1; |
| 1243 | int _mid; |
| 1132 | 1244 | |
| 1133 | | for (i = hit; i < sizeof(unicode_ranges)/sizeof(unicode_ranges[0]); i++) |
| 1245 | /* binary search in table of unicode ranges */ |
| 1246 | while (_max >= _min) |
| 1134 | 1247 | { |
| 1135 | | if (unicode_ranges[i].first <= uchar && uchar <= unicode_ranges[i].last) |
| 1136 | | { |
| 1137 | | hit = i; |
| 1138 | | return unicode_ranges[i].last; |
| 1139 | | } |
| 1248 | _mid = (_min + _max) / 2; |
| 1249 | if (unicode_ranges[_mid].last < uchar) |
| 1250 | _min = _mid + 1; |
| 1251 | else if (unicode_ranges[_mid].first > uchar) |
| 1252 | _max = _mid - 1; |
| 1253 | else if (unicode_ranges[_mid].first <= uchar && unicode_ranges[_mid].last >= uchar) |
| 1254 | return unicode_ranges[_mid].last; |
| 1140 | 1255 | } |
| 1141 | | for (i = 0; i < hit; i++) |
| 1142 | | { |
| 1143 | | if (unicode_ranges[i].first <= uchar && uchar <= unicode_ranges[i].last) |
| 1144 | | { |
| 1145 | | hit = i; |
| 1146 | | return unicode_ranges[i].last; |
| 1147 | | } |
| 1148 | | } |
| 1149 | | |
| 1150 | 1256 | return uchar; |
| 1151 | 1257 | } |
| 1152 | 1258 | #endif |