Previous 199869 Revisions Next

r26197 Saturday 16th November, 2013 at 16:22:15 UTC by Jürgen Buchmüller
Rename some unicode functions for perceivability
[/branches/alto2/src/emu/debug]textbuf.c
[/branches/alto2/src/lib/util]unicode.c unicode.h

branches/alto2/src/emu/debug/textbuf.c
r26196r26197
183183   INT32 needed_space;
184184
185185   /* we need to ensure there is enough space for this string plus enough for the max line length */
186   needed_space = utf8_strlen(data) + MAX_LINE_LENGTH;
186   needed_space = utf8_ucharlen(data) + MAX_LINE_LENGTH;
187187
188188   /* make space in the buffer if we need to */
189189   while (buffer_space(text) < needed_space && text->linestart != text->lineend)
branches/alto2/src/lib/util/unicode.c
r26196r26197
318318   return rc < count ? rc : count;
319319}
320320
321/**
322 * @brief return the number of decoded Unicode values in UTF-8 encoded string
323 * @param utf8src pointer to the array of UTF-8 encoded characters
324 * @param plen optional pointer to a size_t variable to receive the source string length
325 * @return number of unicode_char values decoded from the UTF-8 string
326 */
327size_t utf8_ucharlen(const char* utf8src, size_t * plen)
328{
329   size_t len = 0;
330   size_t total = 0;
331   while (*utf8src) {
332      unsigned char c = (unsigned char) *utf8src;
333      size_t auxlen;
321334
335      /* determine how many additional bytes we need */
336      if (c < 0x80)
337      {
338         /* unicode char 0x00000000 - 0x0000007F */
339         auxlen = 0;
340      }
341      else if (c >= 0xc0 && c < 0xe0)
342      {
343         /* unicode char 0x00000080 - 0x000007FF */
344         if (0 == utf8src[1])
345            return -1;
346         auxlen = 1;
347      }
348      else if (c >= 0xe0 && c < 0xf0)
349      {
350         /* unicode char 0x00000800 - 0x0000FFFF */
351         if (0 == utf8src[1] || 0 == utf8src[2])
352            return -1;
353         auxlen = 2;
354      }
355      else if (c >= 0xf0 && c < 0xf8)
356      {
357         /* unicode char 0x00010000 - 0x001FFFFF */
358         if (0 == utf8src[1] || 0 == utf8src[2] || 0 == utf8src[3])
359            return -1;
360         auxlen = 3;
361      }
362      else if (c >= 0xf8 && c < 0xfc)
363      {
364         /* unicode char 0x00200000 - 0x03FFFFFF */
365         if (0 == utf8src[1] || 0 == utf8src[2] || 0 == utf8src[3] || 0 == utf8src[4])
366            return -1;
367         auxlen = 4;
368      }
369      else if (c >= 0xfc && c < 0xfe)
370      {
371         /* unicode char 0x04000000 - 0x7FFFFFFF */
372         if (0 == utf8src[1] || 0 == utf8src[2] || 0 == utf8src[3] || 0 == utf8src[4] || 0 == utf8src[5])
373            return -1;
374         auxlen = 5;
375      }
376      else
377      {
378         /* invalid */
379         return -1;
380      }
381      total++;
382      len += auxlen + 1;
383      utf8src += auxlen + 1;
384   }
385   if (plen)
386      *plen = len;
387   return total;
388}
389
322390/**
391 * @brief return the number of decoded Unicode values in UTF-16 encoded string
392 * @param utf16src pointer to the array of UTF-16 encoded characters
393 * @param plen optional pointer to a size_t variable to receive the source string length
394 * @return number of unicode_char values decoded from the UTF-16 string
395 */
396size_t utf16_ucharlen(const utf16_char* utf16src, size_t * plen)
397{
398   size_t len = 0;
399   size_t total = 0;
400   while (*utf16src) {
401      utf16_char c = (utf16_char) *utf16src;
402      size_t auxlen;
403
404      if (c >= 0xd800 && c <= 0xdbff)
405      {
406         if (0 == utf16src[1])
407            return -1;
408         auxlen = 1;
409      }
410      else if (utf16src[0] < 0xdc00 || utf16src[0] > 0xdfff)
411      {
412         auxlen = 0;
413      }
414      else
415         return -1;
416      total++;
417      len += auxlen + 1;
418      utf16src += auxlen + 1;
419   }
420   if (plen)
421      *plen = len;
422   return total;
423}
424
425/**
323426 * @brief return a pointer to the previous character in a string
324427 * @param utf8string const pointer to the starting position in the string
325428 * @return pointer to the character which is not an UTF-8 auxiliary character
r26196r26197
359462}
360463
361464/**
362 * @brief return the number of decoded Unicode values in UTF-8 encoded string
363 * @param src pointer to the array of UTF-8 encoded characters
364 * @return number of unicode_char values decoded from the UTF-8 string
365 */
366size_t utf8_strlen(const char* src)
367{
368   int total = 0;
369   while (*src) {
370      unicode_char uchar;
371      int len = uchar_from_utf8(&uchar, src, strlen(src));
372      if (len < 0)
373         break;   // invalid UTF-8
374      total++;
375      src += len;
376   }
377   return total;
378}
379
380/**
381465 * @brief load a lookup table 8 bit codes to Unicode values
382466 *
383467 * This opens and reads a file %name which has to be in the
r26196r26197
450534}
451535
452536/**
537 * @brief return an unicode_char array allocated while converted from UTF-8
538 * @param utf8src source string encoded in UTF-8
539 * @return newly allocated unicode_char string
540 */
541unicode_char* uchar_strfrom_utf8(const char *utf8src)
542{
543   size_t available;
544   size_t size = utf8_ucharlen(utf8src, &available);
545   if (-1 == size)
546      return NULL;
547   unicode_char* result = (unicode_char *)calloc(sizeof(unicode_char), size + 1);
548   unicode_char* dst = result;
549   while (*utf8src) {
550      unicode_char uchar;
551      int len = uchar_from_utf8(&uchar, utf8src, available);
552      utf8src += len;
553      available -= len;
554      *dst++ = uchar;
555   }
556   return result;
557}
558
559/**
560 * @brief return an unicode_char array allocated while converted from UTF-16
561 * @param utf16src source string encoded in UTF-16
562 * @return newly allocated unicode_char string
563 */
564unicode_char* uchar_strfrom_utf16(const utf16_char *utf16src)
565{
566   size_t available;
567   size_t size = utf16_ucharlen(utf16src, &available);
568   if (-1 == size)
569      return NULL;
570   unicode_char* result = (unicode_char *)calloc(sizeof(unicode_char), size + 1);
571   unicode_char* dst = result;
572   while (*utf16src) {
573      unicode_char uchar;
574      int len = uchar_from_utf16(&uchar, utf16src, available);
575      utf16src += len;
576      available -= len;
577      *dst++ = uchar;
578   }
579   return result;
580}
581
582/**
453583 * @brief return the unicode_char array length
454584 * @param src pointer to an array of unicode_char
455585 * @return length of the array until the first 0
r26196r26197
617747
618748static unicode_data_t** unicode_data = NULL;
619749
620#if   NEED_UNICODE_RANGES
621typedef struct {
622   unicode_char first, last;
623   const char *name;
624}   unicode_range_t;
625
626static const unicode_range_t unicode_ranges[] =
627{
628   {0x0000, 0x007f, "Basic Latin"},
629   {0x0080, 0x00ff, "Latin-1 Supplement"},
630   {0x0100, 0x017f, "Latin Extended-A"},
631   {0x0180, 0x024f, "Latin Extended-B"},
632   {0x0250, 0x02af, "IPA Extensions"},
633   {0x02b0, 0x02ff, "Spacing Modifier Letters"},
634   {0x0300, 0x036f, "Combining Diacritical Marks"},
635   {0x0370, 0x03ff, "Greek"},
636   {0x0400, 0x04ff, "Cyrillic"},
637   {0x0530, 0x058f, "Armenian"},
638   {0x0590, 0x05ff, "Hebrew"},
639   {0x0600, 0x06ff, "Arabic"},
640   {0x0700, 0x074f, "Syriac"},
641   {0x0780, 0x07bf, "Thaana"},
642   {0x0900, 0x097f, "Devanagari"},
643   {0x0980, 0x09ff, "Bengali"},
644   {0x0a00, 0x0a7f, "Gurmukhi"},
645   {0x0a80, 0x0aff, "Gujarati"},
646   {0x0b00, 0x0b7f, "Oriya"},
647   {0x0b80, 0x0bff, "Tamil"},
648   {0x0c00, 0x0c7f, "Telugu"},
649   {0x0c80, 0x0cff, "Kannada"},
650   {0x0d00, 0x0d7f, "Malayalam"},
651   {0x0d80, 0x0dff, "Sinhala"},
652   {0x0e00, 0x0e7f, "Thai"},
653   {0x0e80, 0x0eff, "Lao"},
654   {0x0f00, 0x0fff, "Tibetan"},
655   {0x1000, 0x109f, "Myanmar"},
656   {0x10a0, 0x10ff, "Georgian"},
657   {0x1100, 0x11ff, "Hangul Jamo"},
658   {0x1200, 0x137f, "Ethiopic"},
659   {0x13a0, 0x13ff, "Cherokee"},
660   {0x1400, 0x167f, "Unified Canadian Aboriginal Syllabic"},
661   {0x1680, 0x169f, "Ogham"},
662   {0x16a0, 0x16ff, "Runic"},
663   {0x1780, 0x17ff, "Khmer"},
664   {0x1800, 0x18af, "Mongolian"},
665   {0x1e00, 0x1eff, "Latin Extended Additional"},
666   {0x1f00, 0x1fff, "Greek Extended"},
667   {0x2000, 0x206f, "General Punctuation"},
668   {0x2070, 0x208f, "Superscripts and Subscripts"},
669   {0x20a0, 0x20cf, "Currency Symbols"},
670   {0x20d0, 0x20ff, "Combining Marks for Symbols"},
671   {0x2100, 0x214f, "Letterlike Symbols"},
672   {0x2150, 0x218f, "Number Forms"},
673   {0x2190, 0x21ff, "Arrows"},
674   {0x2200, 0x22ff, "Mathematical Operators"},
675   {0x2300, 0x23ff, "Miscellaneous Technical"},
676   {0x2400, 0x243f, "Control Pictures"},
677   {0x2440, 0x245f, "Optical Character Recognition"},
678   {0x2460, 0x24ff, "Enclosed Alphanumerics"},
679   {0x2500, 0x257f, "Box Drawing"},
680   {0x2580, 0x259f, "Block Elements"},
681   {0x25a0, 0x25ff, "Geometric Shapes"},
682   {0x2600, 0x26ff, "Miscellaneous Symbols"},
683   {0x2700, 0x27bf, "Dingbats"},
684   {0x2800, 0x28ff, "Braille Patterns"},
685   {0x2e80, 0x2eff, "CJK Radicals Supplement"},
686   {0x2f00, 0x2fdf, "Kangxi Radicals"},
687   {0x2ff0, 0x2fff, "Ideographic Description Characters"},
688   {0x3000, 0x303f, "CJK Symbols and Punctuation"},
689   {0x3040, 0x309f, "Hiragana"},
690   {0x30a0, 0x30ff, "Katakana"},
691   {0x3100, 0x312f, "Bopomofo"},
692   {0x3130, 0x318f, "Hangul Compatibility Jamo"},
693   {0x3190, 0x319f, "Kanbun"},
694   {0x31a0, 0x31bf, "Bopomofo Extended"},
695   {0x3200, 0x32ff, "Enclosed CJK Letters and Months"},
696   {0x3300, 0x33ff, "CJK Compatibility"},
697   {0x3400, 0x4dbf, "CJK Unified Ideographs Extension A"},
698   {0x4e00, 0x9faf, "CJK Unified Ideographs"},
699   {0xa000, 0xa48f, "Yi Syllables"},
700   {0xa490, 0xa4cf, "Yi Radicals"},
701   {0xac00, 0xd7af, "Hangul Syllables"},
702   {0xd800, 0xdb7f, "High Surrogates"},
703   {0xdb80, 0xdbff, "High Private Use Surrogates"},
704   {0xdc00, 0xdfff, "Low Surrogates"},
705   {0xe000, 0xf8ff, "Private Use"},
706   {0xf900, 0xfaff, "CJK Compatibility Ideographs"},
707   {0xfb00, 0xfb4f, "Alphabetic Presentation Forms"},
708   {0xfb50, 0xfdff, "Arabic Presentation Forms-A"},
709   {0xfe20, 0xfe2f, "Combining Half Marks"},
710   {0xfe30, 0xfe4f, "CJK Compatibility Forms"},
711   {0xfe50, 0xfe6f, "Small Form Variants"},
712   {0xfe70, 0xfeff, "Arabic Presentation Forms-B"},
713   {0xff00, 0xffef, "Halfwidth and Fullwidth Forms"},
714   {0xfff0, 0xffff, "Specials"}
715   // FIXME: add ranges for the Unicode planes 1 to 16
716};
717#endif
718
719#if   NEED_UNICODE_CCOM
720static const char *canonical_combining_str(UINT8 val)
721{
722   switch (val)
723   {
724   case 0:      return "Spacing, split, enclosing, reordrant, and Tibetan subjoined";
725   case 1:      return "Overlays and interior";
726   case 7:      return "Nuktas";
727   case 8:      return "Hiragana/Katakana voicing marks";
728   case 9:      return "Viramas";
729   case 10:   return "Start of fixed position classes";
730   case 199:   return "End of fixed position classes";
731   case 200:   return "Below left attached";
732   case 202:   return "Below attached";
733   case 204:   return "Below right attached";
734   case 208:   return "Left attached (reordrant around single base character)";
735   case 210:   return "Right attached";
736   case 212:   return "Above left attached";
737   case 214:   return "Above attached";
738   case 216:   return "Above right attached";
739   case 218:   return "Below left";
740   case 220:   return "Below";
741   case 222:   return "Below right";
742   case 224:   return "Left (reordrant around single base character)";
743   case 226:   return "Right";
744   case 228:   return "Above left";
745   case 230:   return "Above";
746   case 232:   return "Above right";
747   case 233:   return "Double below";
748   case 234:   return "Double above";
749   case 240:   return "Below (iota subscript)";
750   }
751   return "INVALID";
752}
753#endif
754
755750#if   NEED_UNICODE_NAME
756751const char * unicode_name(unicode_char uchar)
757752{
r26196r26197
832827{
833828   if (!unicode_data || uchar >= UNICODE_PLANESIZE || !unicode_data[uchar])
834829      return "";
835   return canonical_combining_str(unicode_data[uchar]->canonical_comb);
830   switch (unicode_data[uchar]->canonical_comb)
831   {
832   case 0:      return "Spacing, split, enclosing, reordrant, and Tibetan subjoined";
833   case 1:      return "Overlays and interior";
834   case 7:      return "Nuktas";
835   case 8:      return "Hiragana/Katakana voicing marks";
836   case 9:      return "Viramas";
837   case 10:   return "Start of fixed position classes";
838   case 199:   return "End of fixed position classes";
839   case 200:   return "Below left attached";
840   case 202:   return "Below attached";
841   case 204:   return "Below right attached";
842   case 208:   return "Left attached (reordrant around single base character)";
843   case 210:   return "Right attached";
844   case 212:   return "Above left attached";
845   case 214:   return "Above attached";
846   case 216:   return "Above right attached";
847   case 218:   return "Below left";
848   case 220:   return "Below";
849   case 222:   return "Below right";
850   case 224:   return "Left (reordrant around single base character)";
851   case 226:   return "Right";
852   case 228:   return "Above left";
853   case 230:   return "Above";
854   case 232:   return "Above right";
855   case 233:   return "Double below";
856   case 234:   return "Double above";
857   case 240:   return "Below (iota subscript)";
858   }
859   return "INVALID";
836860}
837861#endif
838862
r26196r26197
10751099#endif
10761100
10771101#if   NEED_UNICODE_RANGES
1102typedef struct {
1103   unicode_char first, last;
1104   const char *name;
1105}   unicode_range_t;
1106
1107static const unicode_range_t unicode_ranges[] =
1108{
1109   {0x0000, 0x007f, "Basic Latin"},
1110   {0x0080, 0x00ff, "Latin-1 Supplement"},
1111   {0x0100, 0x017f, "Latin Extended-A"},
1112   {0x0180, 0x024f, "Latin Extended-B"},
1113   {0x0250, 0x02af, "IPA Extensions"},
1114   {0x02b0, 0x02ff, "Spacing Modifier Letters"},
1115   {0x0300, 0x036f, "Combining Diacritical Marks"},
1116   {0x0370, 0x03ff, "Greek"},
1117   {0x0400, 0x04ff, "Cyrillic"},
1118   {0x0530, 0x058f, "Armenian"},
1119   {0x0590, 0x05ff, "Hebrew"},
1120   {0x0600, 0x06ff, "Arabic"},
1121   {0x0700, 0x074f, "Syriac"},
1122   {0x0780, 0x07bf, "Thaana"},
1123   {0x0900, 0x097f, "Devanagari"},
1124   {0x0980, 0x09ff, "Bengali"},
1125   {0x0a00, 0x0a7f, "Gurmukhi"},
1126   {0x0a80, 0x0aff, "Gujarati"},
1127   {0x0b00, 0x0b7f, "Oriya"},
1128   {0x0b80, 0x0bff, "Tamil"},
1129   {0x0c00, 0x0c7f, "Telugu"},
1130   {0x0c80, 0x0cff, "Kannada"},
1131   {0x0d00, 0x0d7f, "Malayalam"},
1132   {0x0d80, 0x0dff, "Sinhala"},
1133   {0x0e00, 0x0e7f, "Thai"},
1134   {0x0e80, 0x0eff, "Lao"},
1135   {0x0f00, 0x0fff, "Tibetan"},
1136   {0x1000, 0x109f, "Myanmar"},
1137   {0x10a0, 0x10ff, "Georgian"},
1138   {0x1100, 0x11ff, "Hangul Jamo"},
1139   {0x1200, 0x137f, "Ethiopic"},
1140   {0x13a0, 0x13ff, "Cherokee"},
1141   {0x1400, 0x167f, "Unified Canadian Aboriginal Syllabic"},
1142   {0x1680, 0x169f, "Ogham"},
1143   {0x16a0, 0x16ff, "Runic"},
1144   {0x1780, 0x17ff, "Khmer"},
1145   {0x1800, 0x18af, "Mongolian"},
1146   {0x1e00, 0x1eff, "Latin Extended Additional"},
1147   {0x1f00, 0x1fff, "Greek Extended"},
1148   {0x2000, 0x206f, "General Punctuation"},
1149   {0x2070, 0x208f, "Superscripts and Subscripts"},
1150   {0x20a0, 0x20cf, "Currency Symbols"},
1151   {0x20d0, 0x20ff, "Combining Marks for Symbols"},
1152   {0x2100, 0x214f, "Letterlike Symbols"},
1153   {0x2150, 0x218f, "Number Forms"},
1154   {0x2190, 0x21ff, "Arrows"},
1155   {0x2200, 0x22ff, "Mathematical Operators"},
1156   {0x2300, 0x23ff, "Miscellaneous Technical"},
1157   {0x2400, 0x243f, "Control Pictures"},
1158   {0x2440, 0x245f, "Optical Character Recognition"},
1159   {0x2460, 0x24ff, "Enclosed Alphanumerics"},
1160   {0x2500, 0x257f, "Box Drawing"},
1161   {0x2580, 0x259f, "Block Elements"},
1162   {0x25a0, 0x25ff, "Geometric Shapes"},
1163   {0x2600, 0x26ff, "Miscellaneous Symbols"},
1164   {0x2700, 0x27bf, "Dingbats"},
1165   {0x2800, 0x28ff, "Braille Patterns"},
1166   {0x2e80, 0x2eff, "CJK Radicals Supplement"},
1167   {0x2f00, 0x2fdf, "Kangxi Radicals"},
1168   {0x2ff0, 0x2fff, "Ideographic Description Characters"},
1169   {0x3000, 0x303f, "CJK Symbols and Punctuation"},
1170   {0x3040, 0x309f, "Hiragana"},
1171   {0x30a0, 0x30ff, "Katakana"},
1172   {0x3100, 0x312f, "Bopomofo"},
1173   {0x3130, 0x318f, "Hangul Compatibility Jamo"},
1174   {0x3190, 0x319f, "Kanbun"},
1175   {0x31a0, 0x31bf, "Bopomofo Extended"},
1176   {0x3200, 0x32ff, "Enclosed CJK Letters and Months"},
1177   {0x3300, 0x33ff, "CJK Compatibility"},
1178   {0x3400, 0x4dbf, "CJK Unified Ideographs Extension A"},
1179   {0x4e00, 0x9faf, "CJK Unified Ideographs"},
1180   {0xa000, 0xa48f, "Yi Syllables"},
1181   {0xa490, 0xa4cf, "Yi Radicals"},
1182   {0xac00, 0xd7af, "Hangul Syllables"},
1183   {0xd800, 0xdb7f, "High Surrogates"},
1184   {0xdb80, 0xdbff, "High Private Use Surrogates"},
1185   {0xdc00, 0xdfff, "Low Surrogates"},
1186   {0xe000, 0xf8ff, "Private Use"},
1187   {0xf900, 0xfaff, "CJK Compatibility Ideographs"},
1188   {0xfb00, 0xfb4f, "Alphabetic Presentation Forms"},
1189   {0xfb50, 0xfdff, "Arabic Presentation Forms-A"},
1190   {0xfe20, 0xfe2f, "Combining Half Marks"},
1191   {0xfe30, 0xfe4f, "CJK Compatibility Forms"},
1192   {0xfe50, 0xfe6f, "Small Form Variants"},
1193   {0xfe70, 0xfeff, "Arabic Presentation Forms-B"},
1194   {0xff00, 0xffef, "Halfwidth and Fullwidth Forms"},
1195   {0xfff0, 0xffff, "Specials"}
1196   // FIXME: add ranges for the Unicode planes 1 to 16
1197};
1198
10781199const char * unicode_range_name(unicode_char uchar)
10791200{
1080   static UINT32 hit = 0;
1081   UINT32 i;
1201   int _min = 0;
1202   int _max = sizeof(unicode_ranges) / sizeof(unicode_ranges[0]) - 1;
1203   int _mid;
10821204
1083   for (i = hit; i < sizeof(unicode_ranges)/sizeof(unicode_ranges[0]); i++)
1205   /* binary search in table of unicode ranges */
1206   while (_max >= _min)
10841207   {
1085      if (unicode_ranges[i].first <= uchar && uchar <= unicode_ranges[i].last)
1086      {
1087         hit = i;
1088         return unicode_ranges[i].name;
1089      }
1208      _mid = (_min + _max) / 2;
1209      if (unicode_ranges[_mid].last < uchar)
1210         _min = _mid + 1;
1211      else if (unicode_ranges[_mid].first > uchar)
1212         _max = _mid - 1;
1213      else if (unicode_ranges[_mid].first <= uchar && unicode_ranges[_mid].last >= uchar)
1214         return unicode_ranges[_mid].name;
10901215   }
1091   for (i = 0; i < hit; i++)
1092   {
1093      if (unicode_ranges[i].first <= uchar && uchar <= unicode_ranges[i].last)
1094      {
1095         hit = i;
1096         return unicode_ranges[i].name;
1097      }
1098   }
1099
11001216   return NULL;
11011217}
11021218
11031219unicode_char unicode_range_first(unicode_char uchar)
11041220{
1105   static UINT32 hit = 0;
1106   UINT32 i;
1221   int _min = 0;
1222   int _max = sizeof(unicode_ranges) / sizeof(unicode_ranges[0]) - 1;
1223   int _mid;
11071224
1108   for (i = hit; i < sizeof(unicode_ranges)/sizeof(unicode_ranges[0]); i++)
1225   /* binary search in table of unicode ranges */
1226   while (_max >= _min)
11091227   {
1110      if (unicode_ranges[i].first <= uchar && uchar <= unicode_ranges[i].last)
1111      {
1112         hit = i;
1113         return unicode_ranges[i].first;
1114      }
1228      _mid = (_min + _max) / 2;
1229      if (unicode_ranges[_mid].last < uchar)
1230         _min = _mid + 1;
1231      else if (unicode_ranges[_mid].first > uchar)
1232         _max = _mid - 1;
1233      else if (unicode_ranges[_mid].first <= uchar && unicode_ranges[_mid].last >= uchar)
1234         return unicode_ranges[_mid].first;
11151235   }
1116   for (i = 0; i < hit; i++)
1117   {
1118      if (unicode_ranges[i].first <= uchar && uchar <= unicode_ranges[i].last)
1119      {
1120         hit = i;
1121         return unicode_ranges[i].first;
1122      }
1123   }
1124
11251236   return uchar;
11261237}
11271238
11281239unicode_char unicode_range_last(unicode_char uchar)
11291240{
1130   static UINT32 hit = 0;
1131   UINT32 i;
1241   int _min = 0;
1242   int _max = sizeof(unicode_ranges) / sizeof(unicode_ranges[0]) - 1;
1243   int _mid;
11321244
1133   for (i = hit; i < sizeof(unicode_ranges)/sizeof(unicode_ranges[0]); i++)
1245   /* binary search in table of unicode ranges */
1246   while (_max >= _min)
11341247   {
1135      if (unicode_ranges[i].first <= uchar && uchar <= unicode_ranges[i].last)
1136      {
1137         hit = i;
1138         return unicode_ranges[i].last;
1139      }
1248      _mid = (_min + _max) / 2;
1249      if (unicode_ranges[_mid].last < uchar)
1250         _min = _mid + 1;
1251      else if (unicode_ranges[_mid].first > uchar)
1252         _max = _mid - 1;
1253      else if (unicode_ranges[_mid].first <= uchar && unicode_ranges[_mid].last >= uchar)
1254         return unicode_ranges[_mid].last;
11401255   }
1141   for (i = 0; i < hit; i++)
1142   {
1143      if (unicode_ranges[i].first <= uchar && uchar <= unicode_ranges[i].last)
1144      {
1145         hit = i;
1146         return unicode_ranges[i].last;
1147      }
1148   }
1149
11501256   return uchar;
11511257}
11521258#endif
branches/alto2/src/lib/util/unicode.h
r26196r26197
145145//! convert an unicode character into a UTF-16 sequence with flipped endianness
146146int utf16f_from_uchar(utf16_char *utf16string, size_t count, unicode_char uchar);
147147
148//! return the number of decoded Unicode values in UTF-8 encoded string
149size_t utf8_ucharlen(const char* utf8src, size_t * plen = 0);
150
151//! return the number of decoded Unicode values in UTF-16 encoded string
152size_t utf16_ucharlen(const utf16_char* utf16src, size_t * plen = 0);
153
148154/* misc UTF-8 helpers */
149155//! return a pointer to the previous character in a string
150156const char *utf8_previous_char(const char *utf8string);
r26196r26197
152158//! return true if the given string is a properly formed sequence of UTF-8 characters
153159int utf8_is_valid_string(const char *utf8string);
154160
155//! return the number of decoded Unicode values in UTF-8 encoded string
156size_t utf8_strlen(const char* src);
157
158161/* 8 bit code to Unicode value lookup table handling (e.g. ISO-8859-1 aka Latin1) */
159162//! load a table translating UINT8 (unsigned char) to Unicode values
160163unicode_char * uchar_table_load(const char* name);
r26196r26197
165168//! free a unicode table
166169void uchar_table_free(unicode_char* table);
167170
171//! return an unicode_char array allocated while converted from UTF-8
172unicode_char* ustring_from_utf8(const char *utf8char);
173
174//! return an unicode_char array allocated while converted from UTF-16
175unicode_char* ustring_from_utf16(const utf16_char *utf16char);
176
168177/* unicode_char array functions - string.h like */
169178//! return the unicode_char array length
170179size_t uchar_strlen(const unicode_char* src);
r26196r26197
178187//! print a formatted string of ASCII characters to an unicode_char array (max 256 characters)
179188int uchar_sprintf(unicode_char* dst, const char* format, ...);
180189
190//! print a formatted string of ASCII characters to an unicode_char array (max size characters)
191int uchar_snprintf(unicode_char* dst, size_t size, const char* format, ...);
192
181193//! copy an array of unicode_char from source to destination
182194unicode_char* uchar_strcpy(unicode_char* dst, const unicode_char* src);
183195

Previous 199869 Revisions Next


© 1997-2024 The MAME Team