MAME SVN History

199869 Revisions

r26197 Saturday 16th November, 2013 at 16:22:15 UTC by Jürgen Buchmüller
Rename some unicode functions for perceivability

[/branches/alto2/src/emu/debug]	textbuf.c
[/branches/alto2/src/lib/util]	unicode.c unicode.h

branches/alto2/src/emu/debug/textbuf.c
r26196	r26197
183	183	INT32 needed_space;
184	184
185	185	/* we need to ensure there is enough space for this string plus enough for the max line length */
186		needed_space = utf8_strlen(data) + MAX_LINE_LENGTH;
	186	needed_space = utf8_ucharlen(data) + MAX_LINE_LENGTH;
187	187
188	188	/* make space in the buffer if we need to */
189	189	while (buffer_space(text) < needed_space && text->linestart != text->lineend)

branches/alto2/src/lib/util/unicode.c
r26196	r26197
318	318	return rc < count ? rc : count;
319	319	}
320	320
	321	/**
	322	* @brief return the number of decoded Unicode values in UTF-8 encoded string
	323	* @param src pointer to the array of UTF-8 encoded characters
	324	* @param plen optional pointer to a size_t variable to receive the source string length
	325	* @return number of unicode_char values decoded from the UTF-8 string
	326	*/
	327	size_t utf8_ucharlen(const char* utf8src, size_t * plen)
	328	{
	329	size_t len = 0;
	330	size_t total = 0;
	331	while (*utf8src) {
	332	unsigned char c = (unsigned char) *utf8src;
	333	size_t auxlen;
321	334
	335	/* determine how many additional bytes we need */
	336	if (c < 0x80)
	337	{
	338	/* unicode char 0x00000000 - 0x0000007F */
	339	auxlen = 0;
	340	}
	341	else if (c >= 0xc0 && c < 0xe0)
	342	{
	343	/* unicode char 0x00000080 - 0x000007FF */
	344	if (0 == utf8src[1])
	345	return -1;
	346	auxlen = 1;
	347	}
	348	else if (c >= 0xe0 && c < 0xf0)
	349	{
	350	/* unicode char 0x00000800 - 0x0000FFFF */
	351	if (0 == utf8src[1] || 0 == utf8src[2])
	352	return -1;
	353	auxlen = 2;
	354	}
	355	else if (c >= 0xf0 && c < 0xf8)
	356	{
	357	/* unicode char 0x00010000 - 0x001FFFFF */
	358	if (0 == utf8src[1] || 0 == utf8src[2] || 0 == utf8src[3])
	359	return -1;
	360	auxlen = 3;
	361	}
	362	else if (c >= 0xf8 && c < 0xfc)
	363	{
	364	/* unicode char 0x00200000 - 0x03FFFFFF */
	365	if (0 == utf8src[1] || 0 == utf8src[2] || 0 == utf8src[3] || 0 == utf8src[4])
	366	return -1;
	367	auxlen = 4;
	368	}
	369	else if (c >= 0xfc && c < 0xfe)
	370	{
	371	/* unicode char 0x04000000 - 0x7FFFFFFF */
	372	if (0 == utf8src[1] || 0 == utf8src[2] || 0 == utf8src[3] || 0 == utf8src[4] || 0 == utf8src[5])
	373	return -1;
	374	auxlen = 5;
	375	}
	376	else
	377	{
	378	/* invalid */
	379	return -1;
	380	}
	381	total++;
	382	len += auxlen + 1;
	383	utf8src += auxlen + 1;
	384	}
	385	if (plen)
	386	*plen = len;
	387	return total;
	388	}
	389
322	390	/**
	391	* @brief return the number of decoded Unicode values in UTF-16 encoded string
	392	* @param src pointer to the array of UTF-16 encoded characters
	393	* @param plen optional pointer to a size_t variable to receive the source string length
	394	* @return number of unicode_char values decoded from the UTF-8 string
	395	*/
	396	size_t utf16_ucharlen(const utf16_char* utf16src, size_t * plen)
	397	{
	398	size_t len = 0;
	399	size_t total = 0;
	400	while (*utf16src) {
	401	utf16_char c = (utf16_char) *utf16src;
	402	size_t auxlen;
	403
	404	if (c >= 0xd800 && c <= 0xdbff)
	405	{
	406	if (0 == utf16src[1])
	407	return -1;
	408	auxlen = 1;
	409	}
	410	else if (utf16src[0] < 0xdc00 || utf16src[0] > 0xdfff)
	411	{
	412	auxlen = 0;
	413	}
	414	else
	415	return -1;
	416	total++;
	417	len += auxlen + 1;
	418	utf16src += auxlen + 1;
	419	}
	420	if (plen)
	421	*plen = len;
	422	return total;
	423	}
	424
	425	/**
323	426	* @brief return a pointer to the previous character in a string
324	427	* @param utf8string const pointer to the starting position in the string
325	428	* @return pointer to the character which is not an UTF-8 auxiliary character
r26196	r26197
359	462	}
360	463
361	464	/**
362		* @brief return the number of decoded Unicode values in UTF-8 encoded string
363		* @param src pointer to the array of UTF-8 encoded characters
364		* @return number of unicode_char values decoded from the UTF-8 string
365		*/
366		size_t utf8_strlen(const char* src)
367		{
368		int total = 0;
369		while (*src) {
370		unicode_char uchar;
371		int len = uchar_from_utf8(&uchar, src, strlen(src));
372		if (len < 0)
373		break; // invalid UTF-8
374		total++;
375		src += len;
376		}
377		return total;
378		}
379
380		/**
381	465	* @brief load a lookup table 8 bit codes to Unicode values
382	466	*
383	467	* This opens and reads a file %name which has to be in the
r26196	r26197
450	534	}
451	535
452	536	/**
	537	* @brief return an unicode_char array allocated while converted from UTF-8
	538	* @param utf8char source string encoded in UTF-8
	539	* @return newly allocated unicode_char string
	540	*/
	541	unicode_char* uchar_strfrom_utf8(const char *utf8src)
	542	{
	543	size_t available;
	544	size_t size = utf8_ucharlen(utf8src, &available);
	545	if (-1 == size)
	546	return NULL;
	547	unicode_char* result = (unicode_char *)calloc(sizeof(unicode_char), size + 1);
	548	unicode_char* dst = result;
	549	while (*utf8src) {
	550	unicode_char uchar;
	551	int len = uchar_from_utf8(&uchar, utf8src, available);
	552	utf8src += len;
	553	available -= len;
	554	*dst++ = uchar;
	555	}
	556	return result;
	557	}
	558
	559	/**
	560	* @brief return an unicode_char array allocated while converted from UTF-16
	561	* @param utf16src source string encoded in UTF-16
	562	* @return newly allocated unicode_char string
	563	*/
	564	unicode_char* uchar_strfrom_utf16(const utf16_char *utf16src)
	565	{
	566	size_t available;
	567	size_t size = utf16_ucharlen(utf16src, &available);
	568	if (-1 == size)
	569	return NULL;
	570	unicode_char* result = (unicode_char *)calloc(sizeof(unicode_char), size + 1);
	571	unicode_char* dst = result;
	572	while (*utf16src) {
	573	unicode_char uchar;
	574	int len = uchar_from_utf16(&uchar, utf16src, available);
	575	utf16src += len;
	576	available -= len;
	577	*dst++ = uchar;
	578	}
	579	return result;
	580	}
	581
	582	/**
453	583	* @brief return the unicode_char array length
454	584	* @param src pointer to an array of unicode_char
455	585	* @return length of the array until the first 0
r26196	r26197
617	747
618	748	static unicode_data_t** unicode_data = NULL;
619	749
620		#if NEED_UNICODE_RANGES
621		typedef struct {
622		unicode_char first, last;
623		const char *name;
624		} unicode_range_t;
625
626		static const unicode_range_t unicode_ranges[] =
627		{
628		{0x0000, 0x007f, "Basic Latin"},
629		{0x0080, 0x00ff, "Latin-1 Supplement"},
630		{0x0100, 0x017f, "Latin Extended-A"},
631		{0x0180, 0x024f, "Latin Extended-B"},
632		{0x0250, 0x02af, "IPA Extensions"},
633		{0x02b0, 0x02ff, "Spacing Modifier Letters"},
634		{0x0300, 0x036f, "Combining Diacritical Marks"},
635		{0x0370, 0x03ff, "Greek"},
636		{0x0400, 0x04ff, "Cyrillic"},
637		{0x0530, 0x058f, "Armenian"},
638		{0x0590, 0x05ff, "Hebrew"},
639		{0x0600, 0x06ff, "Arabic"},
640		{0x0700, 0x074f, "Syriac"},
641		{0x0780, 0x07bf, "Thaana"},
642		{0x0900, 0x097f, "Devanagari"},
643		{0x0980, 0x09ff, "Bengali"},
644		{0x0a00, 0x0a7f, "Gurmukhi"},
645		{0x0a80, 0x0aff, "Gujarati"},
646		{0x0b00, 0x0b7f, "Oriya"},
647		{0x0b80, 0x0bff, "Tamil"},
648		{0x0c00, 0x0c7f, "Telugu"},
649		{0x0c80, 0x0cff, "Kannada"},
650		{0x0d00, 0x0d7f, "Malayalam"},
651		{0x0d80, 0x0dff, "Sinhala"},
652		{0x0e00, 0x0e7f, "Thai"},
653		{0x0e80, 0x0eff, "Lao"},
654		{0x0f00, 0x0fff, "Tibetan"},
655		{0x1000, 0x109f, "Myanmar"},
656		{0x10a0, 0x10ff, "Georgian"},
657		{0x1100, 0x11ff, "Hangul Jamo"},
658		{0x1200, 0x137f, "Ethiopic"},
659		{0x13a0, 0x13ff, "Cherokee"},
660		{0x1400, 0x167f, "Unified Canadian Aboriginal Syllabic"},
661		{0x1680, 0x169f, "Ogham"},
662		{0x16a0, 0x16ff, "Runic"},
663		{0x1780, 0x17ff, "Khmer"},
664		{0x1800, 0x18af, "Mongolian"},
665		{0x1e00, 0x1eff, "Latin Extended Additional"},
666		{0x1f00, 0x1fff, "Greek Extended"},
667		{0x2000, 0x206f, "General Punctuation"},
668		{0x2070, 0x208f, "Superscripts and Subscripts"},
669		{0x20a0, 0x20cf, "Currency Symbols"},
670		{0x20d0, 0x20ff, "Combining Marks for Symbols"},
671		{0x2100, 0x214f, "Letterlike Symbols"},
672		{0x2150, 0x218f, "Number Forms"},
673		{0x2190, 0x21ff, "Arrows"},
674		{0x2200, 0x22ff, "Mathematical Operators"},
675		{0x2300, 0x23ff, "Miscellaneous Technical"},
676		{0x2400, 0x243f, "Control Pictures"},
677		{0x2440, 0x245f, "Optical Character Recognition"},
678		{0x2460, 0x24ff, "Enclosed Alphanumerics"},
679		{0x2500, 0x257f, "Box Drawing"},
680		{0x2580, 0x259f, "Block Elements"},
681		{0x25a0, 0x25ff, "Geometric Shapes"},
682		{0x2600, 0x26ff, "Miscellaneous Symbols"},
683		{0x2700, 0x27bf, "Dingbats"},
684		{0x2800, 0x28ff, "Braille Patterns"},
685		{0x2e80, 0x2eff, "CJK Radicals Supplement"},
686		{0x2f00, 0x2fdf, "Kangxi Radicals"},
687		{0x2ff0, 0x2fff, "Ideographic Description Characters"},
688		{0x3000, 0x303f, "CJK Symbols and Punctuation"},
689		{0x3040, 0x309f, "Hiragana"},
690		{0x30a0, 0x30ff, "Katakana"},
691		{0x3100, 0x312f, "Bopomofo"},
692		{0x3130, 0x318f, "Hangul Compatibility Jamo"},
693		{0x3190, 0x319f, "Kanbun"},
694		{0x31a0, 0x31bf, "Bopomofo Extended"},
695		{0x3200, 0x32ff, "Enclosed CJK Letters and Months"},
696		{0x3300, 0x33ff, "CJK Compatibility"},
697		{0x3400, 0x4dbf, "CJK Unified Ideographs Extension A"},
698		{0x4e00, 0x9faf, "CJK Unified Ideographs"},
699		{0xa000, 0xa48f, "Yi Syllables"},
700		{0xa490, 0xa4cf, "Yi Radicals"},
701		{0xac00, 0xd7af, "Hangul Syllables"},
702		{0xd800, 0xdb7f, "High Surrogates"},
703		{0xdb80, 0xdbff, "High Private Use Surrogates"},
704		{0xdc00, 0xdfff, "Low Surrogates"},
705		{0xe000, 0xf8ff, "Private Use"},
706		{0xf900, 0xfaff, "CJK Compatibility Ideographs"},
707		{0xfb00, 0xfb4f, "Alphabetic Presentation Forms"},
708		{0xfb50, 0xfdff, "Arabic Presentation Forms-A"},
709		{0xfe20, 0xfe2f, "Combining Half Marks"},
710		{0xfe30, 0xfe4f, "CJK Compatibility Forms"},
711		{0xfe50, 0xfe6f, "Small Form Variants"},
712		{0xfe70, 0xfeff, "Arabic Presentation Forms-B"},
713		{0xff00, 0xffef, "Halfwidth and Fullwidth Forms"},
714		{0xfff0, 0xffff, "Specials"}
715		// FIXME: add ranges for the Unicode planes 1 to 16
716		};
717		#endif
718
719		#if NEED_UNICODE_CCOM
720		static const char *canonical_combining_str(UINT8 val)
721		{
722		switch (val)
723		{
724		case 0: return "Spacing, split, enclosing, reordrant, and Tibetan subjoined";
725		case 1: return "Overlays and interior";
726		case 7: return "Nuktas";
727		case 8: return "Hiragana/Katakana voicing marks";
728		case 9: return "Viramas";
729		case 10: return "Start of fixed position classes";
730		case 199: return "End of fixed position classes";
731		case 200: return "Below left attached";
732		case 202: return "Below attached";
733		case 204: return "Below right attached";
734		case 208: return "Left attached (reordrant around single base character)";
735		case 210: return "Right attached";
736		case 212: return "Above left attached";
737		case 214: return "Above attached";
738		case 216: return "Above right attached";
739		case 218: return "Below left";
740		case 220: return "Below";
741		case 222: return "Below right";
742		case 224: return "Left (reordrant around single base character)";
743		case 226: return "Right";
744		case 228: return "Above left";
745		case 230: return "Above";
746		case 232: return "Above right";
747		case 233: return "Double below";
748		case 234: return "Double above";
749		case 240: return "Below (iota subscript)";
750		}
751		return "INVALID";
752		}
753		#endif
754
755	750	#if NEED_UNICODE_NAME
756	751	const char * unicode_name(unicode_char uchar)
757	752	{
r26196	r26197
832	827	{
833	828	if (!unicode_data || uchar >= UNICODE_PLANESIZE || !unicode_data[uchar])
834	829	return "";
835		return canonical_combining_str(unicode_data[uchar]->canonical_comb);
	830	switch (unicode_data[uchar]->canonical_comb)
	831	{
	832	case 0: return "Spacing, split, enclosing, reordrant, and Tibetan subjoined";
	833	case 1: return "Overlays and interior";
	834	case 7: return "Nuktas";
	835	case 8: return "Hiragana/Katakana voicing marks";
	836	case 9: return "Viramas";
	837	case 10: return "Start of fixed position classes";
	838	case 199: return "End of fixed position classes";
	839	case 200: return "Below left attached";
	840	case 202: return "Below attached";
	841	case 204: return "Below right attached";
	842	case 208: return "Left attached (reordrant around single base character)";
	843	case 210: return "Right attached";
	844	case 212: return "Above left attached";
	845	case 214: return "Above attached";
	846	case 216: return "Above right attached";
	847	case 218: return "Below left";
	848	case 220: return "Below";
	849	case 222: return "Below right";
	850	case 224: return "Left (reordrant around single base character)";
	851	case 226: return "Right";
	852	case 228: return "Above left";
	853	case 230: return "Above";
	854	case 232: return "Above right";
	855	case 233: return "Double below";
	856	case 234: return "Double above";
	857	case 240: return "Below (iota subscript)";
	858	}
	859	return "INVALID";
836	860	}
837	861	#endif
838	862
r26196	r26197
1075	1099	#endif
1076	1100
1077	1101	#if NEED_UNICODE_RANGES
	1102	typedef struct {
	1103	unicode_char first, last;
	1104	const char *name;
	1105	} unicode_range_t;
	1106
	1107	static const unicode_range_t unicode_ranges[] =
	1108	{
	1109	{0x0000, 0x007f, "Basic Latin"},
	1110	{0x0080, 0x00ff, "Latin-1 Supplement"},
	1111	{0x0100, 0x017f, "Latin Extended-A"},
	1112	{0x0180, 0x024f, "Latin Extended-B"},
	1113	{0x0250, 0x02af, "IPA Extensions"},
	1114	{0x02b0, 0x02ff, "Spacing Modifier Letters"},
	1115	{0x0300, 0x036f, "Combining Diacritical Marks"},
	1116	{0x0370, 0x03ff, "Greek"},
	1117	{0x0400, 0x04ff, "Cyrillic"},
	1118	{0x0530, 0x058f, "Armenian"},
	1119	{0x0590, 0x05ff, "Hebrew"},
	1120	{0x0600, 0x06ff, "Arabic"},
	1121	{0x0700, 0x074f, "Syriac"},
	1122	{0x0780, 0x07bf, "Thaana"},
	1123	{0x0900, 0x097f, "Devanagari"},
	1124	{0x0980, 0x09ff, "Bengali"},
	1125	{0x0a00, 0x0a7f, "Gurmukhi"},
	1126	{0x0a80, 0x0aff, "Gujarati"},
	1127	{0x0b00, 0x0b7f, "Oriya"},
	1128	{0x0b80, 0x0bff, "Tamil"},
	1129	{0x0c00, 0x0c7f, "Telugu"},
	1130	{0x0c80, 0x0cff, "Kannada"},
	1131	{0x0d00, 0x0d7f, "Malayalam"},
	1132	{0x0d80, 0x0dff, "Sinhala"},
	1133	{0x0e00, 0x0e7f, "Thai"},
	1134	{0x0e80, 0x0eff, "Lao"},
	1135	{0x0f00, 0x0fff, "Tibetan"},
	1136	{0x1000, 0x109f, "Myanmar"},
	1137	{0x10a0, 0x10ff, "Georgian"},
	1138	{0x1100, 0x11ff, "Hangul Jamo"},
	1139	{0x1200, 0x137f, "Ethiopic"},
	1140	{0x13a0, 0x13ff, "Cherokee"},
	1141	{0x1400, 0x167f, "Unified Canadian Aboriginal Syllabic"},
	1142	{0x1680, 0x169f, "Ogham"},
	1143	{0x16a0, 0x16ff, "Runic"},
	1144	{0x1780, 0x17ff, "Khmer"},
	1145	{0x1800, 0x18af, "Mongolian"},
	1146	{0x1e00, 0x1eff, "Latin Extended Additional"},
	1147	{0x1f00, 0x1fff, "Greek Extended"},
	1148	{0x2000, 0x206f, "General Punctuation"},
	1149	{0x2070, 0x208f, "Superscripts and Subscripts"},
	1150	{0x20a0, 0x20cf, "Currency Symbols"},
	1151	{0x20d0, 0x20ff, "Combining Marks for Symbols"},
	1152	{0x2100, 0x214f, "Letterlike Symbols"},
	1153	{0x2150, 0x218f, "Number Forms"},
	1154	{0x2190, 0x21ff, "Arrows"},
	1155	{0x2200, 0x22ff, "Mathematical Operators"},
	1156	{0x2300, 0x23ff, "Miscellaneous Technical"},
	1157	{0x2400, 0x243f, "Control Pictures"},
	1158	{0x2440, 0x245f, "Optical Character Recognition"},
	1159	{0x2460, 0x24ff, "Enclosed Alphanumerics"},
	1160	{0x2500, 0x257f, "Box Drawing"},
	1161	{0x2580, 0x259f, "Block Elements"},
	1162	{0x25a0, 0x25ff, "Geometric Shapes"},
	1163	{0x2600, 0x26ff, "Miscellaneous Symbols"},
	1164	{0x2700, 0x27bf, "Dingbats"},
	1165	{0x2800, 0x28ff, "Braille Patterns"},
	1166	{0x2e80, 0x2eff, "CJK Radicals Supplement"},
	1167	{0x2f00, 0x2fdf, "Kangxi Radicals"},
	1168	{0x2ff0, 0x2fff, "Ideographic Description Characters"},
	1169	{0x3000, 0x303f, "CJK Symbols and Punctuation"},
	1170	{0x3040, 0x309f, "Hiragana"},
	1171	{0x30a0, 0x30ff, "Katakana"},
	1172	{0x3100, 0x312f, "Bopomofo"},
	1173	{0x3130, 0x318f, "Hangul Compatibility Jamo"},
	1174	{0x3190, 0x319f, "Kanbun"},
	1175	{0x31a0, 0x31bf, "Bopomofo Extended"},
	1176	{0x3200, 0x32ff, "Enclosed CJK Letters and Months"},
	1177	{0x3300, 0x33ff, "CJK Compatibility"},
	1178	{0x3400, 0x4dbf, "CJK Unified Ideographs Extension A"},
	1179	{0x4e00, 0x9faf, "CJK Unified Ideographs"},
	1180	{0xa000, 0xa48f, "Yi Syllables"},
	1181	{0xa490, 0xa4cf, "Yi Radicals"},
	1182	{0xac00, 0xd7af, "Hangul Syllables"},
	1183	{0xd800, 0xdb7f, "High Surrogates"},
	1184	{0xdb80, 0xdbff, "High Private Use Surrogates"},
	1185	{0xdc00, 0xdfff, "Low Surrogates"},
	1186	{0xe000, 0xf8ff, "Private Use"},
	1187	{0xf900, 0xfaff, "CJK Compatibility Ideographs"},
	1188	{0xfb00, 0xfb4f, "Alphabetic Presentation Forms"},
	1189	{0xfb50, 0xfdff, "Arabic Presentation Forms-A"},
	1190	{0xfe20, 0xfe2f, "Combining Half Marks"},
	1191	{0xfe30, 0xfe4f, "CJK Compatibility Forms"},
	1192	{0xfe50, 0xfe6f, "Small Form Variants"},
	1193	{0xfe70, 0xfeff, "Arabic Presentation Forms-B"},
	1194	{0xff00, 0xffef, "Halfwidth and Fullwidth Forms"},
	1195	{0xfff0, 0xffff, "Specials"}
	1196	// FIXME: add ranges for the Unicode planes 1 to 16
	1197	};
	1198
1078	1199	const char * unicode_range_name(unicode_char uchar)
1079	1200	{
1080		static UINT32 hit = 0;
1081		UINT32 i;
	1201	int _min = 0;
	1202	int _max = sizeof(unicode_ranges) / sizeof(unicode_ranges[0]) - 1;
	1203	int _mid;
1082	1204
1083		for (i = hit; i < sizeof(unicode_ranges)/sizeof(unicode_ranges[0]); i++)
	1205	/* binary search in table of unicode ranges */
	1206	while (_max >= _min)
1084	1207	{
1085		if (unicode_ranges[i].first <= uchar && uchar <= unicode_ranges[i].last)
1086		{
1087		hit = i;
1088		return unicode_ranges[i].name;
1089		}
	1208	_mid = (_min + _max) / 2;
	1209	if (unicode_ranges[_mid].last < uchar)
	1210	_min = _mid + 1;
	1211	else if (unicode_ranges[_mid].first > uchar)
	1212	_max = _mid - 1;
	1213	else if (unicode_ranges[_mid].first <= uchar && unicode_ranges[_mid].last >= uchar)
	1214	return unicode_ranges[_mid].name;
1090	1215	}
1091		for (i = 0; i < hit; i++)
1092		{
1093		if (unicode_ranges[i].first <= uchar && uchar <= unicode_ranges[i].last)
1094		{
1095		hit = i;
1096		return unicode_ranges[i].name;
1097		}
1098		}
1099
1100	1216	return NULL;
1101	1217	}
1102	1218
1103	1219	unicode_char unicode_range_first(unicode_char uchar)
1104	1220	{
1105		static UINT32 hit = 0;
1106		UINT32 i;
	1221	int _min = 0;
	1222	int _max = sizeof(unicode_ranges) / sizeof(unicode_ranges[0]) - 1;
	1223	int _mid;
1107	1224
1108		for (i = hit; i < sizeof(unicode_ranges)/sizeof(unicode_ranges[0]); i++)
	1225	/* binary search in table of unicode ranges */
	1226	while (_max >= _min)
1109	1227	{
1110		if (unicode_ranges[i].first <= uchar && uchar <= unicode_ranges[i].last)
1111		{
1112		hit = i;
1113		return unicode_ranges[i].first;
1114		}
	1228	_mid = (_min + _max) / 2;
	1229	if (unicode_ranges[_mid].last < uchar)
	1230	_min = _mid + 1;
	1231	else if (unicode_ranges[_mid].first > uchar)
	1232	_max = _mid - 1;
	1233	else if (unicode_ranges[_mid].first <= uchar && unicode_ranges[_mid].last >= uchar)
	1234	return unicode_ranges[_mid].first;
1115	1235	}
1116		for (i = 0; i < hit; i++)
1117		{
1118		if (unicode_ranges[i].first <= uchar && uchar <= unicode_ranges[i].last)
1119		{
1120		hit = i;
1121		return unicode_ranges[i].first;
1122		}
1123		}
1124
1125	1236	return uchar;
1126	1237	}
1127	1238
1128	1239	unicode_char unicode_range_last(unicode_char uchar)
1129	1240	{
1130		static UINT32 hit = 0;
1131		UINT32 i;
	1241	int _min = 0;
	1242	int _max = sizeof(unicode_ranges) / sizeof(unicode_ranges[0]) - 1;
	1243	int _mid;
1132	1244
1133		for (i = hit; i < sizeof(unicode_ranges)/sizeof(unicode_ranges[0]); i++)
	1245	/* binary search in table of unicode ranges */
	1246	while (_max >= _min)
1134	1247	{
1135		if (unicode_ranges[i].first <= uchar && uchar <= unicode_ranges[i].last)
1136		{
1137		hit = i;
1138		return unicode_ranges[i].last;
1139		}
	1248	_mid = (_min + _max) / 2;
	1249	if (unicode_ranges[_mid].last < uchar)
	1250	_min = _mid + 1;
	1251	else if (unicode_ranges[_mid].first > uchar)
	1252	_max = _mid - 1;
	1253	else if (unicode_ranges[_mid].first <= uchar && unicode_ranges[_mid].last >= uchar)
	1254	return unicode_ranges[_mid].last;
1140	1255	}
1141		for (i = 0; i < hit; i++)
1142		{
1143		if (unicode_ranges[i].first <= uchar && uchar <= unicode_ranges[i].last)
1144		{
1145		hit = i;
1146		return unicode_ranges[i].last;
1147		}
1148		}
1149
1150	1256	return uchar;
1151	1257	}
1152	1258	#endif

branches/alto2/src/lib/util/unicode.h
r26196	r26197
145	145	//! convert an unicode character into a UTF-16 sequence with flipped endianness
146	146	int utf16f_from_uchar(utf16_char *utf16string, size_t count, unicode_char uchar);
147	147
	148	//! return the number of decoded Unicode values in UTF-8 encoded string
	149	size_t utf8_ucharlen(const char* utf8src, size_t * plen = 0);
	150
	151	//! return the number of decoded Unicode values in UTF-16 encoded string
	152	size_t utf16_ucharlen(const utf16_char* utf16src, size_t * plen = 0);
	153
148	154	/* misc UTF-8 helpers */
149	155	//! return a pointer to the previous character in a string
150	156	const char *utf8_previous_char(const char *utf8string);
r26196	r26197
152	158	//! return true if the given string is a properly formed sequence of UTF-8 characters
153	159	int utf8_is_valid_string(const char *utf8string);
154	160
155		//! return the number of decoded Unicode values in UTF-8 encoded string
156		size_t utf8_strlen(const char* src);
157
158	161	/* 8 bit code to Unicode value lookup table handling (e.g. ISO-8859-1 aka Latin1) */
159	162	//! load a table translating UINT8 (unsigned char) to Unicode values
160	163	unicode_char * uchar_table_load(const char* name);
r26196	r26197
165	168	//! free a unicode table
166	169	void uchar_table_free(unicode_char* table);
167	170
	171	//! return an unicode_char array allocated while converted from UTF-8
	172	unicode_char* ustring_from_utf8(const char *utf8char);
	173
	174	//! return an unicode_char array allocated while converted from UTF-16
	175	unicode_char* ustring_from_utf16(const utf16_char *utf16char);
	176
168	177	/* unicode_char array functions - string.h like */
169	178	//! return the unicode_char array length
170	179	size_t uchar_strlen(const unicode_char* src);
r26196	r26197
178	187	//! print a formatted string of ASCII characters to an unicode_char array (max 256 characters)
179	188	int uchar_sprintf(unicode_char* dst, const char* format, ...);
180	189
	190	//! print a formatted string of ASCII characters to an unicode_char array (max size characters)
	191	int uchar_snprintf(unicode_char* dst, size_t size, const char* format, ...);
	192
181	193	//! copy an array of unicode_char from source to destination
182	194	unicode_char* uchar_strcpy(unicode_char* dst, const unicode_char* src);
183	195

199869 Revisions