blob: a81a980384283e8ae010f69c8fb1366fcab76317 [file] [log] [blame]
Denys Vlasenko9f93d622010-01-24 07:44:03 +01001/*
2 * This is an implementation of wcwidth() and wcswidth() (defined in
 * IEEE Std 1003.1-2001) for Unicode.
4 *
5 * http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html
6 * http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html
7 *
8 * In fixed-width output devices, Latin characters all occupy a single
9 * "cell" position of equal width, whereas ideographic CJK characters
10 * occupy two such cells. Interoperability between terminal-line
11 * applications and (teletype-style) character terminals using the
12 * UTF-8 encoding requires agreement on which character should advance
13 * the cursor by how many cell positions. No established formal
14 * standards exist at present on which Unicode character shall occupy
15 * how many cell positions on character terminals. These routines are
16 * a first attempt of defining such behavior based on simple rules
17 * applied to data provided by the Unicode Consortium.
18 *
19 * For some graphical characters, the Unicode standard explicitly
20 * defines a character-cell width via the definition of the East Asian
21 * FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes.
22 * In all these cases, there is no ambiguity about which width a
23 * terminal shall use. For characters in the East Asian Ambiguous (A)
24 * class, the width choice depends purely on a preference of backward
25 * compatibility with either historic CJK or Western practice.
26 * Choosing single-width for these characters is easy to justify as
27 * the appropriate long-term solution, as the CJK practice of
28 * displaying these characters as double-width comes from historic
29 * implementation simplicity (8-bit encoded characters were displayed
30 * single-width and 16-bit ones double-width, even for Greek,
31 * Cyrillic, etc.) and not any typographic considerations.
32 *
33 * Much less clear is the choice of width for the Not East Asian
34 * (Neutral) class. Existing practice does not dictate a width for any
35 * of these characters. It would nevertheless make sense
36 * typographically to allocate two character cells to characters such
37 * as for instance EM SPACE or VOLUME INTEGRAL, which cannot be
38 * represented adequately with a single-width glyph. The following
39 * routines at present merely assign a single-cell width to all
40 * neutral characters, in the interest of simplicity. This is not
41 * entirely satisfactory and should be reconsidered before
42 * establishing a formal standard in this area. At the moment, the
43 * decision which Not East Asian (Neutral) characters should be
44 * represented by double-width glyphs cannot yet be answered by
45 * applying a simple rule from the Unicode database content. Setting
46 * up a proper standard for the behavior of UTF-8 character terminals
47 * will require a careful analysis not only of each Unicode character,
48 * but also of each presentation form, something the author of these
49 * routines has avoided to do so far.
50 *
51 * http://www.unicode.org/unicode/reports/tr11/
52 *
53 * Markus Kuhn -- 2007-05-26 (Unicode 5.0)
54 *
55 * Permission to use, copy, modify, and distribute this software
56 * for any purpose and without fee is hereby granted. The author
57 * disclaims all warranties with regard to this software.
58 *
59 * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
60 */
61
Denys Vlasenko40e4e882010-01-31 16:04:30 +010062/* Assigned Unicode character ranges:
63 * Plane Range
64 * 0 0000–FFFF Basic Multilingual Plane
65 * 1 10000–1FFFF Supplementary Multilingual Plane
66 * 2 20000–2FFFF Supplementary Ideographic Plane
67 * 3 30000-3FFFF Tertiary Ideographic Plane (no chars assigned yet)
68 * 4-13 40000–DFFFF currently unassigned
69 * 14 E0000–EFFFF Supplementary Special-purpose Plane
70 * 15 F0000–FFFFF Supplementary Private Use Area-A
71 * 16 100000–10FFFF Supplementary Private Use Area-B
72 *
73 * "Supplementary Special-purpose Plane currently contains non-graphical
74 * characters in two blocks of 128 and 240 characters. The first block
75 * is for language tag characters for use when language cannot be indicated
76 * through other protocols (such as the xml:lang attribute in XML).
77 * The other block contains glyph variation selectors to indicate
78 * an alternate glyph for a character that cannot be determined by context."
79 *
80 * In simpler terms: it is a tool to fix the "Han unification" mess
81 * created by Unicode committee, to select Chinese/Japanese/Korean/Taiwan
82 * version of a character. (They forgot that the whole purpose of the Unicode
83 * was to be able to write all chars in one charset without such tricks).
84 * Until East Asian users say it is actually necessary to support these
85 * code points in console applications like busybox
86 * (i.e. do these chars ever appear in filenames, hostnames, text files
87 * and such?), we are treating these code points as invalid.
88 *
89 * Tertiary Ideographic Plane is also ignored for now,
90 * until Unicode committee assigns something there.
91 */
92
/*
 * LAST_SUPPORTED_WCHAR is the highest code point value the code below
 * is compiled to handle.
 *
 * A configured CONFIG_LAST_SUPPORTED_WCHAR in the range 126..0x2ffff
 * is used as-is; any value outside that range (below 126, or 0x30000
 * and above) selects the maximum, 0x2ffff.
 */
#if CONFIG_LAST_SUPPORTED_WCHAR >= 126 && CONFIG_LAST_SUPPORTED_WCHAR < 0x30000
# define LAST_SUPPORTED_WCHAR CONFIG_LAST_SUPPORTED_WCHAR
#else
# define LAST_SUPPORTED_WCHAR 0x2ffff
#endif
98
Denys Vlasenkob1edf202010-01-31 16:34:37 +010099#if LAST_SUPPORTED_WCHAR >= 0x300
/* One inclusive range [first, last] of 16-bit values. */
struct interval {
	uint16_t first;
	uint16_t last;
};

/*
 * Binary search for ucs in a table of intervals.
 *
 * table must be sorted in ascending order and its intervals must not
 * overlap. max is the index of the LAST element (element count - 1),
 * not the element count.
 *
 * Returns 1 if ucs lies inside one of the intervals, 0 otherwise.
 */
static int in_interval_table(unsigned ucs, const struct interval *table, unsigned max)
{
	unsigned lo = 0;
	unsigned hi = max;

	/* Reject values outside the span covered by the whole table.
	 * This also guarantees that the unsigned "probe - 1" below is never
	 * evaluated with probe == 0.
	 */
	if (ucs < table[0].first || ucs > table[hi].last)
		return 0;

	while (lo <= hi) {
		unsigned probe = (lo + hi) / 2;
		const struct interval *iv = &table[probe];

		if (ucs > iv->last) {
			/* Target is in the upper half */
			lo = probe + 1;
			continue;
		}
		if (ucs < iv->first) {
			/* Target is in the lower half */
			hi = probe - 1;
			continue;
		}
		return 1;
	}
	return 0;
}
126
/*
 * Binary search for ucs in a table of packed 16-bit ranges.
 *
 * Each table element encodes one small inclusive range as
 *   (first << 2) | (last - first)
 * i.e. the upper 14 bits hold the first value and the low 2 bits hold
 * the number of additional values (0..3) that follow it.
 *
 * table must be sorted in ascending order and its ranges must not
 * overlap. max is the index of the LAST element (element count - 1),
 * not the element count.
 *
 * Returns 1 if ucs lies inside one of the ranges, 0 otherwise.
 */
static int in_uint16_table(unsigned ucs, const uint16_t *table, unsigned max)
{
	unsigned min;
	unsigned mid;
	unsigned first, last;

	/* Reject values outside the span covered by the whole table:
	 * below the start of the first range, or above the end of the
	 * LAST range (not the end of the first one, which would make
	 * every entry except table[0] unreachable).
	 * These two guards also ensure that the unsigned "mid - 1" below
	 * is never evaluated with mid == 0.
	 */
	first = table[0] >> 2;
	if (ucs < first)
		return 0;
	last = (table[max] >> 2) + (table[max] & 3);
	if (ucs > last)
		return 0;

	min = 0;
	while (max >= min) {
		mid = (min + max) / 2;
		first = table[mid] >> 2;
		last = first + (table[mid] & 3);
		if (ucs > last)
			min = mid + 1;
		else if (ucs < first)
			max = mid - 1;
		else
			return 1;
	}
	return 0;
}
Denys Vlasenko2edba212010-01-29 09:11:47 +0100152#endif
Denys Vlasenko9f93d622010-01-24 07:44:03 +0100153
154
155/* The following two functions define the column width of an ISO 10646
156 * character as follows:
157 *
158 * - The null character (U+0000) has a column width of 0.
159 *
160 * - Other C0/C1 control characters and DEL will lead to a return
161 * value of -1.
162 *
163 * - Non-spacing and enclosing combining characters (general
164 * category code Mn or Me in the Unicode database) have a
165 * column width of 0.
166 *
167 * - SOFT HYPHEN (U+00AD) has a column width of 1.
168 *
169 * - Other format characters (general category code Cf in the Unicode
170 * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
171 *
172 * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
173 * have a column width of 0.
174 *
175 * - Spacing characters in the East Asian Wide (W) or East Asian
176 * Full-width (F) category as defined in Unicode Technical
177 * Report #11 have a column width of 2.
178 *
179 * - All remaining characters (including all printable
180 * ISO 8859-1 and WGL4 characters, Unicode control characters,
181 * etc.) have a column width of 1.
182 *
183 * This implementation assumes that wchar_t characters are encoded
184 * in ISO 10646.
185 */
186static int wcwidth(unsigned ucs)
187{
Denys Vlasenkob1edf202010-01-31 16:34:37 +0100188#if LAST_SUPPORTED_WCHAR >= 0x300
Denys Vlasenko9f93d622010-01-24 07:44:03 +0100189 /* sorted list of non-overlapping intervals of non-spacing characters */
190 /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
191 static const struct interval combining[] = {
Denys Vlasenko46685a42010-01-25 13:24:06 +0100192#define BIG_(a,b) { a, b },
193#define PAIR(a,b)
194 /* PAIR if < 0x4000 and no more than 4 chars big */
195 BIG_(0x0300, 0x036F)
196 PAIR(0x0483, 0x0486)
197 PAIR(0x0488, 0x0489)
198 BIG_(0x0591, 0x05BD)
199 PAIR(0x05BF, 0x05BF)
200 PAIR(0x05C1, 0x05C2)
201 PAIR(0x05C4, 0x05C5)
202 PAIR(0x05C7, 0x05C7)
203 PAIR(0x0600, 0x0603)
204 BIG_(0x0610, 0x0615)
205 BIG_(0x064B, 0x065E)
206 PAIR(0x0670, 0x0670)
207 BIG_(0x06D6, 0x06E4)
208 PAIR(0x06E7, 0x06E8)
209 PAIR(0x06EA, 0x06ED)
210 PAIR(0x070F, 0x070F)
211 PAIR(0x0711, 0x0711)
212 BIG_(0x0730, 0x074A)
213 BIG_(0x07A6, 0x07B0)
214 BIG_(0x07EB, 0x07F3)
215 PAIR(0x0901, 0x0902)
216 PAIR(0x093C, 0x093C)
217 BIG_(0x0941, 0x0948)
218 PAIR(0x094D, 0x094D)
219 PAIR(0x0951, 0x0954)
220 PAIR(0x0962, 0x0963)
221 PAIR(0x0981, 0x0981)
222 PAIR(0x09BC, 0x09BC)
223 PAIR(0x09C1, 0x09C4)
224 PAIR(0x09CD, 0x09CD)
225 PAIR(0x09E2, 0x09E3)
226 PAIR(0x0A01, 0x0A02)
227 PAIR(0x0A3C, 0x0A3C)
228 PAIR(0x0A41, 0x0A42)
229 PAIR(0x0A47, 0x0A48)
230 PAIR(0x0A4B, 0x0A4D)
231 PAIR(0x0A70, 0x0A71)
232 PAIR(0x0A81, 0x0A82)
233 PAIR(0x0ABC, 0x0ABC)
234 BIG_(0x0AC1, 0x0AC5)
235 PAIR(0x0AC7, 0x0AC8)
236 PAIR(0x0ACD, 0x0ACD)
237 PAIR(0x0AE2, 0x0AE3)
238 PAIR(0x0B01, 0x0B01)
239 PAIR(0x0B3C, 0x0B3C)
240 PAIR(0x0B3F, 0x0B3F)
241 PAIR(0x0B41, 0x0B43)
242 PAIR(0x0B4D, 0x0B4D)
243 PAIR(0x0B56, 0x0B56)
244 PAIR(0x0B82, 0x0B82)
245 PAIR(0x0BC0, 0x0BC0)
246 PAIR(0x0BCD, 0x0BCD)
247 PAIR(0x0C3E, 0x0C40)
248 PAIR(0x0C46, 0x0C48)
249 PAIR(0x0C4A, 0x0C4D)
250 PAIR(0x0C55, 0x0C56)
251 PAIR(0x0CBC, 0x0CBC)
252 PAIR(0x0CBF, 0x0CBF)
253 PAIR(0x0CC6, 0x0CC6)
254 PAIR(0x0CCC, 0x0CCD)
255 PAIR(0x0CE2, 0x0CE3)
256 PAIR(0x0D41, 0x0D43)
257 PAIR(0x0D4D, 0x0D4D)
258 PAIR(0x0DCA, 0x0DCA)
259 PAIR(0x0DD2, 0x0DD4)
260 PAIR(0x0DD6, 0x0DD6)
261 PAIR(0x0E31, 0x0E31)
262 BIG_(0x0E34, 0x0E3A)
263 BIG_(0x0E47, 0x0E4E)
264 PAIR(0x0EB1, 0x0EB1)
265 BIG_(0x0EB4, 0x0EB9)
266 PAIR(0x0EBB, 0x0EBC)
267 BIG_(0x0EC8, 0x0ECD)
268 PAIR(0x0F18, 0x0F19)
269 PAIR(0x0F35, 0x0F35)
270 PAIR(0x0F37, 0x0F37)
271 PAIR(0x0F39, 0x0F39)
272 BIG_(0x0F71, 0x0F7E)
273 BIG_(0x0F80, 0x0F84)
274 PAIR(0x0F86, 0x0F87)
275 PAIR(0x0FC6, 0x0FC6)
276 BIG_(0x0F90, 0x0F97)
277 BIG_(0x0F99, 0x0FBC)
278 PAIR(0x102D, 0x1030)
279 PAIR(0x1032, 0x1032)
280 PAIR(0x1036, 0x1037)
281 PAIR(0x1039, 0x1039)
282 PAIR(0x1058, 0x1059)
283 BIG_(0x1160, 0x11FF)
284 PAIR(0x135F, 0x135F)
285 PAIR(0x1712, 0x1714)
286 PAIR(0x1732, 0x1734)
287 PAIR(0x1752, 0x1753)
288 PAIR(0x1772, 0x1773)
289 PAIR(0x17B4, 0x17B5)
290 BIG_(0x17B7, 0x17BD)
291 PAIR(0x17C6, 0x17C6)
292 BIG_(0x17C9, 0x17D3)
293 PAIR(0x17DD, 0x17DD)
294 PAIR(0x180B, 0x180D)
295 PAIR(0x18A9, 0x18A9)
296 PAIR(0x1920, 0x1922)
297 PAIR(0x1927, 0x1928)
298 PAIR(0x1932, 0x1932)
299 PAIR(0x1939, 0x193B)
300 PAIR(0x1A17, 0x1A18)
301 PAIR(0x1B00, 0x1B03)
302 PAIR(0x1B34, 0x1B34)
303 BIG_(0x1B36, 0x1B3A)
304 PAIR(0x1B3C, 0x1B3C)
305 PAIR(0x1B42, 0x1B42)
306 BIG_(0x1B6B, 0x1B73)
307 BIG_(0x1DC0, 0x1DCA)
308 PAIR(0x1DFE, 0x1DFF)
309 BIG_(0x200B, 0x200F)
310 BIG_(0x202A, 0x202E)
311 PAIR(0x2060, 0x2063)
312 BIG_(0x206A, 0x206F)
313 BIG_(0x20D0, 0x20EF)
314 BIG_(0x302A, 0x302F)
315 PAIR(0x3099, 0x309A)
316 /* Too big to be packed in PAIRs: */
317 { 0xA806, 0xA806 },
318 { 0xA80B, 0xA80B },
319 { 0xA825, 0xA826 },
320 { 0xFB1E, 0xFB1E },
321 { 0xFE00, 0xFE0F },
322 { 0xFE20, 0xFE23 },
323 { 0xFEFF, 0xFEFF },
324 { 0xFFF9, 0xFFFB }
325#undef BIG_
326#undef PAIR
327 };
328 static const uint16_t combining1[] = {
329#define BIG_(a,b)
330#define PAIR(a,b) (a << 2) | (b-a),
331 /* Exact copy-n-paste of the above: */
332 BIG_(0x0300, 0x036F)
333 PAIR(0x0483, 0x0486)
334 PAIR(0x0488, 0x0489)
335 BIG_(0x0591, 0x05BD)
336 PAIR(0x05BF, 0x05BF)
337 PAIR(0x05C1, 0x05C2)
338 PAIR(0x05C4, 0x05C5)
339 PAIR(0x05C7, 0x05C7)
340 PAIR(0x0600, 0x0603)
341 BIG_(0x0610, 0x0615)
342 BIG_(0x064B, 0x065E)
343 PAIR(0x0670, 0x0670)
344 BIG_(0x06D6, 0x06E4)
345 PAIR(0x06E7, 0x06E8)
346 PAIR(0x06EA, 0x06ED)
347 PAIR(0x070F, 0x070F)
348 PAIR(0x0711, 0x0711)
349 BIG_(0x0730, 0x074A)
350 BIG_(0x07A6, 0x07B0)
351 BIG_(0x07EB, 0x07F3)
352 PAIR(0x0901, 0x0902)
353 PAIR(0x093C, 0x093C)
354 BIG_(0x0941, 0x0948)
355 PAIR(0x094D, 0x094D)
356 PAIR(0x0951, 0x0954)
357 PAIR(0x0962, 0x0963)
358 PAIR(0x0981, 0x0981)
359 PAIR(0x09BC, 0x09BC)
360 PAIR(0x09C1, 0x09C4)
361 PAIR(0x09CD, 0x09CD)
362 PAIR(0x09E2, 0x09E3)
363 PAIR(0x0A01, 0x0A02)
364 PAIR(0x0A3C, 0x0A3C)
365 PAIR(0x0A41, 0x0A42)
366 PAIR(0x0A47, 0x0A48)
367 PAIR(0x0A4B, 0x0A4D)
368 PAIR(0x0A70, 0x0A71)
369 PAIR(0x0A81, 0x0A82)
370 PAIR(0x0ABC, 0x0ABC)
371 BIG_(0x0AC1, 0x0AC5)
372 PAIR(0x0AC7, 0x0AC8)
373 PAIR(0x0ACD, 0x0ACD)
374 PAIR(0x0AE2, 0x0AE3)
375 PAIR(0x0B01, 0x0B01)
376 PAIR(0x0B3C, 0x0B3C)
377 PAIR(0x0B3F, 0x0B3F)
378 PAIR(0x0B41, 0x0B43)
379 PAIR(0x0B4D, 0x0B4D)
380 PAIR(0x0B56, 0x0B56)
381 PAIR(0x0B82, 0x0B82)
382 PAIR(0x0BC0, 0x0BC0)
383 PAIR(0x0BCD, 0x0BCD)
384 PAIR(0x0C3E, 0x0C40)
385 PAIR(0x0C46, 0x0C48)
386 PAIR(0x0C4A, 0x0C4D)
387 PAIR(0x0C55, 0x0C56)
388 PAIR(0x0CBC, 0x0CBC)
389 PAIR(0x0CBF, 0x0CBF)
390 PAIR(0x0CC6, 0x0CC6)
391 PAIR(0x0CCC, 0x0CCD)
392 PAIR(0x0CE2, 0x0CE3)
393 PAIR(0x0D41, 0x0D43)
394 PAIR(0x0D4D, 0x0D4D)
395 PAIR(0x0DCA, 0x0DCA)
396 PAIR(0x0DD2, 0x0DD4)
397 PAIR(0x0DD6, 0x0DD6)
398 PAIR(0x0E31, 0x0E31)
399 BIG_(0x0E34, 0x0E3A)
400 BIG_(0x0E47, 0x0E4E)
401 PAIR(0x0EB1, 0x0EB1)
402 BIG_(0x0EB4, 0x0EB9)
403 PAIR(0x0EBB, 0x0EBC)
404 BIG_(0x0EC8, 0x0ECD)
405 PAIR(0x0F18, 0x0F19)
406 PAIR(0x0F35, 0x0F35)
407 PAIR(0x0F37, 0x0F37)
408 PAIR(0x0F39, 0x0F39)
409 BIG_(0x0F71, 0x0F7E)
410 BIG_(0x0F80, 0x0F84)
411 PAIR(0x0F86, 0x0F87)
412 PAIR(0x0FC6, 0x0FC6)
413 BIG_(0x0F90, 0x0F97)
414 BIG_(0x0F99, 0x0FBC)
415 PAIR(0x102D, 0x1030)
416 PAIR(0x1032, 0x1032)
417 PAIR(0x1036, 0x1037)
418 PAIR(0x1039, 0x1039)
419 PAIR(0x1058, 0x1059)
420 BIG_(0x1160, 0x11FF)
421 PAIR(0x135F, 0x135F)
422 PAIR(0x1712, 0x1714)
423 PAIR(0x1732, 0x1734)
424 PAIR(0x1752, 0x1753)
425 PAIR(0x1772, 0x1773)
426 PAIR(0x17B4, 0x17B5)
427 BIG_(0x17B7, 0x17BD)
428 PAIR(0x17C6, 0x17C6)
429 BIG_(0x17C9, 0x17D3)
430 PAIR(0x17DD, 0x17DD)
431 PAIR(0x180B, 0x180D)
432 PAIR(0x18A9, 0x18A9)
433 PAIR(0x1920, 0x1922)
434 PAIR(0x1927, 0x1928)
435 PAIR(0x1932, 0x1932)
436 PAIR(0x1939, 0x193B)
437 PAIR(0x1A17, 0x1A18)
438 PAIR(0x1B00, 0x1B03)
439 PAIR(0x1B34, 0x1B34)
440 BIG_(0x1B36, 0x1B3A)
441 PAIR(0x1B3C, 0x1B3C)
442 PAIR(0x1B42, 0x1B42)
443 BIG_(0x1B6B, 0x1B73)
444 BIG_(0x1DC0, 0x1DCA)
445 PAIR(0x1DFE, 0x1DFF)
446 BIG_(0x200B, 0x200F)
447 BIG_(0x202A, 0x202E)
448 PAIR(0x2060, 0x2063)
449 BIG_(0x206A, 0x206F)
450 BIG_(0x20D0, 0x20EF)
451 BIG_(0x302A, 0x302F)
452 PAIR(0x3099, 0x309A)
453#undef BIG_
454#undef PAIR
455 };
456 struct CHECK {
457#define BIG_(a,b) char big##a[b-a <= 3 ? -1 : 1];
458#define PAIR(a,b) char pair##a[b-a > 3 ? -1 : 1];
459 /* Copy-n-paste it here again to verify correctness */
460#undef BIG_
461#undef PAIR
Denys Vlasenko9f93d622010-01-24 07:44:03 +0100462 };
Denys Vlasenko2edba212010-01-29 09:11:47 +0100463#endif
Denys Vlasenko9f93d622010-01-24 07:44:03 +0100464
465 if (ucs == 0)
466 return 0;
Denys Vlasenkob1edf202010-01-31 16:34:37 +0100467
468 /* Test for 8-bit control characters (00-1f, 80-9f, 7f) */
Denys Vlasenko9f93d622010-01-24 07:44:03 +0100469 if ((ucs & ~0x80) < 0x20 || ucs == 0x7f)
470 return -1;
Denys Vlasenkob1edf202010-01-31 16:34:37 +0100471 /* Quick abort if it is an obviously invalid char */
472 if (ucs > LAST_SUPPORTED_WCHAR)
473 return -1;
474
475 /* Optimization: no combining chars below 0x300 */
476 if (LAST_SUPPORTED_WCHAR < 0x300 || ucs < 0x300)
Denys Vlasenko9f93d622010-01-24 07:44:03 +0100477 return 1;
478
Denys Vlasenkob1edf202010-01-31 16:34:37 +0100479#if LAST_SUPPORTED_WCHAR >= 0x300
480 /* Binary search in table of non-spacing characters */
Denys Vlasenko46685a42010-01-25 13:24:06 +0100481 if (in_interval_table(ucs, combining, ARRAY_SIZE(combining) - 1))
482 return 0;
483 if (in_uint16_table(ucs, combining1, ARRAY_SIZE(combining1) - 1))
Denys Vlasenko9f93d622010-01-24 07:44:03 +0100484 return 0;
485
Denys Vlasenkob1edf202010-01-31 16:34:37 +0100486 /* Optimization: all chars below 0x1100 are not double-width */
487 if (LAST_SUPPORTED_WCHAR < 0x1100 || ucs < 0x1100)
Denys Vlasenko9f93d622010-01-24 07:44:03 +0100488 return 1;
489
Denys Vlasenkob1edf202010-01-31 16:34:37 +0100490# if LAST_SUPPORTED_WCHAR >= 0x1100
491 /* Invalid code points: */
492 /* High (d800..dbff) and low (dc00..dfff) surrogates (valid only in UTF16) */
493 /* Private Use Area (e000..f8ff) */
494 /* Noncharacters fdd0..fdef */
495 if ((LAST_SUPPORTED_WCHAR >= 0xd800 && ucs >= 0xd800 && ucs <= 0xf8ff)
496 || (LAST_SUPPORTED_WCHAR >= 0xfdd0 && ucs >= 0xfdd0 && ucs <= 0xfdef)
Denys Vlasenko40e4e882010-01-31 16:04:30 +0100497 ) {
498 return -1;
499 }
Denys Vlasenko40e4e882010-01-31 16:04:30 +0100500 /* 0xfffe and 0xffff in every plane are invalid */
Denys Vlasenkob1edf202010-01-31 16:34:37 +0100501 if (LAST_SUPPORTED_WCHAR >= 0xfffe && ((ucs & 0xfffe) == 0xfffe)) {
Denys Vlasenko40e4e882010-01-31 16:04:30 +0100502 return -1;
503 }
504
505# if LAST_SUPPORTED_WCHAR >= 0x10000
Denys Vlasenkob1edf202010-01-31 16:34:37 +0100506 if (ucs >= 0x10000) {
507 /* Combining chars in Supplementary Multilingual Plane 0x1xxxx */
508 static const struct interval combining0x10000[] = {
509 { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F },
510 { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 },
511 { 0xD173, 0xD182 }, { 0xD185, 0xD18B }, { 0xD1AA, 0xD1AD },
512 { 0xD242, 0xD244 }
513 };
514 /* Binary search in table of non-spacing characters in Supplementary Multilingual Plane */
515 if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1))
516 return 0;
517 /* Check a few non-spacing chars in Supplementary Special-purpose Plane 0xExxxx */
518 if (LAST_SUPPORTED_WCHAR >= 0xE0001
519 && ( ucs == 0xE0001
520 || (ucs >= 0xE0020 && ucs <= 0xE007F)
521 || (ucs >= 0xE0100 && ucs <= 0xE01EF)
522 )
523 ) {
524 return 0;
525 }
Denys Vlasenko9f93d622010-01-24 07:44:03 +0100526 }
Denys Vlasenkob1edf202010-01-31 16:34:37 +0100527# endif
Denys Vlasenko9f93d622010-01-24 07:44:03 +0100528
Denys Vlasenkob1edf202010-01-31 16:34:37 +0100529 /* If we arrive here, ucs is not a combining or C0/C1 control character.
530 * Check whether it's 1 char or 2-shar wide.
531 */
Denys Vlasenko9f93d622010-01-24 07:44:03 +0100532 return 1 +
533 ( (/*ucs >= 0x1100 &&*/ ucs <= 0x115f) /* Hangul Jamo init. consonants */
Denys Vlasenko2edba212010-01-29 09:11:47 +0100534 || ucs == 0x2329 /* left-pointing angle bracket; also CJK punct. char */
535 || ucs == 0x232a /* right-pointing angle bracket; also CJK punct. char */
Denys Vlasenko9f93d622010-01-24 07:44:03 +0100536 || (ucs >= 0x2e80 && ucs <= 0xa4cf && ucs != 0x303f) /* CJK ... Yi */
Denys Vlasenko460f8272010-01-31 18:12:57 +0100537# if LAST_SUPPORTED_WCHAR >= 0xac00
Denys Vlasenko9f93d622010-01-24 07:44:03 +0100538 || (ucs >= 0xac00 && ucs <= 0xd7a3) /* Hangul Syllables */
539 || (ucs >= 0xf900 && ucs <= 0xfaff) /* CJK Compatibility Ideographs */
540 || (ucs >= 0xfe10 && ucs <= 0xfe19) /* Vertical forms */
541 || (ucs >= 0xfe30 && ucs <= 0xfe6f) /* CJK Compatibility Forms */
542 || (ucs >= 0xff00 && ucs <= 0xff60) /* Fullwidth Forms */
543 || (ucs >= 0xffe0 && ucs <= 0xffe6)
Denys Vlasenko40e4e882010-01-31 16:04:30 +0100544 || ((ucs >> 17) == (2 >> 1)) /* 20000..3ffff: Supplementary and Tertiary Ideographic Planes */
Denys Vlasenko460f8272010-01-31 18:12:57 +0100545# endif
Denys Vlasenko9f93d622010-01-24 07:44:03 +0100546 );
Denys Vlasenko2edba212010-01-29 09:11:47 +0100547# endif
548#endif
Denys Vlasenko9f93d622010-01-24 07:44:03 +0100549}