blob: 8d301f7c3043e216003ea837df6d4ee68378476f [file] [log] [blame]
Denys Vlasenko9f93d622010-01-24 07:44:03 +01001/*
2 * This is an implementation of wcwidth() and wcswidth() (defined in
3 * IEEE Std 1002.1-2001) for Unicode.
4 *
5 * http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html
6 * http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html
7 *
8 * In fixed-width output devices, Latin characters all occupy a single
9 * "cell" position of equal width, whereas ideographic CJK characters
10 * occupy two such cells. Interoperability between terminal-line
11 * applications and (teletype-style) character terminals using the
12 * UTF-8 encoding requires agreement on which character should advance
13 * the cursor by how many cell positions. No established formal
14 * standards exist at present on which Unicode character shall occupy
15 * how many cell positions on character terminals. These routines are
16 * a first attempt of defining such behavior based on simple rules
17 * applied to data provided by the Unicode Consortium.
18 *
19 * For some graphical characters, the Unicode standard explicitly
20 * defines a character-cell width via the definition of the East Asian
21 * FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes.
22 * In all these cases, there is no ambiguity about which width a
23 * terminal shall use. For characters in the East Asian Ambiguous (A)
24 * class, the width choice depends purely on a preference of backward
25 * compatibility with either historic CJK or Western practice.
26 * Choosing single-width for these characters is easy to justify as
27 * the appropriate long-term solution, as the CJK practice of
28 * displaying these characters as double-width comes from historic
29 * implementation simplicity (8-bit encoded characters were displayed
30 * single-width and 16-bit ones double-width, even for Greek,
31 * Cyrillic, etc.) and not any typographic considerations.
32 *
33 * Much less clear is the choice of width for the Not East Asian
34 * (Neutral) class. Existing practice does not dictate a width for any
35 * of these characters. It would nevertheless make sense
36 * typographically to allocate two character cells to characters such
37 * as for instance EM SPACE or VOLUME INTEGRAL, which cannot be
38 * represented adequately with a single-width glyph. The following
39 * routines at present merely assign a single-cell width to all
40 * neutral characters, in the interest of simplicity. This is not
41 * entirely satisfactory and should be reconsidered before
42 * establishing a formal standard in this area. At the moment, the
43 * decision which Not East Asian (Neutral) characters should be
44 * represented by double-width glyphs cannot yet be answered by
45 * applying a simple rule from the Unicode database content. Setting
46 * up a proper standard for the behavior of UTF-8 character terminals
47 * will require a careful analysis not only of each Unicode character,
48 * but also of each presentation form, something the author of these
49 * routines has avoided to do so far.
50 *
51 * http://www.unicode.org/unicode/reports/tr11/
52 *
53 * Markus Kuhn -- 2007-05-26 (Unicode 5.0)
54 *
55 * Permission to use, copy, modify, and distribute this software
56 * for any purpose and without fee is hereby granted. The author
57 * disclaims all warranties with regard to this software.
58 *
59 * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
60 */
61
62struct interval {
63 uint16_t first;
64 uint16_t last;
65};
66
67/* auxiliary function for binary search in interval table */
Denys Vlasenko46685a42010-01-25 13:24:06 +010068static int in_interval_table(unsigned ucs, const struct interval *table, unsigned max)
Denys Vlasenko9f93d622010-01-24 07:44:03 +010069{
Denys Vlasenko46685a42010-01-25 13:24:06 +010070 unsigned min;
Denys Vlasenko9f93d622010-01-24 07:44:03 +010071 unsigned mid;
72
73 if (ucs < table[0].first || ucs > table[max].last)
74 return 0;
75
Denys Vlasenko46685a42010-01-25 13:24:06 +010076 min = 0;
Denys Vlasenko9f93d622010-01-24 07:44:03 +010077 while (max >= min) {
78 mid = (min + max) / 2;
79 if (ucs > table[mid].last)
Denys Vlasenko46685a42010-01-25 13:24:06 +010080 min = mid + 1;
Denys Vlasenko9f93d622010-01-24 07:44:03 +010081 else if (ucs < table[mid].first)
Denys Vlasenko46685a42010-01-25 13:24:06 +010082 max = mid - 1;
Denys Vlasenko9f93d622010-01-24 07:44:03 +010083 else
Denys Vlasenko46685a42010-01-25 13:24:06 +010084 return 1;
85 }
86 return 0;
87}
88
89static int in_uint16_table(unsigned ucs, const uint16_t *table, unsigned max)
90{
91 unsigned min;
92 unsigned mid;
93 unsigned first, last;
94
95 first = table[0] >> 2;
96 last = first + (table[0] & 3);
97 if (ucs < first || ucs > last)
98 return 0;
99
100 min = 0;
101 while (max >= min) {
102 mid = (min + max) / 2;
103 first = table[mid] >> 2;
104 last = first + (table[mid] & 3);
105 if (ucs > last)
106 min = mid + 1;
107 else if (ucs < first)
108 max = mid - 1;
109 else
110 return 1;
Denys Vlasenko9f93d622010-01-24 07:44:03 +0100111 }
112 return 0;
113}
114
115
116/* The following two functions define the column width of an ISO 10646
117 * character as follows:
118 *
119 * - The null character (U+0000) has a column width of 0.
120 *
121 * - Other C0/C1 control characters and DEL will lead to a return
122 * value of -1.
123 *
124 * - Non-spacing and enclosing combining characters (general
125 * category code Mn or Me in the Unicode database) have a
126 * column width of 0.
127 *
128 * - SOFT HYPHEN (U+00AD) has a column width of 1.
129 *
130 * - Other format characters (general category code Cf in the Unicode
131 * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
132 *
133 * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
134 * have a column width of 0.
135 *
136 * - Spacing characters in the East Asian Wide (W) or East Asian
137 * Full-width (F) category as defined in Unicode Technical
138 * Report #11 have a column width of 2.
139 *
140 * - All remaining characters (including all printable
141 * ISO 8859-1 and WGL4 characters, Unicode control characters,
142 * etc.) have a column width of 1.
143 *
144 * This implementation assumes that wchar_t characters are encoded
145 * in ISO 10646.
146 */
147static int wcwidth(unsigned ucs)
148{
149 /* sorted list of non-overlapping intervals of non-spacing characters */
150 /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
151 static const struct interval combining[] = {
Denys Vlasenko46685a42010-01-25 13:24:06 +0100152#define BIG_(a,b) { a, b },
153#define PAIR(a,b)
154 /* PAIR if < 0x4000 and no more than 4 chars big */
155 BIG_(0x0300, 0x036F)
156 PAIR(0x0483, 0x0486)
157 PAIR(0x0488, 0x0489)
158 BIG_(0x0591, 0x05BD)
159 PAIR(0x05BF, 0x05BF)
160 PAIR(0x05C1, 0x05C2)
161 PAIR(0x05C4, 0x05C5)
162 PAIR(0x05C7, 0x05C7)
163 PAIR(0x0600, 0x0603)
164 BIG_(0x0610, 0x0615)
165 BIG_(0x064B, 0x065E)
166 PAIR(0x0670, 0x0670)
167 BIG_(0x06D6, 0x06E4)
168 PAIR(0x06E7, 0x06E8)
169 PAIR(0x06EA, 0x06ED)
170 PAIR(0x070F, 0x070F)
171 PAIR(0x0711, 0x0711)
172 BIG_(0x0730, 0x074A)
173 BIG_(0x07A6, 0x07B0)
174 BIG_(0x07EB, 0x07F3)
175 PAIR(0x0901, 0x0902)
176 PAIR(0x093C, 0x093C)
177 BIG_(0x0941, 0x0948)
178 PAIR(0x094D, 0x094D)
179 PAIR(0x0951, 0x0954)
180 PAIR(0x0962, 0x0963)
181 PAIR(0x0981, 0x0981)
182 PAIR(0x09BC, 0x09BC)
183 PAIR(0x09C1, 0x09C4)
184 PAIR(0x09CD, 0x09CD)
185 PAIR(0x09E2, 0x09E3)
186 PAIR(0x0A01, 0x0A02)
187 PAIR(0x0A3C, 0x0A3C)
188 PAIR(0x0A41, 0x0A42)
189 PAIR(0x0A47, 0x0A48)
190 PAIR(0x0A4B, 0x0A4D)
191 PAIR(0x0A70, 0x0A71)
192 PAIR(0x0A81, 0x0A82)
193 PAIR(0x0ABC, 0x0ABC)
194 BIG_(0x0AC1, 0x0AC5)
195 PAIR(0x0AC7, 0x0AC8)
196 PAIR(0x0ACD, 0x0ACD)
197 PAIR(0x0AE2, 0x0AE3)
198 PAIR(0x0B01, 0x0B01)
199 PAIR(0x0B3C, 0x0B3C)
200 PAIR(0x0B3F, 0x0B3F)
201 PAIR(0x0B41, 0x0B43)
202 PAIR(0x0B4D, 0x0B4D)
203 PAIR(0x0B56, 0x0B56)
204 PAIR(0x0B82, 0x0B82)
205 PAIR(0x0BC0, 0x0BC0)
206 PAIR(0x0BCD, 0x0BCD)
207 PAIR(0x0C3E, 0x0C40)
208 PAIR(0x0C46, 0x0C48)
209 PAIR(0x0C4A, 0x0C4D)
210 PAIR(0x0C55, 0x0C56)
211 PAIR(0x0CBC, 0x0CBC)
212 PAIR(0x0CBF, 0x0CBF)
213 PAIR(0x0CC6, 0x0CC6)
214 PAIR(0x0CCC, 0x0CCD)
215 PAIR(0x0CE2, 0x0CE3)
216 PAIR(0x0D41, 0x0D43)
217 PAIR(0x0D4D, 0x0D4D)
218 PAIR(0x0DCA, 0x0DCA)
219 PAIR(0x0DD2, 0x0DD4)
220 PAIR(0x0DD6, 0x0DD6)
221 PAIR(0x0E31, 0x0E31)
222 BIG_(0x0E34, 0x0E3A)
223 BIG_(0x0E47, 0x0E4E)
224 PAIR(0x0EB1, 0x0EB1)
225 BIG_(0x0EB4, 0x0EB9)
226 PAIR(0x0EBB, 0x0EBC)
227 BIG_(0x0EC8, 0x0ECD)
228 PAIR(0x0F18, 0x0F19)
229 PAIR(0x0F35, 0x0F35)
230 PAIR(0x0F37, 0x0F37)
231 PAIR(0x0F39, 0x0F39)
232 BIG_(0x0F71, 0x0F7E)
233 BIG_(0x0F80, 0x0F84)
234 PAIR(0x0F86, 0x0F87)
235 PAIR(0x0FC6, 0x0FC6)
236 BIG_(0x0F90, 0x0F97)
237 BIG_(0x0F99, 0x0FBC)
238 PAIR(0x102D, 0x1030)
239 PAIR(0x1032, 0x1032)
240 PAIR(0x1036, 0x1037)
241 PAIR(0x1039, 0x1039)
242 PAIR(0x1058, 0x1059)
243 BIG_(0x1160, 0x11FF)
244 PAIR(0x135F, 0x135F)
245 PAIR(0x1712, 0x1714)
246 PAIR(0x1732, 0x1734)
247 PAIR(0x1752, 0x1753)
248 PAIR(0x1772, 0x1773)
249 PAIR(0x17B4, 0x17B5)
250 BIG_(0x17B7, 0x17BD)
251 PAIR(0x17C6, 0x17C6)
252 BIG_(0x17C9, 0x17D3)
253 PAIR(0x17DD, 0x17DD)
254 PAIR(0x180B, 0x180D)
255 PAIR(0x18A9, 0x18A9)
256 PAIR(0x1920, 0x1922)
257 PAIR(0x1927, 0x1928)
258 PAIR(0x1932, 0x1932)
259 PAIR(0x1939, 0x193B)
260 PAIR(0x1A17, 0x1A18)
261 PAIR(0x1B00, 0x1B03)
262 PAIR(0x1B34, 0x1B34)
263 BIG_(0x1B36, 0x1B3A)
264 PAIR(0x1B3C, 0x1B3C)
265 PAIR(0x1B42, 0x1B42)
266 BIG_(0x1B6B, 0x1B73)
267 BIG_(0x1DC0, 0x1DCA)
268 PAIR(0x1DFE, 0x1DFF)
269 BIG_(0x200B, 0x200F)
270 BIG_(0x202A, 0x202E)
271 PAIR(0x2060, 0x2063)
272 BIG_(0x206A, 0x206F)
273 BIG_(0x20D0, 0x20EF)
274 BIG_(0x302A, 0x302F)
275 PAIR(0x3099, 0x309A)
276 /* Too big to be packed in PAIRs: */
277 { 0xA806, 0xA806 },
278 { 0xA80B, 0xA80B },
279 { 0xA825, 0xA826 },
280 { 0xFB1E, 0xFB1E },
281 { 0xFE00, 0xFE0F },
282 { 0xFE20, 0xFE23 },
283 { 0xFEFF, 0xFEFF },
284 { 0xFFF9, 0xFFFB }
285#undef BIG_
286#undef PAIR
287 };
288 static const uint16_t combining1[] = {
289#define BIG_(a,b)
290#define PAIR(a,b) (a << 2) | (b-a),
291 /* Exact copy-n-paste of the above: */
292 BIG_(0x0300, 0x036F)
293 PAIR(0x0483, 0x0486)
294 PAIR(0x0488, 0x0489)
295 BIG_(0x0591, 0x05BD)
296 PAIR(0x05BF, 0x05BF)
297 PAIR(0x05C1, 0x05C2)
298 PAIR(0x05C4, 0x05C5)
299 PAIR(0x05C7, 0x05C7)
300 PAIR(0x0600, 0x0603)
301 BIG_(0x0610, 0x0615)
302 BIG_(0x064B, 0x065E)
303 PAIR(0x0670, 0x0670)
304 BIG_(0x06D6, 0x06E4)
305 PAIR(0x06E7, 0x06E8)
306 PAIR(0x06EA, 0x06ED)
307 PAIR(0x070F, 0x070F)
308 PAIR(0x0711, 0x0711)
309 BIG_(0x0730, 0x074A)
310 BIG_(0x07A6, 0x07B0)
311 BIG_(0x07EB, 0x07F3)
312 PAIR(0x0901, 0x0902)
313 PAIR(0x093C, 0x093C)
314 BIG_(0x0941, 0x0948)
315 PAIR(0x094D, 0x094D)
316 PAIR(0x0951, 0x0954)
317 PAIR(0x0962, 0x0963)
318 PAIR(0x0981, 0x0981)
319 PAIR(0x09BC, 0x09BC)
320 PAIR(0x09C1, 0x09C4)
321 PAIR(0x09CD, 0x09CD)
322 PAIR(0x09E2, 0x09E3)
323 PAIR(0x0A01, 0x0A02)
324 PAIR(0x0A3C, 0x0A3C)
325 PAIR(0x0A41, 0x0A42)
326 PAIR(0x0A47, 0x0A48)
327 PAIR(0x0A4B, 0x0A4D)
328 PAIR(0x0A70, 0x0A71)
329 PAIR(0x0A81, 0x0A82)
330 PAIR(0x0ABC, 0x0ABC)
331 BIG_(0x0AC1, 0x0AC5)
332 PAIR(0x0AC7, 0x0AC8)
333 PAIR(0x0ACD, 0x0ACD)
334 PAIR(0x0AE2, 0x0AE3)
335 PAIR(0x0B01, 0x0B01)
336 PAIR(0x0B3C, 0x0B3C)
337 PAIR(0x0B3F, 0x0B3F)
338 PAIR(0x0B41, 0x0B43)
339 PAIR(0x0B4D, 0x0B4D)
340 PAIR(0x0B56, 0x0B56)
341 PAIR(0x0B82, 0x0B82)
342 PAIR(0x0BC0, 0x0BC0)
343 PAIR(0x0BCD, 0x0BCD)
344 PAIR(0x0C3E, 0x0C40)
345 PAIR(0x0C46, 0x0C48)
346 PAIR(0x0C4A, 0x0C4D)
347 PAIR(0x0C55, 0x0C56)
348 PAIR(0x0CBC, 0x0CBC)
349 PAIR(0x0CBF, 0x0CBF)
350 PAIR(0x0CC6, 0x0CC6)
351 PAIR(0x0CCC, 0x0CCD)
352 PAIR(0x0CE2, 0x0CE3)
353 PAIR(0x0D41, 0x0D43)
354 PAIR(0x0D4D, 0x0D4D)
355 PAIR(0x0DCA, 0x0DCA)
356 PAIR(0x0DD2, 0x0DD4)
357 PAIR(0x0DD6, 0x0DD6)
358 PAIR(0x0E31, 0x0E31)
359 BIG_(0x0E34, 0x0E3A)
360 BIG_(0x0E47, 0x0E4E)
361 PAIR(0x0EB1, 0x0EB1)
362 BIG_(0x0EB4, 0x0EB9)
363 PAIR(0x0EBB, 0x0EBC)
364 BIG_(0x0EC8, 0x0ECD)
365 PAIR(0x0F18, 0x0F19)
366 PAIR(0x0F35, 0x0F35)
367 PAIR(0x0F37, 0x0F37)
368 PAIR(0x0F39, 0x0F39)
369 BIG_(0x0F71, 0x0F7E)
370 BIG_(0x0F80, 0x0F84)
371 PAIR(0x0F86, 0x0F87)
372 PAIR(0x0FC6, 0x0FC6)
373 BIG_(0x0F90, 0x0F97)
374 BIG_(0x0F99, 0x0FBC)
375 PAIR(0x102D, 0x1030)
376 PAIR(0x1032, 0x1032)
377 PAIR(0x1036, 0x1037)
378 PAIR(0x1039, 0x1039)
379 PAIR(0x1058, 0x1059)
380 BIG_(0x1160, 0x11FF)
381 PAIR(0x135F, 0x135F)
382 PAIR(0x1712, 0x1714)
383 PAIR(0x1732, 0x1734)
384 PAIR(0x1752, 0x1753)
385 PAIR(0x1772, 0x1773)
386 PAIR(0x17B4, 0x17B5)
387 BIG_(0x17B7, 0x17BD)
388 PAIR(0x17C6, 0x17C6)
389 BIG_(0x17C9, 0x17D3)
390 PAIR(0x17DD, 0x17DD)
391 PAIR(0x180B, 0x180D)
392 PAIR(0x18A9, 0x18A9)
393 PAIR(0x1920, 0x1922)
394 PAIR(0x1927, 0x1928)
395 PAIR(0x1932, 0x1932)
396 PAIR(0x1939, 0x193B)
397 PAIR(0x1A17, 0x1A18)
398 PAIR(0x1B00, 0x1B03)
399 PAIR(0x1B34, 0x1B34)
400 BIG_(0x1B36, 0x1B3A)
401 PAIR(0x1B3C, 0x1B3C)
402 PAIR(0x1B42, 0x1B42)
403 BIG_(0x1B6B, 0x1B73)
404 BIG_(0x1DC0, 0x1DCA)
405 PAIR(0x1DFE, 0x1DFF)
406 BIG_(0x200B, 0x200F)
407 BIG_(0x202A, 0x202E)
408 PAIR(0x2060, 0x2063)
409 BIG_(0x206A, 0x206F)
410 BIG_(0x20D0, 0x20EF)
411 BIG_(0x302A, 0x302F)
412 PAIR(0x3099, 0x309A)
413#undef BIG_
414#undef PAIR
415 };
416 struct CHECK {
417#define BIG_(a,b) char big##a[b-a <= 3 ? -1 : 1];
418#define PAIR(a,b) char pair##a[b-a > 3 ? -1 : 1];
419 /* Copy-n-paste it here again to verify correctness */
420#undef BIG_
421#undef PAIR
Denys Vlasenko9f93d622010-01-24 07:44:03 +0100422 };
423 static const struct interval combining0x10000[] = {
424 { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F },
425 { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 },
426 { 0xD173, 0xD182 }, { 0xD185, 0xD18B }, { 0xD1AA, 0xD1AD },
427 { 0xD242, 0xD244 }
428 };
429
430 if (ucs == 0)
431 return 0;
432 /* test for 8-bit control characters (00-1f, 80-9f, 7f) */
433 if ((ucs & ~0x80) < 0x20 || ucs == 0x7f)
434 return -1;
435 if (ucs < 0x0300) /* optimization */
436 return 1;
437
438 /* binary search in table of non-spacing characters */
Denys Vlasenko46685a42010-01-25 13:24:06 +0100439 if (in_interval_table(ucs, combining, ARRAY_SIZE(combining) - 1))
440 return 0;
441 if (in_uint16_table(ucs, combining1, ARRAY_SIZE(combining1) - 1))
Denys Vlasenko9f93d622010-01-24 07:44:03 +0100442 return 0;
443
444 if (ucs < 0x1100) /* optimization */
445 return 1;
446
447 /* binary search in table of non-spacing characters, cont. */
Denys Vlasenko46685a42010-01-25 13:24:06 +0100448 if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1))
Denys Vlasenko9f93d622010-01-24 07:44:03 +0100449 return 0;
450 if (ucs == 0xE0001
451 || (ucs >= 0xE0020 && ucs <= 0xE007F)
452 || (ucs >= 0xE0100 && ucs <= 0xE01EF)
453 ) {
454 return 0;
455 }
456
457 /* if we arrive here, ucs is not a combining or C0/C1 control character */
458
459 return 1 +
460 ( (/*ucs >= 0x1100 &&*/ ucs <= 0x115f) /* Hangul Jamo init. consonants */
461 || ucs == 0x2329
462 || ucs == 0x232a
463 || (ucs >= 0x2e80 && ucs <= 0xa4cf && ucs != 0x303f) /* CJK ... Yi */
464 || (ucs >= 0xac00 && ucs <= 0xd7a3) /* Hangul Syllables */
465 || (ucs >= 0xf900 && ucs <= 0xfaff) /* CJK Compatibility Ideographs */
466 || (ucs >= 0xfe10 && ucs <= 0xfe19) /* Vertical forms */
467 || (ucs >= 0xfe30 && ucs <= 0xfe6f) /* CJK Compatibility Forms */
468 || (ucs >= 0xff00 && ucs <= 0xff60) /* Fullwidth Forms */
469 || (ucs >= 0xffe0 && ucs <= 0xffe6)
470 || (ucs >= 0x20000 && ucs <= 0x2fffd)
471 || (ucs >= 0x30000 && ucs <= 0x3fffd)
472 );
473}