diff options
Diffstat (limited to 'lib/uniname/uniname.c')
-rw-r--r-- | lib/uniname/uniname.c | 278 |
1 files changed, 193 insertions, 85 deletions
diff --git a/lib/uniname/uniname.c b/lib/uniname/uniname.c index e4b81cc3..2191f09b 100644 --- a/lib/uniname/uniname.c +++ b/lib/uniname/uniname.c @@ -1,5 +1,5 @@ /* Association between Unicode characters and their names. - Copyright (C) 2000-2002, 2005-2007, 2009-2010 Free Software Foundation, Inc. + Copyright (C) 2000-2002, 2005-2007, 2009-2015 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published @@ -45,10 +45,11 @@ #define UNICODE_CHARNAME_WORD_CJK 417 #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107 static const uint16_t unicode_names[68940] = ...; - static const struct { uint16_t code; uint32_t name:24; } unicode_name_to_code[16626] = ...; - static const struct { uint16_t code; uint32_t name:24; } unicode_code_to_name[16626] = ...; + static const struct { uint16_t index; uint32_t name:24; } unicode_name_to_index[16626] = ...; + static const struct { uint16_t index; uint32_t name:24; } unicode_index_to_name[16626] = ...; #define UNICODE_CHARNAME_MAX_LENGTH 83 #define UNICODE_CHARNAME_MAX_WORDS 13 + static const struct { uint32_t index; uint32_t gap; uint16_t length; } unicode_ranges[401] = ...; */ /* Returns the word with a given index. */ @@ -127,6 +128,82 @@ unicode_name_word_lookup (const char *word, unsigned int length) return -1; } +#define UNINAME_INVALID_INDEX UINT16_MAX + +/* Looks up the internal index of a Unicode character. */ +static uint16_t +unicode_code_to_index (ucs4_t c) +{ + /* Binary search in unicode_ranges. */ + unsigned int i1 = 0; + unsigned int i2 = SIZEOF (unicode_ranges); + + for (;;) + { + unsigned int i = (i1 + i2) >> 1; + ucs4_t start_code = + unicode_ranges[i].index + unicode_ranges[i].gap; + ucs4_t end_code = + start_code + unicode_ranges[i].length - 1; + + if (start_code <= c && c <= end_code) + return c - unicode_ranges[i].gap; + + if (end_code < c) + { + if (i1 == i) + break; + /* Note here: i1 < i < i2. 
*/ + i1 = i; + } + else if (c < start_code) + { + if (i2 == i) + break; + /* Note here: i1 <= i < i2. */ + i2 = i; + } + } + return UNINAME_INVALID_INDEX; +} + +/* Looks up the codepoint of a Unicode character, from the given + internal index. */ +static ucs4_t +unicode_index_to_code (uint16_t index) +{ + /* Binary search in unicode_ranges. */ + unsigned int i1 = 0; + unsigned int i2 = SIZEOF (unicode_ranges); + + for (;;) + { + unsigned int i = (i1 + i2) >> 1; + uint16_t start_index = unicode_ranges[i].index; + uint16_t end_index = start_index + unicode_ranges[i].length - 1; + + if (start_index <= index && index <= end_index) + return index + unicode_ranges[i].gap; + + if (end_index < index) + { + if (i1 == i) + break; + /* Note here: i1 < i < i2. */ + i1 = i; + } + else if (index < start_index) + { + if (i2 == i) + break; + /* Note here: i1 <= i < i2. */ + i2 = i; + } + } + return UNINAME_INVALID; +} + + /* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book, sections 3.11 and 4.4. */ static const char jamo_initial_short_name[19][3] = @@ -201,80 +278,59 @@ unicode_character_name (ucs4_t c, char *buf) *ptr = '\0'; return buf; } + else if ((c >= 0xFE00 && c <= 0xFE0F) || (c >= 0xE0100 && c <= 0xE01EF)) + { + /* Special case for variation selectors. Keeps the tables + small. */ + + /* buf needs to have at least 19 + 3 bytes here. */ + sprintf (buf, "VARIATION SELECTOR-%d", + c <= 0xFE0F ? c - 0xFE00 + 1 : c - 0xE0100 + 17); + return buf; + } else { - const uint16_t *words; + uint16_t index = unicode_code_to_index (c); + const uint16_t *words = NULL; - /* Transform the code so that it fits in 16 bits. 
*/ - switch (c >> 12) + if (index != UNINAME_INVALID_INDEX) { - case 0x00: case 0x01: case 0x02: case 0x03: case 0x04: - break; - case 0x0A: - c -= 0x05000; - break; - case 0x0F: - c -= 0x09000; - break; - case 0x10: - c -= 0x09000; - break; - case 0x12: - c -= 0x0A000; - break; - case 0x1D: - c -= 0x14000; - break; - case 0x1F: - c -= 0x15000; - break; - case 0x2F: - c -= 0x24000; - break; - case 0xE0: - c -= 0xD4000; - break; - default: - return NULL; + /* Binary search in unicode_index_to_name. */ + unsigned int i1 = 0; + unsigned int i2 = SIZEOF (unicode_index_to_name); + for (;;) + { + unsigned int i = (i1 + i2) >> 1; + if (unicode_index_to_name[i].index == index) + { + words = &unicode_names[unicode_index_to_name[i].name]; + break; + } + else if (unicode_index_to_name[i].index < index) + { + if (i1 == i) + { + words = NULL; + break; + } + /* Note here: i1 < i < i2. */ + i1 = i; + } + else if (unicode_index_to_name[i].index > index) + { + if (i2 == i) + { + words = NULL; + break; + } + /* Note here: i1 <= i < i2. */ + i2 = i; + } + } } - - { - /* Binary search in unicode_code_to_name. */ - unsigned int i1 = 0; - unsigned int i2 = SIZEOF (unicode_code_to_name); - for (;;) - { - unsigned int i = (i1 + i2) >> 1; - if (unicode_code_to_name[i].code == c) - { - words = &unicode_names[unicode_code_to_name[i].name]; - break; - } - else if (unicode_code_to_name[i].code < c) - { - if (i1 == i) - { - words = NULL; - break; - } - /* Note here: i1 < i < i2. */ - i1 = i; - } - else if (unicode_code_to_name[i].code > c) - { - if (i2 == i) - { - words = NULL; - break; - } - /* Note here: i1 <= i < i2. */ - i2 = i; - } - } - } if (words != NULL) { - /* Found it in unicode_code_to_name. Now concatenate the words. */ + /* Found it in unicode_index_to_name. Now concatenate the words. */ /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes. 
*/ char *ptr = buf; for (;;) @@ -319,6 +375,37 @@ unicode_name_character (const char *name) if (false) filled_buf: { + { + /* Special case for variation selector aliases. Keeps the + tables small. */ + const char *p1 = buf; + if (ptr >= buf + 3 && *p1++ == 'V') + { + if (*p1++ == 'S') + { + if (*p1 != '0') + { + unsigned int c = 0; + for (;;) + { + if (*p1 >= '0' && *p1 <= '9') + c += (*p1 - '0'); + p1++; + if (p1 == ptr) + { + if (c >= 1 && c <= 16) + return c - 1 + 0xFE00; + else if (c >= 17 && c <= 256) + return c - 17 + 0xE0100; + else + break; + } + c = c * 10; + } + } + } + } + } /* Convert the constituents to uint16_t words. */ uint16_t words[UNICODE_CHARNAME_MAX_WORDS]; uint16_t *wordptr = words; @@ -450,6 +537,38 @@ unicode_name_character (const char *name) } } } + /* Special case for variation selectors. Keeps the + tables small. */ + if (wordptr == &words[1] + && words[0] == UNICODE_CHARNAME_WORD_VARIATION + && p1 + 10 <= ptr + && p1 + 12 >= ptr + && memcmp (p1, "SELECTOR-", 9) == 0) + { + const char *p2 = p1 + 9; + + if (*p2 != '0') + { + unsigned int c = 0; + + for (;;) + { + if (*p2 >= '0' && *p2 <= '9') + c += (*p2 - '0'); + p2++; + if (p2 == ptr) + { + if (c >= 1 && c <= 16) + return c - 1 + 0xFE00; + else if (c >= 17 && c <= 256) + return c - 17 + 0xE0100; + else + break; + } + c = c * 10; + } + } + } } } if (false) @@ -463,15 +582,15 @@ unicode_name_character (const char *name) for (; --i >= 0; ) words[i] = 2 * words[i] + 1; } - /* Binary search in unicode_name_to_code. */ + /* Binary search in unicode_name_to_index. 
*/ { unsigned int i1 = 0; - unsigned int i2 = SIZEOF (unicode_name_to_code); + unsigned int i2 = SIZEOF (unicode_name_to_index); for (;;) { unsigned int i = (i1 + i2) >> 1; const uint16_t *w = words; - const uint16_t *p = &unicode_names[unicode_name_to_code[i].name]; + const uint16_t *p = &unicode_names[unicode_name_to_index[i].name]; unsigned int n = words_length; for (;;) { @@ -493,18 +612,7 @@ unicode_name_character (const char *name) } p++; w++; n--; if (n == 0) - { - unsigned int c = unicode_name_to_code[i].code; - - /* Undo the transformation to 16-bit space. */ - static const unsigned int offset[13] = - { - 0x00000, 0x00000, 0x00000, 0x00000, 0x00000, - 0x05000, 0x09000, 0x09000, 0x0A000, 0x14000, - 0x15000, 0x24000, 0xD4000 - }; - return c + offset[c >> 12]; - } + return unicode_index_to_code (unicode_name_to_index[i].index); } } } |