diff options
Diffstat (limited to 'lib/uniname/uniname.c')
| -rw-r--r-- | lib/uniname/uniname.c | 278 | 
1 files changed, 193 insertions, 85 deletions
| diff --git a/lib/uniname/uniname.c b/lib/uniname/uniname.c index e4b81cc3..2191f09b 100644 --- a/lib/uniname/uniname.c +++ b/lib/uniname/uniname.c @@ -1,5 +1,5 @@  /* Association between Unicode characters and their names. -   Copyright (C) 2000-2002, 2005-2007, 2009-2010 Free Software Foundation, Inc. +   Copyright (C) 2000-2002, 2005-2007, 2009-2015 Free Software Foundation, Inc.     This program is free software: you can redistribute it and/or modify it     under the terms of the GNU Lesser General Public License as published @@ -45,10 +45,11 @@    #define UNICODE_CHARNAME_WORD_CJK 417    #define UNICODE_CHARNAME_WORD_COMPATIBILITY 6107    static const uint16_t unicode_names[68940] = ...; -  static const struct { uint16_t code; uint32_t name:24; } unicode_name_to_code[16626] = ...; -  static const struct { uint16_t code; uint32_t name:24; } unicode_code_to_name[16626] = ...; +  static const struct { uint16_t index; uint32_t name:24; } unicode_name_to_index[16626] = ...; +  static const struct { uint16_t index; uint32_t name:24; } unicode_index_to_name[16626] = ...;    #define UNICODE_CHARNAME_MAX_LENGTH 83    #define UNICODE_CHARNAME_MAX_WORDS 13 +  static const struct { uint32_t index; uint32_t gap; uint16_t length; } unicode_ranges[401] = ...;  */  /* Returns the word with a given index.  */ @@ -127,6 +128,82 @@ unicode_name_word_lookup (const char *word, unsigned int length)    return -1;  } +#define UNINAME_INVALID_INDEX UINT16_MAX + +/* Looks up the internal index of a Unicode character.  */ +static uint16_t +unicode_code_to_index (ucs4_t c) +{ +  /* Binary search in unicode_ranges.  */ +  unsigned int i1 = 0; +  unsigned int i2 = SIZEOF (unicode_ranges); + +  for (;;) +    { +      unsigned int i = (i1 + i2) >> 1; +      ucs4_t start_code = +        unicode_ranges[i].index + unicode_ranges[i].gap; +      ucs4_t end_code = +        start_code + unicode_ranges[i].length - 1; + +      if (start_code <= c && c <= end_code) +        return c - unicode_ranges[i].gap; + +      if (end_code < c) +        { +          if (i1 == i) +            break; +          /* Note here: i1 < i < i2.  */ +          i1 = i; +        } +      else if (c < start_code) +        { +          if (i2 == i) +            break; +          /* Note here: i1 <= i < i2.  */ +          i2 = i; +        } +    } +  return UNINAME_INVALID_INDEX; +} + +/* Looks up the codepoint of a Unicode character, from the given +   internal index.  */ +static ucs4_t +unicode_index_to_code (uint16_t index) +{ +  /* Binary search in unicode_ranges.  */ +  unsigned int i1 = 0; +  unsigned int i2 = SIZEOF (unicode_ranges); + +  for (;;) +    { +      unsigned int i = (i1 + i2) >> 1; +      uint16_t start_index = unicode_ranges[i].index; +      uint16_t end_index = start_index + unicode_ranges[i].length - 1; + +      if (start_index <= index && index <= end_index) +        return index + unicode_ranges[i].gap; + +      if (end_index < index) +        { +          if (i1 == i) +            break; +          /* Note here: i1 < i < i2.  */ +          i1 = i; +        } +      else if (index < start_index) +        { +          if (i2 == i) +            break; +          /* Note here: i1 <= i < i2.  */ +          i2 = i; +        } +    } +  return UNINAME_INVALID; +} + +  /* Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,     sections 3.11 and 4.4.  */  static const char jamo_initial_short_name[19][3] = @@ -201,80 +278,59 @@ unicode_character_name (ucs4_t c, char *buf)        *ptr = '\0';        return buf;      } +  else if ((c >= 0xFE00 && c <= 0xFE0F) || (c >= 0xE0100 && c <= 0xE01EF)) +    { +      /* Special case for variation selectors. Keeps the tables +         small.  */ + +      /* buf needs to have at least 19 + 3 bytes here.  */ +      sprintf (buf, "VARIATION SELECTOR-%d", +               c <= 0xFE0F ? c - 0xFE00 + 1 : c - 0xE0100 + 17); +      return buf; +    }    else      { -      const uint16_t *words; +      uint16_t index = unicode_code_to_index (c); +      const uint16_t *words = NULL; -      /* Transform the code so that it fits in 16 bits.  */ -      switch (c >> 12) +      if (index != UNINAME_INVALID_INDEX)          { -        case 0x00: case 0x01: case 0x02: case 0x03: case 0x04: -          break; -        case 0x0A: -          c -= 0x05000; -          break; -        case 0x0F: -          c -= 0x09000; -          break; -        case 0x10: -          c -= 0x09000; -          break; -        case 0x12: -          c -= 0x0A000; -          break; -        case 0x1D: -          c -= 0x14000; -          break; -        case 0x1F: -          c -= 0x15000; -          break; -        case 0x2F: -          c -= 0x24000; -          break; -        case 0xE0: -          c -= 0xD4000; -          break; -        default: -          return NULL; +          /* Binary search in unicode_code_to_name.  */ +          unsigned int i1 = 0; +          unsigned int i2 = SIZEOF (unicode_index_to_name); +          for (;;) +            { +              unsigned int i = (i1 + i2) >> 1; +              if (unicode_index_to_name[i].index == index) +                { +                  words = &unicode_names[unicode_index_to_name[i].name]; +                  break; +                } +              else if (unicode_index_to_name[i].index < index) +                { +                  if (i1 == i) +                    { +                      words = NULL; +                      break; +                    } +                  /* Note here: i1 < i < i2.  */ +                  i1 = i; +                } +              else if (unicode_index_to_name[i].index > index) +                { +                  if (i2 == i) +                    { +                      words = NULL; +                      break; +                    } +                  /* Note here: i1 <= i < i2.  */ +                  i2 = i; +                } +            }          } - -      { -        /* Binary search in unicode_code_to_name.  */ -        unsigned int i1 = 0; -        unsigned int i2 = SIZEOF (unicode_code_to_name); -        for (;;) -          { -            unsigned int i = (i1 + i2) >> 1; -            if (unicode_code_to_name[i].code == c) -              { -                words = &unicode_names[unicode_code_to_name[i].name]; -                break; -              } -            else if (unicode_code_to_name[i].code < c) -              { -                if (i1 == i) -                  { -                    words = NULL; -                    break; -                  } -                /* Note here: i1 < i < i2.  */ -                i1 = i; -              } -            else if (unicode_code_to_name[i].code > c) -              { -                if (i2 == i) -                  { -                    words = NULL; -                    break; -                  } -                /* Note here: i1 <= i < i2.  */ -                i2 = i; -              } -          } -      }        if (words != NULL)          { -          /* Found it in unicode_code_to_name. Now concatenate the words.  */ +          /* Found it in unicode_index_to_name. Now concatenate the words.  */            /* buf needs to have at least UNICODE_CHARNAME_MAX_LENGTH bytes.  */            char *ptr = buf;            for (;;) @@ -319,6 +375,37 @@ unicode_name_character (const char *name)        if (false)        filled_buf:          { +          { +            /* Special case for variation selector aliases. Keeps the +               tables small.  */ +            const char *p1 = buf; +            if (ptr >= buf + 3 && *p1++ == 'V') +              { +                if (*p1++ == 'S') +                  { +                    if (*p1 != '0') +                      { +                        unsigned int c = 0; +                        for (;;) +                          { +                            if (*p1 >= '0' && *p1 <= '9') +                              c += (*p1 - '0'); +                            p1++; +                            if (p1 == ptr) +                              { +                                if (c >= 1 && c <= 16) +                                  return c - 1 + 0xFE00; +                                else if (c >= 17 && c <= 256) +                                  return c - 17 + 0xE0100; +                                else +                                  break; +                              } +                            c = c * 10; +                          } +                      } +                  } +              } +          }            /* Convert the constituents to uint16_t words.  */            uint16_t words[UNICODE_CHARNAME_MAX_WORDS];            uint16_t *wordptr = words; @@ -450,6 +537,38 @@ unicode_name_character (const char *name)                            }                        }                    } +                /* Special case for variation selectors. Keeps the +                   tables small.  */ +                if (wordptr == &words[1] +                    && words[0] == UNICODE_CHARNAME_WORD_VARIATION +                    && p1 + 10 <= ptr +                    && p1 + 12 >= ptr +                    && memcmp (p1, "SELECTOR-", 9) == 0) +                  { +                    const char *p2 = p1 + 9; + +                    if (*p2 != '0') +                      { +                        unsigned int c = 0; + +                        for (;;) +                          { +                            if (*p2 >= '0' && *p2 <= '9') +                              c += (*p2 - '0'); +                            p2++; +                            if (p2 == ptr) +                              { +                                if (c >= 1 && c <= 16) +                                  return c - 1 + 0xFE00; +                                else if (c >= 17 && c <= 256) +                                  return c - 17 + 0xE0100; +                                else +                                  break; +                              } +                            c = c * 10; +                          } +                      } +                  }                }            }            if (false) @@ -463,15 +582,15 @@ unicode_name_character (const char *name)                  for (; --i >= 0; )                    words[i] = 2 * words[i] + 1;                } -              /* Binary search in unicode_name_to_code.  */ +              /* Binary search in unicode_name_to_index.  */                {                  unsigned int i1 = 0; -                unsigned int i2 = SIZEOF (unicode_name_to_code); +                unsigned int i2 = SIZEOF (unicode_name_to_index);                  for (;;)                    {                      unsigned int i = (i1 + i2) >> 1;                      const uint16_t *w = words; -                    const uint16_t *p = &unicode_names[unicode_name_to_code[i].name]; +                    const uint16_t *p = &unicode_names[unicode_name_to_index[i].name];                      unsigned int n = words_length;                      for (;;)                        { @@ -493,18 +612,7 @@ unicode_name_character (const char *name)                            }                          p++; w++; n--;                          if (n == 0) -                          { -                            unsigned int c = unicode_name_to_code[i].code; - -                            /* Undo the transformation to 16-bit space.  */ -                            static const unsigned int offset[13] = -                              { -                                0x00000, 0x00000, 0x00000, 0x00000, 0x00000, -                                0x05000, 0x09000, 0x09000, 0x0A000, 0x14000, -                                0x15000, 0x24000, 0xD4000 -                              }; -                            return c + offset[c >> 12]; -                          } +                          return unicode_index_to_code (unicode_name_to_index[i].index);                        }                    }                } | 
