diff options
| author | Jörg Frings-Fürst <debian@jff-webhosting.net> | 2017-09-01 18:53:23 +0200 | 
|---|---|---|
| committer | Jörg Frings-Fürst <debian@jff-webhosting.net> | 2017-09-01 18:53:23 +0200 | 
| commit | b62fc1758f4ae8459e6d7e8386ca547274b4daa2 (patch) | |
| tree | 7665bd379e46db015577fe6851b07b4fe2b6a1c9 /src/unicode.c | |
| parent | ac077032be00edc79afc21983f50bc1cdf9af907 (diff) | |
New upstream version 6.6.1 [upstream/6.6.1]
Diffstat (limited to 'src/unicode.c')
| -rw-r--r-- | src/unicode.c | 200 | 
1 files changed, 200 insertions, 0 deletions
| diff --git a/src/unicode.c b/src/unicode.c index 5b6b3e7..65de74a 100644 --- a/src/unicode.c +++ b/src/unicode.c @@ -664,3 +664,203 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,    return n;  } + + +#ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER + +enum EGCB_BREAK_TYPE { +  EGCB_NOT_BREAK = 0, +  EGCB_BREAK     = 1, +  EGCB_BREAK_UNDEF_E_MODIFIER = 2, +  EGCB_BREAK_UNDEF_RI_RI = 3 +}; + +enum EGCB_TYPE { +  EGCB_Other   = 0, +  EGCB_CR      = 1, +  EGCB_LF      = 2, +  EGCB_Control = 3, +  EGCB_Extend  = 4, +  EGCB_Prepend = 5, +  EGCB_Regional_Indicator = 6, +  EGCB_SpacingMark = 7, +  EGCB_ZWJ         = 8, +  EGCB_E_Base         = 9, +  EGCB_E_Base_GAZ     = 10, +  EGCB_E_Modifier     = 11, +  EGCB_Glue_After_Zwj = 12, +  EGCB_L   = 13, +  EGCB_LV  = 14, +  EGCB_LVT = 15, +  EGCB_T   = 16, +  EGCB_V   = 17 +}; + +typedef struct { +  OnigCodePoint  start; +  OnigCodePoint  end; +  enum EGCB_TYPE type; +} EGCB_RANGE_TYPE; + +#include "unicode_egcb_data.c" + +static enum EGCB_TYPE +egcb_get_type(OnigCodePoint code) +{ +  OnigCodePoint low, high, x; +  enum EGCB_TYPE type; + +  for (low = 0, high = EGCB_RANGE_NUM; low < high; ) { +    x = (low + high) >> 1; +    if (code > EGCB_RANGES[x].end) +      low = x + 1; +    else +      high = x; +  } + +  type = (low < EGCB_RANGE_NUM && code >= EGCB_RANGES[low].start) ? +    EGCB_RANGES[low].type : EGCB_Other; +  return type; +} + +#define IS_CONTROL_CR_LF(code)   ((code) <= EGCB_Control && (code) >= EGCB_CR) +#define IS_HANGUL(code)          ((code) >= EGCB_L) + +/* GB1 and GB2 are outside of this function. 
*/ +static enum EGCB_BREAK_TYPE +unicode_egcb_is_break_2code(OnigCodePoint from_code, OnigCodePoint to_code) +{ +  enum EGCB_TYPE from; +  enum EGCB_TYPE to; + +  from = egcb_get_type(from_code); +  to   = egcb_get_type(to_code); + +  /* short cut */ +  if (from == 0 && to == 0) goto GB999; + +  /* GB3 */ +  if (from == EGCB_CR && to == EGCB_LF) return EGCB_NOT_BREAK; +  /* GB4 */ +  if (IS_CONTROL_CR_LF(from)) return EGCB_BREAK; +  /* GB5 */ +  if (IS_CONTROL_CR_LF(to)) return EGCB_BREAK; + +  if (IS_HANGUL(from) && IS_HANGUL(to)) { +    /* GB6 */ +    if (from == EGCB_L && to != EGCB_T) return EGCB_NOT_BREAK; +    /* GB7 */ +    if ((from == EGCB_LV || from == EGCB_V) +        && (to == EGCB_V || to == EGCB_T)) return EGCB_NOT_BREAK; + +    /* GB8 */ +    if ((from == EGCB_LVT || from == EGCB_T) && (to == EGCB_T)) +      return EGCB_NOT_BREAK; + +    goto GB999; +  } + +  /* GB9 */ +  if (to == EGCB_Extend || to == EGCB_ZWJ) return EGCB_NOT_BREAK; + +  /* GB9a */ +  if (to == EGCB_SpacingMark) return EGCB_NOT_BREAK; +  /* GB9b */ +  if (from == EGCB_Prepend) return EGCB_NOT_BREAK; + +  /* GB10 */ +  if (to == EGCB_E_Modifier) { +    if (from == EGCB_E_Base || from == EGCB_E_Base_GAZ) return EGCB_NOT_BREAK; +    if (from == EGCB_Extend) return EGCB_BREAK_UNDEF_E_MODIFIER; +    goto GB999; +  } + +  /* GB11 */ +  if (from == EGCB_ZWJ) { +    if (to == EGCB_Glue_After_Zwj || to == EGCB_E_Base_GAZ) return EGCB_NOT_BREAK; +    goto GB999; +  } + +  /* GB12, GB13 */ +  if (from == EGCB_Regional_Indicator && to == EGCB_Regional_Indicator) { +    return EGCB_BREAK_UNDEF_RI_RI; +  } + + GB999: +  return EGCB_BREAK; +} + +#endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */ + +extern int +onigenc_egcb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev, +                               const UChar* start, const UChar* end) +{ +  OnigCodePoint from; +  OnigCodePoint to; +#ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER +  enum EGCB_BREAK_TYPE btype; +  enum EGCB_TYPE type; 
+#endif + +  /* GB1 and GB2 */ +  if (p == start) return 1; +  if (p == end)   return 1; + +  if (IS_NULL(prev)) { +    prev = onigenc_get_prev_char_head(enc, start, p); +    if (IS_NULL(prev)) return 1; +  } + +  from = ONIGENC_MBC_TO_CODE(enc, prev, end); +  to   = ONIGENC_MBC_TO_CODE(enc, p, end); + +#ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER +  if (! ONIGENC_IS_UNICODE_ENCODING(enc)) { +    if (from == 0x000d && to == 0x000a) return 0; +    else return 1; +  } + +  btype = unicode_egcb_is_break_2code(from, to); +  switch (btype) { +  case EGCB_NOT_BREAK: +    return 0; +    break; +  case EGCB_BREAK: +    return 1; +    break; + +  case EGCB_BREAK_UNDEF_E_MODIFIER: +    while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) { +      from = ONIGENC_MBC_TO_CODE(enc, prev, end); +      type = egcb_get_type(from); +      if (type == EGCB_E_Base || type == EGCB_E_Base_GAZ) +        return 0; +      if (type != EGCB_Extend) +        break; +    } +    break; + +  case EGCB_BREAK_UNDEF_RI_RI: +    { +      int n = 0; +      while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) { +        from = ONIGENC_MBC_TO_CODE(enc, prev, end); +        type = egcb_get_type(from); +        if (type != EGCB_Regional_Indicator) +          break; + +        n++; +      } +      if ((n % 2) == 0) return 0; +    } +    break; +  } + +  return 1; + +#else +  if (from == 0x000d && to == 0x000a) return 0; +  else return 1; +#endif /* USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER */ +} | 
