diff options
Diffstat (limited to 'lib/unilbrk/u16-possible-linebreaks.c')
| -rw-r--r-- | lib/unilbrk/u16-possible-linebreaks.c | 310 | 
1 files changed, 230 insertions, 80 deletions
| diff --git a/lib/unilbrk/u16-possible-linebreaks.c b/lib/unilbrk/u16-possible-linebreaks.c index 6a9c15b7..ac655f8c 100644 --- a/lib/unilbrk/u16-possible-linebreaks.c +++ b/lib/unilbrk/u16-possible-linebreaks.c @@ -45,25 +45,88 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding,  {    if (n > 0)      { -      int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID1 : LBP_AL); -      const uint16_t *s_end = s + n; -      int prev_prop = LBP_BK; /* line break property of last character */ -      int last_prop = LBP_BK; /* line break property of last non-space character */ -      char *seen_space = NULL; /* Was a space seen after the last non-space character? */ +      int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID1 : LBP_AL1);        /* Don't break inside multibyte characters.  */        memset (p, UC_BREAK_PROHIBITED, n); +      const uint16_t *s_end = s + n; + +      /* We need 2 characters of lookahead: +           - 1 character of lookahead for (LB15c,LB19a,LB28a), +           - 2 characters of lookahead for (LB25).  */ +      const uint16_t *lookahead1_end; +      ucs4_t lookahead1_uc; +      int lookahead1_prop_ea; +      const uint16_t *lookahead2_end; +      ucs4_t lookahead2_uc; +      int lookahead2_prop_ea; +      /* Get the first lookahead character.  */ +      lookahead1_end = s; +      lookahead1_end += u16_mbtouc_unsafe (&lookahead1_uc, lookahead1_end, s_end - lookahead1_end); +      lookahead1_prop_ea = unilbrkprop_lookup (lookahead1_uc); +      /* Get the second lookahead character.  */ +      lookahead2_end = lookahead1_end; +      if (lookahead2_end < s_end) +        { +          lookahead2_end += u16_mbtouc_unsafe (&lookahead2_uc, lookahead2_end, s_end - lookahead2_end); +          lookahead2_prop_ea = unilbrkprop_lookup (lookahead2_uc); +        } +      else +        { +          lookahead2_uc = 0xFFFD; +          lookahead2_prop_ea = PROP_EA (LBP_BK, 0); +        } + +      int preceding_prop = LBP_BK; /* line break property of preceding character */ +      int prev_prop = LBP_BK; /* line break property of previous character +                                 (= last character, ignoring intervening characters of class CM or ZWJ) */ +      int prev_ea = 0;        /* EastAsian property of previous character +                                 (= last character, ignoring intervening characters of class CM or ZWJ) */ +      int prev2_ea = 0;       /* EastAsian property of character before the previous character */ +      bool prev_initial_hyphen = false; /* the previous character was a +                                           word-initial hyphen or U+2010 */ +      bool prev_nus = false; /* before the previous character, there was a character +                                with line break property LBP_NU and since then +                                only characters with line break property LBP_SY +                                or LBP_IS */ +      int last_prop = LBP_BK; /* line break property of last non-space character +                                 (= last character, ignoring intervening characters of class SP or CM or ZWJ) */ +      char *seen_space = NULL; /* Was a space seen after the last non-space character? */ +        /* Number of consecutive regional indicator (RI) characters seen           immediately before the current point.  */        size_t ri_count = 0;        do          { -          ucs4_t uc; -          int count = u16_mbtouc_unsafe (&uc, s, s_end - s); -          s += count; -          int prop = unilbrkprop_lookup (uc); +          /* Read the next character.  */ +          size_t count = lookahead1_end - s; +          s = lookahead1_end; +          ucs4_t uc = lookahead1_uc; +          int prop_ea = lookahead1_prop_ea; /* = unilbrkprop_lookup (uc); */ +          int prop = PROP (prop_ea); /* line break property of uc */ +          int ea = EA (prop_ea);     /* EastAsian property of uc */ +          /*  Refill the pipeline of 2 lookahead characters.  */ +          lookahead1_end = lookahead2_end; +          lookahead1_uc = lookahead2_uc; +          lookahead1_prop_ea = lookahead2_prop_ea; +          if (lookahead2_end < s_end) +            { +              lookahead2_end += u16_mbtouc_unsafe (&lookahead2_uc, lookahead2_end, s_end - lookahead2_end); +              lookahead2_prop_ea = unilbrkprop_lookup (lookahead2_uc); +            } +          else +            { +              lookahead2_uc = 0xFFFD; +              lookahead2_prop_ea = PROP_EA (LBP_BK, 0); +            } + +          bool nus = /* ending at the previous character, there was a character +                        with line break property LBP_NU and since then only +                        characters with line break property LBP_SY or LBP_IS */ +            (prev_prop == LBP_NU +             || (prev_nus && (prev_prop == LBP_SY || prev_prop == LBP_IS)));            if (prop == LBP_BK || prop == LBP_LF || prop == LBP_CR)              { @@ -73,7 +136,6 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding,                   a CR-LF sequence.  */                if (prev_prop == cr && prop == LBP_LF)                  p[-1] = UC_BREAK_CR_BEFORE_LF; -              prev_prop = prop;                last_prop = LBP_BK;                seen_space = NULL;              } @@ -95,63 +157,7 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding,                       Treat LBP_SA like LBP_XX.  */                  case LBP_XX:                    /* This is arbitrary.  */ -                  prop = LBP_AL; -                  break; -                case LBP_QU2: -                  /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the previous -                     character's line break property was not one of -                     BK, CR, LF, OP, QU, GL, SP, ZW.  */ -                  switch (prev_prop) -                    { -                    case LBP_BK: -                    case LBP_CR: -                    case LBP_LF: -                    case LBP_OP1: case LBP_OP2: -                    case LBP_QU1: case LBP_QU2: case LBP_QU3: -                    case LBP_GL: -                    case LBP_SP: -                    case LBP_ZW: -                      break; -                    default: -                      prop = LBP_QU1; -                      break; -                    } -                  break; -                case LBP_QU3: -                  /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the next -                     character's line break property is not one of -                     BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW.  */ -                  { -                    int next_prop; -                    if (s < s_end) -                      { -                        ucs4_t next_uc; -                        (void) u16_mbtouc_unsafe (&next_uc, s, s_end - s); -                        next_prop = unilbrkprop_lookup (next_uc); -                      } -                    else -                      next_prop = LBP_BK; -                    switch (next_prop) -                      { -                      case LBP_BK: -                      case LBP_CR: -                      case LBP_LF: -                      case LBP_SP: -                      case LBP_GL: -                      case LBP_WJ: -                      case LBP_CL: -                      case LBP_QU1: case LBP_QU2: case LBP_QU3: -                      case LBP_CP1: case LBP_CP2: -                      case LBP_EX: -                      case LBP_IS: -                      case LBP_SY: -                      case LBP_ZW: -                        break; -                      default: -                        prop = LBP_QU1; -                        break; -                      } -                  } +                  prop = LBP_AL1;                    break;                  } @@ -179,10 +185,15 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding,                        /* (LB4,LB5,LB6) Don't break at the beginning of a line.  */                        *p = UC_BREAK_PROHIBITED;                        /* (LB10) Treat CM or ZWJ as AL.  */ -                      last_prop = LBP_AL; +                      last_prop = LBP_AL1;                        seen_space = NULL;                      } -                  else if (last_prop == LBP_ZW || seen_space != NULL) +                  else if (last_prop == LBP_ZW +                           || (seen_space != NULL +                               /* (LB14) has higher priority than (LB18).  */ +                               && !(last_prop == LBP_OP1 || last_prop == LBP_OP2) +                               /* (LB15a) has higher priority than (LB18).  */ +                               && !(last_prop == LBP_QU2)))                      {                        /* (LB8) Break after zero-width space.  */                        /* (LB18) Break after spaces. @@ -191,7 +202,7 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding,                           NBSP CM sequence is recommended instead of SP CM.  */                        *p = UC_BREAK_POSSIBLE;                        /* (LB10) Treat CM or ZWJ as AL.  */ -                      last_prop = LBP_AL; +                      last_prop = LBP_AL1;                        seen_space = NULL;                      }                    else @@ -216,11 +227,82 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding,                        /* (LB8) Break after zero-width space.  */                        *p = UC_BREAK_POSSIBLE;                      } -                  else if (prev_prop == LBP_ZWJ) +                  else if (preceding_prop == LBP_ZWJ)                      {                        /* (LB8a) Don't break right after a zero-width joiner.  */                        *p = UC_BREAK_PROHIBITED;                      } +                  else if (prop == LBP_IS && prev_prop == LBP_SP +                           && PROP (lookahead1_prop_ea) == LBP_NU) +                    { +                      /* (LB15c) Break before a decimal mark that follows a space.  */ +                      *p = UC_BREAK_POSSIBLE; +                    } +                  else if (((prop == LBP_QU1 || prop == LBP_QU2 || prop == LBP_QU3) +                            && (! prev_ea || ! EA (lookahead1_prop_ea)) +                            /* (LB18) has higher priority than (LB19a).  */ +                            && prev_prop != LBP_SP) +                           || ((prev_prop == LBP_QU1 || prev_prop == LBP_QU2 || prev_prop == LBP_QU3) +                               && (! prev2_ea || ! ea))) +                    { +                      /* (LB19a) Don't break on either side of ambiguous +                         quotation marks, except next to an EastAsian character.  */ +                      *p = UC_BREAK_PROHIBITED; +                    } +                  else if (prev_initial_hyphen +                           && (prop == LBP_AL1 || prop == LBP_AL2)) +                    { +                      /* (LB20a) Don't break after a word-initial hyphen.  */ +                      *p = UC_BREAK_PROHIBITED; +                    } +                  else if (prev_prop == LBP_HL_BA && prop != LBP_HL) +                    { +                      /* (LB21a) Don't break after Hebrew + Hyphen/Break-After, +                         before non-Hebrew.  */ +                      *p = UC_BREAK_PROHIBITED; +                    } +                  else if ((prev_nus +                            && (prev_prop == LBP_CL +                                || prev_prop == LBP_CP1 || prev_prop == LBP_CP2) +                            && (prop == LBP_PO || prop == LBP_PR)) +                           || (nus && (prop == LBP_PO || prop == LBP_PR +                                       || prop == LBP_NU))) +                    { +                      /* (LB25) Don't break numbers.  */ +                      *p = UC_BREAK_PROHIBITED; +                    } +                  else if ((prev_prop == LBP_PO || prev_prop == LBP_PR) +                           && (prop == LBP_OP1 || prop == LBP_OP2) +                           && (PROP (lookahead1_prop_ea) == LBP_NU +                               || (PROP (lookahead1_prop_ea) == LBP_IS +                                   && PROP (lookahead2_prop_ea) == LBP_NU))) +                    { +                      /* (LB25) Don't break numbers.  */ +                      *p = UC_BREAK_PROHIBITED; +                    } +                  else if (prev_prop == LBP_AKLS_VI +                           && (prop == LBP_AK || prop == LBP_AL2)) +                    { +                      /* (LB28a) Don't break inside orthographic syllables of +                         Brahmic scripts, line 3.  */ +                      *p = UC_BREAK_PROHIBITED; +                    } +                  else if (PROP (lookahead1_prop_ea) == LBP_VF +                           && (prop == LBP_AK || prop == LBP_AL2 || prop == LBP_AS) +                           && (prev_prop == LBP_AK || prev_prop == LBP_AL2 || prev_prop == LBP_AS)) +                    { +                      /* (LB28a) Don't break inside orthographic syllables of +                         Brahmic scripts, line 4.  */ +                      *p = UC_BREAK_PROHIBITED; +                    } +                  else if (last_prop == LBP_IS && uc == 0x003C) +                    { +                      /* Partially disable (LB29) Do not break between numeric +                         punctuation and alphabetics ("e.g.").  We find it +                         desirable to break before the HTML tag "</P>" in +                         strings like "<P>Some sentence.</P>".  */ +                      *p = UC_BREAK_POSSIBLE; +                    }                    else if (last_prop == LBP_RI && prop == LBP_RI)                      {                        /* (LB30a) Break between two regional indicator symbols @@ -230,14 +312,37 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding,                              ? UC_BREAK_POSSIBLE                              : UC_BREAK_PROHIBITED);                      } -                  else if (prev_prop == LBP_HL_BA) -                    { -                      /* (LB21a) Don't break after Hebrew + Hyphen/Break-After.  */ -                      *p = UC_BREAK_PROHIBITED; -                    }                    else                      { -                      switch (unilbrk_table [last_prop] [prop]) +                      int this_prop = prop; +                      if (prop == LBP_QU3) +                        { +                          /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the +                             next character's line break property is not one of +                             BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW.  */ +                          switch (PROP (lookahead1_prop_ea)) +                            { +                            case LBP_BK: +                            case LBP_CR: +                            case LBP_LF: +                            case LBP_SP: +                            case LBP_GL: +                            case LBP_WJ: +                            case LBP_CL: +                            case LBP_QU1: case LBP_QU2: case LBP_QU3: +                            case LBP_CP1: case LBP_CP2: +                            case LBP_EX: +                            case LBP_IS: +                            case LBP_SY: +                            case LBP_ZW: +                              break; +                            default: +                              this_prop = LBP_QU1; +                              break; +                            } +                        } + +                      switch (unilbrk_table [last_prop] [this_prop])                          {                          case D:                            *p = UC_BREAK_POSSIBLE; @@ -252,15 +357,60 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding,                            abort ();                          }                      } + +                  if (prop == LBP_QU2) +                    { +                      /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the +                         previous character's line break property was not one of +                         BK, CR, LF, OP, QU, GL, SP, ZW.  */ +                      switch (prev_prop) +                        { +                        case LBP_BK: +                        case LBP_CR: +                        case LBP_LF: +                        case LBP_OP1: case LBP_OP2: +                        case LBP_QU1: case LBP_QU2: case LBP_QU3: +                        case LBP_GL: +                        case LBP_SP: +                        case LBP_ZW: +                          break; +                        default: +                          prop = LBP_QU1; +                          break; +                        } +                    } +                    last_prop = prop;                    seen_space = NULL;                  } +            } -              prev_prop = (prev_prop == LBP_HL && (prop == LBP_HY || prop == LBP_BA) -                           ? LBP_HL_BA -                           : prop); +          /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line +             break class except BK, CR, LF, NL, SP, or ZW.  */ +          if (!((prop == LBP_CM || prop == LBP_ZWJ) +                && !(prev_prop == LBP_BK || prev_prop == LBP_LF || prev_prop == LBP_CR +                     || prev_prop == LBP_SP || prev_prop == LBP_ZW))) +            { +              prev_initial_hyphen = +                (prop == LBP_HY || uc == 0x2010) +                && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == LBP_LF +                    || prev_prop == LBP_SP || prev_prop == LBP_ZW +                    || prev_prop == LBP_CB || prev_prop == LBP_GL); +              prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK +                                              || prev_prop == LBP_AL2 +                                              || prev_prop == LBP_AS) +                           ? LBP_AKLS_VI : +                           prev_prop == LBP_HL && (prop == LBP_HY +                                                   || (prop == LBP_BA && !ea)) +                           ? LBP_HL_BA : +                           prop); +              prev2_ea = prev_ea; +              prev_ea = ea; +              prev_nus = nus;              } +          preceding_prop = prop; +            if (prop == LBP_RI)              ri_count++;            else | 
