diff options
Diffstat (limited to 'lib/unigbrk/u-grapheme-prev.h')
| -rw-r--r-- | lib/unigbrk/u-grapheme-prev.h | 233 | 
1 files changed, 233 insertions, 0 deletions
| diff --git a/lib/unigbrk/u-grapheme-prev.h b/lib/unigbrk/u-grapheme-prev.h new file mode 100644 index 00000000..0894d599 --- /dev/null +++ b/lib/unigbrk/u-grapheme-prev.h @@ -0,0 +1,233 @@ +/* Grapheme cluster break function. +   Copyright (C) 2010-2025 Free Software Foundation, Inc. + +   This file is free software. +   It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". +   You can redistribute it and/or modify it under either +     - the terms of the GNU Lesser General Public License as published +       by the Free Software Foundation, either version 3, or (at your +       option) any later version, or +     - the terms of the GNU General Public License as published by the +       Free Software Foundation; either version 2, or (at your option) +       any later version, or +     - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+". + +   This file is distributed in the hope that it will be useful, +   but WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   Lesser General Public License and the GNU General Public License +   for more details. + +   You should have received a copy of the GNU Lesser General Public +   License and of the GNU General Public License along with this +   program.  If not, see <https://www.gnu.org/licenses/>.  */ + +/* Written by Bruno Haible <bruno@clisp.org>, 2025.  */ + +/* This file implements section 3 "Grapheme Cluster Boundaries" +   of Unicode Standard Annex #29 <https://www.unicode.org/reports/tr29/> +   backwards.  */ + +/* Returns true if the string [s_start, s) ends with a sequence of +   Indic_Conjunct_Break values like: +     consonant {extend|linker}* linker {extend|linker}* + */ +static bool +ends_with_incb_consonant_extended_linker_extended (const UNIT *s, +                                                   const UNIT *s_start) +{ +  /* Look for +       consonant {extend|linker}* +     with at least one linker.  */ +  bool seen_linker = false; + +  while (s > s_start) +    { +      const UNIT *prev_s; +      ucs4_t uc; + +      prev_s = U_PREV (&uc, s, s_start); +      if (prev_s == NULL) +        /* Ill-formed UTF-8 encoding. */ +        break; + +      int incb = uc_indic_conjunct_break (uc); +      if (incb == UC_INDIC_CONJUNCT_BREAK_CONSONANT) +        return seen_linker; +      if (!(incb >= UC_INDIC_CONJUNCT_BREAK_LINKER)) +        break; +      seen_linker |= (incb == UC_INDIC_CONJUNCT_BREAK_LINKER); + +      s = prev_s; +    } + +  return false; +} + +/* Returns true if the string [s_start, s) ends with a sequence of +   characters like: +     \p{Extended_Pictographic} Extend* + */ +static bool +ends_with_emoji_modifier_sequence (const UNIT *s, const UNIT *s_start) +{ +  while (s > s_start) +    { +      const UNIT *prev_s; +      ucs4_t uc; + +      prev_s = U_PREV (&uc, s, s_start); +      if (prev_s == NULL) +        /* Ill-formed UTF-8 encoding. */ +        break; + +      if (uc_is_property_extended_pictographic (uc)) +        return true; + +      if (uc_graphemeclusterbreak_property (uc) != GBP_EXTEND) +        break; + +      s = prev_s; +    } + +  return false; +} + +/* Returns the number of consecutive regional indicator (RI) characters +   at the end of the string [s_start, s).  */ +static size_t +ends_with_ri_count (const UNIT *s, const UNIT *s_start) +{ +  size_t ri_count = 0; + +  while (s > s_start) +    { +      const UNIT *prev_s; +      ucs4_t uc; + +      prev_s = U_PREV (&uc, s, s_start); +      if (prev_s == NULL) +        /* Ill-formed UTF-8 encoding. */ +        break; + +      if (uc_graphemeclusterbreak_property (uc) == GBP_RI) +        ri_count++; +      else +        break; + +      s = prev_s; +    } + +  return ri_count; +} + +const UNIT * +FUNC (const UNIT *s, const UNIT *s_start) +{ +  if (s == s_start) +    return NULL; + +  /* Traverse the string backwards, from s down to s_start.  */ + +  /* Grapheme Cluster break property of the next character. +     -1 at the very end of the string.  */ +  int next_char_prop = -1; + +  /* Indic_Conjunct_Break property of the next character. +     -1 at the very end of the string.  */ +  int next_char_incb = -1; + +  /* Extended_Pictographic property of the next character. +     false at the very end of the string.  */ +  bool next_char_epic = false; + +  do +    { +      const UNIT *prev_s; +      ucs4_t uc; + +      prev_s = U_PREV (&uc, s, s_start); +      if (prev_s == NULL) +        { +          /* Ill-formed UTF-8 encoding. */ +          return s_start; +        } + +      int prop = uc_graphemeclusterbreak_property (uc); +      int incb = uc_indic_conjunct_break (uc); +      bool epic = uc_is_property_extended_pictographic (uc); + +      /* Break at the end of the string (GB2).  */ +      if (next_char_prop < 0) +        /* *p = 1 */; +      else +        { +          /* No break between CR and LF (GB3).  */ +          if (prop == GBP_CR && next_char_prop == GBP_LF) +            /* *p = 0 */; +          /* Break before and after newlines (GB4, GB5).  */ +          else if ((prop == GBP_CR +                    || prop == GBP_LF +                    || prop == GBP_CONTROL) +                   || (next_char_prop == GBP_CR +                       || next_char_prop == GBP_LF +                       || next_char_prop == GBP_CONTROL)) +            break /* *p = 1 */; +          /* No break between Hangul syllable sequences (GB6, GB7, GB8).  */ +          else if ((prop == GBP_L +                    && (next_char_prop == GBP_L +                        || next_char_prop == GBP_V +                        || next_char_prop == GBP_LV +                        || next_char_prop == GBP_LVT)) +                   || ((prop == GBP_LV +                        || prop == GBP_V) +                       && (next_char_prop == GBP_V +                           || next_char_prop == GBP_T)) +                   || ((prop == GBP_LVT +                        || prop == GBP_T) +                       && next_char_prop == GBP_T)) +            /* *p = 0 */; +          /* No break before extending characters or ZWJ (GB9).  */ +          else if (next_char_prop == GBP_EXTEND || next_char_prop == GBP_ZWJ) +            /* *p = 0 */; +          /* No break before SpacingMarks (GB9a).  */ +          else if (next_char_prop == GBP_SPACINGMARK) +            /* *p = 0 */; +          /* No break after Prepend characters (GB9b).  */ +          else if (prop == GBP_PREPEND) +            /* *p = 0 */; +          /* No break within certain combinations of Indic_Conjunct_Break +             values: Between +               consonant {extend|linker}* linker {extend|linker}* +             and +               consonant +             (GB9c).  */ +          else if (next_char_incb == UC_INDIC_CONJUNCT_BREAK_CONSONANT +                   && ends_with_incb_consonant_extended_linker_extended (s, s_start)) +            /* *p = 0 */; +          /* No break within emoji modifier sequences or emoji zwj sequences +             (GB11).  */ +          else if (next_char_epic +                   && prop == GBP_ZWJ +                   && ends_with_emoji_modifier_sequence (prev_s, s_start)) +            /* *p = 0 */; +          /* No break between RI if there is an odd number of RI +             characters before (GB12, GB13).  */ +          else if (next_char_prop == GBP_RI +                   && prop == GBP_RI +                   && (ends_with_ri_count (prev_s, s_start) % 2) == 0) +            /* *p = 0 */; +          /* Break everywhere (GB999).  */ +          else +            break /* *p = 1 */; +        } + +      s = prev_s; +      next_char_prop = prop; +      next_char_incb = incb; +      next_char_epic = epic; +    } +  while (s > s_start); + +  return s; +} | 
