diff options
Diffstat (limited to 'lib/unigbrk/u-grapheme-next.h')
| -rw-r--r-- | lib/unigbrk/u-grapheme-next.h | 159 | 
1 files changed, 159 insertions, 0 deletions
| diff --git a/lib/unigbrk/u-grapheme-next.h b/lib/unigbrk/u-grapheme-next.h new file mode 100644 index 00000000..9ca07436 --- /dev/null +++ b/lib/unigbrk/u-grapheme-next.h @@ -0,0 +1,159 @@ +/* Grapheme cluster break function. +   Copyright (C) 2010-2025 Free Software Foundation, Inc. + +   This file is free software. +   It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". +   You can redistribute it and/or modify it under either +     - the terms of the GNU Lesser General Public License as published +       by the Free Software Foundation, either version 3, or (at your +       option) any later version, or +     - the terms of the GNU General Public License as published by the +       Free Software Foundation; either version 2, or (at your option) +       any later version, or +     - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+". + +   This file is distributed in the hope that it will be useful, +   but WITHOUT ANY WARRANTY; without even the implied warranty of +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU +   Lesser General Public License and the GNU General Public License +   for more details. + +   You should have received a copy of the GNU Lesser General Public +   License and of the GNU General Public License along with this +   program.  If not, see <https://www.gnu.org/licenses/>.  */ + +/* Written by Ben Pfaff, Daiki Ueno, Bruno Haible.  */ + +/* This file implements section 3 "Grapheme Cluster Boundaries" +   of Unicode Standard Annex #29 <https://www.unicode.org/reports/tr29/>.  */ + +const UNIT * +FUNC (const UNIT *s, const UNIT *s_end) +{ +  if (s == s_end) +    return NULL; + +  /* Grapheme Cluster break property of the last character. +     -1 at the very beginning of the string.  */ +  int last_char_prop = -1; + +  /* True if the last character ends a sequence of Indic_Conjunct_Break +     values:  consonant {extend|linker}*  */ +  bool incb_consonant_extended = false; +  /* True if the last character ends a sequence of Indic_Conjunct_Break +     values:  consonant {extend|linker}* linker  */ +  bool incb_consonant_extended_linker = false; +  /* True if the last character ends a sequence of Indic_Conjunct_Break +     values:  consonant {extend|linker}* linker {extend|linker}*  */ +  bool incb_consonant_extended_linker_extended = false; + +  /* True if the last character ends an emoji modifier sequence +     \p{Extended_Pictographic} Extend*.  */ +  bool emoji_modifier_sequence = false; +  /* True if the last character was immediately preceded by an +     emoji modifier sequence   \p{Extended_Pictographic} Extend*.  */ +  bool emoji_modifier_sequence_before_last_char = false; + +  /* Number of consecutive regional indicator (RI) characters seen +     immediately before the current point.  */ +  size_t ri_count = 0; + +  do +    { +      ucs4_t uc; +      int count = U_MBTOUC (&uc, s, s_end - s); +      int prop = uc_graphemeclusterbreak_property (uc); +      int incb = uc_indic_conjunct_break (uc); + +      /* Break at the start of the string (GB1).  */ +      if (last_char_prop < 0) +        /* *p = 1 */; +      else +        { +          /* No break between CR and LF (GB3).  */ +          if (last_char_prop == GBP_CR && prop == GBP_LF) +            /* *p = 0 */; +          /* Break before and after newlines (GB4, GB5).  */ +          else if ((last_char_prop == GBP_CR +                    || last_char_prop == GBP_LF +                    || last_char_prop == GBP_CONTROL) +                   || (prop == GBP_CR +                       || prop == GBP_LF +                       || prop == GBP_CONTROL)) +            break /* *p = 1 */; +          /* No break between Hangul syllable sequences (GB6, GB7, GB8).  */ +          else if ((last_char_prop == GBP_L +                    && (prop == GBP_L +                        || prop == GBP_V +                        || prop == GBP_LV +                        || prop == GBP_LVT)) +                   || ((last_char_prop == GBP_LV +                        || last_char_prop == GBP_V) +                       && (prop == GBP_V +                           || prop == GBP_T)) +                   || ((last_char_prop == GBP_LVT +                        || last_char_prop == GBP_T) +                       && prop == GBP_T)) +            /* *p = 0 */; +          /* No break before extending characters or ZWJ (GB9).  */ +          else if (prop == GBP_EXTEND || prop == GBP_ZWJ) +            /* *p = 0 */; +          /* No break before SpacingMarks (GB9a).  */ +          else if (prop == GBP_SPACINGMARK) +            /* *p = 0 */; +          /* No break after Prepend characters (GB9b).  */ +          else if (last_char_prop == GBP_PREPEND) +            /* *p = 0 */; +          /* No break within certain combinations of Indic_Conjunct_Break +             values: Between +               consonant {extend|linker}* linker {extend|linker}* +             and +               consonant +             (GB9c).  */ +          else if (incb_consonant_extended_linker_extended +                   && incb == UC_INDIC_CONJUNCT_BREAK_CONSONANT) +            /* *p = 0 */; +          /* No break within emoji modifier sequences or emoji zwj sequences +             (GB11).  */ +          else if (last_char_prop == GBP_ZWJ +                   && emoji_modifier_sequence_before_last_char +                   && uc_is_property_extended_pictographic (uc)) +            /* *p = 0 */; +          /* No break between RI if there is an odd number of RI +             characters before (GB12, GB13).  */ +          else if (prop == GBP_RI && (ri_count % 2) != 0) +            /* *p = 0 */; +          /* Break everywhere (GB999).  */ +          else +            break /* *p = 1 */; +        } + +      incb_consonant_extended_linker = +        incb_consonant_extended && incb == UC_INDIC_CONJUNCT_BREAK_LINKER; +      incb_consonant_extended_linker_extended = +        (incb_consonant_extended_linker +         || (incb_consonant_extended_linker_extended +             && incb >= UC_INDIC_CONJUNCT_BREAK_LINKER)); +      incb_consonant_extended = +        (incb == UC_INDIC_CONJUNCT_BREAK_CONSONANT +         || (incb_consonant_extended +             && incb >= UC_INDIC_CONJUNCT_BREAK_LINKER)); + +      emoji_modifier_sequence_before_last_char = emoji_modifier_sequence; +      emoji_modifier_sequence = +        (emoji_modifier_sequence && prop == GBP_EXTEND) +        || uc_is_property_extended_pictographic (uc); + +      last_char_prop = prop; + +      if (prop == GBP_RI) +        ri_count++; +      else +        ri_count = 0; + +      s += count; +    } +  while (s < s_end); + +  return s; +} | 
