diff options
Diffstat (limited to 'lib/unigbrk/u-grapheme-next.h')
-rw-r--r-- | lib/unigbrk/u-grapheme-next.h | 159 |
1 files changed, 159 insertions, 0 deletions
diff --git a/lib/unigbrk/u-grapheme-next.h b/lib/unigbrk/u-grapheme-next.h new file mode 100644 index 00000000..9ca07436 --- /dev/null +++ b/lib/unigbrk/u-grapheme-next.h @@ -0,0 +1,159 @@ +/* Grapheme cluster break function. + Copyright (C) 2010-2025 Free Software Foundation, Inc. + + This file is free software. + It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". + You can redistribute it and/or modify it under either + - the terms of the GNU Lesser General Public License as published + by the Free Software Foundation, either version 3, or (at your + option) any later version, or + - the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) + any later version, or + - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+". + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License and the GNU General Public License + for more details. + + You should have received a copy of the GNU Lesser General Public + License and of the GNU General Public License along with this + program. If not, see <https://www.gnu.org/licenses/>. */ + +/* Written by Ben Pfaff, Daiki Ueno, Bruno Haible. */ + +/* This file implements section 3 "Grapheme Cluster Boundaries" + of Unicode Standard Annex #29 <https://www.unicode.org/reports/tr29/>. */ + +const UNIT * +FUNC (const UNIT *s, const UNIT *s_end) +{ + if (s == s_end) + return NULL; + + /* Grapheme Cluster break property of the last character. + -1 at the very beginning of the string. */ + int last_char_prop = -1; + + /* True if the last character ends a sequence of Indic_Conjunct_Break + values: consonant {extend|linker}* */ + bool incb_consonant_extended = false; + /* True if the last character ends a sequence of Indic_Conjunct_Break + values: consonant {extend|linker}* linker */ + bool incb_consonant_extended_linker = false; + /* True if the last character ends a sequence of Indic_Conjunct_Break + values: consonant {extend|linker}* linker {extend|linker}* */ + bool incb_consonant_extended_linker_extended = false; + + /* True if the last character ends an emoji modifier sequence + \p{Extended_Pictographic} Extend*. */ + bool emoji_modifier_sequence = false; + /* True if the last character was immediately preceded by an + emoji modifier sequence \p{Extended_Pictographic} Extend*. */ + bool emoji_modifier_sequence_before_last_char = false; + + /* Number of consecutive regional indicator (RI) characters seen + immediately before the current point. */ + size_t ri_count = 0; + + do + { + ucs4_t uc; + int count = U_MBTOUC (&uc, s, s_end - s); + int prop = uc_graphemeclusterbreak_property (uc); + int incb = uc_indic_conjunct_break (uc); + + /* Break at the start of the string (GB1). */ + if (last_char_prop < 0) + /* *p = 1 */; + else + { + /* No break between CR and LF (GB3). */ + if (last_char_prop == GBP_CR && prop == GBP_LF) + /* *p = 0 */; + /* Break before and after newlines (GB4, GB5). */ + else if ((last_char_prop == GBP_CR + || last_char_prop == GBP_LF + || last_char_prop == GBP_CONTROL) + || (prop == GBP_CR + || prop == GBP_LF + || prop == GBP_CONTROL)) + break /* *p = 1 */; + /* No break between Hangul syllable sequences (GB6, GB7, GB8). */ + else if ((last_char_prop == GBP_L + && (prop == GBP_L + || prop == GBP_V + || prop == GBP_LV + || prop == GBP_LVT)) + || ((last_char_prop == GBP_LV + || last_char_prop == GBP_V) + && (prop == GBP_V + || prop == GBP_T)) + || ((last_char_prop == GBP_LVT + || last_char_prop == GBP_T) + && prop == GBP_T)) + /* *p = 0 */; + /* No break before extending characters or ZWJ (GB9). */ + else if (prop == GBP_EXTEND || prop == GBP_ZWJ) + /* *p = 0 */; + /* No break before SpacingMarks (GB9a). */ + else if (prop == GBP_SPACINGMARK) + /* *p = 0 */; + /* No break after Prepend characters (GB9b). */ + else if (last_char_prop == GBP_PREPEND) + /* *p = 0 */; + /* No break within certain combinations of Indic_Conjunct_Break + values: Between + consonant {extend|linker}* linker {extend|linker}* + and + consonant + (GB9c). */ + else if (incb_consonant_extended_linker_extended + && incb == UC_INDIC_CONJUNCT_BREAK_CONSONANT) + /* *p = 0 */; + /* No break within emoji modifier sequences or emoji zwj sequences + (GB11). */ + else if (last_char_prop == GBP_ZWJ + && emoji_modifier_sequence_before_last_char + && uc_is_property_extended_pictographic (uc)) + /* *p = 0 */; + /* No break between RI if there is an odd number of RI + characters before (GB12, GB13). */ + else if (prop == GBP_RI && (ri_count % 2) != 0) + /* *p = 0 */; + /* Break everywhere (GB999). */ + else + break /* *p = 1 */; + } + + incb_consonant_extended_linker = + incb_consonant_extended && incb == UC_INDIC_CONJUNCT_BREAK_LINKER; + incb_consonant_extended_linker_extended = + (incb_consonant_extended_linker + || (incb_consonant_extended_linker_extended + && incb >= UC_INDIC_CONJUNCT_BREAK_LINKER)); + incb_consonant_extended = + (incb == UC_INDIC_CONJUNCT_BREAK_CONSONANT + || (incb_consonant_extended + && incb >= UC_INDIC_CONJUNCT_BREAK_LINKER)); + + emoji_modifier_sequence_before_last_char = emoji_modifier_sequence; + emoji_modifier_sequence = + (emoji_modifier_sequence && prop == GBP_EXTEND) + || uc_is_property_extended_pictographic (uc); + + last_char_prop = prop; + + if (prop == GBP_RI) + ri_count++; + else + ri_count = 0; + + s += count; + } + while (s < s_end); + + return s; +} |