diff options
Diffstat (limited to 'lib/unigbrk')
-rw-r--r-- | lib/unigbrk/gbrkprop.h | 126 | ||||
-rw-r--r-- | lib/unigbrk/u-grapheme-breaks.h | 9 | ||||
-rw-r--r-- | lib/unigbrk/u-grapheme-next.h | 159 | ||||
-rw-r--r-- | lib/unigbrk/u-grapheme-prev.h | 233 | ||||
-rw-r--r-- | lib/unigbrk/u16-grapheme-breaks.c | 2 | ||||
-rw-r--r-- | lib/unigbrk/u16-grapheme-next.c | 35 | ||||
-rw-r--r-- | lib/unigbrk/u16-grapheme-prev.c | 43 | ||||
-rw-r--r-- | lib/unigbrk/u32-grapheme-breaks.c | 2 | ||||
-rw-r--r-- | lib/unigbrk/u32-grapheme-next.c | 35 | ||||
-rw-r--r-- | lib/unigbrk/u32-grapheme-prev.c | 40 | ||||
-rw-r--r-- | lib/unigbrk/u8-grapheme-breaks.c | 2 | ||||
-rw-r--r-- | lib/unigbrk/u8-grapheme-next.c | 35 | ||||
-rw-r--r-- | lib/unigbrk/u8-grapheme-prev.c | 43 | ||||
-rw-r--r-- | lib/unigbrk/uc-gbrk-prop.c | 2 | ||||
-rw-r--r-- | lib/unigbrk/uc-grapheme-breaks.c | 2 | ||||
-rw-r--r-- | lib/unigbrk/uc-is-grapheme-break.c | 2 | ||||
-rw-r--r-- | lib/unigbrk/ulc-grapheme-breaks.c | 2 |
17 files changed, 566 insertions, 206 deletions
diff --git a/lib/unigbrk/gbrkprop.h b/lib/unigbrk/gbrkprop.h index 7c098bfd..06b2f5d6 100644 --- a/lib/unigbrk/gbrkprop.h +++ b/lib/unigbrk/gbrkprop.h @@ -1,8 +1,8 @@ /* DO NOT EDIT! GENERATED AUTOMATICALLY! */ /* Grapheme break property of Unicode characters. */ -/* Generated automatically by gen-uni-tables.c for Unicode 16.0.0. */ +/* Generated automatically by gen-uni-tables.c for Unicode 17.0.0. */ -/* Copyright (C) 2000-2024 Free Software Foundation, Inc. +/* Copyright (C) 2000-2025 Free Software Foundation, Inc. This file is free software. It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". @@ -35,7 +35,7 @@ struct { int level1[15]; short level2[3 << 9]; - unsigned char level3[131 << 7]; + unsigned char level3[133 << 7]; } unigbrkprop = { @@ -114,14 +114,14 @@ unigbrkprop = -1, -1, 9600, -1, -1, 9728, 9856, 9984, 10112, 10240, 10368, 10496, 10624, 10752, 10880, 11008, 11136, 11264, -1, 11392, 11520, 11648, 11776, -1, - 11904, -1, 12032, 12160, 12288, 12416, -1, -1, - 12544, 12672, 12800, 12928, -1, 13056, 13184, -1, + 11904, -1, 12032, 12160, 12288, 12416, 12544, -1, + 12672, 12800, 12928, 13056, -1, 13184, 13312, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 13312, -1, -1, -1, -1, -1, -1, -1, + 13440, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, @@ -132,10 +132,10 @@ unigbrkprop = -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, 13440, -1, -1, -1, -1, -1, + -1, -1, 13568, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, 13568, 13696, -1, - -1, -1, 13824, -1, -1, -1, 13952, 14080, + -1, -1, -1, -1, -1, 13696, 13824, -1, + -1, -1, 13952, -1, -1, -1, 14080, 14208, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, @@ -155,27 +155,27 @@ unigbrkprop = -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, 14208, -1, -1, -1, -1, -1, -1, + -1, 14336, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, 14336, -1, - -1, -1, 14464, 14592, 14720, -1, -1, -1, + -1, -1, -1, -1, -1, -1, 14464, -1, + -1, -1, 14592, 14720, 14848, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, 14848, 14976, -1, -1, + -1, -1, -1, -1, 14976, 15104, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 15104, 15232, 13696, -1, -1, 15360, -1, -1, - -1, 15488, -1, 15616, -1, -1, -1, -1, - -1, 15744, 15872, -1, -1, -1, -1, -1, + 15232, 15360, 13824, -1, -1, 15488, -1, -1, + -1, 15616, -1, 15744, -1, 15872, -1, -1, + -1, 16000, 16128, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, 16000, -1, -1, -1, 16128, + -1, -1, -1, 16256, -1, -1, -1, 16384, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 16256, 16384, 16512, 16640, 16384, 16384, 16384, 16384, - 16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384, - 16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384, - 16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384, + 16512, 16640, 16768, 16896, 16640, 16640, 16640, 16640, + 16640, 16640, 16640, 16640, 16640, 16640, 16640, 16640, + 16640, 16640, 16640, 16640, 16640, 16640, 16640, 16640, + 16640, 16640, 16640, 16640, 16640, 16640, 16640, 16640, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, @@ -1409,14 +1409,14 @@ unigbrkprop = GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, - GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_OTHER, - GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, - GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, - GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, - GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, - GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, - GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, - GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, + GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, + GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, + GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, + GBP_EXTEND, GBP_EXTEND, GBP_OTHER, GBP_OTHER, + GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, + GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, + GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, @@ -2700,7 +2700,7 @@ unigbrkprop = GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, - GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, @@ -3324,7 +3324,7 @@ unigbrkprop = GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, - GBP_EXTEND, GBP_SPACINGMARK, GBP_PREPEND, GBP_EXTEND, + GBP_EXTEND, GBP_SPACINGMARK, GBP_OTHER, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_EXTEND, @@ -3385,6 +3385,38 @@ unigbrkprop = GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_EXTEND, GBP_SPACINGMARK, GBP_EXTEND, GBP_EXTEND, + GBP_EXTEND, GBP_SPACINGMARK, GBP_EXTEND, GBP_SPACINGMARK, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_SPACINGMARK, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_OTHER, @@ -4194,6 +4226,38 @@ unigbrkprop = GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_EXTEND, + GBP_OTHER, GBP_OTHER, GBP_EXTEND, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_EXTEND, GBP_EXTEND, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_EXTEND, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, + GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_EXTEND, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, GBP_OTHER, diff --git a/lib/unigbrk/u-grapheme-breaks.h b/lib/unigbrk/u-grapheme-breaks.h index 3e8f4953..30d5853a 100644 --- a/lib/unigbrk/u-grapheme-breaks.h +++ b/lib/unigbrk/u-grapheme-breaks.h @@ -1,6 +1,5 @@ /* Grapheme cluster break function. - Copyright (C) 2010-2024 Free Software Foundation, Inc. - Written by Ben Pfaff <blp@cs.stanford.edu>, 2010. + Copyright (C) 2010-2025 Free Software Foundation, Inc. This file is free software. It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". @@ -23,6 +22,8 @@ License and of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. */ +/* Written by Ben Pfaff, Daiki Ueno, Bruno Haible. */ + /* This file implements section 3 "Grapheme Cluster Boundaries" of Unicode Standard Annex #29 <https://www.unicode.org/reports/tr29/>. */ @@ -61,8 +62,9 @@ FUNC (const UNIT *s, size_t n, char *p) /* Don't break inside multibyte characters. */ memset (p, 0, n); - while (s < s_end) + do { + /* Invariant: Here s < s_end. */ ucs4_t uc; int count = U_MBTOUC (&uc, s, s_end - s); int prop = uc_graphemeclusterbreak_property (uc); @@ -157,5 +159,6 @@ FUNC (const UNIT *s, size_t n, char *p) s += count; p += count; } + while (s < s_end); } } diff --git a/lib/unigbrk/u-grapheme-next.h b/lib/unigbrk/u-grapheme-next.h new file mode 100644 index 00000000..9ca07436 --- /dev/null +++ b/lib/unigbrk/u-grapheme-next.h @@ -0,0 +1,159 @@ +/* Grapheme cluster break function. + Copyright (C) 2010-2025 Free Software Foundation, Inc. + + This file is free software. + It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". + You can redistribute it and/or modify it under either + - the terms of the GNU Lesser General Public License as published + by the Free Software Foundation, either version 3, or (at your + option) any later version, or + - the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) + any later version, or + - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+". + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License and the GNU General Public License + for more details. + + You should have received a copy of the GNU Lesser General Public + License and of the GNU General Public License along with this + program. If not, see <https://www.gnu.org/licenses/>. */ + +/* Written by Ben Pfaff, Daiki Ueno, Bruno Haible. */ + +/* This file implements section 3 "Grapheme Cluster Boundaries" + of Unicode Standard Annex #29 <https://www.unicode.org/reports/tr29/>. */ + +const UNIT * +FUNC (const UNIT *s, const UNIT *s_end) +{ + if (s == s_end) + return NULL; + + /* Grapheme Cluster break property of the last character. + -1 at the very beginning of the string. */ + int last_char_prop = -1; + + /* True if the last character ends a sequence of Indic_Conjunct_Break + values: consonant {extend|linker}* */ + bool incb_consonant_extended = false; + /* True if the last character ends a sequence of Indic_Conjunct_Break + values: consonant {extend|linker}* linker */ + bool incb_consonant_extended_linker = false; + /* True if the last character ends a sequence of Indic_Conjunct_Break + values: consonant {extend|linker}* linker {extend|linker}* */ + bool incb_consonant_extended_linker_extended = false; + + /* True if the last character ends an emoji modifier sequence + \p{Extended_Pictographic} Extend*. */ + bool emoji_modifier_sequence = false; + /* True if the last character was immediately preceded by an + emoji modifier sequence \p{Extended_Pictographic} Extend*. */ + bool emoji_modifier_sequence_before_last_char = false; + + /* Number of consecutive regional indicator (RI) characters seen + immediately before the current point. */ + size_t ri_count = 0; + + do + { + ucs4_t uc; + int count = U_MBTOUC (&uc, s, s_end - s); + int prop = uc_graphemeclusterbreak_property (uc); + int incb = uc_indic_conjunct_break (uc); + + /* Break at the start of the string (GB1). */ + if (last_char_prop < 0) + /* *p = 1 */; + else + { + /* No break between CR and LF (GB3). */ + if (last_char_prop == GBP_CR && prop == GBP_LF) + /* *p = 0 */; + /* Break before and after newlines (GB4, GB5). */ + else if ((last_char_prop == GBP_CR + || last_char_prop == GBP_LF + || last_char_prop == GBP_CONTROL) + || (prop == GBP_CR + || prop == GBP_LF + || prop == GBP_CONTROL)) + break /* *p = 1 */; + /* No break between Hangul syllable sequences (GB6, GB7, GB8). */ + else if ((last_char_prop == GBP_L + && (prop == GBP_L + || prop == GBP_V + || prop == GBP_LV + || prop == GBP_LVT)) + || ((last_char_prop == GBP_LV + || last_char_prop == GBP_V) + && (prop == GBP_V + || prop == GBP_T)) + || ((last_char_prop == GBP_LVT + || last_char_prop == GBP_T) + && prop == GBP_T)) + /* *p = 0 */; + /* No break before extending characters or ZWJ (GB9). */ + else if (prop == GBP_EXTEND || prop == GBP_ZWJ) + /* *p = 0 */; + /* No break before SpacingMarks (GB9a). */ + else if (prop == GBP_SPACINGMARK) + /* *p = 0 */; + /* No break after Prepend characters (GB9b). */ + else if (last_char_prop == GBP_PREPEND) + /* *p = 0 */; + /* No break within certain combinations of Indic_Conjunct_Break + values: Between + consonant {extend|linker}* linker {extend|linker}* + and + consonant + (GB9c). */ + else if (incb_consonant_extended_linker_extended + && incb == UC_INDIC_CONJUNCT_BREAK_CONSONANT) + /* *p = 0 */; + /* No break within emoji modifier sequences or emoji zwj sequences + (GB11). */ + else if (last_char_prop == GBP_ZWJ + && emoji_modifier_sequence_before_last_char + && uc_is_property_extended_pictographic (uc)) + /* *p = 0 */; + /* No break between RI if there is an odd number of RI + characters before (GB12, GB13). */ + else if (prop == GBP_RI && (ri_count % 2) != 0) + /* *p = 0 */; + /* Break everywhere (GB999). */ + else + break /* *p = 1 */; + } + + incb_consonant_extended_linker = + incb_consonant_extended && incb == UC_INDIC_CONJUNCT_BREAK_LINKER; + incb_consonant_extended_linker_extended = + (incb_consonant_extended_linker + || (incb_consonant_extended_linker_extended + && incb >= UC_INDIC_CONJUNCT_BREAK_LINKER)); + incb_consonant_extended = + (incb == UC_INDIC_CONJUNCT_BREAK_CONSONANT + || (incb_consonant_extended + && incb >= UC_INDIC_CONJUNCT_BREAK_LINKER)); + + emoji_modifier_sequence_before_last_char = emoji_modifier_sequence; + emoji_modifier_sequence = + (emoji_modifier_sequence && prop == GBP_EXTEND) + || uc_is_property_extended_pictographic (uc); + + last_char_prop = prop; + + if (prop == GBP_RI) + ri_count++; + else + ri_count = 0; + + s += count; + } + while (s < s_end); + + return s; +} diff --git a/lib/unigbrk/u-grapheme-prev.h b/lib/unigbrk/u-grapheme-prev.h new file mode 100644 index 00000000..0894d599 --- /dev/null +++ b/lib/unigbrk/u-grapheme-prev.h @@ -0,0 +1,233 @@ +/* Grapheme cluster break function. + Copyright (C) 2010-2025 Free Software Foundation, Inc. + + This file is free software. + It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". + You can redistribute it and/or modify it under either + - the terms of the GNU Lesser General Public License as published + by the Free Software Foundation, either version 3, or (at your + option) any later version, or + - the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) + any later version, or + - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+". + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License and the GNU General Public License + for more details. + + You should have received a copy of the GNU Lesser General Public + License and of the GNU General Public License along with this + program. If not, see <https://www.gnu.org/licenses/>. */ + +/* Written by Bruno Haible <bruno@clisp.org>, 2025. */ + +/* This file implements section 3 "Grapheme Cluster Boundaries" + of Unicode Standard Annex #29 <https://www.unicode.org/reports/tr29/> + backwards. */ + +/* Returns true if the string [s_start, s) ends with a sequence of + Indic_Conjunct_Break values like: + consonant {extend|linker}* linker {extend|linker}* + */ +static bool +ends_with_incb_consonant_extended_linker_extended (const UNIT *s, + const UNIT *s_start) +{ + /* Look for + consonant {extend|linker}* + with at least one linker. */ + bool seen_linker = false; + + while (s > s_start) + { + const UNIT *prev_s; + ucs4_t uc; + + prev_s = U_PREV (&uc, s, s_start); + if (prev_s == NULL) + /* Ill-formed UTF-8 encoding. */ + break; + + int incb = uc_indic_conjunct_break (uc); + if (incb == UC_INDIC_CONJUNCT_BREAK_CONSONANT) + return seen_linker; + if (!(incb >= UC_INDIC_CONJUNCT_BREAK_LINKER)) + break; + seen_linker |= (incb == UC_INDIC_CONJUNCT_BREAK_LINKER); + + s = prev_s; + } + + return false; +} + +/* Returns true if the string [s_start, s) ends with a sequence of + characters like: + \p{Extended_Pictographic} Extend* + */ +static bool +ends_with_emoji_modifier_sequence (const UNIT *s, const UNIT *s_start) +{ + while (s > s_start) + { + const UNIT *prev_s; + ucs4_t uc; + + prev_s = U_PREV (&uc, s, s_start); + if (prev_s == NULL) + /* Ill-formed UTF-8 encoding. */ + break; + + if (uc_is_property_extended_pictographic (uc)) + return true; + + if (uc_graphemeclusterbreak_property (uc) != GBP_EXTEND) + break; + + s = prev_s; + } + + return false; +} + +/* Returns the number of consecutive regional indicator (RI) characters + at the end of the string [s_start, s). */ +static size_t +ends_with_ri_count (const UNIT *s, const UNIT *s_start) +{ + size_t ri_count = 0; + + while (s > s_start) + { + const UNIT *prev_s; + ucs4_t uc; + + prev_s = U_PREV (&uc, s, s_start); + if (prev_s == NULL) + /* Ill-formed UTF-8 encoding. */ + break; + + if (uc_graphemeclusterbreak_property (uc) == GBP_RI) + ri_count++; + else + break; + + s = prev_s; + } + + return ri_count; +} + +const UNIT * +FUNC (const UNIT *s, const UNIT *s_start) +{ + if (s == s_start) + return NULL; + + /* Traverse the string backwards, from s down to s_start. */ + + /* Grapheme Cluster break property of the next character. + -1 at the very end of the string. */ + int next_char_prop = -1; + + /* Indic_Conjunct_Break property of the next character. + -1 at the very end of the string. */ + int next_char_incb = -1; + + /* Extended_Pictographic property of the next character. + false at the very end of the string. */ + bool next_char_epic = false; + + do + { + const UNIT *prev_s; + ucs4_t uc; + + prev_s = U_PREV (&uc, s, s_start); + if (prev_s == NULL) + { + /* Ill-formed UTF-8 encoding. */ + return s_start; + } + + int prop = uc_graphemeclusterbreak_property (uc); + int incb = uc_indic_conjunct_break (uc); + bool epic = uc_is_property_extended_pictographic (uc); + + /* Break at the end of the string (GB2). */ + if (next_char_prop < 0) + /* *p = 1 */; + else + { + /* No break between CR and LF (GB3). */ + if (prop == GBP_CR && next_char_prop == GBP_LF) + /* *p = 0 */; + /* Break before and after newlines (GB4, GB5). */ + else if ((prop == GBP_CR + || prop == GBP_LF + || prop == GBP_CONTROL) + || (next_char_prop == GBP_CR + || next_char_prop == GBP_LF + || next_char_prop == GBP_CONTROL)) + break /* *p = 1 */; + /* No break between Hangul syllable sequences (GB6, GB7, GB8). */ + else if ((prop == GBP_L + && (next_char_prop == GBP_L + || next_char_prop == GBP_V + || next_char_prop == GBP_LV + || next_char_prop == GBP_LVT)) + || ((prop == GBP_LV + || prop == GBP_V) + && (next_char_prop == GBP_V + || next_char_prop == GBP_T)) + || ((prop == GBP_LVT + || prop == GBP_T) + && next_char_prop == GBP_T)) + /* *p = 0 */; + /* No break before extending characters or ZWJ (GB9). */ + else if (next_char_prop == GBP_EXTEND || next_char_prop == GBP_ZWJ) + /* *p = 0 */; + /* No break before SpacingMarks (GB9a). */ + else if (next_char_prop == GBP_SPACINGMARK) + /* *p = 0 */; + /* No break after Prepend characters (GB9b). */ + else if (prop == GBP_PREPEND) + /* *p = 0 */; + /* No break within certain combinations of Indic_Conjunct_Break + values: Between + consonant {extend|linker}* linker {extend|linker}* + and + consonant + (GB9c). */ + else if (next_char_incb == UC_INDIC_CONJUNCT_BREAK_CONSONANT + && ends_with_incb_consonant_extended_linker_extended (s, s_start)) + /* *p = 0 */; + /* No break within emoji modifier sequences or emoji zwj sequences + (GB11). */ + else if (next_char_epic + && prop == GBP_ZWJ + && ends_with_emoji_modifier_sequence (prev_s, s_start)) + /* *p = 0 */; + /* No break between RI if there is an odd number of RI + characters before (GB12, GB13). */ + else if (next_char_prop == GBP_RI + && prop == GBP_RI + && (ends_with_ri_count (prev_s, s_start) % 2) == 0) + /* *p = 0 */; + /* Break everywhere (GB999). */ + else + break /* *p = 1 */; + } + + s = prev_s; + next_char_prop = prop; + next_char_incb = incb; + next_char_epic = epic; + } + while (s > s_start); + + return s; +} diff --git a/lib/unigbrk/u16-grapheme-breaks.c b/lib/unigbrk/u16-grapheme-breaks.c index 0ae5b4aa..bae4bcb2 100644 --- a/lib/unigbrk/u16-grapheme-breaks.c +++ b/lib/unigbrk/u16-grapheme-breaks.c @@ -1,5 +1,5 @@ /* Grapheme cluster breaks function. - Copyright (C) 2010-2024 Free Software Foundation, Inc. + Copyright (C) 2010-2025 Free Software Foundation, Inc. Written by Ben Pfaff <blp@cs.stanford.edu>, 2010. This file is free software. diff --git a/lib/unigbrk/u16-grapheme-next.c b/lib/unigbrk/u16-grapheme-next.c index 1df56740..5e7a783d 100644 --- a/lib/unigbrk/u16-grapheme-next.c +++ b/lib/unigbrk/u16-grapheme-next.c @@ -1,6 +1,5 @@ /* Next grapheme cluster function. - Copyright (C) 2010-2024 Free Software Foundation, Inc. - Written by Ben Pfaff <blp@cs.stanford.edu>, 2010. + Copyright (C) 2010-2025 Free Software Foundation, Inc. This file is free software. It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". @@ -23,32 +22,20 @@ License and of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. */ +/* Written by Bruno Haible <bruno@clisp.org>, 2025. */ + +/* Don't use the const-improved function macros in this compilation unit. */ +#define _LIBUNISTRING_NO_CONST_GENERICS + #include <config.h> /* Specification. */ #include "unigbrk.h" +#include "unictype.h" #include "unistr.h" -const uint16_t * -u16_grapheme_next (const uint16_t *s, const uint16_t *end) -{ - ucs4_t prev; - int mblen; - - if (s == end) - return NULL; - - for (s += u16_mbtouc (&prev, s, end - s); s != end; s += mblen) - { - ucs4_t next; - - mblen = u16_mbtouc (&next, s, end - s); - if (uc_is_grapheme_break (prev, next)) - break; - - prev = next; - } - - return s; -} +#define FUNC u16_grapheme_next +#define UNIT uint16_t +#define U_MBTOUC u16_mbtouc +#include "u-grapheme-next.h" diff --git a/lib/unigbrk/u16-grapheme-prev.c b/lib/unigbrk/u16-grapheme-prev.c index 804d7bc1..4c70e118 100644 --- a/lib/unigbrk/u16-grapheme-prev.c +++ b/lib/unigbrk/u16-grapheme-prev.c @@ -1,6 +1,5 @@ /* Previous grapheme cluster function. - Copyright (C) 2010-2024 Free Software Foundation, Inc. - Written by Ben Pfaff <blp@cs.stanford.edu>, 2010. + Copyright (C) 2010-2025 Free Software Foundation, Inc. This file is free software. It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". @@ -23,40 +22,20 @@ License and of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. */ +/* Written by Bruno Haible <bruno@clisp.org>, 2025. */ + +/* Don't use the const-improved function macros in this compilation unit. */ +#define _LIBUNISTRING_NO_CONST_GENERICS + #include <config.h> /* Specification. */ #include "unigbrk.h" +#include "unictype.h" #include "unistr.h" -const uint16_t * -u16_grapheme_prev (const uint16_t *s, const uint16_t *start) -{ - ucs4_t next; - - if (s == start) - return NULL; - - s = u16_prev (&next, s, start); - while (s != start) - { - const uint16_t *prev_s; - ucs4_t prev; - - prev_s = u16_prev (&prev, s, start); - if (prev_s == NULL) - { - /* Ill-formed UTF-16 encoding. */ - return start; - } - - if (uc_is_grapheme_break (prev, next)) - break; - - s = prev_s; - next = prev; - } - - return s; -} +#define FUNC u16_grapheme_prev +#define UNIT uint16_t +#define U_PREV u16_prev +#include "u-grapheme-prev.h" diff --git a/lib/unigbrk/u32-grapheme-breaks.c b/lib/unigbrk/u32-grapheme-breaks.c index c4872c27..66c64445 100644 --- a/lib/unigbrk/u32-grapheme-breaks.c +++ b/lib/unigbrk/u32-grapheme-breaks.c @@ -1,5 +1,5 @@ /* Grapheme cluster breaks function. - Copyright (C) 2010-2024 Free Software Foundation, Inc. + Copyright (C) 2010-2025 Free Software Foundation, Inc. Written by Ben Pfaff <blp@cs.stanford.edu>, 2010. This file is free software. diff --git a/lib/unigbrk/u32-grapheme-next.c b/lib/unigbrk/u32-grapheme-next.c index 198857d9..1c9adfa6 100644 --- a/lib/unigbrk/u32-grapheme-next.c +++ b/lib/unigbrk/u32-grapheme-next.c @@ -1,6 +1,5 @@ /* Next grapheme cluster function. - Copyright (C) 2010-2024 Free Software Foundation, Inc. - Written by Ben Pfaff <blp@cs.stanford.edu>, 2010. + Copyright (C) 2010-2025 Free Software Foundation, Inc. This file is free software. It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". @@ -23,32 +22,20 @@ License and of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. */ +/* Written by Bruno Haible <bruno@clisp.org>, 2025. */ + +/* Don't use the const-improved function macros in this compilation unit. */ +#define _LIBUNISTRING_NO_CONST_GENERICS + #include <config.h> /* Specification. */ #include "unigbrk.h" +#include "unictype.h" #include "unistr.h" -const uint32_t * -u32_grapheme_next (const uint32_t *s, const uint32_t *end) -{ - ucs4_t prev; - - if (s == end) - return NULL; - - u32_mbtouc (&prev, s, end - s); - for (s++; s != end; s++) - { - ucs4_t next; - - u32_mbtouc (&next, s, end - s); - if (uc_is_grapheme_break (prev, next)) - break; - - prev = next; - } - - return s; -} +#define FUNC u32_grapheme_next +#define UNIT uint32_t +#define U_MBTOUC u32_mbtouc +#include "u-grapheme-next.h" diff --git a/lib/unigbrk/u32-grapheme-prev.c b/lib/unigbrk/u32-grapheme-prev.c index a6b6983a..977a1977 100644 --- a/lib/unigbrk/u32-grapheme-prev.c +++ b/lib/unigbrk/u32-grapheme-prev.c @@ -1,6 +1,5 @@ /* Previous grapheme cluster function. - Copyright (C) 2010-2024 Free Software Foundation, Inc. - Written by Ben Pfaff <blp@cs.stanford.edu>, 2010. + Copyright (C) 2010-2025 Free Software Foundation, Inc. This file is free software. It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". @@ -23,37 +22,20 @@ License and of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. */ +/* Written by Bruno Haible <bruno@clisp.org>, 2025. */ + +/* Don't use the const-improved function macros in this compilation unit. */ +#define _LIBUNISTRING_NO_CONST_GENERICS + #include <config.h> /* Specification. */ #include "unigbrk.h" +#include "unictype.h" #include "unistr.h" -const uint32_t * -u32_grapheme_prev (const uint32_t *s, const uint32_t *start) -{ - ucs4_t next; - - if (s == start) - return NULL; - - u32_prev (&next, s, start); - for (s--; s != start; s--) - { - ucs4_t prev; - - if (u32_prev (&prev, s, start) == NULL) - { - /* Ill-formed UTF-32 encoding. */ - return start; - } - - if (uc_is_grapheme_break (prev, next)) - break; - - next = prev; - } - - return s; -} +#define FUNC u32_grapheme_prev +#define UNIT uint32_t +#define U_PREV u32_prev +#include "u-grapheme-prev.h" diff --git a/lib/unigbrk/u8-grapheme-breaks.c b/lib/unigbrk/u8-grapheme-breaks.c index 887c1864..04708695 100644 --- a/lib/unigbrk/u8-grapheme-breaks.c +++ b/lib/unigbrk/u8-grapheme-breaks.c @@ -1,5 +1,5 @@ /* Grapheme cluster breaks function. - Copyright (C) 2001-2003, 2006-2024 Free Software Foundation, Inc. + Copyright (C) 2001-2003, 2006-2025 Free Software Foundation, Inc. Written by Ben Pfaff <blp@cs.stanford.edu>, 2010, based on code written by Bruno Haible <bruno@clisp.org>, 2009. diff --git a/lib/unigbrk/u8-grapheme-next.c b/lib/unigbrk/u8-grapheme-next.c index 431c5ee7..2ec094da 100644 --- a/lib/unigbrk/u8-grapheme-next.c +++ b/lib/unigbrk/u8-grapheme-next.c @@ -1,6 +1,5 @@ /* Next grapheme cluster function. - Copyright (C) 2010-2024 Free Software Foundation, Inc. - Written by Ben Pfaff <blp@cs.stanford.edu>, 2010. + Copyright (C) 2010-2025 Free Software Foundation, Inc. This file is free software. It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". @@ -23,32 +22,20 @@ License and of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. */ +/* Written by Bruno Haible <bruno@clisp.org>, 2025. */ + +/* Don't use the const-improved function macros in this compilation unit. */ +#define _LIBUNISTRING_NO_CONST_GENERICS + #include <config.h> /* Specification. */ #include "unigbrk.h" +#include "unictype.h" #include "unistr.h" -const uint8_t * -u8_grapheme_next (const uint8_t *s, const uint8_t *end) -{ - ucs4_t prev; - int mblen; - - if (s == end) - return NULL; - - for (s += u8_mbtouc (&prev, s, end - s); s != end; s += mblen) - { - ucs4_t next; - - mblen = u8_mbtouc (&next, s, end - s); - if (uc_is_grapheme_break (prev, next)) - break; - - prev = next; - } - - return s; -} +#define FUNC u8_grapheme_next +#define UNIT uint8_t +#define U_MBTOUC u8_mbtouc +#include "u-grapheme-next.h" diff --git a/lib/unigbrk/u8-grapheme-prev.c b/lib/unigbrk/u8-grapheme-prev.c index 8a63f55f..a2d872f0 100644 --- a/lib/unigbrk/u8-grapheme-prev.c +++ b/lib/unigbrk/u8-grapheme-prev.c @@ -1,6 +1,5 @@ /* Previous grapheme cluster function. - Copyright (C) 2010-2024 Free Software Foundation, Inc. - Written by Ben Pfaff <blp@cs.stanford.edu>, 2010. + Copyright (C) 2010-2025 Free Software Foundation, Inc. This file is free software. It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". @@ -23,40 +22,20 @@ License and of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. */ +/* Written by Bruno Haible <bruno@clisp.org>, 2025. */ + +/* Don't use the const-improved function macros in this compilation unit. */ +#define _LIBUNISTRING_NO_CONST_GENERICS + #include <config.h> /* Specification. */ #include "unigbrk.h" +#include "unictype.h" #include "unistr.h" -const uint8_t * -u8_grapheme_prev (const uint8_t *s, const uint8_t *start) -{ - ucs4_t next; - - if (s == start) - return NULL; - - s = u8_prev (&next, s, start); - while (s != start) - { - const uint8_t *prev_s; - ucs4_t prev; - - prev_s = u8_prev (&prev, s, start); - if (prev_s == NULL) - { - /* Ill-formed UTF-8 encoding. */ - return start; - } - - if (uc_is_grapheme_break (prev, next)) - break; - - s = prev_s; - next = prev; - } - - return s; -} +#define FUNC u8_grapheme_prev +#define UNIT uint8_t +#define U_PREV u8_prev +#include "u-grapheme-prev.h" diff --git a/lib/unigbrk/uc-gbrk-prop.c b/lib/unigbrk/uc-gbrk-prop.c index 953fe006..8158fa0c 100644 --- a/lib/unigbrk/uc-gbrk-prop.c +++ b/lib/unigbrk/uc-gbrk-prop.c @@ -1,5 +1,5 @@ /* Grapheme cluster break property function. - Copyright (C) 2010-2024 Free Software Foundation, Inc. + Copyright (C) 2010-2025 Free Software Foundation, Inc. Written by Ben Pfaff <blp@cs.stanford.edu>, 2010. This file is free software. diff --git a/lib/unigbrk/uc-grapheme-breaks.c b/lib/unigbrk/uc-grapheme-breaks.c index fbebdac2..4924a97d 100644 --- a/lib/unigbrk/uc-grapheme-breaks.c +++ b/lib/unigbrk/uc-grapheme-breaks.c @@ -1,5 +1,5 @@ /* Grapheme cluster breaks function. - Copyright (C) 2010-2024 Free Software Foundation, Inc. + Copyright (C) 2010-2025 Free Software Foundation, Inc. Written by Ben Pfaff <blp@cs.stanford.edu>, 2010. This file is free software. diff --git a/lib/unigbrk/uc-is-grapheme-break.c b/lib/unigbrk/uc-is-grapheme-break.c index acf4bf3e..1451efa8 100644 --- a/lib/unigbrk/uc-is-grapheme-break.c +++ b/lib/unigbrk/uc-is-grapheme-break.c @@ -1,5 +1,5 @@ /* Grapheme cluster break function. - Copyright (C) 2010-2024 Free Software Foundation, Inc. + Copyright (C) 2010-2025 Free Software Foundation, Inc. Written by Ben Pfaff <blp@cs.stanford.edu>, 2010. This file is free software. diff --git a/lib/unigbrk/ulc-grapheme-breaks.c b/lib/unigbrk/ulc-grapheme-breaks.c index 7b32ad8d..779a59f0 100644 --- a/lib/unigbrk/ulc-grapheme-breaks.c +++ b/lib/unigbrk/ulc-grapheme-breaks.c @@ -1,5 +1,5 @@ /* Grapheme cluster breaks function. - Copyright (C) 2001-2003, 2006-2024 Free Software Foundation, Inc. + Copyright (C) 2001-2003, 2006-2025 Free Software Foundation, Inc. Written by Ben Pfaff <blp@cs.stanford.edu>, 2010, based on code written by Bruno Haible <bruno@clisp.org>, 2009. |