1 files changed, 159 insertions, 0 deletions
diff --git a/lib/unigbrk/u-grapheme-next.h b/lib/unigbrk/u-grapheme-next.h
new file mode 100644
index 00000000..9ca07436
--- /dev/null
+++ b/lib/unigbrk/u-grapheme-next.h
@@ -0,0 +1,159 @@
+/* Grapheme cluster break function.
+   Copyright (C) 2010-2025 Free Software Foundation, Inc.
+
+   This file is free software.
+   It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
+   You can redistribute it and/or modify it under either
+     - the terms of the GNU Lesser General Public License as published
+       by the Free Software Foundation, either version 3, or (at your
+       option) any later version, or
+     - the terms of the GNU General Public License as published by the
+       Free Software Foundation; either version 2, or (at your option)
+       any later version, or
+     - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
+
+   This file is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License and the GNU General Public License
+   for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License and of the GNU General Public License along with this
+   program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+/* Written by Ben Pfaff, Daiki Ueno, Bruno Haible.  */
+
+/* This file implements section 3 "Grapheme Cluster Boundaries"
+   of Unicode Standard Annex #29 <https://www.unicode.org/reports/tr29/>.  */
+
+const UNIT *
+FUNC (const UNIT *s, const UNIT *s_end)
+{
+  if (s == s_end)
+    return NULL; /* Empty range [s, s_end): no cluster to report.  */
+
+  /* Grapheme Cluster break property of the previously consumed character.
+     -1 as long as no character has been consumed yet.  */
+  int last_char_prop = -1;
+
+  /* True if the last character ends a sequence of Indic_Conjunct_Break
+     values:  consonant {extend|linker}*  */
+  bool incb_consonant_extended = false;
+  /* True if the last character ends a sequence of Indic_Conjunct_Break
+     values:  consonant {extend|linker}* linker  */
+  bool incb_consonant_extended_linker = false;
+  /* True if the last character ends a sequence of Indic_Conjunct_Break
+     values:  consonant {extend|linker}* linker {extend|linker}*  */
+  bool incb_consonant_extended_linker_extended = false;
+
+  /* True if the last character ends an emoji modifier sequence
+     \p{Extended_Pictographic} Extend*.  */
+  bool emoji_modifier_sequence = false;
+  /* True if the last character was immediately preceded by an
+     emoji modifier sequence   \p{Extended_Pictographic} Extend*.  */
+  bool emoji_modifier_sequence_before_last_char = false;
+
+  /* Number of consecutive regional indicator (RI) characters seen
+     immediately before the current point.  */
+  size_t ri_count = 0;
+
+  do
+    {
+      ucs4_t uc;
+      int count = U_MBTOUC (&uc, s, s_end - s);
+      int prop = uc_graphemeclusterbreak_property (uc);
+      int incb = uc_indic_conjunct_break (uc);
+
+      /* Start of the string (GB1): always consume the first character.  */
+      if (last_char_prop < 0)
+        /* accept uc as the start of the cluster */;
+      else
+        {
+          /* No break between CR and LF (GB3).  */
+          if (last_char_prop == GBP_CR && prop == GBP_LF)
+            /* no break: uc joins the cluster */;
+          /* Break before and after newlines (GB4, GB5).  */
+          else if ((last_char_prop == GBP_CR
+                    || last_char_prop == GBP_LF
+                    || last_char_prop == GBP_CONTROL)
+                   || (prop == GBP_CR
+                       || prop == GBP_LF
+                       || prop == GBP_CONTROL))
+            break /* boundary before uc: stop without consuming it */;
+          /* No break between Hangul syllable sequences (GB6, GB7, GB8).  */
+          else if ((last_char_prop == GBP_L
+                    && (prop == GBP_L
+                        || prop == GBP_V
+                        || prop == GBP_LV
+                        || prop == GBP_LVT))
+                   || ((last_char_prop == GBP_LV
+                        || last_char_prop == GBP_V)
+                       && (prop == GBP_V
+                           || prop == GBP_T))
+                   || ((last_char_prop == GBP_LVT
+                        || last_char_prop == GBP_T)
+                       && prop == GBP_T))
+            /* no break: uc joins the cluster */;
+          /* No break before extending characters or ZWJ (GB9).  */
+          else if (prop == GBP_EXTEND || prop == GBP_ZWJ)
+            /* no break: uc joins the cluster */;
+          /* No break before SpacingMarks (GB9a).  */
+          else if (prop == GBP_SPACINGMARK)
+            /* no break: uc joins the cluster */;
+          /* No break after Prepend characters (GB9b).  */
+          else if (last_char_prop == GBP_PREPEND)
+            /* no break: uc joins the cluster */;
+          /* No break within certain combinations of Indic_Conjunct_Break
+             values: Between
+               consonant {extend|linker}* linker {extend|linker}*
+             and
+               consonant
+             (GB9c).  */
+          else if (incb_consonant_extended_linker_extended
+                   && incb == UC_INDIC_CONJUNCT_BREAK_CONSONANT)
+            /* no break: uc joins the cluster */;
+          /* No break within emoji modifier sequences or emoji zwj sequences
+             (GB11).  */
+          else if (last_char_prop == GBP_ZWJ
+                   && emoji_modifier_sequence_before_last_char
+                   && uc_is_property_extended_pictographic (uc))
+            /* no break: uc joins the cluster */;
+          /* No break between RI if there is an odd number of RI
+             characters before (GB12, GB13).  */
+          else if (prop == GBP_RI && (ri_count % 2) != 0)
+            /* no break: uc joins the cluster */;
+          /* Break everywhere else (GB999).  */
+          else
+            break /* boundary before uc: stop without consuming it */;
+        }
+
+      incb_consonant_extended_linker =
+        incb_consonant_extended && incb == UC_INDIC_CONJUNCT_BREAK_LINKER;
+      incb_consonant_extended_linker_extended =
+        (incb_consonant_extended_linker
+         || (incb_consonant_extended_linker_extended
+             && incb >= UC_INDIC_CONJUNCT_BREAK_LINKER));
+      incb_consonant_extended = /* last: its old value is read above */
+        (incb == UC_INDIC_CONJUNCT_BREAK_CONSONANT
+         || (incb_consonant_extended
+             && incb >= UC_INDIC_CONJUNCT_BREAK_LINKER));
+
+      emoji_modifier_sequence_before_last_char = emoji_modifier_sequence;
+      emoji_modifier_sequence =
+        (emoji_modifier_sequence && prop == GBP_EXTEND)
+        || uc_is_property_extended_pictographic (uc);
+
+      last_char_prop = prop;
+
+      if (prop == GBP_RI)
+        ri_count++;
+      else
+        ri_count = 0;
+
+      s += count; /* Advance by the length U_MBTOUC returned for uc.  */
+    }
+  while (s < s_end);
+
+  return s; /* Just past the grapheme cluster that began at the initial s.  */
+}