/* Grapheme cluster break function.
Copyright (C) 2010-2025 Free Software Foundation, Inc.
This file is free software.
It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
You can redistribute it and/or modify it under either
- the terms of the GNU Lesser General Public License as published
by the Free Software Foundation, either version 3, or (at your
option) any later version, or
- the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2, or (at your option)
any later version, or
- the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
This file is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License and the GNU General Public License
for more details.
You should have received a copy of the GNU Lesser General Public
License and of the GNU General Public License along with this
program. If not, see <https://www.gnu.org/licenses/>. */
/* Written by Bruno Haible <bruno@clisp.org>, 2025. */
/* This file implements section 3 "Grapheme Cluster Boundaries"
of Unicode Standard Annex #29 <https://www.unicode.org/reports/tr29/>,
traversing the string backwards. */
/* Tests whether the string [s_start, s) has a tail whose
   Indic_Conjunct_Break values form the pattern
     consonant {extend|linker}* linker {extend|linker}*
   The scan runs backwards from s.  The result is false when the scan
   reaches s_start without meeting a consonant, when U_PREV reports an
   ill-formed sequence, or when a character is met that is neither a
   consonant nor has a value >= UC_INDIC_CONJUNCT_BREAK_LINKER.  */
static bool
ends_with_incb_consonant_extended_linker_extended (const UNIT *s,
                                                   const UNIT *s_start)
{
  /* Becomes true once at least one linker has been stepped over.  */
  bool linker_found = false;
  const UNIT *cursor;
  const UNIT *before;

  for (cursor = s; cursor > s_start; cursor = before)
    {
      ucs4_t ch;
      int value;

      before = U_PREV (&ch, cursor, s_start);
      if (before == NULL)
        /* Ill-formed encoding.  */
        return false;

      value = uc_indic_conjunct_break (ch);

      /* The consonant closes the pattern; it only matches if a linker
         was seen between it and s.  */
      if (value == UC_INDIC_CONJUNCT_BREAK_CONSONANT)
        return linker_found;

      /* Anything below LINKER cannot be part of {extend|linker}*.  */
      if (value < UC_INDIC_CONJUNCT_BREAK_LINKER)
        return false;

      if (value == UC_INDIC_CONJUNCT_BREAK_LINKER)
        linker_found = true;
    }

  return false;
}
/* Tests whether the string [s_start, s) has a tail of the form
     \p{Extended_Pictographic} Extend*
   The scan runs backwards from s.  The result is false when s_start is
   reached first, when U_PREV reports an ill-formed sequence, or when a
   character is met that is neither Extended_Pictographic nor has the
   grapheme cluster break property GBP_EXTEND.  */
static bool
ends_with_emoji_modifier_sequence (const UNIT *s, const UNIT *s_start)
{
  const UNIT *cursor = s;

  while (cursor > s_start)
    {
      ucs4_t ch;
      const UNIT *before = U_PREV (&ch, cursor, s_start);

      if (before == NULL)
        /* Ill-formed encoding.  */
        return false;

      /* Found the pictographic base of the sequence.  */
      if (uc_is_property_extended_pictographic (ch))
        return true;

      /* Only Extend characters may sit between the base and s.  */
      if (uc_graphemeclusterbreak_property (ch) != GBP_EXTEND)
        return false;

      cursor = before;
    }

  return false;
}
/* Counts how many consecutive regional indicator (RI) characters
   terminate the string [s_start, s).
   Counting stops at the first character that is not an RI, at s_start,
   or when U_PREV reports an ill-formed sequence.  */
static size_t
ends_with_ri_count (const UNIT *s, const UNIT *s_start)
{
  size_t n = 0;
  const UNIT *cursor = s;

  while (cursor > s_start)
    {
      ucs4_t ch;
      const UNIT *before = U_PREV (&ch, cursor, s_start);

      /* The property lookup is skipped when decoding failed.  */
      if (before == NULL
          || uc_graphemeclusterbreak_property (ch) != GBP_RI)
        break;

      n++;
      cursor = before;
    }

  return n;
}
/* Scans the string [s_start, s) backwards from s and returns the position
   of the nearest grapheme cluster break before s, applying the rules
   GB2 ... GB999 of Unicode Standard Annex #29, section 3.
   Returns NULL if s == s_start.
   Returns s_start if U_PREV reports an ill-formed sequence, or if no
   break is found before s_start is reached.  */
const UNIT *
FUNC (const UNIT *s, const UNIT *s_start)
{
  if (s == s_start)
    return NULL;

  /* Properties of the character that starts at s, that is, the one
     visited in the previous iteration.  While s is still at the very end
     of the string, the two int values are -1 and the flag is false.  */
  int following_gbp = -1;     /* Grapheme Cluster break property */
  int following_incb = -1;    /* Indic_Conjunct_Break property */
  bool following_epic = false; /* Extended_Pictographic property */

  for (;;)
    {
      ucs4_t ch;
      const UNIT *before = U_PREV (&ch, s, s_start);

      if (before == NULL)
        /* Ill-formed encoding.  */
        return s_start;

      int gbp = uc_graphemeclusterbreak_property (ch);
      int incb = uc_indic_conjunct_break (ch);
      bool epic = uc_is_property_extended_pictographic (ch);

      /* At the very end of the string (GB2) there is nothing to decide:
         the last character is simply stepped over.  Otherwise decide
         whether ch and the character at s belong to the same cluster.  */
      if (following_gbp >= 0)
        {
          bool joined;

          if (gbp == GBP_CR && following_gbp == GBP_LF)
            /* GB3: no break between CR and LF.  */
            joined = true;
          else if (gbp == GBP_CR
                   || gbp == GBP_LF
                   || gbp == GBP_CONTROL
                   || following_gbp == GBP_CR
                   || following_gbp == GBP_LF
                   || following_gbp == GBP_CONTROL)
            /* GB4, GB5: break before and after newlines and controls.  */
            joined = false;
          else
            {
              /* GB6, GB7, GB8: Hangul syllable sequences.  */
              bool hangul_l =
                (gbp == GBP_L
                 && (following_gbp == GBP_L
                     || following_gbp == GBP_V
                     || following_gbp == GBP_LV
                     || following_gbp == GBP_LVT));
              bool hangul_v =
                ((gbp == GBP_LV || gbp == GBP_V)
                 && (following_gbp == GBP_V || following_gbp == GBP_T));
              bool hangul_t =
                ((gbp == GBP_LVT || gbp == GBP_T)
                 && following_gbp == GBP_T);

              /* The remaining "no break" rules, tested in rule order.
                 Short-circuit evaluation ensures that the backward
                 look-behind helpers run only when every earlier rule
                 failed to match.  */
              joined =
                hangul_l || hangul_v || hangul_t
                /* GB9: no break before extending characters or ZWJ.  */
                || following_gbp == GBP_EXTEND
                || following_gbp == GBP_ZWJ
                /* GB9a: no break before SpacingMarks.  */
                || following_gbp == GBP_SPACINGMARK
                /* GB9b: no break after Prepend characters.  */
                || gbp == GBP_PREPEND
                /* GB9c: no break between
                     consonant {extend|linker}* linker {extend|linker}*
                   and a consonant.  The left-hand side ends at s, so it
                   includes ch.  */
                || (following_incb == UC_INDIC_CONJUNCT_BREAK_CONSONANT
                    && ends_with_incb_consonant_extended_linker_extended
                         (s, s_start))
                /* GB11: no break within emoji modifier sequences or
                   emoji zwj sequences.  ch is the ZWJ; what precedes it
                   must be \p{Extended_Pictographic} Extend*.  */
                || (following_epic
                    && gbp == GBP_ZWJ
                    && ends_with_emoji_modifier_sequence (before, s_start))
                /* GB12, GB13: two RIs stay together when the number of
                   RI characters preceding the left one (ch) is even.  */
                || (following_gbp == GBP_RI
                    && gbp == GBP_RI
                    && (ends_with_ri_count (before, s_start) % 2) == 0);
            }

          if (!joined)
            /* GB999, or an explicit break rule: the break is at s.  */
            return s;
        }

      /* ch belongs to the cluster; move in front of it.  */
      s = before;
      if (!(s > s_start))
        return s;

      following_gbp = gbp;
      following_incb = incb;
      following_epic = epic;
    }
}