From 4682deeb62247d34de87f8e777f99e2d337fd377 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?=
Date: Sun, 20 Oct 2024 15:21:43 +0200
Subject: New upstream version 1.3
---
lib/unilbrk/u32-possible-linebreaks.c | 302 +++++++++++++++++++++++++---------
1 file changed, 226 insertions(+), 76 deletions(-)
(limited to 'lib/unilbrk/u32-possible-linebreaks.c')
diff --git a/lib/unilbrk/u32-possible-linebreaks.c b/lib/unilbrk/u32-possible-linebreaks.c
index eb28891d..f242c9a8 100644
--- a/lib/unilbrk/u32-possible-linebreaks.c
+++ b/lib/unilbrk/u32-possible-linebreaks.c
@@ -43,10 +43,49 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding,
{
if (n > 0)
{
- int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID1 : LBP_AL);
+ int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID1 : LBP_AL1);
const uint32_t *s_end = s + n;
- int prev_prop = LBP_BK; /* line break property of last character */
- int last_prop = LBP_BK; /* line break property of last non-space character */
+
+ /* We need 2 characters of lookahead:
+ - 1 character of lookahead for (LB15b,LB15c,LB19a,LB28a),
+ - 2 characters of lookahead for (LB25). */
+ const uint32_t *lookahead1_end;
+ ucs4_t lookahead1_uc;
+ int lookahead1_prop_ea;
+ const uint32_t *lookahead2_end;
+ ucs4_t lookahead2_uc;
+ int lookahead2_prop_ea;
+ /* Get the first lookahead character. */
+ lookahead1_end = s;
+ lookahead1_uc = *lookahead1_end++;
+ lookahead1_prop_ea = unilbrkprop_lookup (lookahead1_uc);
+ /* Get the second lookahead character. */
+ lookahead2_end = lookahead1_end;
+ if (lookahead2_end < s_end)
+ {
+ lookahead2_uc = *lookahead2_end++;
+ lookahead2_prop_ea = unilbrkprop_lookup (lookahead2_uc);
+ }
+ else
+ {
+ lookahead2_uc = 0xFFFD;
+ lookahead2_prop_ea = PROP_EA (LBP_BK, 0);
+ }
+
+ int preceding_prop = LBP_BK; /* line break property of preceding character */
+ int prev_prop = LBP_BK; /* line break property of previous character
+ (= last character, ignoring intervening characters of class CM or ZWJ) */
+ int prev_ea = 0; /* EastAsian property of previous character
+ (= last character, ignoring intervening characters of class CM or ZWJ) */
+ int prev2_ea = 0; /* EastAsian property of character before the previous character */
+ bool prev_initial_hyphen = false; /* the previous character was a
+ word-initial hyphen or U+2010 */
+ bool prev_nus = false; /* before the previous character, there was a character
+ with line break property LBP_NU and since then
+ only characters with line break property LBP_SY
+ or LBP_IS */
+ int last_prop = LBP_BK; /* line break property of last non-space character
+ (= last character, ignoring intervening characters of class SP or CM or ZWJ) */
char *seen_space = NULL; /* Was a space seen after the last non-space character? */
/* Number of consecutive regional indicator (RI) characters seen
@@ -55,9 +94,32 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding,
do
{
- ucs4_t uc = *s;
- s++;
- int prop = unilbrkprop_lookup (uc);
+ /* Read the next character. */
+ s = lookahead1_end;
+ ucs4_t uc = lookahead1_uc;
+ int prop_ea = lookahead1_prop_ea; /* = unilbrkprop_lookup (uc); */
+ int prop = PROP (prop_ea); /* line break property of uc */
+ int ea = EA (prop_ea); /* EastAsian property of uc */
+ /* Refill the pipeline of 2 lookahead characters. */
+ lookahead1_end = lookahead2_end;
+ lookahead1_uc = lookahead2_uc;
+ lookahead1_prop_ea = lookahead2_prop_ea;
+ if (lookahead2_end < s_end)
+ {
+ lookahead2_uc = *lookahead2_end++;
+ lookahead2_prop_ea = unilbrkprop_lookup (lookahead2_uc);
+ }
+ else
+ {
+ lookahead2_uc = 0xFFFD;
+ lookahead2_prop_ea = PROP_EA (LBP_BK, 0);
+ }
+
+ bool nus = /* ending at the previous character, there was a character
+ with line break property LBP_NU and since then only
+ characters with line break property LBP_SY or LBP_IS */
+ (prev_prop == LBP_NU
+ || (prev_nus && (prev_prop == LBP_SY || prev_prop == LBP_IS)));
if (prop == LBP_BK || prop == LBP_LF || prop == LBP_CR)
{
@@ -67,7 +129,6 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding,
a CR-LF sequence. */
if (prev_prop == cr && prop == LBP_LF)
p[-1] = UC_BREAK_CR_BEFORE_LF;
- prev_prop = prop;
last_prop = LBP_BK;
seen_space = NULL;
}
@@ -89,62 +150,7 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding,
Treat LBP_SA like LBP_XX. */
case LBP_XX:
/* This is arbitrary. */
- prop = LBP_AL;
- break;
- case LBP_QU2:
- /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the previous
- character's line break property was not one of
- BK, CR, LF, OP, QU, GL, SP, ZW. */
- switch (prev_prop)
- {
- case LBP_BK:
- case LBP_CR:
- case LBP_LF:
- case LBP_OP1: case LBP_OP2:
- case LBP_QU1: case LBP_QU2: case LBP_QU3:
- case LBP_GL:
- case LBP_SP:
- case LBP_ZW:
- break;
- default:
- prop = LBP_QU1;
- break;
- }
- break;
- case LBP_QU3:
- /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the next
- character's line break property is not one of
- BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW. */
- {
- int next_prop;
- if (s < s_end)
- {
- ucs4_t next_uc = *s;
- next_prop = unilbrkprop_lookup (next_uc);
- }
- else
- next_prop = LBP_BK;
- switch (next_prop)
- {
- case LBP_BK:
- case LBP_CR:
- case LBP_LF:
- case LBP_SP:
- case LBP_GL:
- case LBP_WJ:
- case LBP_CL:
- case LBP_QU1: case LBP_QU2: case LBP_QU3:
- case LBP_CP1: case LBP_CP2:
- case LBP_EX:
- case LBP_IS:
- case LBP_SY:
- case LBP_ZW:
- break;
- default:
- prop = LBP_QU1;
- break;
- }
- }
+ prop = LBP_AL1;
break;
}
@@ -172,10 +178,15 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding,
/* (LB4,LB5,LB6) Don't break at the beginning of a line. */
*p = UC_BREAK_PROHIBITED;
/* (LB10) Treat CM or ZWJ as AL. */
- last_prop = LBP_AL;
+ last_prop = LBP_AL1;
seen_space = NULL;
}
- else if (last_prop == LBP_ZW || seen_space != NULL)
+ else if (last_prop == LBP_ZW
+ || (seen_space != NULL
+ /* (LB14) has higher priority than (LB18). */
+ && !(last_prop == LBP_OP1 || last_prop == LBP_OP2)
+ /* (LB15a) has higher priority than (LB18). */
+ && !(last_prop == LBP_QU2)))
{
/* (LB8) Break after zero-width space. */
/* (LB18) Break after spaces.
@@ -184,7 +195,7 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding,
NBSP CM sequence is recommended instead of SP CM. */
*p = UC_BREAK_POSSIBLE;
/* (LB10) Treat CM or ZWJ as AL. */
- last_prop = LBP_AL;
+ last_prop = LBP_AL1;
seen_space = NULL;
}
else
@@ -209,11 +220,82 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding,
/* (LB8) Break after zero-width space. */
*p = UC_BREAK_POSSIBLE;
}
- else if (prev_prop == LBP_ZWJ)
+ else if (preceding_prop == LBP_ZWJ)
{
/* (LB8a) Don't break right after a zero-width joiner. */
*p = UC_BREAK_PROHIBITED;
}
+ else if (prop == LBP_IS && prev_prop == LBP_SP
+ && PROP (lookahead1_prop_ea) == LBP_NU)
+ {
+ /* (LB15c) Break before a decimal mark that follows a space. */
+ *p = UC_BREAK_POSSIBLE;
+ }
+ else if (((prop == LBP_QU1 || prop == LBP_QU2 || prop == LBP_QU3)
+ && (! prev_ea || ! EA (lookahead1_prop_ea))
+ /* (LB18) has higher priority than (LB19a). */
+ && prev_prop != LBP_SP)
+ || ((prev_prop == LBP_QU1 || prev_prop == LBP_QU2 || prev_prop == LBP_QU3)
+ && (! prev2_ea || ! ea)))
+ {
+ /* (LB19a) Don't break on either side of ambiguous
+ quotation marks, except next to an EastAsian character. */
+ *p = UC_BREAK_PROHIBITED;
+ }
+ else if (prev_initial_hyphen
+ && (prop == LBP_AL1 || prop == LBP_AL2))
+ {
+ /* (LB20a) Don't break after a word-initial hyphen. */
+ *p = UC_BREAK_PROHIBITED;
+ }
+ else if (prev_prop == LBP_HL_BA && prop != LBP_HL)
+ {
+ /* (LB21a) Don't break after Hebrew + Hyphen/Break-After,
+ before non-Hebrew. */
+ *p = UC_BREAK_PROHIBITED;
+ }
+ else if ((prev_nus
+ && (prev_prop == LBP_CL
+ || prev_prop == LBP_CP1 || prev_prop == LBP_CP2)
+ && (prop == LBP_PO || prop == LBP_PR))
+ || (nus && (prop == LBP_PO || prop == LBP_PR
+ || prop == LBP_NU)))
+ {
+ /* (LB25) Don't break numbers. */
+ *p = UC_BREAK_PROHIBITED;
+ }
+ else if ((prev_prop == LBP_PO || prev_prop == LBP_PR)
+ && (prop == LBP_OP1 || prop == LBP_OP2)
+ && (PROP (lookahead1_prop_ea) == LBP_NU
+ || (PROP (lookahead1_prop_ea) == LBP_IS
+ && PROP (lookahead2_prop_ea) == LBP_NU)))
+ {
+ /* (LB25) Don't break numbers. */
+ *p = UC_BREAK_PROHIBITED;
+ }
+ else if (prev_prop == LBP_AKLS_VI
+ && (prop == LBP_AK || prop == LBP_AL2))
+ {
+ /* (LB28a) Don't break inside orthographic syllables of
+ Brahmic scripts, line 3. */
+ *p = UC_BREAK_PROHIBITED;
+ }
+ else if (PROP (lookahead1_prop_ea) == LBP_VF
+ && (prop == LBP_AK || prop == LBP_AL2 || prop == LBP_AS)
+ && (prev_prop == LBP_AK || prev_prop == LBP_AL2 || prev_prop == LBP_AS))
+ {
+ /* (LB28a) Don't break inside orthographic syllables of
+ Brahmic scripts, line 4. */
+ *p = UC_BREAK_PROHIBITED;
+ }
+ else if (last_prop == LBP_IS && uc == 0x003C)
+ {
+ /* Partially disable (LB29) Do not break between numeric
+ punctuation and alphabetics ("e.g."). We find it
+ desirable to break before the HTML tag "</P>" in
+ strings like "<P>Some sentence.</P>". */
+ *p = UC_BREAK_POSSIBLE;
+ }
else if (last_prop == LBP_RI && prop == LBP_RI)
{
/* (LB30a) Break between two regional indicator symbols
@@ -223,14 +305,37 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding,
? UC_BREAK_POSSIBLE
: UC_BREAK_PROHIBITED);
}
- else if (prev_prop == LBP_HL_BA)
- {
- /* (LB21a) Don't break after Hebrew + Hyphen/Break-After. */
- *p = UC_BREAK_PROHIBITED;
- }
else
{
- switch (unilbrk_table [last_prop] [prop])
+ int this_prop = prop;
+ if (prop == LBP_QU3)
+ {
+ /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the
+ next character's line break property is not one of
+ BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW. */
+ switch (PROP (lookahead1_prop_ea))
+ {
+ case LBP_BK:
+ case LBP_CR:
+ case LBP_LF:
+ case LBP_SP:
+ case LBP_GL:
+ case LBP_WJ:
+ case LBP_CL:
+ case LBP_QU1: case LBP_QU2: case LBP_QU3:
+ case LBP_CP1: case LBP_CP2:
+ case LBP_EX:
+ case LBP_IS:
+ case LBP_SY:
+ case LBP_ZW:
+ break;
+ default:
+ this_prop = LBP_QU1;
+ break;
+ }
+ }
+
+ switch (unilbrk_table [last_prop] [this_prop])
{
case D:
*p = UC_BREAK_POSSIBLE;
@@ -245,15 +350,60 @@ u32_possible_linebreaks_loop (const uint32_t *s, size_t n, const char *encoding,
abort ();
}
}
+
+ if (prop == LBP_QU2)
+ {
+ /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the
+ previous character's line break property was not one of
+ BK, CR, LF, OP, QU, GL, SP, ZW. */
+ switch (prev_prop)
+ {
+ case LBP_BK:
+ case LBP_CR:
+ case LBP_LF:
+ case LBP_OP1: case LBP_OP2:
+ case LBP_QU1: case LBP_QU2: case LBP_QU3:
+ case LBP_GL:
+ case LBP_SP:
+ case LBP_ZW:
+ break;
+ default:
+ prop = LBP_QU1;
+ break;
+ }
+ }
+
last_prop = prop;
seen_space = NULL;
}
+ }
- prev_prop = (prev_prop == LBP_HL && (prop == LBP_HY || prop == LBP_BA)
- ? LBP_HL_BA
- : prop);
+ /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line
+ break class except BK, CR, LF, NL, SP, or ZW. */
+ if (!((prop == LBP_CM || prop == LBP_ZWJ)
+ && !(prev_prop == LBP_BK || prev_prop == LBP_LF || prev_prop == LBP_CR
+ || prev_prop == LBP_SP || prev_prop == LBP_ZW)))
+ {
+ prev_initial_hyphen =
+ (prop == LBP_HY || uc == 0x2010)
+ && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == LBP_LF
+ || prev_prop == LBP_SP || prev_prop == LBP_ZW
+ || prev_prop == LBP_CB || prev_prop == LBP_GL);
+ prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK
+ || prev_prop == LBP_AL2
+ || prev_prop == LBP_AS)
+ ? LBP_AKLS_VI :
+ prev_prop == LBP_HL && (prop == LBP_HY
+ || (prop == LBP_BA && !ea))
+ ? LBP_HL_BA :
+ prop);
+ prev2_ea = prev_ea;
+ prev_ea = ea;
+ prev_nus = nus;
}
+ preceding_prop = prop;
+
if (prop == LBP_RI)
ri_count++;
else
--
cgit v1.2.3