diff options
Diffstat (limited to 'lib/unilbrk/u16-possible-linebreaks.c')
-rw-r--r-- | lib/unilbrk/u16-possible-linebreaks.c | 310 |
1 file changed, 230 insertions, 80 deletions
diff --git a/lib/unilbrk/u16-possible-linebreaks.c b/lib/unilbrk/u16-possible-linebreaks.c index 6a9c15b7..ac655f8c 100644 --- a/lib/unilbrk/u16-possible-linebreaks.c +++ b/lib/unilbrk/u16-possible-linebreaks.c @@ -45,25 +45,88 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding, { if (n > 0) { - int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID1 : LBP_AL); - const uint16_t *s_end = s + n; - int prev_prop = LBP_BK; /* line break property of last character */ - int last_prop = LBP_BK; /* line break property of last non-space character */ - char *seen_space = NULL; /* Was a space seen after the last non-space character? */ + int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID1 : LBP_AL1); /* Don't break inside multibyte characters. */ memset (p, UC_BREAK_PROHIBITED, n); + const uint16_t *s_end = s + n; + + /* We need 2 characters of lookahead: + - 1 character of lookahead for (LB15c,LB19a,LB28a), + - 2 characters of lookahead for (LB25). */ + const uint16_t *lookahead1_end; + ucs4_t lookahead1_uc; + int lookahead1_prop_ea; + const uint16_t *lookahead2_end; + ucs4_t lookahead2_uc; + int lookahead2_prop_ea; + /* Get the first lookahead character. */ + lookahead1_end = s; + lookahead1_end += u16_mbtouc_unsafe (&lookahead1_uc, lookahead1_end, s_end - lookahead1_end); + lookahead1_prop_ea = unilbrkprop_lookup (lookahead1_uc); + /* Get the second lookahead character. 
*/ + lookahead2_end = lookahead1_end; + if (lookahead2_end < s_end) + { + lookahead2_end += u16_mbtouc_unsafe (&lookahead2_uc, lookahead2_end, s_end - lookahead2_end); + lookahead2_prop_ea = unilbrkprop_lookup (lookahead2_uc); + } + else + { + lookahead2_uc = 0xFFFD; + lookahead2_prop_ea = PROP_EA (LBP_BK, 0); + } + + int preceding_prop = LBP_BK; /* line break property of preceding character */ + int prev_prop = LBP_BK; /* line break property of previous character + (= last character, ignoring intervening characters of class CM or ZWJ) */ + int prev_ea = 0; /* EastAsian property of previous character + (= last character, ignoring intervening characters of class CM or ZWJ) */ + int prev2_ea = 0; /* EastAsian property of character before the previous character */ + bool prev_initial_hyphen = false; /* the previous character was a + word-initial hyphen or U+2010 */ + bool prev_nus = false; /* before the previous character, there was a character + with line break property LBP_NU and since then + only characters with line break property LBP_SY + or LBP_IS */ + int last_prop = LBP_BK; /* line break property of last non-space character + (= last character, ignoring intervening characters of class SP or CM or ZWJ) */ + char *seen_space = NULL; /* Was a space seen after the last non-space character? */ + /* Number of consecutive regional indicator (RI) characters seen immediately before the current point. */ size_t ri_count = 0; do { - ucs4_t uc; - int count = u16_mbtouc_unsafe (&uc, s, s_end - s); - s += count; - int prop = unilbrkprop_lookup (uc); + /* Read the next character. */ + size_t count = lookahead1_end - s; + s = lookahead1_end; + ucs4_t uc = lookahead1_uc; + int prop_ea = lookahead1_prop_ea; /* = unilbrkprop_lookup (uc); */ + int prop = PROP (prop_ea); /* line break property of uc */ + int ea = EA (prop_ea); /* EastAsian property of uc */ + /* Refill the pipeline of 2 lookahead characters. 
*/ + lookahead1_end = lookahead2_end; + lookahead1_uc = lookahead2_uc; + lookahead1_prop_ea = lookahead2_prop_ea; + if (lookahead2_end < s_end) + { + lookahead2_end += u16_mbtouc_unsafe (&lookahead2_uc, lookahead2_end, s_end - lookahead2_end); + lookahead2_prop_ea = unilbrkprop_lookup (lookahead2_uc); + } + else + { + lookahead2_uc = 0xFFFD; + lookahead2_prop_ea = PROP_EA (LBP_BK, 0); + } + + bool nus = /* ending at the previous character, there was a character + with line break property LBP_NU and since then only + characters with line break property LBP_SY or LBP_IS */ + (prev_prop == LBP_NU + || (prev_nus && (prev_prop == LBP_SY || prev_prop == LBP_IS))); if (prop == LBP_BK || prop == LBP_LF || prop == LBP_CR) { @@ -73,7 +136,6 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding, a CR-LF sequence. */ if (prev_prop == cr && prop == LBP_LF) p[-1] = UC_BREAK_CR_BEFORE_LF; - prev_prop = prop; last_prop = LBP_BK; seen_space = NULL; } @@ -95,63 +157,7 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding, Treat LBP_SA like LBP_XX. */ case LBP_XX: /* This is arbitrary. */ - prop = LBP_AL; - break; - case LBP_QU2: - /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the previous - character's line break property was not one of - BK, CR, LF, OP, QU, GL, SP, ZW. */ - switch (prev_prop) - { - case LBP_BK: - case LBP_CR: - case LBP_LF: - case LBP_OP1: case LBP_OP2: - case LBP_QU1: case LBP_QU2: case LBP_QU3: - case LBP_GL: - case LBP_SP: - case LBP_ZW: - break; - default: - prop = LBP_QU1; - break; - } - break; - case LBP_QU3: - /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the next - character's line break property is not one of - BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW. 
*/ - { - int next_prop; - if (s < s_end) - { - ucs4_t next_uc; - (void) u16_mbtouc_unsafe (&next_uc, s, s_end - s); - next_prop = unilbrkprop_lookup (next_uc); - } - else - next_prop = LBP_BK; - switch (next_prop) - { - case LBP_BK: - case LBP_CR: - case LBP_LF: - case LBP_SP: - case LBP_GL: - case LBP_WJ: - case LBP_CL: - case LBP_QU1: case LBP_QU2: case LBP_QU3: - case LBP_CP1: case LBP_CP2: - case LBP_EX: - case LBP_IS: - case LBP_SY: - case LBP_ZW: - break; - default: - prop = LBP_QU1; - break; - } - } + prop = LBP_AL1; break; } @@ -179,10 +185,15 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding, /* (LB4,LB5,LB6) Don't break at the beginning of a line. */ *p = UC_BREAK_PROHIBITED; /* (LB10) Treat CM or ZWJ as AL. */ - last_prop = LBP_AL; + last_prop = LBP_AL1; seen_space = NULL; } - else if (last_prop == LBP_ZW || seen_space != NULL) + else if (last_prop == LBP_ZW + || (seen_space != NULL + /* (LB14) has higher priority than (LB18). */ + && !(last_prop == LBP_OP1 || last_prop == LBP_OP2) + /* (LB15a) has higher priority than (LB18). */ + && !(last_prop == LBP_QU2))) { /* (LB8) Break after zero-width space. */ /* (LB18) Break after spaces. @@ -191,7 +202,7 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding, NBSP CM sequence is recommended instead of SP CM. */ *p = UC_BREAK_POSSIBLE; /* (LB10) Treat CM or ZWJ as AL. */ - last_prop = LBP_AL; + last_prop = LBP_AL1; seen_space = NULL; } else @@ -216,11 +227,82 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding, /* (LB8) Break after zero-width space. */ *p = UC_BREAK_POSSIBLE; } - else if (prev_prop == LBP_ZWJ) + else if (preceding_prop == LBP_ZWJ) { /* (LB8a) Don't break right after a zero-width joiner. */ *p = UC_BREAK_PROHIBITED; } + else if (prop == LBP_IS && prev_prop == LBP_SP + && PROP (lookahead1_prop_ea) == LBP_NU) + { + /* (LB15c) Break before a decimal mark that follows a space. 
*/ + *p = UC_BREAK_POSSIBLE; + } + else if (((prop == LBP_QU1 || prop == LBP_QU2 || prop == LBP_QU3) + && (! prev_ea || ! EA (lookahead1_prop_ea)) + /* (LB18) has higher priority than (LB19a). */ + && prev_prop != LBP_SP) + || ((prev_prop == LBP_QU1 || prev_prop == LBP_QU2 || prev_prop == LBP_QU3) + && (! prev2_ea || ! ea))) + { + /* (LB19a) Don't break on either side of ambiguous + quotation marks, except next to an EastAsian character. */ + *p = UC_BREAK_PROHIBITED; + } + else if (prev_initial_hyphen + && (prop == LBP_AL1 || prop == LBP_AL2)) + { + /* (LB20a) Don't break after a word-initial hyphen. */ + *p = UC_BREAK_PROHIBITED; + } + else if (prev_prop == LBP_HL_BA && prop != LBP_HL) + { + /* (LB21a) Don't break after Hebrew + Hyphen/Break-After, + before non-Hebrew. */ + *p = UC_BREAK_PROHIBITED; + } + else if ((prev_nus + && (prev_prop == LBP_CL + || prev_prop == LBP_CP1 || prev_prop == LBP_CP2) + && (prop == LBP_PO || prop == LBP_PR)) + || (nus && (prop == LBP_PO || prop == LBP_PR + || prop == LBP_NU))) + { + /* (LB25) Don't break numbers. */ + *p = UC_BREAK_PROHIBITED; + } + else if ((prev_prop == LBP_PO || prev_prop == LBP_PR) + && (prop == LBP_OP1 || prop == LBP_OP2) + && (PROP (lookahead1_prop_ea) == LBP_NU + || (PROP (lookahead1_prop_ea) == LBP_IS + && PROP (lookahead2_prop_ea) == LBP_NU))) + { + /* (LB25) Don't break numbers. */ + *p = UC_BREAK_PROHIBITED; + } + else if (prev_prop == LBP_AKLS_VI + && (prop == LBP_AK || prop == LBP_AL2)) + { + /* (LB28a) Don't break inside orthographic syllables of + Brahmic scripts, line 3. */ + *p = UC_BREAK_PROHIBITED; + } + else if (PROP (lookahead1_prop_ea) == LBP_VF + && (prop == LBP_AK || prop == LBP_AL2 || prop == LBP_AS) + && (prev_prop == LBP_AK || prev_prop == LBP_AL2 || prev_prop == LBP_AS)) + { + /* (LB28a) Don't break inside orthographic syllables of + Brahmic scripts, line 4. 
*/ + *p = UC_BREAK_PROHIBITED; + } + else if (last_prop == LBP_IS && uc == 0x003C) + { + /* Partially disable (LB29) Do not break between numeric + punctuation and alphabetics ("e.g."). We find it + desirable to break before the HTML tag "</P>" in + strings like "<P>Some sentence.</P>". */ + *p = UC_BREAK_POSSIBLE; + } else if (last_prop == LBP_RI && prop == LBP_RI) { /* (LB30a) Break between two regional indicator symbols @@ -230,14 +312,37 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding, ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); } - else if (prev_prop == LBP_HL_BA) - { - /* (LB21a) Don't break after Hebrew + Hyphen/Break-After. */ - *p = UC_BREAK_PROHIBITED; - } else { - switch (unilbrk_table [last_prop] [prop]) + int this_prop = prop; + if (prop == LBP_QU3) + { + /* For (LB15b): Replace LBP_QU3 with LBP_QU1 if the + next character's line break property is not one of + BK, CR, LF, SP, GL, WJ, CL, QU, CP, EX, IS, SY, ZW. */ + switch (PROP (lookahead1_prop_ea)) + { + case LBP_BK: + case LBP_CR: + case LBP_LF: + case LBP_SP: + case LBP_GL: + case LBP_WJ: + case LBP_CL: + case LBP_QU1: case LBP_QU2: case LBP_QU3: + case LBP_CP1: case LBP_CP2: + case LBP_EX: + case LBP_IS: + case LBP_SY: + case LBP_ZW: + break; + default: + this_prop = LBP_QU1; + break; + } + } + + switch (unilbrk_table [last_prop] [this_prop]) { case D: *p = UC_BREAK_POSSIBLE; @@ -252,15 +357,60 @@ u16_possible_linebreaks_loop (const uint16_t *s, size_t n, const char *encoding, abort (); } } + + if (prop == LBP_QU2) + { + /* For (LB15a): Replace LBP_QU2 with LBP_QU1 if the + previous character's line break property was not one of + BK, CR, LF, OP, QU, GL, SP, ZW. 
*/ + switch (prev_prop) + { + case LBP_BK: + case LBP_CR: + case LBP_LF: + case LBP_OP1: case LBP_OP2: + case LBP_QU1: case LBP_QU2: case LBP_QU3: + case LBP_GL: + case LBP_SP: + case LBP_ZW: + break; + default: + prop = LBP_QU1; + break; + } + } + last_prop = prop; seen_space = NULL; } + } - prev_prop = (prev_prop == LBP_HL && (prop == LBP_HY || prop == LBP_BA) - ? LBP_HL_BA - : prop); + /* (LB9) Treat X (CM | ZWJ)* as if it were X, where X is any line + break class except BK, CR, LF, NL, SP, or ZW. */ + if (!((prop == LBP_CM || prop == LBP_ZWJ) + && !(prev_prop == LBP_BK || prev_prop == LBP_LF || prev_prop == LBP_CR + || prev_prop == LBP_SP || prev_prop == LBP_ZW))) + { + prev_initial_hyphen = + (prop == LBP_HY || uc == 0x2010) + && (prev_prop == LBP_BK || prev_prop == LBP_CR || prev_prop == LBP_LF + || prev_prop == LBP_SP || prev_prop == LBP_ZW + || prev_prop == LBP_CB || prev_prop == LBP_GL); + prev_prop = (prop == LBP_VI && (prev_prop == LBP_AK + || prev_prop == LBP_AL2 + || prev_prop == LBP_AS) + ? LBP_AKLS_VI : + prev_prop == LBP_HL && (prop == LBP_HY + || (prop == LBP_BA && !ea)) + ? LBP_HL_BA : + prop); + prev2_ea = prev_ea; + prev_ea = ea; + prev_nus = nus; } + preceding_prop = prop; + if (prop == LBP_RI) ri_count++; else |