diff options
Diffstat (limited to 'lib/uniwbrk/u-wordbreaks.h')
-rw-r--r-- | lib/uniwbrk/u-wordbreaks.h | 70 |
1 files changed, 38 insertions, 32 deletions
diff --git a/lib/uniwbrk/u-wordbreaks.h b/lib/uniwbrk/u-wordbreaks.h index 5b0fce72..47d1e83d 100644 --- a/lib/uniwbrk/u-wordbreaks.h +++ b/lib/uniwbrk/u-wordbreaks.h @@ -1,5 +1,5 @@ /* Word breaks in UTF-8/UTF-16/UTF-32 strings. -*- coding: utf-8 -*- - Copyright (C) 2009-2016 Free Software Foundation, Inc. + Copyright (C) 2009-2017 Free Software Foundation, Inc. Written by Bruno Haible <bruno@clisp.org>, 2009. This program is free software: you can redistribute it and/or @@ -22,7 +22,7 @@ Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. */ + along with this program. If not, see <https://www.gnu.org/licenses/>. */ void FUNC (const UNIT *s, size_t n, char *p) @@ -48,6 +48,8 @@ FUNC (const UNIT *s, size_t n, char *p) -1 at the very beginning of the string. */ int secondlast_compchar_prop = -1; + size_t ri_count = 0; + /* Don't break inside multibyte characters. */ memset (p, 0, n); @@ -60,10 +62,10 @@ FUNC (const UNIT *s, size_t n, char *p) /* No break at the start of the string. */ if (last_char_prop >= 0) { - /* No break between CR and LF. */ + /* No break between CR and LF (WB3). */ if (last_char_prop == WBP_CR && prop == WBP_LF) /* *p = 0 */; - /* Break before and after newlines. */ + /* Break before and after newlines (WB3a, WB3b). */ else if ((last_char_prop == WBP_CR || last_char_prop == WBP_LF || last_char_prop == WBP_NEWLINE) @@ -71,8 +73,12 @@ FUNC (const UNIT *s, size_t n, char *p) || prop == WBP_LF || prop == WBP_NEWLINE)) *p = 1; + /* No break within emoji zwj sequence (WB3c). */ + else if (last_char_prop == WBP_ZWJ && + (prop == WBP_GAZ || prop == WBP_EBG)) + /* *p = 0 */; /* Ignore Format and Extend characters. 
*/ - else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT)) + else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ)) { /* No break in these situations (see UAX #29): @@ -84,16 +90,8 @@ FUNC (const UNIT *s, size_t n, char *p) Numeric × (MidNum | MidNumLet | SQ) Numeric (WB12) HL × DQ HL (WB7b) HL DQ × HL (WB7c) - (ALetter | HL) × (ALetter | HL) (WB5) - (ALetter | HL) × Numeric (WB9) - Numeric × (ALetter | HL) (WB10) - Numeric × Numeric (WB8) - HL × SQ (WB7a) - Katakana × Katakana (WB13) - (ALetter | HL | Numeric | Katakana) × ExtendNumLet (WB13a) - ExtendNumLet × ExtendNumLet (WB13a) - ExtendNumLet × (ALetter | HL | Numeric | Katakana) (WB13b) - Regional_Indicator × Regional_Indicator (WB13c) + ^ (RI RI)* RI × RI (WB15) + [^RI] (RI RI)* RI × RI (WB16) */ /* No break across certain punctuation. Also, disable word breaks that were recognized earlier (due to lookahead of @@ -117,27 +115,29 @@ FUNC (const UNIT *s, size_t n, char *p) *last_compchar_ptr = 0; /* *p = 0; */ } - /* Break after Format and Extend characters. */ + /* Break before RI, if an even number of RIs are + preceding; no break after an odd number (WB15, WB16). */ + else if (last_compchar_prop == WBP_RI && prop == WBP_RI) + { + if (ri_count % 2 == 0) + *p = 1; + /* else *p = 0 */ + } + /* Break after Format and Extend character. */ else if (last_compchar_prop == WBP_EXTEND || last_compchar_prop == WBP_FORMAT) *p = 1; else { - /* Normalize property value to table index, - skipping 5 properties: WBP_EXTEND, - WBP_FORMAT, WBP_NEWLINE, WBP_CR, and - WBP_LF. */ - int last_compchar_prop_index = last_compchar_prop; - int prop_index = prop; - - if (last_compchar_prop_index >= WBP_EXTEND) - last_compchar_prop_index -= 5; - - if (prop_index >= WBP_EXTEND) - prop_index -= 5; + int last_compchar_index = + uniwbrk_prop_index[last_compchar_prop]; + int index = uniwbrk_prop_index[prop]; + /* Break between unknown pair (WB999). */ + if (last_compchar_index < 0 || index < 0) + *p = 1; /* Perform a single table lookup. 
*/ - if (uniwbrk_table[last_compchar_prop_index][prop_index]) + else if (uniwbrk_table[last_compchar_index][index]) *p = 1; /* else *p = 0; */ } @@ -145,17 +145,23 @@ FUNC (const UNIT *s, size_t n, char *p) } last_char_prop = prop; - /* Ignore Format and Extend characters, except at the start - of the line. */ + + /* Ignore Format and Extend characters, except at the + start of the line. */ if (last_compchar_prop < 0 || last_compchar_prop == WBP_CR || last_compchar_prop == WBP_LF || last_compchar_prop == WBP_NEWLINE - || !(prop == WBP_EXTEND || prop == WBP_FORMAT)) + || !(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ)) { secondlast_compchar_prop = last_compchar_prop; last_compchar_prop = prop; last_compchar_ptr = p; + + if (prop == WBP_RI) + ri_count++; + else + ri_count = 0; } s += count; |