summaryrefslogtreecommitdiff
path: root/lib/uniwbrk/u-wordbreaks.h
diff options
context:
space:
mode:
Diffstat (limited to 'lib/uniwbrk/u-wordbreaks.h')
-rw-r--r--lib/uniwbrk/u-wordbreaks.h70
1 files changed, 38 insertions, 32 deletions
diff --git a/lib/uniwbrk/u-wordbreaks.h b/lib/uniwbrk/u-wordbreaks.h
index 5b0fce72..47d1e83d 100644
--- a/lib/uniwbrk/u-wordbreaks.h
+++ b/lib/uniwbrk/u-wordbreaks.h
@@ -1,5 +1,5 @@
/* Word breaks in UTF-8/UTF-16/UTF-32 strings. -*- coding: utf-8 -*-
- Copyright (C) 2009-2016 Free Software Foundation, Inc.
+ Copyright (C) 2009-2017 Free Software Foundation, Inc.
Written by Bruno Haible <bruno@clisp.org>, 2009.
This program is free software: you can redistribute it and/or
@@ -22,7 +22,7 @@
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>. */
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
void
FUNC (const UNIT *s, size_t n, char *p)
@@ -48,6 +48,8 @@ FUNC (const UNIT *s, size_t n, char *p)
-1 at the very beginning of the string. */
int secondlast_compchar_prop = -1;
+ size_t ri_count = 0;
+
/* Don't break inside multibyte characters. */
memset (p, 0, n);
@@ -60,10 +62,10 @@ FUNC (const UNIT *s, size_t n, char *p)
/* No break at the start of the string. */
if (last_char_prop >= 0)
{
- /* No break between CR and LF. */
+ /* No break between CR and LF (WB3). */
if (last_char_prop == WBP_CR && prop == WBP_LF)
/* *p = 0 */;
- /* Break before and after newlines. */
+ /* Break before and after newlines (WB3a, WB3b). */
else if ((last_char_prop == WBP_CR
|| last_char_prop == WBP_LF
|| last_char_prop == WBP_NEWLINE)
@@ -71,8 +73,12 @@ FUNC (const UNIT *s, size_t n, char *p)
|| prop == WBP_LF
|| prop == WBP_NEWLINE))
*p = 1;
+ /* No break within emoji zwj sequence (WB3c). */
+ else if (last_char_prop == WBP_ZWJ &&
+ (prop == WBP_GAZ || prop == WBP_EBG))
+ /* *p = 0 */;
/* Ignore Format and Extend characters. */
- else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT))
+ else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ))
{
/* No break in these situations (see UAX #29):
@@ -84,16 +90,8 @@ FUNC (const UNIT *s, size_t n, char *p)
Numeric × (MidNum | MidNumLet | SQ) Numeric (WB12)
HL × DQ HL (WB7b)
HL DQ × HL (WB7c)
- (ALetter | HL) × (ALetter | HL) (WB5)
- (ALetter | HL) × Numeric (WB9)
- Numeric × (ALetter | HL) (WB10)
- Numeric × Numeric (WB8)
- HL × SQ (WB7a)
- Katakana × Katakana (WB13)
- (ALetter | HL | Numeric | Katakana) × ExtendNumLet (WB13a)
- ExtendNumLet × ExtendNumLet (WB13a)
- ExtendNumLet × (ALetter | HL | Numeric | Katakana) (WB13b)
- Regional_Indicator × Regional_Indicator (WB13c)
+ ^ (RI RI)* RI × RI (WB15)
+ [^RI] (RI RI)* RI × RI (WB16)
*/
/* No break across certain punctuation. Also, disable word
breaks that were recognized earlier (due to lookahead of
@@ -117,27 +115,29 @@ FUNC (const UNIT *s, size_t n, char *p)
*last_compchar_ptr = 0;
/* *p = 0; */
}
- /* Break after Format and Extend characters. */
+ /* Break before RI, if an even number of RI's are
+ preceding; no break if the number is odd (WB15, WB16). */
+ else if (last_compchar_prop == WBP_RI && prop == WBP_RI)
+ {
+ if (ri_count % 2 == 0)
+ *p = 1;
+ /* else *p = 0 */
+ }
+ /* Break after Format and Extend characters. */
else if (last_compchar_prop == WBP_EXTEND
|| last_compchar_prop == WBP_FORMAT)
*p = 1;
else
{
- /* Normalize property value to table index,
- skipping 5 properties: WBP_EXTEND,
- WBP_FORMAT, WBP_NEWLINE, WBP_CR, and
- WBP_LF. */
- int last_compchar_prop_index = last_compchar_prop;
- int prop_index = prop;
-
- if (last_compchar_prop_index >= WBP_EXTEND)
- last_compchar_prop_index -= 5;
-
- if (prop_index >= WBP_EXTEND)
- prop_index -= 5;
+ int last_compchar_index =
+ uniwbrk_prop_index[last_compchar_prop];
+ int index = uniwbrk_prop_index[prop];
+ /* Break between unknown pair (WB999). */
+ if (last_compchar_index < 0 || index < 0)
+ *p = 1;
/* Perform a single table lookup. */
- if (uniwbrk_table[last_compchar_prop_index][prop_index])
+ else if (uniwbrk_table[last_compchar_index][index])
*p = 1;
/* else *p = 0; */
}
@@ -145,17 +145,23 @@ FUNC (const UNIT *s, size_t n, char *p)
}
last_char_prop = prop;
- /* Ignore Format and Extend characters, except at the start
- of the line. */
+
+ /* Ignore Format, Extend and ZWJ characters, except at the
+ start of the line. */
if (last_compchar_prop < 0
|| last_compchar_prop == WBP_CR
|| last_compchar_prop == WBP_LF
|| last_compchar_prop == WBP_NEWLINE
- || !(prop == WBP_EXTEND || prop == WBP_FORMAT))
+ || !(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ))
{
secondlast_compchar_prop = last_compchar_prop;
last_compchar_prop = prop;
last_compchar_ptr = p;
+
+ if (prop == WBP_RI)
+ ri_count++;
+ else
+ ri_count = 0;
}
s += count;