summaryrefslogtreecommitdiff
path: root/src/unicode.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/unicode.c')
-rw-r--r--src/unicode.c277
1 files changed, 276 insertions, 1 deletions
diff --git a/src/unicode.c b/src/unicode.c
index 04944b9..5820319 100644
--- a/src/unicode.c
+++ b/src/unicode.c
@@ -2,7 +2,7 @@
unicode.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
+ * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -504,6 +504,281 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
#endif
+#ifdef USE_UNICODE_WORD_BREAK
+
+enum WB_TYPE {
+ WB_Any = 0,
+ WB_ALetter,
+ WB_CR,
+ WB_Double_Quote,
+ WB_Extend,
+ WB_ExtendNumLet,
+ WB_Format,
+ WB_Hebrew_Letter,
+ WB_Katakana,
+ WB_LF,
+ WB_MidLetter,
+ WB_MidNum,
+ WB_MidNumLet,
+ WB_Newline,
+ WB_Numeric,
+ WB_Regional_Indicator,
+ WB_Single_Quote,
+ WB_WSegSpace,
+ WB_ZWJ,
+};
+
+typedef struct {
+ OnigCodePoint start;
+ OnigCodePoint end;
+ enum WB_TYPE type;
+} WB_RANGE_TYPE;
+
+#include "unicode_wb_data.c"
+
+static enum WB_TYPE
+wb_get_type(OnigCodePoint code)
+{
+ OnigCodePoint low, high, x;
+ enum WB_TYPE type;
+
+ for (low = 0, high = (OnigCodePoint )WB_RANGE_NUM; low < high; ) {
+ x = (low + high) >> 1;
+ if (code > WB_RANGES[x].end)
+ low = x + 1;
+ else
+ high = x;
+ }
+
+ type = (low < (OnigCodePoint )WB_RANGE_NUM &&
+ code >= WB_RANGES[low].start) ?
+ WB_RANGES[low].type : WB_Any;
+
+ return type;
+}
+
+#define IS_WB_IGNORE_TAIL(t) ((t) == WB_Extend || (t) == WB_Format || (t) == WB_ZWJ)
+#define IS_WB_AHLetter(t) ((t) == WB_ALetter || (t) == WB_Hebrew_Letter)
+#define IS_WB_MidNumLetQ(t) ((t) == WB_MidNumLet || (t) == WB_Single_Quote)
+
+static int
+wb_get_next_main_code(OnigEncoding enc, UChar* p, const UChar* end,
+ OnigCodePoint* rcode, enum WB_TYPE* rtype)
+{
+ OnigCodePoint code;
+ enum WB_TYPE type;
+
+ while (TRUE) {
+ p += enclen(enc, p);
+ if (p >= end) break;
+
+ code = ONIGENC_MBC_TO_CODE(enc, p, end);
+ type = wb_get_type(code);
+ if (! IS_WB_IGNORE_TAIL(type)) {
+ *rcode = code;
+ *rtype = type;
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+extern int
+onigenc_wb_is_break_position(OnigEncoding enc, UChar* p, UChar* prev,
+ const UChar* start, const UChar* end)
+{
+ int r;
+ UChar* pp;
+ OnigCodePoint cfrom;
+ OnigCodePoint cfrom2;
+ OnigCodePoint cto;
+ OnigCodePoint cto2;
+ enum WB_TYPE from;
+ enum WB_TYPE from2;
+ enum WB_TYPE to;
+ enum WB_TYPE to2;
+
+ /* WB1: sot / Any */
+ if (p == start) return TRUE;
+ /* WB2: Any / eot */
+ if (p == end) return TRUE;
+
+ if (IS_NULL(prev)) {
+ prev = onigenc_get_prev_char_head(enc, start, p);
+ if (IS_NULL(prev)) return TRUE;
+ }
+
+ cfrom = ONIGENC_MBC_TO_CODE(enc, prev, end);
+ cto = ONIGENC_MBC_TO_CODE(enc, p, end);
+
+ from = wb_get_type(cfrom);
+ to = wb_get_type(cto);
+
+ /* short cut */
+ if (from == 0 && to == 0) goto WB999;
+
+ /* WB3: CR + LF */
+ if (from == WB_CR && to == WB_LF) return FALSE;
+
+ /* WB3a: (Newline|CR|LF) / */
+ if (from == WB_Newline || from == WB_CR || from == WB_LF) return TRUE;
+ /* WB3b: / (Newline|CR|LF) */
+ if (to == WB_Newline || to == WB_CR || to == WB_LF) return TRUE;
+
+ /* WB3c: ZWJ + {Extended_Pictographic} */
+ if (from == WB_ZWJ) {
+ if (onigenc_unicode_is_code_ctype(cto, PROP_INDEX_EXTENDEDPICTOGRAPHIC))
+ return FALSE;
+ }
+
+ /* WB3d: WSegSpace + WSegSpace */
+ if (from == WB_WSegSpace && to == WB_WSegSpace) return FALSE;
+
+ /* WB4: X (Extend|Format|ZWJ)* -> X */
+ if (IS_WB_IGNORE_TAIL(to)) return FALSE;
+ if (IS_WB_IGNORE_TAIL(from)) {
+ while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
+ prev = pp;
+ cfrom = ONIGENC_MBC_TO_CODE(enc, prev, end);
+ from = wb_get_type(cfrom);
+ if (! IS_WB_IGNORE_TAIL(from))
+ break;
+ }
+ }
+
+ if (IS_WB_AHLetter(from)) {
+ /* WB5: AHLetter + AHLetter */
+ if (IS_WB_AHLetter(to)) return FALSE;
+
+ /* WB6: AHLetter + (MidLetter | MidNumLetQ) AHLetter */
+ if (to == WB_MidLetter || IS_WB_MidNumLetQ(to)) {
+ r = wb_get_next_main_code(enc, p, end, &cto2, &to2);
+ if (r == 1) {
+ if (IS_WB_AHLetter(to2)) return FALSE;
+ }
+ }
+ }
+
+ /* WB7: AHLetter (MidLetter | MidNumLetQ) + AHLetter */
+ if (from == WB_MidLetter || IS_WB_MidNumLetQ(from)) {
+ if (IS_WB_AHLetter(to)) {
+ from2 = WB_Any;
+ while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
+ prev = pp;
+ cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
+ from2 = wb_get_type(cfrom2);
+ if (! IS_WB_IGNORE_TAIL(from2))
+ break;
+ }
+
+ if (IS_WB_AHLetter(from2)) return FALSE;
+ }
+ }
+
+ if (from == WB_Hebrew_Letter) {
+ /* WB7a: Hebrew_Letter + Single_Quote */
+ if (to == WB_Single_Quote) return FALSE;
+
+ /* WB7b: Hebrew_Letter + Double_Quote Hebrew_Letter */
+ if (to == WB_Double_Quote) {
+ r = wb_get_next_main_code(enc, p, end, &cto2, &to2);
+ if (r == 1) {
+ if (to2 == WB_Hebrew_Letter) return FALSE;
+ }
+ }
+ }
+
+ /* WB7c: Hebrew_Letter Double_Quote + Hebrew_Letter */
+ if (from == WB_Double_Quote) {
+ if (to == WB_Hebrew_Letter) {
+ from2 = WB_Any;
+ while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
+ prev = pp;
+ cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
+ from2 = wb_get_type(cfrom2);
+ if (! IS_WB_IGNORE_TAIL(from2))
+ break;
+ }
+
+ if (from2 == WB_Hebrew_Letter) return FALSE;
+ }
+ }
+
+ if (to == WB_Numeric) {
+ /* WB8: Numeric + Numeric */
+ if (from == WB_Numeric) return FALSE;
+
+ /* WB9: AHLetter + Numeric */
+ if (IS_WB_AHLetter(from)) return FALSE;
+
+ /* WB11: Numeric (MidNum | MidNumLetQ) + Numeric */
+ if (from == WB_MidNum || IS_WB_MidNumLetQ(from)) {
+ from2 = WB_Any;
+ while ((pp = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
+ prev = pp;
+ cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
+ from2 = wb_get_type(cfrom2);
+ if (! IS_WB_IGNORE_TAIL(from2))
+ break;
+ }
+
+ if (from2 == WB_Numeric) return FALSE;
+ }
+ }
+
+ if (from == WB_Numeric) {
+ /* WB10: Numeric + AHLetter */
+ if (IS_WB_AHLetter(to)) return FALSE;
+
+ /* WB12: Numeric + (MidNum | MidNumLetQ) Numeric */
+ if (to == WB_MidNum || IS_WB_MidNumLetQ(to)) {
+ r = wb_get_next_main_code(enc, p, end, &cto2, &to2);
+ if (r == 1) {
+ if (to2 == WB_Numeric) return FALSE;
+ }
+ }
+ }
+
+ /* WB13: Katakana + Katakana */
+ if (from == WB_Katakana && to == WB_Katakana) return FALSE;
+
+ /* WB13a: (AHLetter | Numeric | Katakana | ExtendNumLet) + ExtendNumLet */
+ if (IS_WB_AHLetter(from) || from == WB_Numeric || from == WB_Katakana
+ || from == WB_ExtendNumLet) {
+ if (to == WB_ExtendNumLet) return FALSE;
+ }
+
+ /* WB13b: ExtendNumLet + (AHLetter | Numeric | Katakana) */
+ if (from == WB_ExtendNumLet) {
+ if (IS_WB_AHLetter(to) || to == WB_Numeric || to == WB_Katakana)
+ return FALSE;
+ }
+
+
+ /* WB15: sot (RI RI)* RI + RI */
+ /* WB16: [^RI] (RI RI)* RI + RI */
+ if (from == WB_Regional_Indicator && to == WB_Regional_Indicator) {
+ int n = 0;
+ while ((prev = onigenc_get_prev_char_head(enc, start, prev)) != NULL) {
+ cfrom2 = ONIGENC_MBC_TO_CODE(enc, prev, end);
+ from2 = wb_get_type(cfrom2);
+ if (from2 != WB_Regional_Indicator)
+ break;
+
+ n++;
+ }
+ if ((n % 2) == 0) return FALSE;
+ }
+
+ WB999:
+ /* WB999: Any / Any */
+ return TRUE;
+}
+
+#endif /* USE_UNICODE_WORD_BREAK */
+
+
#ifdef USE_UNICODE_EXTENDED_GRAPHEME_CLUSTER
enum EGCB_BREAK_TYPE {