From a76fa337cc657dbe669ffb8dbdac606d4d6616f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Wed, 31 Aug 2016 03:42:05 +0200 Subject: Imported Upstream version 6.1.0 --- CMakeLists.txt | 2 +- HISTORY | 41 ++ Makefile.am | 2 +- README.md | 8 + configure.ac | 4 +- contributed/libfuzzer-onig.cpp | 31 + dist.info | 2 +- doc/API | 27 +- doc/API.ja | 27 +- doc/RE | 225 +++--- index.html | 5 +- index_ja.html | 5 +- sample/.gitignore | 1 + sample/Makefile.am | 8 +- sample/bug_fix.c | 131 ++++ sample/scan.c | 88 +++ src/ascii.c | 3 +- src/big5.c | 9 +- src/cp1251.c | 3 +- src/euc_jp.c | 9 +- src/euc_kr.c | 12 +- src/euc_tw.c | 9 +- src/gb18030.c | 9 +- src/iso8859_1.c | 3 +- src/iso8859_10.c | 3 +- src/iso8859_11.c | 3 +- src/iso8859_13.c | 3 +- src/iso8859_14.c | 3 +- src/iso8859_15.c | 3 +- src/iso8859_16.c | 3 +- src/iso8859_2.c | 3 +- src/iso8859_3.c | 3 +- src/iso8859_4.c | 3 +- src/iso8859_5.c | 3 +- src/iso8859_6.c | 3 +- src/iso8859_7.c | 3 +- src/iso8859_8.c | 3 +- src/iso8859_9.c | 3 +- src/koi8.c | 3 +- src/koi8_r.c | 3 +- src/oniguruma.h | 23 +- src/regcomp.c | 608 ++++++++-------- src/regenc.c | 43 +- src/regenc.h | 5 +- src/regerror.c | 2 + src/regexec.c | 1560 +++++++++++++++++++++------------------- src/regint.h | 2 + src/regparse.c | 179 +++-- src/regparse.h | 4 +- src/sjis.c | 9 +- src/unicode.c | 16 +- src/utf16_be.c | 9 +- src/utf16_le.c | 18 +- src/utf32_be.c | 9 +- src/utf32_le.c | 9 +- src/utf8.c | 36 +- 56 files changed, 1965 insertions(+), 1279 deletions(-) create mode 100644 contributed/libfuzzer-onig.cpp create mode 100644 sample/bug_fix.c create mode 100644 sample/scan.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b245d0..14e22fd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ cmake_minimum_required(VERSION 2.8) project(oniguruma C) set(PACKAGE onig) -set(PACKAGE_VERSION "6.0.0") +set(PACKAGE_VERSION "6.1.0") set(USE_COMBINATION_EXPLOSION_CHECK 0) set(USE_CRNL_AS_LINE_TERMINATOR 0) diff 
--git a/HISTORY b/HISTORY index 60f096e..0e9b1c7 100644 --- a/HISTORY +++ b/HISTORY @@ -1,7 +1,48 @@ History +2016/08/29: Version 6.1.0 + +2016/08/28: add contributed/libfuzzer-onig.cpp (thanks hannob) +2016/08/28: update LTVERSION 4:0:0 +2016/08/28: NEW API: onigenc_is_valid_mbc_string(). +2016/08/27: add is_valid_mbc_string() member into OnigEncodingType. +2016/08/27: fix out of bounds read. +2016/08/26: fix out of bounds read. +2016/08/25: disable USE_INVALID_CODE_SCHEME. +2016/08/24: fix out of bounds read. +2016/08/23: doc/RE improved. +2016/08/22: add onig_scan() into doc/API. +2016/08/22: fix bug: Out of bounds read in onig_strcpy() #17 +2016/08/21: fix bug: infinite loop of backreference and group. +2016/08/21: fix out of bounds read in mbc_to_code() #16 +2016/08/18: doc/RE refinements. +2016/08/16: add onig_scan() (NEW API) +2016/08/16: reimplement match stack allocation for case too many repeat + and too many captures in regexp. +2016/08/15: number of captures <= 32767 for bytecode representation. +2016/07/17: don't use int_map_backward for thread-safe. +2016/07/04: fix case of enclosed option in look-behind. +2016/07/04: fix ignore case in look-behind. +2016/05/23: fix memory leak in onig_unicode_define_user_property() +2016/05/20: declare variables at the top of scope. (thanks nmaya) + 2016/05/09: Version 6.0.0 +2016/05/05: add NEW API: onig_unicode_define_user_property() +2016/05/04: update Unicode data to 8.0.0 +2016/05/02: change OnigCodePoint type to unsigned int. +2016/05/02: add doc/UNICODE_PROPERTIES. +2016/04/19: add error code ONIGERR_FAIL_TO_INITIALIZE. +2016/04/18: add make_win64/32.bat. +2016/04/18: fix bug of uninitialized regex_t value on error. +2016/04/16: reimplement Unicode case folding. +2016/04/11: update LTVERSION = 3.0.0 +2016/04/05: remove all THREAD_ macro. +2016/04/05: add init member into OnigEncoding. (add onig_initialize()) +2016/03/28: remove state member of regex. 
+2016/03/25: move source files into src/ +2016/03/23: rename configre.in to configure.ac. +2015/11/17: fix memory leak. (thanks pigzang) 2015/07/13: change mail address. 2014/12/12: Version 5.9.6 diff --git a/Makefile.am b/Makefile.am index 4201e0b..086b23c 100644 --- a/Makefile.am +++ b/Makefile.am @@ -11,7 +11,7 @@ EXTRA_DIST = oniguruma.pc.in HISTORY README.ja README.md \ doc/API doc/API.ja doc/RE doc/RE.ja doc/FAQ doc/FAQ.ja \ doc/UNICODE_PROPERTIES \ src/Makefile.windows src/config.h.win32 src/config.h.win64 \ - windows/testc.c + windows/testc.c contributed/libfuzzer-onig.cpp bin_SCRIPTS = onig-config diff --git a/README.md b/README.md index dfd6723..a2c49cd 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,12 @@ Supported character encodings: * CP1251: contributed by Byte +New feature of version 6.1 +-------------------------- + +* improved doc/RE +* NEW API: onig_scan() + New feature of version 6.0 -------------------------- @@ -114,6 +120,7 @@ Sample Programs |sample/encode.c |example of some encodings. | |sample/listcap.c |example of the capture history. | |sample/posix.c |POSIX API sample. | +|sample/scan.c |example of using onig_scan(). | |sample/sql.c |example of the variable meta characters. | |sample/user_property.c|example of user defined Unicode property. | @@ -185,5 +192,6 @@ Source Files |utf32_be.c |UTF-32BE encoding | |utf32_le.c |UTF-32LE encoding | |unicode.c |common codes of Unicode encoding | +|unicode_fold_data.c|Unicode folding data | |win32/Makefile |Makefile for Win32 (VC++) | |win32/config.h |config.h for Win32 | diff --git a/configure.ac b/configure.ac index e9926a4..6bd3d73 100644 --- a/configure.ac +++ b/configure.ac @@ -1,5 +1,5 @@ dnl Process this file with autoconf to produce a configure script. -AC_INIT(onig, 6.0.0) +AC_INIT(onig, 6.1.0) AC_CONFIG_MACRO_DIR([m4]) @@ -34,7 +34,7 @@ fi dnl Checks for programs. 
AC_PROG_CC AM_PROG_LIBTOOL -LTVERSION="3:0:0" +LTVERSION="4:0:0" AC_SUBST(LTVERSION) AC_PROG_INSTALL diff --git a/contributed/libfuzzer-onig.cpp b/contributed/libfuzzer-onig.cpp new file mode 100644 index 0000000..984110d --- /dev/null +++ b/contributed/libfuzzer-onig.cpp @@ -0,0 +1,31 @@ +/* libfuzzer test code for oniguruma + * author: Hanno Böck, license: CC0/public domain + +Usage: +* compile oniguruma with something like + ./configure CC=clang LD=clang CFLAGS="-fsanitize-coverage=edge -fsanitize=address" \ + LDFLAGS="-fsanitize-coverage=edge -fsanitize=address" +* Compile libfuzzer stub and link against static libonig.a and libFuzzer.a: + clang++ libfuzzer-onig.cpp src/.libs/libonig.a libFuzzer.a -o libfuzzer-onig \ + -fsanitize-coverage=edge -fsanitize=address +* Put sample patterns in directory "in/" +* Run + ./libfuzzer-onig in + +Consult libfuzzer docs for further details and how to create libFuzzer.a: +http://llvm.org/docs/LibFuzzer.html + + */ +#include <stdint.h> +#include <string.h> +#include <oniguruma.h> + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) +{ + regex_t *reg; + if (onig_new + (&reg, Data, Data + Size, ONIG_OPTION_DEFAULT, ONIG_ENCODING_UTF8, + ONIG_SYNTAX_DEFAULT, 0) == 0) + onig_free(reg); + return 0; +} diff --git a/dist.info b/dist.info index 40ad07d..a7633b8 100644 --- a/dist.info +++ b/dist.info @@ -1,7 +1,7 @@ --- This file is part of LuaDist project name = "onig" -version = "6.0.0" +version = "6.1.0" desc = "Oniguruma is a regular expressions library." author = "K.Kosako" diff --git a/doc/API b/doc/API index 9904a06..8e824f5 100644 --- a/doc/API +++ b/doc/API @@ -1,4 +1,4 @@ -Oniguruma API Version 6.0.0 2016/05/06 +Oniguruma API Version 6.1.0 2016/08/22 #include <oniguruma.h> @@ -256,6 +256,27 @@ Oniguruma API Version 6.0.0 2016/05/06 ONIG_OPTION_POSIX_REGION region argument is regmatch_t[] type of POSIX API. 
+# int onig_scan(regex_t* reg, const UChar* str, const UChar* end, + OnigRegion* region, OnigOptionType option, + int (*scan_callback)(int, int, OnigRegion*, void*), + void* callback_arg) + + Scan string and callback with matching region. + + normal return: number of matching times + error: error code + interruption: return value of callback function (!= 0) + + arguments + 1 reg: regex object + 2 str: target string + 3 end: terminate address of target string + 4 region: address for return group match range info (NULL is allowed) + 5 option: search time option + 6 scan_callback: callback function (defined by user) + 7 callback_arg: optional argument passed to callback + + # OnigRegion* onig_region_new(void) Create a region. @@ -601,6 +622,10 @@ Oniguruma API Version 6.0.0 2016/05/06 2 ranges: property code point ranges (first element is number of ranges.) + [num-of-ranges, 1st-range-start, 1st-range-end, 2nd-range-start... ] + + * Don't destroy the ranges after having called this function. 
+ normal return: ONIG_NORMAL diff --git a/doc/API.ja b/doc/API.ja index ac8cc6a..f617a1c 100644 --- a/doc/API.ja +++ b/doc/API.ja @@ -1,4 +1,4 @@ -鬼車インターフェース Version 6.0.0 2016/05/06 +鬼車インターフェース Version 6.1.0 2016/08/22 #include @@ -256,6 +256,27 @@ ONIG_OPTION_POSIX_REGION region引数をPOSIX APIのregmatch_t[]にする +# int onig_scan(regex_t* reg, const UChar* str, const UChar* end, + OnigRegion* region, OnigOptionType option, + int (*scan_callback)(int, int, OnigRegion*, void*), + void* callback_arg) + + 正規表現で文字列をスキャンして、マッチングする毎にコールバック関数を呼び出す。 + + 正常終了: マッチ回数 (0回も含める) + エラー: エラーコード (< 0) + 中断: コールバック関数が0以外の戻り値を返したとき、その値を戻り値として中断 + + 引数 + 1 reg: 正規表現オブジェクト + 2 str: 検索対象文字列 + 3 end: 検索対象文字列の終端アドレス + 4 region: マッチ領域情報(region) (NULLも許される) + 5 option: 検索時オプション + 6 scan_callback: コールバック関数 + 7 callback_arg: コールバック関数に渡される付加引数値 + + # OnigRegion* onig_region_new(void) マッチ領域情報(region)を作成する。 @@ -608,6 +629,10 @@ 2 ranges: プロパティコードポイント範囲 (最初の要素は範囲の数) + [num-of-ranges, 1st-range-start, 1st-range-end, 2nd-range-start... ] + + * この関数を呼んだ後で、rangesを変更/破壊しないこと + 正常終了戻り値: ONIG_NORMAL diff --git a/doc/RE b/doc/RE index b4bf536..e8a6aa4 100644 --- a/doc/RE +++ b/doc/RE @@ -1,35 +1,35 @@ -Oniguruma Regular Expressions Version 6.0.0 2016/05/02 +Oniguruma Regular Expressions Version 6.0.0 2016/08/18 syntax: ONIG_SYNTAX_RUBY (default) 1. Syntax elements - \ escape (enable or disable meta character meaning) + \ escape (enable or disable meta character) | alternation (...) group - [...] character class + [...] character class 2. 
Characters - \t horizontal tab (0x09) - \v vertical tab (0x0B) - \n newline (0x0A) - \r return (0x0D) - \b back space (0x08) - \f form feed (0x0C) - \a bell (0x07) - \e escape (0x1B) - \nnn octal char (encoded byte value) - \xHH hexadecimal char (encoded byte value) - \x{7HHHHHHH} wide hexadecimal char (character code point value) - \cx control char (character code point value) - \C-x control char (character code point value) - \M-x meta (x|0x80) (character code point value) - \M-\C-x meta control char (character code point value) - - (* \b is effective in character class [...] only) + \t horizontal tab (0x09) + \v vertical tab (0x0B) + \n newline (line feed) (0x0A) + \r carriage return (0x0D) + \b backspace (0x08) + \f form feed (0x0C) + \a bell (0x07) + \e escape (0x1B) + \nnn octal char (encoded byte value) + \xHH hexadecimal char (encoded byte value) + \x{7HHHHHHH} wide hexadecimal char (character code point value) + \cx control char (character code point value) + \C-x control char (character code point value) + \M-x meta (x|0x80) (character code point value) + \M-\C-x meta control char (character code point value) + + (* \b as backspace is effective in character class only) 3. Character types @@ -39,12 +39,12 @@ syntax: ONIG_SYNTAX_RUBY (default) \w word character Not Unicode: - alphanumeric, "_" and multibyte char. + alphanumeric, "_" and multibyte char. 
Unicode: General_Category -- (Letter|Mark|Number|Connector_Punctuation) - \W non word char + \W non-word char \s whitespace char @@ -52,22 +52,22 @@ syntax: ONIG_SYNTAX_RUBY (default) \t, \n, \v, \f, \r, \x20 Unicode: - 0009, 000A, 000B, 000C, 000D, 0085(NEL), + 0009, 000A, 000B, 000C, 000D, 0085(NEL), General_Category -- Line_Separator -- Paragraph_Separator -- Space_Separator - \S non whitespace char + \S non-whitespace char \d decimal digit char Unicode: General_Category -- Decimal_Number - \D non decimal digit char + \D non-decimal-digit char \h hexadecimal digit char [0-9a-fA-F] - \H non hexadecimal digit char + \H non-hexdigit char Character Property @@ -80,7 +80,7 @@ syntax: ONIG_SYNTAX_RUBY (default) + works on all encodings Alnum, Alpha, Blank, Cntrl, Digit, Graph, Lower, - Print, Punct, Space, Upper, XDigit, Word, ASCII, + Print, Punct, Space, Upper, XDigit, Word, ASCII + works on EUC_JP, Shift_JIS Hiragana, Katakana @@ -97,9 +97,9 @@ syntax: ONIG_SYNTAX_RUBY (default) ? 1 or 0 times * 0 or more times + 1 or more times - {n,m} at least n but not more than m times + {n,m} at least n but no more than m times {n,} at least n times - {,n} at least 0 but not more than n times ({0,n}) + {,n} at least 0 but no more than n times ({0,n}) {n} n times reluctant @@ -107,11 +107,11 @@ syntax: ONIG_SYNTAX_RUBY (default) ?? 1 or 0 times *? 0 or more times +? 1 or more times - {n,m}? at least n but not more than m times + {n,m}? at least n but not more than m times {n,}? at least n times {,n}? at least 0 but not more than n times (== {0,n}?) 
- possessive (greedy and does not backtrack after repeated) + possessive (greedy and does not backtrack once match) ?+ 1 or 0 times *+ 0 or more times @@ -127,24 +127,24 @@ syntax: ONIG_SYNTAX_RUBY (default) ^ beginning of the line $ end of the line \b word boundary - \B not word boundary + \B non-word boundary \A beginning of string \Z end of string, or before newline at the end \z end of string - \G matching start position + \G where the current search attempt begins 6. Character class - ^... negative class (lowest precedence operator) + ^... negative class (lowest precedence) x-y range from x to y [...] set (character class in character class) - ..&&.. intersection (low precedence at the next of ^) - + ..&&.. intersection (low precedence, only higher than ^) + ex. [a-w&&[^c-g]z] ==> ([a-w] AND ([^c-g] OR z)) ==> [abh-w] - * If you want to use '[', '-', ']' as a normal character - in a character class, you should escape these characters by '\'. + * If you want to use '[', '-', or ']' as a normal character + in character class, you should escape them with '\'. POSIX bracket ([:xxxxx:], negate [:^xxxxx:]) @@ -196,79 +196,75 @@ syntax: ONIG_SYNTAX_RUBY (default) (?imx-imx) option on/off i: ignore case - m: multi-line (dot(.) match newline) + m: multi-line (dot (.) also matches newline) x: extended form (?imx-imx:subexp) option on/off for subexp - (?:subexp) not captured group - (subexp) captured group + (?:subexp) non-capturing group + (subexp) capturing group (?=subexp) look-ahead (?!subexp) negative look-ahead (?<=subexp) look-behind (?subexp) atomic group - don't backtrack in subexp. + no backtracks in subexp. (?subexp), (?'name'subexp) define named group - (All characters of the name must be a word character.) + (Each character of the name must be a word character.) - Not only a name but a number is assigned like a captured + Not only a name but a number is assigned like a capturing group. - Assigning the same name as two or more subexps is allowed. 
- In this case, a subexp call can not be performed although - the back reference is possible. + Assigning the same name to two or more subexps is allowed. + +8. Backreferences -8. Back reference + When we say "backreference a group," it actually means, "re-match the same + text matched by the subexp in that group." - \n back reference by group number (n >= 1) - \k<n> back reference by group number (n >= 1) - \k'n' back reference by group number (n >= 1) - \k<-n> back reference by relative group number (n >= 1) - \k'-n' back reference by relative group number (n >= 1) - \k<name> back reference by group name - \k'name' back reference by group name + \n \k<n> \k'n' (n >= 1) backreference the nth group in the regexp + \k<-n> \k'-n' (n >= 1) backreference the nth group counting + backwards from the referring position + \k<name> \k'name' backreference a group with the specified name - In the back reference by the multiplex definition name, - a subexp with a large number is referred to preferentially. - (When not matched, a group of the small number is referred to.) + When backreferencing with a name that is assigned to more than one group, + the last group with the name is checked first, if not matched then the + previous one with the name, and so on, until there is a match. - * Back reference by group number is forbidden if named group is defined - in the pattern and ONIG_OPTION_CAPTURE_GROUP is not setted. + * Backreference by number is forbidden if any named group is defined and + ONIG_OPTION_CAPTURE_GROUP is not set. - back reference with nest level + backreference with recursion level - level: 0, 1, 2, ... + (n >= 1, level >= 0) - \k<n+level> (n >= 1) - \k<n-level> (n >= 1) - \k'n+level' (n >= 1) - \k'n-level' (n >= 1) + \k<n+level> \k'n+level' + \k<n-level> \k'n-level' - \k<name+level> - \k<name-level> - \k'name+level' - \k'name-level' + \k<name+level> \k'name+level' + \k<name-level> \k'name-level' - Destinate relative nest level from back reference position. + Designate a group at the recursion level relative to the referring position. ex 1. 
+ /\A(?|.|(?:(?.)\g\k))\z/.match("reee") /\A(?|.|(?:(?.)\g\k))\z/.match("reer") + \k refers to the (?.) on the same recursion level with it. + ex 2. r = Regexp.compile(<<'__REGEXP__'.strip, Regexp::EXTENDED) @@ -280,53 +276,56 @@ syntax: ONIG_SYNTAX_RUBY (default) \g __REGEXP__ - p r.match('fbbbf').captures + p r.match("fbbbf").captures + + +9. Subexp calls ("Tanaka Akira special") + When we say "call a group," it actually means, "re-execute the subexp in + that group." + \g \g'n' (n >= 1) call the nth group + \g<-n> \g'-n' (n >= 1) call the nth group counting backwards from + the calling position + \g \g'name' call the group with the specified name -9. Subexp call ("Tanaka Akira special") + * Left-most recursive calls are not allowed. - \g call by group name - \g'name' call by group name - \g call by group number (n >= 1) - \g'n' call by group number (n >= 1) - \g<-n> call by relative group number (n >= 1) - \g'-n' call by relative group number (n >= 1) + ex. (?a|\gb) => error + (?a|b\gc) => OK - * left-most recursive call is not allowed. - ex. (?a|\gb) => error - (?a|b\gc) => OK + * Calls with a name that is assigned to more than one groups are not + allowed. - * Call by group number is forbidden if named group is defined in the pattern - and ONIG_OPTION_CAPTURE_GROUP is not setted. + * Call by number is forbidden if any named group is defined and + ONIG_OPTION_CAPTURE_GROUP is not set. - * If the option status of called group is different from calling position - then the group's option is effective. + * The option status of the called group is always effective. - ex. (?-i:\g)(?i:(?a)){0} match to "A" + ex. /(?-i:\g)(?i:(?a)){0}/.match("A") 10. Captured group - Behavior of the no-named group (...) changes with the following conditions. + Behavior of an unnamed group (...) changes with the following conditions. (But named group is not changed.) case 1. /.../ (named group is not used, no option) - (...) is treated as a captured group. + (...) 
is treated as a capturing group. case 2. /.../g (named group is not used, 'g' option) - (...) is treated as a no-captured group (?:...). + (...) is treated as a non-capturing group (?:...). case 3. /..(?..)../ (named group is used, no option) - (...) is treated as a no-captured group (?:...). + (...) is treated as a non-capturing group. numbered-backref/call is not allowed. case 4. /..(?..)../G (named group is used, 'G' option) - (...) is treated as a captured group. + (...) is treated as a capturing group. numbered-backref/call is allowed. where @@ -338,14 +337,14 @@ syntax: ONIG_SYNTAX_RUBY (default) ----------------------------- -A-1. Syntax depend options +A-1. Syntax-dependent options + ONIG_SYNTAX_RUBY - (?m): dot(.) match newline + (?m): dot (.) also matches newline + ONIG_SYNTAX_PERL and ONIG_SYNTAX_JAVA - (?s): dot(.) match newline - (?m): ^ match after newline, $ match before newline + (?s): dot (.) also matches newline + (?m): ^ matches after newline, $ matches before newline A-2. Original extensions @@ -356,7 +355,7 @@ A-2. Original extensions + subexp call \g, \g -A-3. Lacked features compare with perl 5.8.0 +A-3. Missing features compared with perl 5.8.0 + \N{name} + \l,\u,\L,\U, \X, \C @@ -373,12 +372,12 @@ A-4. Differences with Japanized GNU regex(version 0.12) of Ruby 1.8 + add character property (\p{property}, \P{property}) + add hexadecimal digit char type (\h, \H) + add look-behind - (?<=fixed-char-length-pattern), (?Oniguruma (Japanese)

-(c) K.Kosako, updated at: 2016/05/06 +(c) K.Kosako, updated at: 2016/08/22

@@ -16,6 +16,7 @@
What's new
    +
  • 2016/08/29: Version 6.1.0 released.
  • 2016/05/09: Version 6.0.0 released.
  • 2014/12/12: Version 5.9.6 released.
@@ -65,7 +66,7 @@ About 2.x, please contact him.
* 2.x supports Ruby1.6/1.8.

-
Documents: (version 6.0.0) +
Documents: (version 6.1.0)
  • Regular Expressions (Japanese: EUC-JP) diff --git a/index_ja.html b/index_ja.html index 3b089fc..9c68c85 100644 --- a/index_ja.html +++ b/index_ja.html @@ -8,7 +8,7 @@

    薔取

    -(c) K.Kosako, 腟贋: 2016/05/06 +(c) K.Kosako, 腟贋: 2016/08/22

    @@ -16,6 +16,7 @@
    贋井
      +
    • 2016/08/29: Version 6.1.0 若
    • 2016/05/09: Version 6.0.0 若
    • 2014/12/12: Version 5.9.6 若
    @@ -65,7 +66,7 @@ ISO-8859-11, ISO-8859-13, ISO-8859-14, ISO-8859-15, ISO-8859-16
    * 2.xRuby1.6/1.8腟莨若帥ゃ篏 (2006綛贋т絎腟篋)

    -
    ャ<潟: (version 6.0.0) +
    ャ<潟: (version 6.1.0)
    • 罩h頫 (ユ茯: EUC-JP) diff --git a/sample/.gitignore b/sample/.gitignore index 963d2e4..79fab44 100644 --- a/sample/.gitignore +++ b/sample/.gitignore @@ -7,4 +7,5 @@ /sql /syntax /user_property +/bug_fix /log* diff --git a/sample/Makefile.am b/sample/Makefile.am index 53f0d08..6799ecd 100644 --- a/sample/Makefile.am +++ b/sample/Makefile.am @@ -6,9 +6,9 @@ LDADD = $(lib_onig) AM_LDFLAGS = -L$(prefix)/lib AM_CPPFLAGS = -I../src -I$(includedir) -TESTS = encode listcap names posix simple sql syntax user_property +TESTS = encode listcap names posix simple sql syntax user_property bug_fix -check_PROGRAMS = encode listcap names posix simple sql syntax user_property +check_PROGRAMS = encode listcap names posix simple sql syntax user_property bug_fix encode_SOURCES = encode.c listcap_SOURCES = listcap.c @@ -18,10 +18,11 @@ simple_SOURCES = simple.c sql_SOURCES = sql.c syntax_SOURCES = syntax.c user_property_SOURCES = user_property.c +bug_fix = bug_fix.c sampledir = . -test: encode listcap names posix simple sql syntax user_property +test: encode listcap names posix simple sql syntax user_property bug_fix $(sampledir)/encode $(sampledir)/listcap $(sampledir)/names @@ -30,3 +31,4 @@ test: encode listcap names posix simple sql syntax user_property $(sampledir)/sql $(sampledir)/syntax $(sampledir)/user_property + $(sampledir)/bug_fix diff --git a/sample/bug_fix.c b/sample/bug_fix.c new file mode 100644 index 0000000..9a45a78 --- /dev/null +++ b/sample/bug_fix.c @@ -0,0 +1,131 @@ +/* + * bug_fix.c + */ +#include +#include "oniguruma.h" + +static OnigCaseFoldType CF = ONIGENC_CASE_FOLD_MIN; + +static int +search(regex_t* reg, unsigned char* str, unsigned char* end) +{ + int r; + unsigned char *start, *range; + OnigRegion *region; + + region = onig_region_new(); + + start = str; + range = end; + r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE); + if (r >= 0) { + int i; + + fprintf(stderr, "match at %d (%s)\n", r, + ONIGENC_NAME(onig_get_encoding(reg))); + 
for (i = 0; i < region->num_regs; i++) { + fprintf(stderr, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]); + } + } + else if (r == ONIG_MISMATCH) { + fprintf(stderr, "search fail (%s)\n", + ONIGENC_NAME(onig_get_encoding(reg))); + } + else { /* error */ + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str(s, r); + fprintf(stderr, "ERROR: %s\n", s); + fprintf(stderr, " (%s)\n", ONIGENC_NAME(onig_get_encoding(reg))); + return -1; + } + + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + return 0; +} + +static int +exec_deluxe(OnigEncoding pattern_enc, OnigEncoding str_enc, + OnigOptionType options, char* apattern, char* astr) +{ + int r; + unsigned char *end; + regex_t* reg; + OnigCompileInfo ci; + OnigErrorInfo einfo; + UChar* pattern = (UChar* )apattern; + UChar* str = (UChar* )astr; + + onig_initialize(&str_enc, 1); + + ci.num_of_elements = 5; + ci.pattern_enc = pattern_enc; + ci.target_enc = str_enc; + ci.syntax = ONIG_SYNTAX_DEFAULT; + ci.option = options; + ci.case_fold_flag = CF; + + r = onig_new_deluxe(&reg, pattern, + pattern + onigenc_str_bytelen_null(pattern_enc, pattern), + &ci, &einfo); + if (r != ONIG_NORMAL) { + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str(s, r, &einfo); + fprintf(stderr, "ERROR: %s\n", s); + return -1; + } + + end = str + onigenc_str_bytelen_null(str_enc, str); + r = search(reg, str, end); + + onig_free(reg); + onig_end(); + return 0; +} + +static int +exec(OnigEncoding enc, OnigOptionType options, char* apattern, char* astr) +{ + int r; + unsigned char *end; + regex_t* reg; + OnigErrorInfo einfo; + UChar* pattern = (UChar* )apattern; + UChar* str = (UChar* )astr; + + onig_initialize(&enc, 1); + + r = onig_new(&reg, pattern, + pattern + onigenc_str_bytelen_null(enc, pattern), + options, enc, ONIG_SYNTAX_DEFAULT, &einfo); + if (r != ONIG_NORMAL) { + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str(s, r, &einfo); + fprintf(stderr, "ERROR: %s\n", s); + return -1; + } + + end = str 
+ onigenc_str_bytelen_null(enc, str); + r = search(reg, str, end); + + onig_free(reg); + onig_end(); + return 0; +} + + + +extern int main(int argc, char* argv[]) +{ + /* fix ignore case in look-behind + commit: 3340ec2cc5627172665303fe248c9793354d2251 */ + exec_deluxe(ONIG_ENCODING_UTF8, ONIG_ENCODING_UTF8, + ONIG_OPTION_IGNORECASE, + "(?<=\305\211)a", "\312\274na"); /* \u{0149}a \u{02bc}na */ + + exec(ONIG_ENCODING_UTF8, ONIG_OPTION_NONE, "(\\2)(\\1)", "aa"); /* fail. */ + + exec(ONIG_ENCODING_UTF8, ONIG_OPTION_FIND_LONGEST, + "a*", "aa aaa aaaa aaaaa "); /* match 12-17 */ + + return 0; +} diff --git a/sample/scan.c b/sample/scan.c new file mode 100644 index 0000000..ad5ae74 --- /dev/null +++ b/sample/scan.c @@ -0,0 +1,88 @@ +/* + * scan.c + */ +#include +#include +#include "oniguruma.h" + +static int +scan_callback(int n, int r, OnigRegion* region, void* arg) +{ + int i; + + fprintf(stdout, "scan: %d\n", n); + + fprintf(stdout, "match at %d\n", r); + for (i = 0; i < region->num_regs; i++) { + fprintf(stdout, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]); + } + + return 0; +} + +static int +scan(regex_t* reg, unsigned char* str, unsigned char* end) +{ + int r; + OnigRegion *region; + + region = onig_region_new(); + + r = onig_scan(reg, str, end, region, ONIG_OPTION_NONE, scan_callback, NULL); + if (r >= 0) { + fprintf(stdout, "total: %d match\n", r); + } + else { /* error */ + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((OnigUChar* )s, r); + fprintf(stderr, "ERROR: %s\n", s); + return -1; + } + + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + return 0; +} + +static int +exec(OnigEncoding enc, OnigOptionType options, char* apattern, char* astr) +{ + int r; + unsigned char *end; + regex_t* reg; + OnigErrorInfo einfo; + UChar* pattern_end; + UChar* pattern = (UChar* )apattern; + UChar* str = (UChar* )astr; + + onig_initialize(&enc, 1); + + pattern_end = pattern + onigenc_str_bytelen_null(enc, pattern); + + r = 
onig_new(&reg, pattern, pattern_end, options, enc, ONIG_SYNTAX_DEFAULT, &einfo); + if (r != ONIG_NORMAL) { + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((OnigUChar* )s, r, &einfo); + fprintf(stderr, "ERROR: %s\n", s); + return -1; + } + + end = str + onigenc_str_bytelen_null(enc, str); + r = scan(reg, str, end); + + onig_free(reg); + onig_end(); + return 0; +} + + +extern int main(int argc, char* argv[]) +{ + exec(ONIG_ENCODING_UTF8, ONIG_OPTION_NONE, + "\\Ga+\\s*", "a aa aaa baaa"); + + fprintf(stdout, "\n"); + exec(ONIG_ENCODING_UTF8, ONIG_OPTION_NONE, + "a+\\s*", "a aa aaa baaa"); + + return 0; +} diff --git a/src/ascii.c b/src/ascii.c index 92db179..b21878d 100644 --- a/src/ascii.c +++ b/src/ascii.c @@ -56,5 +56,6 @@ OnigEncodingType OnigEncodingASCII = { onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + onigenc_always_true_is_valid_mbc_string }; diff --git a/src/big5.c b/src/big5.c index 3c90eaa..3d44975 100644 --- a/src/big5.c +++ b/src/big5.c @@ -54,6 +54,12 @@ big5_mbc_enc_len(const UChar* p) return EncLen_BIG5[*p]; } +static int +is_valid_mbc_string(const UChar* s, const UChar* end) +{ + return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_BIG5, s, end); +} + static OnigCodePoint big5_mbc_to_code(const UChar* p, const UChar* end) { @@ -160,5 +166,6 @@ OnigEncodingType OnigEncodingBIG5 = { big5_left_adjust_char_head, big5_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + is_valid_mbc_string }; diff --git a/src/cp1251.c b/src/cp1251.c index af45847..4d655bb 100644 --- a/src/cp1251.c +++ b/src/cp1251.c @@ -198,5 +198,6 @@ OnigEncodingType OnigEncodingCP1251 = { onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + onigenc_always_true_is_valid_mbc_string }; diff 
--git a/src/euc_jp.c b/src/euc_jp.c index 17f53e7..19422ce 100644 --- a/src/euc_jp.c +++ b/src/euc_jp.c @@ -56,6 +56,12 @@ mbc_enc_len(const UChar* p) return EncLen_EUCJP[*p]; } +static int +is_valid_mbc_string(const UChar* s, const UChar* end) +{ + return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_EUC_JP, s, end); +} + static OnigCodePoint mbc_to_code(const UChar* p, const UChar* end) { @@ -269,5 +275,6 @@ OnigEncodingType OnigEncodingEUC_JP = { left_adjust_char_head, is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + is_valid_mbc_string }; diff --git a/src/euc_kr.c b/src/euc_kr.c index 769104b..12803cd 100644 --- a/src/euc_kr.c +++ b/src/euc_kr.c @@ -54,6 +54,12 @@ euckr_mbc_enc_len(const UChar* p) return EncLen_EUCKR[*p]; } +static int +is_valid_mbc_string(const UChar* s, const UChar* end) +{ + return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_EUC_KR, s, end); +} + static OnigCodePoint euckr_mbc_to_code(const UChar* p, const UChar* end) { @@ -136,7 +142,8 @@ OnigEncodingType OnigEncodingEUC_KR = { euckr_left_adjust_char_head, euckr_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + is_valid_mbc_string }; /* Same with OnigEncodingEUC_KR except the name */ @@ -158,5 +165,6 @@ OnigEncodingType OnigEncodingEUC_CN = { euckr_left_adjust_char_head, euckr_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + is_valid_mbc_string }; diff --git a/src/euc_tw.c b/src/euc_tw.c index f757961..4e07567 100644 --- a/src/euc_tw.c +++ b/src/euc_tw.c @@ -54,6 +54,12 @@ euctw_mbc_enc_len(const UChar* p) return EncLen_EUCTW[*p]; } +static int +is_valid_mbc_string(const UChar* s, const UChar* end) +{ + return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_EUC_TW, s, end); +} + static OnigCodePoint euctw_mbc_to_code(const UChar* p, const UChar* end) { @@ -127,5 +133,6 @@ OnigEncodingType OnigEncodingEUC_TW = 
{ euctw_left_adjust_char_head, euctw_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + is_valid_mbc_string }; diff --git a/src/gb18030.c b/src/gb18030.c index beeda06..36fc3de 100644 --- a/src/gb18030.c +++ b/src/gb18030.c @@ -75,6 +75,12 @@ gb18030_mbc_enc_len(const UChar* p) return 2; } +static int +is_valid_mbc_string(const UChar* s, const UChar* end) +{ + return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_GB18030, s, end); +} + static OnigCodePoint gb18030_mbc_to_code(const UChar* p, const UChar* end) { @@ -493,5 +499,6 @@ OnigEncodingType OnigEncodingGB18030 = { gb18030_left_adjust_char_head, gb18030_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + is_valid_mbc_string }; diff --git a/src/iso8859_1.c b/src/iso8859_1.c index b2d8c9a..573931f 100644 --- a/src/iso8859_1.c +++ b/src/iso8859_1.c @@ -270,5 +270,6 @@ OnigEncodingType OnigEncodingISO_8859_1 = { onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + onigenc_always_true_is_valid_mbc_string }; diff --git a/src/iso8859_10.c b/src/iso8859_10.c index a6dbe5c..91b18d4 100644 --- a/src/iso8859_10.c +++ b/src/iso8859_10.c @@ -237,5 +237,6 @@ OnigEncodingType OnigEncodingISO_8859_10 = { onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + onigenc_always_true_is_valid_mbc_string }; diff --git a/src/iso8859_11.c b/src/iso8859_11.c index d671af8..518be25 100644 --- a/src/iso8859_11.c +++ b/src/iso8859_11.c @@ -94,5 +94,6 @@ OnigEncodingType OnigEncodingISO_8859_11 = { onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + onigenc_always_true_is_valid_mbc_string }; diff --git 
a/src/iso8859_13.c b/src/iso8859_13.c index c97e24e..d1f39a2 100644 --- a/src/iso8859_13.c +++ b/src/iso8859_13.c @@ -226,5 +226,6 @@ OnigEncodingType OnigEncodingISO_8859_13 = { onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + onigenc_always_true_is_valid_mbc_string }; diff --git a/src/iso8859_14.c b/src/iso8859_14.c index edf5313..3361b0d 100644 --- a/src/iso8859_14.c +++ b/src/iso8859_14.c @@ -239,5 +239,6 @@ OnigEncodingType OnigEncodingISO_8859_14 = { onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + onigenc_always_true_is_valid_mbc_string }; diff --git a/src/iso8859_15.c b/src/iso8859_15.c index 24aa573..b09e876 100644 --- a/src/iso8859_15.c +++ b/src/iso8859_15.c @@ -233,5 +233,6 @@ OnigEncodingType OnigEncodingISO_8859_15 = { onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + onigenc_always_true_is_valid_mbc_string }; diff --git a/src/iso8859_16.c b/src/iso8859_16.c index 4f4c2b8..29a350d 100644 --- a/src/iso8859_16.c +++ b/src/iso8859_16.c @@ -235,5 +235,6 @@ OnigEncodingType OnigEncodingISO_8859_16 = { onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + onigenc_always_true_is_valid_mbc_string }; diff --git a/src/iso8859_2.c b/src/iso8859_2.c index 16c1549..9eb3536 100644 --- a/src/iso8859_2.c +++ b/src/iso8859_2.c @@ -233,5 +233,6 @@ OnigEncodingType OnigEncodingISO_8859_2 = { onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + onigenc_always_true_is_valid_mbc_string }; diff --git 
a/src/iso8859_3.c b/src/iso8859_3.c index 549f01e..862823a 100644 --- a/src/iso8859_3.c +++ b/src/iso8859_3.c @@ -233,5 +233,6 @@ OnigEncodingType OnigEncodingISO_8859_3 = { onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + onigenc_always_true_is_valid_mbc_string }; diff --git a/src/iso8859_4.c b/src/iso8859_4.c index 74b3a88..db706da 100644 --- a/src/iso8859_4.c +++ b/src/iso8859_4.c @@ -235,5 +235,6 @@ OnigEncodingType OnigEncodingISO_8859_4 = { onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + onigenc_always_true_is_valid_mbc_string }; diff --git a/src/iso8859_5.c b/src/iso8859_5.c index 70c799a..0e03e9c 100644 --- a/src/iso8859_5.c +++ b/src/iso8859_5.c @@ -224,5 +224,6 @@ OnigEncodingType OnigEncodingISO_8859_5 = { onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + onigenc_always_true_is_valid_mbc_string }; diff --git a/src/iso8859_6.c b/src/iso8859_6.c index e36579a..6289af5 100644 --- a/src/iso8859_6.c +++ b/src/iso8859_6.c @@ -94,5 +94,6 @@ OnigEncodingType OnigEncodingISO_8859_6 = { onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + onigenc_always_true_is_valid_mbc_string }; diff --git a/src/iso8859_7.c b/src/iso8859_7.c index 87a1acd..75b520f 100644 --- a/src/iso8859_7.c +++ b/src/iso8859_7.c @@ -220,5 +220,6 @@ OnigEncodingType OnigEncodingISO_8859_7 = { onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + onigenc_always_true_is_valid_mbc_string }; diff --git a/src/iso8859_8.c b/src/iso8859_8.c 
index e34a3bb..5f18345 100644 --- a/src/iso8859_8.c +++ b/src/iso8859_8.c @@ -94,5 +94,6 @@ OnigEncodingType OnigEncodingISO_8859_8 = { onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + onigenc_always_true_is_valid_mbc_string }; diff --git a/src/iso8859_9.c b/src/iso8859_9.c index 8cbbbd6..d0c06bb 100644 --- a/src/iso8859_9.c +++ b/src/iso8859_9.c @@ -226,5 +226,6 @@ OnigEncodingType OnigEncodingISO_8859_9 = { onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + onigenc_always_true_is_valid_mbc_string }; diff --git a/src/koi8.c b/src/koi8.c index 9c11776..80f89e9 100644 --- a/src/koi8.c +++ b/src/koi8.c @@ -248,5 +248,6 @@ OnigEncodingType OnigEncodingKOI8 = { onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + onigenc_always_true_is_valid_mbc_string }; diff --git a/src/koi8_r.c b/src/koi8_r.c index 2f090f4..f8ef34f 100644 --- a/src/koi8_r.c +++ b/src/koi8_r.c @@ -210,5 +210,6 @@ OnigEncodingType OnigEncodingKOI8_R = { onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + onigenc_always_true_is_valid_mbc_string }; diff --git a/src/oniguruma.h b/src/oniguruma.h index 6e62b50..75301ca 100644 --- a/src/oniguruma.h +++ b/src/oniguruma.h @@ -35,7 +35,7 @@ extern "C" { #define ONIGURUMA #define ONIGURUMA_VERSION_MAJOR 6 -#define ONIGURUMA_VERSION_MINOR 0 +#define ONIGURUMA_VERSION_MINOR 1 #define ONIGURUMA_VERSION_TEENY 0 #ifdef __cplusplus @@ -103,9 +103,9 @@ extern "C" { typedef unsigned int OnigCodePoint; typedef unsigned char OnigUChar; typedef unsigned int OnigCtype; -typedef unsigned int OnigDistance; +typedef unsigned int 
OnigLen; -#define ONIG_INFINITE_DISTANCE ~((OnigDistance )0) +#define ONIG_INFINITE_DISTANCE ~((OnigLen )0) typedef unsigned int OnigCaseFoldType; /* case fold flag */ @@ -165,6 +165,7 @@ typedef struct OnigEncodingTypeST { int (*is_allowed_reverse_match)(const OnigUChar* p, const OnigUChar* end); int (*init)(void); int (*is_initialized)(void); + int (*is_valid_mbc_string)(const OnigUChar* s, const OnigUChar* end); } OnigEncodingType; typedef OnigEncodingType* OnigEncoding; @@ -279,6 +280,8 @@ ONIG_EXTERN OnigEncodingType OnigEncodingGB18030; (enc)->is_allowed_reverse_match(s,end) #define ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc,start,s) \ (enc)->left_adjust_char_head(start, s) +#define ONIGENC_IS_VALID_MBC_STRING(enc,s,end) \ + (enc)->is_valid_mbc_string(s,end) #define ONIGENC_APPLY_ALL_CASE_FOLD(enc,case_fold_flag,f,arg) \ (enc)->apply_all_case_fold(case_fold_flag,f,arg) #define ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc,case_fold_flag,p,end,acs) \ @@ -360,6 +363,8 @@ ONIG_EXTERN int onigenc_strlen_null P_((OnigEncoding enc, const OnigUChar* p)); ONIG_EXTERN int onigenc_str_bytelen_null P_((OnigEncoding enc, const OnigUChar* p)); +ONIG_EXTERN +int onigenc_is_valid_mbc_string P_((OnigEncoding enc, const UChar* s, const UChar* end)); @@ -367,6 +372,7 @@ int onigenc_str_bytelen_null P_((OnigEncoding enc, const OnigUChar* p)); /* config parameters */ #define ONIG_NREGION 10 +#define ONIG_MAX_CAPTURE_NUM 32767 #define ONIG_MAX_BACKREF_NUM 1000 #define ONIG_MAX_REPEAT_NUM 100000 #define ONIG_MAX_MULTI_BYTE_RANGES_NUM 10000 @@ -575,6 +581,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIGERR_TOO_BIG_BACKREF_NUMBER -207 #define ONIGERR_INVALID_BACKREF -208 #define ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED -209 +#define ONIGERR_TOO_MANY_CAPTURES -210 #define ONIGERR_TOO_LONG_WIDE_CHAR_VALUE -212 #define ONIGERR_EMPTY_GROUP_NAME -214 #define ONIGERR_INVALID_GROUP_NAME -215 @@ -679,16 +686,16 @@ typedef struct re_pattern_buffer { int optimize; /* optimize flag */ int 
threshold_len; /* search str-length for apply optimize */ int anchor; /* BEGIN_BUF, BEGIN_POS, (SEMI_)END_BUF */ - OnigDistance anchor_dmin; /* (SEMI_)END_BUF anchor distance */ - OnigDistance anchor_dmax; /* (SEMI_)END_BUF anchor distance */ + OnigLen anchor_dmin; /* (SEMI_)END_BUF anchor distance */ + OnigLen anchor_dmax; /* (SEMI_)END_BUF anchor distance */ int sub_anchor; /* start-anchor for exact or map */ unsigned char *exact; unsigned char *exact_end; unsigned char map[ONIG_CHAR_TABLE_SIZE]; /* used as BM skip or char-map */ int *int_map; /* BM skip for exact_len > 255 */ int *int_map_backward; /* BM skip for backward search */ - OnigDistance dmin; /* min-distance of exact or map */ - OnigDistance dmax; /* max-distance of exact or map */ + OnigLen dmin; /* min-distance of exact or map */ + OnigLen dmax; /* max-distance of exact or map */ /* regex_t link chain */ struct re_pattern_buffer* chain; /* escape compile-conflict */ @@ -735,6 +742,8 @@ void onig_free P_((OnigRegex)); ONIG_EXTERN void onig_free_body P_((OnigRegex)); ONIG_EXTERN +int onig_scan(regex_t* reg, const UChar* str, const UChar* end, OnigRegion* region, OnigOptionType option, int (*scan_callback)(int, int, OnigRegion*, void*), void* callback_arg); +ONIG_EXTERN int onig_search P_((OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegion* region, OnigOptionType option)); ONIG_EXTERN int onig_match P_((OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* at, OnigRegion* region, OnigOptionType option)); diff --git a/src/regcomp.c b/src/regcomp.c index 8b5b206..5c0f21f 100644 --- a/src/regcomp.c +++ b/src/regcomp.c @@ -89,8 +89,8 @@ swap_node(Node* a, Node* b) } } -static OnigDistance -distance_add(OnigDistance d1, OnigDistance d2) +static OnigLen +distance_add(OnigLen d1, OnigLen d2) { if (d1 == ONIG_INFINITE_DISTANCE || d2 == ONIG_INFINITE_DISTANCE) return ONIG_INFINITE_DISTANCE; @@ -100,8 +100,8 @@ 
distance_add(OnigDistance d1, OnigDistance d2) } } -static OnigDistance -distance_multiply(OnigDistance d, int m) +static OnigLen +distance_multiply(OnigLen d, int m) { if (m == 0) return 0; @@ -2021,245 +2021,6 @@ quantifiers_memory_node_info(Node* node) } #endif /* USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT */ -static int -get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env) -{ - OnigDistance tmin; - int r = 0; - - *min = 0; - switch (NTYPE(node)) { - case NT_BREF: - { - int i; - int* backs; - Node** nodes = SCANENV_MEM_NODES(env); - BRefNode* br = NBREF(node); - if (br->state & NST_RECURSION) break; - - backs = BACKREFS_P(br); - if (backs[0] > env->num_mem) return ONIGERR_INVALID_BACKREF; - r = get_min_match_length(nodes[backs[0]], min, env); - if (r != 0) break; - for (i = 1; i < br->back_num; i++) { - if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; - r = get_min_match_length(nodes[backs[i]], &tmin, env); - if (r != 0) break; - if (*min > tmin) *min = tmin; - } - } - break; - -#ifdef USE_SUBEXP_CALL - case NT_CALL: - if (IS_CALL_RECURSION(NCALL(node))) { - EncloseNode* en = NENCLOSE(NCALL(node)->target); - if (IS_ENCLOSE_MIN_FIXED(en)) - *min = en->min_len; - } - else - r = get_min_match_length(NCALL(node)->target, min, env); - break; -#endif - - case NT_LIST: - do { - r = get_min_match_length(NCAR(node), &tmin, env); - if (r == 0) *min += tmin; - } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); - break; - - case NT_ALT: - { - Node *x, *y; - y = node; - do { - x = NCAR(y); - r = get_min_match_length(x, &tmin, env); - if (r != 0) break; - if (y == node) *min = tmin; - else if (*min > tmin) *min = tmin; - } while (r == 0 && IS_NOT_NULL(y = NCDR(y))); - } - break; - - case NT_STR: - { - StrNode* sn = NSTR(node); - *min = sn->end - sn->s; - } - break; - - case NT_CTYPE: - *min = 1; - break; - - case NT_CCLASS: - case NT_CANY: - *min = 1; - break; - - case NT_QTFR: - { - QtfrNode* qn = NQTFR(node); - - if (qn->lower > 0) { - r = 
get_min_match_length(qn->target, min, env); - if (r == 0) - *min = distance_multiply(*min, qn->lower); - } - } - break; - - case NT_ENCLOSE: - { - EncloseNode* en = NENCLOSE(node); - switch (en->type) { - case ENCLOSE_MEMORY: -#ifdef USE_SUBEXP_CALL - if (IS_ENCLOSE_MIN_FIXED(en)) - *min = en->min_len; - else { - r = get_min_match_length(en->target, min, env); - if (r == 0) { - en->min_len = *min; - SET_ENCLOSE_STATUS(node, NST_MIN_FIXED); - } - } - break; -#endif - case ENCLOSE_OPTION: - case ENCLOSE_STOP_BACKTRACK: - r = get_min_match_length(en->target, min, env); - break; - } - } - break; - - case NT_ANCHOR: - default: - break; - } - - return r; -} - -static int -get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env) -{ - OnigDistance tmax; - int r = 0; - - *max = 0; - switch (NTYPE(node)) { - case NT_LIST: - do { - r = get_max_match_length(NCAR(node), &tmax, env); - if (r == 0) - *max = distance_add(*max, tmax); - } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); - break; - - case NT_ALT: - do { - r = get_max_match_length(NCAR(node), &tmax, env); - if (r == 0 && *max < tmax) *max = tmax; - } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); - break; - - case NT_STR: - { - StrNode* sn = NSTR(node); - *max = sn->end - sn->s; - } - break; - - case NT_CTYPE: - *max = ONIGENC_MBC_MAXLEN_DIST(env->enc); - break; - - case NT_CCLASS: - case NT_CANY: - *max = ONIGENC_MBC_MAXLEN_DIST(env->enc); - break; - - case NT_BREF: - { - int i; - int* backs; - Node** nodes = SCANENV_MEM_NODES(env); - BRefNode* br = NBREF(node); - if (br->state & NST_RECURSION) { - *max = ONIG_INFINITE_DISTANCE; - break; - } - backs = BACKREFS_P(br); - for (i = 0; i < br->back_num; i++) { - if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; - r = get_max_match_length(nodes[backs[i]], &tmax, env); - if (r != 0) break; - if (*max < tmax) *max = tmax; - } - } - break; - -#ifdef USE_SUBEXP_CALL - case NT_CALL: - if (! 
IS_CALL_RECURSION(NCALL(node))) - r = get_max_match_length(NCALL(node)->target, max, env); - else - *max = ONIG_INFINITE_DISTANCE; - break; -#endif - - case NT_QTFR: - { - QtfrNode* qn = NQTFR(node); - - if (qn->upper != 0) { - r = get_max_match_length(qn->target, max, env); - if (r == 0 && *max != 0) { - if (! IS_REPEAT_INFINITE(qn->upper)) - *max = distance_multiply(*max, qn->upper); - else - *max = ONIG_INFINITE_DISTANCE; - } - } - } - break; - - case NT_ENCLOSE: - { - EncloseNode* en = NENCLOSE(node); - switch (en->type) { - case ENCLOSE_MEMORY: -#ifdef USE_SUBEXP_CALL - if (IS_ENCLOSE_MAX_FIXED(en)) - *max = en->max_len; - else { - r = get_max_match_length(en->target, max, env); - if (r == 0) { - en->max_len = *max; - SET_ENCLOSE_STATUS(node, NST_MAX_FIXED); - } - } - break; -#endif - case ENCLOSE_OPTION: - case ENCLOSE_STOP_BACKTRACK: - r = get_max_match_length(en->target, max, env); - break; - } - } - break; - - case NT_ANCHOR: - default: - break; - } - - return r; -} #define GET_CHAR_LEN_VARLEN -1 #define GET_CHAR_LEN_TOP_ALT_VARLEN -2 @@ -2706,6 +2467,257 @@ check_type_tree(Node* node, int type_mask, int enclose_mask, int anchor_mask) return r; } +static int +get_min_len(Node* node, OnigLen *min, ScanEnv* env) +{ + OnigLen tmin; + int r = 0; + + *min = 0; + switch (NTYPE(node)) { + case NT_BREF: + { + int i; + int* backs; + Node** nodes = SCANENV_MEM_NODES(env); + BRefNode* br = NBREF(node); + if (br->state & NST_RECURSION) break; + + backs = BACKREFS_P(br); + if (backs[0] > env->num_mem) return ONIGERR_INVALID_BACKREF; + r = get_min_len(nodes[backs[0]], min, env); + if (r != 0) break; + for (i = 1; i < br->back_num; i++) { + if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; + r = get_min_len(nodes[backs[i]], &tmin, env); + if (r != 0) break; + if (*min > tmin) *min = tmin; + } + } + break; + +#ifdef USE_SUBEXP_CALL + case NT_CALL: + if (IS_CALL_RECURSION(NCALL(node))) { + EncloseNode* en = NENCLOSE(NCALL(node)->target); + if 
(IS_ENCLOSE_MIN_FIXED(en)) + *min = en->min_len; + } + else + r = get_min_len(NCALL(node)->target, min, env); + break; +#endif + + case NT_LIST: + do { + r = get_min_len(NCAR(node), &tmin, env); + if (r == 0) *min += tmin; + } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); + break; + + case NT_ALT: + { + Node *x, *y; + y = node; + do { + x = NCAR(y); + r = get_min_len(x, &tmin, env); + if (r != 0) break; + if (y == node) *min = tmin; + else if (*min > tmin) *min = tmin; + } while (r == 0 && IS_NOT_NULL(y = NCDR(y))); + } + break; + + case NT_STR: + { + StrNode* sn = NSTR(node); + *min = sn->end - sn->s; + } + break; + + case NT_CTYPE: + *min = 1; + break; + + case NT_CCLASS: + case NT_CANY: + *min = 1; + break; + + case NT_QTFR: + { + QtfrNode* qn = NQTFR(node); + + if (qn->lower > 0) { + r = get_min_len(qn->target, min, env); + if (r == 0) + *min = distance_multiply(*min, qn->lower); + } + } + break; + + case NT_ENCLOSE: + { + EncloseNode* en = NENCLOSE(node); + switch (en->type) { + case ENCLOSE_MEMORY: + if (IS_ENCLOSE_MIN_FIXED(en)) + *min = en->min_len; + else { + if (IS_ENCLOSE_MARK1(NENCLOSE(node))) + *min = 0; // recursive + else { + SET_ENCLOSE_STATUS(node, NST_MARK1); + r = get_min_len(en->target, min, env); + CLEAR_ENCLOSE_STATUS(node, NST_MARK1); + if (r == 0) { + en->min_len = *min; + SET_ENCLOSE_STATUS(node, NST_MIN_FIXED); + } + } + } + break; + + case ENCLOSE_OPTION: + case ENCLOSE_STOP_BACKTRACK: + r = get_min_len(en->target, min, env); + break; + } + } + break; + + case NT_ANCHOR: + default: + break; + } + + return r; +} + +static int +get_max_len(Node* node, OnigLen *max, ScanEnv* env) +{ + OnigLen tmax; + int r = 0; + + *max = 0; + switch (NTYPE(node)) { + case NT_LIST: + do { + r = get_max_len(NCAR(node), &tmax, env); + if (r == 0) + *max = distance_add(*max, tmax); + } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); + break; + + case NT_ALT: + do { + r = get_max_len(NCAR(node), &tmax, env); + if (r == 0 && *max < tmax) *max = tmax; + } 
while (r == 0 && IS_NOT_NULL(node = NCDR(node))); + break; + + case NT_STR: + { + StrNode* sn = NSTR(node); + *max = sn->end - sn->s; + } + break; + + case NT_CTYPE: + *max = ONIGENC_MBC_MAXLEN_DIST(env->enc); + break; + + case NT_CCLASS: + case NT_CANY: + *max = ONIGENC_MBC_MAXLEN_DIST(env->enc); + break; + + case NT_BREF: + { + int i; + int* backs; + Node** nodes = SCANENV_MEM_NODES(env); + BRefNode* br = NBREF(node); + if (br->state & NST_RECURSION) { + *max = ONIG_INFINITE_DISTANCE; + break; + } + backs = BACKREFS_P(br); + for (i = 0; i < br->back_num; i++) { + if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; + r = get_max_len(nodes[backs[i]], &tmax, env); + if (r != 0) break; + if (*max < tmax) *max = tmax; + } + } + break; + +#ifdef USE_SUBEXP_CALL + case NT_CALL: + if (! IS_CALL_RECURSION(NCALL(node))) + r = get_max_len(NCALL(node)->target, max, env); + else + *max = ONIG_INFINITE_DISTANCE; + break; +#endif + + case NT_QTFR: + { + QtfrNode* qn = NQTFR(node); + + if (qn->upper != 0) { + r = get_max_len(qn->target, max, env); + if (r == 0 && *max != 0) { + if (! 
IS_REPEAT_INFINITE(qn->upper)) + *max = distance_multiply(*max, qn->upper); + else + *max = ONIG_INFINITE_DISTANCE; + } + } + } + break; + + case NT_ENCLOSE: + { + EncloseNode* en = NENCLOSE(node); + switch (en->type) { + case ENCLOSE_MEMORY: + if (IS_ENCLOSE_MAX_FIXED(en)) + *max = en->max_len; + else { + if (IS_ENCLOSE_MARK1(NENCLOSE(node))) + *max = ONIG_INFINITE_DISTANCE; + else { + SET_ENCLOSE_STATUS(node, NST_MARK1); + r = get_max_len(en->target, max, env); + CLEAR_ENCLOSE_STATUS(node, NST_MARK1); + if (r == 0) { + en->max_len = *max; + SET_ENCLOSE_STATUS(node, NST_MAX_FIXED); + } + } + } + break; + + case ENCLOSE_OPTION: + case ENCLOSE_STOP_BACKTRACK: + r = get_max_len(en->target, max, env); + break; + } + } + break; + + case NT_ANCHOR: + default: + break; + } + + return r; +} + + #ifdef USE_SUBEXP_CALL #define RECURSION_EXIST 1 @@ -2722,7 +2734,7 @@ subexp_inf_recursive_check(Node* node, ScanEnv* env, int head) case NT_LIST: { Node *x; - OnigDistance min; + OnigLen min; int ret; x = node; @@ -2731,7 +2743,7 @@ subexp_inf_recursive_check(Node* node, ScanEnv* env, int head) if (ret < 0 || ret == RECURSION_INFINITE) return ret; r |= ret; if (head) { - ret = get_min_match_length(NCAR(x), &min, env); + ret = get_min_len(NCAR(x), &min, env); if (ret != 0) return ret; if (min != 0) head = 0; } @@ -3094,6 +3106,8 @@ divide_look_behind_alternatives(Node* node) AnchorNode* an = NANCHOR(node); int anc_type = an->type; + /* fprintf(stderr, "divide_look_behind: %d\n", (int )node); */ + head = an->target; np = NCAR(head); swap_node(node, head); @@ -3123,6 +3137,8 @@ setup_look_behind(Node* node, regex_t* reg, ScanEnv* env) int r, len; AnchorNode* an = NANCHOR(node); + /* fprintf(stderr, "setup_look_behind: %x\n", (int )node); */ + r = get_char_length_tree(an->target, reg, &len); if (r == 0) an->char_len = len; @@ -3719,7 +3735,7 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) case NT_QTFR: { - OnigDistance d; + OnigLen d; QtfrNode* qn = NQTFR(node); 
Node* target = qn->target; @@ -3728,7 +3744,7 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) } if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) { - r = get_min_match_length(target, &d, env); + r = get_min_len(target, &d, env); if (r) break; if (d == 0) { qn->target_empty_info = NQ_TARGET_IS_EMPTY; @@ -3740,7 +3756,7 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) } #endif #if 0 - r = get_max_match_length(target, &d, env); + r = get_max_len(target, &d, env); if (r == 0 && d == 0) { /* ()* ==> ()?, ()+ ==> () */ qn->upper = 1; @@ -3855,8 +3871,8 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) ( BIT_NT_LIST | BIT_NT_ALT | BIT_NT_STR | BIT_NT_CCLASS | BIT_NT_CTYPE | \ BIT_NT_CANY | BIT_NT_ANCHOR | BIT_NT_ENCLOSE | BIT_NT_QTFR | BIT_NT_CALL ) -#define ALLOWED_ENCLOSE_IN_LB ( ENCLOSE_MEMORY ) -#define ALLOWED_ENCLOSE_IN_LB_NOT 0 +#define ALLOWED_ENCLOSE_IN_LB ( ENCLOSE_MEMORY | ENCLOSE_OPTION ) +#define ALLOWED_ENCLOSE_IN_LB_NOT ENCLOSE_OPTION #define ALLOWED_ANCHOR_IN_LB \ ( ANCHOR_LOOK_BEHIND | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION ) @@ -3869,9 +3885,9 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) ALLOWED_ENCLOSE_IN_LB, ALLOWED_ANCHOR_IN_LB); if (r < 0) return r; if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - r = setup_look_behind(node, reg, env); - if (r != 0) return r; r = setup_tree(an->target, reg, state, env); + if (r != 0) return r; + r = setup_look_behind(node, reg, env); } break; @@ -3881,9 +3897,9 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) ALLOWED_ENCLOSE_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT); if (r < 0) return r; if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - r = setup_look_behind(node, reg, env); - if (r != 0) return r; r = setup_tree(an->target, reg, (state | IN_NOT), env); + if (r != 0) return r; + r = setup_look_behind(node, reg, env); } break; } @@ -3927,8 +3943,8 @@ set_bm_skip(UChar* s, UChar* end, 
OnigEncoding enc ARG_UNUSED, #define OPT_EXACT_MAXLEN 24 typedef struct { - OnigDistance min; /* min byte length */ - OnigDistance max; /* max byte length */ + OnigLen min; /* min byte length */ + OnigLen max; /* max byte length */ } MinMaxLen; typedef struct { @@ -4052,7 +4068,7 @@ is_equal_mml(MinMaxLen* a, MinMaxLen* b) static void -set_mml(MinMaxLen* mml, OnigDistance min, OnigDistance max) +set_mml(MinMaxLen* mml, OnigLen min, OnigLen max) { mml->min = min; mml->max = max; @@ -4080,7 +4096,7 @@ add_mml(MinMaxLen* to, MinMaxLen* from) #if 0 static void -add_len_mml(MinMaxLen* to, OnigDistance len) +add_len_mml(MinMaxLen* to, OnigLen len) { to->min = distance_add(to->min, len); to->max = distance_add(to->max, len); @@ -4115,7 +4131,7 @@ copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from) static void concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right, - OnigDistance left_len, OnigDistance right_len) + OnigLen left_len, OnigLen right_len) { clear_opt_anc_info(to); @@ -4628,8 +4644,8 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) /* no need to check ignore case. 
(setted in setup_tree()) */ if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) { - OnigDistance min = ONIGENC_MBC_MINLEN(env->enc); - OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc); + OnigLen min = ONIGENC_MBC_MINLEN(env->enc); + OnigLen max = ONIGENC_MBC_MAXLEN_DIST(env->enc); set_mml(&opt->len, min, max); } @@ -4682,8 +4698,8 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) case NT_CANY: { - OnigDistance min = ONIGENC_MBC_MINLEN(env->enc); - OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc); + OnigLen min = ONIGENC_MBC_MINLEN(env->enc); + OnigLen max = ONIGENC_MBC_MAXLEN_DIST(env->enc); set_mml(&opt->len, min, max); } break; @@ -4729,7 +4745,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) { int i; int* backs; - OnigDistance min, max, tmin, tmax; + OnigLen min, max, tmin, tmax; Node** nodes = SCANENV_MEM_NODES(env->scan_env); BRefNode* br = NBREF(node); @@ -4738,14 +4754,14 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) break; } backs = BACKREFS_P(br); - r = get_min_match_length(nodes[backs[0]], &min, env->scan_env); + r = get_min_len(nodes[backs[0]], &min, env->scan_env); if (r != 0) break; - r = get_max_match_length(nodes[backs[0]], &max, env->scan_env); + r = get_max_len(nodes[backs[0]], &max, env->scan_env); if (r != 0) break; for (i = 1; i < br->back_num; i++) { - r = get_min_match_length(nodes[backs[i]], &tmin, env->scan_env); + r = get_min_len(nodes[backs[i]], &tmin, env->scan_env); if (r != 0) break; - r = get_max_match_length(nodes[backs[i]], &tmax, env->scan_env); + r = get_max_len(nodes[backs[i]], &tmax, env->scan_env); if (r != 0) break; if (min > tmin) min = tmin; if (max < tmax) max = tmax; @@ -4770,7 +4786,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) case NT_QTFR: { int i; - OnigDistance min, max; + OnigLen min, max; NodeOptInfo nopt; QtfrNode* qn = NQTFR(node); @@ -4839,7 +4855,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) #ifdef USE_SUBEXP_CALL 
en->opt_count++; if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) { - OnigDistance min, max; + OnigLen min, max; min = 0; max = ONIG_INFINITE_DISTANCE; @@ -5059,7 +5075,7 @@ static void print_enc_string(FILE* fp, OnigEncoding enc, } static void -print_distance_range(FILE* f, OnigDistance a, OnigDistance b) +print_distance_range(FILE* f, OnigLen a, OnigLen b) { if (a == ONIG_INFINITE_DISTANCE) fputs("inf", f); @@ -5147,7 +5163,7 @@ print_optimize_info(FILE* f, regex_t* reg) for (p = reg->exact; p < reg->exact_end; p++) { fputc(*p, f); } - fprintf(f, "]: length: %d\n", (reg->exact_end - reg->exact)); + fprintf(f, "]: length: %ld\n", (reg->exact_end - reg->exact)); } else if (reg->optimize & ONIG_OPTIMIZE_MAP) { int c, i, n = 0; @@ -5431,6 +5447,8 @@ onig_reg_init(regex_t* reg, OnigOptionType option, r = onig_initialize_encoding(enc); if (r != 0) return ONIGERR_FAIL_TO_INITIALIZE; + + onig_warning("You didn't call onig_initialize() explicitly"); #endif } @@ -5935,7 +5953,7 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp, GET_POINTER_INC(cc, bp); n = bitset_on_num(cc->bs); - fprintf(f, ":%u:%d", (unsigned int )cc, n); + fprintf(f, ":%p:%d", cc, n); } break; @@ -6067,9 +6085,9 @@ print_indent_tree(FILE* f, Node* node, int indent) case NT_LIST: case NT_ALT: if (NTYPE(node) == NT_LIST) - fprintf(f, "\n", (int )node); + fprintf(f, "\n", node); else - fprintf(f, "\n", (int )node); + fprintf(f, "\n", node); print_indent_tree(f, NCAR(node), indent + add); while (IS_NOT_NULL(node = NCDR(node))) { @@ -6082,8 +6100,8 @@ print_indent_tree(FILE* f, Node* node, int indent) break; case NT_STR: - fprintf(f, "", - (NSTRING_IS_RAW(node) ? "-raw" : ""), (int )node); + fprintf(f, "", + (NSTRING_IS_RAW(node) ? 
"-raw" : ""), node); for (p = NSTR(node)->s; p < NSTR(node)->end; p++) { if (*p >= 0x20 && *p < 0x7f) fputc(*p, f); @@ -6094,7 +6112,7 @@ print_indent_tree(FILE* f, Node* node, int indent) break; case NT_CCLASS: - fprintf(f, "", (int )node); + fprintf(f, "", node); if (IS_NCCLASS_NOT(NCCLASS(node))) fputs(" not", f); if (NCCLASS(node)->mbuf) { BBuf* bbuf = NCCLASS(node)->mbuf; @@ -6106,7 +6124,7 @@ print_indent_tree(FILE* f, Node* node, int indent) break; case NT_CTYPE: - fprintf(f, " ", (int )node); + fprintf(f, " ", node); switch (NCTYPE(node)->ctype) { case ONIGENC_CTYPE_WORD: if (NCTYPE(node)->not != 0) @@ -6122,11 +6140,11 @@ print_indent_tree(FILE* f, Node* node, int indent) break; case NT_CANY: - fprintf(f, "", (int )node); + fprintf(f, "", node); break; case NT_ANCHOR: - fprintf(f, " ", (int )node); + fprintf(f, " ", node); switch (NANCHOR(node)->type) { case ANCHOR_BEGIN_BUF: fputs("begin buf", f); break; case ANCHOR_END_BUF: fputs("end buf", f); break; @@ -6141,10 +6159,22 @@ print_indent_tree(FILE* f, Node* node, int indent) case ANCHOR_WORD_BEGIN: fputs("word begin", f); break; case ANCHOR_WORD_END: fputs("word end", f); break; #endif - case ANCHOR_PREC_READ: fputs("prec read", f); break; - case ANCHOR_PREC_READ_NOT: fputs("prec read not", f); break; - case ANCHOR_LOOK_BEHIND: fputs("look_behind", f); break; - case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); break; + case ANCHOR_PREC_READ: + fprintf(f, "prec read\n"); + print_indent_tree(f, NANCHOR(node)->target, indent + add); + break; + case ANCHOR_PREC_READ_NOT: + fprintf(f, "prec read not\n"); + print_indent_tree(f, NANCHOR(node)->target, indent + add); + break; + case ANCHOR_LOOK_BEHIND: + fprintf(f, "look behind\n"); + print_indent_tree(f, NANCHOR(node)->target, indent + add); + break; + case ANCHOR_LOOK_BEHIND_NOT: + fprintf(f, "look behind not\n"); + print_indent_tree(f, NANCHOR(node)->target, indent + add); + break; default: fprintf(f, "ERROR: undefined anchor type.\n"); @@ -6157,7 
+6187,7 @@ print_indent_tree(FILE* f, Node* node, int indent) int* p; BRefNode* br = NBREF(node); p = BACKREFS_P(br); - fprintf(f, "", (int )node); + fprintf(f, "", node); for (i = 0; i < br->back_num; i++) { if (i > 0) fputs(", ", f); fprintf(f, "%d", p[i]); @@ -6169,21 +6199,21 @@ print_indent_tree(FILE* f, Node* node, int indent) case NT_CALL: { CallNode* cn = NCALL(node); - fprintf(f, "", (int )node); + fprintf(f, "", node); p_string(f, cn->name_end - cn->name, cn->name); } break; #endif case NT_QTFR: - fprintf(f, "{%d,%d}%s\n", (int )node, + fprintf(f, "{%d,%d}%s\n", node, NQTFR(node)->lower, NQTFR(node)->upper, (NQTFR(node)->greedy ? "" : "?")); print_indent_tree(f, NQTFR(node)->target, indent + add); break; case NT_ENCLOSE: - fprintf(f, " ", (int )node); + fprintf(f, " ", node); switch (NENCLOSE(node)->type) { case ENCLOSE_OPTION: fprintf(f, "option:%d", NENCLOSE(node)->option); diff --git a/src/regenc.c b/src/regenc.c index 01bfd1d..554a622 100644 --- a/src/regenc.c +++ b/src/regenc.c @@ -2,7 +2,7 @@ regenc.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2007 K.Kosako + * Copyright (c) 2002-2016 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -108,6 +108,20 @@ onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, int n) return (UChar* )s; } +#if 0 +extern int +onigenc_mbc_enc_len_end(OnigEncoding enc, const UChar* p, const UChar* end) +{ + int len; + int n; + + len = ONIGENC_MBC_ENC_LEN(enc, p); + n = (int )(end - p); + + return (n < len ? 
n : len); +} +#endif + extern UChar* onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n) { @@ -649,6 +663,33 @@ onigenc_always_false_is_allowed_reverse_match(const UChar* s ARG_UNUSED, return FALSE; } +extern int +onigenc_always_true_is_valid_mbc_string(const UChar* s ARG_UNUSED, + const UChar* end ARG_UNUSED) +{ + return TRUE; +} + +extern int +onigenc_length_check_is_valid_mbc_string(OnigEncoding enc, + const UChar* p, const UChar* end) +{ + while (p < end) { + p += enclen(enc, p); + } + + if (p != end) + return FALSE; + else + return TRUE; +} + +extern int +onigenc_is_valid_mbc_string(OnigEncoding enc, const UChar* s, const UChar* end) +{ + return ONIGENC_IS_VALID_MBC_STRING(enc, s, end); +} + extern OnigCodePoint onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar* p, const UChar* end) { diff --git a/src/regenc.h b/src/regenc.h index 65eb17e..49227fa 100644 --- a/src/regenc.h +++ b/src/regenc.h @@ -71,7 +71,7 @@ typedef struct { #define ONIG_CHECK_NULL_RETURN(p) if (ONIG_IS_NULL(p)) return NULL #define ONIG_CHECK_NULL_RETURN_VAL(p,val) if (ONIG_IS_NULL(p)) return (val) -#define enclen(enc,p) ONIGENC_MBC_ENC_LEN(enc,p) +#define enclen(enc,p) ONIGENC_MBC_ENC_LEN(enc,p) /* character types bit flag */ #define BIT_CTYPE_NEWLINE (1<< ONIGENC_CTYPE_NEWLINE) @@ -133,6 +133,8 @@ ONIG_EXTERN int onigenc_single_byte_code_to_mbc P_((OnigCodePoint code, UChar *b ONIG_EXTERN UChar* onigenc_single_byte_left_adjust_char_head P_((const UChar* start, const UChar* s)); ONIG_EXTERN int onigenc_always_true_is_allowed_reverse_match P_((const UChar* s, const UChar* end)); ONIG_EXTERN int onigenc_always_false_is_allowed_reverse_match P_((const UChar* s, const UChar* end)); +ONIG_EXTERN int onigenc_always_true_is_valid_mbc_string P_((const UChar* s, const UChar* end)); +ONIG_EXTERN int onigenc_length_check_is_valid_mbc_string P_((OnigEncoding enc, const UChar* s, const UChar* end)); /* methods for multi byte encoding */ ONIG_EXTERN OnigCodePoint 
onigenc_mbn_mbc_to_code P_((OnigEncoding enc, const UChar* p, const UChar* end)); @@ -227,6 +229,7 @@ ONIG_EXTERN const UChar OnigEncAsciiToLowerCaseTable[]; ONIG_EXTERN const UChar OnigEncAsciiToUpperCaseTable[]; ONIG_EXTERN const unsigned short OnigEncAsciiCtypeTable[]; + #define ONIGENC_IS_ASCII_CODE(code) ((code) < 0x80) #define ONIGENC_ASCII_CODE_TO_LOWER_CASE(c) OnigEncAsciiToLowerCaseTable[c] #define ONIGENC_ASCII_CODE_TO_UPPER_CASE(c) OnigEncAsciiToUpperCaseTable[c] diff --git a/src/regerror.c b/src/regerror.c index 16009bb..05fc9d8 100644 --- a/src/regerror.c +++ b/src/regerror.c @@ -140,6 +140,8 @@ onig_error_code_to_format(int code) #endif case ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED: p = "numbered backref/call is not allowed. (use name)"; break; + case ONIGERR_TOO_MANY_CAPTURES: + p = "too many captures"; break; case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE: p = "too big wide-char value"; break; case ONIGERR_TOO_LONG_WIDE_CHAR_VALUE: diff --git a/src/regexec.c b/src/regexec.c index 2c768e1..70ac89e 100644 --- a/src/regexec.c +++ b/src/regexec.c @@ -2,7 +2,7 @@ regexec.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2008 K.Kosako + * Copyright (c) 2002-2016 K.Kosako * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -327,19 +327,21 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) #define STK_MASK_MEM_END_OR_MARK 0x8000 /* MEM_END or MEM_END_MARK */ #ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE -#define MATCH_ARG_INIT(msa, arg_option, arg_region, arg_start) do {\ +#define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start) do {\ (msa).stack_p = (void* )0;\ (msa).options = (arg_option);\ (msa).region = (arg_region);\ (msa).start = (arg_start);\ (msa).best_len = ONIG_MISMATCH;\ + (msa).ptr_num = (reg)->num_repeat + (reg)->num_mem * 2;\ } while(0) #else -#define MATCH_ARG_INIT(msa, arg_option, arg_region, arg_start) do {\ +#define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start) do {\ (msa).stack_p = (void* )0;\ (msa).options = (arg_option);\ (msa).region = (arg_region);\ (msa).start = (arg_start);\ + (msa).ptr_num = (reg)->num_repeat + (reg)->num_mem * 2;\ } while(0) #endif @@ -369,7 +371,7 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) (msa).state_check_buff = (void* )0;\ (msa).state_check_buff_size = 0;\ }\ - } while(0) +} while(0) #define MATCH_ARG_FREE(msa) do {\ if ((msa).stack_p) xfree((msa).stack_p);\ @@ -383,32 +385,59 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) #endif +#define ALLOCA_PTR_NUM_LIMIT 50 -#define STACK_INIT(alloc_addr, ptr_num, stack_num) do {\ +#define STACK_INIT(stack_num) do {\ if (msa->stack_p) {\ - alloc_addr = (char* )xalloca(sizeof(char*) * (ptr_num));\ - stk_alloc = (OnigStackType* )(msa->stack_p);\ - stk_base = stk_alloc;\ + is_alloca = 0;\ + alloc_base = msa->stack_p;\ + stk_base = (OnigStackType* )(alloc_base\ + + (sizeof(OnigStackIndex) * msa->ptr_num));\ stk = stk_base;\ stk_end = stk_base + msa->stack_n;\ }\ + else if (msa->ptr_num > ALLOCA_PTR_NUM_LIMIT) {\ + is_alloca = 0;\ + alloc_base = (char* )xmalloc(sizeof(OnigStackIndex) * msa->ptr_num\ + + sizeof(OnigStackType) * (stack_num));\ + stk_base = (OnigStackType* )(alloc_base\ + + 
(sizeof(OnigStackIndex) * msa->ptr_num));\ + stk = stk_base;\ + stk_end = stk_base + (stack_num);\ + }\ else {\ - alloc_addr = (char* )xalloca(sizeof(char*) * (ptr_num)\ - + sizeof(OnigStackType) * (stack_num));\ - stk_alloc = (OnigStackType* )(alloc_addr + sizeof(char*) * (ptr_num));\ - stk_base = stk_alloc;\ + is_alloca = 1;\ + alloc_base = (char* )xalloca(sizeof(OnigStackIndex) * msa->ptr_num\ + + sizeof(OnigStackType) * (stack_num));\ + stk_base = (OnigStackType* )(alloc_base\ + + (sizeof(OnigStackIndex) * msa->ptr_num));\ stk = stk_base;\ stk_end = stk_base + (stack_num);\ }\ -} while(0) +} while(0); + #define STACK_SAVE do{\ - if (stk_base != stk_alloc) {\ - msa->stack_p = stk_base;\ - msa->stack_n = stk_end - stk_base;\ + msa->stack_n = stk_end - stk_base;\ + if (is_alloca != 0) {\ + size_t size = sizeof(OnigStackIndex) * msa->ptr_num \ + + sizeof(OnigStackType) * msa->stack_n;\ + msa->stack_p = xmalloc(size);\ + xmemcpy(msa->stack_p, alloc_base, size);\ + }\ + else {\ + msa->stack_p = alloc_base;\ };\ } while(0) +#define UPDATE_FOR_STACK_REALLOC do{\ + repeat_stk = (OnigStackIndex* )alloc_base;\ + mem_start_stk = (OnigStackIndex* )(repeat_stk + reg->num_repeat);\ + mem_end_stk = mem_start_stk + num_mem;\ + mem_start_stk--; /* for index start from 1 */\ + mem_end_stk--; /* for index start from 1 */\ +} while(0) + static unsigned int MatchStackLimitSize = DEFAULT_MATCH_STACK_LIMIT_SIZE; extern unsigned int @@ -425,50 +454,65 @@ onig_set_match_stack_limit_size(unsigned int size) } static int -stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, - OnigStackType** arg_stk, OnigStackType* stk_alloc, OnigMatchArg* msa) +stack_double(int is_alloca, char** arg_alloc_base, + OnigStackType** arg_stk_base, + OnigStackType** arg_stk_end, OnigStackType** arg_stk, + OnigMatchArg* msa) { unsigned int n; - OnigStackType *x, *stk_base, *stk_end, *stk; + int used; + size_t size; + char* alloc_base; + char* new_alloc_base; + OnigStackType *stk_base, 
*stk_end, *stk; + alloc_base = *arg_alloc_base; stk_base = *arg_stk_base; stk_end = *arg_stk_end; stk = *arg_stk; n = stk_end - stk_base; - if (stk_base == stk_alloc && IS_NULL(msa->stack_p)) { - x = (OnigStackType* )xmalloc(sizeof(OnigStackType) * n * 2); - if (IS_NULL(x)) { + n *= 2; + size = sizeof(OnigStackIndex) * msa->ptr_num + sizeof(OnigStackType) * n; + if (is_alloca != 0) { + new_alloc_base = (char* )xmalloc(size); + if (IS_NULL(new_alloc_base)) { STACK_SAVE; return ONIGERR_MEMORY; } - xmemcpy(x, stk_base, n * sizeof(OnigStackType)); - n *= 2; + xmemcpy(new_alloc_base, alloc_base, size); } else { - n *= 2; if (MatchStackLimitSize != 0 && n > MatchStackLimitSize) { if ((unsigned int )(stk_end - stk_base) == MatchStackLimitSize) return ONIGERR_MATCH_STACK_LIMIT_OVER; else n = MatchStackLimitSize; } - x = (OnigStackType* )xrealloc(stk_base, sizeof(OnigStackType) * n); - if (IS_NULL(x)) { + new_alloc_base = (char* )xrealloc(alloc_base, size); + if (IS_NULL(new_alloc_base)) { STACK_SAVE; return ONIGERR_MEMORY; } } - *arg_stk = x + (stk - stk_base); - *arg_stk_base = x; - *arg_stk_end = x + n; + + alloc_base = new_alloc_base; + used = stk - stk_base; + *arg_alloc_base = alloc_base; + *arg_stk_base = (OnigStackType* )(alloc_base + + (sizeof(OnigStackIndex) * msa->ptr_num)); + *arg_stk = *arg_stk_base + used; + *arg_stk_end = *arg_stk_base + n; return 0; } #define STACK_ENSURE(n) do {\ if (stk_end - stk < (n)) {\ - int r = stack_double(&stk_base, &stk_end, &stk, stk_alloc, msa);\ + int r = stack_double(is_alloca, &alloc_base, &stk_base, &stk_end, &stk,\ + msa);\ if (r != 0) { STACK_SAVE; return r; } \ + is_alloca = 0;\ + UPDATE_FOR_STACK_REALLOC;\ }\ } while(0) @@ -1108,33 +1152,33 @@ static int backref_match_at_nested_level(regex_t* reg } else if (level == nest) { if (k->type == STK_MEM_START) { - if (mem_is_in_memp(k->u.mem.num, mem_num, memp)) { - pstart = k->u.mem.pstr; - if (pend != NULL_UCHARP) { - if (pend - pstart > send - *s) return 0; /* or goto 
next_mem; */ - p = pstart; - ss = *s; - - if (ignore_case != 0) { - if (string_cmp_ic(reg->enc, case_fold_flag, - pstart, &ss, (int )(pend - pstart)) == 0) - return 0; /* or goto next_mem; */ - } - else { - while (p < pend) { - if (*p++ != *ss++) return 0; /* or goto next_mem; */ - } - } - - *s = ss; - return 1; - } - } + if (mem_is_in_memp(k->u.mem.num, mem_num, memp)) { + pstart = k->u.mem.pstr; + if (pend != NULL_UCHARP) { + if (pend - pstart > send - *s) return 0; /* or goto next_mem; */ + p = pstart; + ss = *s; + + if (ignore_case != 0) { + if (string_cmp_ic(reg->enc, case_fold_flag, + pstart, &ss, (int )(pend - pstart)) == 0) + return 0; /* or goto next_mem; */ + } + else { + while (p < pend) { + if (*p++ != *ss++) return 0; /* or goto next_mem; */ + } + } + + *s = ss; + return 1; + } + } } else if (k->type == STK_MEM_END) { - if (mem_is_in_memp(k->u.mem.num, mem_num, memp)) { - pend = k->u.mem.pstr; - } + if (mem_is_in_memp(k->u.mem.num, mem_num, memp)) { + pend = k->u.mem.pstr; + } } } k--; @@ -1247,13 +1291,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, LengthType tlen, tlen2; MemNumType mem; RelAddrType addr; - OnigOptionType option = reg->options; - OnigEncoding encode = reg->enc; - OnigCaseFoldType case_fold_flag = reg->case_fold_flag; UChar *s, *q, *sbegin; - UChar *p = reg->p; - char *alloca_base; - OnigStackType *stk_alloc, *stk_base, *stk, *stk_end; + int is_alloca; + char *alloc_base; + OnigStackType *stk_base, *stk, *stk_end; OnigStackType *stkp; /* used as any purpose. 
*/ OnigStackIndex si; OnigStackIndex *repeat_stk; @@ -1263,19 +1304,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, unsigned char* state_check_buff = msa->state_check_buff; int num_comb_exp_check = reg->num_comb_exp_check; #endif - n = reg->num_repeat + reg->num_mem * 2; + UChar *p = reg->p; + OnigOptionType option = reg->options; + OnigEncoding encode = reg->enc; + OnigCaseFoldType case_fold_flag = reg->case_fold_flag; - STACK_INIT(alloca_base, n, INIT_MATCH_STACK_SIZE); + //n = reg->num_repeat + reg->num_mem * 2; pop_level = reg->stack_pop_level; num_mem = reg->num_mem; - repeat_stk = (OnigStackIndex* )alloca_base; - - mem_start_stk = (OnigStackIndex* )(repeat_stk + reg->num_repeat); - mem_end_stk = mem_start_stk + num_mem; - mem_start_stk--; /* for index start from 1, - mem_start_stk[1]..mem_start_stk[num_mem] */ - mem_end_stk--; /* for index start from 1, - mem_end_stk[1]..mem_end_stk[num_mem] */ + STACK_INIT(INIT_MATCH_STACK_SIZE); + UPDATE_FOR_STACK_REALLOC; for (i = 1; i <= num_mem; i++) { mem_start_stk[i] = mem_end_stk[i] = INVALID_STACK_INDEX; } @@ -1316,64 +1354,64 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_END: MOP_IN(OP_END); n = s - sstart; if (n > best_len) { - OnigRegion* region; + OnigRegion* region; #ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE - if (IS_FIND_LONGEST(option)) { - if (n > msa->best_len) { - msa->best_len = n; - msa->best_s = (UChar* )sstart; - } - else - goto end_best_len; + if (IS_FIND_LONGEST(option)) { + if (n > msa->best_len) { + msa->best_len = n; + msa->best_s = (UChar* )sstart; + } + else + goto end_best_len; } #endif - best_len = n; - region = msa->region; - if (region) { + best_len = n; + region = msa->region; + if (region) { #ifdef USE_POSIX_API_REGION_OPTION - if (IS_POSIX_REGION(msa->options)) { - posix_regmatch_t* rmt = (posix_regmatch_t* )region; - - rmt[0].rm_so = sstart - str; - rmt[0].rm_eo = s - str; - for (i = 1; i <= num_mem; i++) { - if (mem_end_stk[i] != 
INVALID_STACK_INDEX) { - if (BIT_STATUS_AT(reg->bt_mem_start, i)) - rmt[i].rm_so = STACK_AT(mem_start_stk[i])->u.mem.pstr - str; - else - rmt[i].rm_so = (UChar* )((void* )(mem_start_stk[i])) - str; - - rmt[i].rm_eo = (BIT_STATUS_AT(reg->bt_mem_end, i) - ? STACK_AT(mem_end_stk[i])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[i])) - str; - } - else { - rmt[i].rm_so = rmt[i].rm_eo = ONIG_REGION_NOTPOS; - } - } - } - else { + if (IS_POSIX_REGION(msa->options)) { + posix_regmatch_t* rmt = (posix_regmatch_t* )region; + + rmt[0].rm_so = sstart - str; + rmt[0].rm_eo = s - str; + for (i = 1; i <= num_mem; i++) { + if (mem_end_stk[i] != INVALID_STACK_INDEX) { + if (BIT_STATUS_AT(reg->bt_mem_start, i)) + rmt[i].rm_so = STACK_AT(mem_start_stk[i])->u.mem.pstr - str; + else + rmt[i].rm_so = (UChar* )((void* )(mem_start_stk[i])) - str; + + rmt[i].rm_eo = (BIT_STATUS_AT(reg->bt_mem_end, i) + ? STACK_AT(mem_end_stk[i])->u.mem.pstr + : (UChar* )((void* )mem_end_stk[i])) - str; + } + else { + rmt[i].rm_so = rmt[i].rm_eo = ONIG_REGION_NOTPOS; + } + } + } + else { #endif /* USE_POSIX_API_REGION_OPTION */ - region->beg[0] = sstart - str; - region->end[0] = s - str; - for (i = 1; i <= num_mem; i++) { - if (mem_end_stk[i] != INVALID_STACK_INDEX) { - if (BIT_STATUS_AT(reg->bt_mem_start, i)) - region->beg[i] = STACK_AT(mem_start_stk[i])->u.mem.pstr - str; - else - region->beg[i] = (UChar* )((void* )mem_start_stk[i]) - str; - - region->end[i] = (BIT_STATUS_AT(reg->bt_mem_end, i) - ? 
STACK_AT(mem_end_stk[i])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[i])) - str; - } - else { - region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; - } - } + region->beg[0] = sstart - str; + region->end[0] = s - str; + for (i = 1; i <= num_mem; i++) { + if (mem_end_stk[i] != INVALID_STACK_INDEX) { + if (BIT_STATUS_AT(reg->bt_mem_start, i)) + region->beg[i] = STACK_AT(mem_start_stk[i])->u.mem.pstr - str; + else + region->beg[i] = (UChar* )((void* )mem_start_stk[i]) - str; + + region->end[i] = (BIT_STATUS_AT(reg->bt_mem_end, i) + ? STACK_AT(mem_end_stk[i])->u.mem.pstr + : (UChar* )((void* )mem_end_stk[i])) - str; + } + else { + region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; + } + } #ifdef USE_CAPTURE_HISTORY - if (reg->capture_history != 0) { + if (reg->capture_history != 0) { int r; OnigCaptureTreeNode* node; @@ -1397,12 +1435,12 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, best_len = r; /* error code */ goto finish; } - } + } #endif /* USE_CAPTURE_HISTORY */ #ifdef USE_POSIX_API_REGION_OPTION - } /* else IS_POSIX_REGION() */ + } /* else IS_POSIX_REGION() */ #endif - } /* if (region) */ + } /* if (region) */ } /* n > best_len */ #ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE @@ -1411,13 +1449,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, MOP_OUT; if (IS_FIND_CONDITION(option)) { - if (IS_FIND_NOT_EMPTY(option) && s == sstart) { - best_len = ONIG_MISMATCH; - goto fail; /* for retry */ - } - if (IS_FIND_LONGEST(option) && DATA_ENSURE_CHECK1) { - goto fail; /* for retry */ - } + if (IS_FIND_NOT_EMPTY(option) && s == sstart) { + best_len = ONIG_MISMATCH; + goto fail; /* for retry */ + } + if (IS_FIND_LONGEST(option) && DATA_ENSURE_CHECK1) { + goto fail; /* for retry */ + } } /* default behavior: return first-matching result. 
*/ @@ -1438,22 +1476,22 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_EXACT1_IC: MOP_IN(OP_EXACT1_IC); { - int len; - UChar *q, lowbuf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; - - DATA_ENSURE(1); - len = ONIGENC_MBC_CASE_FOLD(encode, - /* DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag), */ - case_fold_flag, - &s, end, lowbuf); - DATA_ENSURE(0); - q = lowbuf; - while (len-- > 0) { - if (*p != *q) { + int len; + UChar *q, lowbuf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; + + DATA_ENSURE(1); + len = ONIGENC_MBC_CASE_FOLD(encode, + /* DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag), */ + case_fold_flag, + &s, end, lowbuf); + DATA_ENSURE(0); + q = lowbuf; + while (len-- > 0) { + if (*p != *q) { goto fail; } - p++; q++; - } + p++; q++; + } } MOP_OUT; break; @@ -1518,7 +1556,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, GET_LENGTH_INC(tlen, p); DATA_ENSURE(tlen); while (tlen-- > 0) { - if (*p++ != *s++) goto fail; + if (*p++ != *s++) goto fail; } sprev = s - 1; MOP_OUT; @@ -1527,26 +1565,26 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_EXACTN_IC: MOP_IN(OP_EXACTN_IC); { - int len; - UChar *q, *endp, lowbuf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; + int len; + UChar *q, *endp, lowbuf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; - GET_LENGTH_INC(tlen, p); - endp = p + tlen; + GET_LENGTH_INC(tlen, p); + endp = p + tlen; - while (p < endp) { - sprev = s; - DATA_ENSURE(1); - len = ONIGENC_MBC_CASE_FOLD(encode, - /* DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag), */ - case_fold_flag, - &s, end, lowbuf); - DATA_ENSURE(0); - q = lowbuf; - while (len-- > 0) { - if (*p != *q) goto fail; - p++; q++; - } - } + while (p < endp) { + sprev = s; + DATA_ENSURE(1); + len = ONIGENC_MBC_CASE_FOLD(encode, + /* DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag), */ + case_fold_flag, + &s, end, lowbuf); + DATA_ENSURE(0); + q = lowbuf; + while (len-- > 0) { + if (*p != *q) goto fail; + p++; q++; + } + } } MOP_OUT; @@ -1600,10 +1638,10 @@ match_at(regex_t* reg, const UChar* str, const 
UChar* end, GET_LENGTH_INC(tlen, p); DATA_ENSURE(tlen * 2); while (tlen-- > 0) { - if (*p != *s) goto fail; - p++; s++; - if (*p != *s) goto fail; - p++; s++; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; } sprev = s - 2; MOP_OUT; @@ -1614,12 +1652,12 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, GET_LENGTH_INC(tlen, p); DATA_ENSURE(tlen * 3); while (tlen-- > 0) { - if (*p != *s) goto fail; - p++; s++; - if (*p != *s) goto fail; - p++; s++; - if (*p != *s) goto fail; - p++; s++; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; } sprev = s - 3; MOP_OUT; @@ -1632,8 +1670,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, tlen2 *= tlen; DATA_ENSURE(tlen2); while (tlen2-- > 0) { - if (*p != *s) goto fail; - p++; s++; + if (*p != *s) goto fail; + p++; s++; } sprev = s - tlen; MOP_OUT; @@ -1654,23 +1692,23 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, cclass_mb: GET_LENGTH_INC(tlen, p); { - OnigCodePoint code; - UChar *ss; - int mb_len; + OnigCodePoint code; + UChar *ss; + int mb_len; - DATA_ENSURE(1); - mb_len = enclen(encode, s); - DATA_ENSURE(mb_len); - ss = s; - s += mb_len; - code = ONIGENC_MBC_TO_CODE(encode, ss, s); + DATA_ENSURE(1); + mb_len = enclen(encode, s); + DATA_ENSURE(mb_len); + ss = s; + s += mb_len; + code = ONIGENC_MBC_TO_CODE(encode, ss, s); #ifdef PLATFORM_UNALIGNED_WORD_ACCESS - if (! onig_is_in_code_range(p, code)) goto fail; + if (! onig_is_in_code_range(p, code)) goto fail; #else - q = p; - ALIGNMENT_RIGHT(q); - if (! onig_is_in_code_range(q, code)) goto fail; + q = p; + ALIGNMENT_RIGHT(q); + if (! 
onig_is_in_code_range(q, code)) goto fail; #endif } p += tlen; @@ -1680,17 +1718,17 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_CCLASS_MIX: MOP_IN(OP_CCLASS_MIX); DATA_ENSURE(1); if (ONIGENC_IS_MBC_HEAD(encode, s)) { - p += SIZE_BITSET; - goto cclass_mb; + p += SIZE_BITSET; + goto cclass_mb; } else { - if (BITSET_AT(((BitSetRef )p), *s) == 0) - goto fail; + if (BITSET_AT(((BitSetRef )p), *s) == 0) + goto fail; - p += SIZE_BITSET; - GET_LENGTH_INC(tlen, p); - p += tlen; - s++; + p += SIZE_BITSET; + GET_LENGTH_INC(tlen, p); + p += tlen; + s++; } MOP_OUT; break; @@ -1706,36 +1744,36 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_CCLASS_MB_NOT: MOP_IN(OP_CCLASS_MB_NOT); DATA_ENSURE(1); if (! ONIGENC_IS_MBC_HEAD(encode, s)) { - s++; - GET_LENGTH_INC(tlen, p); - p += tlen; - goto cc_mb_not_success; + s++; + GET_LENGTH_INC(tlen, p); + p += tlen; + goto cc_mb_not_success; } cclass_mb_not: GET_LENGTH_INC(tlen, p); { - OnigCodePoint code; - UChar *ss; - int mb_len = enclen(encode, s); + OnigCodePoint code; + UChar *ss; + int mb_len = enclen(encode, s); - if (! DATA_ENSURE_CHECK(mb_len)) { + if (! 
DATA_ENSURE_CHECK(mb_len)) { DATA_ENSURE(1); - s = (UChar* )end; - p += tlen; - goto cc_mb_not_success; - } + s = (UChar* )end; + p += tlen; + goto cc_mb_not_success; + } - ss = s; - s += mb_len; - code = ONIGENC_MBC_TO_CODE(encode, ss, s); + ss = s; + s += mb_len; + code = ONIGENC_MBC_TO_CODE(encode, ss, s); #ifdef PLATFORM_UNALIGNED_WORD_ACCESS - if (onig_is_in_code_range(p, code)) goto fail; + if (onig_is_in_code_range(p, code)) goto fail; #else - q = p; - ALIGNMENT_RIGHT(q); - if (onig_is_in_code_range(q, code)) goto fail; + q = p; + ALIGNMENT_RIGHT(q); + if (onig_is_in_code_range(q, code)) goto fail; #endif } p += tlen; @@ -1747,36 +1785,36 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_CCLASS_MIX_NOT: MOP_IN(OP_CCLASS_MIX_NOT); DATA_ENSURE(1); if (ONIGENC_IS_MBC_HEAD(encode, s)) { - p += SIZE_BITSET; - goto cclass_mb_not; + p += SIZE_BITSET; + goto cclass_mb_not; } else { - if (BITSET_AT(((BitSetRef )p), *s) != 0) - goto fail; + if (BITSET_AT(((BitSetRef )p), *s) != 0) + goto fail; - p += SIZE_BITSET; - GET_LENGTH_INC(tlen, p); - p += tlen; - s++; + p += SIZE_BITSET; + GET_LENGTH_INC(tlen, p); + p += tlen; + s++; } MOP_OUT; break; case OP_CCLASS_NODE: MOP_IN(OP_CCLASS_NODE); { - OnigCodePoint code; + OnigCodePoint code; void *node; int mb_len; UChar *ss; DATA_ENSURE(1); GET_POINTER_INC(node, p); - mb_len = enclen(encode, s); - ss = s; - s += mb_len; - DATA_ENSURE(0); - code = ONIGENC_MBC_TO_CODE(encode, ss, s); - if (onig_is_code_in_cc_len(mb_len, code, node) == 0) goto fail; + mb_len = enclen(encode, s); + ss = s; + s += mb_len; + DATA_ENSURE(0); + code = ONIGENC_MBC_TO_CODE(encode, ss, s); + if (onig_is_code_in_cc_len(mb_len, code, node) == 0) goto fail; } MOP_OUT; break; @@ -1800,8 +1838,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_ANYCHAR_STAR: MOP_IN(OP_ANYCHAR_STAR); while (DATA_ENSURE_CHECK1) { - STACK_PUSH_ALT(p, s, sprev); - n = enclen(encode, s); + STACK_PUSH_ALT(p, s, sprev); + n = enclen(encode, 
s); DATA_ENSURE(n); if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail; sprev = s; @@ -1812,27 +1850,27 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_ANYCHAR_ML_STAR: MOP_IN(OP_ANYCHAR_ML_STAR); while (DATA_ENSURE_CHECK1) { - STACK_PUSH_ALT(p, s, sprev); - n = enclen(encode, s); - if (n > 1) { - DATA_ENSURE(n); - sprev = s; - s += n; - } - else { - sprev = s; - s++; - } + STACK_PUSH_ALT(p, s, sprev); + n = enclen(encode, s); + if (n > 1) { + DATA_ENSURE(n); + sprev = s; + s += n; + } + else { + sprev = s; + s++; + } } MOP_OUT; break; case OP_ANYCHAR_STAR_PEEK_NEXT: MOP_IN(OP_ANYCHAR_STAR_PEEK_NEXT); while (DATA_ENSURE_CHECK1) { - if (*p == *s) { - STACK_PUSH_ALT(p + 1, s, sprev); - } - n = enclen(encode, s); + if (*p == *s) { + STACK_PUSH_ALT(p + 1, s, sprev); + } + n = enclen(encode, s); DATA_ENSURE(n); if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail; sprev = s; @@ -1844,19 +1882,19 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_ANYCHAR_ML_STAR_PEEK_NEXT:MOP_IN(OP_ANYCHAR_ML_STAR_PEEK_NEXT); while (DATA_ENSURE_CHECK1) { - if (*p == *s) { - STACK_PUSH_ALT(p + 1, s, sprev); - } - n = enclen(encode, s); - if (n > 1) { - DATA_ENSURE(n); - sprev = s; - s += n; - } - else { - sprev = s; - s++; - } + if (*p == *s) { + STACK_PUSH_ALT(p + 1, s, sprev); + } + n = enclen(encode, s); + if (n > 1) { + DATA_ENSURE(n); + sprev = s; + s += n; + } + else { + sprev = s; + s++; + } } p++; MOP_OUT; @@ -1866,11 +1904,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_STATE_CHECK_ANYCHAR_STAR: MOP_IN(OP_STATE_CHECK_ANYCHAR_STAR); GET_STATE_CHECK_NUM_INC(mem, p); while (DATA_ENSURE_CHECK1) { - STATE_CHECK_VAL(scv, mem); - if (scv) goto fail; + STATE_CHECK_VAL(scv, mem); + if (scv) goto fail; - STACK_PUSH_ALT_WITH_STATE_CHECK(p, s, sprev, mem); - n = enclen(encode, s); + STACK_PUSH_ALT_WITH_STATE_CHECK(p, s, sprev, mem); + n = enclen(encode, s); DATA_ENSURE(n); if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto 
fail; sprev = s; @@ -1884,20 +1922,20 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, GET_STATE_CHECK_NUM_INC(mem, p); while (DATA_ENSURE_CHECK1) { - STATE_CHECK_VAL(scv, mem); - if (scv) goto fail; - - STACK_PUSH_ALT_WITH_STATE_CHECK(p, s, sprev, mem); - n = enclen(encode, s); - if (n > 1) { - DATA_ENSURE(n); - sprev = s; - s += n; - } - else { - sprev = s; - s++; - } + STATE_CHECK_VAL(scv, mem); + if (scv) goto fail; + + STACK_PUSH_ALT_WITH_STATE_CHECK(p, s, sprev, mem); + n = enclen(encode, s); + if (n > 1) { + DATA_ENSURE(n); + sprev = s; + s += n; + } + else { + sprev = s; + s++; + } } MOP_OUT; break; @@ -1906,7 +1944,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_WORD: MOP_IN(OP_WORD); DATA_ENSURE(1); if (! ONIGENC_IS_MBC_WORD(encode, s, end)) - goto fail; + goto fail; s += enclen(encode, s); MOP_OUT; @@ -1915,7 +1953,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_NOT_WORD: MOP_IN(OP_NOT_WORD); DATA_ENSURE(1); if (ONIGENC_IS_MBC_WORD(encode, s, end)) - goto fail; + goto fail; s += enclen(encode, s); MOP_OUT; @@ -1923,18 +1961,18 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_WORD_BOUND: MOP_IN(OP_WORD_BOUND); if (ON_STR_BEGIN(s)) { - DATA_ENSURE(1); - if (! ONIGENC_IS_MBC_WORD(encode, s, end)) - goto fail; + DATA_ENSURE(1); + if (! ONIGENC_IS_MBC_WORD(encode, s, end)) + goto fail; } else if (ON_STR_END(s)) { - if (! ONIGENC_IS_MBC_WORD(encode, sprev, end)) - goto fail; + if (! 
ONIGENC_IS_MBC_WORD(encode, sprev, end)) + goto fail; } else { - if (ONIGENC_IS_MBC_WORD(encode, s, end) - == ONIGENC_IS_MBC_WORD(encode, sprev, end)) - goto fail; + if (ONIGENC_IS_MBC_WORD(encode, s, end) + == ONIGENC_IS_MBC_WORD(encode, sprev, end)) + goto fail; } MOP_OUT; continue; @@ -1942,17 +1980,17 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_NOT_WORD_BOUND: MOP_IN(OP_NOT_WORD_BOUND); if (ON_STR_BEGIN(s)) { - if (DATA_ENSURE_CHECK1 && ONIGENC_IS_MBC_WORD(encode, s, end)) - goto fail; + if (DATA_ENSURE_CHECK1 && ONIGENC_IS_MBC_WORD(encode, s, end)) + goto fail; } else if (ON_STR_END(s)) { - if (ONIGENC_IS_MBC_WORD(encode, sprev, end)) - goto fail; + if (ONIGENC_IS_MBC_WORD(encode, sprev, end)) + goto fail; } else { - if (ONIGENC_IS_MBC_WORD(encode, s, end) - != ONIGENC_IS_MBC_WORD(encode, sprev, end)) - goto fail; + if (ONIGENC_IS_MBC_WORD(encode, s, end) + != ONIGENC_IS_MBC_WORD(encode, sprev, end)) + goto fail; } MOP_OUT; continue; @@ -1961,20 +1999,20 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #ifdef USE_WORD_BEGIN_END case OP_WORD_BEGIN: MOP_IN(OP_WORD_BEGIN); if (DATA_ENSURE_CHECK1 && ONIGENC_IS_MBC_WORD(encode, s, end)) { - if (ON_STR_BEGIN(s) || !ONIGENC_IS_MBC_WORD(encode, sprev, end)) { - MOP_OUT; - continue; - } + if (ON_STR_BEGIN(s) || !ONIGENC_IS_MBC_WORD(encode, sprev, end)) { + MOP_OUT; + continue; + } } goto fail; break; case OP_WORD_END: MOP_IN(OP_WORD_END); if (!ON_STR_BEGIN(s) && ONIGENC_IS_MBC_WORD(encode, sprev, end)) { - if (ON_STR_END(s) || !ONIGENC_IS_MBC_WORD(encode, s, end)) { - MOP_OUT; - continue; - } + if (ON_STR_END(s) || !ONIGENC_IS_MBC_WORD(encode, s, end)) { + MOP_OUT; + continue; + } } goto fail; break; @@ -1996,13 +2034,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_BEGIN_LINE: MOP_IN(OP_BEGIN_LINE); if (ON_STR_BEGIN(s)) { - if (IS_NOTBOL(msa->options)) goto fail; - MOP_OUT; - continue; + if (IS_NOTBOL(msa->options)) goto fail; + MOP_OUT; + continue; } else 
if (ONIGENC_IS_MBC_NEWLINE(encode, sprev, end) && !ON_STR_END(s)) { - MOP_OUT; - continue; + MOP_OUT; + continue; } goto fail; break; @@ -2010,23 +2048,23 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_END_LINE: MOP_IN(OP_END_LINE); if (ON_STR_END(s)) { #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE - if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) { + if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) { #endif - if (IS_NOTEOL(msa->options)) goto fail; - MOP_OUT; - continue; + if (IS_NOTEOL(msa->options)) goto fail; + MOP_OUT; + continue; #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE - } + } #endif } else if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) { - MOP_OUT; - continue; + MOP_OUT; + continue; } #ifdef USE_CRNL_AS_LINE_TERMINATOR else if (ONIGENC_IS_MBC_CRNL(encode, s, end)) { - MOP_OUT; - continue; + MOP_OUT; + continue; } #endif goto fail; @@ -2035,24 +2073,24 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_SEMI_END_BUF: MOP_IN(OP_SEMI_END_BUF); if (ON_STR_END(s)) { #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE - if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) { + if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) { #endif - if (IS_NOTEOL(msa->options)) goto fail; - MOP_OUT; - continue; + if (IS_NOTEOL(msa->options)) goto fail; + MOP_OUT; + continue; #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE - } + } #endif } else if (ONIGENC_IS_MBC_NEWLINE(encode, s, end) && - ON_STR_END(s + enclen(encode, s))) { - MOP_OUT; - continue; + ON_STR_END(s + enclen(encode, s))) { + MOP_OUT; + continue; } #ifdef USE_CRNL_AS_LINE_TERMINATOR else if (ONIGENC_IS_MBC_CRNL(encode, s, end)) { UChar* ss = s + enclen(encode, s); - ss += enclen(encode, ss); + ss += enclen(encode, ss); if (ON_STR_END(ss)) { MOP_OUT; continue; @@ -2064,7 +2102,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_BEGIN_POSITION: MOP_IN(OP_BEGIN_POSITION); if (s 
!= msa->start) - goto fail; + goto fail; MOP_OUT; continue; @@ -2114,9 +2152,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, STACK_GET_MEM_START(mem, stkp); if (BIT_STATUS_AT(reg->bt_mem_start, mem)) - mem_start_stk[mem] = GET_STACK_INDEX(stkp); + mem_start_stk[mem] = GET_STACK_INDEX(stkp); else - mem_start_stk[mem] = (OnigStackIndex )((void* )stkp->u.mem.pstr); + mem_start_stk[mem] = (OnigStackIndex )((void* )stkp->u.mem.pstr); STACK_PUSH_MEM_END_MARK(mem); MOP_OUT; @@ -2138,171 +2176,170 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, GET_MEMNUM_INC(mem, p); backref: { - int len; - UChar *pstart, *pend; - - /* if you want to remove following line, - you should check in parse and compile time. */ - if (mem > num_mem) goto fail; - if (mem_end_stk[mem] == INVALID_STACK_INDEX) goto fail; - if (mem_start_stk[mem] == INVALID_STACK_INDEX) goto fail; - - if (BIT_STATUS_AT(reg->bt_mem_start, mem)) - pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; - else - pstart = (UChar* )((void* )mem_start_stk[mem]); - - pend = (BIT_STATUS_AT(reg->bt_mem_end, mem) - ? STACK_AT(mem_end_stk[mem])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[mem])); - n = pend - pstart; - DATA_ENSURE(n); - sprev = s; - STRING_CMP(pstart, s, n); - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; + int len; + UChar *pstart, *pend; + + /* if you want to remove following line, + you should check in parse and compile time. */ + if (mem > num_mem) goto fail; + if (mem_end_stk[mem] == INVALID_STACK_INDEX) goto fail; + if (mem_start_stk[mem] == INVALID_STACK_INDEX) goto fail; + + if (BIT_STATUS_AT(reg->bt_mem_start, mem)) + pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; + else + pstart = (UChar* )((void* )mem_start_stk[mem]); + + pend = (BIT_STATUS_AT(reg->bt_mem_end, mem) + ? 
STACK_AT(mem_end_stk[mem])->u.mem.pstr + : (UChar* )((void* )mem_end_stk[mem])); + n = pend - pstart; + DATA_ENSURE(n); + sprev = s; + STRING_CMP(pstart, s, n); + while (sprev + (len = enclen(encode, sprev)) < s) + sprev += len; - MOP_OUT; - continue; + MOP_OUT; + continue; } break; case OP_BACKREFN_IC: MOP_IN(OP_BACKREFN_IC); GET_MEMNUM_INC(mem, p); { - int len; - UChar *pstart, *pend; - - /* if you want to remove following line, - you should check in parse and compile time. */ - if (mem > num_mem) goto fail; - if (mem_end_stk[mem] == INVALID_STACK_INDEX) goto fail; - if (mem_start_stk[mem] == INVALID_STACK_INDEX) goto fail; - - if (BIT_STATUS_AT(reg->bt_mem_start, mem)) - pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; - else - pstart = (UChar* )((void* )mem_start_stk[mem]); - - pend = (BIT_STATUS_AT(reg->bt_mem_end, mem) - ? STACK_AT(mem_end_stk[mem])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[mem])); - n = pend - pstart; - DATA_ENSURE(n); - sprev = s; - STRING_CMP_IC(case_fold_flag, pstart, &s, n); - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; + int len; + UChar *pstart, *pend; + + /* if you want to remove following line, + you should check in parse and compile time. */ + if (mem > num_mem) goto fail; + if (mem_end_stk[mem] == INVALID_STACK_INDEX) goto fail; + if (mem_start_stk[mem] == INVALID_STACK_INDEX) goto fail; + + if (BIT_STATUS_AT(reg->bt_mem_start, mem)) + pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; + else + pstart = (UChar* )((void* )mem_start_stk[mem]); + + pend = (BIT_STATUS_AT(reg->bt_mem_end, mem) + ? 
STACK_AT(mem_end_stk[mem])->u.mem.pstr + : (UChar* )((void* )mem_end_stk[mem])); + n = pend - pstart; + DATA_ENSURE(n); + sprev = s; + STRING_CMP_IC(case_fold_flag, pstart, &s, n); + while (sprev + (len = enclen(encode, sprev)) < s) + sprev += len; - MOP_OUT; - continue; + MOP_OUT; + continue; } break; case OP_BACKREF_MULTI: MOP_IN(OP_BACKREF_MULTI); { - int len, is_fail; - UChar *pstart, *pend, *swork; - - GET_LENGTH_INC(tlen, p); - for (i = 0; i < tlen; i++) { - GET_MEMNUM_INC(mem, p); - - if (mem_end_stk[mem] == INVALID_STACK_INDEX) continue; - if (mem_start_stk[mem] == INVALID_STACK_INDEX) continue; - - if (BIT_STATUS_AT(reg->bt_mem_start, mem)) - pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; - else - pstart = (UChar* )((void* )mem_start_stk[mem]); - - pend = (BIT_STATUS_AT(reg->bt_mem_end, mem) - ? STACK_AT(mem_end_stk[mem])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[mem])); - n = pend - pstart; - DATA_ENSURE(n); - sprev = s; - swork = s; - STRING_CMP_VALUE(pstart, swork, n, is_fail); - if (is_fail) continue; - s = swork; - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; - - p += (SIZE_MEMNUM * (tlen - i - 1)); - break; /* success */ - } - if (i == tlen) goto fail; - MOP_OUT; - continue; + int len, is_fail; + UChar *pstart, *pend, *swork; + + GET_LENGTH_INC(tlen, p); + for (i = 0; i < tlen; i++) { + GET_MEMNUM_INC(mem, p); + + if (mem_end_stk[mem] == INVALID_STACK_INDEX) continue; + if (mem_start_stk[mem] == INVALID_STACK_INDEX) continue; + + if (BIT_STATUS_AT(reg->bt_mem_start, mem)) + pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; + else + pstart = (UChar* )((void* )mem_start_stk[mem]); + + pend = (BIT_STATUS_AT(reg->bt_mem_end, mem) + ? 
STACK_AT(mem_end_stk[mem])->u.mem.pstr + : (UChar* )((void* )mem_end_stk[mem])); + n = pend - pstart; + DATA_ENSURE(n); + sprev = s; + swork = s; + STRING_CMP_VALUE(pstart, swork, n, is_fail); + if (is_fail) continue; + s = swork; + while (sprev + (len = enclen(encode, sprev)) < s) + sprev += len; + + p += (SIZE_MEMNUM * (tlen - i - 1)); + break; /* success */ + } + if (i == tlen) goto fail; + MOP_OUT; + continue; } break; case OP_BACKREF_MULTI_IC: MOP_IN(OP_BACKREF_MULTI_IC); { - int len, is_fail; - UChar *pstart, *pend, *swork; - - GET_LENGTH_INC(tlen, p); - for (i = 0; i < tlen; i++) { - GET_MEMNUM_INC(mem, p); - - if (mem_end_stk[mem] == INVALID_STACK_INDEX) continue; - if (mem_start_stk[mem] == INVALID_STACK_INDEX) continue; - - if (BIT_STATUS_AT(reg->bt_mem_start, mem)) - pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; - else - pstart = (UChar* )((void* )mem_start_stk[mem]); - - pend = (BIT_STATUS_AT(reg->bt_mem_end, mem) - ? STACK_AT(mem_end_stk[mem])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[mem])); - n = pend - pstart; - DATA_ENSURE(n); - sprev = s; - swork = s; - STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail); - if (is_fail) continue; - s = swork; - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; - - p += (SIZE_MEMNUM * (tlen - i - 1)); - break; /* success */ - } - if (i == tlen) goto fail; - MOP_OUT; - continue; + int len, is_fail; + UChar *pstart, *pend, *swork; + + GET_LENGTH_INC(tlen, p); + for (i = 0; i < tlen; i++) { + GET_MEMNUM_INC(mem, p); + + if (mem_end_stk[mem] == INVALID_STACK_INDEX) continue; + if (mem_start_stk[mem] == INVALID_STACK_INDEX) continue; + + if (BIT_STATUS_AT(reg->bt_mem_start, mem)) + pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; + else + pstart = (UChar* )((void* )mem_start_stk[mem]); + + pend = (BIT_STATUS_AT(reg->bt_mem_end, mem) + ? 
STACK_AT(mem_end_stk[mem])->u.mem.pstr + : (UChar* )((void* )mem_end_stk[mem])); + n = pend - pstart; + DATA_ENSURE(n); + sprev = s; + swork = s; + STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail); + if (is_fail) continue; + s = swork; + while (sprev + (len = enclen(encode, sprev)) < s) + sprev += len; + + p += (SIZE_MEMNUM * (tlen - i - 1)); + break; /* success */ + } + if (i == tlen) goto fail; + MOP_OUT; + continue; } break; #ifdef USE_BACKREF_WITH_LEVEL case OP_BACKREF_WITH_LEVEL: { - int len; - OnigOptionType ic; - LengthType level; + int len; + OnigOptionType ic; + LengthType level; - GET_OPTION_INC(ic, p); - GET_LENGTH_INC(level, p); - GET_LENGTH_INC(tlen, p); + GET_OPTION_INC(ic, p); + GET_LENGTH_INC(level, p); + GET_LENGTH_INC(tlen, p); - sprev = s; - if (backref_match_at_nested_level(reg, stk, stk_base, ic - , case_fold_flag, (int )level, (int )tlen, p, &s, end)) { - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; + sprev = s; + if (backref_match_at_nested_level(reg, stk, stk_base, ic + , case_fold_flag, (int )level, (int )tlen, p, &s, end)) { + while (sprev + (len = enclen(encode, sprev)) < s) + sprev += len; - p += (SIZE_MEMNUM * tlen); - } - else - goto fail; + p += (SIZE_MEMNUM * tlen); + } + else + goto fail; - MOP_OUT; - continue; + MOP_OUT; + continue; } - break; #endif @@ -2331,33 +2368,33 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_NULL_CHECK_END: MOP_IN(OP_NULL_CHECK_END); { - int isnull; + int isnull; - GET_MEMNUM_INC(mem, p); /* mem: null check id */ - STACK_NULL_CHECK(isnull, mem, s); - if (isnull) { + GET_MEMNUM_INC(mem, p); /* mem: null check id */ + STACK_NULL_CHECK(isnull, mem, s); + if (isnull) { #ifdef ONIG_DEBUG_MATCH - fprintf(stderr, "NULL_CHECK_END: skip id:%d, s:%d\n", - (int )mem, (int )s); -#endif - null_check_found: - /* empty loop founded, skip next instruction */ - switch (*p++) { - case OP_JUMP: - case OP_PUSH: - p += SIZE_RELADDR; - break; - case OP_REPEAT_INC: - 
case OP_REPEAT_INC_NG: - case OP_REPEAT_INC_SG: - case OP_REPEAT_INC_NG_SG: - p += SIZE_MEMNUM; - break; - default: - goto unexpected_bytecode_error; - break; - } - } + fprintf(stderr, "NULL_CHECK_END: skip id:%d, s:%d\n", + (int )mem, (int )s); +#endif + null_check_found: + /* empty loop founded, skip next instruction */ + switch (*p++) { + case OP_JUMP: + case OP_PUSH: + p += SIZE_RELADDR; + break; + case OP_REPEAT_INC: + case OP_REPEAT_INC_NG: + case OP_REPEAT_INC_SG: + case OP_REPEAT_INC_NG_SG: + p += SIZE_MEMNUM; + break; + default: + goto unexpected_bytecode_error; + break; + } + } } MOP_OUT; continue; @@ -2366,18 +2403,18 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT case OP_NULL_CHECK_END_MEMST: MOP_IN(OP_NULL_CHECK_END_MEMST); { - int isnull; + int isnull; - GET_MEMNUM_INC(mem, p); /* mem: null check id */ - STACK_NULL_CHECK_MEMST(isnull, mem, s, reg); - if (isnull) { + GET_MEMNUM_INC(mem, p); /* mem: null check id */ + STACK_NULL_CHECK_MEMST(isnull, mem, s, reg); + if (isnull) { #ifdef ONIG_DEBUG_MATCH - fprintf(stderr, "NULL_CHECK_END_MEMST: skip id:%d, s:%d\n", - (int )mem, (int )s); + fprintf(stderr, "NULL_CHECK_END_MEMST: skip id:%d, s:%d\n", + (int )mem, (int )s); #endif - if (isnull == -1) goto fail; - goto null_check_found; - } + if (isnull == -1) goto fail; + goto null_check_found; + } } MOP_OUT; continue; @@ -2388,25 +2425,25 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_NULL_CHECK_END_MEMST_PUSH: MOP_IN(OP_NULL_CHECK_END_MEMST_PUSH); { - int isnull; + int isnull; - GET_MEMNUM_INC(mem, p); /* mem: null check id */ + GET_MEMNUM_INC(mem, p); /* mem: null check id */ #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT - STACK_NULL_CHECK_MEMST_REC(isnull, mem, s, reg); + STACK_NULL_CHECK_MEMST_REC(isnull, mem, s, reg); #else - STACK_NULL_CHECK_REC(isnull, mem, s); + STACK_NULL_CHECK_REC(isnull, mem, s); #endif - if (isnull) { + if (isnull) { #ifdef 
ONIG_DEBUG_MATCH - fprintf(stderr, "NULL_CHECK_END_MEMST_PUSH: skip id:%d, s:%d\n", - (int )mem, (int )s); + fprintf(stderr, "NULL_CHECK_END_MEMST_PUSH: skip id:%d, s:%d\n", + (int )mem, (int )s); #endif - if (isnull == -1) goto fail; - goto null_check_found; - } - else { - STACK_PUSH_NULL_CHECK_END(mem); - } + if (isnull == -1) goto fail; + goto null_check_found; + } + else { + STACK_PUSH_NULL_CHECK_END(mem); + } } MOP_OUT; continue; @@ -2445,10 +2482,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, GET_RELADDR_INC(addr, p); STATE_CHECK_VAL(scv, mem); if (scv) { - p += addr; + p += addr; } else { - STACK_PUSH_ALT_WITH_STATE_CHECK(p + addr, s, sprev, mem); + STACK_PUSH_ALT_WITH_STATE_CHECK(p + addr, s, sprev, mem); } MOP_OUT; continue; @@ -2474,10 +2511,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_PUSH_OR_JUMP_EXACT1: MOP_IN(OP_PUSH_OR_JUMP_EXACT1); GET_RELADDR_INC(addr, p); if (*p == *s && DATA_ENSURE_CHECK1) { - p++; - STACK_PUSH_ALT(p + addr, s, sprev); - MOP_OUT; - continue; + p++; + STACK_PUSH_ALT(p + addr, s, sprev); + MOP_OUT; + continue; } p += (addr + 1); MOP_OUT; @@ -2487,10 +2524,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_PUSH_IF_PEEK_NEXT: MOP_IN(OP_PUSH_IF_PEEK_NEXT); GET_RELADDR_INC(addr, p); if (*p == *s) { - p++; - STACK_PUSH_ALT(p + addr, s, sprev); - MOP_OUT; - continue; + p++; + STACK_PUSH_ALT(p + addr, s, sprev); + MOP_OUT; + continue; } p++; MOP_OUT; @@ -2499,16 +2536,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_REPEAT: MOP_IN(OP_REPEAT); { - GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ - GET_RELADDR_INC(addr, p); + GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ + GET_RELADDR_INC(addr, p); - STACK_ENSURE(1); - repeat_stk[mem] = GET_STACK_INDEX(stk); - STACK_PUSH_REPEAT(mem, p); + STACK_ENSURE(1); + repeat_stk[mem] = GET_STACK_INDEX(stk); + STACK_PUSH_REPEAT(mem, p); - if (reg->repeat_range[mem].lower == 0) { - STACK_PUSH_ALT(p + addr, s, 
sprev); - } + if (reg->repeat_range[mem].lower == 0) { + STACK_PUSH_ALT(p + addr, s, sprev); + } } MOP_OUT; continue; @@ -2516,17 +2553,17 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_REPEAT_NG: MOP_IN(OP_REPEAT_NG); { - GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ - GET_RELADDR_INC(addr, p); + GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ + GET_RELADDR_INC(addr, p); - STACK_ENSURE(1); - repeat_stk[mem] = GET_STACK_INDEX(stk); - STACK_PUSH_REPEAT(mem, p); + STACK_ENSURE(1); + repeat_stk[mem] = GET_STACK_INDEX(stk); + STACK_PUSH_REPEAT(mem, p); - if (reg->repeat_range[mem].lower == 0) { - STACK_PUSH_ALT(p, s, sprev); - p += addr; - } + if (reg->repeat_range[mem].lower == 0) { + STACK_PUSH_ALT(p, s, sprev); + p += addr; + } } MOP_OUT; continue; @@ -2604,9 +2641,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, case OP_POP_POS: MOP_IN(OP_POP_POS); { - STACK_POS_END(stkp); - s = stkp->u.state.pstr; - sprev = stkp->u.state.pstr_prev; + STACK_POS_END(stkp); + s = stkp->u.state.pstr; + sprev = stkp->u.state.pstr_prev; } MOP_OUT; continue; @@ -2650,15 +2687,15 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, GET_LENGTH_INC(tlen, p); q = (UChar* )ONIGENC_STEP_BACK(encode, str, s, (int )tlen); if (IS_NULL(q)) { - /* too short case -> success. ex. /(? success. ex. 
/(?map[*se]; t = s; @@ -2913,8 +2950,8 @@ bm_search_notrev(regex_t* reg, const UChar* target, const UChar* target_end, p = se = s + tlen1; t = tail; while (*p == *t) { - if (t == target) return (UChar* )s; - p--; t--; + if (t == target) return (UChar* )s; + p--; t--; } skip = reg->int_map[*se]; t = s; @@ -2945,8 +2982,8 @@ bm_search(regex_t* reg, const UChar* target, const UChar* target_end, p = s; t = tail; while (*p == *t) { - if (t == target) return (UChar* )p; - p--; t--; + if (t == target) return (UChar* )p; + p--; t--; } s += reg->map[*s]; } @@ -2956,8 +2993,8 @@ bm_search(regex_t* reg, const UChar* target, const UChar* target_end, p = s; t = tail; while (*p == *t) { - if (t == target) return (UChar* )p; - p--; t--; + if (t == target) return (UChar* )p; + p--; t--; } s += reg->int_map[*s]; } @@ -2965,6 +3002,7 @@ bm_search(regex_t* reg, const UChar* target, const UChar* target_end, return (UChar* )NULL; } +#ifdef USE_INT_MAP_BACKWARD static int set_bm_backward_skip(UChar* s, UChar* end, OnigEncoding enc ARG_UNUSED, int** skip) @@ -3015,6 +3053,7 @@ bm_search_backward(regex_t* reg, const UChar* target, const UChar* target_end, return (UChar* )NULL; } +#endif static UChar* map_search(OnigEncoding enc, UChar map[], @@ -3053,7 +3092,7 @@ onig_match(regex_t* reg, const UChar* str, const UChar* end, const UChar* at, On UChar *prev; OnigMatchArg msa; - MATCH_ARG_INIT(msa, option, region, at); + MATCH_ARG_INIT(msa, reg, option, region, at); #ifdef USE_COMBINATION_EXPLOSION_CHECK { int offset = at - str; @@ -3142,58 +3181,58 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, switch (reg->sub_anchor) { case ANCHOR_BEGIN_LINE: - if (!ON_STR_BEGIN(p)) { - prev = onigenc_get_prev_char_head(reg->enc, - (pprev ? pprev : str), p); - if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) - goto retry_gate; - } - break; + if (!ON_STR_BEGIN(p)) { + prev = onigenc_get_prev_char_head(reg->enc, + (pprev ? 
pprev : str), p); + if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) + goto retry_gate; + } + break; case ANCHOR_END_LINE: - if (ON_STR_END(p)) { + if (ON_STR_END(p)) { #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE - prev = (UChar* )onigenc_get_prev_char_head(reg->enc, - (pprev ? pprev : str), p); - if (prev && ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) - goto retry_gate; + prev = (UChar* )onigenc_get_prev_char_head(reg->enc, + (pprev ? pprev : str), p); + if (prev && ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) + goto retry_gate; #endif - } - else if (! ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end) + } + else if (! ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end) #ifdef USE_CRNL_AS_LINE_TERMINATOR - && ! ONIGENC_IS_MBC_CRNL(reg->enc, p, end) + && ! ONIGENC_IS_MBC_CRNL(reg->enc, p, end) #endif - ) - goto retry_gate; - break; + ) + goto retry_gate; + break; } } if (reg->dmax == 0) { *low = p; if (low_prev) { - if (*low > s) - *low_prev = onigenc_get_prev_char_head(reg->enc, s, p); - else - *low_prev = onigenc_get_prev_char_head(reg->enc, - (pprev ? pprev : str), p); + if (*low > s) + *low_prev = onigenc_get_prev_char_head(reg->enc, s, p); + else + *low_prev = onigenc_get_prev_char_head(reg->enc, + (pprev ? pprev : str), p); } } else { if (reg->dmax != ONIG_INFINITE_DISTANCE) { - *low = p - reg->dmax; - if (*low > s) { - *low = onigenc_get_right_adjust_char_head_with_prev(reg->enc, s, - *low, (const UChar** )low_prev); - if (low_prev && IS_NULL(*low_prev)) - *low_prev = onigenc_get_prev_char_head(reg->enc, - (pprev ? pprev : s), *low); - } - else { - if (low_prev) - *low_prev = onigenc_get_prev_char_head(reg->enc, - (pprev ? pprev : str), *low); - } + *low = p - reg->dmax; + if (*low > s) { + *low = onigenc_get_right_adjust_char_head_with_prev(reg->enc, s, + *low, (const UChar** )low_prev); + if (low_prev && IS_NULL(*low_prev)) + *low_prev = onigenc_get_prev_char_head(reg->enc, + (pprev ? 
pprev : s), *low); + } + else { + if (low_prev) + *low_prev = onigenc_get_prev_char_head(reg->enc, + (pprev ? pprev : str), *low); + } } } /* no needs to adjust *high, *high is used as range check only */ @@ -3210,8 +3249,6 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, return 0; /* fail */ } -static int set_bm_backward_skip P_((UChar* s, UChar* end, OnigEncoding enc, - int** skip)); #define BM_BACKWARD_SEARCH_LENGTH_THRESHOLD 100 @@ -3220,7 +3257,6 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, const UChar* range, UChar* adjrange, UChar** low, UChar** high) { - int r; UChar *p; range += reg->dmin; @@ -3242,16 +3278,22 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, case ONIG_OPTIMIZE_EXACT_BM: case ONIG_OPTIMIZE_EXACT_BM_NOT_REV: +#ifdef USE_INT_MAP_BACKWARD if (IS_NULL(reg->int_map_backward)) { + int r; + if (s - range < BM_BACKWARD_SEARCH_LENGTH_THRESHOLD) - goto exact_method; + goto exact_method; r = set_bm_backward_skip(reg->exact, reg->exact_end, reg->enc, - &(reg->int_map_backward)); + &(reg->int_map_backward)); if (r) return r; } p = bm_search_backward(reg, reg->exact, reg->exact_end, range, adjrange, - end, p); + end, p); +#else + goto exact_method; +#endif break; case ONIG_OPTIMIZE_MAP: @@ -3265,36 +3307,36 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, switch (reg->sub_anchor) { case ANCHOR_BEGIN_LINE: - if (!ON_STR_BEGIN(p)) { - prev = onigenc_get_prev_char_head(reg->enc, str, p); - if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) { - p = prev; - goto retry; - } - } - break; + if (!ON_STR_BEGIN(p)) { + prev = onigenc_get_prev_char_head(reg->enc, str, p); + if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) { + p = prev; + goto retry; + } + } + break; case ANCHOR_END_LINE: - if (ON_STR_END(p)) { + if (ON_STR_END(p)) { #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE - prev = onigenc_get_prev_char_head(reg->enc, adjrange, 
p); - if (IS_NULL(prev)) goto fail; - if (ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) { - p = prev; - goto retry; - } -#endif - } - else if (! ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end) + prev = onigenc_get_prev_char_head(reg->enc, adjrange, p); + if (IS_NULL(prev)) goto fail; + if (ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) { + p = prev; + goto retry; + } +#endif + } + else if (! ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end) #ifdef USE_CRNL_AS_LINE_TERMINATOR - && ! ONIGENC_IS_MBC_CRNL(reg->enc, p, end) + && ! ONIGENC_IS_MBC_CRNL(reg->enc, p, end) #endif - ) { - p = onigenc_get_prev_char_head(reg->enc, adjrange, p); - if (IS_NULL(p)) goto fail; - goto retry; - } - break; + ) { + p = onigenc_get_prev_char_head(reg->enc, adjrange, p); + if (IS_NULL(p)) goto fail; + goto retry; + } + break; } } @@ -3405,56 +3447,56 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, /* search start-position only */ begin_position: if (range > start) - range = start + 1; + range = start + 1; else - range = start; + range = start; } else if (reg->anchor & ANCHOR_BEGIN_BUF) { /* search str-position only */ if (range > start) { - if (start != str) goto mismatch_no_msa; - range = str + 1; + if (start != str) goto mismatch_no_msa; + range = str + 1; } else { - if (range <= str) { - start = str; - range = str; - } - else - goto mismatch_no_msa; + if (range <= str) { + start = str; + range = str; + } + else + goto mismatch_no_msa; } } else if (reg->anchor & ANCHOR_END_BUF) { min_semi_end = max_semi_end = (UChar* )end; end_buf: - if ((OnigDistance )(max_semi_end - str) < reg->anchor_dmin) + if ((OnigLen )(max_semi_end - str) < reg->anchor_dmin) goto mismatch_no_msa; if (range > start) { - if ((OnigDistance )(min_semi_end - start) > reg->anchor_dmax) { - start = min_semi_end - reg->anchor_dmax; - if (start < end) - start = onigenc_get_right_adjust_char_head(reg->enc, str, start); - else { /* match with empty at end */ - start = onigenc_get_prev_char_head(reg->enc, str, end); - } - } - 
if ((OnigDistance )(max_semi_end - (range - 1)) < reg->anchor_dmin) { - range = max_semi_end - reg->anchor_dmin + 1; - } - - if (start >= range) goto mismatch_no_msa; + if ((OnigLen )(min_semi_end - start) > reg->anchor_dmax) { + start = min_semi_end - reg->anchor_dmax; + if (start < end) + start = onigenc_get_right_adjust_char_head(reg->enc, str, start); + else { /* match with empty at end */ + start = onigenc_get_prev_char_head(reg->enc, str, end); + } + } + if ((OnigLen )(max_semi_end - (range - 1)) < reg->anchor_dmin) { + range = max_semi_end - reg->anchor_dmin + 1; + } + + if (start >= range) goto mismatch_no_msa; } else { - if ((OnigDistance )(min_semi_end - range) > reg->anchor_dmax) { - range = min_semi_end - reg->anchor_dmax; - } - if ((OnigDistance )(max_semi_end - start) < reg->anchor_dmin) { - start = max_semi_end - reg->anchor_dmin; - start = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, start); - } - if (range > start) goto mismatch_no_msa; + if ((OnigLen )(min_semi_end - range) > reg->anchor_dmax) { + range = min_semi_end - reg->anchor_dmax; + } + if ((OnigLen )(max_semi_end - start) < reg->anchor_dmin) { + start = max_semi_end - reg->anchor_dmin; + start = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, start); + } + if (range > start) goto mismatch_no_msa; } } else if (reg->anchor & ANCHOR_SEMI_END_BUF) { @@ -3462,22 +3504,22 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, max_semi_end = (UChar* )end; if (ONIGENC_IS_MBC_NEWLINE(reg->enc, pre_end, end)) { - min_semi_end = pre_end; + min_semi_end = pre_end; #ifdef USE_CRNL_AS_LINE_TERMINATOR - pre_end = ONIGENC_STEP_BACK(reg->enc, str, pre_end, 1); - if (IS_NOT_NULL(pre_end) && - ONIGENC_IS_MBC_CRNL(reg->enc, pre_end, end)) { - min_semi_end = pre_end; - } + pre_end = ONIGENC_STEP_BACK(reg->enc, str, pre_end, 1); + if (IS_NOT_NULL(pre_end) && + ONIGENC_IS_MBC_CRNL(reg->enc, pre_end, end)) { + min_semi_end = pre_end; + } #endif - if (min_semi_end > str && start <= min_semi_end) { - goto 
end_buf; - } + if (min_semi_end > str && start <= min_semi_end) { + goto end_buf; + } } else { - min_semi_end = (UChar* )end; - goto end_buf; + min_semi_end = (UChar* )end; + goto end_buf; } } else if ((reg->anchor & ANCHOR_ANYCHAR_STAR_ML)) { @@ -3496,7 +3538,7 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, s = (UChar* )start; prev = (UChar* )NULL; - MATCH_ARG_INIT(msa, option, region, start); + MATCH_ARG_INIT(msa, reg, option, region, start); #ifdef USE_COMBINATION_EXPLOSION_CHECK msa.state_check_buff = (void* )0; msa.state_check_buff_size = 0; /* NO NEED, for valgrind */ @@ -3512,7 +3554,7 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, (int )(end - str), (int )(start - str), (int )(range - str)); #endif - MATCH_ARG_INIT(msa, option, region, orig_start); + MATCH_ARG_INIT(msa, reg, option, region, orig_start); #ifdef USE_COMBINATION_EXPLOSION_CHECK { int offset = (MIN(start, range) - str); @@ -3532,36 +3574,36 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, sch_range = (UChar* )range; if (reg->dmax != 0) { - if (reg->dmax == ONIG_INFINITE_DISTANCE) - sch_range = (UChar* )end; - else { - sch_range += reg->dmax; - if (sch_range > end) sch_range = (UChar* )end; - } + if (reg->dmax == ONIG_INFINITE_DISTANCE) + sch_range = (UChar* )end; + else { + sch_range += reg->dmax; + if (sch_range > end) sch_range = (UChar* )end; + } } if ((end - start) < reg->threshold_len) goto mismatch; if (reg->dmax != ONIG_INFINITE_DISTANCE) { - do { - if (! forward_search_range(reg, str, end, s, sch_range, - &low, &high, &low_prev)) goto mismatch; - if (s < low) { - s = low; - prev = low_prev; - } - while (s <= high) { - MATCH_AND_RETURN_CHECK(orig_range); - prev = s; - s += enclen(reg->enc, s); - } - } while (s < range); - goto mismatch; + do { + if (! 
forward_search_range(reg, str, end, s, sch_range, + &low, &high, &low_prev)) goto mismatch; + if (s < low) { + s = low; + prev = low_prev; + } + while (s <= high) { + MATCH_AND_RETURN_CHECK(orig_range); + prev = s; + s += enclen(reg->enc, s); + } + } while (s < range); + goto mismatch; } else { /* check only. */ - if (! forward_search_range(reg, str, end, s, sch_range, - &low, &high, (UChar** )NULL)) goto mismatch; + if (! forward_search_range(reg, str, end, s, sch_range, + &low, &high, (UChar** )NULL)) goto mismatch; if ((reg->anchor & ANCHOR_ANYCHAR_STAR) != 0) { do { @@ -3599,47 +3641,47 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, UChar *low, *high, *adjrange, *sch_start; if (range < end) - adjrange = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, range); + adjrange = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, range); else - adjrange = (UChar* )end; + adjrange = (UChar* )end; if (reg->dmax != ONIG_INFINITE_DISTANCE && - (end - range) >= reg->threshold_len) { - do { - sch_start = s + reg->dmax; - if (sch_start > end) sch_start = (UChar* )end; - if (backward_search_range(reg, str, end, sch_start, range, adjrange, - &low, &high) <= 0) - goto mismatch; - - if (s > high) - s = high; - - while (s >= low) { - prev = onigenc_get_prev_char_head(reg->enc, str, s); - MATCH_AND_RETURN_CHECK(orig_start); - s = prev; - } - } while (s >= range); - goto mismatch; + (end - range) >= reg->threshold_len) { + do { + sch_start = s + reg->dmax; + if (sch_start > end) sch_start = (UChar* )end; + if (backward_search_range(reg, str, end, sch_start, range, adjrange, + &low, &high) <= 0) + goto mismatch; + + if (s > high) + s = high; + + while (s >= low) { + prev = onigenc_get_prev_char_head(reg->enc, str, s); + MATCH_AND_RETURN_CHECK(orig_start); + s = prev; + } + } while (s >= range); + goto mismatch; } else { /* check only. 
*/ - if ((end - range) < reg->threshold_len) goto mismatch; - - sch_start = s; - if (reg->dmax != 0) { - if (reg->dmax == ONIG_INFINITE_DISTANCE) - sch_start = (UChar* )end; - else { - sch_start += reg->dmax; - if (sch_start > end) sch_start = (UChar* )end; - else - sch_start = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, - start, sch_start); - } - } - if (backward_search_range(reg, str, end, sch_start, range, adjrange, - &low, &high) <= 0) goto mismatch; + if ((end - range) < reg->threshold_len) goto mismatch; + + sch_start = s; + if (reg->dmax != 0) { + if (reg->dmax == ONIG_INFINITE_DISTANCE) + sch_start = (UChar* )end; + else { + sch_start += reg->dmax; + if (sch_start > end) sch_start = (UChar* )end; + else + sch_start = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, + start, sch_start); + } + } + if (backward_search_range(reg, str, end, sch_start, range, adjrange, + &low, &high) <= 0) goto mismatch; } } @@ -3694,6 +3736,46 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, return s - str; } +extern int +onig_scan(regex_t* reg, const UChar* str, const UChar* end, + OnigRegion* region, OnigOptionType option, + int (*scan_callback)(int, int, OnigRegion*, void*), + void* callback_arg) +{ + int r; + int n; + int rs; + const UChar* start; + + n = 0; + start = str; + while (1) { + r = onig_search(reg, str, end, start, end, region, option); + if (r >= 0) { + rs = scan_callback(n, r, region, callback_arg); + n++; + if (rs != 0) + return rs; + + if (region->end[0] == start - str) + start++; + else + start = str + region->end[0]; + + if (start > end) + break; + } + else if (r == ONIG_MISMATCH) { + break; + } + else { /* error */ + return r; + } + } + + return n; +} + extern OnigEncoding onig_get_encoding(regex_t* reg) { diff --git a/src/regint.h b/src/regint.h index 5476626..d320e26 100644 --- a/src/regint.h +++ b/src/regint.h @@ -708,6 +708,7 @@ typedef struct { int stack_n; OnigOptionType options; OnigRegion* region; + int ptr_num; const UChar* start; /* search 
start position (for \G: BEGIN_POSITION) */ #ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE int best_len; /* for ONIG_OPTION_FIND_LONGEST */ @@ -750,6 +751,7 @@ extern void onig_print_statistics P_((FILE* f)); #endif #endif +extern void onig_warning(const char* s); extern UChar* onig_error_code_to_format P_((int code)); extern void onig_snprintf_with_pattern PV_((UChar buf[], int bufsize, OnigEncoding enc, UChar* pat, UChar* pat_end, const UChar *fmt, ...)); extern int onig_bbuf_init P_((BBuf* buf, int size)); diff --git a/src/regparse.c b/src/regparse.c index e06d9d2..6be8366 100644 --- a/src/regparse.c +++ b/src/regparse.c @@ -26,7 +26,6 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ - #include "regparse.h" #include "st.h" @@ -97,6 +96,14 @@ extern void onig_set_verb_warn_func(OnigWarnFunc f) onig_verb_warn = f; } +extern void +onig_warning(const char* s) +{ + if (onig_warn == onig_null_warn) return ; + + (*onig_warn)(s); +} + static void bbuf_free(BBuf* bbuf) { @@ -957,6 +964,9 @@ scan_env_add_mem_entry(ScanEnv* env) Node** p; need = env->num_mem + 1; + if (need > ONIG_MAX_CAPTURE_NUM) + return ONIGERR_TOO_MANY_CAPTURES; + if (need >= SCANENV_MEMNODES_SIZE) { if (env->mem_alloc <= need) { if (IS_NULL(env->mem_nodes_dynamic)) { @@ -1987,8 +1997,8 @@ or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) return 0; } -static int -conv_backslash_value(int c, ScanEnv* env) +static OnigCodePoint +conv_backslash_value(OnigCodePoint c, ScanEnv* env) { if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) { switch (c) { @@ -2259,7 +2269,7 @@ fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) if (p == prev) { if (non_low != 0) - goto invalid; + goto invalid; up = REPEAT_INFINITE; /* {n,} : {n,infinite} */ } } @@ -2291,15 +2301,17 @@ fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) return r; /* 0: normal {n,m}, 2: fixed {n} */ invalid: - if (syn_allow) + if 
(syn_allow) { + *src = p; return 1; /* OK */ + } else return ONIGERR_INVALID_REPEAT_RANGE_PATTERN; } /* \M-, \C-, \c, or \... */ static int -fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) +fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val) { int v; OnigCodePoint c; @@ -2318,9 +2330,8 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) if (PEND) return ONIGERR_END_PATTERN_AT_META; PFETCH_S(c); if (c == MC_ESC(env->syntax)) { - v = fetch_escaped_value(&p, end, env); + v = fetch_escaped_value(&p, end, env, &c); if (v < 0) return v; - c = (OnigCodePoint )v; } c = ((c & 0xff) | 0x80); } @@ -2348,9 +2359,8 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) } else { if (c == MC_ESC(env->syntax)) { - v = fetch_escaped_value(&p, end, env); + v = fetch_escaped_value(&p, end, env, &c); if (v < 0) return v; - c = (OnigCodePoint )v; } c &= 0x9f; } @@ -2367,7 +2377,8 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) } *src = p; - return c; + *val = c; + return 0; } static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env); @@ -2463,6 +2474,10 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, int level; int flag = (c == '-' ? -1 : 1); + if (PEND) { + r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + goto end; + } PFETCH(c); if (! 
ONIGENC_IS_CODE_DIGIT(enc, c)) goto err; PUNFETCH; @@ -2471,9 +2486,11 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, *rlevel = (level * flag); exist_level = 1; - PFETCH(c); - if (c == end_code) - goto end; + if (!PEND) { + PFETCH(c); + if (c == end_code) + goto end; + } } err: @@ -2880,6 +2897,8 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case 'p': case 'P': + if (PEND) break; + c2 = PPEEK; if (c2 == '{' && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { @@ -2887,7 +2906,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->type = TK_CHAR_PROPERTY; tok->u.prop.not = (c == 'P' ? 1 : 0); - if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { + if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { PFETCH(c2); if (c2 == '^') { tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0); @@ -2903,25 +2922,25 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { - PINC; - num = scan_unsigned_hexadecimal_number(&p, end, 8, enc); - if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; - if (!PEND) { + PINC; + num = scan_unsigned_hexadecimal_number(&p, end, 8, enc); + if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; + if (!PEND) { c2 = PPEEK; if (ONIGENC_IS_CODE_XDIGIT(enc, c2)) return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; } - if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) { - PINC; - tok->type = TK_CODE_POINT; - tok->base = 16; - tok->u.code = (OnigCodePoint )num; - } - else { - /* can't read nothing or invalid format */ - p = prev; - } + if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) { + PINC; + tok->type = TK_CODE_POINT; + tok->base = 16; + tok->u.code = (OnigCodePoint )num; + } + else { + /* can't read nothing or invalid format */ + p = prev; + } } else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { num = 
scan_unsigned_hexadecimal_number(&p, end, 2, enc); @@ -2969,10 +2988,10 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) default: PUNFETCH; - num = fetch_escaped_value(&p, end, env); + num = fetch_escaped_value(&p, end, env, &c2); if (num < 0) return num; - if (tok->u.c != num) { - tok->u.code = (OnigCodePoint )num; + if (tok->u.c != c2) { + tok->u.code = c2; tok->type = TK_CODE_POINT; } break; @@ -3332,7 +3351,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ - } + } tok->type = TK_RAW_BYTE; tok->base = 8; tok->u.c = num; @@ -3344,7 +3363,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) #ifdef USE_NAMED_GROUP case 'k': - if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) { + if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) { PFETCH(c); if (c == '<' || c == '\'') { UChar* name_end; @@ -3417,7 +3436,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) #ifdef USE_SUBEXP_CALL case 'g': - if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) { + if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) { PFETCH(c); if (c == '<' || c == '\'') { int gnum; @@ -3446,13 +3465,14 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case 'p': case 'P': - if (PPEEK_IS('{') && + if (!PEND && PPEEK_IS('{') && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { PINC; tok->type = TK_CHAR_PROPERTY; tok->u.prop.not = (c == 'P' ? 1 : 0); - if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { + if (!PEND && + IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { PFETCH(c); if (c == '^') { tok->u.prop.not = (tok->u.prop.not == 0 ? 
1 : 0); @@ -3464,16 +3484,20 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) break; default: - PUNFETCH; - num = fetch_escaped_value(&p, end, env); - if (num < 0) return num; - /* set_raw: */ - if (tok->u.c != num) { - tok->type = TK_CODE_POINT; - tok->u.code = (OnigCodePoint )num; - } - else { /* string */ - p = tok->backp + enclen(enc, tok->backp); + { + OnigCodePoint c2; + + PUNFETCH; + num = fetch_escaped_value(&p, end, env, &c2); + if (num < 0) return num; + /* set_raw: */ + if (tok->u.c != c2) { + tok->type = TK_CODE_POINT; + tok->u.code = c2; + } + else { /* string */ + p = tok->backp + enclen(enc, tok->backp); + } } break; } @@ -3548,10 +3572,10 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (r < 0) return r; /* error */ if (r == 0) goto greedy_check; else if (r == 2) { /* {n} */ - if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) - goto possessive_check; + if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) + goto possessive_check; - goto greedy_check; + goto greedy_check; } /* r == 1 : normal char */ break; @@ -3562,10 +3586,10 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) break; case '(': - if (PPEEK_IS('?') && + if (!PEND && PPEEK_IS('?') && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { PINC; - if (PPEEK_IS('#')) { + if (!PEND && PPEEK_IS('#')) { PFETCH(c); while (1) { if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; @@ -3612,7 +3636,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case ']': if (*src > env->pattern) /* /].../ is allowed. 
*/ - CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]"); + CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]"); break; case '#': @@ -3975,8 +3999,9 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v, switch (*state) { case CCS_VALUE: - if (*type == CCV_SB) + if (*type == CCV_SB) { BITSET_SET_BIT(cc->bs, (int )(*vs)); + } else if (*type == CCV_CODE_POINT) { r = add_code_range(&(cc->mbuf), env, *vs, *vs); if (r < 0) return r; @@ -3989,13 +4014,13 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v, if (*vs > 0xff || v > 0xff) return ONIGERR_INVALID_CODE_POINT_VALUE; - if (*vs > v) { - if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) - goto ccs_range_end; - else - return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; - } - bitset_set_range(cc->bs, (int )*vs, (int )v); + if (*vs > v) { + if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) + goto ccs_range_end; + else + return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; + } + bitset_set_range(cc->bs, (int )*vs, (int )v); } else { r = add_code_range(&(cc->mbuf), env, *vs, v); @@ -4006,15 +4031,15 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v, #if 0 if (intype == CCV_CODE_POINT && *type == CCV_SB) { #endif - if (*vs > v) { - if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) - goto ccs_range_end; - else - return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; - } - bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff)); - r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v); - if (r < 0) return r; + if (*vs > v) { + if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) + goto ccs_range_end; + else + return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; + } + bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? 
v : 0xff)); + r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v); + if (r < 0) return r; #if 0 } else @@ -4110,6 +4135,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, fetched = 0; switch (r) { case TK_CHAR: + any_char_in: len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c); if (len > 1) { in_type = CCV_CODE_POINT; @@ -4119,7 +4145,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, goto err; } else { - sb_char: + /* sb_char: */ in_type = CCV_SB; } v = (OnigCodePoint )tok->u.c; @@ -4265,7 +4291,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, } else if (state == CCS_RANGE) { CC_ESC_WARN(env, (UChar* )"-"); - goto sb_char; /* [!--x] is allowed */ + goto any_char_in; /* [!--x] is allowed */ } else { /* CCS_COMPLETE */ r = fetch_token_in_cc(tok, &p, end, env); @@ -4279,7 +4305,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) { CC_ESC_WARN(env, (UChar* )"-"); - goto sb_char; /* [0-9-a] is allowed as [0-9\-a] */ + goto any_char_in; /* [0-9-a] is allowed as [0-9\-a] */ } r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS; goto err; @@ -4452,6 +4478,7 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, #endif case '<': /* look behind (?<=...), (?= ONIGENC_MBC_MINLEN(env->enc)) { - if (len == enclen(env->enc, NSTR(*np)->s)) { + if (len == enclen(env->enc, NSTR(*np)->s)) {//should not enclen_end() r = fetch_token(tok, src, end, env); NSTRING_CLEAR_RAW(*np); goto string_end; @@ -5300,6 +5327,10 @@ onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end, env->reg = reg; *root = NULL; + + if (! 
ONIGENC_IS_VALID_MBC_STRING(env->enc, pattern, end)) + return ONIGERR_INVALID_WIDE_CHAR_VALUE; + p = (UChar* )pattern; r = parse_regexp(root, &p, (UChar* )end, env); reg->num_mem = env->num_mem; diff --git a/src/regparse.h b/src/regparse.h index fff707a..9e366fe 100644 --- a/src/regparse.h +++ b/src/regparse.h @@ -191,8 +191,8 @@ typedef struct { struct _Node* target; AbsAddrType call_addr; /* for multiple call reference */ - OnigDistance min_len; /* min length (byte) */ - OnigDistance max_len; /* max length (byte) */ + OnigLen min_len; /* min length (byte) */ + OnigLen max_len; /* max length (byte) */ int char_len; /* character length */ int opt_count; /* referenced count in optimize_node_left() */ } EncloseNode; diff --git a/src/sjis.c b/src/sjis.c index 84843ae..a607b3d 100644 --- a/src/sjis.c +++ b/src/sjis.c @@ -76,6 +76,12 @@ mbc_enc_len(const UChar* p) return EncLen_SJIS[*p]; } +static int +is_valid_mbc_string(const UChar* s, const UChar* end) +{ + return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_SJIS, s, end); +} + static int code_to_mbclen(OnigCodePoint code) { @@ -303,5 +309,6 @@ OnigEncodingType OnigEncodingSJIS = { left_adjust_char_head, is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + is_valid_mbc_string }; diff --git a/src/unicode.c b/src/unicode.c index df20ef9..8812ca2 100644 --- a/src/unicode.c +++ b/src/unicode.c @@ -111,8 +111,10 @@ onig_unicode_define_user_property(const char* name, OnigCodePoint* ranges) n = 0; for (i = 0; i < len; i++) { c = name[i]; - if (c <= 0 || c >= 0x80) + if (c <= 0 || c >= 0x80) { + xfree(s); return ONIGERR_INVALID_CHAR_PROPERTY_NAME; + } if (c != ' ' && c != '-' && c != '_') { s[n] = c; @@ -483,12 +485,13 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, buk = unicode_unfold_key(code); if (buk != 0) { if (buk->fold_len == 1) { + int un; items[0].byte_len = len; items[0].code_len = 1; items[0].code[0] = *FOLDS1_FOLD(buk->index); n++; - int 
un = FOLDS1_UNFOLDS_NUM(buk->index); + un = FOLDS1_UNFOLDS_NUM(buk->index); for (i = 0; i < un; i++) { OnigCodePoint unfold = FOLDS1_UNFOLDS(buk->index)[i]; if (unfold != code) { @@ -517,8 +520,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, } for (fn = 0; fn < 2; fn++) { + int index; cs[fn][0] = FOLDS2_FOLD(buk->index)[fn]; - int index = unicode_fold1_key(&cs[fn][0]); + index = unicode_fold1_key(&cs[fn][0]); if (index >= 0) { int m = FOLDS1_UNFOLDS_NUM(index); for (i = 0; i < m; i++) { @@ -553,8 +557,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, } for (fn = 0; fn < 3; fn++) { + int index; cs[fn][0] = FOLDS3_FOLD(buk->index)[fn]; - int index = unicode_fold1_key(&cs[fn][0]); + index = unicode_fold1_key(&cs[fn][0]); if (index >= 0) { int m = FOLDS1_UNFOLDS_NUM(index); for (i = 0; i < m; i++) { @@ -603,6 +608,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, p += len; if (p < end) { int clen; + int index; codes[0] = code; code = ONIGENC_MBC_TO_CODE(enc, p, end); @@ -617,7 +623,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, clen = enclen(enc, p); len += clen; - int index = unicode_fold2_key(codes); + index = unicode_fold2_key(codes); if (index >= 0) { m = FOLDS2_UNFOLDS_NUM(index); for (i = 0; i < m; i++) { diff --git a/src/utf16_be.c b/src/utf16_be.c index e93b42a..f220cca 100644 --- a/src/utf16_be.c +++ b/src/utf16_be.c @@ -54,6 +54,12 @@ utf16be_mbc_enc_len(const UChar* p) return EncLen_UTF16[*p]; } +static int +is_valid_mbc_string(const UChar* s, const UChar* end) +{ + return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_UTF16_BE, s, end); +} + static int utf16be_is_mbc_newline(const UChar* p, const UChar* end) { @@ -224,5 +230,6 @@ OnigEncodingType OnigEncodingUTF16_BE = { utf16be_left_adjust_char_head, onigenc_always_false_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + is_valid_mbc_string }; diff --git a/src/utf16_le.c 
b/src/utf16_le.c index 2d9af52..89bc72f 100644 --- a/src/utf16_le.c +++ b/src/utf16_le.c @@ -60,6 +60,21 @@ utf16le_mbc_enc_len(const UChar* p) return EncLen_UTF16[*(p+1)]; } +static int +is_valid_mbc_string(const UChar* p, const UChar* end) +{ + const UChar* end1 = end - 1; + + while (p < end1) { + p += utf16le_mbc_enc_len(p); + } + + if (p != end) + return FALSE; + else + return TRUE; +} + static int utf16le_is_mbc_newline(const UChar* p, const UChar* end) { @@ -225,5 +240,6 @@ OnigEncodingType OnigEncodingUTF16_LE = { utf16le_left_adjust_char_head, onigenc_always_false_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + is_valid_mbc_string }; diff --git a/src/utf32_be.c b/src/utf32_be.c index b8f64af..d0c7f39 100644 --- a/src/utf32_be.c +++ b/src/utf32_be.c @@ -35,6 +35,12 @@ utf32be_mbc_enc_len(const UChar* p ARG_UNUSED) return 4; } +static int +is_valid_mbc_string(const UChar* s, const UChar* end) +{ + return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_UTF32_BE, s, end); +} + static int utf32be_is_mbc_newline(const UChar* p, const UChar* end) { @@ -183,5 +189,6 @@ OnigEncodingType OnigEncodingUTF32_BE = { utf32be_left_adjust_char_head, onigenc_always_false_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + is_valid_mbc_string }; diff --git a/src/utf32_le.c b/src/utf32_le.c index a5a048e..33200d1 100644 --- a/src/utf32_le.c +++ b/src/utf32_le.c @@ -35,6 +35,12 @@ utf32le_mbc_enc_len(const UChar* p ARG_UNUSED) return 4; } +static int +is_valid_mbc_string(const UChar* s, const UChar* end) +{ + return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_UTF32_LE, s, end); +} + static int utf32le_is_mbc_newline(const UChar* p, const UChar* end) { @@ -183,5 +189,6 @@ OnigEncodingType OnigEncodingUTF32_LE = { utf32le_left_adjust_char_head, onigenc_always_false_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized 
*/ + is_valid_mbc_string }; diff --git a/src/utf8.c b/src/utf8.c index b78e7eb..219b7ea 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -29,7 +29,7 @@ #include "regenc.h" -#define USE_INVALID_CODE_SCHEME +//#define USE_INVALID_CODE_SCHEME #ifdef USE_INVALID_CODE_SCHEME /* virtual codepoint values for invalid encoding byte 0xfe and 0xff */ @@ -39,6 +39,7 @@ #endif #define utf8_islead(c) ((UChar )((c) & 0xc0) != 0x80) +#define utf8_istail(c) ((UChar )((c) & 0xc0) == 0x80) static const int EncLen_UTF8[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -65,6 +66,30 @@ mbc_enc_len(const UChar* p) return EncLen_UTF8[*p]; } +static int +is_valid_mbc_string(const UChar* p, const UChar* end) +{ + int i, len; + + while (p < end) { + if (! utf8_islead(*p)) + return FALSE; + + len = mbc_enc_len(p++); + if (len > 1) { + for (i = 1; i < len; i++) { + if (p == end) + return FALSE; + + if (! utf8_istail(*p++)) + return FALSE; + } + } + } + + return TRUE; +} + static int is_mbc_newline(const UChar* p, const UChar* end) { @@ -91,12 +116,14 @@ is_mbc_newline(const UChar* p, const UChar* end) } static OnigCodePoint -mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED) +mbc_to_code(const UChar* p, const UChar* end) { int c, len; OnigCodePoint n; - len = enclen(ONIG_ENCODING_UTF8, p); + len = mbc_enc_len(p); + if (len > end - p) len = end - p; + c = *p++; if (len > 1) { len--; @@ -303,5 +330,6 @@ OnigEncodingType OnigEncodingUTF8 = { left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + is_valid_mbc_string }; -- cgit v1.2.3