summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJörg Frings-Fürst <debian@jff-webhosting.net>2016-08-31 03:42:05 +0200
committerJörg Frings-Fürst <debian@jff-webhosting.net>2016-08-31 03:42:05 +0200
commita76fa337cc657dbe669ffb8dbdac606d4d6616f1 (patch)
treea6f004237df60876d087f79ac369fdc2545697c9
parent5e01a4852b31d537307994248869caf38b4023cc (diff)
Imported Upstream version 6.1.0upstream/6.1.0
-rw-r--r--CMakeLists.txt2
-rw-r--r--HISTORY41
-rw-r--r--Makefile.am2
-rw-r--r--README.md8
-rw-r--r--configure.ac4
-rw-r--r--contributed/libfuzzer-onig.cpp31
-rw-r--r--dist.info2
-rw-r--r--doc/API27
-rw-r--r--doc/API.ja27
-rw-r--r--doc/RE225
-rw-r--r--index.html5
-rw-r--r--index_ja.html5
-rw-r--r--sample/.gitignore1
-rw-r--r--sample/Makefile.am8
-rw-r--r--sample/bug_fix.c131
-rw-r--r--sample/scan.c88
-rw-r--r--src/ascii.c3
-rw-r--r--src/big5.c9
-rw-r--r--src/cp1251.c3
-rw-r--r--src/euc_jp.c9
-rw-r--r--src/euc_kr.c12
-rw-r--r--src/euc_tw.c9
-rw-r--r--src/gb18030.c9
-rw-r--r--src/iso8859_1.c3
-rw-r--r--src/iso8859_10.c3
-rw-r--r--src/iso8859_11.c3
-rw-r--r--src/iso8859_13.c3
-rw-r--r--src/iso8859_14.c3
-rw-r--r--src/iso8859_15.c3
-rw-r--r--src/iso8859_16.c3
-rw-r--r--src/iso8859_2.c3
-rw-r--r--src/iso8859_3.c3
-rw-r--r--src/iso8859_4.c3
-rw-r--r--src/iso8859_5.c3
-rw-r--r--src/iso8859_6.c3
-rw-r--r--src/iso8859_7.c3
-rw-r--r--src/iso8859_8.c3
-rw-r--r--src/iso8859_9.c3
-rw-r--r--src/koi8.c3
-rw-r--r--src/koi8_r.c3
-rw-r--r--src/oniguruma.h23
-rw-r--r--src/regcomp.c608
-rw-r--r--src/regenc.c43
-rw-r--r--src/regenc.h5
-rw-r--r--src/regerror.c2
-rw-r--r--src/regexec.c1560
-rw-r--r--src/regint.h2
-rw-r--r--src/regparse.c179
-rw-r--r--src/regparse.h4
-rw-r--r--src/sjis.c9
-rw-r--r--src/unicode.c16
-rw-r--r--src/utf16_be.c9
-rw-r--r--src/utf16_le.c18
-rw-r--r--src/utf32_be.c9
-rw-r--r--src/utf32_le.c9
-rw-r--r--src/utf8.c36
56 files changed, 1965 insertions, 1279 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0b245d0..14e22fd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,7 +4,7 @@ cmake_minimum_required(VERSION 2.8)
project(oniguruma C)
set(PACKAGE onig)
-set(PACKAGE_VERSION "6.0.0")
+set(PACKAGE_VERSION "6.1.0")
set(USE_COMBINATION_EXPLOSION_CHECK 0)
set(USE_CRNL_AS_LINE_TERMINATOR 0)
diff --git a/HISTORY b/HISTORY
index 60f096e..0e9b1c7 100644
--- a/HISTORY
+++ b/HISTORY
@@ -1,7 +1,48 @@
History
+2016/08/29: Version 6.1.0
+
+2016/08/28: add contributed/libfuzzer-onig.cpp (thanks hannob)
+2016/08/28: update LTVERSION 4:0:0
+2016/08/28: NEW API: onigenc_is_valid_mbc_string().
+2016/08/27: add is_valid_mbc_string() member into OnigEncodingType.
+2016/08/27: fix out of bounds read.
+2016/08/26: fix out of bounds read.
+2016/08/25: disable USE_INVALID_CODE_SCHEME.
+2016/08/24: fix out of bounds read.
+2016/08/23: doc/RE improved.
+2016/08/22: add onig_scan() into doc/API.
+2016/08/22: fix bug: Out of bounds read in onig_strcpy() #17
+2016/08/21: fix bug: infinite loop of backreference and group.
+2016/08/21: fix out of bounds read in mbc_to_code() #16
+2016/08/18: doc/RE refinements.
+2016/08/16: add onig_scan() (NEW API)
+2016/08/16: reimplement match stack allocation for case too many repeat
+ and too many captures in regexp.
+2016/08/15: number of captures <= 32767 for bytecode representation.
+2016/07/17: don't use int_map_backward for thread safety.
+2016/07/04: fix case of enclosed option in look-behind.
+2016/07/04: fix ignore case in look-behind.
+2016/05/23: fix memory leak in onig_unicode_define_user_property()
+2016/05/20: declare variables at the top of scope. (thanks nmaya)
+
2016/05/09: Version 6.0.0
+2016/05/05: add NEW API: onig_unicode_define_user_property()
+2016/05/04: update Unicode data to 8.0.0
+2016/05/02: change OnigCodePoint type to unsigned int.
+2016/05/02: add doc/UNICODE_PROPERTIES.
+2016/04/19: add error code ONIGERR_FAIL_TO_INITIALIZE.
+2016/04/18: add make_win64/32.bat.
+2016/04/18: fix bug of uninitialized regex_t value on error.
+2016/04/16: reimplement Unicode case folding.
+2016/04/11: update LTVERSION = 3.0.0
+2016/04/05: remove all THREAD_ macro.
+2016/04/05: add init member into OnigEncoding. (add onig_initialize())
+2016/03/28: remove state member of regex.
+2016/03/25: move source files into src/
+2016/03/23: rename configure.in to configure.ac.
+2015/11/17: fix memory leak. (thanks pigzang)
2015/07/13: change mail address.
2014/12/12: Version 5.9.6
diff --git a/Makefile.am b/Makefile.am
index 4201e0b..086b23c 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -11,7 +11,7 @@ EXTRA_DIST = oniguruma.pc.in HISTORY README.ja README.md \
doc/API doc/API.ja doc/RE doc/RE.ja doc/FAQ doc/FAQ.ja \
doc/UNICODE_PROPERTIES \
src/Makefile.windows src/config.h.win32 src/config.h.win64 \
- windows/testc.c
+ windows/testc.c contributed/libfuzzer-onig.cpp
bin_SCRIPTS = onig-config
diff --git a/README.md b/README.md
index dfd6723..a2c49cd 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,12 @@ Supported character encodings:
* CP1251: contributed by Byte
+New feature of version 6.1
+--------------------------
+
+* improved doc/RE
+* NEW API: onig_scan()
+
New feature of version 6.0
--------------------------
@@ -114,6 +120,7 @@ Sample Programs
|sample/encode.c |example of some encodings. |
|sample/listcap.c |example of the capture history. |
|sample/posix.c |POSIX API sample. |
+|sample/scan.c |example of using onig_scan(). |
|sample/sql.c |example of the variable meta characters. |
|sample/user_property.c|example of user defined Unicode property. |
@@ -185,5 +192,6 @@ Source Files
|utf32_be.c |UTF-32BE encoding |
|utf32_le.c |UTF-32LE encoding |
|unicode.c |common codes of Unicode encoding |
+|unicode_fold_data.c|Unicode folding data |
|win32/Makefile |Makefile for Win32 (VC++) |
|win32/config.h |config.h for Win32 |
diff --git a/configure.ac b/configure.ac
index e9926a4..6bd3d73 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,5 +1,5 @@
dnl Process this file with autoconf to produce a configure script.
-AC_INIT(onig, 6.0.0)
+AC_INIT(onig, 6.1.0)
AC_CONFIG_MACRO_DIR([m4])
@@ -34,7 +34,7 @@ fi
dnl Checks for programs.
AC_PROG_CC
AM_PROG_LIBTOOL
-LTVERSION="3:0:0"
+LTVERSION="4:0:0"
AC_SUBST(LTVERSION)
AC_PROG_INSTALL
diff --git a/contributed/libfuzzer-onig.cpp b/contributed/libfuzzer-onig.cpp
new file mode 100644
index 0000000..984110d
--- /dev/null
+++ b/contributed/libfuzzer-onig.cpp
@@ -0,0 +1,31 @@
+/* libfuzzer test code for oniguruma
+ * author: Hanno Böck, license: CC0/public domain
+
+Usage:
+* compile oniguruma with something like
+ ./configure CC=clang LD=clang CFLAGS="-fsanitize-coverage=edge -fsanitize=address" \
+ LDFLAGS="-fsanitize-coverage=edge -fsanitize=address"
+* Compile libfuzzer stub and link against static libonig.a and libFuzzer.a:
+ clang++ libfuzzer-onig.cpp src/.libs/libonig.a libFuzzer.a -o libfuzzer-onig \
+ -fsanitize-coverage=edge -fsanitize=address
+* Put sample patterns in directory "in/"
+* Run
+ ./libfuzzer-onig in
+
+Consult libfuzzer docs for further details and how to create libFuzzer.a:
+http://llvm.org/docs/LibFuzzer.html
+
+ */
+#include <stdint.h>
+#include <string.h>
+#include <oniguruma.h>
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size)
+{
+ regex_t *reg;
+ if (onig_new
+ (&reg, Data, Data + Size, ONIG_OPTION_DEFAULT, ONIG_ENCODING_UTF8,
+ ONIG_SYNTAX_DEFAULT, 0) == 0)
+ onig_free(reg);
+ return 0;
+}
diff --git a/dist.info b/dist.info
index 40ad07d..a7633b8 100644
--- a/dist.info
+++ b/dist.info
@@ -1,7 +1,7 @@
--- This file is part of LuaDist project
name = "onig"
-version = "6.0.0"
+version = "6.1.0"
desc = "Oniguruma is a regular expressions library."
author = "K.Kosako"
diff --git a/doc/API b/doc/API
index 9904a06..8e824f5 100644
--- a/doc/API
+++ b/doc/API
@@ -1,4 +1,4 @@
-Oniguruma API Version 6.0.0 2016/05/06
+Oniguruma API Version 6.1.0 2016/08/22
#include <oniguruma.h>
@@ -256,6 +256,27 @@ Oniguruma API Version 6.0.0 2016/05/06
ONIG_OPTION_POSIX_REGION region argument is regmatch_t[] type of POSIX API.
+# int onig_scan(regex_t* reg, const UChar* str, const UChar* end,
+ OnigRegion* region, OnigOptionType option,
+ int (*scan_callback)(int, int, OnigRegion*, void*),
+ void* callback_arg)
+
+ Scan the string and invoke the callback for each matching region.
+
+ normal return: number of matches
+ error: error code
+ interruption: return value of callback function (!= 0)
+
+ arguments
+ 1 reg: regex object
+ 2 str: target string
+ 3 end: terminate address of target string
+ 4 region: address for return group match range info (NULL is allowed)
+ 5 option: search time option
+ 6 scan_callback: callback function (defined by user)
+ 7 callback_arg: optional argument passed to callback
+
+
# OnigRegion* onig_region_new(void)
Create a region.
@@ -601,6 +622,10 @@ Oniguruma API Version 6.0.0 2016/05/06
2 ranges: property code point ranges
(first element is number of ranges.)
+ [num-of-ranges, 1st-range-start, 1st-range-end, 2nd-range-start... ]
+
+ * Don't destroy the ranges after having called this function.
+
normal return: ONIG_NORMAL
diff --git a/doc/API.ja b/doc/API.ja
index ac8cc6a..f617a1c 100644
--- a/doc/API.ja
+++ b/doc/API.ja
@@ -1,4 +1,4 @@
-֥󥿡ե Version 6.0.0 2016/05/06
+֥󥿡ե Version 6.1.0 2016/08/22
#include <oniguruma.h>
@@ -256,6 +256,27 @@
ONIG_OPTION_POSIX_REGION regionPOSIX APIregmatch_t[]ˤ
+# int onig_scan(regex_t* reg, const UChar* str, const UChar* end,
+ OnigRegion* region, OnigOptionType option,
+ int (*scan_callback)(int, int, OnigRegion*, void*),
+ void* callback_arg)
+
+ ɽʸ򥹥󤷤ơޥå󥰤˥ХåؿƤӽФ
+
+ ェλ: ޥå (0ޤ)
+ 顼: 顼 (< 0)
+ : Хåؿʳ֤ͤȤͤͤȤ
+
+
+ 1 reg: ɽ֥
+ 2 str: оʸ
+ 3 end: оʸνüɥ쥹
+ 4 region: ޥåΰ(region) (NULL)
+ 5 option: ץ
+ 6 scan_callback: Хåؿ
+ 7 callback_arg: ХåؿϤղð
+
+
# OnigRegion* onig_region_new(void)
ޥåΰ(region)롣
@@ -608,6 +629,10 @@
2 ranges: ץѥƥɥݥϰ
(ǽǤϰϤο)
+ [num-of-ranges, 1st-range-start, 1st-range-end, 2nd-range-start... ]
+
+ * δؿƤǡrangesѹ/˲ʤ
+
ェλ: ONIG_NORMAL
diff --git a/doc/RE b/doc/RE
index b4bf536..e8a6aa4 100644
--- a/doc/RE
+++ b/doc/RE
@@ -1,35 +1,35 @@
-Oniguruma Regular Expressions Version 6.0.0 2016/05/02
+Oniguruma Regular Expressions Version 6.1.0 2016/08/18
syntax: ONIG_SYNTAX_RUBY (default)
1. Syntax elements
- \ escape (enable or disable meta character meaning)
+ \ escape (enable or disable meta character)
| alternation
(...) group
- [...] character class
+ [...] character class
2. Characters
- \t horizontal tab (0x09)
- \v vertical tab (0x0B)
- \n newline (0x0A)
- \r return (0x0D)
- \b back space (0x08)
- \f form feed (0x0C)
- \a bell (0x07)
- \e escape (0x1B)
- \nnn octal char (encoded byte value)
- \xHH hexadecimal char (encoded byte value)
- \x{7HHHHHHH} wide hexadecimal char (character code point value)
- \cx control char (character code point value)
- \C-x control char (character code point value)
- \M-x meta (x|0x80) (character code point value)
- \M-\C-x meta control char (character code point value)
-
- (* \b is effective in character class [...] only)
+ \t horizontal tab (0x09)
+ \v vertical tab (0x0B)
+ \n newline (line feed) (0x0A)
+ \r carriage return (0x0D)
+ \b backspace (0x08)
+ \f form feed (0x0C)
+ \a bell (0x07)
+ \e escape (0x1B)
+ \nnn octal char (encoded byte value)
+ \xHH hexadecimal char (encoded byte value)
+ \x{7HHHHHHH} wide hexadecimal char (character code point value)
+ \cx control char (character code point value)
+ \C-x control char (character code point value)
+ \M-x meta (x|0x80) (character code point value)
+ \M-\C-x meta control char (character code point value)
+
+ (* \b as backspace is effective in character class only)
3. Character types
@@ -39,12 +39,12 @@ syntax: ONIG_SYNTAX_RUBY (default)
\w word character
Not Unicode:
- alphanumeric, "_" and multibyte char.
+ alphanumeric, "_" and multibyte char.
Unicode:
General_Category -- (Letter|Mark|Number|Connector_Punctuation)
- \W non word char
+ \W non-word char
\s whitespace char
@@ -52,22 +52,22 @@ syntax: ONIG_SYNTAX_RUBY (default)
\t, \n, \v, \f, \r, \x20
Unicode:
- 0009, 000A, 000B, 000C, 000D, 0085(NEL),
+ 0009, 000A, 000B, 000C, 000D, 0085(NEL),
General_Category -- Line_Separator
-- Paragraph_Separator
-- Space_Separator
- \S non whitespace char
+ \S non-whitespace char
\d decimal digit char
Unicode: General_Category -- Decimal_Number
- \D non decimal digit char
+ \D non-decimal-digit char
\h hexadecimal digit char [0-9a-fA-F]
- \H non hexadecimal digit char
+ \H non-hexdigit char
Character Property
@@ -80,7 +80,7 @@ syntax: ONIG_SYNTAX_RUBY (default)
+ works on all encodings
Alnum, Alpha, Blank, Cntrl, Digit, Graph, Lower,
- Print, Punct, Space, Upper, XDigit, Word, ASCII,
+ Print, Punct, Space, Upper, XDigit, Word, ASCII
+ works on EUC_JP, Shift_JIS
Hiragana, Katakana
@@ -97,9 +97,9 @@ syntax: ONIG_SYNTAX_RUBY (default)
? 1 or 0 times
* 0 or more times
+ 1 or more times
- {n,m} at least n but not more than m times
+ {n,m} at least n but no more than m times
{n,} at least n times
- {,n} at least 0 but not more than n times ({0,n})
+ {,n} at least 0 but no more than n times ({0,n})
{n} n times
reluctant
@@ -107,11 +107,11 @@ syntax: ONIG_SYNTAX_RUBY (default)
?? 1 or 0 times
*? 0 or more times
+? 1 or more times
- {n,m}? at least n but not more than m times
+ {n,m}? at least n but not more than m times
{n,}? at least n times
{,n}? at least 0 but not more than n times (== {0,n}?)
- possessive (greedy and does not backtrack after repeated)
+ possessive (greedy and does not backtrack once matched)
?+ 1 or 0 times
*+ 0 or more times
@@ -127,24 +127,24 @@ syntax: ONIG_SYNTAX_RUBY (default)
^ beginning of the line
$ end of the line
\b word boundary
- \B not word boundary
+ \B non-word boundary
\A beginning of string
\Z end of string, or before newline at the end
\z end of string
- \G matching start position
+ \G where the current search attempt begins
6. Character class
- ^... negative class (lowest precedence operator)
+ ^... negative class (lowest precedence)
x-y range from x to y
[...] set (character class in character class)
- ..&&.. intersection (low precedence at the next of ^)
-
+ ..&&.. intersection (low precedence, only higher than ^)
+
ex. [a-w&&[^c-g]z] ==> ([a-w] AND ([^c-g] OR z)) ==> [abh-w]
- * If you want to use '[', '-', ']' as a normal character
- in a character class, you should escape these characters by '\'.
+ * If you want to use '[', '-', or ']' as a normal character
+ in character class, you should escape them with '\'.
POSIX bracket ([:xxxxx:], negate [:^xxxxx:])
@@ -196,79 +196,75 @@ syntax: ONIG_SYNTAX_RUBY (default)
(?imx-imx) option on/off
i: ignore case
- m: multi-line (dot(.) match newline)
+ m: multi-line (dot (.) also matches newline)
x: extended form
(?imx-imx:subexp) option on/off for subexp
- (?:subexp) not captured group
- (subexp) captured group
+ (?:subexp) non-capturing group
+ (subexp) capturing group
(?=subexp) look-ahead
(?!subexp) negative look-ahead
(?<=subexp) look-behind
(?<!subexp) negative look-behind
- Subexp of look-behind must be fixed character length.
- But different character length is allowed in top level
- alternatives only.
+ Subexp of look-behind must be fixed-width.
+ But top-level alternatives can be of various lengths.
ex. (?<=a|bc) is OK. (?<=aaa(?:b|cd)) is not allowed.
- In negative-look-behind, captured group isn't allowed,
- but shy group(?:) is allowed.
+ In negative look-behind, capturing group isn't allowed,
+ but non-capturing group (?:) is allowed.
(?>subexp) atomic group
- don't backtrack in subexp.
+ no backtracks in subexp.
(?<name>subexp), (?'name'subexp)
define named group
- (All characters of the name must be a word character.)
+ (Each character of the name must be a word character.)
- Not only a name but a number is assigned like a captured
+ Not only a name but a number is assigned like a capturing
group.
- Assigning the same name as two or more subexps is allowed.
- In this case, a subexp call can not be performed although
- the back reference is possible.
+ Assigning the same name to two or more subexps is allowed.
+
+8. Backreferences
-8. Back reference
+ When we say "backreference a group," it actually means, "re-match the same
+ text matched by the subexp in that group."
- \n back reference by group number (n >= 1)
- \k<n> back reference by group number (n >= 1)
- \k'n' back reference by group number (n >= 1)
- \k<-n> back reference by relative group number (n >= 1)
- \k'-n' back reference by relative group number (n >= 1)
- \k<name> back reference by group name
- \k'name' back reference by group name
+ \n \k<n> \k'n' (n >= 1) backreference the nth group in the regexp
+ \k<-n> \k'-n' (n >= 1) backreference the nth group counting
+ backwards from the referring position
+ \k<name> \k'name' backreference a group with the specified name
- In the back reference by the multiplex definition name,
- a subexp with a large number is referred to preferentially.
- (When not matched, a group of the small number is referred to.)
+ When backreferencing with a name that is assigned to more than one group,
+ the last group with the name is checked first, if not matched then the
+ previous one with the name, and so on, until there is a match.
- * Back reference by group number is forbidden if named group is defined
- in the pattern and ONIG_OPTION_CAPTURE_GROUP is not setted.
+ * Backreference by number is forbidden if any named group is defined and
+ ONIG_OPTION_CAPTURE_GROUP is not set.
- back reference with nest level
+ backreference with recursion level
- level: 0, 1, 2, ...
+ (n >= 1, level >= 0)
- \k<n+level> (n >= 1)
- \k<n-level> (n >= 1)
- \k'n+level' (n >= 1)
- \k'n-level' (n >= 1)
+ \k<n+level> \k'n+level'
+ \k<n-level> \k'n-level'
- \k<name+level>
- \k<name-level>
- \k'name+level'
- \k'name-level'
+ \k<name+level> \k'name+level'
+ \k<name-level> \k'name-level'
- Destinate relative nest level from back reference position.
+ Designate a group at the recursion level relative to the referring position.
ex 1.
+ /\A(?<a>|.|(?:(?<b>.)\g<a>\k<b>))\z/.match("reee")
/\A(?<a>|.|(?:(?<b>.)\g<a>\k<b+0>))\z/.match("reer")
+ \k<b+0> refers to the (?<b>.) on the same recursion level as it.
+
ex 2.
r = Regexp.compile(<<'__REGEXP__'.strip, Regexp::EXTENDED)
@@ -280,53 +276,56 @@ syntax: ONIG_SYNTAX_RUBY (default)
\g<element>
__REGEXP__
- p r.match('<foo>f<bar>bbb</bar>f</foo>').captures
+ p r.match("<foo>f<bar>bbb</bar>f</foo>").captures
+
+
+9. Subexp calls ("Tanaka Akira special")
+ When we say "call a group," it actually means, "re-execute the subexp in
+ that group."
+ \g<n> \g'n' (n >= 1) call the nth group
+ \g<-n> \g'-n' (n >= 1) call the nth group counting backwards from
+ the calling position
+ \g<name> \g'name' call the group with the specified name
-9. Subexp call ("Tanaka Akira special")
+ * Left-most recursive calls are not allowed.
- \g<name> call by group name
- \g'name' call by group name
- \g<n> call by group number (n >= 1)
- \g'n' call by group number (n >= 1)
- \g<-n> call by relative group number (n >= 1)
- \g'-n' call by relative group number (n >= 1)
+ ex. (?<name>a|\g<name>b) => error
+ (?<name>a|b\g<name>c) => OK
- * left-most recursive call is not allowed.
- ex. (?<name>a|\g<name>b) => error
- (?<name>a|b\g<name>c) => OK
+ * Calls with a name that is assigned to more than one group are not
+ allowed.
- * Call by group number is forbidden if named group is defined in the pattern
- and ONIG_OPTION_CAPTURE_GROUP is not setted.
+ * Call by number is forbidden if any named group is defined and
+ ONIG_OPTION_CAPTURE_GROUP is not set.
- * If the option status of called group is different from calling position
- then the group's option is effective.
+ * The option status of the called group is always effective.
- ex. (?-i:\g<name>)(?i:(?<name>a)){0} match to "A"
+ ex. /(?-i:\g<name>)(?i:(?<name>a)){0}/.match("A")
10. Captured group
- Behavior of the no-named group (...) changes with the following conditions.
+ Behavior of an unnamed group (...) changes with the following conditions.
(But named group is not changed.)
case 1. /.../ (named group is not used, no option)
- (...) is treated as a captured group.
+ (...) is treated as a capturing group.
case 2. /.../g (named group is not used, 'g' option)
- (...) is treated as a no-captured group (?:...).
+ (...) is treated as a non-capturing group (?:...).
case 3. /..(?<name>..)../ (named group is used, no option)
- (...) is treated as a no-captured group (?:...).
+ (...) is treated as a non-capturing group.
numbered-backref/call is not allowed.
case 4. /..(?<name>..)../G (named group is used, 'G' option)
- (...) is treated as a captured group.
+ (...) is treated as a capturing group.
numbered-backref/call is allowed.
where
@@ -338,14 +337,14 @@ syntax: ONIG_SYNTAX_RUBY (default)
-----------------------------
-A-1. Syntax depend options
+A-1. Syntax-dependent options
+ ONIG_SYNTAX_RUBY
- (?m): dot(.) match newline
+ (?m): dot (.) also matches newline
+ ONIG_SYNTAX_PERL and ONIG_SYNTAX_JAVA
- (?s): dot(.) match newline
- (?m): ^ match after newline, $ match before newline
+ (?s): dot (.) also matches newline
+ (?m): ^ matches after newline, $ matches before newline
A-2. Original extensions
@@ -356,7 +355,7 @@ A-2. Original extensions
+ subexp call \g<name>, \g<group-num>
-A-3. Lacked features compare with perl 5.8.0
+A-3. Missing features compared with perl 5.8.0
+ \N{name}
+ \l,\u,\L,\U, \X, \C
@@ -373,12 +372,12 @@ A-4. Differences with Japanized GNU regex(version 0.12) of Ruby 1.8
+ add character property (\p{property}, \P{property})
+ add hexadecimal digit char type (\h, \H)
+ add look-behind
- (?<=fixed-char-length-pattern), (?<!fixed-char-length-pattern)
+ (?<=fixed-width-pattern), (?<!fixed-width-pattern)
+ add possessive quantifier. ?+, *+, ++
+ add operations in character class. [], &&
('[' must be escaped as an usual char in character class.)
+ add named group and subexp call.
- + octal or hexadecimal number sequence can be treated as
+ + octal or hexadecimal number sequence can be treated as
a multibyte code char in character class if multibyte encoding
is specified.
(ex. [\xa1\xa2], [\xa1\xa7-\xa4\xa1])
@@ -389,29 +388,29 @@ A-4. Differences with Japanized GNU regex(version 0.12) of Ruby 1.8
ex. (?:(?i)a|b) is interpreted as (?:(?i:a|b)), not (?:(?i:a)|b).
+ isolated option is not transparent to previous pattern.
ex. a(?i)* is a syntax error pattern.
- + allowed incompleted left brace as an usual string.
+ + allowed unpaired left brace as a normal character.
ex. /{/, /({)/, /a{2,3/ etc...
+ negative POSIX bracket [:^xxxx:] is supported.
+ POSIX bracket [:ascii:] is added.
+ repeat of look-ahead is not allowed.
ex. /(?=a)*/, /(?!b){5}/
- + Ignore case option is effective to numbered character.
+ + Ignore case option is effective for escape sequences.
ex. /\x61/i =~ "A"
- + In the range quantifier, the number of the minimum is omissible.
+ + In the range quantifier, the number of the minimum is optional.
/a{,n}/ == /a{0,n}/
- The simultanious abbreviation of the number of times of the minimum
- and the maximum is not allowed. (/a{,}/)
- + /a{n}?/ is not a non-greedy operator.
+ The omission of both minimum and maximum values is not allowed.
+ /a{,}/
+ + /{n}?/ is not a reluctant quantifier.
/a{n}?/ == /(?:a{n})?/
- + invalid back reference is checked and cause error.
+ + invalid back reference is checked and raises an error.
/\1/, /(a)\2/
- + Zero-length match in infinite repeat stops the repeat,
+ + Zero-width match in an infinite loop stops the repeat,
then changes of the capture group status are checked as stop condition.
/(?:()|())*\1\2/ =~ ""
/(?:\1a|())*/ =~ "a"
-A-5. Disabled functions by default syntax
+A-5. Features disabled in default syntax
+ capture history
diff --git a/index.html b/index.html
index 9082a3f..e7c263e 100644
--- a/index.html
+++ b/index.html
@@ -8,7 +8,7 @@
<h1>Oniguruma</h1> (<a href="index_ja.html">Japanese</a>)
<p>
-(c) K.Kosako, updated at: 2016/05/06
+(c) K.Kosako, updated at: 2016/08/22
</p>
<dl>
@@ -16,6 +16,7 @@
<dt><b>What's new</b>
</font>
<ul>
+<li>2016/08/29: Version 6.1.0 released.</li>
<li>2016/05/09: Version 6.0.0 released.</li>
<li>2014/12/12: Version 5.9.6 released.</li>
</ul>
@@ -65,7 +66,7 @@ About 2.x, please contact him.<br>
* 2.x supports Ruby1.6/1.8.<br>
<br>
-<dt><b>Documents:</b> (version 6.0.0)
+<dt><b>Documents:</b> (version 6.1.0)
<ul>
<li> <a href="doc/RE.txt">Regular Expressions</a>
<a href="doc/RE.ja.txt">(Japanese: EUC-JP)</a>
diff --git a/index_ja.html b/index_ja.html
index 3b089fc..9c68c85 100644
--- a/index_ja.html
+++ b/index_ja.html
@@ -8,7 +8,7 @@
<h1>鬼車</h1>
<p>
-(c) K.Kosako, 最終更新: 2016/05/06
+(c) K.Kosako, 最終更新: 2016/08/22
</p>
<dl>
@@ -16,6 +16,7 @@
<dt><b>更新情報</b>
</font>
<ul>
+<li>2016/08/29: Version 6.1.0 リリース</li>
<li>2016/05/09: Version 6.0.0 リリース</li>
<li>2014/12/12: Version 5.9.6 リリース</li>
</ul>
@@ -65,7 +66,7 @@ ISO-8859-11, ISO-8859-13, ISO-8859-14, ISO-8859-15, ISO-8859-16<br>
* 2.xはRuby1.6/1.8組込みライブラリとして動作する。 (2006年末で保守を終了)<br>
<br>
-<dt><b>ドキュメント:</b> (version 6.0.0)
+<dt><b>ドキュメント:</b> (version 6.1.0)
<ul>
<li> <a href="doc/RE.txt">正規表現</a>
<a href="doc/RE.ja.txt">(日本語: EUC-JP)</a>
diff --git a/sample/.gitignore b/sample/.gitignore
index 963d2e4..79fab44 100644
--- a/sample/.gitignore
+++ b/sample/.gitignore
@@ -7,4 +7,5 @@
/sql
/syntax
/user_property
+/bug_fix
/log*
diff --git a/sample/Makefile.am b/sample/Makefile.am
index 53f0d08..6799ecd 100644
--- a/sample/Makefile.am
+++ b/sample/Makefile.am
@@ -6,9 +6,9 @@ LDADD = $(lib_onig)
AM_LDFLAGS = -L$(prefix)/lib
AM_CPPFLAGS = -I../src -I$(includedir)
-TESTS = encode listcap names posix simple sql syntax user_property
+TESTS = encode listcap names posix simple sql syntax user_property bug_fix
-check_PROGRAMS = encode listcap names posix simple sql syntax user_property
+check_PROGRAMS = encode listcap names posix simple sql syntax user_property bug_fix
encode_SOURCES = encode.c
listcap_SOURCES = listcap.c
@@ -18,10 +18,11 @@ simple_SOURCES = simple.c
sql_SOURCES = sql.c
syntax_SOURCES = syntax.c
user_property_SOURCES = user_property.c
+bug_fix = bug_fix.c
sampledir = .
-test: encode listcap names posix simple sql syntax user_property
+test: encode listcap names posix simple sql syntax user_property bug_fix
$(sampledir)/encode
$(sampledir)/listcap
$(sampledir)/names
@@ -30,3 +31,4 @@ test: encode listcap names posix simple sql syntax user_property
$(sampledir)/sql
$(sampledir)/syntax
$(sampledir)/user_property
+ $(sampledir)/bug_fix
diff --git a/sample/bug_fix.c b/sample/bug_fix.c
new file mode 100644
index 0000000..9a45a78
--- /dev/null
+++ b/sample/bug_fix.c
@@ -0,0 +1,131 @@
+/*
+ * bug_fix.c
+ */
+#include <stdio.h>
+#include "oniguruma.h"
+
+static OnigCaseFoldType CF = ONIGENC_CASE_FOLD_MIN;
+
+static int
+search(regex_t* reg, unsigned char* str, unsigned char* end)
+{
+ int r;
+ unsigned char *start, *range;
+ OnigRegion *region;
+
+ region = onig_region_new();
+
+ start = str;
+ range = end;
+ r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE);
+ if (r >= 0) {
+ int i;
+
+ fprintf(stderr, "match at %d (%s)\n", r,
+ ONIGENC_NAME(onig_get_encoding(reg)));
+ for (i = 0; i < region->num_regs; i++) {
+ fprintf(stderr, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]);
+ }
+ }
+ else if (r == ONIG_MISMATCH) {
+ fprintf(stderr, "search fail (%s)\n",
+ ONIGENC_NAME(onig_get_encoding(reg)));
+ }
+ else { /* error */
+ char s[ONIG_MAX_ERROR_MESSAGE_LEN];
+ onig_error_code_to_str(s, r);
+ fprintf(stderr, "ERROR: %s\n", s);
+ fprintf(stderr, " (%s)\n", ONIGENC_NAME(onig_get_encoding(reg)));
+ return -1;
+ }
+
+ onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
+ return 0;
+}
+
+static int
+exec_deluxe(OnigEncoding pattern_enc, OnigEncoding str_enc,
+ OnigOptionType options, char* apattern, char* astr)
+{
+ int r;
+ unsigned char *end;
+ regex_t* reg;
+ OnigCompileInfo ci;
+ OnigErrorInfo einfo;
+ UChar* pattern = (UChar* )apattern;
+ UChar* str = (UChar* )astr;
+
+ onig_initialize(&str_enc, 1);
+
+ ci.num_of_elements = 5;
+ ci.pattern_enc = pattern_enc;
+ ci.target_enc = str_enc;
+ ci.syntax = ONIG_SYNTAX_DEFAULT;
+ ci.option = options;
+ ci.case_fold_flag = CF;
+
+ r = onig_new_deluxe(&reg, pattern,
+ pattern + onigenc_str_bytelen_null(pattern_enc, pattern),
+ &ci, &einfo);
+ if (r != ONIG_NORMAL) {
+ char s[ONIG_MAX_ERROR_MESSAGE_LEN];
+ onig_error_code_to_str(s, r, &einfo);
+ fprintf(stderr, "ERROR: %s\n", s);
+ return -1;
+ }
+
+ end = str + onigenc_str_bytelen_null(str_enc, str);
+ r = search(reg, str, end);
+
+ onig_free(reg);
+ onig_end();
+ return 0;
+}
+
+static int
+exec(OnigEncoding enc, OnigOptionType options, char* apattern, char* astr)
+{
+ int r;
+ unsigned char *end;
+ regex_t* reg;
+ OnigErrorInfo einfo;
+ UChar* pattern = (UChar* )apattern;
+ UChar* str = (UChar* )astr;
+
+ onig_initialize(&enc, 1);
+
+ r = onig_new(&reg, pattern,
+ pattern + onigenc_str_bytelen_null(enc, pattern),
+ options, enc, ONIG_SYNTAX_DEFAULT, &einfo);
+ if (r != ONIG_NORMAL) {
+ char s[ONIG_MAX_ERROR_MESSAGE_LEN];
+ onig_error_code_to_str(s, r, &einfo);
+ fprintf(stderr, "ERROR: %s\n", s);
+ return -1;
+ }
+
+ end = str + onigenc_str_bytelen_null(enc, str);
+ r = search(reg, str, end);
+
+ onig_free(reg);
+ onig_end();
+ return 0;
+}
+
+
+
+extern int main(int argc, char* argv[])
+{
+ /* fix ignore case in look-behind
+ commit: 3340ec2cc5627172665303fe248c9793354d2251 */
+ exec_deluxe(ONIG_ENCODING_UTF8, ONIG_ENCODING_UTF8,
+ ONIG_OPTION_IGNORECASE,
+ "(?<=\305\211)a", "\312\274na"); /* \u{0149}a \u{02bc}na */
+
+ exec(ONIG_ENCODING_UTF8, ONIG_OPTION_NONE, "(\\2)(\\1)", "aa"); /* fail. */
+
+ exec(ONIG_ENCODING_UTF8, ONIG_OPTION_FIND_LONGEST,
+ "a*", "aa aaa aaaa aaaaa "); /* match 12-17 */
+
+ return 0;
+}
diff --git a/sample/scan.c b/sample/scan.c
new file mode 100644
index 0000000..ad5ae74
--- /dev/null
+++ b/sample/scan.c
@@ -0,0 +1,88 @@
+/*
+ * scan.c
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include "oniguruma.h"
+
+static int
+scan_callback(int n, int r, OnigRegion* region, void* arg)
+{
+ int i;
+
+ fprintf(stdout, "scan: %d\n", n);
+
+ fprintf(stdout, "match at %d\n", r);
+ for (i = 0; i < region->num_regs; i++) {
+ fprintf(stdout, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]);
+ }
+
+ return 0;
+}
+
+static int
+scan(regex_t* reg, unsigned char* str, unsigned char* end)
+{
+ int r;
+ OnigRegion *region;
+
+ region = onig_region_new();
+
+ r = onig_scan(reg, str, end, region, ONIG_OPTION_NONE, scan_callback, NULL);
+ if (r >= 0) {
+ fprintf(stdout, "total: %d match\n", r);
+ }
+ else { /* error */
+ char s[ONIG_MAX_ERROR_MESSAGE_LEN];
+ onig_error_code_to_str((OnigUChar* )s, r);
+ fprintf(stderr, "ERROR: %s\n", s);
+ return -1;
+ }
+
+ onig_region_free(region, 1 /* 1:free self, 0:free contents only */);
+ return 0;
+}
+
+static int
+exec(OnigEncoding enc, OnigOptionType options, char* apattern, char* astr)
+{
+ int r;
+ unsigned char *end;
+ regex_t* reg;
+ OnigErrorInfo einfo;
+ UChar* pattern_end;
+ UChar* pattern = (UChar* )apattern;
+ UChar* str = (UChar* )astr;
+
+ onig_initialize(&enc, 1);
+
+ pattern_end = pattern + onigenc_str_bytelen_null(enc, pattern);
+
+ r = onig_new(&reg, pattern, pattern_end, options, enc, ONIG_SYNTAX_DEFAULT, &einfo);
+ if (r != ONIG_NORMAL) {
+ char s[ONIG_MAX_ERROR_MESSAGE_LEN];
+ onig_error_code_to_str((OnigUChar* )s, r, &einfo);
+ fprintf(stderr, "ERROR: %s\n", s);
+ return -1;
+ }
+
+ end = str + onigenc_str_bytelen_null(enc, str);
+ r = scan(reg, str, end);
+
+ onig_free(reg);
+ onig_end();
+ return 0;
+}
+
+
+extern int main(int argc, char* argv[])
+{
+ exec(ONIG_ENCODING_UTF8, ONIG_OPTION_NONE,
+ "\\Ga+\\s*", "a aa aaa baaa");
+
+ fprintf(stdout, "\n");
+ exec(ONIG_ENCODING_UTF8, ONIG_OPTION_NONE,
+ "a+\\s*", "a aa aaa baaa");
+
+ return 0;
+}
diff --git a/src/ascii.c b/src/ascii.c
index 92db179..b21878d 100644
--- a/src/ascii.c
+++ b/src/ascii.c
@@ -56,5 +56,6 @@ OnigEncodingType OnigEncodingASCII = {
onigenc_single_byte_left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ onigenc_always_true_is_valid_mbc_string
};
diff --git a/src/big5.c b/src/big5.c
index 3c90eaa..3d44975 100644
--- a/src/big5.c
+++ b/src/big5.c
@@ -54,6 +54,12 @@ big5_mbc_enc_len(const UChar* p)
return EncLen_BIG5[*p];
}
+/* is_valid_mbc_string entry of the OnigEncodingBIG5 table: forwards
+ * [s, end) to onigenc_length_check_is_valid_mbc_string() together with
+ * ONIG_ENCODING_BIG5 and returns that function's result unchanged. */
+static int
+is_valid_mbc_string(const UChar* s, const UChar* end)
+{
+ return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_BIG5, s, end);
+}
+
static OnigCodePoint
big5_mbc_to_code(const UChar* p, const UChar* end)
{
@@ -160,5 +166,6 @@ OnigEncodingType OnigEncodingBIG5 = {
big5_left_adjust_char_head,
big5_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ is_valid_mbc_string
};
diff --git a/src/cp1251.c b/src/cp1251.c
index af45847..4d655bb 100644
--- a/src/cp1251.c
+++ b/src/cp1251.c
@@ -198,5 +198,6 @@ OnigEncodingType OnigEncodingCP1251 = {
onigenc_single_byte_left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ onigenc_always_true_is_valid_mbc_string
};
diff --git a/src/euc_jp.c b/src/euc_jp.c
index 17f53e7..19422ce 100644
--- a/src/euc_jp.c
+++ b/src/euc_jp.c
@@ -56,6 +56,12 @@ mbc_enc_len(const UChar* p)
return EncLen_EUCJP[*p];
}
+/* is_valid_mbc_string entry of the OnigEncodingEUC_JP table: forwards
+ * [s, end) to onigenc_length_check_is_valid_mbc_string() together with
+ * ONIG_ENCODING_EUC_JP and returns that function's result unchanged. */
+static int
+is_valid_mbc_string(const UChar* s, const UChar* end)
+{
+ return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_EUC_JP, s, end);
+}
+
static OnigCodePoint
mbc_to_code(const UChar* p, const UChar* end)
{
@@ -269,5 +275,6 @@ OnigEncodingType OnigEncodingEUC_JP = {
left_adjust_char_head,
is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ is_valid_mbc_string
};
diff --git a/src/euc_kr.c b/src/euc_kr.c
index 769104b..12803cd 100644
--- a/src/euc_kr.c
+++ b/src/euc_kr.c
@@ -54,6 +54,12 @@ euckr_mbc_enc_len(const UChar* p)
return EncLen_EUCKR[*p];
}
+/* is_valid_mbc_string entry shared by the OnigEncodingEUC_KR and
+ * OnigEncodingEUC_CN tables: forwards [s, end) to
+ * onigenc_length_check_is_valid_mbc_string() with ONIG_ENCODING_EUC_KR
+ * (also when reached through the EUC_CN table) and returns its result. */
+static int
+is_valid_mbc_string(const UChar* s, const UChar* end)
+{
+ return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_EUC_KR, s, end);
+}
+
static OnigCodePoint
euckr_mbc_to_code(const UChar* p, const UChar* end)
{
@@ -136,7 +142,8 @@ OnigEncodingType OnigEncodingEUC_KR = {
euckr_left_adjust_char_head,
euckr_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ is_valid_mbc_string
};
/* Same with OnigEncodingEUC_KR except the name */
@@ -158,5 +165,6 @@ OnigEncodingType OnigEncodingEUC_CN = {
euckr_left_adjust_char_head,
euckr_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ is_valid_mbc_string
};
diff --git a/src/euc_tw.c b/src/euc_tw.c
index f757961..4e07567 100644
--- a/src/euc_tw.c
+++ b/src/euc_tw.c
@@ -54,6 +54,12 @@ euctw_mbc_enc_len(const UChar* p)
return EncLen_EUCTW[*p];
}
+/* is_valid_mbc_string entry of the OnigEncodingEUC_TW table: forwards
+ * [s, end) to onigenc_length_check_is_valid_mbc_string() together with
+ * ONIG_ENCODING_EUC_TW and returns that function's result unchanged. */
+static int
+is_valid_mbc_string(const UChar* s, const UChar* end)
+{
+ return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_EUC_TW, s, end);
+}
+
static OnigCodePoint
euctw_mbc_to_code(const UChar* p, const UChar* end)
{
@@ -127,5 +133,6 @@ OnigEncodingType OnigEncodingEUC_TW = {
euctw_left_adjust_char_head,
euctw_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ is_valid_mbc_string
};
diff --git a/src/gb18030.c b/src/gb18030.c
index beeda06..36fc3de 100644
--- a/src/gb18030.c
+++ b/src/gb18030.c
@@ -75,6 +75,12 @@ gb18030_mbc_enc_len(const UChar* p)
return 2;
}
+/* is_valid_mbc_string entry of the OnigEncodingGB18030 table: forwards
+ * [s, end) to onigenc_length_check_is_valid_mbc_string() together with
+ * ONIG_ENCODING_GB18030 and returns that function's result unchanged. */
+static int
+is_valid_mbc_string(const UChar* s, const UChar* end)
+{
+ return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_GB18030, s, end);
+}
+
static OnigCodePoint
gb18030_mbc_to_code(const UChar* p, const UChar* end)
{
@@ -493,5 +499,6 @@ OnigEncodingType OnigEncodingGB18030 = {
gb18030_left_adjust_char_head,
gb18030_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ is_valid_mbc_string
};
diff --git a/src/iso8859_1.c b/src/iso8859_1.c
index b2d8c9a..573931f 100644
--- a/src/iso8859_1.c
+++ b/src/iso8859_1.c
@@ -270,5 +270,6 @@ OnigEncodingType OnigEncodingISO_8859_1 = {
onigenc_single_byte_left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ onigenc_always_true_is_valid_mbc_string
};
diff --git a/src/iso8859_10.c b/src/iso8859_10.c
index a6dbe5c..91b18d4 100644
--- a/src/iso8859_10.c
+++ b/src/iso8859_10.c
@@ -237,5 +237,6 @@ OnigEncodingType OnigEncodingISO_8859_10 = {
onigenc_single_byte_left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ onigenc_always_true_is_valid_mbc_string
};
diff --git a/src/iso8859_11.c b/src/iso8859_11.c
index d671af8..518be25 100644
--- a/src/iso8859_11.c
+++ b/src/iso8859_11.c
@@ -94,5 +94,6 @@ OnigEncodingType OnigEncodingISO_8859_11 = {
onigenc_single_byte_left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ onigenc_always_true_is_valid_mbc_string
};
diff --git a/src/iso8859_13.c b/src/iso8859_13.c
index c97e24e..d1f39a2 100644
--- a/src/iso8859_13.c
+++ b/src/iso8859_13.c
@@ -226,5 +226,6 @@ OnigEncodingType OnigEncodingISO_8859_13 = {
onigenc_single_byte_left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ onigenc_always_true_is_valid_mbc_string
};
diff --git a/src/iso8859_14.c b/src/iso8859_14.c
index edf5313..3361b0d 100644
--- a/src/iso8859_14.c
+++ b/src/iso8859_14.c
@@ -239,5 +239,6 @@ OnigEncodingType OnigEncodingISO_8859_14 = {
onigenc_single_byte_left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ onigenc_always_true_is_valid_mbc_string
};
diff --git a/src/iso8859_15.c b/src/iso8859_15.c
index 24aa573..b09e876 100644
--- a/src/iso8859_15.c
+++ b/src/iso8859_15.c
@@ -233,5 +233,6 @@ OnigEncodingType OnigEncodingISO_8859_15 = {
onigenc_single_byte_left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ onigenc_always_true_is_valid_mbc_string
};
diff --git a/src/iso8859_16.c b/src/iso8859_16.c
index 4f4c2b8..29a350d 100644
--- a/src/iso8859_16.c
+++ b/src/iso8859_16.c
@@ -235,5 +235,6 @@ OnigEncodingType OnigEncodingISO_8859_16 = {
onigenc_single_byte_left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ onigenc_always_true_is_valid_mbc_string
};
diff --git a/src/iso8859_2.c b/src/iso8859_2.c
index 16c1549..9eb3536 100644
--- a/src/iso8859_2.c
+++ b/src/iso8859_2.c
@@ -233,5 +233,6 @@ OnigEncodingType OnigEncodingISO_8859_2 = {
onigenc_single_byte_left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ onigenc_always_true_is_valid_mbc_string
};
diff --git a/src/iso8859_3.c b/src/iso8859_3.c
index 549f01e..862823a 100644
--- a/src/iso8859_3.c
+++ b/src/iso8859_3.c
@@ -233,5 +233,6 @@ OnigEncodingType OnigEncodingISO_8859_3 = {
onigenc_single_byte_left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ onigenc_always_true_is_valid_mbc_string
};
diff --git a/src/iso8859_4.c b/src/iso8859_4.c
index 74b3a88..db706da 100644
--- a/src/iso8859_4.c
+++ b/src/iso8859_4.c
@@ -235,5 +235,6 @@ OnigEncodingType OnigEncodingISO_8859_4 = {
onigenc_single_byte_left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ onigenc_always_true_is_valid_mbc_string
};
diff --git a/src/iso8859_5.c b/src/iso8859_5.c
index 70c799a..0e03e9c 100644
--- a/src/iso8859_5.c
+++ b/src/iso8859_5.c
@@ -224,5 +224,6 @@ OnigEncodingType OnigEncodingISO_8859_5 = {
onigenc_single_byte_left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ onigenc_always_true_is_valid_mbc_string
};
diff --git a/src/iso8859_6.c b/src/iso8859_6.c
index e36579a..6289af5 100644
--- a/src/iso8859_6.c
+++ b/src/iso8859_6.c
@@ -94,5 +94,6 @@ OnigEncodingType OnigEncodingISO_8859_6 = {
onigenc_single_byte_left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ onigenc_always_true_is_valid_mbc_string
};
diff --git a/src/iso8859_7.c b/src/iso8859_7.c
index 87a1acd..75b520f 100644
--- a/src/iso8859_7.c
+++ b/src/iso8859_7.c
@@ -220,5 +220,6 @@ OnigEncodingType OnigEncodingISO_8859_7 = {
onigenc_single_byte_left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ onigenc_always_true_is_valid_mbc_string
};
diff --git a/src/iso8859_8.c b/src/iso8859_8.c
index e34a3bb..5f18345 100644
--- a/src/iso8859_8.c
+++ b/src/iso8859_8.c
@@ -94,5 +94,6 @@ OnigEncodingType OnigEncodingISO_8859_8 = {
onigenc_single_byte_left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ onigenc_always_true_is_valid_mbc_string
};
diff --git a/src/iso8859_9.c b/src/iso8859_9.c
index 8cbbbd6..d0c06bb 100644
--- a/src/iso8859_9.c
+++ b/src/iso8859_9.c
@@ -226,5 +226,6 @@ OnigEncodingType OnigEncodingISO_8859_9 = {
onigenc_single_byte_left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ onigenc_always_true_is_valid_mbc_string
};
diff --git a/src/koi8.c b/src/koi8.c
index 9c11776..80f89e9 100644
--- a/src/koi8.c
+++ b/src/koi8.c
@@ -248,5 +248,6 @@ OnigEncodingType OnigEncodingKOI8 = {
onigenc_single_byte_left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ onigenc_always_true_is_valid_mbc_string
};
diff --git a/src/koi8_r.c b/src/koi8_r.c
index 2f090f4..f8ef34f 100644
--- a/src/koi8_r.c
+++ b/src/koi8_r.c
@@ -210,5 +210,6 @@ OnigEncodingType OnigEncodingKOI8_R = {
onigenc_single_byte_left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ onigenc_always_true_is_valid_mbc_string
};
diff --git a/src/oniguruma.h b/src/oniguruma.h
index 6e62b50..75301ca 100644
--- a/src/oniguruma.h
+++ b/src/oniguruma.h
@@ -35,7 +35,7 @@ extern "C" {
#define ONIGURUMA
#define ONIGURUMA_VERSION_MAJOR 6
-#define ONIGURUMA_VERSION_MINOR 0
+#define ONIGURUMA_VERSION_MINOR 1
#define ONIGURUMA_VERSION_TEENY 0
#ifdef __cplusplus
@@ -103,9 +103,9 @@ extern "C" {
typedef unsigned int OnigCodePoint;
typedef unsigned char OnigUChar;
typedef unsigned int OnigCtype;
-typedef unsigned int OnigDistance;
+typedef unsigned int OnigLen;
-#define ONIG_INFINITE_DISTANCE ~((OnigDistance )0)
+#define ONIG_INFINITE_DISTANCE ~((OnigLen )0)
typedef unsigned int OnigCaseFoldType; /* case fold flag */
@@ -165,6 +165,7 @@ typedef struct OnigEncodingTypeST {
int (*is_allowed_reverse_match)(const OnigUChar* p, const OnigUChar* end);
int (*init)(void);
int (*is_initialized)(void);
+ int (*is_valid_mbc_string)(const OnigUChar* s, const OnigUChar* end);
} OnigEncodingType;
typedef OnigEncodingType* OnigEncoding;
@@ -279,6 +280,8 @@ ONIG_EXTERN OnigEncodingType OnigEncodingGB18030;
(enc)->is_allowed_reverse_match(s,end)
#define ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc,start,s) \
(enc)->left_adjust_char_head(start, s)
+#define ONIGENC_IS_VALID_MBC_STRING(enc,s,end) \
+ (enc)->is_valid_mbc_string(s,end)
#define ONIGENC_APPLY_ALL_CASE_FOLD(enc,case_fold_flag,f,arg) \
(enc)->apply_all_case_fold(case_fold_flag,f,arg)
#define ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc,case_fold_flag,p,end,acs) \
@@ -360,6 +363,8 @@ ONIG_EXTERN
int onigenc_strlen_null P_((OnigEncoding enc, const OnigUChar* p));
ONIG_EXTERN
int onigenc_str_bytelen_null P_((OnigEncoding enc, const OnigUChar* p));
+/* Public prototype: spelled with OnigUChar like the neighbouring
+ * declarations, so it does not rely on the short UChar alias being
+ * available to code that includes this header (NOTE(review): confirm how
+ * UChar is provided here). */
+ONIG_EXTERN
+int onigenc_is_valid_mbc_string P_((OnigEncoding enc, const OnigUChar* s, const OnigUChar* end));
@@ -367,6 +372,7 @@ int onigenc_str_bytelen_null P_((OnigEncoding enc, const OnigUChar* p));
/* config parameters */
#define ONIG_NREGION 10
+#define ONIG_MAX_CAPTURE_NUM 32767
#define ONIG_MAX_BACKREF_NUM 1000
#define ONIG_MAX_REPEAT_NUM 100000
#define ONIG_MAX_MULTI_BYTE_RANGES_NUM 10000
@@ -575,6 +581,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax;
#define ONIGERR_TOO_BIG_BACKREF_NUMBER -207
#define ONIGERR_INVALID_BACKREF -208
#define ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED -209
+#define ONIGERR_TOO_MANY_CAPTURES -210
#define ONIGERR_TOO_LONG_WIDE_CHAR_VALUE -212
#define ONIGERR_EMPTY_GROUP_NAME -214
#define ONIGERR_INVALID_GROUP_NAME -215
@@ -679,16 +686,16 @@ typedef struct re_pattern_buffer {
int optimize; /* optimize flag */
int threshold_len; /* search str-length for apply optimize */
int anchor; /* BEGIN_BUF, BEGIN_POS, (SEMI_)END_BUF */
- OnigDistance anchor_dmin; /* (SEMI_)END_BUF anchor distance */
- OnigDistance anchor_dmax; /* (SEMI_)END_BUF anchor distance */
+ OnigLen anchor_dmin; /* (SEMI_)END_BUF anchor distance */
+ OnigLen anchor_dmax; /* (SEMI_)END_BUF anchor distance */
int sub_anchor; /* start-anchor for exact or map */
unsigned char *exact;
unsigned char *exact_end;
unsigned char map[ONIG_CHAR_TABLE_SIZE]; /* used as BM skip or char-map */
int *int_map; /* BM skip for exact_len > 255 */
int *int_map_backward; /* BM skip for backward search */
- OnigDistance dmin; /* min-distance of exact or map */
- OnigDistance dmax; /* max-distance of exact or map */
+ OnigLen dmin; /* min-distance of exact or map */
+ OnigLen dmax; /* max-distance of exact or map */
/* regex_t link chain */
struct re_pattern_buffer* chain; /* escape compile-conflict */
@@ -735,6 +742,8 @@ void onig_free P_((OnigRegex));
ONIG_EXTERN
void onig_free_body P_((OnigRegex));
ONIG_EXTERN
+/* Onig-prefixed types and P_(( )) wrapping, matching onig_search/onig_match below. */
+int onig_scan P_((OnigRegex reg, const OnigUChar* str, const OnigUChar* end, OnigRegion* region, OnigOptionType option, int (*scan_callback)(int, int, OnigRegion*, void*), void* callback_arg));
+ONIG_EXTERN
int onig_search P_((OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegion* region, OnigOptionType option));
ONIG_EXTERN
int onig_match P_((OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* at, OnigRegion* region, OnigOptionType option));
diff --git a/src/regcomp.c b/src/regcomp.c
index 8b5b206..5c0f21f 100644
--- a/src/regcomp.c
+++ b/src/regcomp.c
@@ -89,8 +89,8 @@ swap_node(Node* a, Node* b)
}
}
-static OnigDistance
-distance_add(OnigDistance d1, OnigDistance d2)
+static OnigLen
+distance_add(OnigLen d1, OnigLen d2)
{
if (d1 == ONIG_INFINITE_DISTANCE || d2 == ONIG_INFINITE_DISTANCE)
return ONIG_INFINITE_DISTANCE;
@@ -100,8 +100,8 @@ distance_add(OnigDistance d1, OnigDistance d2)
}
}
-static OnigDistance
-distance_multiply(OnigDistance d, int m)
+static OnigLen
+distance_multiply(OnigLen d, int m)
{
if (m == 0) return 0;
@@ -2021,245 +2021,6 @@ quantifiers_memory_node_info(Node* node)
}
#endif /* USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT */
-static int
-get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env)
-{
- OnigDistance tmin;
- int r = 0;
-
- *min = 0;
- switch (NTYPE(node)) {
- case NT_BREF:
- {
- int i;
- int* backs;
- Node** nodes = SCANENV_MEM_NODES(env);
- BRefNode* br = NBREF(node);
- if (br->state & NST_RECURSION) break;
-
- backs = BACKREFS_P(br);
- if (backs[0] > env->num_mem) return ONIGERR_INVALID_BACKREF;
- r = get_min_match_length(nodes[backs[0]], min, env);
- if (r != 0) break;
- for (i = 1; i < br->back_num; i++) {
- if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
- r = get_min_match_length(nodes[backs[i]], &tmin, env);
- if (r != 0) break;
- if (*min > tmin) *min = tmin;
- }
- }
- break;
-
-#ifdef USE_SUBEXP_CALL
- case NT_CALL:
- if (IS_CALL_RECURSION(NCALL(node))) {
- EncloseNode* en = NENCLOSE(NCALL(node)->target);
- if (IS_ENCLOSE_MIN_FIXED(en))
- *min = en->min_len;
- }
- else
- r = get_min_match_length(NCALL(node)->target, min, env);
- break;
-#endif
-
- case NT_LIST:
- do {
- r = get_min_match_length(NCAR(node), &tmin, env);
- if (r == 0) *min += tmin;
- } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
- break;
-
- case NT_ALT:
- {
- Node *x, *y;
- y = node;
- do {
- x = NCAR(y);
- r = get_min_match_length(x, &tmin, env);
- if (r != 0) break;
- if (y == node) *min = tmin;
- else if (*min > tmin) *min = tmin;
- } while (r == 0 && IS_NOT_NULL(y = NCDR(y)));
- }
- break;
-
- case NT_STR:
- {
- StrNode* sn = NSTR(node);
- *min = sn->end - sn->s;
- }
- break;
-
- case NT_CTYPE:
- *min = 1;
- break;
-
- case NT_CCLASS:
- case NT_CANY:
- *min = 1;
- break;
-
- case NT_QTFR:
- {
- QtfrNode* qn = NQTFR(node);
-
- if (qn->lower > 0) {
- r = get_min_match_length(qn->target, min, env);
- if (r == 0)
- *min = distance_multiply(*min, qn->lower);
- }
- }
- break;
-
- case NT_ENCLOSE:
- {
- EncloseNode* en = NENCLOSE(node);
- switch (en->type) {
- case ENCLOSE_MEMORY:
-#ifdef USE_SUBEXP_CALL
- if (IS_ENCLOSE_MIN_FIXED(en))
- *min = en->min_len;
- else {
- r = get_min_match_length(en->target, min, env);
- if (r == 0) {
- en->min_len = *min;
- SET_ENCLOSE_STATUS(node, NST_MIN_FIXED);
- }
- }
- break;
-#endif
- case ENCLOSE_OPTION:
- case ENCLOSE_STOP_BACKTRACK:
- r = get_min_match_length(en->target, min, env);
- break;
- }
- }
- break;
-
- case NT_ANCHOR:
- default:
- break;
- }
-
- return r;
-}
-
-static int
-get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env)
-{
- OnigDistance tmax;
- int r = 0;
-
- *max = 0;
- switch (NTYPE(node)) {
- case NT_LIST:
- do {
- r = get_max_match_length(NCAR(node), &tmax, env);
- if (r == 0)
- *max = distance_add(*max, tmax);
- } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
- break;
-
- case NT_ALT:
- do {
- r = get_max_match_length(NCAR(node), &tmax, env);
- if (r == 0 && *max < tmax) *max = tmax;
- } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
- break;
-
- case NT_STR:
- {
- StrNode* sn = NSTR(node);
- *max = sn->end - sn->s;
- }
- break;
-
- case NT_CTYPE:
- *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
- break;
-
- case NT_CCLASS:
- case NT_CANY:
- *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
- break;
-
- case NT_BREF:
- {
- int i;
- int* backs;
- Node** nodes = SCANENV_MEM_NODES(env);
- BRefNode* br = NBREF(node);
- if (br->state & NST_RECURSION) {
- *max = ONIG_INFINITE_DISTANCE;
- break;
- }
- backs = BACKREFS_P(br);
- for (i = 0; i < br->back_num; i++) {
- if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
- r = get_max_match_length(nodes[backs[i]], &tmax, env);
- if (r != 0) break;
- if (*max < tmax) *max = tmax;
- }
- }
- break;
-
-#ifdef USE_SUBEXP_CALL
- case NT_CALL:
- if (! IS_CALL_RECURSION(NCALL(node)))
- r = get_max_match_length(NCALL(node)->target, max, env);
- else
- *max = ONIG_INFINITE_DISTANCE;
- break;
-#endif
-
- case NT_QTFR:
- {
- QtfrNode* qn = NQTFR(node);
-
- if (qn->upper != 0) {
- r = get_max_match_length(qn->target, max, env);
- if (r == 0 && *max != 0) {
- if (! IS_REPEAT_INFINITE(qn->upper))
- *max = distance_multiply(*max, qn->upper);
- else
- *max = ONIG_INFINITE_DISTANCE;
- }
- }
- }
- break;
-
- case NT_ENCLOSE:
- {
- EncloseNode* en = NENCLOSE(node);
- switch (en->type) {
- case ENCLOSE_MEMORY:
-#ifdef USE_SUBEXP_CALL
- if (IS_ENCLOSE_MAX_FIXED(en))
- *max = en->max_len;
- else {
- r = get_max_match_length(en->target, max, env);
- if (r == 0) {
- en->max_len = *max;
- SET_ENCLOSE_STATUS(node, NST_MAX_FIXED);
- }
- }
- break;
-#endif
- case ENCLOSE_OPTION:
- case ENCLOSE_STOP_BACKTRACK:
- r = get_max_match_length(en->target, max, env);
- break;
- }
- }
- break;
-
- case NT_ANCHOR:
- default:
- break;
- }
-
- return r;
-}
#define GET_CHAR_LEN_VARLEN -1
#define GET_CHAR_LEN_TOP_ALT_VARLEN -2
@@ -2706,6 +2467,257 @@ check_type_tree(Node* node, int type_mask, int enclose_mask, int anchor_mask)
return r;
}
+/*
+ * Compute a lower bound for the length matched by the subtree at node and
+ * store it in *min (which is reset to 0 on entry).
+ * Returns 0 on success or a non-zero error code; ONIGERR_INVALID_BACKREF is
+ * returned when a back reference number exceeds env->num_mem.
+ * Node types not listed in the switch (including anchors) contribute 0.
+ */
+static int
+get_min_len(Node* node, OnigLen *min, ScanEnv* env)
+{
+ OnigLen tmin;
+ int r = 0;
+
+ *min = 0;
+ switch (NTYPE(node)) {
+ case NT_BREF:
+ {
+ int i;
+ int* backs;
+ Node** nodes = SCANENV_MEM_NODES(env);
+ BRefNode* br = NBREF(node);
+ /* recursive back reference: keep the 0 set above */
+ if (br->state & NST_RECURSION) break;
+
+ /* smallest minimum over all groups this back reference may refer to */
+ backs = BACKREFS_P(br);
+ if (backs[0] > env->num_mem) return ONIGERR_INVALID_BACKREF;
+ r = get_min_len(nodes[backs[0]], min, env);
+ if (r != 0) break;
+ for (i = 1; i < br->back_num; i++) {
+ if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
+ r = get_min_len(nodes[backs[i]], &tmin, env);
+ if (r != 0) break;
+ if (*min > tmin) *min = tmin;
+ }
+ }
+ break;
+
+#ifdef USE_SUBEXP_CALL
+ case NT_CALL:
+ if (IS_CALL_RECURSION(NCALL(node))) {
+ /* recursive call: use the target's cached minimum if there is one,
+ otherwise leave *min at 0 */
+ EncloseNode* en = NENCLOSE(NCALL(node)->target);
+ if (IS_ENCLOSE_MIN_FIXED(en))
+ *min = en->min_len;
+ }
+ else
+ r = get_min_len(NCALL(node)->target, min, env);
+ break;
+#endif
+
+ case NT_LIST:
+ /* concatenation: add up the element minimums.  distance_add() is used,
+ as in get_max_len(), instead of a raw unsigned "+=" that could wrap
+ around or step past ONIG_INFINITE_DISTANCE. */
+ do {
+ r = get_min_len(NCAR(node), &tmin, env);
+ if (r == 0) *min = distance_add(*min, tmin);
+ } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
+ break;
+
+ case NT_ALT:
+ {
+ /* alternation: smallest minimum of all branches; the first branch
+ seeds *min */
+ Node *x, *y;
+ y = node;
+ do {
+ x = NCAR(y);
+ r = get_min_len(x, &tmin, env);
+ if (r != 0) break;
+ if (y == node) *min = tmin;
+ else if (*min > tmin) *min = tmin;
+ } while (r == 0 && IS_NOT_NULL(y = NCDR(y)));
+ }
+ break;
+
+ case NT_STR:
+ {
+ StrNode* sn = NSTR(node);
+ *min = sn->end - sn->s;
+ }
+ break;
+
+ case NT_CTYPE:
+ *min = 1;
+ break;
+
+ case NT_CCLASS:
+ case NT_CANY:
+ *min = 1;
+ break;
+
+ case NT_QTFR:
+ {
+ QtfrNode* qn = NQTFR(node);
+
+ /* a lower bound of 0 repetitions keeps *min at 0 */
+ if (qn->lower > 0) {
+ r = get_min_len(qn->target, min, env);
+ if (r == 0)
+ *min = distance_multiply(*min, qn->lower);
+ }
+ }
+ break;
+
+ case NT_ENCLOSE:
+ {
+ EncloseNode* en = NENCLOSE(node);
+ switch (en->type) {
+ case ENCLOSE_MEMORY:
+ if (IS_ENCLOSE_MIN_FIXED(en))
+ *min = en->min_len;
+ else {
+ /* NST_MARK1 is set while this group's target is being measured;
+ seeing it here means the group was re-entered */
+ if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
+ *min = 0; /* recursive */
+ else {
+ SET_ENCLOSE_STATUS(node, NST_MARK1);
+ r = get_min_len(en->target, min, env);
+ CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
+ if (r == 0) {
+ /* cache the result for later queries on this group */
+ en->min_len = *min;
+ SET_ENCLOSE_STATUS(node, NST_MIN_FIXED);
+ }
+ }
+ }
+ break;
+
+ case ENCLOSE_OPTION:
+ case ENCLOSE_STOP_BACKTRACK:
+ r = get_min_len(en->target, min, env);
+ break;
+ }
+ }
+ break;
+
+ case NT_ANCHOR:
+ default:
+ break;
+ }
+
+ return r;
+}
+
+/*
+ * Compute an upper bound for the length matched by the subtree at node and
+ * store it in *max (which is reset to 0 on entry).  ONIG_INFINITE_DISTANCE
+ * is stored when no finite bound is derived (recursion, unbounded repeat).
+ * Returns 0 on success or a non-zero error code; ONIGERR_INVALID_BACKREF is
+ * returned when a back reference number exceeds env->num_mem.
+ * Node types not listed in the switch (including anchors) contribute 0.
+ */
+static int
+get_max_len(Node* node, OnigLen *max, ScanEnv* env)
+{
+ OnigLen tmax;
+ int r = 0;
+
+ *max = 0;
+ switch (NTYPE(node)) {
+ case NT_LIST:
+ /* concatenation: add up the element maximums via distance_add() */
+ do {
+ r = get_max_len(NCAR(node), &tmax, env);
+ if (r == 0)
+ *max = distance_add(*max, tmax);
+ } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
+ break;
+
+ case NT_ALT:
+ /* alternation: largest maximum of all branches */
+ do {
+ r = get_max_len(NCAR(node), &tmax, env);
+ if (r == 0 && *max < tmax) *max = tmax;
+ } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
+ break;
+
+ case NT_STR:
+ {
+ StrNode* sn = NSTR(node);
+ *max = sn->end - sn->s;
+ }
+ break;
+
+ case NT_CTYPE:
+ *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
+ break;
+
+ case NT_CCLASS:
+ case NT_CANY:
+ *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
+ break;
+
+ case NT_BREF:
+ {
+ int i;
+ int* backs;
+ Node** nodes = SCANENV_MEM_NODES(env);
+ BRefNode* br = NBREF(node);
+ /* recursive back reference: no finite bound */
+ if (br->state & NST_RECURSION) {
+ *max = ONIG_INFINITE_DISTANCE;
+ break;
+ }
+ /* largest maximum over all groups this back reference may refer to */
+ backs = BACKREFS_P(br);
+ for (i = 0; i < br->back_num; i++) {
+ if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
+ r = get_max_len(nodes[backs[i]], &tmax, env);
+ if (r != 0) break;
+ if (*max < tmax) *max = tmax;
+ }
+ }
+ break;
+
+#ifdef USE_SUBEXP_CALL
+ case NT_CALL:
+ /* a recursive call gets no finite bound */
+ if (! IS_CALL_RECURSION(NCALL(node)))
+ r = get_max_len(NCALL(node)->target, max, env);
+ else
+ *max = ONIG_INFINITE_DISTANCE;
+ break;
+#endif
+
+ case NT_QTFR:
+ {
+ QtfrNode* qn = NQTFR(node);
+
+ /* an upper bound of 0 repetitions keeps *max at 0 */
+ if (qn->upper != 0) {
+ r = get_max_len(qn->target, max, env);
+ /* a target whose maximum is 0 stays 0 even under an infinite repeat */
+ if (r == 0 && *max != 0) {
+ if (! IS_REPEAT_INFINITE(qn->upper))
+ *max = distance_multiply(*max, qn->upper);
+ else
+ *max = ONIG_INFINITE_DISTANCE;
+ }
+ }
+ }
+ break;
+
+ case NT_ENCLOSE:
+ {
+ EncloseNode* en = NENCLOSE(node);
+ switch (en->type) {
+ case ENCLOSE_MEMORY:
+ if (IS_ENCLOSE_MAX_FIXED(en))
+ *max = en->max_len;
+ else {
+ /* NST_MARK1 is set while this group's target is being measured;
+ seeing it here means the group was re-entered, so no finite
+ bound is reported */
+ if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
+ *max = ONIG_INFINITE_DISTANCE;
+ else {
+ SET_ENCLOSE_STATUS(node, NST_MARK1);
+ r = get_max_len(en->target, max, env);
+ CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
+ if (r == 0) {
+ /* cache the result for later queries on this group */
+ en->max_len = *max;
+ SET_ENCLOSE_STATUS(node, NST_MAX_FIXED);
+ }
+ }
+ }
+ break;
+
+ case ENCLOSE_OPTION:
+ case ENCLOSE_STOP_BACKTRACK:
+ r = get_max_len(en->target, max, env);
+ break;
+ }
+ }
+ break;
+
+ case NT_ANCHOR:
+ default:
+ break;
+ }
+
+ return r;
+}
+
+
#ifdef USE_SUBEXP_CALL
#define RECURSION_EXIST 1
@@ -2722,7 +2734,7 @@ subexp_inf_recursive_check(Node* node, ScanEnv* env, int head)
case NT_LIST:
{
Node *x;
- OnigDistance min;
+ OnigLen min;
int ret;
x = node;
@@ -2731,7 +2743,7 @@ subexp_inf_recursive_check(Node* node, ScanEnv* env, int head)
if (ret < 0 || ret == RECURSION_INFINITE) return ret;
r |= ret;
if (head) {
- ret = get_min_match_length(NCAR(x), &min, env);
+ ret = get_min_len(NCAR(x), &min, env);
if (ret != 0) return ret;
if (min != 0) head = 0;
}
@@ -3094,6 +3106,8 @@ divide_look_behind_alternatives(Node* node)
AnchorNode* an = NANCHOR(node);
int anc_type = an->type;
+ /* fprintf(stderr, "divide_look_behind: %d\n", (int )node); */
+
head = an->target;
np = NCAR(head);
swap_node(node, head);
@@ -3123,6 +3137,8 @@ setup_look_behind(Node* node, regex_t* reg, ScanEnv* env)
int r, len;
AnchorNode* an = NANCHOR(node);
+ /* fprintf(stderr, "setup_look_behind: %x\n", (int )node); */
+
r = get_char_length_tree(an->target, reg, &len);
if (r == 0)
an->char_len = len;
@@ -3719,7 +3735,7 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
case NT_QTFR:
{
- OnigDistance d;
+ OnigLen d;
QtfrNode* qn = NQTFR(node);
Node* target = qn->target;
@@ -3728,7 +3744,7 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
}
if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) {
- r = get_min_match_length(target, &d, env);
+ r = get_min_len(target, &d, env);
if (r) break;
if (d == 0) {
qn->target_empty_info = NQ_TARGET_IS_EMPTY;
@@ -3740,7 +3756,7 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
}
#endif
#if 0
- r = get_max_match_length(target, &d, env);
+ r = get_max_len(target, &d, env);
if (r == 0 && d == 0) {
/* ()* ==> ()?, ()+ ==> () */
qn->upper = 1;
@@ -3855,8 +3871,8 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
( BIT_NT_LIST | BIT_NT_ALT | BIT_NT_STR | BIT_NT_CCLASS | BIT_NT_CTYPE | \
BIT_NT_CANY | BIT_NT_ANCHOR | BIT_NT_ENCLOSE | BIT_NT_QTFR | BIT_NT_CALL )
-#define ALLOWED_ENCLOSE_IN_LB ( ENCLOSE_MEMORY )
-#define ALLOWED_ENCLOSE_IN_LB_NOT 0
+#define ALLOWED_ENCLOSE_IN_LB ( ENCLOSE_MEMORY | ENCLOSE_OPTION )
+#define ALLOWED_ENCLOSE_IN_LB_NOT ENCLOSE_OPTION
#define ALLOWED_ANCHOR_IN_LB \
( ANCHOR_LOOK_BEHIND | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION )
@@ -3869,9 +3885,9 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
ALLOWED_ENCLOSE_IN_LB, ALLOWED_ANCHOR_IN_LB);
if (r < 0) return r;
if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
- r = setup_look_behind(node, reg, env);
- if (r != 0) return r;
r = setup_tree(an->target, reg, state, env);
+ if (r != 0) return r;
+ r = setup_look_behind(node, reg, env);
}
break;
@@ -3881,9 +3897,9 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
ALLOWED_ENCLOSE_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT);
if (r < 0) return r;
if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
- r = setup_look_behind(node, reg, env);
- if (r != 0) return r;
r = setup_tree(an->target, reg, (state | IN_NOT), env);
+ if (r != 0) return r;
+ r = setup_look_behind(node, reg, env);
}
break;
}
@@ -3927,8 +3943,8 @@ set_bm_skip(UChar* s, UChar* end, OnigEncoding enc ARG_UNUSED,
#define OPT_EXACT_MAXLEN 24
typedef struct {
- OnigDistance min; /* min byte length */
- OnigDistance max; /* max byte length */
+ OnigLen min; /* min byte length */
+ OnigLen max; /* max byte length */
} MinMaxLen;
typedef struct {
@@ -4052,7 +4068,7 @@ is_equal_mml(MinMaxLen* a, MinMaxLen* b)
static void
-set_mml(MinMaxLen* mml, OnigDistance min, OnigDistance max)
+set_mml(MinMaxLen* mml, OnigLen min, OnigLen max)
{
mml->min = min;
mml->max = max;
@@ -4080,7 +4096,7 @@ add_mml(MinMaxLen* to, MinMaxLen* from)
#if 0
static void
-add_len_mml(MinMaxLen* to, OnigDistance len)
+add_len_mml(MinMaxLen* to, OnigLen len)
{
to->min = distance_add(to->min, len);
to->max = distance_add(to->max, len);
@@ -4115,7 +4131,7 @@ copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from)
static void
concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right,
- OnigDistance left_len, OnigDistance right_len)
+ OnigLen left_len, OnigLen right_len)
{
clear_opt_anc_info(to);
@@ -4628,8 +4644,8 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
/* no need to check ignore case. (setted in setup_tree()) */
if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
- OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
- OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
+ OnigLen min = ONIGENC_MBC_MINLEN(env->enc);
+ OnigLen max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
set_mml(&opt->len, min, max);
}
@@ -4682,8 +4698,8 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
case NT_CANY:
{
- OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
- OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
+ OnigLen min = ONIGENC_MBC_MINLEN(env->enc);
+ OnigLen max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
set_mml(&opt->len, min, max);
}
break;
@@ -4729,7 +4745,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
{
int i;
int* backs;
- OnigDistance min, max, tmin, tmax;
+ OnigLen min, max, tmin, tmax;
Node** nodes = SCANENV_MEM_NODES(env->scan_env);
BRefNode* br = NBREF(node);
@@ -4738,14 +4754,14 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
break;
}
backs = BACKREFS_P(br);
- r = get_min_match_length(nodes[backs[0]], &min, env->scan_env);
+ r = get_min_len(nodes[backs[0]], &min, env->scan_env);
if (r != 0) break;
- r = get_max_match_length(nodes[backs[0]], &max, env->scan_env);
+ r = get_max_len(nodes[backs[0]], &max, env->scan_env);
if (r != 0) break;
for (i = 1; i < br->back_num; i++) {
- r = get_min_match_length(nodes[backs[i]], &tmin, env->scan_env);
+ r = get_min_len(nodes[backs[i]], &tmin, env->scan_env);
if (r != 0) break;
- r = get_max_match_length(nodes[backs[i]], &tmax, env->scan_env);
+ r = get_max_len(nodes[backs[i]], &tmax, env->scan_env);
if (r != 0) break;
if (min > tmin) min = tmin;
if (max < tmax) max = tmax;
@@ -4770,7 +4786,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
case NT_QTFR:
{
int i;
- OnigDistance min, max;
+ OnigLen min, max;
NodeOptInfo nopt;
QtfrNode* qn = NQTFR(node);
@@ -4839,7 +4855,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
#ifdef USE_SUBEXP_CALL
en->opt_count++;
if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
- OnigDistance min, max;
+ OnigLen min, max;
min = 0;
max = ONIG_INFINITE_DISTANCE;
@@ -5059,7 +5075,7 @@ static void print_enc_string(FILE* fp, OnigEncoding enc,
}
static void
-print_distance_range(FILE* f, OnigDistance a, OnigDistance b)
+print_distance_range(FILE* f, OnigLen a, OnigLen b)
{
if (a == ONIG_INFINITE_DISTANCE)
fputs("inf", f);
@@ -5147,7 +5163,7 @@ print_optimize_info(FILE* f, regex_t* reg)
for (p = reg->exact; p < reg->exact_end; p++) {
fputc(*p, f);
}
- fprintf(f, "]: length: %d\n", (reg->exact_end - reg->exact));
+ fprintf(f, "]: length: %ld\n", (reg->exact_end - reg->exact));
}
else if (reg->optimize & ONIG_OPTIMIZE_MAP) {
int c, i, n = 0;
@@ -5431,6 +5447,8 @@ onig_reg_init(regex_t* reg, OnigOptionType option,
r = onig_initialize_encoding(enc);
if (r != 0)
return ONIGERR_FAIL_TO_INITIALIZE;
+
+ onig_warning("You didn't call onig_initialize() explicitly");
#endif
}
@@ -5935,7 +5953,7 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp,
GET_POINTER_INC(cc, bp);
n = bitset_on_num(cc->bs);
- fprintf(f, ":%u:%d", (unsigned int )cc, n);
+ fprintf(f, ":%p:%d", cc, n);
}
break;
@@ -6067,9 +6085,9 @@ print_indent_tree(FILE* f, Node* node, int indent)
case NT_LIST:
case NT_ALT:
if (NTYPE(node) == NT_LIST)
- fprintf(f, "<list:%x>\n", (int )node);
+ fprintf(f, "<list:%p>\n", node);
else
- fprintf(f, "<alt:%x>\n", (int )node);
+ fprintf(f, "<alt:%p>\n", node);
print_indent_tree(f, NCAR(node), indent + add);
while (IS_NOT_NULL(node = NCDR(node))) {
@@ -6082,8 +6100,8 @@ print_indent_tree(FILE* f, Node* node, int indent)
break;
case NT_STR:
- fprintf(f, "<string%s:%x>",
- (NSTRING_IS_RAW(node) ? "-raw" : ""), (int )node);
+ fprintf(f, "<string%s:%p>",
+ (NSTRING_IS_RAW(node) ? "-raw" : ""), node);
for (p = NSTR(node)->s; p < NSTR(node)->end; p++) {
if (*p >= 0x20 && *p < 0x7f)
fputc(*p, f);
@@ -6094,7 +6112,7 @@ print_indent_tree(FILE* f, Node* node, int indent)
break;
case NT_CCLASS:
- fprintf(f, "<cclass:%x>", (int )node);
+ fprintf(f, "<cclass:%p>", node);
if (IS_NCCLASS_NOT(NCCLASS(node))) fputs(" not", f);
if (NCCLASS(node)->mbuf) {
BBuf* bbuf = NCCLASS(node)->mbuf;
@@ -6106,7 +6124,7 @@ print_indent_tree(FILE* f, Node* node, int indent)
break;
case NT_CTYPE:
- fprintf(f, "<ctype:%x> ", (int )node);
+ fprintf(f, "<ctype:%p> ", node);
switch (NCTYPE(node)->ctype) {
case ONIGENC_CTYPE_WORD:
if (NCTYPE(node)->not != 0)
@@ -6122,11 +6140,11 @@ print_indent_tree(FILE* f, Node* node, int indent)
break;
case NT_CANY:
- fprintf(f, "<anychar:%x>", (int )node);
+ fprintf(f, "<anychar:%p>", node);
break;
case NT_ANCHOR:
- fprintf(f, "<anchor:%x> ", (int )node);
+ fprintf(f, "<anchor:%p> ", node);
switch (NANCHOR(node)->type) {
case ANCHOR_BEGIN_BUF: fputs("begin buf", f); break;
case ANCHOR_END_BUF: fputs("end buf", f); break;
@@ -6141,10 +6159,22 @@ print_indent_tree(FILE* f, Node* node, int indent)
case ANCHOR_WORD_BEGIN: fputs("word begin", f); break;
case ANCHOR_WORD_END: fputs("word end", f); break;
#endif
- case ANCHOR_PREC_READ: fputs("prec read", f); break;
- case ANCHOR_PREC_READ_NOT: fputs("prec read not", f); break;
- case ANCHOR_LOOK_BEHIND: fputs("look_behind", f); break;
- case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); break;
+ case ANCHOR_PREC_READ:
+ fprintf(f, "prec read\n");
+ print_indent_tree(f, NANCHOR(node)->target, indent + add);
+ break;
+ case ANCHOR_PREC_READ_NOT:
+ fprintf(f, "prec read not\n");
+ print_indent_tree(f, NANCHOR(node)->target, indent + add);
+ break;
+ case ANCHOR_LOOK_BEHIND:
+ fprintf(f, "look behind\n");
+ print_indent_tree(f, NANCHOR(node)->target, indent + add);
+ break;
+ case ANCHOR_LOOK_BEHIND_NOT:
+ fprintf(f, "look behind not\n");
+ print_indent_tree(f, NANCHOR(node)->target, indent + add);
+ break;
default:
fprintf(f, "ERROR: undefined anchor type.\n");
@@ -6157,7 +6187,7 @@ print_indent_tree(FILE* f, Node* node, int indent)
int* p;
BRefNode* br = NBREF(node);
p = BACKREFS_P(br);
- fprintf(f, "<backref:%x>", (int )node);
+ fprintf(f, "<backref:%p>", node);
for (i = 0; i < br->back_num; i++) {
if (i > 0) fputs(", ", f);
fprintf(f, "%d", p[i]);
@@ -6169,21 +6199,21 @@ print_indent_tree(FILE* f, Node* node, int indent)
case NT_CALL:
{
CallNode* cn = NCALL(node);
- fprintf(f, "<call:%x>", (int )node);
+ fprintf(f, "<call:%p>", node);
p_string(f, cn->name_end - cn->name, cn->name);
}
break;
#endif
case NT_QTFR:
- fprintf(f, "<quantifier:%x>{%d,%d}%s\n", (int )node,
+ fprintf(f, "<quantifier:%p>{%d,%d}%s\n", node,
NQTFR(node)->lower, NQTFR(node)->upper,
(NQTFR(node)->greedy ? "" : "?"));
print_indent_tree(f, NQTFR(node)->target, indent + add);
break;
case NT_ENCLOSE:
- fprintf(f, "<enclose:%x> ", (int )node);
+ fprintf(f, "<enclose:%p> ", node);
switch (NENCLOSE(node)->type) {
case ENCLOSE_OPTION:
fprintf(f, "option:%d", NENCLOSE(node)->option);
diff --git a/src/regenc.c b/src/regenc.c
index 01bfd1d..554a622 100644
--- a/src/regenc.c
+++ b/src/regenc.c
@@ -2,7 +2,7 @@
regenc.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
+ * Copyright (c) 2002-2016 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -108,6 +108,20 @@ onigenc_step_back(OnigEncoding enc, const UChar* start, const UChar* s, int n)
return (UChar* )s;
}
+#if 0
+extern int
+onigenc_mbc_enc_len_end(OnigEncoding enc, const UChar* p, const UChar* end)
+{
+ int len;
+ int n;
+
+ len = ONIGENC_MBC_ENC_LEN(enc, p);
+ n = (int )(end - p);
+
+ return (n < len ? n : len);
+}
+#endif
+
extern UChar*
onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n)
{
@@ -649,6 +663,33 @@ onigenc_always_false_is_allowed_reverse_match(const UChar* s ARG_UNUSED,
return FALSE;
}
+extern int
+onigenc_always_true_is_valid_mbc_string(const UChar* s ARG_UNUSED,
+ const UChar* end ARG_UNUSED)
+{
+ return TRUE;
+}
+
+extern int
+onigenc_length_check_is_valid_mbc_string(OnigEncoding enc,
+ const UChar* p, const UChar* end)
+{
+ while (p < end) {
+ p += enclen(enc, p);
+ }
+
+ if (p != end)
+ return FALSE;
+ else
+ return TRUE;
+}
+
+extern int
+onigenc_is_valid_mbc_string(OnigEncoding enc, const UChar* s, const UChar* end)
+{
+ return ONIGENC_IS_VALID_MBC_STRING(enc, s, end);
+}
+
extern OnigCodePoint
onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar* p, const UChar* end)
{
diff --git a/src/regenc.h b/src/regenc.h
index 65eb17e..49227fa 100644
--- a/src/regenc.h
+++ b/src/regenc.h
@@ -71,7 +71,7 @@ typedef struct {
#define ONIG_CHECK_NULL_RETURN(p) if (ONIG_IS_NULL(p)) return NULL
#define ONIG_CHECK_NULL_RETURN_VAL(p,val) if (ONIG_IS_NULL(p)) return (val)
-#define enclen(enc,p) ONIGENC_MBC_ENC_LEN(enc,p)
+#define enclen(enc,p) ONIGENC_MBC_ENC_LEN(enc,p)
/* character types bit flag */
#define BIT_CTYPE_NEWLINE (1<< ONIGENC_CTYPE_NEWLINE)
@@ -133,6 +133,8 @@ ONIG_EXTERN int onigenc_single_byte_code_to_mbc P_((OnigCodePoint code, UChar *b
ONIG_EXTERN UChar* onigenc_single_byte_left_adjust_char_head P_((const UChar* start, const UChar* s));
ONIG_EXTERN int onigenc_always_true_is_allowed_reverse_match P_((const UChar* s, const UChar* end));
ONIG_EXTERN int onigenc_always_false_is_allowed_reverse_match P_((const UChar* s, const UChar* end));
+ONIG_EXTERN int onigenc_always_true_is_valid_mbc_string P_((const UChar* s, const UChar* end));
+ONIG_EXTERN int onigenc_length_check_is_valid_mbc_string P_((OnigEncoding enc, const UChar* s, const UChar* end));
/* methods for multi byte encoding */
ONIG_EXTERN OnigCodePoint onigenc_mbn_mbc_to_code P_((OnigEncoding enc, const UChar* p, const UChar* end));
@@ -227,6 +229,7 @@ ONIG_EXTERN const UChar OnigEncAsciiToLowerCaseTable[];
ONIG_EXTERN const UChar OnigEncAsciiToUpperCaseTable[];
ONIG_EXTERN const unsigned short OnigEncAsciiCtypeTable[];
+
#define ONIGENC_IS_ASCII_CODE(code) ((code) < 0x80)
#define ONIGENC_ASCII_CODE_TO_LOWER_CASE(c) OnigEncAsciiToLowerCaseTable[c]
#define ONIGENC_ASCII_CODE_TO_UPPER_CASE(c) OnigEncAsciiToUpperCaseTable[c]
diff --git a/src/regerror.c b/src/regerror.c
index 16009bb..05fc9d8 100644
--- a/src/regerror.c
+++ b/src/regerror.c
@@ -140,6 +140,8 @@ onig_error_code_to_format(int code)
#endif
case ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED:
p = "numbered backref/call is not allowed. (use name)"; break;
+ case ONIGERR_TOO_MANY_CAPTURES:
+ p = "too many captures"; break;
case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
p = "too big wide-char value"; break;
case ONIGERR_TOO_LONG_WIDE_CHAR_VALUE:
diff --git a/src/regexec.c b/src/regexec.c
index 2c768e1..70ac89e 100644
--- a/src/regexec.c
+++ b/src/regexec.c
@@ -2,7 +2,7 @@
regexec.c - Oniguruma (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
+ * Copyright (c) 2002-2016 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -327,19 +327,21 @@ onig_region_copy(OnigRegion* to, OnigRegion* from)
#define STK_MASK_MEM_END_OR_MARK 0x8000 /* MEM_END or MEM_END_MARK */
#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE
-#define MATCH_ARG_INIT(msa, arg_option, arg_region, arg_start) do {\
+#define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start) do {\
(msa).stack_p = (void* )0;\
(msa).options = (arg_option);\
(msa).region = (arg_region);\
(msa).start = (arg_start);\
(msa).best_len = ONIG_MISMATCH;\
+ (msa).ptr_num = (reg)->num_repeat + (reg)->num_mem * 2;\
} while(0)
#else
-#define MATCH_ARG_INIT(msa, arg_option, arg_region, arg_start) do {\
+#define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start) do {\
(msa).stack_p = (void* )0;\
(msa).options = (arg_option);\
(msa).region = (arg_region);\
(msa).start = (arg_start);\
+ (msa).ptr_num = (reg)->num_repeat + (reg)->num_mem * 2;\
} while(0)
#endif
@@ -369,7 +371,7 @@ onig_region_copy(OnigRegion* to, OnigRegion* from)
(msa).state_check_buff = (void* )0;\
(msa).state_check_buff_size = 0;\
}\
- } while(0)
+} while(0)
#define MATCH_ARG_FREE(msa) do {\
if ((msa).stack_p) xfree((msa).stack_p);\
@@ -383,32 +385,59 @@ onig_region_copy(OnigRegion* to, OnigRegion* from)
#endif
+#define ALLOCA_PTR_NUM_LIMIT 50
-#define STACK_INIT(alloc_addr, ptr_num, stack_num) do {\
+#define STACK_INIT(stack_num) do {\
if (msa->stack_p) {\
- alloc_addr = (char* )xalloca(sizeof(char*) * (ptr_num));\
- stk_alloc = (OnigStackType* )(msa->stack_p);\
- stk_base = stk_alloc;\
+ is_alloca = 0; /* reuse the heap buffer saved by an earlier STACK_SAVE */\
+ alloc_base = msa->stack_p;\
+ stk_base = (OnigStackType* )(alloc_base\
+ + (sizeof(OnigStackIndex) * msa->ptr_num));\
stk = stk_base;\
stk_end = stk_base + msa->stack_n;\
}\
+ else if (msa->ptr_num > ALLOCA_PTR_NUM_LIMIT) {\
+ is_alloca = 0; /* index area too large for alloca: take it from the heap */\
+ alloc_base = (char* )xmalloc(sizeof(OnigStackIndex) * msa->ptr_num\
+ + sizeof(OnigStackType) * (stack_num));\
+ if (IS_NULL(alloc_base)) return ONIGERR_MEMORY; /* nothing allocated yet, safe to bail out */\
+ stk_base = (OnigStackType* )(alloc_base\
+ + (sizeof(OnigStackIndex) * msa->ptr_num));\
+ stk = stk_base; stk_end = stk_base + (stack_num);\
+ }\
else {\
- alloc_addr = (char* )xalloca(sizeof(char*) * (ptr_num)\
- + sizeof(OnigStackType) * (stack_num));\
- stk_alloc = (OnigStackType* )(alloc_addr + sizeof(char*) * (ptr_num));\
- stk_base = stk_alloc;\
+ is_alloca = 1; /* index area + stack live together in one alloca block */\
+ alloc_base = (char* )xalloca(sizeof(OnigStackIndex) * msa->ptr_num\
+ + sizeof(OnigStackType) * (stack_num));\
+ stk_base = (OnigStackType* )(alloc_base\
+ + (sizeof(OnigStackIndex) * msa->ptr_num));\
stk = stk_base;\
stk_end = stk_base + (stack_num);\
}\
-} while(0)
+} while(0)
+
#define STACK_SAVE do{\
- if (stk_base != stk_alloc) {\
- msa->stack_p = stk_base;\
- msa->stack_n = stk_end - stk_base;\
+ msa->stack_n = stk_end - stk_base;\
+ if (is_alloca != 0) { /* alloca memory dies with the frame: move it to the heap */\
+ size_t size = sizeof(OnigStackIndex) * msa->ptr_num \
+ + sizeof(OnigStackType) * msa->stack_n;\
+ msa->stack_p = xmalloc(size);\
+ if (msa->stack_p) xmemcpy(msa->stack_p, alloc_base, size); /* on OOM leave stack_p NULL instead of writing through it */\
+ }\
+ else {\
+ msa->stack_p = alloc_base; /* already heap memory: just hand it over */\
};\
} while(0)
+#define UPDATE_FOR_STACK_REALLOC do{\
+ repeat_stk = (OnigStackIndex* )alloc_base;\
+ mem_start_stk = (OnigStackIndex* )(repeat_stk + reg->num_repeat);\
+ mem_end_stk = mem_start_stk + num_mem;\
+ mem_start_stk--; /* for index start from 1 */\
+ mem_end_stk--; /* for index start from 1 */\
+} while(0)
+
static unsigned int MatchStackLimitSize = DEFAULT_MATCH_STACK_LIMIT_SIZE;
extern unsigned int
@@ -425,50 +454,65 @@ onig_set_match_stack_limit_size(unsigned int size)
}
static int
-stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
- OnigStackType** arg_stk, OnigStackType* stk_alloc, OnigMatchArg* msa)
+stack_double(int is_alloca, char** arg_alloc_base,
+ OnigStackType** arg_stk_base,
+ OnigStackType** arg_stk_end, OnigStackType** arg_stk,
+ OnigMatchArg* msa)
{
unsigned int n;
- OnigStackType *x, *stk_base, *stk_end, *stk;
+ int used;
+ size_t size;
+ char* alloc_base;
+ char* new_alloc_base;
+ OnigStackType *stk_base, *stk_end, *stk;
+ alloc_base = *arg_alloc_base;
stk_base = *arg_stk_base;
stk_end = *arg_stk_end;
stk = *arg_stk;
n = stk_end - stk_base;
- if (stk_base == stk_alloc && IS_NULL(msa->stack_p)) {
- x = (OnigStackType* )xmalloc(sizeof(OnigStackType) * n * 2);
- if (IS_NULL(x)) {
+ n *= 2; /* from here on n is the NEW entry count */
+ size = sizeof(OnigStackIndex) * msa->ptr_num + sizeof(OnigStackType) * n; /* NEW byte size: index area + stack */
+ if (is_alloca != 0) {
+ new_alloc_base = (char* )xmalloc(size);
+ if (IS_NULL(new_alloc_base)) {
STACK_SAVE;
return ONIGERR_MEMORY;
}
- xmemcpy(x, stk_base, n * sizeof(OnigStackType));
- n *= 2;
+ xmemcpy(new_alloc_base, alloc_base, (size_t )((char* )stk_end - alloc_base)); /* copy only the OLD extent; `size` is twice the source buffer */
}
else {
- n *= 2;
if (MatchStackLimitSize != 0 && n > MatchStackLimitSize) {
if ((unsigned int )(stk_end - stk_base) == MatchStackLimitSize)
return ONIGERR_MATCH_STACK_LIMIT_OVER;
else
n = MatchStackLimitSize;
}
- x = (OnigStackType* )xrealloc(stk_base, sizeof(OnigStackType) * n);
- if (IS_NULL(x)) {
+ new_alloc_base = (char* )xrealloc(alloc_base, size);
+ if (IS_NULL(new_alloc_base)) {
STACK_SAVE;
return ONIGERR_MEMORY;
}
}
- *arg_stk = x + (stk - stk_base);
- *arg_stk_base = x;
- *arg_stk_end = x + n;
+
+ alloc_base = new_alloc_base;
+ used = stk - stk_base; /* entries in use, measured against the old base */
+ *arg_alloc_base = alloc_base;
+ *arg_stk_base = (OnigStackType* )(alloc_base
+ + (sizeof(OnigStackIndex) * msa->ptr_num));
+ *arg_stk = *arg_stk_base + used;
+ *arg_stk_end = *arg_stk_base + n;
return 0;
}
#define STACK_ENSURE(n) do {\
if (stk_end - stk < (n)) {\
- int r = stack_double(&stk_base, &stk_end, &stk, stk_alloc, msa);\
+ int r = stack_double(is_alloca, &alloc_base, &stk_base, &stk_end, &stk,\
+ msa);\
if (r != 0) { STACK_SAVE; return r; } \
+ is_alloca = 0;\
+ UPDATE_FOR_STACK_REALLOC;\
}\
} while(0)
@@ -1108,33 +1152,33 @@ static int backref_match_at_nested_level(regex_t* reg
}
else if (level == nest) {
if (k->type == STK_MEM_START) {
- if (mem_is_in_memp(k->u.mem.num, mem_num, memp)) {
- pstart = k->u.mem.pstr;
- if (pend != NULL_UCHARP) {
- if (pend - pstart > send - *s) return 0; /* or goto next_mem; */
- p = pstart;
- ss = *s;
-
- if (ignore_case != 0) {
- if (string_cmp_ic(reg->enc, case_fold_flag,
- pstart, &ss, (int )(pend - pstart)) == 0)
- return 0; /* or goto next_mem; */
- }
- else {
- while (p < pend) {
- if (*p++ != *ss++) return 0; /* or goto next_mem; */
- }
- }
-
- *s = ss;
- return 1;
- }
- }
+ if (mem_is_in_memp(k->u.mem.num, mem_num, memp)) {
+ pstart = k->u.mem.pstr;
+ if (pend != NULL_UCHARP) {
+ if (pend - pstart > send - *s) return 0; /* or goto next_mem; */
+ p = pstart;
+ ss = *s;
+
+ if (ignore_case != 0) {
+ if (string_cmp_ic(reg->enc, case_fold_flag,
+ pstart, &ss, (int )(pend - pstart)) == 0)
+ return 0; /* or goto next_mem; */
+ }
+ else {
+ while (p < pend) {
+ if (*p++ != *ss++) return 0; /* or goto next_mem; */
+ }
+ }
+
+ *s = ss;
+ return 1;
+ }
+ }
}
else if (k->type == STK_MEM_END) {
- if (mem_is_in_memp(k->u.mem.num, mem_num, memp)) {
- pend = k->u.mem.pstr;
- }
+ if (mem_is_in_memp(k->u.mem.num, mem_num, memp)) {
+ pend = k->u.mem.pstr;
+ }
}
}
k--;
@@ -1247,13 +1291,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
LengthType tlen, tlen2;
MemNumType mem;
RelAddrType addr;
- OnigOptionType option = reg->options;
- OnigEncoding encode = reg->enc;
- OnigCaseFoldType case_fold_flag = reg->case_fold_flag;
UChar *s, *q, *sbegin;
- UChar *p = reg->p;
- char *alloca_base;
- OnigStackType *stk_alloc, *stk_base, *stk, *stk_end;
+ int is_alloca;
+ char *alloc_base;
+ OnigStackType *stk_base, *stk, *stk_end;
OnigStackType *stkp; /* used as any purpose. */
OnigStackIndex si;
OnigStackIndex *repeat_stk;
@@ -1263,19 +1304,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
unsigned char* state_check_buff = msa->state_check_buff;
int num_comb_exp_check = reg->num_comb_exp_check;
#endif
- n = reg->num_repeat + reg->num_mem * 2;
+ UChar *p = reg->p;
+ OnigOptionType option = reg->options;
+ OnigEncoding encode = reg->enc;
+ OnigCaseFoldType case_fold_flag = reg->case_fold_flag;
- STACK_INIT(alloca_base, n, INIT_MATCH_STACK_SIZE);
+ //n = reg->num_repeat + reg->num_mem * 2;
pop_level = reg->stack_pop_level;
num_mem = reg->num_mem;
- repeat_stk = (OnigStackIndex* )alloca_base;
-
- mem_start_stk = (OnigStackIndex* )(repeat_stk + reg->num_repeat);
- mem_end_stk = mem_start_stk + num_mem;
- mem_start_stk--; /* for index start from 1,
- mem_start_stk[1]..mem_start_stk[num_mem] */
- mem_end_stk--; /* for index start from 1,
- mem_end_stk[1]..mem_end_stk[num_mem] */
+ STACK_INIT(INIT_MATCH_STACK_SIZE);
+ UPDATE_FOR_STACK_REALLOC;
for (i = 1; i <= num_mem; i++) {
mem_start_stk[i] = mem_end_stk[i] = INVALID_STACK_INDEX;
}
@@ -1316,64 +1354,64 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_END: MOP_IN(OP_END);
n = s - sstart;
if (n > best_len) {
- OnigRegion* region;
+ OnigRegion* region;
#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE
- if (IS_FIND_LONGEST(option)) {
- if (n > msa->best_len) {
- msa->best_len = n;
- msa->best_s = (UChar* )sstart;
- }
- else
- goto end_best_len;
+ if (IS_FIND_LONGEST(option)) {
+ if (n > msa->best_len) {
+ msa->best_len = n;
+ msa->best_s = (UChar* )sstart;
+ }
+ else
+ goto end_best_len;
}
#endif
- best_len = n;
- region = msa->region;
- if (region) {
+ best_len = n;
+ region = msa->region;
+ if (region) {
#ifdef USE_POSIX_API_REGION_OPTION
- if (IS_POSIX_REGION(msa->options)) {
- posix_regmatch_t* rmt = (posix_regmatch_t* )region;
-
- rmt[0].rm_so = sstart - str;
- rmt[0].rm_eo = s - str;
- for (i = 1; i <= num_mem; i++) {
- if (mem_end_stk[i] != INVALID_STACK_INDEX) {
- if (BIT_STATUS_AT(reg->bt_mem_start, i))
- rmt[i].rm_so = STACK_AT(mem_start_stk[i])->u.mem.pstr - str;
- else
- rmt[i].rm_so = (UChar* )((void* )(mem_start_stk[i])) - str;
-
- rmt[i].rm_eo = (BIT_STATUS_AT(reg->bt_mem_end, i)
- ? STACK_AT(mem_end_stk[i])->u.mem.pstr
- : (UChar* )((void* )mem_end_stk[i])) - str;
- }
- else {
- rmt[i].rm_so = rmt[i].rm_eo = ONIG_REGION_NOTPOS;
- }
- }
- }
- else {
+ if (IS_POSIX_REGION(msa->options)) {
+ posix_regmatch_t* rmt = (posix_regmatch_t* )region;
+
+ rmt[0].rm_so = sstart - str;
+ rmt[0].rm_eo = s - str;
+ for (i = 1; i <= num_mem; i++) {
+ if (mem_end_stk[i] != INVALID_STACK_INDEX) {
+ if (BIT_STATUS_AT(reg->bt_mem_start, i))
+ rmt[i].rm_so = STACK_AT(mem_start_stk[i])->u.mem.pstr - str;
+ else
+ rmt[i].rm_so = (UChar* )((void* )(mem_start_stk[i])) - str;
+
+ rmt[i].rm_eo = (BIT_STATUS_AT(reg->bt_mem_end, i)
+ ? STACK_AT(mem_end_stk[i])->u.mem.pstr
+ : (UChar* )((void* )mem_end_stk[i])) - str;
+ }
+ else {
+ rmt[i].rm_so = rmt[i].rm_eo = ONIG_REGION_NOTPOS;
+ }
+ }
+ }
+ else {
#endif /* USE_POSIX_API_REGION_OPTION */
- region->beg[0] = sstart - str;
- region->end[0] = s - str;
- for (i = 1; i <= num_mem; i++) {
- if (mem_end_stk[i] != INVALID_STACK_INDEX) {
- if (BIT_STATUS_AT(reg->bt_mem_start, i))
- region->beg[i] = STACK_AT(mem_start_stk[i])->u.mem.pstr - str;
- else
- region->beg[i] = (UChar* )((void* )mem_start_stk[i]) - str;
-
- region->end[i] = (BIT_STATUS_AT(reg->bt_mem_end, i)
- ? STACK_AT(mem_end_stk[i])->u.mem.pstr
- : (UChar* )((void* )mem_end_stk[i])) - str;
- }
- else {
- region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS;
- }
- }
+ region->beg[0] = sstart - str;
+ region->end[0] = s - str;
+ for (i = 1; i <= num_mem; i++) {
+ if (mem_end_stk[i] != INVALID_STACK_INDEX) {
+ if (BIT_STATUS_AT(reg->bt_mem_start, i))
+ region->beg[i] = STACK_AT(mem_start_stk[i])->u.mem.pstr - str;
+ else
+ region->beg[i] = (UChar* )((void* )mem_start_stk[i]) - str;
+
+ region->end[i] = (BIT_STATUS_AT(reg->bt_mem_end, i)
+ ? STACK_AT(mem_end_stk[i])->u.mem.pstr
+ : (UChar* )((void* )mem_end_stk[i])) - str;
+ }
+ else {
+ region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS;
+ }
+ }
#ifdef USE_CAPTURE_HISTORY
- if (reg->capture_history != 0) {
+ if (reg->capture_history != 0) {
int r;
OnigCaptureTreeNode* node;
@@ -1397,12 +1435,12 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
best_len = r; /* error code */
goto finish;
}
- }
+ }
#endif /* USE_CAPTURE_HISTORY */
#ifdef USE_POSIX_API_REGION_OPTION
- } /* else IS_POSIX_REGION() */
+ } /* else IS_POSIX_REGION() */
#endif
- } /* if (region) */
+ } /* if (region) */
} /* n > best_len */
#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE
@@ -1411,13 +1449,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
MOP_OUT;
if (IS_FIND_CONDITION(option)) {
- if (IS_FIND_NOT_EMPTY(option) && s == sstart) {
- best_len = ONIG_MISMATCH;
- goto fail; /* for retry */
- }
- if (IS_FIND_LONGEST(option) && DATA_ENSURE_CHECK1) {
- goto fail; /* for retry */
- }
+ if (IS_FIND_NOT_EMPTY(option) && s == sstart) {
+ best_len = ONIG_MISMATCH;
+ goto fail; /* for retry */
+ }
+ if (IS_FIND_LONGEST(option) && DATA_ENSURE_CHECK1) {
+ goto fail; /* for retry */
+ }
}
/* default behavior: return first-matching result. */
@@ -1438,22 +1476,22 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_EXACT1_IC: MOP_IN(OP_EXACT1_IC);
{
- int len;
- UChar *q, lowbuf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
-
- DATA_ENSURE(1);
- len = ONIGENC_MBC_CASE_FOLD(encode,
- /* DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag), */
- case_fold_flag,
- &s, end, lowbuf);
- DATA_ENSURE(0);
- q = lowbuf;
- while (len-- > 0) {
- if (*p != *q) {
+ int len;
+ UChar *q, lowbuf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
+
+ DATA_ENSURE(1);
+ len = ONIGENC_MBC_CASE_FOLD(encode,
+ /* DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag), */
+ case_fold_flag,
+ &s, end, lowbuf);
+ DATA_ENSURE(0);
+ q = lowbuf;
+ while (len-- > 0) {
+ if (*p != *q) {
goto fail;
}
- p++; q++;
- }
+ p++; q++;
+ }
}
MOP_OUT;
break;
@@ -1518,7 +1556,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
GET_LENGTH_INC(tlen, p);
DATA_ENSURE(tlen);
while (tlen-- > 0) {
- if (*p++ != *s++) goto fail;
+ if (*p++ != *s++) goto fail;
}
sprev = s - 1;
MOP_OUT;
@@ -1527,26 +1565,26 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_EXACTN_IC: MOP_IN(OP_EXACTN_IC);
{
- int len;
- UChar *q, *endp, lowbuf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
+ int len;
+ UChar *q, *endp, lowbuf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
- GET_LENGTH_INC(tlen, p);
- endp = p + tlen;
+ GET_LENGTH_INC(tlen, p);
+ endp = p + tlen;
- while (p < endp) {
- sprev = s;
- DATA_ENSURE(1);
- len = ONIGENC_MBC_CASE_FOLD(encode,
- /* DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag), */
- case_fold_flag,
- &s, end, lowbuf);
- DATA_ENSURE(0);
- q = lowbuf;
- while (len-- > 0) {
- if (*p != *q) goto fail;
- p++; q++;
- }
- }
+ while (p < endp) {
+ sprev = s;
+ DATA_ENSURE(1);
+ len = ONIGENC_MBC_CASE_FOLD(encode,
+ /* DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag), */
+ case_fold_flag,
+ &s, end, lowbuf);
+ DATA_ENSURE(0);
+ q = lowbuf;
+ while (len-- > 0) {
+ if (*p != *q) goto fail;
+ p++; q++;
+ }
+ }
}
MOP_OUT;
@@ -1600,10 +1638,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
GET_LENGTH_INC(tlen, p);
DATA_ENSURE(tlen * 2);
while (tlen-- > 0) {
- if (*p != *s) goto fail;
- p++; s++;
- if (*p != *s) goto fail;
- p++; s++;
+ if (*p != *s) goto fail;
+ p++; s++;
+ if (*p != *s) goto fail;
+ p++; s++;
}
sprev = s - 2;
MOP_OUT;
@@ -1614,12 +1652,12 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
GET_LENGTH_INC(tlen, p);
DATA_ENSURE(tlen * 3);
while (tlen-- > 0) {
- if (*p != *s) goto fail;
- p++; s++;
- if (*p != *s) goto fail;
- p++; s++;
- if (*p != *s) goto fail;
- p++; s++;
+ if (*p != *s) goto fail;
+ p++; s++;
+ if (*p != *s) goto fail;
+ p++; s++;
+ if (*p != *s) goto fail;
+ p++; s++;
}
sprev = s - 3;
MOP_OUT;
@@ -1632,8 +1670,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
tlen2 *= tlen;
DATA_ENSURE(tlen2);
while (tlen2-- > 0) {
- if (*p != *s) goto fail;
- p++; s++;
+ if (*p != *s) goto fail;
+ p++; s++;
}
sprev = s - tlen;
MOP_OUT;
@@ -1654,23 +1692,23 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
cclass_mb:
GET_LENGTH_INC(tlen, p);
{
- OnigCodePoint code;
- UChar *ss;
- int mb_len;
+ OnigCodePoint code;
+ UChar *ss;
+ int mb_len;
- DATA_ENSURE(1);
- mb_len = enclen(encode, s);
- DATA_ENSURE(mb_len);
- ss = s;
- s += mb_len;
- code = ONIGENC_MBC_TO_CODE(encode, ss, s);
+ DATA_ENSURE(1);
+ mb_len = enclen(encode, s);
+ DATA_ENSURE(mb_len);
+ ss = s;
+ s += mb_len;
+ code = ONIGENC_MBC_TO_CODE(encode, ss, s);
#ifdef PLATFORM_UNALIGNED_WORD_ACCESS
- if (! onig_is_in_code_range(p, code)) goto fail;
+ if (! onig_is_in_code_range(p, code)) goto fail;
#else
- q = p;
- ALIGNMENT_RIGHT(q);
- if (! onig_is_in_code_range(q, code)) goto fail;
+ q = p;
+ ALIGNMENT_RIGHT(q);
+ if (! onig_is_in_code_range(q, code)) goto fail;
#endif
}
p += tlen;
@@ -1680,17 +1718,17 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_CCLASS_MIX: MOP_IN(OP_CCLASS_MIX);
DATA_ENSURE(1);
if (ONIGENC_IS_MBC_HEAD(encode, s)) {
- p += SIZE_BITSET;
- goto cclass_mb;
+ p += SIZE_BITSET;
+ goto cclass_mb;
}
else {
- if (BITSET_AT(((BitSetRef )p), *s) == 0)
- goto fail;
+ if (BITSET_AT(((BitSetRef )p), *s) == 0)
+ goto fail;
- p += SIZE_BITSET;
- GET_LENGTH_INC(tlen, p);
- p += tlen;
- s++;
+ p += SIZE_BITSET;
+ GET_LENGTH_INC(tlen, p);
+ p += tlen;
+ s++;
}
MOP_OUT;
break;
@@ -1706,36 +1744,36 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_CCLASS_MB_NOT: MOP_IN(OP_CCLASS_MB_NOT);
DATA_ENSURE(1);
if (! ONIGENC_IS_MBC_HEAD(encode, s)) {
- s++;
- GET_LENGTH_INC(tlen, p);
- p += tlen;
- goto cc_mb_not_success;
+ s++;
+ GET_LENGTH_INC(tlen, p);
+ p += tlen;
+ goto cc_mb_not_success;
}
cclass_mb_not:
GET_LENGTH_INC(tlen, p);
{
- OnigCodePoint code;
- UChar *ss;
- int mb_len = enclen(encode, s);
+ OnigCodePoint code;
+ UChar *ss;
+ int mb_len = enclen(encode, s);
- if (! DATA_ENSURE_CHECK(mb_len)) {
+ if (! DATA_ENSURE_CHECK(mb_len)) {
DATA_ENSURE(1);
- s = (UChar* )end;
- p += tlen;
- goto cc_mb_not_success;
- }
+ s = (UChar* )end;
+ p += tlen;
+ goto cc_mb_not_success;
+ }
- ss = s;
- s += mb_len;
- code = ONIGENC_MBC_TO_CODE(encode, ss, s);
+ ss = s;
+ s += mb_len;
+ code = ONIGENC_MBC_TO_CODE(encode, ss, s);
#ifdef PLATFORM_UNALIGNED_WORD_ACCESS
- if (onig_is_in_code_range(p, code)) goto fail;
+ if (onig_is_in_code_range(p, code)) goto fail;
#else
- q = p;
- ALIGNMENT_RIGHT(q);
- if (onig_is_in_code_range(q, code)) goto fail;
+ q = p;
+ ALIGNMENT_RIGHT(q);
+ if (onig_is_in_code_range(q, code)) goto fail;
#endif
}
p += tlen;
@@ -1747,36 +1785,36 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_CCLASS_MIX_NOT: MOP_IN(OP_CCLASS_MIX_NOT);
DATA_ENSURE(1);
if (ONIGENC_IS_MBC_HEAD(encode, s)) {
- p += SIZE_BITSET;
- goto cclass_mb_not;
+ p += SIZE_BITSET;
+ goto cclass_mb_not;
}
else {
- if (BITSET_AT(((BitSetRef )p), *s) != 0)
- goto fail;
+ if (BITSET_AT(((BitSetRef )p), *s) != 0)
+ goto fail;
- p += SIZE_BITSET;
- GET_LENGTH_INC(tlen, p);
- p += tlen;
- s++;
+ p += SIZE_BITSET;
+ GET_LENGTH_INC(tlen, p);
+ p += tlen;
+ s++;
}
MOP_OUT;
break;
case OP_CCLASS_NODE: MOP_IN(OP_CCLASS_NODE);
{
- OnigCodePoint code;
+ OnigCodePoint code;
void *node;
int mb_len;
UChar *ss;
DATA_ENSURE(1);
GET_POINTER_INC(node, p);
- mb_len = enclen(encode, s);
- ss = s;
- s += mb_len;
- DATA_ENSURE(0);
- code = ONIGENC_MBC_TO_CODE(encode, ss, s);
- if (onig_is_code_in_cc_len(mb_len, code, node) == 0) goto fail;
+ mb_len = enclen(encode, s);
+ ss = s;
+ s += mb_len;
+ DATA_ENSURE(0);
+ code = ONIGENC_MBC_TO_CODE(encode, ss, s);
+ if (onig_is_code_in_cc_len(mb_len, code, node) == 0) goto fail;
}
MOP_OUT;
break;
@@ -1800,8 +1838,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_ANYCHAR_STAR: MOP_IN(OP_ANYCHAR_STAR);
while (DATA_ENSURE_CHECK1) {
- STACK_PUSH_ALT(p, s, sprev);
- n = enclen(encode, s);
+ STACK_PUSH_ALT(p, s, sprev);
+ n = enclen(encode, s);
DATA_ENSURE(n);
if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail;
sprev = s;
@@ -1812,27 +1850,27 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_ANYCHAR_ML_STAR: MOP_IN(OP_ANYCHAR_ML_STAR);
while (DATA_ENSURE_CHECK1) {
- STACK_PUSH_ALT(p, s, sprev);
- n = enclen(encode, s);
- if (n > 1) {
- DATA_ENSURE(n);
- sprev = s;
- s += n;
- }
- else {
- sprev = s;
- s++;
- }
+ STACK_PUSH_ALT(p, s, sprev);
+ n = enclen(encode, s);
+ if (n > 1) {
+ DATA_ENSURE(n);
+ sprev = s;
+ s += n;
+ }
+ else {
+ sprev = s;
+ s++;
+ }
}
MOP_OUT;
break;
case OP_ANYCHAR_STAR_PEEK_NEXT: MOP_IN(OP_ANYCHAR_STAR_PEEK_NEXT);
while (DATA_ENSURE_CHECK1) {
- if (*p == *s) {
- STACK_PUSH_ALT(p + 1, s, sprev);
- }
- n = enclen(encode, s);
+ if (*p == *s) {
+ STACK_PUSH_ALT(p + 1, s, sprev);
+ }
+ n = enclen(encode, s);
DATA_ENSURE(n);
if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail;
sprev = s;
@@ -1844,19 +1882,19 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_ANYCHAR_ML_STAR_PEEK_NEXT:MOP_IN(OP_ANYCHAR_ML_STAR_PEEK_NEXT);
while (DATA_ENSURE_CHECK1) {
- if (*p == *s) {
- STACK_PUSH_ALT(p + 1, s, sprev);
- }
- n = enclen(encode, s);
- if (n > 1) {
- DATA_ENSURE(n);
- sprev = s;
- s += n;
- }
- else {
- sprev = s;
- s++;
- }
+ if (*p == *s) {
+ STACK_PUSH_ALT(p + 1, s, sprev);
+ }
+ n = enclen(encode, s);
+ if (n > 1) {
+ DATA_ENSURE(n);
+ sprev = s;
+ s += n;
+ }
+ else {
+ sprev = s;
+ s++;
+ }
}
p++;
MOP_OUT;
@@ -1866,11 +1904,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_STATE_CHECK_ANYCHAR_STAR: MOP_IN(OP_STATE_CHECK_ANYCHAR_STAR);
GET_STATE_CHECK_NUM_INC(mem, p);
while (DATA_ENSURE_CHECK1) {
- STATE_CHECK_VAL(scv, mem);
- if (scv) goto fail;
+ STATE_CHECK_VAL(scv, mem);
+ if (scv) goto fail;
- STACK_PUSH_ALT_WITH_STATE_CHECK(p, s, sprev, mem);
- n = enclen(encode, s);
+ STACK_PUSH_ALT_WITH_STATE_CHECK(p, s, sprev, mem);
+ n = enclen(encode, s);
DATA_ENSURE(n);
if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail;
sprev = s;
@@ -1884,20 +1922,20 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
GET_STATE_CHECK_NUM_INC(mem, p);
while (DATA_ENSURE_CHECK1) {
- STATE_CHECK_VAL(scv, mem);
- if (scv) goto fail;
-
- STACK_PUSH_ALT_WITH_STATE_CHECK(p, s, sprev, mem);
- n = enclen(encode, s);
- if (n > 1) {
- DATA_ENSURE(n);
- sprev = s;
- s += n;
- }
- else {
- sprev = s;
- s++;
- }
+ STATE_CHECK_VAL(scv, mem);
+ if (scv) goto fail;
+
+ STACK_PUSH_ALT_WITH_STATE_CHECK(p, s, sprev, mem);
+ n = enclen(encode, s);
+ if (n > 1) {
+ DATA_ENSURE(n);
+ sprev = s;
+ s += n;
+ }
+ else {
+ sprev = s;
+ s++;
+ }
}
MOP_OUT;
break;
@@ -1906,7 +1944,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_WORD: MOP_IN(OP_WORD);
DATA_ENSURE(1);
if (! ONIGENC_IS_MBC_WORD(encode, s, end))
- goto fail;
+ goto fail;
s += enclen(encode, s);
MOP_OUT;
@@ -1915,7 +1953,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_NOT_WORD: MOP_IN(OP_NOT_WORD);
DATA_ENSURE(1);
if (ONIGENC_IS_MBC_WORD(encode, s, end))
- goto fail;
+ goto fail;
s += enclen(encode, s);
MOP_OUT;
@@ -1923,18 +1961,18 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_WORD_BOUND: MOP_IN(OP_WORD_BOUND);
if (ON_STR_BEGIN(s)) {
- DATA_ENSURE(1);
- if (! ONIGENC_IS_MBC_WORD(encode, s, end))
- goto fail;
+ DATA_ENSURE(1);
+ if (! ONIGENC_IS_MBC_WORD(encode, s, end))
+ goto fail;
}
else if (ON_STR_END(s)) {
- if (! ONIGENC_IS_MBC_WORD(encode, sprev, end))
- goto fail;
+ if (! ONIGENC_IS_MBC_WORD(encode, sprev, end))
+ goto fail;
}
else {
- if (ONIGENC_IS_MBC_WORD(encode, s, end)
- == ONIGENC_IS_MBC_WORD(encode, sprev, end))
- goto fail;
+ if (ONIGENC_IS_MBC_WORD(encode, s, end)
+ == ONIGENC_IS_MBC_WORD(encode, sprev, end))
+ goto fail;
}
MOP_OUT;
continue;
@@ -1942,17 +1980,17 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_NOT_WORD_BOUND: MOP_IN(OP_NOT_WORD_BOUND);
if (ON_STR_BEGIN(s)) {
- if (DATA_ENSURE_CHECK1 && ONIGENC_IS_MBC_WORD(encode, s, end))
- goto fail;
+ if (DATA_ENSURE_CHECK1 && ONIGENC_IS_MBC_WORD(encode, s, end))
+ goto fail;
}
else if (ON_STR_END(s)) {
- if (ONIGENC_IS_MBC_WORD(encode, sprev, end))
- goto fail;
+ if (ONIGENC_IS_MBC_WORD(encode, sprev, end))
+ goto fail;
}
else {
- if (ONIGENC_IS_MBC_WORD(encode, s, end)
- != ONIGENC_IS_MBC_WORD(encode, sprev, end))
- goto fail;
+ if (ONIGENC_IS_MBC_WORD(encode, s, end)
+ != ONIGENC_IS_MBC_WORD(encode, sprev, end))
+ goto fail;
}
MOP_OUT;
continue;
@@ -1961,20 +1999,20 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
#ifdef USE_WORD_BEGIN_END
case OP_WORD_BEGIN: MOP_IN(OP_WORD_BEGIN);
if (DATA_ENSURE_CHECK1 && ONIGENC_IS_MBC_WORD(encode, s, end)) {
- if (ON_STR_BEGIN(s) || !ONIGENC_IS_MBC_WORD(encode, sprev, end)) {
- MOP_OUT;
- continue;
- }
+ if (ON_STR_BEGIN(s) || !ONIGENC_IS_MBC_WORD(encode, sprev, end)) {
+ MOP_OUT;
+ continue;
+ }
}
goto fail;
break;
case OP_WORD_END: MOP_IN(OP_WORD_END);
if (!ON_STR_BEGIN(s) && ONIGENC_IS_MBC_WORD(encode, sprev, end)) {
- if (ON_STR_END(s) || !ONIGENC_IS_MBC_WORD(encode, s, end)) {
- MOP_OUT;
- continue;
- }
+ if (ON_STR_END(s) || !ONIGENC_IS_MBC_WORD(encode, s, end)) {
+ MOP_OUT;
+ continue;
+ }
}
goto fail;
break;
@@ -1996,13 +2034,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_BEGIN_LINE: MOP_IN(OP_BEGIN_LINE);
if (ON_STR_BEGIN(s)) {
- if (IS_NOTBOL(msa->options)) goto fail;
- MOP_OUT;
- continue;
+ if (IS_NOTBOL(msa->options)) goto fail;
+ MOP_OUT;
+ continue;
}
else if (ONIGENC_IS_MBC_NEWLINE(encode, sprev, end) && !ON_STR_END(s)) {
- MOP_OUT;
- continue;
+ MOP_OUT;
+ continue;
}
goto fail;
break;
@@ -2010,23 +2048,23 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_END_LINE: MOP_IN(OP_END_LINE);
if (ON_STR_END(s)) {
#ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE
- if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) {
+ if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) {
#endif
- if (IS_NOTEOL(msa->options)) goto fail;
- MOP_OUT;
- continue;
+ if (IS_NOTEOL(msa->options)) goto fail;
+ MOP_OUT;
+ continue;
#ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE
- }
+ }
#endif
}
else if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) {
- MOP_OUT;
- continue;
+ MOP_OUT;
+ continue;
}
#ifdef USE_CRNL_AS_LINE_TERMINATOR
else if (ONIGENC_IS_MBC_CRNL(encode, s, end)) {
- MOP_OUT;
- continue;
+ MOP_OUT;
+ continue;
}
#endif
goto fail;
@@ -2035,24 +2073,24 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_SEMI_END_BUF: MOP_IN(OP_SEMI_END_BUF);
if (ON_STR_END(s)) {
#ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE
- if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) {
+ if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) {
#endif
- if (IS_NOTEOL(msa->options)) goto fail;
- MOP_OUT;
- continue;
+ if (IS_NOTEOL(msa->options)) goto fail;
+ MOP_OUT;
+ continue;
#ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE
- }
+ }
#endif
}
else if (ONIGENC_IS_MBC_NEWLINE(encode, s, end) &&
- ON_STR_END(s + enclen(encode, s))) {
- MOP_OUT;
- continue;
+ ON_STR_END(s + enclen(encode, s))) {
+ MOP_OUT;
+ continue;
}
#ifdef USE_CRNL_AS_LINE_TERMINATOR
else if (ONIGENC_IS_MBC_CRNL(encode, s, end)) {
UChar* ss = s + enclen(encode, s);
- ss += enclen(encode, ss);
+ ss += enclen(encode, ss);
if (ON_STR_END(ss)) {
MOP_OUT;
continue;
@@ -2064,7 +2102,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_BEGIN_POSITION: MOP_IN(OP_BEGIN_POSITION);
if (s != msa->start)
- goto fail;
+ goto fail;
MOP_OUT;
continue;
@@ -2114,9 +2152,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
STACK_GET_MEM_START(mem, stkp);
if (BIT_STATUS_AT(reg->bt_mem_start, mem))
- mem_start_stk[mem] = GET_STACK_INDEX(stkp);
+ mem_start_stk[mem] = GET_STACK_INDEX(stkp);
else
- mem_start_stk[mem] = (OnigStackIndex )((void* )stkp->u.mem.pstr);
+ mem_start_stk[mem] = (OnigStackIndex )((void* )stkp->u.mem.pstr);
STACK_PUSH_MEM_END_MARK(mem);
MOP_OUT;
@@ -2138,171 +2176,170 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
GET_MEMNUM_INC(mem, p);
backref:
{
- int len;
- UChar *pstart, *pend;
-
- /* if you want to remove following line,
- you should check in parse and compile time. */
- if (mem > num_mem) goto fail;
- if (mem_end_stk[mem] == INVALID_STACK_INDEX) goto fail;
- if (mem_start_stk[mem] == INVALID_STACK_INDEX) goto fail;
-
- if (BIT_STATUS_AT(reg->bt_mem_start, mem))
- pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr;
- else
- pstart = (UChar* )((void* )mem_start_stk[mem]);
-
- pend = (BIT_STATUS_AT(reg->bt_mem_end, mem)
- ? STACK_AT(mem_end_stk[mem])->u.mem.pstr
- : (UChar* )((void* )mem_end_stk[mem]));
- n = pend - pstart;
- DATA_ENSURE(n);
- sprev = s;
- STRING_CMP(pstart, s, n);
- while (sprev + (len = enclen(encode, sprev)) < s)
- sprev += len;
+ int len;
+ UChar *pstart, *pend;
+
+ /* if you want to remove following line,
+ you should check in parse and compile time. */
+ if (mem > num_mem) goto fail;
+ if (mem_end_stk[mem] == INVALID_STACK_INDEX) goto fail;
+ if (mem_start_stk[mem] == INVALID_STACK_INDEX) goto fail;
+
+ if (BIT_STATUS_AT(reg->bt_mem_start, mem))
+ pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr;
+ else
+ pstart = (UChar* )((void* )mem_start_stk[mem]);
+
+ pend = (BIT_STATUS_AT(reg->bt_mem_end, mem)
+ ? STACK_AT(mem_end_stk[mem])->u.mem.pstr
+ : (UChar* )((void* )mem_end_stk[mem]));
+ n = pend - pstart;
+ DATA_ENSURE(n);
+ sprev = s;
+ STRING_CMP(pstart, s, n);
+ while (sprev + (len = enclen(encode, sprev)) < s)
+ sprev += len;
- MOP_OUT;
- continue;
+ MOP_OUT;
+ continue;
}
break;
case OP_BACKREFN_IC: MOP_IN(OP_BACKREFN_IC);
GET_MEMNUM_INC(mem, p);
{
- int len;
- UChar *pstart, *pend;
-
- /* if you want to remove following line,
- you should check in parse and compile time. */
- if (mem > num_mem) goto fail;
- if (mem_end_stk[mem] == INVALID_STACK_INDEX) goto fail;
- if (mem_start_stk[mem] == INVALID_STACK_INDEX) goto fail;
-
- if (BIT_STATUS_AT(reg->bt_mem_start, mem))
- pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr;
- else
- pstart = (UChar* )((void* )mem_start_stk[mem]);
-
- pend = (BIT_STATUS_AT(reg->bt_mem_end, mem)
- ? STACK_AT(mem_end_stk[mem])->u.mem.pstr
- : (UChar* )((void* )mem_end_stk[mem]));
- n = pend - pstart;
- DATA_ENSURE(n);
- sprev = s;
- STRING_CMP_IC(case_fold_flag, pstart, &s, n);
- while (sprev + (len = enclen(encode, sprev)) < s)
- sprev += len;
+ int len;
+ UChar *pstart, *pend;
+
+ /* if you want to remove following line,
+ you should check in parse and compile time. */
+ if (mem > num_mem) goto fail;
+ if (mem_end_stk[mem] == INVALID_STACK_INDEX) goto fail;
+ if (mem_start_stk[mem] == INVALID_STACK_INDEX) goto fail;
+
+ if (BIT_STATUS_AT(reg->bt_mem_start, mem))
+ pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr;
+ else
+ pstart = (UChar* )((void* )mem_start_stk[mem]);
+
+ pend = (BIT_STATUS_AT(reg->bt_mem_end, mem)
+ ? STACK_AT(mem_end_stk[mem])->u.mem.pstr
+ : (UChar* )((void* )mem_end_stk[mem]));
+ n = pend - pstart;
+ DATA_ENSURE(n);
+ sprev = s;
+ STRING_CMP_IC(case_fold_flag, pstart, &s, n);
+ while (sprev + (len = enclen(encode, sprev)) < s)
+ sprev += len;
- MOP_OUT;
- continue;
+ MOP_OUT;
+ continue;
}
break;
case OP_BACKREF_MULTI: MOP_IN(OP_BACKREF_MULTI);
{
- int len, is_fail;
- UChar *pstart, *pend, *swork;
-
- GET_LENGTH_INC(tlen, p);
- for (i = 0; i < tlen; i++) {
- GET_MEMNUM_INC(mem, p);
-
- if (mem_end_stk[mem] == INVALID_STACK_INDEX) continue;
- if (mem_start_stk[mem] == INVALID_STACK_INDEX) continue;
-
- if (BIT_STATUS_AT(reg->bt_mem_start, mem))
- pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr;
- else
- pstart = (UChar* )((void* )mem_start_stk[mem]);
-
- pend = (BIT_STATUS_AT(reg->bt_mem_end, mem)
- ? STACK_AT(mem_end_stk[mem])->u.mem.pstr
- : (UChar* )((void* )mem_end_stk[mem]));
- n = pend - pstart;
- DATA_ENSURE(n);
- sprev = s;
- swork = s;
- STRING_CMP_VALUE(pstart, swork, n, is_fail);
- if (is_fail) continue;
- s = swork;
- while (sprev + (len = enclen(encode, sprev)) < s)
- sprev += len;
-
- p += (SIZE_MEMNUM * (tlen - i - 1));
- break; /* success */
- }
- if (i == tlen) goto fail;
- MOP_OUT;
- continue;
+ int len, is_fail;
+ UChar *pstart, *pend, *swork;
+
+ GET_LENGTH_INC(tlen, p);
+ for (i = 0; i < tlen; i++) {
+ GET_MEMNUM_INC(mem, p);
+
+ if (mem_end_stk[mem] == INVALID_STACK_INDEX) continue;
+ if (mem_start_stk[mem] == INVALID_STACK_INDEX) continue;
+
+ if (BIT_STATUS_AT(reg->bt_mem_start, mem))
+ pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr;
+ else
+ pstart = (UChar* )((void* )mem_start_stk[mem]);
+
+ pend = (BIT_STATUS_AT(reg->bt_mem_end, mem)
+ ? STACK_AT(mem_end_stk[mem])->u.mem.pstr
+ : (UChar* )((void* )mem_end_stk[mem]));
+ n = pend - pstart;
+ DATA_ENSURE(n);
+ sprev = s;
+ swork = s;
+ STRING_CMP_VALUE(pstart, swork, n, is_fail);
+ if (is_fail) continue;
+ s = swork;
+ while (sprev + (len = enclen(encode, sprev)) < s)
+ sprev += len;
+
+ p += (SIZE_MEMNUM * (tlen - i - 1));
+ break; /* success */
+ }
+ if (i == tlen) goto fail;
+ MOP_OUT;
+ continue;
}
break;
case OP_BACKREF_MULTI_IC: MOP_IN(OP_BACKREF_MULTI_IC);
{
- int len, is_fail;
- UChar *pstart, *pend, *swork;
-
- GET_LENGTH_INC(tlen, p);
- for (i = 0; i < tlen; i++) {
- GET_MEMNUM_INC(mem, p);
-
- if (mem_end_stk[mem] == INVALID_STACK_INDEX) continue;
- if (mem_start_stk[mem] == INVALID_STACK_INDEX) continue;
-
- if (BIT_STATUS_AT(reg->bt_mem_start, mem))
- pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr;
- else
- pstart = (UChar* )((void* )mem_start_stk[mem]);
-
- pend = (BIT_STATUS_AT(reg->bt_mem_end, mem)
- ? STACK_AT(mem_end_stk[mem])->u.mem.pstr
- : (UChar* )((void* )mem_end_stk[mem]));
- n = pend - pstart;
- DATA_ENSURE(n);
- sprev = s;
- swork = s;
- STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail);
- if (is_fail) continue;
- s = swork;
- while (sprev + (len = enclen(encode, sprev)) < s)
- sprev += len;
-
- p += (SIZE_MEMNUM * (tlen - i - 1));
- break; /* success */
- }
- if (i == tlen) goto fail;
- MOP_OUT;
- continue;
+ int len, is_fail;
+ UChar *pstart, *pend, *swork;
+
+ GET_LENGTH_INC(tlen, p);
+ for (i = 0; i < tlen; i++) {
+ GET_MEMNUM_INC(mem, p);
+
+ if (mem_end_stk[mem] == INVALID_STACK_INDEX) continue;
+ if (mem_start_stk[mem] == INVALID_STACK_INDEX) continue;
+
+ if (BIT_STATUS_AT(reg->bt_mem_start, mem))
+ pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr;
+ else
+ pstart = (UChar* )((void* )mem_start_stk[mem]);
+
+ pend = (BIT_STATUS_AT(reg->bt_mem_end, mem)
+ ? STACK_AT(mem_end_stk[mem])->u.mem.pstr
+ : (UChar* )((void* )mem_end_stk[mem]));
+ n = pend - pstart;
+ DATA_ENSURE(n);
+ sprev = s;
+ swork = s;
+ STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail);
+ if (is_fail) continue;
+ s = swork;
+ while (sprev + (len = enclen(encode, sprev)) < s)
+ sprev += len;
+
+ p += (SIZE_MEMNUM * (tlen - i - 1));
+ break; /* success */
+ }
+ if (i == tlen) goto fail;
+ MOP_OUT;
+ continue;
}
break;
#ifdef USE_BACKREF_WITH_LEVEL
case OP_BACKREF_WITH_LEVEL:
{
- int len;
- OnigOptionType ic;
- LengthType level;
+ int len;
+ OnigOptionType ic;
+ LengthType level;
- GET_OPTION_INC(ic, p);
- GET_LENGTH_INC(level, p);
- GET_LENGTH_INC(tlen, p);
+ GET_OPTION_INC(ic, p);
+ GET_LENGTH_INC(level, p);
+ GET_LENGTH_INC(tlen, p);
- sprev = s;
- if (backref_match_at_nested_level(reg, stk, stk_base, ic
- , case_fold_flag, (int )level, (int )tlen, p, &s, end)) {
- while (sprev + (len = enclen(encode, sprev)) < s)
- sprev += len;
+ sprev = s;
+ if (backref_match_at_nested_level(reg, stk, stk_base, ic
+ , case_fold_flag, (int )level, (int )tlen, p, &s, end)) {
+ while (sprev + (len = enclen(encode, sprev)) < s)
+ sprev += len;
- p += (SIZE_MEMNUM * tlen);
- }
- else
- goto fail;
+ p += (SIZE_MEMNUM * tlen);
+ }
+ else
+ goto fail;
- MOP_OUT;
- continue;
+ MOP_OUT;
+ continue;
}
-
break;
#endif
@@ -2331,33 +2368,33 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_NULL_CHECK_END: MOP_IN(OP_NULL_CHECK_END);
{
- int isnull;
+ int isnull;
- GET_MEMNUM_INC(mem, p); /* mem: null check id */
- STACK_NULL_CHECK(isnull, mem, s);
- if (isnull) {
+ GET_MEMNUM_INC(mem, p); /* mem: null check id */
+ STACK_NULL_CHECK(isnull, mem, s);
+ if (isnull) {
#ifdef ONIG_DEBUG_MATCH
- fprintf(stderr, "NULL_CHECK_END: skip id:%d, s:%d\n",
- (int )mem, (int )s);
-#endif
- null_check_found:
- /* empty loop founded, skip next instruction */
- switch (*p++) {
- case OP_JUMP:
- case OP_PUSH:
- p += SIZE_RELADDR;
- break;
- case OP_REPEAT_INC:
- case OP_REPEAT_INC_NG:
- case OP_REPEAT_INC_SG:
- case OP_REPEAT_INC_NG_SG:
- p += SIZE_MEMNUM;
- break;
- default:
- goto unexpected_bytecode_error;
- break;
- }
- }
+ fprintf(stderr, "NULL_CHECK_END: skip id:%d, s:%d\n",
+ (int )mem, (int )s);
+#endif
+ null_check_found:
+ /* empty loop founded, skip next instruction */
+ switch (*p++) {
+ case OP_JUMP:
+ case OP_PUSH:
+ p += SIZE_RELADDR;
+ break;
+ case OP_REPEAT_INC:
+ case OP_REPEAT_INC_NG:
+ case OP_REPEAT_INC_SG:
+ case OP_REPEAT_INC_NG_SG:
+ p += SIZE_MEMNUM;
+ break;
+ default:
+ goto unexpected_bytecode_error;
+ break;
+ }
+ }
}
MOP_OUT;
continue;
@@ -2366,18 +2403,18 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
#ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
case OP_NULL_CHECK_END_MEMST: MOP_IN(OP_NULL_CHECK_END_MEMST);
{
- int isnull;
+ int isnull;
- GET_MEMNUM_INC(mem, p); /* mem: null check id */
- STACK_NULL_CHECK_MEMST(isnull, mem, s, reg);
- if (isnull) {
+ GET_MEMNUM_INC(mem, p); /* mem: null check id */
+ STACK_NULL_CHECK_MEMST(isnull, mem, s, reg);
+ if (isnull) {
#ifdef ONIG_DEBUG_MATCH
- fprintf(stderr, "NULL_CHECK_END_MEMST: skip id:%d, s:%d\n",
- (int )mem, (int )s);
+ fprintf(stderr, "NULL_CHECK_END_MEMST: skip id:%d, s:%d\n",
+ (int )mem, (int )s);
#endif
- if (isnull == -1) goto fail;
- goto null_check_found;
- }
+ if (isnull == -1) goto fail;
+ goto null_check_found;
+ }
}
MOP_OUT;
continue;
@@ -2388,25 +2425,25 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_NULL_CHECK_END_MEMST_PUSH:
MOP_IN(OP_NULL_CHECK_END_MEMST_PUSH);
{
- int isnull;
+ int isnull;
- GET_MEMNUM_INC(mem, p); /* mem: null check id */
+ GET_MEMNUM_INC(mem, p); /* mem: null check id */
#ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
- STACK_NULL_CHECK_MEMST_REC(isnull, mem, s, reg);
+ STACK_NULL_CHECK_MEMST_REC(isnull, mem, s, reg);
#else
- STACK_NULL_CHECK_REC(isnull, mem, s);
+ STACK_NULL_CHECK_REC(isnull, mem, s);
#endif
- if (isnull) {
+ if (isnull) {
#ifdef ONIG_DEBUG_MATCH
- fprintf(stderr, "NULL_CHECK_END_MEMST_PUSH: skip id:%d, s:%d\n",
- (int )mem, (int )s);
+ fprintf(stderr, "NULL_CHECK_END_MEMST_PUSH: skip id:%d, s:%d\n",
+ (int )mem, (int )s);
#endif
- if (isnull == -1) goto fail;
- goto null_check_found;
- }
- else {
- STACK_PUSH_NULL_CHECK_END(mem);
- }
+ if (isnull == -1) goto fail;
+ goto null_check_found;
+ }
+ else {
+ STACK_PUSH_NULL_CHECK_END(mem);
+ }
}
MOP_OUT;
continue;
@@ -2445,10 +2482,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
GET_RELADDR_INC(addr, p);
STATE_CHECK_VAL(scv, mem);
if (scv) {
- p += addr;
+ p += addr;
}
else {
- STACK_PUSH_ALT_WITH_STATE_CHECK(p + addr, s, sprev, mem);
+ STACK_PUSH_ALT_WITH_STATE_CHECK(p + addr, s, sprev, mem);
}
MOP_OUT;
continue;
@@ -2474,10 +2511,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_PUSH_OR_JUMP_EXACT1: MOP_IN(OP_PUSH_OR_JUMP_EXACT1);
GET_RELADDR_INC(addr, p);
if (*p == *s && DATA_ENSURE_CHECK1) {
- p++;
- STACK_PUSH_ALT(p + addr, s, sprev);
- MOP_OUT;
- continue;
+ p++;
+ STACK_PUSH_ALT(p + addr, s, sprev);
+ MOP_OUT;
+ continue;
}
p += (addr + 1);
MOP_OUT;
@@ -2487,10 +2524,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_PUSH_IF_PEEK_NEXT: MOP_IN(OP_PUSH_IF_PEEK_NEXT);
GET_RELADDR_INC(addr, p);
if (*p == *s) {
- p++;
- STACK_PUSH_ALT(p + addr, s, sprev);
- MOP_OUT;
- continue;
+ p++;
+ STACK_PUSH_ALT(p + addr, s, sprev);
+ MOP_OUT;
+ continue;
}
p++;
MOP_OUT;
@@ -2499,16 +2536,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_REPEAT: MOP_IN(OP_REPEAT);
{
- GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */
- GET_RELADDR_INC(addr, p);
+ GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */
+ GET_RELADDR_INC(addr, p);
- STACK_ENSURE(1);
- repeat_stk[mem] = GET_STACK_INDEX(stk);
- STACK_PUSH_REPEAT(mem, p);
+ STACK_ENSURE(1);
+ repeat_stk[mem] = GET_STACK_INDEX(stk);
+ STACK_PUSH_REPEAT(mem, p);
- if (reg->repeat_range[mem].lower == 0) {
- STACK_PUSH_ALT(p + addr, s, sprev);
- }
+ if (reg->repeat_range[mem].lower == 0) {
+ STACK_PUSH_ALT(p + addr, s, sprev);
+ }
}
MOP_OUT;
continue;
@@ -2516,17 +2553,17 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_REPEAT_NG: MOP_IN(OP_REPEAT_NG);
{
- GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */
- GET_RELADDR_INC(addr, p);
+ GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */
+ GET_RELADDR_INC(addr, p);
- STACK_ENSURE(1);
- repeat_stk[mem] = GET_STACK_INDEX(stk);
- STACK_PUSH_REPEAT(mem, p);
+ STACK_ENSURE(1);
+ repeat_stk[mem] = GET_STACK_INDEX(stk);
+ STACK_PUSH_REPEAT(mem, p);
- if (reg->repeat_range[mem].lower == 0) {
- STACK_PUSH_ALT(p, s, sprev);
- p += addr;
- }
+ if (reg->repeat_range[mem].lower == 0) {
+ STACK_PUSH_ALT(p, s, sprev);
+ p += addr;
+ }
}
MOP_OUT;
continue;
@@ -2604,9 +2641,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
case OP_POP_POS: MOP_IN(OP_POP_POS);
{
- STACK_POS_END(stkp);
- s = stkp->u.state.pstr;
- sprev = stkp->u.state.pstr_prev;
+ STACK_POS_END(stkp);
+ s = stkp->u.state.pstr;
+ sprev = stkp->u.state.pstr_prev;
}
MOP_OUT;
continue;
@@ -2650,15 +2687,15 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
GET_LENGTH_INC(tlen, p);
q = (UChar* )ONIGENC_STEP_BACK(encode, str, s, (int )tlen);
if (IS_NULL(q)) {
- /* too short case -> success. ex. /(?<!XXX)a/.match("a")
- If you want to change to fail, replace following line. */
- p += addr;
- /* goto fail; */
+ /* too short case -> success. ex. /(?<!XXX)a/.match("a")
+ If you want to change to fail, replace following line. */
+ p += addr;
+ /* goto fail; */
}
else {
- STACK_PUSH_LOOK_BEHIND_NOT(p + addr, s, sprev);
- s = q;
- sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s);
+ STACK_PUSH_LOOK_BEHIND_NOT(p + addr, s, sprev);
+ s = q;
+ sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s);
}
MOP_OUT;
continue;
@@ -2755,12 +2792,12 @@ slow_search(OnigEncoding enc, UChar* target, UChar* target_end,
p = s + 1;
t = target + 1;
while (t < target_end) {
- if (*t != *p++)
- break;
- t++;
+ if (*t != *p++)
+ break;
+ t++;
}
if (t == target_end)
- return s;
+ return s;
}
s += enclen(enc, s);
}
@@ -2832,12 +2869,12 @@ slow_search_backward(OnigEncoding enc, UChar* target, UChar* target_end,
p = s + 1;
t = target + 1;
while (t < target_end) {
- if (*t != *p++)
- break;
- t++;
+ if (*t != *p++)
+ break;
+ t++;
}
if (t == target_end)
- return s;
+ return s;
}
s = (UChar* )onigenc_get_prev_char_head(enc, adjust_text, s);
}
@@ -2898,8 +2935,8 @@ bm_search_notrev(regex_t* reg, const UChar* target, const UChar* target_end,
p = se = s + tlen1;
t = tail;
while (*p == *t) {
- if (t == target) return (UChar* )s;
- p--; t--;
+ if (t == target) return (UChar* )s;
+ p--; t--;
}
skip = reg->map[*se];
t = s;
@@ -2913,8 +2950,8 @@ bm_search_notrev(regex_t* reg, const UChar* target, const UChar* target_end,
p = se = s + tlen1;
t = tail;
while (*p == *t) {
- if (t == target) return (UChar* )s;
- p--; t--;
+ if (t == target) return (UChar* )s;
+ p--; t--;
}
skip = reg->int_map[*se];
t = s;
@@ -2945,8 +2982,8 @@ bm_search(regex_t* reg, const UChar* target, const UChar* target_end,
p = s;
t = tail;
while (*p == *t) {
- if (t == target) return (UChar* )p;
- p--; t--;
+ if (t == target) return (UChar* )p;
+ p--; t--;
}
s += reg->map[*s];
}
@@ -2956,8 +2993,8 @@ bm_search(regex_t* reg, const UChar* target, const UChar* target_end,
p = s;
t = tail;
while (*p == *t) {
- if (t == target) return (UChar* )p;
- p--; t--;
+ if (t == target) return (UChar* )p;
+ p--; t--;
}
s += reg->int_map[*s];
}
@@ -2965,6 +3002,7 @@ bm_search(regex_t* reg, const UChar* target, const UChar* target_end,
return (UChar* )NULL;
}
+#ifdef USE_INT_MAP_BACKWARD
static int
set_bm_backward_skip(UChar* s, UChar* end, OnigEncoding enc ARG_UNUSED,
int** skip)
@@ -3015,6 +3053,7 @@ bm_search_backward(regex_t* reg, const UChar* target, const UChar* target_end,
return (UChar* )NULL;
}
+#endif
static UChar*
map_search(OnigEncoding enc, UChar map[],
@@ -3053,7 +3092,7 @@ onig_match(regex_t* reg, const UChar* str, const UChar* end, const UChar* at, On
UChar *prev;
OnigMatchArg msa;
- MATCH_ARG_INIT(msa, option, region, at);
+ MATCH_ARG_INIT(msa, reg, option, region, at);
#ifdef USE_COMBINATION_EXPLOSION_CHECK
{
int offset = at - str;
@@ -3142,58 +3181,58 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s,
switch (reg->sub_anchor) {
case ANCHOR_BEGIN_LINE:
- if (!ON_STR_BEGIN(p)) {
- prev = onigenc_get_prev_char_head(reg->enc,
- (pprev ? pprev : str), p);
- if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end))
- goto retry_gate;
- }
- break;
+ if (!ON_STR_BEGIN(p)) {
+ prev = onigenc_get_prev_char_head(reg->enc,
+ (pprev ? pprev : str), p);
+ if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end))
+ goto retry_gate;
+ }
+ break;
case ANCHOR_END_LINE:
- if (ON_STR_END(p)) {
+ if (ON_STR_END(p)) {
#ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE
- prev = (UChar* )onigenc_get_prev_char_head(reg->enc,
- (pprev ? pprev : str), p);
- if (prev && ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end))
- goto retry_gate;
+ prev = (UChar* )onigenc_get_prev_char_head(reg->enc,
+ (pprev ? pprev : str), p);
+ if (prev && ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end))
+ goto retry_gate;
#endif
- }
- else if (! ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end)
+ }
+ else if (! ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end)
#ifdef USE_CRNL_AS_LINE_TERMINATOR
- && ! ONIGENC_IS_MBC_CRNL(reg->enc, p, end)
+ && ! ONIGENC_IS_MBC_CRNL(reg->enc, p, end)
#endif
- )
- goto retry_gate;
- break;
+ )
+ goto retry_gate;
+ break;
}
}
if (reg->dmax == 0) {
*low = p;
if (low_prev) {
- if (*low > s)
- *low_prev = onigenc_get_prev_char_head(reg->enc, s, p);
- else
- *low_prev = onigenc_get_prev_char_head(reg->enc,
- (pprev ? pprev : str), p);
+ if (*low > s)
+ *low_prev = onigenc_get_prev_char_head(reg->enc, s, p);
+ else
+ *low_prev = onigenc_get_prev_char_head(reg->enc,
+ (pprev ? pprev : str), p);
}
}
else {
if (reg->dmax != ONIG_INFINITE_DISTANCE) {
- *low = p - reg->dmax;
- if (*low > s) {
- *low = onigenc_get_right_adjust_char_head_with_prev(reg->enc, s,
- *low, (const UChar** )low_prev);
- if (low_prev && IS_NULL(*low_prev))
- *low_prev = onigenc_get_prev_char_head(reg->enc,
- (pprev ? pprev : s), *low);
- }
- else {
- if (low_prev)
- *low_prev = onigenc_get_prev_char_head(reg->enc,
- (pprev ? pprev : str), *low);
- }
+ *low = p - reg->dmax;
+ if (*low > s) {
+ *low = onigenc_get_right_adjust_char_head_with_prev(reg->enc, s,
+ *low, (const UChar** )low_prev);
+ if (low_prev && IS_NULL(*low_prev))
+ *low_prev = onigenc_get_prev_char_head(reg->enc,
+ (pprev ? pprev : s), *low);
+ }
+ else {
+ if (low_prev)
+ *low_prev = onigenc_get_prev_char_head(reg->enc,
+ (pprev ? pprev : str), *low);
+ }
}
}
/* no needs to adjust *high, *high is used as range check only */
@@ -3210,8 +3249,6 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s,
return 0; /* fail */
}
-static int set_bm_backward_skip P_((UChar* s, UChar* end, OnigEncoding enc,
- int** skip));
#define BM_BACKWARD_SEARCH_LENGTH_THRESHOLD 100
@@ -3220,7 +3257,6 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end,
UChar* s, const UChar* range, UChar* adjrange,
UChar** low, UChar** high)
{
- int r;
UChar *p;
range += reg->dmin;
@@ -3242,16 +3278,22 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end,
case ONIG_OPTIMIZE_EXACT_BM:
case ONIG_OPTIMIZE_EXACT_BM_NOT_REV:
+#ifdef USE_INT_MAP_BACKWARD
if (IS_NULL(reg->int_map_backward)) {
+ int r;
+
if (s - range < BM_BACKWARD_SEARCH_LENGTH_THRESHOLD)
- goto exact_method;
+ goto exact_method;
r = set_bm_backward_skip(reg->exact, reg->exact_end, reg->enc,
- &(reg->int_map_backward));
+ &(reg->int_map_backward));
if (r) return r;
}
p = bm_search_backward(reg, reg->exact, reg->exact_end, range, adjrange,
- end, p);
+ end, p);
+#else
+ goto exact_method;
+#endif
break;
case ONIG_OPTIMIZE_MAP:
@@ -3265,36 +3307,36 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end,
switch (reg->sub_anchor) {
case ANCHOR_BEGIN_LINE:
- if (!ON_STR_BEGIN(p)) {
- prev = onigenc_get_prev_char_head(reg->enc, str, p);
- if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) {
- p = prev;
- goto retry;
- }
- }
- break;
+ if (!ON_STR_BEGIN(p)) {
+ prev = onigenc_get_prev_char_head(reg->enc, str, p);
+ if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) {
+ p = prev;
+ goto retry;
+ }
+ }
+ break;
case ANCHOR_END_LINE:
- if (ON_STR_END(p)) {
+ if (ON_STR_END(p)) {
#ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE
- prev = onigenc_get_prev_char_head(reg->enc, adjrange, p);
- if (IS_NULL(prev)) goto fail;
- if (ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) {
- p = prev;
- goto retry;
- }
-#endif
- }
- else if (! ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end)
+ prev = onigenc_get_prev_char_head(reg->enc, adjrange, p);
+ if (IS_NULL(prev)) goto fail;
+ if (ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) {
+ p = prev;
+ goto retry;
+ }
+#endif
+ }
+ else if (! ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end)
#ifdef USE_CRNL_AS_LINE_TERMINATOR
- && ! ONIGENC_IS_MBC_CRNL(reg->enc, p, end)
+ && ! ONIGENC_IS_MBC_CRNL(reg->enc, p, end)
#endif
- ) {
- p = onigenc_get_prev_char_head(reg->enc, adjrange, p);
- if (IS_NULL(p)) goto fail;
- goto retry;
- }
- break;
+ ) {
+ p = onigenc_get_prev_char_head(reg->enc, adjrange, p);
+ if (IS_NULL(p)) goto fail;
+ goto retry;
+ }
+ break;
}
}
@@ -3405,56 +3447,56 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end,
/* search start-position only */
begin_position:
if (range > start)
- range = start + 1;
+ range = start + 1;
else
- range = start;
+ range = start;
}
else if (reg->anchor & ANCHOR_BEGIN_BUF) {
/* search str-position only */
if (range > start) {
- if (start != str) goto mismatch_no_msa;
- range = str + 1;
+ if (start != str) goto mismatch_no_msa;
+ range = str + 1;
}
else {
- if (range <= str) {
- start = str;
- range = str;
- }
- else
- goto mismatch_no_msa;
+ if (range <= str) {
+ start = str;
+ range = str;
+ }
+ else
+ goto mismatch_no_msa;
}
}
else if (reg->anchor & ANCHOR_END_BUF) {
min_semi_end = max_semi_end = (UChar* )end;
end_buf:
- if ((OnigDistance )(max_semi_end - str) < reg->anchor_dmin)
+ if ((OnigLen )(max_semi_end - str) < reg->anchor_dmin)
goto mismatch_no_msa;
if (range > start) {
- if ((OnigDistance )(min_semi_end - start) > reg->anchor_dmax) {
- start = min_semi_end - reg->anchor_dmax;
- if (start < end)
- start = onigenc_get_right_adjust_char_head(reg->enc, str, start);
- else { /* match with empty at end */
- start = onigenc_get_prev_char_head(reg->enc, str, end);
- }
- }
- if ((OnigDistance )(max_semi_end - (range - 1)) < reg->anchor_dmin) {
- range = max_semi_end - reg->anchor_dmin + 1;
- }
-
- if (start >= range) goto mismatch_no_msa;
+ if ((OnigLen )(min_semi_end - start) > reg->anchor_dmax) {
+ start = min_semi_end - reg->anchor_dmax;
+ if (start < end)
+ start = onigenc_get_right_adjust_char_head(reg->enc, str, start);
+ else { /* match with empty at end */
+ start = onigenc_get_prev_char_head(reg->enc, str, end);
+ }
+ }
+ if ((OnigLen )(max_semi_end - (range - 1)) < reg->anchor_dmin) {
+ range = max_semi_end - reg->anchor_dmin + 1;
+ }
+
+ if (start >= range) goto mismatch_no_msa;
}
else {
- if ((OnigDistance )(min_semi_end - range) > reg->anchor_dmax) {
- range = min_semi_end - reg->anchor_dmax;
- }
- if ((OnigDistance )(max_semi_end - start) < reg->anchor_dmin) {
- start = max_semi_end - reg->anchor_dmin;
- start = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, start);
- }
- if (range > start) goto mismatch_no_msa;
+ if ((OnigLen )(min_semi_end - range) > reg->anchor_dmax) {
+ range = min_semi_end - reg->anchor_dmax;
+ }
+ if ((OnigLen )(max_semi_end - start) < reg->anchor_dmin) {
+ start = max_semi_end - reg->anchor_dmin;
+ start = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, start);
+ }
+ if (range > start) goto mismatch_no_msa;
}
}
else if (reg->anchor & ANCHOR_SEMI_END_BUF) {
@@ -3462,22 +3504,22 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end,
max_semi_end = (UChar* )end;
if (ONIGENC_IS_MBC_NEWLINE(reg->enc, pre_end, end)) {
- min_semi_end = pre_end;
+ min_semi_end = pre_end;
#ifdef USE_CRNL_AS_LINE_TERMINATOR
- pre_end = ONIGENC_STEP_BACK(reg->enc, str, pre_end, 1);
- if (IS_NOT_NULL(pre_end) &&
- ONIGENC_IS_MBC_CRNL(reg->enc, pre_end, end)) {
- min_semi_end = pre_end;
- }
+ pre_end = ONIGENC_STEP_BACK(reg->enc, str, pre_end, 1);
+ if (IS_NOT_NULL(pre_end) &&
+ ONIGENC_IS_MBC_CRNL(reg->enc, pre_end, end)) {
+ min_semi_end = pre_end;
+ }
#endif
- if (min_semi_end > str && start <= min_semi_end) {
- goto end_buf;
- }
+ if (min_semi_end > str && start <= min_semi_end) {
+ goto end_buf;
+ }
}
else {
- min_semi_end = (UChar* )end;
- goto end_buf;
+ min_semi_end = (UChar* )end;
+ goto end_buf;
}
}
else if ((reg->anchor & ANCHOR_ANYCHAR_STAR_ML)) {
@@ -3496,7 +3538,7 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end,
s = (UChar* )start;
prev = (UChar* )NULL;
- MATCH_ARG_INIT(msa, option, region, start);
+ MATCH_ARG_INIT(msa, reg, option, region, start);
#ifdef USE_COMBINATION_EXPLOSION_CHECK
msa.state_check_buff = (void* )0;
msa.state_check_buff_size = 0; /* NO NEED, for valgrind */
@@ -3512,7 +3554,7 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end,
(int )(end - str), (int )(start - str), (int )(range - str));
#endif
- MATCH_ARG_INIT(msa, option, region, orig_start);
+ MATCH_ARG_INIT(msa, reg, option, region, orig_start);
#ifdef USE_COMBINATION_EXPLOSION_CHECK
{
int offset = (MIN(start, range) - str);
@@ -3532,36 +3574,36 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end,
sch_range = (UChar* )range;
if (reg->dmax != 0) {
- if (reg->dmax == ONIG_INFINITE_DISTANCE)
- sch_range = (UChar* )end;
- else {
- sch_range += reg->dmax;
- if (sch_range > end) sch_range = (UChar* )end;
- }
+ if (reg->dmax == ONIG_INFINITE_DISTANCE)
+ sch_range = (UChar* )end;
+ else {
+ sch_range += reg->dmax;
+ if (sch_range > end) sch_range = (UChar* )end;
+ }
}
if ((end - start) < reg->threshold_len)
goto mismatch;
if (reg->dmax != ONIG_INFINITE_DISTANCE) {
- do {
- if (! forward_search_range(reg, str, end, s, sch_range,
- &low, &high, &low_prev)) goto mismatch;
- if (s < low) {
- s = low;
- prev = low_prev;
- }
- while (s <= high) {
- MATCH_AND_RETURN_CHECK(orig_range);
- prev = s;
- s += enclen(reg->enc, s);
- }
- } while (s < range);
- goto mismatch;
+ do {
+ if (! forward_search_range(reg, str, end, s, sch_range,
+ &low, &high, &low_prev)) goto mismatch;
+ if (s < low) {
+ s = low;
+ prev = low_prev;
+ }
+ while (s <= high) {
+ MATCH_AND_RETURN_CHECK(orig_range);
+ prev = s;
+ s += enclen(reg->enc, s);
+ }
+ } while (s < range);
+ goto mismatch;
}
else { /* check only. */
- if (! forward_search_range(reg, str, end, s, sch_range,
- &low, &high, (UChar** )NULL)) goto mismatch;
+ if (! forward_search_range(reg, str, end, s, sch_range,
+ &low, &high, (UChar** )NULL)) goto mismatch;
if ((reg->anchor & ANCHOR_ANYCHAR_STAR) != 0) {
do {
@@ -3599,47 +3641,47 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end,
UChar *low, *high, *adjrange, *sch_start;
if (range < end)
- adjrange = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, range);
+ adjrange = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, range);
else
- adjrange = (UChar* )end;
+ adjrange = (UChar* )end;
if (reg->dmax != ONIG_INFINITE_DISTANCE &&
- (end - range) >= reg->threshold_len) {
- do {
- sch_start = s + reg->dmax;
- if (sch_start > end) sch_start = (UChar* )end;
- if (backward_search_range(reg, str, end, sch_start, range, adjrange,
- &low, &high) <= 0)
- goto mismatch;
-
- if (s > high)
- s = high;
-
- while (s >= low) {
- prev = onigenc_get_prev_char_head(reg->enc, str, s);
- MATCH_AND_RETURN_CHECK(orig_start);
- s = prev;
- }
- } while (s >= range);
- goto mismatch;
+ (end - range) >= reg->threshold_len) {
+ do {
+ sch_start = s + reg->dmax;
+ if (sch_start > end) sch_start = (UChar* )end;
+ if (backward_search_range(reg, str, end, sch_start, range, adjrange,
+ &low, &high) <= 0)
+ goto mismatch;
+
+ if (s > high)
+ s = high;
+
+ while (s >= low) {
+ prev = onigenc_get_prev_char_head(reg->enc, str, s);
+ MATCH_AND_RETURN_CHECK(orig_start);
+ s = prev;
+ }
+ } while (s >= range);
+ goto mismatch;
}
else { /* check only. */
- if ((end - range) < reg->threshold_len) goto mismatch;
-
- sch_start = s;
- if (reg->dmax != 0) {
- if (reg->dmax == ONIG_INFINITE_DISTANCE)
- sch_start = (UChar* )end;
- else {
- sch_start += reg->dmax;
- if (sch_start > end) sch_start = (UChar* )end;
- else
- sch_start = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc,
- start, sch_start);
- }
- }
- if (backward_search_range(reg, str, end, sch_start, range, adjrange,
- &low, &high) <= 0) goto mismatch;
+ if ((end - range) < reg->threshold_len) goto mismatch;
+
+ sch_start = s;
+ if (reg->dmax != 0) {
+ if (reg->dmax == ONIG_INFINITE_DISTANCE)
+ sch_start = (UChar* )end;
+ else {
+ sch_start += reg->dmax;
+ if (sch_start > end) sch_start = (UChar* )end;
+ else
+ sch_start = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc,
+ start, sch_start);
+ }
+ }
+ if (backward_search_range(reg, str, end, sch_start, range, adjrange,
+ &low, &high) <= 0) goto mismatch;
}
}
@@ -3694,6 +3736,46 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end,
return s - str;
}
+extern int
+onig_scan(regex_t* reg, const UChar* str, const UChar* end,
+ OnigRegion* region, OnigOptionType option,
+ int (*scan_callback)(int, int, OnigRegion*, void*),
+ void* callback_arg)
+{
+ int r;
+ int n;
+ int rs;
+ const UChar* start;
+
+ n = 0;
+ start = str;
+ while (1) {
+ r = onig_search(reg, str, end, start, end, region, option);
+ if (r >= 0) {
+ rs = scan_callback(n, r, region, callback_arg);
+ n++;
+ if (rs != 0)
+ return rs;
+
+ if (region->end[0] == start - str)
+ start++;
+ else
+ start = str + region->end[0];
+
+ if (start > end)
+ break;
+ }
+ else if (r == ONIG_MISMATCH) {
+ break;
+ }
+ else { /* error */
+ return r;
+ }
+ }
+
+ return n;
+}
+
extern OnigEncoding
onig_get_encoding(regex_t* reg)
{
diff --git a/src/regint.h b/src/regint.h
index 5476626..d320e26 100644
--- a/src/regint.h
+++ b/src/regint.h
@@ -708,6 +708,7 @@ typedef struct {
int stack_n;
OnigOptionType options;
OnigRegion* region;
+ int ptr_num;
const UChar* start; /* search start position (for \G: BEGIN_POSITION) */
#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE
int best_len; /* for ONIG_OPTION_FIND_LONGEST */
@@ -750,6 +751,7 @@ extern void onig_print_statistics P_((FILE* f));
#endif
#endif
+extern void onig_warning(const char* s);
extern UChar* onig_error_code_to_format P_((int code));
extern void onig_snprintf_with_pattern PV_((UChar buf[], int bufsize, OnigEncoding enc, UChar* pat, UChar* pat_end, const UChar *fmt, ...));
extern int onig_bbuf_init P_((BBuf* buf, int size));
diff --git a/src/regparse.c b/src/regparse.c
index e06d9d2..6be8366 100644
--- a/src/regparse.c
+++ b/src/regparse.c
@@ -26,7 +26,6 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
-
#include "regparse.h"
#include "st.h"
@@ -97,6 +96,14 @@ extern void onig_set_verb_warn_func(OnigWarnFunc f)
onig_verb_warn = f;
}
+extern void
+onig_warning(const char* s)
+{
+ if (onig_warn == onig_null_warn) return ;
+
+ (*onig_warn)(s);
+}
+
static void
bbuf_free(BBuf* bbuf)
{
@@ -957,6 +964,9 @@ scan_env_add_mem_entry(ScanEnv* env)
Node** p;
need = env->num_mem + 1;
+ if (need > ONIG_MAX_CAPTURE_NUM)
+ return ONIGERR_TOO_MANY_CAPTURES;
+
if (need >= SCANENV_MEMNODES_SIZE) {
if (env->mem_alloc <= need) {
if (IS_NULL(env->mem_nodes_dynamic)) {
@@ -1987,8 +1997,8 @@ or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
return 0;
}
-static int
-conv_backslash_value(int c, ScanEnv* env)
+static OnigCodePoint
+conv_backslash_value(OnigCodePoint c, ScanEnv* env)
{
if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
switch (c) {
@@ -2259,7 +2269,7 @@ fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
if (p == prev) {
if (non_low != 0)
- goto invalid;
+ goto invalid;
up = REPEAT_INFINITE; /* {n,} : {n,infinite} */
}
}
@@ -2291,15 +2301,17 @@ fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
return r; /* 0: normal {n,m}, 2: fixed {n} */
invalid:
- if (syn_allow)
+ if (syn_allow) {
+ *src = p;
return 1; /* OK */
+ }
else
return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
}
/* \M-, \C-, \c, or \... */
static int
-fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
+fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val)
{
int v;
OnigCodePoint c;
@@ -2318,9 +2330,8 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
if (PEND) return ONIGERR_END_PATTERN_AT_META;
PFETCH_S(c);
if (c == MC_ESC(env->syntax)) {
- v = fetch_escaped_value(&p, end, env);
+ v = fetch_escaped_value(&p, end, env, &c);
if (v < 0) return v;
- c = (OnigCodePoint )v;
}
c = ((c & 0xff) | 0x80);
}
@@ -2348,9 +2359,8 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
}
else {
if (c == MC_ESC(env->syntax)) {
- v = fetch_escaped_value(&p, end, env);
+ v = fetch_escaped_value(&p, end, env, &c);
if (v < 0) return v;
- c = (OnigCodePoint )v;
}
c &= 0x9f;
}
@@ -2367,7 +2377,8 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
}
*src = p;
- return c;
+ *val = c;
+ return 0;
}
static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
@@ -2463,6 +2474,10 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
int level;
int flag = (c == '-' ? -1 : 1);
+ if (PEND) {
+ r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
+ goto end;
+ }
PFETCH(c);
if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
PUNFETCH;
@@ -2471,9 +2486,11 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
*rlevel = (level * flag);
exist_level = 1;
- PFETCH(c);
- if (c == end_code)
- goto end;
+ if (!PEND) {
+ PFETCH(c);
+ if (c == end_code)
+ goto end;
+ }
}
err:
@@ -2880,6 +2897,8 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
case 'p':
case 'P':
+ if (PEND) break;
+
c2 = PPEEK;
if (c2 == '{' &&
IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
@@ -2887,7 +2906,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
tok->type = TK_CHAR_PROPERTY;
tok->u.prop.not = (c == 'P' ? 1 : 0);
- if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
+ if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
PFETCH(c2);
if (c2 == '^') {
tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
@@ -2903,25 +2922,25 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
prev = p;
if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
- PINC;
- num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
- if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
- if (!PEND) {
+ PINC;
+ num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
+ if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
+ if (!PEND) {
c2 = PPEEK;
if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
}
- if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) {
- PINC;
- tok->type = TK_CODE_POINT;
- tok->base = 16;
- tok->u.code = (OnigCodePoint )num;
- }
- else {
- /* can't read nothing or invalid format */
- p = prev;
- }
+ if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) {
+ PINC;
+ tok->type = TK_CODE_POINT;
+ tok->base = 16;
+ tok->u.code = (OnigCodePoint )num;
+ }
+ else {
+ /* can't read nothing or invalid format */
+ p = prev;
+ }
}
else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
@@ -2969,10 +2988,10 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
default:
PUNFETCH;
- num = fetch_escaped_value(&p, end, env);
+ num = fetch_escaped_value(&p, end, env, &c2);
if (num < 0) return num;
- if (tok->u.c != num) {
- tok->u.code = (OnigCodePoint )num;
+ if (tok->u.c != c2) {
+ tok->u.code = c2;
tok->type = TK_CODE_POINT;
}
break;
@@ -3332,7 +3351,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
if (p == prev) { /* can't read nothing. */
num = 0; /* but, it's not error */
- }
+ }
tok->type = TK_RAW_BYTE;
tok->base = 8;
tok->u.c = num;
@@ -3344,7 +3363,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
#ifdef USE_NAMED_GROUP
case 'k':
- if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
+ if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
PFETCH(c);
if (c == '<' || c == '\'') {
UChar* name_end;
@@ -3417,7 +3436,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
#ifdef USE_SUBEXP_CALL
case 'g':
- if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
+ if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
PFETCH(c);
if (c == '<' || c == '\'') {
int gnum;
@@ -3446,13 +3465,14 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
case 'p':
case 'P':
- if (PPEEK_IS('{') &&
+ if (!PEND && PPEEK_IS('{') &&
IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
PINC;
tok->type = TK_CHAR_PROPERTY;
tok->u.prop.not = (c == 'P' ? 1 : 0);
- if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
+ if (!PEND &&
+ IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
PFETCH(c);
if (c == '^') {
tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
@@ -3464,16 +3484,20 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
break;
default:
- PUNFETCH;
- num = fetch_escaped_value(&p, end, env);
- if (num < 0) return num;
- /* set_raw: */
- if (tok->u.c != num) {
- tok->type = TK_CODE_POINT;
- tok->u.code = (OnigCodePoint )num;
- }
- else { /* string */
- p = tok->backp + enclen(enc, tok->backp);
+ {
+ OnigCodePoint c2;
+
+ PUNFETCH;
+ num = fetch_escaped_value(&p, end, env, &c2);
+ if (num < 0) return num;
+ /* set_raw: */
+ if (tok->u.c != c2) {
+ tok->type = TK_CODE_POINT;
+ tok->u.code = c2;
+ }
+ else { /* string */
+ p = tok->backp + enclen(enc, tok->backp);
+ }
}
break;
}
@@ -3548,10 +3572,10 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
if (r < 0) return r; /* error */
if (r == 0) goto greedy_check;
else if (r == 2) { /* {n} */
- if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
- goto possessive_check;
+ if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
+ goto possessive_check;
- goto greedy_check;
+ goto greedy_check;
}
/* r == 1 : normal char */
break;
@@ -3562,10 +3586,10 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
break;
case '(':
- if (PPEEK_IS('?') &&
+ if (!PEND && PPEEK_IS('?') &&
IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
PINC;
- if (PPEEK_IS('#')) {
+ if (!PEND && PPEEK_IS('#')) {
PFETCH(c);
while (1) {
if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
@@ -3612,7 +3636,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
case ']':
if (*src > env->pattern) /* /].../ is allowed. */
- CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
+ CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
break;
case '#':
@@ -3975,8 +3999,9 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
switch (*state) {
case CCS_VALUE:
- if (*type == CCV_SB)
+ if (*type == CCV_SB) {
BITSET_SET_BIT(cc->bs, (int )(*vs));
+ }
else if (*type == CCV_CODE_POINT) {
r = add_code_range(&(cc->mbuf), env, *vs, *vs);
if (r < 0) return r;
@@ -3989,13 +4014,13 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
if (*vs > 0xff || v > 0xff)
return ONIGERR_INVALID_CODE_POINT_VALUE;
- if (*vs > v) {
- if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
- goto ccs_range_end;
- else
- return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
- }
- bitset_set_range(cc->bs, (int )*vs, (int )v);
+ if (*vs > v) {
+ if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
+ goto ccs_range_end;
+ else
+ return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
+ }
+ bitset_set_range(cc->bs, (int )*vs, (int )v);
}
else {
r = add_code_range(&(cc->mbuf), env, *vs, v);
@@ -4006,15 +4031,15 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
#if 0
if (intype == CCV_CODE_POINT && *type == CCV_SB) {
#endif
- if (*vs > v) {
- if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
- goto ccs_range_end;
- else
- return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
- }
- bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
- r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
- if (r < 0) return r;
+ if (*vs > v) {
+ if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
+ goto ccs_range_end;
+ else
+ return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
+ }
+ bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
+ r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
+ if (r < 0) return r;
#if 0
}
else
@@ -4110,6 +4135,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
fetched = 0;
switch (r) {
case TK_CHAR:
+ any_char_in:
len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c);
if (len > 1) {
in_type = CCV_CODE_POINT;
@@ -4119,7 +4145,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
goto err;
}
else {
- sb_char:
+ /* sb_char: */
in_type = CCV_SB;
}
v = (OnigCodePoint )tok->u.c;
@@ -4265,7 +4291,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
}
else if (state == CCS_RANGE) {
CC_ESC_WARN(env, (UChar* )"-");
- goto sb_char; /* [!--x] is allowed */
+ goto any_char_in; /* [!--x] is allowed */
}
else { /* CCS_COMPLETE */
r = fetch_token_in_cc(tok, &p, end, env);
@@ -4279,7 +4305,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
CC_ESC_WARN(env, (UChar* )"-");
- goto sb_char; /* [0-9-a] is allowed as [0-9\-a] */
+ goto any_char_in; /* [0-9-a] is allowed as [0-9\-a] */
}
r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
goto err;
@@ -4452,6 +4478,7 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
#endif
case '<': /* look behind (?<=...), (?<!...) */
+ if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
PFETCH(c);
if (c == '=')
*np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
@@ -4924,7 +4951,7 @@ parse_exp(Node** np, OnigToken* tok, int term,
len = 1;
while (1) {
if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
- if (len == enclen(env->enc, NSTR(*np)->s)) {
+ if (len == enclen(env->enc, NSTR(*np)->s)) {//should not enclen_end()
r = fetch_token(tok, src, end, env);
NSTRING_CLEAR_RAW(*np);
goto string_end;
@@ -5300,6 +5327,10 @@ onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end,
env->reg = reg;
*root = NULL;
+
+ if (! ONIGENC_IS_VALID_MBC_STRING(env->enc, pattern, end))
+ return ONIGERR_INVALID_WIDE_CHAR_VALUE;
+
p = (UChar* )pattern;
r = parse_regexp(root, &p, (UChar* )end, env);
reg->num_mem = env->num_mem;
diff --git a/src/regparse.h b/src/regparse.h
index fff707a..9e366fe 100644
--- a/src/regparse.h
+++ b/src/regparse.h
@@ -191,8 +191,8 @@ typedef struct {
struct _Node* target;
AbsAddrType call_addr;
/* for multiple call reference */
- OnigDistance min_len; /* min length (byte) */
- OnigDistance max_len; /* max length (byte) */
+ OnigLen min_len; /* min length (byte) */
+ OnigLen max_len; /* max length (byte) */
int char_len; /* character length */
int opt_count; /* referenced count in optimize_node_left() */
} EncloseNode;
diff --git a/src/sjis.c b/src/sjis.c
index 84843ae..a607b3d 100644
--- a/src/sjis.c
+++ b/src/sjis.c
@@ -77,6 +77,12 @@ mbc_enc_len(const UChar* p)
}
static int
+is_valid_mbc_string(const UChar* s, const UChar* end)
+{
+ return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_SJIS, s, end);
+}
+
+static int
code_to_mbclen(OnigCodePoint code)
{
if (code < 256) {
@@ -303,5 +309,6 @@ OnigEncodingType OnigEncodingSJIS = {
left_adjust_char_head,
is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ is_valid_mbc_string
};
diff --git a/src/unicode.c b/src/unicode.c
index df20ef9..8812ca2 100644
--- a/src/unicode.c
+++ b/src/unicode.c
@@ -111,8 +111,10 @@ onig_unicode_define_user_property(const char* name, OnigCodePoint* ranges)
n = 0;
for (i = 0; i < len; i++) {
c = name[i];
- if (c <= 0 || c >= 0x80)
+ if (c <= 0 || c >= 0x80) {
+ xfree(s);
return ONIGERR_INVALID_CHAR_PROPERTY_NAME;
+ }
if (c != ' ' && c != '-' && c != '_') {
s[n] = c;
@@ -483,12 +485,13 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
buk = unicode_unfold_key(code);
if (buk != 0) {
if (buk->fold_len == 1) {
+ int un;
items[0].byte_len = len;
items[0].code_len = 1;
items[0].code[0] = *FOLDS1_FOLD(buk->index);
n++;
- int un = FOLDS1_UNFOLDS_NUM(buk->index);
+ un = FOLDS1_UNFOLDS_NUM(buk->index);
for (i = 0; i < un; i++) {
OnigCodePoint unfold = FOLDS1_UNFOLDS(buk->index)[i];
if (unfold != code) {
@@ -517,8 +520,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
}
for (fn = 0; fn < 2; fn++) {
+ int index;
cs[fn][0] = FOLDS2_FOLD(buk->index)[fn];
- int index = unicode_fold1_key(&cs[fn][0]);
+ index = unicode_fold1_key(&cs[fn][0]);
if (index >= 0) {
int m = FOLDS1_UNFOLDS_NUM(index);
for (i = 0; i < m; i++) {
@@ -553,8 +557,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
}
for (fn = 0; fn < 3; fn++) {
+ int index;
cs[fn][0] = FOLDS3_FOLD(buk->index)[fn];
- int index = unicode_fold1_key(&cs[fn][0]);
+ index = unicode_fold1_key(&cs[fn][0]);
if (index >= 0) {
int m = FOLDS1_UNFOLDS_NUM(index);
for (i = 0; i < m; i++) {
@@ -603,6 +608,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
p += len;
if (p < end) {
int clen;
+ int index;
codes[0] = code;
code = ONIGENC_MBC_TO_CODE(enc, p, end);
@@ -617,7 +623,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
clen = enclen(enc, p);
len += clen;
- int index = unicode_fold2_key(codes);
+ index = unicode_fold2_key(codes);
if (index >= 0) {
m = FOLDS2_UNFOLDS_NUM(index);
for (i = 0; i < m; i++) {
diff --git a/src/utf16_be.c b/src/utf16_be.c
index e93b42a..f220cca 100644
--- a/src/utf16_be.c
+++ b/src/utf16_be.c
@@ -55,6 +55,12 @@ utf16be_mbc_enc_len(const UChar* p)
}
static int
+is_valid_mbc_string(const UChar* s, const UChar* end)
+{
+ return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_UTF16_BE, s, end);
+}
+
+static int
utf16be_is_mbc_newline(const UChar* p, const UChar* end)
{
if (p + 1 < end) {
@@ -224,5 +230,6 @@ OnigEncodingType OnigEncodingUTF16_BE = {
utf16be_left_adjust_char_head,
onigenc_always_false_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ is_valid_mbc_string
};
diff --git a/src/utf16_le.c b/src/utf16_le.c
index 2d9af52..89bc72f 100644
--- a/src/utf16_le.c
+++ b/src/utf16_le.c
@@ -61,6 +61,21 @@ utf16le_mbc_enc_len(const UChar* p)
}
static int
+is_valid_mbc_string(const UChar* p, const UChar* end)
+{
+ const UChar* end1 = end - 1;
+
+ while (p < end1) {
+ p += utf16le_mbc_enc_len(p);
+ }
+
+ if (p != end)
+ return FALSE;
+ else
+ return TRUE;
+}
+
+static int
utf16le_is_mbc_newline(const UChar* p, const UChar* end)
{
if (p + 1 < end) {
@@ -225,5 +240,6 @@ OnigEncodingType OnigEncodingUTF16_LE = {
utf16le_left_adjust_char_head,
onigenc_always_false_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ is_valid_mbc_string
};
diff --git a/src/utf32_be.c b/src/utf32_be.c
index b8f64af..d0c7f39 100644
--- a/src/utf32_be.c
+++ b/src/utf32_be.c
@@ -36,6 +36,12 @@ utf32be_mbc_enc_len(const UChar* p ARG_UNUSED)
}
static int
+is_valid_mbc_string(const UChar* s, const UChar* end)
+{
+ return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_UTF32_BE, s, end);
+}
+
+static int
utf32be_is_mbc_newline(const UChar* p, const UChar* end)
{
if (p + 3 < end) {
@@ -183,5 +189,6 @@ OnigEncodingType OnigEncodingUTF32_BE = {
utf32be_left_adjust_char_head,
onigenc_always_false_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ is_valid_mbc_string
};
diff --git a/src/utf32_le.c b/src/utf32_le.c
index a5a048e..33200d1 100644
--- a/src/utf32_le.c
+++ b/src/utf32_le.c
@@ -36,6 +36,12 @@ utf32le_mbc_enc_len(const UChar* p ARG_UNUSED)
}
static int
+is_valid_mbc_string(const UChar* s, const UChar* end)
+{
+ return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_UTF32_LE, s, end);
+}
+
+static int
utf32le_is_mbc_newline(const UChar* p, const UChar* end)
{
if (p + 3 < end) {
@@ -183,5 +189,6 @@ OnigEncodingType OnigEncodingUTF32_LE = {
utf32le_left_adjust_char_head,
onigenc_always_false_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ is_valid_mbc_string
};
diff --git a/src/utf8.c b/src/utf8.c
index b78e7eb..219b7ea 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -29,7 +29,7 @@
#include "regenc.h"
-#define USE_INVALID_CODE_SCHEME
+//#define USE_INVALID_CODE_SCHEME
#ifdef USE_INVALID_CODE_SCHEME
/* virtual codepoint values for invalid encoding byte 0xfe and 0xff */
@@ -39,6 +39,7 @@
#endif
#define utf8_islead(c) ((UChar )((c) & 0xc0) != 0x80)
+#define utf8_istail(c) ((UChar )((c) & 0xc0) == 0x80)
static const int EncLen_UTF8[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -66,6 +67,30 @@ mbc_enc_len(const UChar* p)
}
static int
+is_valid_mbc_string(const UChar* p, const UChar* end)
+{
+ int i, len;
+
+ while (p < end) {
+ if (! utf8_islead(*p))
+ return FALSE;
+
+ len = mbc_enc_len(p++);
+ if (len > 1) {
+ for (i = 1; i < len; i++) {
+ if (p == end)
+ return FALSE;
+
+ if (! utf8_istail(*p++))
+ return FALSE;
+ }
+ }
+ }
+
+ return TRUE;
+}
+
+static int
is_mbc_newline(const UChar* p, const UChar* end)
{
if (p < end) {
@@ -91,12 +116,14 @@ is_mbc_newline(const UChar* p, const UChar* end)
}
static OnigCodePoint
-mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)
+mbc_to_code(const UChar* p, const UChar* end)
{
int c, len;
OnigCodePoint n;
- len = enclen(ONIG_ENCODING_UTF8, p);
+ len = mbc_enc_len(p);
+ if (len > end - p) len = end - p;
+
c = *p++;
if (len > 1) {
len--;
@@ -303,5 +330,6 @@ OnigEncodingType OnigEncodingUTF8 = {
left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ is_valid_mbc_string
};