From b134093d75235a90f09ff591137aed9dbdad6e89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Wed, 7 Aug 2019 09:32:37 +0200 Subject: Correct typo in watch file --- debian/watch | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/debian/watch b/debian/watch index 8a7b475..2f0e85f 100644 --- a/debian/watch +++ b/debian/watch @@ -4,4 +4,4 @@ dversionmangle=s/\+(debian|dfsg|ds|deb)\d*$//,\ uversionmangle=s/(\d)[_\.\-\+]?((RC|rc|pre|dev|beta|alpha)\d*)$/$1~$2/;s/RC/rc/;s/\-/\./g;s/\_/\./g,\ filenamemangle=s/(?:.*?)?(?:rel|v|oniguruma|ONIGURUMA)?[\-\_]?(\d\S+)\.(tgz|tbz|txz|(?:tar\.(?:gz|bz2|xz)))/oniguruma-$1.$2/ \ https://github.com/kkos/oniguruma/tags \ -(?:.*?/)?(?:rel|v|oniguruma|ONIGURUMA)?[\-\_]?(\d\S+)\.(?:tgz|tbz|txz|(?:tar\.(?:gz|bz2|xz))) \ +(?:.*?/)?(?:rel|v|oniguruma|ONIGURUMA)?[\-\_]?(\d\S+)\.(?:tgz|tbz|txz|(?:tar\.(?:gz|bz2|xz))) -- cgit v1.2.3 From 40f3d0030e6e98bcb02d6523e5ee48497dec49a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Frings-F=C3=BCrst?= Date: Wed, 7 Aug 2019 09:32:48 +0200 Subject: New upstream version 6.9.3 --- .gitignore | 3 + CMakeLists.txt | 41 ++++--- HISTORY | 14 +++ Makefile.am | 6 + README.md | 71 ++++++----- build_harnesses.sh | 31 +++++ configure.ac | 2 +- contributed/libfuzzer-onig.cpp | 2 + doc/API | 26 ++++- doc/API.ja | 20 +++- doc/UNICODE_PROPERTIES | 2 +- harnesses/ascii_compatible.dict | 111 ++++++++++++++++++ harnesses/deluxe-encode-harness.c | 239 ++++++++++++++++++++++++++++++++++++++ harnesses/dict_conv.py | 72 ++++++++++++ harnesses/encode-harness.c | 170 +++++++++++++++++++++++++++ harnesses/syntax-harness.c | 120 +++++++++++++++++++ index.html | 4 +- index_ja.html | 4 +- sample/bug_fix.c | 56 ++------- sample/crnl.c | 2 + sample/encode.c | 142 ++++------------------ sample/listcap.c | 2 + sample/names.c | 3 + sample/posix.c | 5 + sample/scan.c | 2 + sample/simple.c | 3 + sample/sql.c | 4 + sample/syntax.c | 2 + sample/user_property.c | 5 + src/gb18030.c | 6 +- src/oniguruma.h | 11 +- src/regcomp.c | 156 +++++++++++++++---------- src/regenc.c | 2 + src/regerror.c | 17 +++ src/regexec.c | 130 ++++++++++++++------- src/regext.c | 6 +- src/regint.h | 6 +- src/regparse.c | 190 +++++++++++++++++------------- src/regparse.h | 22 ++-- src/utf16_be.c | 35 +++++- src/utf16_le.c | 26 ++++- test/test_utf8.c | 13 +++ test/testu.c | 15 --- 43 files changed, 1350 insertions(+), 449 deletions(-) create mode 100755 build_harnesses.sh create mode 100644 harnesses/ascii_compatible.dict create mode 100644 harnesses/deluxe-encode-harness.c create mode 100644 harnesses/dict_conv.py create mode 100644 harnesses/encode-harness.c create mode 100644 harnesses/syntax-harness.c diff --git a/.gitignore b/.gitignore index 6af6a82..227b7df 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ Makefile.in m4/*.m4 /coverage /coverage.info +/fuzzers # src/ /src/CaseFolding.txt @@ -62,3 +63,5 @@ m4/*.m4 /sample/count /sample/bug_fix /sample/log* + +/harnesses/utf16*.dict diff --git a/CMakeLists.txt b/CMakeLists.txt index f3eca6b..c59bfe3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,28 +1,19 @@ cmake_minimum_required(VERSION 3.1) -project(oniguruma VERSION 6.9.2) +project(oniguruma + VERSION 6.9.3 + LANGUAGES C) set(PACKAGE onig) set(PACKAGE_VERSION ${PROJECT_VERSION}) option(BUILD_SHARED_LIBS "Build shared libraries" ON) option(ENABLE_POSIX_API "Include POSIX API" ON) - -set(USE_CRNL_AS_LINE_TERMINATOR 0) -set(VERSION ${PACKAGE_VERSION}) - if(MSVC) - # Force to always compile with W4 - if(CMAKE_CXX_FLAGS MATCHES "/W[0-4]") - string(REGEX REPLACE "/W[0-4]" "/W4" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4") - endif() -elseif(CMAKE_COMPILER_IS_GNUCXX) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") -elseif(CMAKE_COMPILER_IS_GNUCC) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall") + option(MSVC_STATIC_RUNTIME "Build with static runtime" OFF) endif() +set(USE_CRNL_AS_LINE_TERMINATOR 0) +set(VERSION ${PACKAGE_VERSION}) include(CheckCSourceCompiles) include(CheckIncludeFiles) @@ -73,6 +64,26 @@ target_include_directories(onig PUBLIC $ $) +if(MSVC) + target_compile_options(onig PRIVATE + #/W4 + ) + if(MSVC_STATIC_RUNTIME) + target_compile_options(onig PRIVATE + $<$:/MT> + $<$:/MTd> + $<$:/MT> + $<$:/MTd> + ) + target_compile_definitions(onig PUBLIC -DONIG_STATIC) + endif() +elseif(CMAKE_COMPILER_IS_GNUCC) + target_compile_options(onig PRIVATE + -Wall + ) +endif() + + # Installation (https://github.com/forexample/package-example) # Introduce variables: diff --git a/HISTORY b/HISTORY index 3649e4e..0380cb4 100644 --- a/HISTORY +++ b/HISTORY @@ -1,5 +1,19 @@ History +2019/08/06: Version 6.9.3 (secirity fix release) + +2019/07/30: add ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE +2019/07/29: add STK_PREC_READ_START/END stack type +2019/07/29: Fix #147: Stack Exhaustion Problem caused by some parsing functions +2019/07/11: add a dictionary file for libfuzzer +2019/07/07: add harnesses directory +2019/07/05-2019/07/29: fix many problems found by libfuzzer programs +2019/06/27: deprecate onig_new_deluxe() +2019/06/27: Fix CVE-2019-13224: don't allow different encodings for onig_new_deluxe() +2019/06/27: Fix CVE-2019-13225: problem in converting if-then-else pattern + +2019/05/07: Version 6.9.2 (same as Release Candidate 3) + 2019/04/23: Release Candidate 3 for 6.9.2 2019/04/23: add doc/SYNTAX.md into distribution file 2019/04/09: Release Candidate 2 for 6.9.2 diff --git a/Makefile.am b/Makefile.am index 6045eae..a0bbc7b 100644 --- a/Makefile.am +++ b/Makefile.am @@ -39,6 +39,12 @@ pkgconfig_DATA = oniguruma.pc all-test: cd test; make test +sanitize: + make clean + ./configure CC=clang CFLAGS="-O -g -fsanitize=address" + make + make all-test + cov: make lcov-clear cd test; make CFLAGS="--coverage" test diff --git a/README.md b/README.md index 873f86d..6a4783b 100644 --- a/README.md +++ b/README.md @@ -27,46 +27,55 @@ Supported character encodings: * doc/SYNTAX.md: contributed by seanofw -New feature of version 6.9.2 ------------------------------------ +Version 6.9.3 (security fix release) +------------------------------------ +* Fixed CVE-2019-13224 +* Fixed CVE-2019-13225 +* Fixed many problems (found by libfuzzer programs) + + +Version 6.9.2 (Reiwa) +--------------------- + +* add doc/SYNTAX.md * Update Unicode version 12.1.0 -* NEW: Unicode Text Segment mode option (?y{g}) (?y{w}) +* NEW: Unicode Text Segment mode option (?y{g}) (?y{w}) (*original) g: Extended Grapheme Cluster mode / w: Word mode (Unicode Standard Annex #29 [http://unicode.org/reports/tr29/]) -New feature of version 6.9.1 --------------------------- +Version 6.9.1 +------------- * Speed improvement (* especially UTF-8) -New feature of version 6.9.0 --------------------------- +Version 6.9.0 +------------- * Update Unicode version 11.0.0 * NEW: add Emoji properties -New feature of version 6.8.2 --------------------------- +Version 6.8.2 +------------- * Fix: #80 UChar in header causes issue * NEW API: onig_set_callout_user_data_of_match_param() (* omission in 6.8.0) * add doc/CALLOUTS.API and doc/CALLOUTS.API.ja -New feature of version 6.8.1 --------------------------- +Version 6.8.1 +------------- * Update shared library version to 5.0.0 for API incompatible changes from 6.7.1 -New feature of version 6.8.0 --------------------------- +Version 6.8.0 +------------- * Retry-limit-in-match function enabled by default * NEW: configure option --enable-posix-api=no (* enabled by default) @@ -77,14 +86,14 @@ New feature of version 6.8.0 * Examples of Callouts program: [callout.c](sample/callout.c), [count.c](sample/count.c), [echo.c](sample/echo.c) -New feature of version 6.7.1 --------------------------- +Version 6.7.1 +------------- * NEW: Mechanism of retry-limit-in-match (* disabled by default) -New feature of version 6.7.0 --------------------------- +Version 6.7.0 +------------- * NEW: hexadecimal codepoint \uHHHH * NEW: add ONIG_SYNTAX_ONIGURUMA (== ONIG_SYNTAX_DEFAULT) @@ -92,8 +101,8 @@ New feature of version 6.7.0 * Reduced size of object file -New feature of version 6.6.0 --------------------------- +Version 6.6.0 +------------- * NEW: ASCII only mode options for character type/property (?WDSP) * NEW: Extended Grapheme Cluster boundary \y, \Y @@ -101,8 +110,8 @@ New feature of version 6.6.0 * Range-clear (Absent-clear) operator restores previous range in retractions. -New feature of version 6.5.0 --------------------------- +Version 6.5.0 +------------- * NEW: \K (keep) * NEW: \R (general newline) \N (no newline) @@ -114,16 +123,16 @@ New feature of version 6.5.0 * NEW: Absent stopper (?~|absent) (*original) -New feature of version 6.4.0 --------------------------- +Version 6.4.0 +------------- * Fix fatal problem of endless repeat on Windows * NEW: call zero (call the total regexp) \g<0> * NEW: relative backref/call by positive number \k<+n>, \g<+n> -New feature of version 6.3.0 --------------------------- +Version 6.3.0 +------------- * NEW: octal codepoint \o{.....} * Fixed CVE-2017-9224 @@ -134,20 +143,20 @@ New feature of version 6.3.0 * Fixed CVE-2017-9229 -New feature of version 6.1.2 --------------------------- +Version 6.1.2 +------------- * allow word bound, word begin and word end in look-behind. * NEW option: ONIG_OPTION_CHECK_VALIDITY_OF_STRING -New feature of version 6.1 --------------------------- +Version 6.1 +----------- * improved doc/RE * NEW API: onig_scan() -New feature of version 6.0 --------------------------- +Version 6.0 +----------- * Update Unicode 8.0 Property/Case-folding * NEW API: onig_unicode_define_user_property() diff --git a/build_harnesses.sh b/build_harnesses.sh new file mode 100755 index 0000000..54dc9ff --- /dev/null +++ b/build_harnesses.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +make clean +autoreconf -vfi + +# build the library with ASAN +#NO_LINK="-fsanitize=fuzzer-no-link" +NO_LINK="" +./configure CC=clang LD=clang CFLAGS="-g -fsanitize=address -fno-omit-frame-pointer $NO_LINK" LDFLAGS="-g -fsanitize=address -fno-omit-frame-pointer $NO_LINK" +make -j4 + +OUT=`pwd`/fuzzers +mkdir -p $OUT +LIBFUZZER_FLAGS="-fsanitize=fuzzer,address -fno-omit-frame-pointer" +#LIBS="src/.libs/libonig.a" +LIBS="src/.libs/libonig.a /usr/local/lib/libLLVMFuzzerMain.a" + +CFLAGS="-Isrc -g $LIBFUZZER_FLAGS" + +# Libfuzzer builds +clang++ contributed/libfuzzer-onig.cpp $LIBS $CFLAGS -o $OUT/libfuzzer-onig +clang harnesses/syntax-harness.c $LIBS $CFLAGS -o $OUT/syntax-libfuzzer +clang harnesses/encode-harness.c $LIBS $CFLAGS -o $OUT/encode-libfuzzer +clang harnesses/deluxe-encode-harness.c $LIBS $CFLAGS -o $OUT/deluxe-encode-libfuzzer + +clang -DUTF16_BE harnesses/encode-harness.c $LIBS $CFLAGS -o $OUT/utf16-be-libfuzzer +clang -DUTF16_LE harnesses/encode-harness.c $LIBS $CFLAGS -o $OUT/utf16-le-libfuzzer +clang -DWITH_READ_MAIN harnesses/encode-harness.c src/.libs/libonig.a $CFLAGS -o $OUT/main-encode +clang -DWITH_READ_MAIN -DUTF16_LE harnesses/encode-harness.c src/.libs/libonig.a $CFLAGS -o $OUT/main-utf16-le +clang -DWITH_READ_MAIN -DUTF16_BE harnesses/encode-harness.c src/.libs/libonig.a $CFLAGS -o $OUT/main-utf16-be +clang -DWITH_READ_MAIN harnesses/deluxe-encode-harness.c $LIBS $CFLAGS -o $OUT/main-deluxe-encode diff --git a/configure.ac b/configure.ac index 010a0d8..62c9fa5 100644 --- a/configure.ac +++ b/configure.ac @@ -1,5 +1,5 @@ dnl Process this file with autoconf to produce a configure script. -AC_INIT(onig, 6.9.2) +AC_INIT(onig, 6.9.3) AC_CONFIG_MACRO_DIR([m4]) diff --git a/contributed/libfuzzer-onig.cpp b/contributed/libfuzzer-onig.cpp index e137b73..526c826 100644 --- a/contributed/libfuzzer-onig.cpp +++ b/contributed/libfuzzer-onig.cpp @@ -29,6 +29,8 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) #ifdef FULL_TEST onig_initialize(&enc, 1); + onig_set_retry_limit_in_match(120); + onig_set_parse_depth_limit(120); #endif if (onig_new(®, Data, Data + Size, ONIG_OPTION_DEFAULT, enc, diff --git a/doc/API b/doc/API index 2309e5e..049db02 100644 --- a/doc/API +++ b/doc/API @@ -1,4 +1,4 @@ -Oniguruma API Version 6.9.2 2019/03/25 +Oniguruma API Version 6.9.3 2019/07/06 #include @@ -168,6 +168,9 @@ Oniguruma API Version 6.9.2 2019/03/25 # int onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo) + This function is deprecate, and it does not allow the case where + the encoding of pattern and target is different. + Create a regex object. This function is deluxe version of onig_new(). @@ -299,6 +302,7 @@ Oniguruma API Version 6.9.2 2019/03/25 const UChar* range, OnigRegion* region, OnigOptionType option) Search string and return search result and matching region. + Do not pass invalid byte string in the regex character encoding. normal return: match position offset (i.e. p - str >= 0) not found: ONIG_MISMATCH (< 0) @@ -323,15 +327,19 @@ Oniguruma API Version 6.9.2 2019/03/25 const UChar* start, const UChar* range, OnigRegion* region, OnigOptionType option, OnigMatchParam* mp) - arguments - 1-7: same as onig_search() - 8 mp: match parameter values (match_stack_limit, retry_limit_in_match) + Search string and return search result and matching region. + Do not pass invalid byte string in the regex character encoding. + + arguments + 1-7: same as onig_search() + 8 mp: match parameter values (match_stack_limit, retry_limit_in_match) # int onig_match(regex_t* reg, const UChar* str, const UChar* end, const UChar* at, OnigRegion* region, OnigOptionType option) Match string and return result and matching region. + Do not pass invalid byte string in the regex character encoding. normal return: match length (>= 0) not match: ONIG_MISMATCH ( < 0) @@ -353,6 +361,9 @@ Oniguruma API Version 6.9.2 2019/03/25 const UChar* at, OnigRegion* region, OnigOptionType option, OnigMatchParam* mp) + Match string and return result and matching region. + Do not pass invalid byte string in the regex character encoding. + arguments 1-6: same as onig_match() 7 mp: match parameter values (match_stack_limit, retry_limit_in_match) @@ -364,6 +375,7 @@ Oniguruma API Version 6.9.2 2019/03/25 void* callback_arg) Scan string and callback with matching region. + Do not pass invalid byte string in the regex character encoding. normal return: number of matching times error: error code @@ -611,14 +623,20 @@ Oniguruma API Version 6.9.2 2019/03/25 # int onigenc_strlen(OnigEncoding enc, const UChar* s, const UChar* end) + + Return number of characters in the string. + + # int onigenc_strlen_null(OnigEncoding enc, const UChar* s) Return number of characters in the string. + Do not pass invalid byte string in the character encoding. # int onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s) Return number of bytes in the string. + Do not pass invalid byte string in the character encoding. # int onig_set_default_syntax(OnigSyntaxType* syntax) diff --git a/doc/API.ja b/doc/API.ja index 164d0b8..5871558 100644 --- a/doc/API.ja +++ b/doc/API.ja @@ -1,4 +1,4 @@ -鬼車インターフェース Version 6.9.2 2019/03/29 +鬼車インターフェース Version 6.9.3 2019/07/06 #include @@ -167,6 +167,9 @@ # int onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo) + この関数は廃止予定。 + パターンと対象文字列の文字エンコーディングが異なる場合を許さなくなった。 + 正規表現オブジェクト(regex)を作成する。 この関数は、onig_new()のデラックス版。 @@ -298,6 +301,7 @@ const UChar* range, OnigRegion* region, OnigOptionType option) 正規表現で文字列を検索し、検索結果とマッチ領域を返す。 + 正規表現オブジェクトの文字エンコーディングで、検索文字列として不正な文字列を渡してはいけない。 正常終了戻り値: マッチ位置 (p - str >= 0) 検索失敗: ONIG_MISMATCH (< 0) @@ -322,6 +326,9 @@ const UChar* start, const UChar* range, OnigRegion* region, OnigOptionType option, OnigMatchParam* mp) + 正規表現で文字列を検索し、検索結果とマッチ領域を返す。 + 正規表現オブジェクトの文字エンコーディングで、検索文字列として不正な文字列を渡してはいけない。 + 引数 1-7: onig_search()と同じ 8 mp: マッチパラメタ値 (match_stack_limit, retry_limit_in_match) @@ -331,6 +338,7 @@ const UChar* at, OnigRegion* region, OnigOptionType option) 文字列の指定位置でマッチングを行い、結果とマッチ領域を返す。 + 正規表現オブジェクトの文字エンコーディングで、検索文字列として不正な文字列を渡してはいけない。 正常終了戻り値: マッチしたバイト長 (>= 0) not match: ONIG_MISMATCH ( < 0) @@ -352,6 +360,9 @@ const UChar* at, OnigRegion* region, OnigOptionType option, OnigMatchParam* mp) + 文字列の指定位置でマッチングを行い、結果とマッチ領域を返す。 + 正規表現オブジェクトの文字エンコーディングで、検索文字列として不正な文字列を渡してはいけない。 + 引数 1-6: onig_match()と同じ 7 mp: マッチパラメタ値 (match_stack_limit, retry_limit_in_match) @@ -363,6 +374,7 @@ void* callback_arg) 正規表現で文字列をスキャンして、マッチングする毎にコールバック関数を呼び出す。 + 正規表現オブジェクトの文字エンコーディングで、検索文字列として不正な文字列を渡してはいけない。 正常終了: マッチ回数 (0回も含める) エラー: エラーコード (< 0) @@ -616,14 +628,20 @@ # int onigenc_strlen(OnigEncoding enc, const UChar* s, const UChar* end) + + 文字列の文字数を返す。 + + # int onigenc_strlen_null(OnigEncoding enc, const UChar* s) 文字列の文字数を返す。 + 文字エンコーディングに対して、不正な文字列を渡してはいけない。 # int onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s) 文字列のバイト数を返す。 + 文字エンコーディングに対して、不正な文字列を渡してはいけない。 # int onig_set_default_syntax(OnigSyntaxType* syntax) diff --git a/doc/UNICODE_PROPERTIES b/doc/UNICODE_PROPERTIES index 1148b4d..ff2a6ce 100644 --- a/doc/UNICODE_PROPERTIES +++ b/doc/UNICODE_PROPERTIES @@ -1,4 +1,4 @@ -Unicode Properties (from Unicode Version: 12.0.0) +Unicode Properties (from Unicode Version: 12.1.0) 15: ASCII_Hex_Digit 16: Adlam diff --git a/harnesses/ascii_compatible.dict b/harnesses/ascii_compatible.dict new file mode 100644 index 0000000..820bf47 --- /dev/null +++ b/harnesses/ascii_compatible.dict @@ -0,0 +1,111 @@ +# First-pass fuzzing dictionary for Oniguruma by Mark Griffin +"\\o{17777777777}" +"\\777" +"\\u" +"\\uFFFF" +"\\xFF" +"\\x{70000000}" +"\\C-" +"\\M-\\C-" +"\\X" +"\\p{" +"\\p{^" +"}" +"]" +")" +"\\n" +"\\r" +"\\R" +"\\W" +"\\w" +"\\s" +"\\S" +"\\d" +"\\O" +"\\X" +"\\b" +"\\y" +"\\Y" +"\\A" +"\\z" +"\\K" +"\\G" +"\\p{Print}" +"\\p{ASCII}" +"\\p{Alnum}" +"{0,2}" +"{3,}" +"{,3}" +"{5}" +"{4,2}" +"??" +"*?" +"+?" +"*+" +"{1,3}+" +"(?>" +"\\B" +"(?y{" +"[abcd1-9]" +"[\\w\\d" +"[\\p{Alphabetic}" +"[\\P{Arabic}" +"[\\x{ffff}" +"[a-w&&" +"[^" +"[:graph:]" +"[^:cntrl:]" +"(?i:" +"(?i)" +"(?m:" +"(?x:" +"(?W:" +"(?y-:" +"(?y{w}:" +"(?P:" +"(?#" +"(?:" +"(?=" +"(?!" +"(?<=" +"(?" +"(?" +"(?{" +"(?{....}[x])" +"(?{.}[x]>)" +"(?{{{.}}})" +"(?~" +"(?~a)" +"(?~|a|.*)" +"(?~|(?:a|b))" +"(?~|)" +"(?(.) |.)" +"(?('-n'))" +"(?(n+0))" +"(?(n+1))" +"(?(n-1))" +"(?())" +"(?())" +"(?())" +"(*ERROR{-2000})" +"(*COUNT[tag]{X})" +"\\1" +"\\2" +"\\k" +"\\k<1>" +"\\k<2>" +"\\k<-1>" +"\\k<-2>" +"\\k" +"\\k" +"\\k" +"\\g<-1>" +"\\g" +"name" +"(?a|b\\gc)" +"(?-i:\\g)" +"\\N{name}" +"\\p{Hiragana}" +"\\p{Katakana}" +"\\p{Emoji}" diff --git a/harnesses/deluxe-encode-harness.c b/harnesses/deluxe-encode-harness.c new file mode 100644 index 0000000..e1f84a5 --- /dev/null +++ b/harnesses/deluxe-encode-harness.c @@ -0,0 +1,239 @@ +/* + * deluxe-encode-harness.c + * contributed by Mark Griffin + */ +#include +#include "oniguruma.h" + +#include +#include + +#define DEFAULT_LIMIT 120 +typedef unsigned char uint8_t; + +static int +search(regex_t* reg, unsigned char* str, unsigned char* end) +{ + int r; + unsigned char *start, *range; + OnigRegion *region; + + region = onig_region_new(); + + start = str; + range = end; + r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE); + if (r >= 0) { + int i; + + fprintf(stdout, "match at %d (%s)\n", r, + ONIGENC_NAME(onig_get_encoding(reg))); + for (i = 0; i < region->num_regs; i++) { + fprintf(stdout, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]); + } + } + else if (r == ONIG_MISMATCH) { + fprintf(stdout, "search fail (%s)\n", + ONIGENC_NAME(onig_get_encoding(reg))); + } + else { /* error */ + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((UChar* )s, r); + fprintf(stdout, "ERROR: %s\n", s); + fprintf(stdout, " (%s)\n", ONIGENC_NAME(onig_get_encoding(reg))); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + return -1; + } + + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + return 0; +} + +static int +exec(OnigEncoding enc, OnigOptionType options, + char* apattern, char* apattern_end, char* astr, char* astr_end) +{ + int r; + regex_t* reg; + OnigErrorInfo einfo; + UChar* pattern = (UChar* )apattern; + UChar* str = (UChar* )astr; + UChar* pattern_end = (UChar* )apattern_end; + unsigned char *end = (unsigned char* )astr_end; + + onig_initialize(&enc, 1); + onig_set_retry_limit_in_match(DEFAULT_LIMIT); + onig_set_parse_depth_limit(DEFAULT_LIMIT); + + r = onig_new(®, pattern, pattern_end, + options, enc, ONIG_SYNTAX_DEFAULT, &einfo); + if (r != ONIG_NORMAL) { + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((UChar* )s, r, &einfo); + fprintf(stdout, "ERROR: %s\n", s); + onig_end(); + return -1; + } + + r = search(reg, str, end); + + onig_free(reg); + onig_end(); + return 0; +} + +static OnigCaseFoldType CF = ONIGENC_CASE_FOLD_MIN; + +static int +exec_deluxe(OnigEncoding pattern_enc, OnigEncoding str_enc, + OnigOptionType options, char* apattern, char* apattern_end, + char* astr, char* astr_end) +{ + int r; + regex_t* reg; + OnigCompileInfo ci; + OnigErrorInfo einfo; + UChar* pattern = (UChar* )apattern; + UChar* str = (UChar* )astr; + UChar* pattern_end = (UChar* )apattern_end; + unsigned char* end = (unsigned char* )astr_end; + + onig_initialize(&str_enc, 1); + onig_set_retry_limit_in_match(DEFAULT_LIMIT); + onig_set_parse_depth_limit(DEFAULT_LIMIT); + + ci.num_of_elements = 5; + ci.pattern_enc = pattern_enc; + ci.target_enc = str_enc; + ci.syntax = ONIG_SYNTAX_DEFAULT; + ci.option = options; + ci.case_fold_flag = CF; + + r = onig_new_deluxe(®, pattern, pattern_end, &ci, &einfo); + if (r != ONIG_NORMAL) { + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((UChar* )s, r, &einfo); + fprintf(stdout, "ERROR: %s\n", s); + onig_end(); + return -1; + } + + if (onigenc_is_valid_mbc_string(str_enc, str, end) != 0) { + r = search(reg, str, end); + } + + onig_free(reg); + onig_end(); + return 0; +} + +#define PATTERN_SIZE 48 +#define NUM_CONTROL_BYTES 1 +#define MIN_STR_SIZE 2 +int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) +{ + int r; + size_t remaining_size; + unsigned char *data; + unsigned char pat_encoding_choice; + unsigned char str_encoding_choice; + unsigned char *pattern; + unsigned char *str; + unsigned char *pattern_end; + unsigned char *str_end; + unsigned int num_encodings; + OnigEncodingType *pattern_enc; + OnigEncodingType *str_enc; + + OnigEncodingType *encodings[] = { + ONIG_ENCODING_ASCII, + ONIG_ENCODING_ISO_8859_1, + ONIG_ENCODING_ISO_8859_2, + ONIG_ENCODING_ISO_8859_3, + ONIG_ENCODING_ISO_8859_4, + ONIG_ENCODING_ISO_8859_5, + ONIG_ENCODING_ISO_8859_6, + ONIG_ENCODING_ISO_8859_7, + ONIG_ENCODING_ISO_8859_8, + ONIG_ENCODING_ISO_8859_9, + ONIG_ENCODING_ISO_8859_10, + ONIG_ENCODING_ISO_8859_11, + ONIG_ENCODING_ISO_8859_13, + ONIG_ENCODING_ISO_8859_14, + ONIG_ENCODING_ISO_8859_15, + ONIG_ENCODING_ISO_8859_16, + ONIG_ENCODING_UTF8, + ONIG_ENCODING_UTF16_BE, + ONIG_ENCODING_UTF16_LE, + ONIG_ENCODING_UTF32_BE, + ONIG_ENCODING_UTF32_LE, + ONIG_ENCODING_EUC_JP, + ONIG_ENCODING_EUC_TW, + ONIG_ENCODING_EUC_KR, + ONIG_ENCODING_EUC_CN, + ONIG_ENCODING_SJIS, + //ONIG_ENCODING_KOI8, + ONIG_ENCODING_KOI8_R, + ONIG_ENCODING_CP1251, + ONIG_ENCODING_BIG5, + ONIG_ENCODING_GB18030, + }; + + if (Size <= (NUM_CONTROL_BYTES + PATTERN_SIZE + MIN_STR_SIZE)) + return 0; + if (Size > 0x1000) + return 0; + + remaining_size = Size; + data = (unsigned char *)(Data); + + // pull off bytes to switch off + pat_encoding_choice = data[0]; + data++; + remaining_size--; + str_encoding_choice = data[0]; + data++; + remaining_size--; + + // copy first PATTERN_SIZE bytes off to be the pattern + pattern = (unsigned char *)malloc(PATTERN_SIZE+4); + memset(pattern, 0, PATTERN_SIZE+4); + memcpy(pattern, data, PATTERN_SIZE); + pattern_end = pattern + PATTERN_SIZE; + data += PATTERN_SIZE; + remaining_size -= PATTERN_SIZE; + + str = (unsigned char*)malloc(remaining_size+4); + memset(str, 0, remaining_size+4); + memcpy(str, data, remaining_size); + str_end = str + remaining_size; + + num_encodings = sizeof(encodings) / sizeof(encodings[0]); + pattern_enc = encodings[pat_encoding_choice % num_encodings]; + str_enc = encodings[str_encoding_choice % num_encodings]; + + r = exec_deluxe(pattern_enc, str_enc, ONIG_OPTION_NONE, (char *)pattern, (char *)pattern_end, (char *)str, (char *)str_end); + + free(pattern); + free(str); + + return r; +} + + +#ifdef WITH_READ_MAIN + +#include + +extern int main(int argc, char* argv[]) +{ + size_t n; + uint8_t Data[10000]; + + n = read(0, Data, sizeof(Data)); + fprintf(stdout, "n: %ld\n", n); + LLVMFuzzerTestOneInput(Data, n); + + return 0; +} +#endif /* WITH_READ_MAIN */ diff --git a/harnesses/dict_conv.py b/harnesses/dict_conv.py new file mode 100644 index 0000000..f721293 --- /dev/null +++ b/harnesses/dict_conv.py @@ -0,0 +1,72 @@ +# -*- coding: utf-8 -*- +# dict_conv.py (Python3 script) + +import sys + +ENC_UTF16_BE = 1 +ENC_UTF16_LE = 2 + +def add_char(enc, s, c): + if enc == ENC_UTF16_BE: + s += "\\x00" + + s += c + if enc == ENC_UTF16_LE: + s += "\\x00" + + return s + +def conv(enc, s): + n = len(s) + r = "" + i = 0 + while i < n: + c = s[i] + if c == '\\': + c = s[i+1] + if c == '\\' or c == '"': + r = add_char(enc, r, "\\" + c) + i += 2 + continue + else: + raise("Unknown escape {0}".format(s)) + + r = add_char(enc, r, c) + i += 1 + + return r + +def main(enc): + print("# This file was generated by dict_conv.py.") + for line in sys.stdin: + s = line.strip() + if s[0] == '#': + print(s) + continue + + if s[0] == '"' and s[-1] == '"': + s = conv(enc, s[1:-1]) + print("\"{0}\"".format(s)) + else: + raise("Invalid format {0}".format(s)) + +def usage(argv): + raise RuntimeError("Usage: python {0} utf16_be/utf16_le".format(argv[0])) + + +if __name__ == "__main__": + argv = sys.argv + argc = len(argv) + + if argc >= 2: + s = argv[1] + if s == 'utf16_be': + enc = ENC_UTF16_BE + elif s == 'utf16_le': + enc = ENC_UTF16_LE + else: + usage(argv) + else: + usage(argv) + + main(enc) diff --git a/harnesses/encode-harness.c b/harnesses/encode-harness.c new file mode 100644 index 0000000..e57fd4f --- /dev/null +++ b/harnesses/encode-harness.c @@ -0,0 +1,170 @@ +/* + * encode-harness.c + * contributed by Mark Griffin + */ +#include +#include "oniguruma.h" + +#include +#include + +#define PARSE_DEPTH_LIMIT 120 +#define RETRY_LIMIT 4000 + +typedef unsigned char uint8_t; + +static int +search(regex_t* reg, unsigned char* str, unsigned char* end) +{ + int r; + unsigned char *start, *range; + OnigRegion *region; + + region = onig_region_new(); + + start = str; + range = end; + r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE); + if (r >= 0) { + int i; + + fprintf(stdout, "match at %d (%s)\n", r, + ONIGENC_NAME(onig_get_encoding(reg))); + for (i = 0; i < region->num_regs; i++) { + fprintf(stdout, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]); + } + } + else if (r == ONIG_MISMATCH) { + fprintf(stdout, "search fail (%s)\n", + ONIGENC_NAME(onig_get_encoding(reg))); + } + else { /* error */ + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((UChar* )s, r); + fprintf(stdout, "ERROR: %s\n", s); + fprintf(stdout, " (%s)\n", ONIGENC_NAME(onig_get_encoding(reg))); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + return -1; + } + + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + return 0; +} + +static int +exec(OnigEncoding enc, OnigOptionType options, + char* apattern, char* apattern_end, char* astr, UChar* end) +{ + int r; + regex_t* reg; + OnigErrorInfo einfo; + UChar* pattern = (UChar* )apattern; + UChar* str = (UChar* )astr; + UChar* pattern_end = (UChar* )apattern_end; + + onig_initialize(&enc, 1); + onig_set_retry_limit_in_match(RETRY_LIMIT); + onig_set_parse_depth_limit(PARSE_DEPTH_LIMIT); + + r = onig_new(®, pattern, pattern_end, + options, enc, ONIG_SYNTAX_DEFAULT, &einfo); + if (r != ONIG_NORMAL) { + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((UChar* )s, r, &einfo); + fprintf(stdout, "ERROR: %s\n", s); + onig_end(); + return -1; + } + + if (onigenc_is_valid_mbc_string(enc, str, end) != 0) { + r = search(reg, str, end); + } + + onig_free(reg); + onig_end(); + return 0; +} + +#define PATTERN_SIZE 32 +#define NUM_CONTROL_BYTES 1 +#define MIN_STR_SIZE 1 +int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) +{ + if (Size <= (NUM_CONTROL_BYTES + PATTERN_SIZE + MIN_STR_SIZE)) + return 0; + if (Size > 0x1000) + return 0; + + unsigned char *pattern_end; + unsigned char *str_null_end; + + size_t remaining_size = Size; + unsigned char *data = (unsigned char *)(Data); + + // pull off one byte to switch off + unsigned char encoding_choice = data[0]; + data++; + remaining_size--; + + // copy first PATTERN_SIZE bytes off to be the pattern + unsigned char *pattern = (unsigned char *)malloc(PATTERN_SIZE+4); + memset(pattern, 0, PATTERN_SIZE+4); + memcpy(pattern, data, PATTERN_SIZE); + pattern_end = pattern + PATTERN_SIZE; + data += PATTERN_SIZE; + remaining_size -= PATTERN_SIZE; + + unsigned char *str = (unsigned char*)malloc(remaining_size+4); + memset(str, 0, remaining_size+4); + memcpy(str, data, remaining_size); + str_null_end = str + remaining_size; + + int r; + OnigEncodingType *encodings[] = { + ONIG_ENCODING_SJIS, + ONIG_ENCODING_EUC_JP, + ONIG_ENCODING_CP1251, + ONIG_ENCODING_ISO_8859_1, + ONIG_ENCODING_UTF8, + ONIG_ENCODING_KOI8_R, + ONIG_ENCODING_BIG5 + }; + + OnigEncodingType *enc; + +#ifdef UTF16_BE + enc = ONIG_ENCODING_UTF16_BE; +#else +#ifdef UTF16_LE + enc = ONIG_ENCODING_UTF16_LE; +#else + int num_encodings = sizeof(encodings)/sizeof(encodings[0]); + enc = encodings[encoding_choice % num_encodings]; +#endif +#endif + + r = exec(enc, ONIG_OPTION_NONE, (char *)pattern, (char *)pattern_end, + (char *)str, str_null_end); + + free(pattern); + free(str); + + return r; +} + +#ifdef WITH_READ_MAIN + +#include + +extern int main(int argc, char* argv[]) +{ + size_t n; + uint8_t Data[10000]; + + n = read(0, Data, sizeof(Data)); + fprintf(stdout, "n: %ld\n", n); + LLVMFuzzerTestOneInput(Data, n); + + return 0; +} +#endif /* WITH_READ_MAIN */ diff --git a/harnesses/syntax-harness.c b/harnesses/syntax-harness.c new file mode 100644 index 0000000..0fb3587 --- /dev/null +++ b/harnesses/syntax-harness.c @@ -0,0 +1,120 @@ +/* + * syntax-harness.c + * contributed by Mark Griffin + */ +#include +#include +#include "oniguruma.h" + +#include + +#define DEFAULT_LIMIT 120 +typedef unsigned char uint8_t; + +extern int exec(OnigSyntaxType* syntax, char* apattern, char* astr) +{ + int r; + unsigned char *start, *range, *end; + regex_t* reg; + OnigErrorInfo einfo; + OnigRegion *region; + UChar* pattern = (UChar* )apattern; + UChar* str = (UChar* )astr; + + r = onig_new(®, pattern, pattern + strlen((char* )pattern), + ONIG_OPTION_DEFAULT, ONIG_ENCODING_ASCII, syntax, &einfo); + if (r != ONIG_NORMAL) { + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((UChar* )s, r, &einfo); + fprintf(stdout, "ERROR: %s\n", s); + return -1; + } + + region = onig_region_new(); + + end = str + strlen((char* )str); + start = str; + range = end; + r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE); + if (r >= 0) { + int i; + + fprintf(stdout, "match at %d\n", r); + for (i = 0; i < region->num_regs; i++) { + fprintf(stdout, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]); + } + } + else if (r == ONIG_MISMATCH) { + fprintf(stdout, "search fail\n"); + } + else { /* error */ + char s[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str((UChar* )s, r); + fprintf(stdout, "ERROR: %s\n", s); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + onig_free(reg); + return -1; + } + + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + onig_free(reg); + return 0; +} + +#define PATTERN_SIZE 64 +#define NUM_CONTROL_BYTES 1 +#define MIN_STR_SIZE 1 +int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) +{ + if (Size <= (NUM_CONTROL_BYTES + PATTERN_SIZE + MIN_STR_SIZE)) + return 0; + if (Size > 0x1000) + return 0; + size_t remaining_size = Size; + unsigned char *data = (unsigned char *)(Data); + + // pull off one byte to switch syntax choice + unsigned char syntax_choice = data[0]; + data++; + remaining_size--; + + // copy first PATTERN_SIZE bytes off to be the pattern + unsigned char *pattern = (unsigned char *)malloc(PATTERN_SIZE+1); + memset(pattern, 0, PATTERN_SIZE+1); + memcpy(pattern, data, PATTERN_SIZE); + data += PATTERN_SIZE; + remaining_size -= PATTERN_SIZE; + + unsigned char *str = (unsigned char*)malloc(remaining_size+1); + memset(str, 0, remaining_size+1); + memcpy(str, data, remaining_size); + + OnigEncoding use_encs[] = { ONIG_ENCODING_ASCII }; + onig_initialize(use_encs, sizeof(use_encs)/sizeof(use_encs[0])); + + onig_set_retry_limit_in_match(DEFAULT_LIMIT); + onig_set_parse_depth_limit(DEFAULT_LIMIT); + + OnigSyntaxType *syntaxes[] = { + ONIG_SYNTAX_POSIX_EXTENDED, + ONIG_SYNTAX_EMACS, + ONIG_SYNTAX_GREP, + ONIG_SYNTAX_GNU_REGEX, + ONIG_SYNTAX_JAVA, + ONIG_SYNTAX_PERL_NG, + ONIG_SYNTAX_RUBY, + ONIG_SYNTAX_ONIGURUMA, + }; + OnigSyntaxType *syntax = syntaxes[syntax_choice % 8]; + + int r; + r = exec(syntax, (char *)pattern, (char *)str); + // r = exec(ONIG_SYNTAX_JAVA, "\\p{XDigit}\\P{XDigit}[a-c&&b-g]", "bgc"); + + onig_end(); + + free(pattern); + free(str); + + return 0; +} diff --git a/index.html b/index.html index 5ad8231..58ba66d 100644 --- a/index.html +++ b/index.html @@ -8,7 +8,7 @@

Oniguruma

(Japanese)

-(c) K.Kosako, updated at: 2018/12/06 +(c) K.Kosako, updated at: 2019/08/05

@@ -16,6 +16,8 @@
What's new
    +
  • 2019/08/06: Version 6.9.3 released.
  • +
  • 2019/05/07: Version 6.9.2 released.
  • 2018/12/11: Version 6.9.1 released.
  • 2018/09/03: Version 6.9.0 released.
  • 2018/04/17: Version 6.8.2 released.
  • diff --git a/index_ja.html b/index_ja.html index 0ada788..6b75c6c 100644 --- a/index_ja.html +++ b/index_ja.html @@ -8,7 +8,7 @@

    鬼車

    -(c) K.Kosako, 最終更新: 2018/12/06 +(c) K.Kosako, 最終更新: 2019/08/05

    @@ -16,6 +16,8 @@
    更新情報
      +
    • 2019/08/06: Version 6.9.3 リリース
    • +
    • 2019/05/07: Version 6.9.2 リリース
    • 2018/12/11: Version 6.9.1 リリース
    • 2018/09/03: Version 6.9.0 リリース
    • 2018/04/17: Version 6.8.2 リリース
    • diff --git a/sample/bug_fix.c b/sample/bug_fix.c index 81c2784..3f60c5b 100644 --- a/sample/bug_fix.c +++ b/sample/bug_fix.c @@ -4,8 +4,6 @@ #include #include "oniguruma.h" -static OnigCaseFoldType CF = ONIGENC_CASE_FOLD_MIN; - static int search(regex_t* reg, unsigned char* str, unsigned char* end) { @@ -36,6 +34,7 @@ search(regex_t* reg, unsigned char* str, unsigned char* end) onig_error_code_to_str((UChar* )s, r); fprintf(stderr, "ERROR: %s\n", s); fprintf(stderr, " (%s)\n", ONIGENC_NAME(onig_get_encoding(reg))); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); return -1; } @@ -43,45 +42,6 @@ search(regex_t* reg, unsigned char* str, unsigned char* end) return 0; } -static int -exec_deluxe(OnigEncoding pattern_enc, OnigEncoding str_enc, - OnigOptionType options, char* apattern, char* astr) -{ - int r; - unsigned char *end; - regex_t* reg; - OnigCompileInfo ci; - OnigErrorInfo einfo; - UChar* pattern = (UChar* )apattern; - UChar* str = (UChar* )astr; - - onig_initialize(&str_enc, 1); - - ci.num_of_elements = 5; - ci.pattern_enc = pattern_enc; - ci.target_enc = str_enc; - ci.syntax = ONIG_SYNTAX_DEFAULT; - ci.option = options; - ci.case_fold_flag = CF; - - r = onig_new_deluxe(®, pattern, - pattern + onigenc_str_bytelen_null(pattern_enc, pattern), - &ci, &einfo); - if (r != ONIG_NORMAL) { - char s[ONIG_MAX_ERROR_MESSAGE_LEN]; - onig_error_code_to_str((UChar* )s, r, &einfo); - fprintf(stderr, "ERROR: %s\n", s); - return -1; - } - - end = str + onigenc_str_bytelen_null(str_enc, str); - r = search(reg, str, end); - - onig_free(reg); - onig_end(); - return 0; -} - static int exec(OnigEncoding enc, OnigOptionType options, char* apattern, char* astr) { @@ -92,8 +52,6 @@ exec(OnigEncoding enc, OnigOptionType options, char* apattern, char* astr) UChar* pattern = (UChar* )apattern; UChar* str = (UChar* )astr; - onig_initialize(&enc, 1); - r = onig_new(®, pattern, pattern + onigenc_str_bytelen_null(enc, pattern), options, enc, ONIG_SYNTAX_DEFAULT, &einfo); @@ -108,7 +66,6 @@ exec(OnigEncoding enc, OnigOptionType options, char* apattern, char* astr) r = search(reg, str, end); onig_free(reg); - onig_end(); return 0; } @@ -116,16 +73,21 @@ exec(OnigEncoding enc, OnigOptionType options, char* apattern, char* astr) extern int main(int argc, char* argv[]) { + OnigEncoding use_encs[1]; + + use_encs[0] = ONIG_ENCODING_UTF8; + onig_initialize(use_encs, 1); + /* fix ignore case in look-behind commit: 3340ec2cc5627172665303fe248c9793354d2251 */ - exec_deluxe(ONIG_ENCODING_UTF8, ONIG_ENCODING_UTF8, - ONIG_OPTION_IGNORECASE, - "(?<=\305\211)a", "\312\274na"); /* \u{0149}a \u{02bc}na */ + exec(ONIG_ENCODING_UTF8, ONIG_OPTION_IGNORECASE, + "(?<=\305\211)a", "\312\274na"); /* \u{0149}a \u{02bc}na */ exec(ONIG_ENCODING_UTF8, ONIG_OPTION_NONE, "(\\2)(\\1)", "aa"); /* fail. */ exec(ONIG_ENCODING_UTF8, ONIG_OPTION_FIND_LONGEST, "a*", "aa aaa aaaa aaaaa "); /* match 12-17 */ + onig_end(); return 0; } diff --git a/sample/crnl.c b/sample/crnl.c index 3ad1210..bfa563e 100644 --- a/sample/crnl.c +++ b/sample/crnl.c @@ -65,6 +65,8 @@ x(int no, char* pattern_arg, char* str_arg, char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str(s, r); fprintf(stderr, "ERROR: %s\n", s); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + onig_free(reg); return -1; } diff --git a/sample/encode.c b/sample/encode.c index 8a03ab8..c5d4771 100644 --- a/sample/encode.c +++ b/sample/encode.c @@ -34,6 +34,7 @@ search(regex_t* reg, unsigned char* str, unsigned char* end) onig_error_code_to_str((UChar* )s, r); fprintf(stderr, "ERROR: %s\n", s); fprintf(stderr, " (%s)\n", ONIGENC_NAME(onig_get_encoding(reg))); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); return -1; } @@ -72,55 +73,6 @@ exec(OnigEncoding enc, OnigOptionType options, return 0; } -static OnigCaseFoldType CF = ONIGENC_CASE_FOLD_MIN; - -#if 0 -static void -set_case_fold(OnigCaseFoldType cf) -{ - CF = cf; -} -#endif - -static int -exec_deluxe(OnigEncoding pattern_enc, OnigEncoding str_enc, - OnigOptionType options, char* apattern, char* astr) -{ - int r; - unsigned char *end; - regex_t* reg; - OnigCompileInfo ci; - OnigErrorInfo einfo; - UChar* pattern = (UChar* )apattern; - UChar* str = (UChar* )astr; - - onig_initialize(&str_enc, 1); - - ci.num_of_elements = 5; - ci.pattern_enc = pattern_enc; - ci.target_enc = str_enc; - ci.syntax = ONIG_SYNTAX_DEFAULT; - ci.option = options; - ci.case_fold_flag = CF; - - r = onig_new_deluxe(®, pattern, - pattern + onigenc_str_bytelen_null(pattern_enc, pattern), - &ci, &einfo); - if (r != ONIG_NORMAL) { - char s[ONIG_MAX_ERROR_MESSAGE_LEN]; - onig_error_code_to_str((UChar* )s, r, &einfo); - fprintf(stderr, "ERROR: %s\n", s); - return -1; - } - - end = str + onigenc_str_bytelen_null(str_enc, str); - r = search(reg, str, end); - - onig_free(reg); - onig_end(); - return 0; -} - extern int main(int argc, char* argv[]) { int r; @@ -196,39 +148,6 @@ extern int main(int argc, char* argv[]) r = exec(ONIG_ENCODING_ISO_8859_1, ONIG_OPTION_IGNORECASE, "is", "iss"); - r = exec_deluxe(ONIG_ENCODING_ASCII, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_NONE, "a+", - "\000b\000a\000a\000a\000c\000c\000\000"); - - r = exec_deluxe(ONIG_ENCODING_ASCII, ONIG_ENCODING_UTF16_LE, - ONIG_OPTION_NONE, "a+", - "b\000a\000a\000a\000a\000c\000\000\000"); - - r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_LE, - ONIG_OPTION_NONE, - "\000b\000a\000a\000a\000c\000c\000\000", - "x\000b\000a\000a\000a\000c\000c\000\000\000"); - - r = exec_deluxe(ONIG_ENCODING_ISO_8859_1, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_IGNORECASE, - "\337", "\000S\000S\000\000"); - - r = exec_deluxe(ONIG_ENCODING_ISO_8859_1, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_IGNORECASE, - "SS", "\000\337\000\000"); - - r = exec_deluxe(ONIG_ENCODING_ISO_8859_1, ONIG_ENCODING_UTF16_LE, - ONIG_OPTION_IGNORECASE, - "\337", "S\000S\000\000\000"); - - r = exec_deluxe(ONIG_ENCODING_ISO_8859_1, ONIG_ENCODING_UTF32_BE, - ONIG_OPTION_IGNORECASE, - "SS", "\000\000\000\337\000\000\000\000"); - - r = exec_deluxe(ONIG_ENCODING_ISO_8859_1, ONIG_ENCODING_UTF32_LE, - ONIG_OPTION_IGNORECASE, - "\337", "S\000\000\000S\000\000\000\000\000\000\000"); - r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_NONE, "\000[\000[\000:\000a\000l\000n\000u\000m\000:\000]\000]\000+\000\000", "\000#\002\120\000a\000Z\012\077\012\076\012\075\000\000"); @@ -242,44 +161,34 @@ extern int main(int argc, char* argv[]) r = exec(ONIG_ENCODING_GB18030, ONIG_OPTION_IGNORECASE, "(Aa\\d)+", "BaA5Aa0234"); - r = exec_deluxe(ONIG_ENCODING_ISO_8859_1, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_NONE, - "^\\P{Hiragana}\\p{^Hiragana}(\\p{Hiragana}+)$", - "\060\100\060\240\060\101\060\102\060\226\060\237\000\000"); - - r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_IGNORECASE, - "\000[\000\337\000]\000\000", "\000S\000S\000\000"); + r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, + "\000[\000\337\000]\000\000", "\000S\000S\000\000"); - r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_IGNORECASE, - "\000[\000\337\000]\000\000", "\000s\000S\000\000"); + r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, + "\000[\000\337\000]\000\000", "\000s\000S\000\000"); - r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_IGNORECASE, - "\000^\000[\000\001\000-\377\375\000]\000$\000\000", - "\000s\000S\000\000"); + r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, + "\000^\000[\000\001\000-\377\375\000]\000$\000\000", + "\000s\000S\000\000"); - r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_IGNORECASE, - "\000S\000S\000\000", - "\000S\000T\000\337\000\000"); + r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, + "\000S\000S\000\000", + "\000S\000T\000\337\000\000"); - r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_IGNORECASE, - "\000S\000T\000S\000S\000\000", - "\000S\000t\000s\000S\000\000"); + r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, + "\000S\000T\000S\000S\000\000", + "\000S\000t\000s\000S\000\000"); { UChar pat[] = { 0x1f, 0xfc, 0x00, 0x00 }; UChar str1[] = { 0x21, 0x26, 0x1f, 0xbe, 0x00, 0x00 }; UChar str2[] = { 0x1f, 0xf3, 0x00, 0x00 }; - r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_IGNORECASE, (char* )pat, (char* )str1); + r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, + (char* )pat, (char* )str1); - r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_IGNORECASE, (char* )pat, (char* )str2); + r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, + (char* )pat, (char* )str2); } #if 0 @@ -287,17 +196,14 @@ extern int main(int argc, char* argv[]) set_case_fold(ONIGENC_CASE_FOLD_TURKISH_AZERI); - r = exec_deluxe(ONIG_ENCODING_UTF8, ONIG_ENCODING_UTF8, - ONIG_OPTION_IGNORECASE, - "Ii", "\304\261\304\260"); + r = exec(ONIG_ENCODING_UTF8, ONIG_ENCODING_UTF8, ONIG_OPTION_IGNORECASE, + "Ii", "\304\261\304\260"); - r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_IGNORECASE, - "\000I\000i\000\000", "\001\061\001\060\000\000"); + r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, + "\000I\000i\000\000", "\001\061\001\060\000\000"); - r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, - ONIG_OPTION_IGNORECASE, - "\001\061\001\060\000\000", "\000I\000i\000\000"); + r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, + "\001\061\001\060\000\000", "\000I\000i\000\000"); set_case_fold(ONIGENC_CASE_FOLD_MIN); #endif diff --git a/sample/listcap.c b/sample/listcap.c index e0fe23a..a73f7d4 100644 --- a/sample/listcap.c +++ b/sample/listcap.c @@ -69,6 +69,8 @@ extern int ex(unsigned char* str, unsigned char* pattern, else { /* error */ char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((UChar* )s, r); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + onig_free(reg); return -1; } diff --git a/sample/names.c b/sample/names.c index a838056..9b1eb24 100644 --- a/sample/names.c +++ b/sample/names.c @@ -65,6 +65,9 @@ extern int main(int argc, char* argv[]) else { /* error */ char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((UChar* )s, r); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + onig_free(reg); + onig_end(); return -1; } diff --git a/sample/posix.c b/sample/posix.c index 35ccb68..c555936 100644 --- a/sample/posix.c +++ b/sample/posix.c @@ -49,6 +49,7 @@ extern int main(int argc, char* argv[]) regerror(r, ®, buf, sizeof(buf)); fprintf(stderr, "ERROR: %s\n", buf); regfree(®); + onig_end(); return -1; } x(®, pattern, (UChar* )"aaabbbbd"); @@ -60,6 +61,7 @@ extern int main(int argc, char* argv[]) regerror(r, ®, buf, sizeof(buf)); fprintf(stderr, "ERROR: %s\n", buf); regfree(®); + onig_end(); return -1; } x(®, pattern, (UChar* )"a+b{2,7}d?|uuu"); @@ -71,6 +73,7 @@ extern int main(int argc, char* argv[]) regerror(r, ®, buf, sizeof(buf)); fprintf(stderr, "ERROR: %s\n", buf); regfree(®); + onig_end(); return -1; } x(®, pattern, (UChar* )"aaaabbbbbbd"); @@ -83,6 +86,7 @@ extern int main(int argc, char* argv[]) regerror(r, ®, buf, sizeof(buf)); fprintf(stderr, "ERROR: %s\n", buf); regfree(®); + onig_end(); return -1; } x(®, pattern, (UChar* )"aaabbbbd)"); @@ -93,6 +97,7 @@ extern int main(int argc, char* argv[]) regerror(r, ®, buf, sizeof(buf)); fprintf(stderr, "ERROR: %s\n", buf); regfree(®); + onig_end(); return -1; } x(®, pattern, (UChar* )"a\nb\n"); diff --git a/sample/scan.c b/sample/scan.c index ad5ae74..4039e46 100644 --- a/sample/scan.c +++ b/sample/scan.c @@ -36,6 +36,7 @@ scan(regex_t* reg, unsigned char* str, unsigned char* end) char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((OnigUChar* )s, r); fprintf(stderr, "ERROR: %s\n", s); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); return -1; } @@ -63,6 +64,7 @@ exec(OnigEncoding enc, OnigOptionType options, char* apattern, char* astr) char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((OnigUChar* )s, r, &einfo); fprintf(stderr, "ERROR: %s\n", s); + onig_end(); return -1; } diff --git a/sample/simple.c b/sample/simple.c index 95110b8..5a14042 100644 --- a/sample/simple.c +++ b/sample/simple.c @@ -49,6 +49,9 @@ extern int main(int argc, char* argv[]) char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((UChar* )s, r); fprintf(stderr, "ERROR: %s\n", s); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + onig_free(reg); + onig_end(); return -1; } diff --git a/sample/sql.c b/sample/sql.c index 8e95f70..1602ac9 100644 --- a/sample/sql.c +++ b/sample/sql.c @@ -42,6 +42,7 @@ extern int main(int argc, char* argv[]) char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((UChar* )s, r, &einfo); fprintf(stderr, "ERROR: %s\n", s); + onig_end(); return -1; } @@ -66,6 +67,9 @@ extern int main(int argc, char* argv[]) char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((UChar* )s, r); fprintf(stderr, "ERROR: %s\n", s); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + onig_free(reg); + onig_end(); return -1; } diff --git a/sample/syntax.c b/sample/syntax.c index e292079..e034608 100644 --- a/sample/syntax.c +++ b/sample/syntax.c @@ -45,6 +45,8 @@ extern int exec(OnigSyntaxType* syntax, char* apattern, char* astr) char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((UChar* )s, r); fprintf(stderr, "ERROR: %s\n", s); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + onig_free(reg); return -1; } diff --git a/sample/user_property.c b/sample/user_property.c index 8b2abd2..d52adc0 100644 --- a/sample/user_property.c +++ b/sample/user_property.c @@ -40,6 +40,7 @@ main(int argc, char* argv[]) char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((UChar* )s, r); fprintf(stderr, "ERROR: %s\n", s); + onig_end(); return -1; } @@ -52,6 +53,7 @@ main(int argc, char* argv[]) char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((UChar* )s, r, &einfo); fprintf(stderr, "onig_new: ERROR: %s\n", s); + onig_end(); return -1; } @@ -76,6 +78,9 @@ main(int argc, char* argv[]) char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((UChar* )s, r); fprintf(stderr, "ERROR: %s\n", s); + onig_region_free(region, 1 /* 1:free self, 0:free contents only */); + onig_free(reg); + onig_end(); return -1; } diff --git a/src/gb18030.c b/src/gb18030.c index 7654432..8d415b0 100644 --- a/src/gb18030.c +++ b/src/gb18030.c @@ -2,7 +2,7 @@ gb18030.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2005-2018 KUBO Takehiro + * Copyright (c) 2005-2019 KUBO Takehiro * K.Kosako * All rights reserved. * @@ -67,11 +67,11 @@ gb18030_mbc_enc_len(const UChar* p) { if (GB18030_MAP[*p] != CM) return 1; + p++; if (GB18030_MAP[*p] == C4) return 4; - if (GB18030_MAP[*p] == C1) - return 1; /* illegal sequence */ + return 2; } diff --git a/src/oniguruma.h b/src/oniguruma.h index f6aa5ba..90cf2d9 100644 --- a/src/oniguruma.h +++ b/src/oniguruma.h @@ -36,9 +36,9 @@ extern "C" { #define ONIGURUMA #define ONIGURUMA_VERSION_MAJOR 6 #define ONIGURUMA_VERSION_MINOR 9 -#define ONIGURUMA_VERSION_TEENY 2 +#define ONIGURUMA_VERSION_TEENY 3 -#define ONIGURUMA_VERSION_INT 60902 +#define ONIGURUMA_VERSION_INT 60903 #ifndef P_ #if defined(__STDC__) || defined(_WIN32) @@ -52,6 +52,7 @@ extern "C" { # define PV_(args) args #endif +#ifndef ONIG_STATIC #ifndef ONIG_EXTERN #if defined(_WIN32) && !defined(__GNUC__) #if defined(ONIGURUMA_EXPORT) @@ -65,6 +66,9 @@ extern "C" { #ifndef ONIG_EXTERN #define ONIG_EXTERN extern #endif +#else +#define ONIG_EXTERN extern +#endif /* PART: character encoding */ @@ -517,6 +521,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIG_SYN_BACKSLASH_ESCAPE_IN_CC (1U<<21) /* [..\w..] etc.. */ #define ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC (1U<<22) #define ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC (1U<<23) /* [0-9-a]=[0-9\-a] */ +#define ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC (1U<<26) /* syntax (behavior) warning */ #define ONIG_SYN_WARN_CC_OP_NOT_ESCAPED (1U<<24) /* [,-,] */ #define ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT (1U<<25) /* (?:a*)+ */ @@ -766,6 +771,8 @@ int onig_init P_((void)); ONIG_EXTERN int onig_error_code_to_str PV_((OnigUChar* s, int err_code, ...)); ONIG_EXTERN +int onig_is_error_code_needs_param PV_((int code)); +ONIG_EXTERN void onig_set_warn_func P_((OnigWarnFunc f)); ONIG_EXTERN void onig_set_verb_warn_func P_((OnigWarnFunc f)); diff --git a/src/regcomp.c b/src/regcomp.c index c2c04a4..b96c793 100644 --- a/src/regcomp.c +++ b/src/regcomp.c @@ -599,12 +599,34 @@ select_str_opcode(int mb_len, int str_len, int ignore_case) } static int -compile_tree_empty_check(Node* node, regex_t* reg, int empty_info, ScanEnv* env) +is_strict_real_node(Node* node) +{ + switch (NODE_TYPE(node)) { + case NODE_STRING: + { + StrNode* sn = STR_(node); + return (sn->end != sn->s); + } + break; + + case NODE_CCLASS: + case NODE_CTYPE: + return 1; + break; + + default: + return 0; + break; + } +} + +static int +compile_tree_empty_check(Node* node, regex_t* reg, int emptiness, ScanEnv* env) { int r; int saved_num_null_check = reg->num_null_check; - if (empty_info != BODY_IS_NOT_EMPTY) { + if (emptiness != BODY_IS_NOT_EMPTY) { r = add_op(reg, OP_EMPTY_CHECK_START); if (r != 0) return r; COP(reg)->empty_check_start.mem = reg->num_null_check; /* NULL CHECK ID */ @@ -614,12 +636,12 @@ compile_tree_empty_check(Node* node, regex_t* reg, int empty_info, ScanEnv* env) r = compile_tree(node, reg, env); if (r != 0) return r; - if (empty_info != BODY_IS_NOT_EMPTY) { - if (empty_info == BODY_IS_EMPTY) + if (emptiness != BODY_IS_NOT_EMPTY) { + if (emptiness == BODY_IS_EMPTY_POSSIBILITY) r = add_op(reg, OP_EMPTY_CHECK_END); - else if (empty_info == BODY_IS_EMPTY_MEM) + else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_MEM) r = add_op(reg, OP_EMPTY_CHECK_END_MEMST); - else if (empty_info == BODY_IS_EMPTY_REC) + else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_REC) r = add_op(reg, OP_EMPTY_CHECK_END_MEMST_PUSH); if (r != 0) return r; @@ -895,12 +917,12 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) } p[id].lower = lower; - p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper); + p[id].upper = (IS_INFINITE_REPEAT(upper) ? 0x7fffffff : upper); return 0; } static int -compile_range_repeat_node(QuantNode* qn, int target_len, int empty_info, +compile_range_repeat_node(QuantNode* qn, int target_len, int emptiness, regex_t* reg, ScanEnv* env) { int r; @@ -915,7 +937,7 @@ compile_range_repeat_node(QuantNode* qn, int target_len, int empty_info, r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper); if (r != 0) return r; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); if (r != 0) return r; if ( @@ -937,7 +959,7 @@ compile_range_repeat_node(QuantNode* qn, int target_len, int empty_info, static int is_anychar_infinite_greedy(QuantNode* qn) { - if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) && + if (qn->greedy && IS_INFINITE_REPEAT(qn->upper) && NODE_IS_ANYCHAR(NODE_QUANT_BODY(qn))) return 1; else @@ -951,8 +973,8 @@ static int compile_length_quantifier_node(QuantNode* qn, regex_t* reg) { int len, mod_tlen; - int infinite = IS_REPEAT_INFINITE(qn->upper); - enum BodyEmpty empty_info = qn->empty_info; + int infinite = IS_INFINITE_REPEAT(qn->upper); + enum BodyEmptyType emptiness = qn->emptiness; int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg); if (tlen < 0) return tlen; @@ -969,10 +991,9 @@ compile_length_quantifier_node(QuantNode* qn, regex_t* reg) } } - if (empty_info == BODY_IS_NOT_EMPTY) - mod_tlen = tlen; - else - mod_tlen = tlen + (SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END); + mod_tlen = tlen; + if (emptiness != BODY_IS_NOT_EMPTY) + mod_tlen += SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END; if (infinite && (qn->lower <= 1 || @@ -1026,8 +1047,8 @@ static int compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) { int i, r, mod_tlen; - int infinite = IS_REPEAT_INFINITE(qn->upper); - enum BodyEmpty empty_info = qn->empty_info; + int infinite = IS_INFINITE_REPEAT(qn->upper); + enum BodyEmptyType emptiness = qn->emptiness; int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg); if (tlen < 0) return tlen; @@ -1055,10 +1076,9 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) } } - if (empty_info == BODY_IS_NOT_EMPTY) - mod_tlen = tlen; - else - mod_tlen = tlen + (SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END); + mod_tlen = tlen; + if (emptiness != BODY_IS_NOT_EMPTY) + mod_tlen += SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END; if (infinite && (qn->lower <= 1 || @@ -1096,7 +1116,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) COP(reg)->push_or_jump_exact1.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP; COP(reg)->push_or_jump_exact1.c = STR_(qn->head_exact)->s[0]; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); if (r != 0) return r; addr = -(mod_tlen + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1); @@ -1109,7 +1129,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) COP(reg)->push_if_peek_next.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP; COP(reg)->push_if_peek_next.c = STR_(qn->next_head_exact)->s[0]; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); if (r != 0) return r; addr = -(mod_tlen + (int )SIZE_OP_PUSH_IF_PEEK_NEXT); @@ -1119,7 +1139,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) if (r != 0) return r; COP(reg)->push.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); if (r != 0) return r; addr = -(mod_tlen + (int )SIZE_OP_PUSH); @@ -1134,7 +1154,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) if (r != 0) return r; COP(reg)->jump.addr = mod_tlen + SIZE_INC_OP; - r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); + r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env); if (r != 0) return r; r = add_op(reg, OP_PUSH); @@ -1188,7 +1208,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) r = compile_tree(NODE_QUANT_BODY(qn), reg, env); } else { - r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg, env); + r = compile_range_repeat_node(qn, mod_tlen, emptiness, reg, env); } return r; } @@ -1273,7 +1293,7 @@ compile_length_bag_node(BagNode* node, regex_t* reg) break; case BAG_STOP_BACKTRACK: - if (NODE_IS_STOP_BT_SIMPLE_REPEAT(node)) { + if (NODE_IS_STRICT_REAL_REPEAT(node)) { int v; QuantNode* qn; @@ -1307,8 +1327,9 @@ compile_length_bag_node(BagNode* node, regex_t* reg) len += tlen; } + len += SIZE_OP_JUMP + SIZE_OP_ATOMIC_END; + if (IS_NOT_NULL(Else)) { - len += SIZE_OP_JUMP; tlen = compile_length_tree(Else, reg); if (tlen < 0) return tlen; len += tlen; @@ -1423,7 +1444,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) break; case BAG_STOP_BACKTRACK: - if (NODE_IS_STOP_BT_SIMPLE_REPEAT(node)) { + if (NODE_IS_STRICT_REAL_REPEAT(node)) { QuantNode* qn = QUANT_(NODE_BAG_BODY(node)); r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env); if (r != 0) return r; @@ -1455,7 +1476,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) case BAG_IF_ELSE: { - int cond_len, then_len, jump_len; + int cond_len, then_len, else_len, jump_len; Node* cond = NODE_BAG_BODY(node); Node* Then = node->te.Then; Node* Else = node->te.Else; @@ -1472,8 +1493,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) else then_len = 0; - jump_len = cond_len + then_len + SIZE_OP_ATOMIC_END; - if (IS_NOT_NULL(Else)) jump_len += SIZE_OP_JUMP; + jump_len = cond_len + then_len + SIZE_OP_ATOMIC_END + SIZE_OP_JUMP; r = add_op(reg, OP_PUSH); if (r != 0) return r; @@ -1490,11 +1510,20 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) } if (IS_NOT_NULL(Else)) { - int else_len = compile_length_tree(Else, reg); - r = add_op(reg, OP_JUMP); - if (r != 0) return r; - COP(reg)->jump.addr = else_len + SIZE_INC_OP; + else_len = compile_length_tree(Else, reg); + if (else_len < 0) return else_len; + } + else + else_len = 0; + + r = add_op(reg, OP_JUMP); + if (r != 0) return r; + COP(reg)->jump.addr = SIZE_OP_ATOMIC_END + else_len + SIZE_INC_OP; + r = add_op(reg, OP_ATOMIC_END); + if (r != 0) return r; + + if (IS_NOT_NULL(Else)) { r = compile_tree(Else, reg, env); } } @@ -3035,7 +3064,7 @@ tree_max_len(Node* node, ScanEnv* env) if (qn->upper != 0) { len = tree_max_len(NODE_BODY(node), env); if (len != 0) { - if (! IS_REPEAT_INFINITE(qn->upper)) + if (! IS_INFINITE_REPEAT(qn->upper)) len = distance_multiply(len, qn->upper); else len = INFINITE_LEN; @@ -3581,7 +3610,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg) type = NODE_TYPE(node); if (type == NODE_QUANT) { QuantNode* qn = QUANT_(node); - if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) { + if (qn->greedy && IS_INFINITE_REPEAT(qn->upper)) { #ifdef USE_QUANT_PEEK_NEXT Node* n = get_head_value_node(next_node, 1, reg); /* '\0': for UTF-16BE etc... */ @@ -3591,7 +3620,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg) #endif /* automatic posseivation a*b ==> (?>a*)b */ if (qn->lower <= 1) { - if (NODE_IS_SIMPLE_TYPE(NODE_BODY(node))) { + if (is_strict_real_node(NODE_BODY(node))) { Node *x, *y; x = get_head_value_node(NODE_BODY(node), 0, reg); if (IS_NOT_NULL(x)) { @@ -3599,7 +3628,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg) if (IS_NOT_NULL(y) && is_exclusive(x, y, reg)) { Node* en = onig_node_new_bag(BAG_STOP_BACKTRACK); CHECK_NULL_RETURN_MEMERR(en); - NODE_STATUS_ADD(en, STOP_BT_SIMPLE_REPEAT); + NODE_STATUS_ADD(en, STRICT_REAL_REPEAT); swap_node(node, en); NODE_BODY(node) = en; } @@ -4001,11 +4030,11 @@ expand_case_fold_string(Node* node, regex_t* reg, int state) return r; } -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT -static enum BodyEmpty +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT +static enum BodyEmptyType quantifiers_memory_node_info(Node* node) { - int r = BODY_IS_EMPTY; + int r = BODY_IS_EMPTY_POSSIBILITY; switch (NODE_TYPE(node)) { case NODE_LIST: @@ -4022,7 +4051,7 @@ quantifiers_memory_node_info(Node* node) #ifdef USE_CALL case NODE_CALL: if (NODE_IS_RECURSION(node)) { - return BODY_IS_EMPTY_REC; /* tiny version */ + return BODY_IS_EMPTY_POSSIBILITY_REC; /* tiny version */ } else r = quantifiers_memory_node_info(NODE_BODY(node)); @@ -4044,9 +4073,9 @@ quantifiers_memory_node_info(Node* node) switch (en->type) { case BAG_MEMORY: if (NODE_IS_RECURSION(node)) { - return BODY_IS_EMPTY_REC; + return BODY_IS_EMPTY_POSSIBILITY_REC; } - return BODY_IS_EMPTY_MEM; + return BODY_IS_EMPTY_POSSIBILITY_MEM; break; case BAG_OPTION: @@ -4083,7 +4112,7 @@ quantifiers_memory_node_info(Node* node) return r; } -#endif /* USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT */ +#endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */ #ifdef USE_CALL @@ -4351,7 +4380,7 @@ setup_called_state_call(Node* node, int state) { QuantNode* qn = QUANT_(node); - if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 2) + if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2) state |= IN_REAL_REPEAT; if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; @@ -4468,7 +4497,7 @@ setup_called_state(Node* node, int state) { QuantNode* qn = QUANT_(node); - if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 2) + if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2) state |= IN_REAL_REPEAT; if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; @@ -4600,24 +4629,24 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) NODE_STATUS_ADD(node, IN_MULTI_ENTRY); } - if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) { + if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 1) { d = tree_min_len(body, env); if (d == 0) { -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT - qn->empty_info = quantifiers_memory_node_info(body); - if (qn->empty_info == BODY_IS_EMPTY_REC) { +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT + qn->emptiness = quantifiers_memory_node_info(body); + if (qn->emptiness == BODY_IS_EMPTY_POSSIBILITY_REC) { if (NODE_TYPE(body) == NODE_BAG && BAG_(body)->type == BAG_MEMORY) { MEM_STATUS_ON(env->bt_mem_end, BAG_(body)->m.regnum); } } #else - qn->empty_info = BODY_IS_EMPTY; + qn->emptiness = BODY_IS_EMPTY_POSSIBILITY; #endif } } - if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 2) + if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2) state |= IN_REAL_REPEAT; if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; @@ -4628,7 +4657,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) /* expand string */ #define EXPAND_STRING_MAX_LENGTH 100 if (NODE_TYPE(body) == NODE_STRING) { - if (!IS_REPEAT_INFINITE(qn->lower) && qn->lower == qn->upper && + if (!IS_INFINITE_REPEAT(qn->lower) && qn->lower == qn->upper && qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) { int len = NODE_STRING_LEN(body); StrNode* sn = STR_(body); @@ -4646,7 +4675,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) } } - if (qn->greedy && (qn->empty_info == BODY_IS_NOT_EMPTY)) { + if (qn->greedy && (qn->emptiness == BODY_IS_NOT_EMPTY)) { if (NODE_TYPE(body) == NODE_QUANT) { QuantNode* tqn = QUANT_(body); if (IS_NOT_NULL(tqn->head_exact)) { @@ -4663,7 +4692,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env) } /* setup_tree does the following work. - 1. check empty loop. (set qn->empty_info) + 1. check empty loop. (set qn->emptiness) 2. expand ignore-case in char class. 3. set memory status bit flags. (reg->mem_stats) 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact]. @@ -4752,10 +4781,10 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) r = setup_tree(target, reg, state, env); if (NODE_TYPE(target) == NODE_QUANT) { QuantNode* tqn = QUANT_(target); - if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 && + if (IS_INFINITE_REPEAT(tqn->upper) && tqn->lower <= 1 && tqn->greedy != 0) { /* (?>a*), a*+ etc... */ - if (NODE_IS_SIMPLE_TYPE(NODE_BODY(target))) - NODE_STATUS_ADD(node, STOP_BT_SIMPLE_REPEAT); + if (is_strict_real_node(NODE_BODY(target))) + NODE_STATUS_ADD(node, STRICT_REAL_REPEAT); } } } @@ -5752,7 +5781,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env) opt->sm.reach_end = 0; } - if (IS_REPEAT_INFINITE(qn->upper)) { + if (IS_INFINITE_REPEAT(qn->upper)) { if (env->mmd.max == 0 && NODE_IS_ANYCHAR(NODE_BODY(node)) && qn->greedy != 0) { if (IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn), env))) @@ -6672,6 +6701,7 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) } else { len = ONIGENC_CODE_TO_MBCLEN(enc, code); + if (len < 0) return 0; } return onig_is_code_in_cc_len(len, code, cc); } diff --git a/src/regenc.c b/src/regenc.c index 6376565..9fab721 100644 --- a/src/regenc.c +++ b/src/regenc.c @@ -853,6 +853,8 @@ onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigCaseFoldType flag, extern int onigenc_mb2_code_to_mbclen(OnigCodePoint code) { + if ((code & (~0xffff)) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; + if ((code & 0xff00) != 0) return 2; else return 1; } diff --git a/src/regerror.c b/src/regerror.c index 7564827..e6d1806 100644 --- a/src/regerror.c +++ b/src/regerror.c @@ -257,6 +257,23 @@ static int to_ascii(OnigEncoding enc, UChar *s, UChar *end, } +extern int +onig_is_error_code_needs_param(int code) +{ + switch (code) { + case ONIGERR_UNDEFINED_NAME_REFERENCE: + case ONIGERR_UNDEFINED_GROUP_REFERENCE: + case ONIGERR_MULTIPLEX_DEFINED_NAME: + case ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL: + case ONIGERR_INVALID_GROUP_NAME: + case ONIGERR_INVALID_CHAR_IN_GROUP_NAME: + case ONIGERR_INVALID_CHAR_PROPERTY_NAME: + return 1; + default: + return 0; + } +} + /* for ONIG_MAX_ERROR_MESSAGE_LEN */ #define MAX_ERROR_PAR_LEN 30 diff --git a/src/regexec.c b/src/regexec.c index 6618996..f957b75 100644 --- a/src/regexec.c +++ b/src/regexec.c @@ -980,6 +980,8 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) #define STK_CALL_FRAME 0x0400 #define STK_RETURN 0x0500 #define STK_SAVE_VAL 0x0600 +#define STK_PREC_READ_START 0x0700 +#define STK_PREC_READ_END 0x0800 /* stack type check mask */ #define STK_MASK_POP_USED STK_ALT_FLAG @@ -1544,8 +1546,8 @@ stack_double(int is_alloca, char** arg_alloc_base, #define STACK_PUSH_ALT(pat,s,sprev) STACK_PUSH(STK_ALT,pat,s,sprev) #define STACK_PUSH_SUPER_ALT(pat,s,sprev) STACK_PUSH(STK_SUPER_ALT,pat,s,sprev) -#define STACK_PUSH_POS(s,sprev) \ - STACK_PUSH(STK_TO_VOID_START,(Operation* )0,s,sprev) +#define STACK_PUSH_PREC_READ_START(s,sprev) \ + STACK_PUSH(STK_PREC_READ_START,(Operation* )0,s,sprev) #define STACK_PUSH_ALT_PREC_READ_NOT(pat,s,sprev) \ STACK_PUSH(STK_ALT_PREC_READ_NOT,pat,s,sprev) #define STACK_PUSH_TO_VOID_START STACK_PUSH_TYPE(STK_TO_VOID_START) @@ -1887,6 +1889,27 @@ stack_double(int is_alloca, char** arg_alloc_base, }\ } while(0) +#define STACK_GET_PREC_READ_START(k) do {\ + int level = 0;\ + k = stk;\ + while (1) {\ + k--;\ + STACK_BASE_CHECK(k, "STACK_GET_PREC_READ_START");\ + if (IS_TO_VOID_TARGET(k)) {\ + k->type = STK_VOID;\ + }\ + else if (k->type == STK_PREC_READ_START) {\ + if (level == 0) {\ + break;\ + }\ + level--;\ + }\ + else if (k->type == STK_PREC_READ_END) {\ + level++;\ + }\ + }\ +} while(0) + #define STACK_EMPTY_CHECK(isnull,sid,s) do {\ StackType* k = stk;\ while (1) {\ @@ -1913,7 +1936,7 @@ stack_double(int is_alloca, char** arg_alloc_base, }\ } while (0) -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT #define STACK_EMPTY_CHECK_MEM(isnull,sid,s,reg) do {\ StackType* k = stk;\ while (1) {\ @@ -1927,9 +1950,10 @@ stack_double(int is_alloca, char** arg_alloc_base, }\ else {\ UChar* endp;\ + int level = 0;\ (isnull) = 1;\ while (k < stk) {\ - if (k->type == STK_MEM_START) {\ + if (k->type == STK_MEM_START && level == 0) {\ STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\ if (endp == 0) {\ (isnull) = 0; break;\ @@ -1941,6 +1965,12 @@ stack_double(int is_alloca, char** arg_alloc_base, (isnull) = -1; /* empty, but position changed */ \ }\ }\ + else if (k->type == STK_PREC_READ_START) {\ + level++;\ + }\ + else if (k->type == STK_PREC_READ_END) {\ + level--;\ + }\ k++;\ }\ break;\ @@ -1965,10 +1995,11 @@ stack_double(int is_alloca, char** arg_alloc_base, }\ else {\ UChar* endp;\ + int prec_level = 0;\ (isnull) = 1;\ while (k < stk) {\ if (k->type == STK_MEM_START) {\ - if (level == 0) {\ + if (level == 0 && prec_level == 0) {\ STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\ if (endp == 0) {\ (isnull) = 0; break;\ @@ -1987,6 +2018,12 @@ stack_double(int is_alloca, char** arg_alloc_base, else if (k->type == STK_EMPTY_CHECK_END) {\ if (k->zid == (sid)) level--;\ }\ + else if (k->type == STK_PREC_READ_START) {\ + prec_level++;\ + }\ + else if (k->type == STK_PREC_READ_END) {\ + prec_level--;\ + }\ k++;\ }\ break;\ @@ -2023,7 +2060,7 @@ stack_double(int is_alloca, char** arg_alloc_base, }\ }\ } while(0) -#endif /* USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT */ +#endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */ #define STACK_GET_REPEAT(sid, k) do {\ int level = 0;\ @@ -2968,6 +3005,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, NEXT_OUT; CASE_OP(CCLASS_MB) + DATA_ENSURE(1); if (! ONIGENC_IS_MBC_HEAD(encode, s)) goto fail; cclass_mb: @@ -3441,11 +3479,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, ? STACK_AT(mem_end_stk[mem])->u.mem.pstr : (UChar* )((void* )mem_end_stk[mem])); n = (int )(pend - pstart); - DATA_ENSURE(n); - sprev = s; - STRING_CMP(pstart, s, n); - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; + if (n != 0) { + DATA_ENSURE(n); + sprev = s; + STRING_CMP(s, pstart, n); + while (sprev + (len = enclen(encode, sprev)) < s) + sprev += len; + } } INC_OP; JUMP_OUT; @@ -3468,11 +3508,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, ? STACK_AT(mem_end_stk[mem])->u.mem.pstr : (UChar* )((void* )mem_end_stk[mem])); n = (int )(pend - pstart); - DATA_ENSURE(n); - sprev = s; - STRING_CMP_IC(case_fold_flag, pstart, &s, n); - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; + if (n != 0) { + DATA_ENSURE(n); + sprev = s; + STRING_CMP_IC(case_fold_flag, pstart, &s, n); + while (sprev + (len = enclen(encode, sprev)) < s) + sprev += len; + } } INC_OP; JUMP_OUT; @@ -3498,15 +3540,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, ? STACK_AT(mem_end_stk[mem])->u.mem.pstr : (UChar* )((void* )mem_end_stk[mem])); n = (int )(pend - pstart); - DATA_ENSURE(n); - sprev = s; - swork = s; - STRING_CMP_VALUE(pstart, swork, n, is_fail); - if (is_fail) continue; - s = swork; - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; - + if (n != 0) { + DATA_ENSURE(n); + sprev = s; + swork = s; + STRING_CMP_VALUE(swork, pstart, n, is_fail); + if (is_fail) continue; + s = swork; + while (sprev + (len = enclen(encode, sprev)) < s) + sprev += len; + } break; /* success */ } if (i == tlen) goto fail; @@ -3535,15 +3578,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, ? STACK_AT(mem_end_stk[mem])->u.mem.pstr : (UChar* )((void* )mem_end_stk[mem])); n = (int )(pend - pstart); - DATA_ENSURE(n); - sprev = s; - swork = s; - STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail); - if (is_fail) continue; - s = swork; - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; - + if (n != 0) { + DATA_ENSURE(n); + sprev = s; + swork = s; + STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail); + if (is_fail) continue; + s = swork; + while (sprev + (len = enclen(encode, sprev)) < s) + sprev += len; + } break; /* success */ } if (i == tlen) goto fail; @@ -3560,6 +3604,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, int len; int level; MemNumType* mems; + UChar* ssave; n = 0; backref_with_level: @@ -3567,10 +3612,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, tlen = p->backref_general.num; mems = tlen == 1 ? &(p->backref_general.n1) : p->backref_general.ns; - sprev = s; + ssave = s; if (backref_match_at_nested_level(reg, stk, stk_base, n, case_fold_flag, level, (int )tlen, mems, &s, end)) { - if (sprev < end) { + if (ssave != s) { + sprev = ssave; while (sprev + (len = enclen(encode, sprev)) < s) sprev += len; } @@ -3658,7 +3704,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } JUMP_OUT; -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT CASE_OP(EMPTY_CHECK_END_MEMST) { int is_empty; @@ -3683,7 +3729,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, int is_empty; mem = p->empty_check_end.mem; /* mem: null check id */ -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT STACK_EMPTY_CHECK_MEM_REC(is_empty, mem, s, reg); #else STACK_EMPTY_CHECK_REC(is_empty, mem, s); @@ -3851,14 +3897,15 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, goto repeat_inc_ng; CASE_OP(PREC_READ_START) - STACK_PUSH_POS(s, sprev); + STACK_PUSH_PREC_READ_START(s, sprev); INC_OP; JUMP_OUT; CASE_OP(PREC_READ_END) - STACK_EXEC_TO_VOID(stkp); + STACK_GET_PREC_READ_START(stkp); s = stkp->u.state.pstr; sprev = stkp->u.state.pstr_prev; + STACK_PUSH(STK_PREC_READ_END,0,0,0); INC_OP; JUMP_OUT; @@ -5443,6 +5490,9 @@ onig_builtin_error(OnigCalloutArgs* args, void* user_data ARG_UNUSED) if (n >= 0) { n = ONIGERR_INVALID_CALLOUT_BODY; } + else if (onig_is_error_code_needs_param(n)) { + n = ONIGERR_INVALID_CALLOUT_BODY; + } return n; } diff --git a/src/regext.c b/src/regext.c index fa4b360..965c793 100644 --- a/src/regext.c +++ b/src/regext.c @@ -29,6 +29,7 @@ #include "regint.h" +#if 0 static void conv_ext0be32(const UChar* s, const UChar* end, UChar* conv) { @@ -158,6 +159,7 @@ conv_encoding(OnigEncoding from, OnigEncoding to, const UChar* s, const UChar* e return ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION; } +#endif extern int onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end, @@ -169,9 +171,7 @@ onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end, if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL; if (ci->pattern_enc != ci->target_enc) { - r = conv_encoding(ci->pattern_enc, ci->target_enc, pattern, pattern_end, - &cpat, &cpat_end); - if (r != 0) return r; + return ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION; } else { cpat = (UChar* )pattern; diff --git a/src/regint.h b/src/regint.h index 56767e8..38389a1 100644 --- a/src/regint.h +++ b/src/regint.h @@ -63,7 +63,7 @@ #define USE_CALL #define USE_CALLOUT #define USE_BACKREF_WITH_LEVEL /* \k, \k */ -#define USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT /* /(?:()|())*\2/ */ +#define USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT /* /(?:()|())*\2/ */ #define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE /* /\n$/ =~ "\n" */ #define USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR #define USE_RETRY_LIMIT_IN_MATCH @@ -348,8 +348,8 @@ typedef unsigned int MemStatusType; #define DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag) \ ((case_fold_flag) & ~INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) -#define REPEAT_INFINITE -1 -#define IS_REPEAT_INFINITE(n) ((n) == REPEAT_INFINITE) +#define INFINITE_REPEAT -1 +#define IS_INFINITE_REPEAT(n) ((n) == INFINITE_REPEAT) /* bitset */ #define BITS_PER_BYTE 8 diff --git a/src/regparse.c b/src/regparse.c index f1deea3..7f8b1a9 100644 --- a/src/regparse.c +++ b/src/regparse.c @@ -77,6 +77,7 @@ OnigSyntaxType OnigSyntaxOniguruma = { ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP | ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME | ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY | + ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC | ONIG_SYN_WARN_CC_OP_NOT_ESCAPED | ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT ) , ONIG_OPTION_NONE @@ -1093,6 +1094,35 @@ onig_name_to_group_numbers(regex_t* reg, const UChar* name, return e->back_num; } +static int +name_to_group_numbers(ScanEnv* env, const UChar* name, const UChar* name_end, + int** nums) +{ + regex_t* reg; + NameEntry* e; + + reg = env->reg; + e = name_find(reg, name, name_end); + + if (IS_NULL(e)) { + onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE, + (UChar* )name, (UChar* )name_end); + return ONIGERR_UNDEFINED_NAME_REFERENCE; + } + + switch (e->back_num) { + case 0: + break; + case 1: + *nums = &(e->back_ref1); + break; + default: + *nums = e->back_refs; + break; + } + return e->back_num; +} + extern int onig_name_to_backref_number(regex_t* reg, const UChar* name, const UChar* name_end, OnigRegion *region) @@ -1869,8 +1899,8 @@ callout_tag_table_new(CalloutTagTable** rt) } static int -callout_tag_entry_raw(CalloutTagTable* t, UChar* name, UChar* name_end, - CalloutTagVal entry_val) +callout_tag_entry_raw(ScanEnv* env, CalloutTagTable* t, UChar* name, + UChar* name_end, CalloutTagVal entry_val) { int r; CalloutTagVal val; @@ -1879,8 +1909,11 @@ callout_tag_entry_raw(CalloutTagTable* t, UChar* name, UChar* name_end, return ONIGERR_INVALID_CALLOUT_TAG_NAME; val = callout_tag_find(t, name, name_end); - if (val >= 0) + if (val >= 0) { + onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME, + name, name_end); return ONIGERR_MULTIPLEX_DEFINED_NAME; + } r = onig_st_insert_strend(t, name, name_end, (HashDataType )entry_val); if (r < 0) return r; @@ -1909,7 +1942,7 @@ ext_ensure_tag_table(regex_t* reg) } static int -callout_tag_entry(regex_t* reg, UChar* name, UChar* name_end, +callout_tag_entry(ScanEnv* env, regex_t* reg, UChar* name, UChar* name_end, CalloutTagVal entry_val) { int r; @@ -1921,7 +1954,7 @@ callout_tag_entry(regex_t* reg, UChar* name, UChar* name_end, ext = onig_get_regex_ext(reg); CHECK_NULL_RETURN_MEMERR(ext); - r = callout_tag_entry_raw(ext->tag_table, name, name_end, entry_val); + r = callout_tag_entry_raw(env, ext->tag_table, name, name_end, entry_val); e = onig_reg_callout_list_at(reg, (int )entry_val); CHECK_NULL_RETURN_MEMERR(e); @@ -2391,10 +2424,10 @@ node_new_quantifier(int lower, int upper, int by_number) CHECK_NULL_RETURN(node); NODE_SET_TYPE(node, NODE_QUANT); - QUANT_(node)->lower = lower; - QUANT_(node)->upper = upper; - QUANT_(node)->greedy = 1; - QUANT_(node)->empty_info = BODY_IS_NOT_EMPTY; + QUANT_(node)->lower = lower; + QUANT_(node)->upper = upper; + QUANT_(node)->greedy = 1; + QUANT_(node)->emptiness = BODY_IS_NOT_EMPTY; QUANT_(node)->head_exact = NULL_NODE; QUANT_(node)->next_head_exact = NULL_NODE; QUANT_(node)->is_refered = 0; @@ -2694,7 +2727,7 @@ make_text_segment(Node** node, ScanEnv* env) ns[0] = x; ns[1] = NULL_NODE; - x = node_new_quantifier(0, REPEAT_INFINITE, 1); + x = node_new_quantifier(0, INFINITE_REPEAT, 1); if (IS_NULL(x)) goto err; NODE_BODY(x) = ns[0]; @@ -3044,7 +3077,7 @@ make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter, if (expr == NULL_NODE) { /* default expr \O* */ - quant = node_new_quantifier(0, REPEAT_INFINITE, 0); + quant = node_new_quantifier(0, INFINITE_REPEAT, 0); if (IS_NULL(quant)) goto err0; r = node_new_true_anychar(&body, env); @@ -3086,7 +3119,7 @@ make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter, if (r != 0) goto err; possessive = 1; - r = make_absent_engine(&ns[2], id1, absent, ns[3], 0, REPEAT_INFINITE, + r = make_absent_engine(&ns[2], id1, absent, ns[3], 0, INFINITE_REPEAT, possessive, is_range_cutter, env); if (r != 0) goto err; @@ -3236,10 +3269,18 @@ node_new_empty(void) static Node* node_new_str_raw_char(UChar c) { + int i; UChar p[1]; + Node* node; p[0] = c; - return node_new_str_raw(p, p + 1); + node = node_new_str_raw(p, p + 1); + + /* clear buf tail */ + for (i = 1; i < NODE_STRING_BUF_SIZE; i++) + STR_(node)->buf[i] = '\0'; + + return node; } static Node* @@ -3275,24 +3316,6 @@ str_node_can_be_split(Node* node, OnigEncoding enc) return 0; } -#ifdef USE_PAD_TO_SHORT_BYTE_CHAR -static int -node_str_head_pad(StrNode* sn, int num, UChar val) -{ - UChar buf[NODE_STRING_BUF_SIZE]; - int i, len; - - len = sn->end - sn->s; - onig_strcpy(buf, sn->s, sn->end); - onig_strcpy(&(sn->s[num]), buf, buf + len); - sn->end += num; - - for (i = 0; i < num; i++) { - sn->s[i] = val; - } -} -#endif - extern int onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc) { @@ -3877,19 +3900,19 @@ quantifier_type_num(QuantNode* q) if (q->greedy) { if (q->lower == 0) { if (q->upper == 1) return 0; - else if (IS_REPEAT_INFINITE(q->upper)) return 1; + else if (IS_INFINITE_REPEAT(q->upper)) return 1; } else if (q->lower == 1) { - if (IS_REPEAT_INFINITE(q->upper)) return 2; + if (IS_INFINITE_REPEAT(q->upper)) return 2; } } else { if (q->lower == 0) { if (q->upper == 1) return 3; - else if (IS_REPEAT_INFINITE(q->upper)) return 4; + else if (IS_INFINITE_REPEAT(q->upper)) return 4; } else if (q->lower == 1) { - if (IS_REPEAT_INFINITE(q->upper)) return 5; + if (IS_INFINITE_REPEAT(q->upper)) return 5; } } return -1; @@ -3926,8 +3949,8 @@ onig_reduce_nested_quantifier(Node* pnode, Node* cnode) pnum = quantifier_type_num(p); cnum = quantifier_type_num(c); if (pnum < 0 || cnum < 0) { - if ((p->lower == p->upper) && ! IS_REPEAT_INFINITE(p->upper)) { - if ((c->lower == c->upper) && ! IS_REPEAT_INFINITE(c->upper)) { + if ((p->lower == p->upper) && ! IS_INFINITE_REPEAT(p->upper)) { + if ((c->lower == c->upper) && ! IS_INFINITE_REPEAT(c->upper)) { int n = onig_positive_int_multiply(p->lower, c->lower); if (n >= 0) { p->lower = p->upper = n; @@ -3946,11 +3969,11 @@ onig_reduce_nested_quantifier(Node* pnode, Node* cnode) break; case RQ_A: NODE_BODY(pnode) = NODE_BODY(cnode); - p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1; + p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 1; break; case RQ_AQ: NODE_BODY(pnode) = NODE_BODY(cnode); - p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0; + p->lower = 0; p->upper = INFINITE_REPEAT; p->greedy = 0; break; case RQ_QQ: NODE_BODY(pnode) = NODE_BODY(cnode); @@ -3959,13 +3982,13 @@ onig_reduce_nested_quantifier(Node* pnode, Node* cnode) case RQ_P_QQ: NODE_BODY(pnode) = cnode; p->lower = 0; p->upper = 1; p->greedy = 0; - c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1; + c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 1; return ; break; case RQ_PQ_Q: NODE_BODY(pnode) = cnode; p->lower = 0; p->upper = 1; p->greedy = 1; - c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0; + c->lower = 1; c->upper = INFINITE_REPEAT; c->greedy = 0; return ; break; case RQ_ASIS: @@ -4158,7 +4181,7 @@ fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env) if (p == prev) { if (non_low != 0) goto invalid; - up = REPEAT_INFINITE; /* {n,} : {n,infinite} */ + up = INFINITE_REPEAT; /* {n,} : {n,infinite} */ } } else { @@ -4178,7 +4201,7 @@ fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env) } if (c != '}') goto invalid; - if (!IS_REPEAT_INFINITE(up) && low > up) { + if (!IS_INFINITE_REPEAT(up) && low > up) { /* {n,m}+ supported case */ if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL)) return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE; @@ -4959,7 +4982,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break; tok->type = TK_REPEAT; tok->u.repeat.lower = 0; - tok->u.repeat.upper = REPEAT_INFINITE; + tok->u.repeat.upper = INFINITE_REPEAT; goto greedy_check; break; @@ -4967,7 +4990,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break; tok->type = TK_REPEAT; tok->u.repeat.lower = 1; - tok->u.repeat.upper = REPEAT_INFINITE; + tok->u.repeat.upper = INFINITE_REPEAT; goto greedy_check; break; @@ -5358,10 +5381,8 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.backref.ref1 = back_num; } else { - num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs); + num = name_to_group_numbers(env, prev, name_end, &backs); if (num <= 0) { - onig_scan_env_set_error_string(env, - ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end); return ONIGERR_UNDEFINED_NAME_REFERENCE; } if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { @@ -5514,7 +5535,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) #endif tok->type = TK_REPEAT; tok->u.repeat.lower = 0; - tok->u.repeat.upper = REPEAT_INFINITE; + tok->u.repeat.upper = INFINITE_REPEAT; goto greedy_check; break; @@ -5525,7 +5546,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) #endif tok->type = TK_REPEAT; tok->u.repeat.lower = 1; - tok->u.repeat.upper = REPEAT_INFINITE; + tok->u.repeat.upper = INFINITE_REPEAT; goto greedy_check; break; @@ -5608,7 +5629,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.call.gnum = 0; tok->u.call.name = p; PINC; - if (! PPEEK_IS(')')) return ONIGERR_INVALID_GROUP_NAME; + if (! PPEEK_IS(')')) return ONIGERR_UNDEFINED_GROUP_OPTION; tok->u.call.name_end = p; break; @@ -6249,6 +6270,7 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) env->parse_depth++; if (env->parse_depth > ParseDepthLimit) return ONIGERR_PARSE_DEPTH_LIMIT_OVER; + prev_cc = (CClassNode* )NULL; r = fetch_token_in_cc(tok, src, end, env); if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) { @@ -6301,10 +6323,11 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) case TK_RAW_BYTE: /* tok->base != 0 : octal or hexadec. */ if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) { + int i, j; UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN; UChar* psave = p; - int i, base = tok->base; + int base = tok->base; buf[0] = tok->u.c; for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) { @@ -6322,6 +6345,9 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) goto err; } + /* clear buf tail */ + for (j = i; j < ONIGENC_CODE_TO_MBC_MAXLEN; j++) buf[j] = '\0'; + len = enclen(env->enc, buf); if (i < len) { r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; @@ -6359,8 +6385,13 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) val_entry: len = ONIGENC_CODE_TO_MBCLEN(env->enc, v); if (len < 0) { - r = len; - goto err; + if (state != CCS_RANGE || + ! IS_SYNTAX_BV(env->syntax, + ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC) || + v < 0x100 || ONIGENC_MBC_MAXLEN(env->enc) == 1) { + r = len; + goto err; + } } in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT); val_entry2: @@ -6673,7 +6704,7 @@ parse_callout_of_contents(Node** np, int cterm, UChar** src, UChar* end, ScanEnv } if (tag_start != tag_end) { - r = callout_tag_entry(env->reg, tag_start, tag_end, num); + r = callout_tag_entry(env, env->reg, tag_start, tag_end, num); if (r != ONIG_NORMAL) return r; } @@ -6994,7 +7025,7 @@ parse_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* en } if (tag_start != tag_end) { - r = callout_tag_entry(env->reg, tag_start, tag_end, num); + r = callout_tag_entry(env, env->reg, tag_start, tag_end, num); if (r != ONIG_NORMAL) return r; } @@ -7271,10 +7302,8 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, int num; int* backs; - num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs); + num = name_to_group_numbers(env, prev, name_end, &backs); if (num <= 0) { - onig_scan_env_set_error_string(env, - ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end); return ONIGERR_UNDEFINED_NAME_REFERENCE; } if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) { @@ -7414,6 +7443,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, } break; +#ifdef USE_CAPTURE_HISTORY case '@': if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) { if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { @@ -7441,6 +7471,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, return ONIGERR_UNDEFINED_GROUP_OPTION; } break; +#endif #ifdef USE_POSIXLINE_OPTION case 'p': @@ -7688,7 +7719,7 @@ set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env) if (targetq_num >= 0 && nestq_num < 0) { if (targetq_num == 1 || targetq_num == 2) { /* * or + */ /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */ - if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) { + if (! IS_INFINITE_REPEAT(qn->upper) && qn->upper > 1 && qn->greedy) { qn->upper = (qn->lower == 0 ? 1 : qn->lower); } } @@ -7826,14 +7857,18 @@ static int parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, ScanEnv* env, int group_head) { - int r, len, group = 0; + int r, len, group; Node* qn; Node** tp; + unsigned int parse_depth; + group = 0; *np = NULL; if (tok->type == (enum TokenSyms )term) goto end_of_token; + parse_depth = env->parse_depth; + switch (tok->type) { case TK_ALT: case TK_EOT: @@ -7914,36 +7949,29 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, len = 1; while (1) { if (len >= ONIGENC_MBC_MINLEN(env->enc)) { - if (len == enclen(env->enc, STR_(*np)->s)) {/* should not enclen_end() */ + if (len == enclen(env->enc, STR_(*np)->s)) { r = fetch_token(tok, src, end, env); - NODE_STRING_CLEAR_RAW(*np); - goto string_end; + goto tk_raw_byte_end; } } r = fetch_token(tok, src, end, env); if (r < 0) return r; - if (r != TK_RAW_BYTE) { - /* Don't use this, it is wrong for little endian encodings. */ -#ifdef USE_PAD_TO_SHORT_BYTE_CHAR - int rem; - if (len < ONIGENC_MBC_MINLEN(env->enc)) { - rem = ONIGENC_MBC_MINLEN(env->enc) - len; - (void )node_str_head_pad(STR_(*np), rem, (UChar )0); - if (len + rem == enclen(env->enc, STR_(*np)->s)) { - NODE_STRING_CLEAR_RAW(*np); - goto string_end; - } - } -#endif + if (r != TK_RAW_BYTE) return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; - } r = node_str_cat_char(*np, (UChar )tok->u.c); if (r < 0) return r; len++; } + + tk_raw_byte_end: + if (! ONIGENC_IS_VALID_MBC_STRING(env->enc, STR_(*np)->s, STR_(*np)->end)) + return ONIGERR_INVALID_WIDE_CHAR_VALUE; + + NODE_STRING_CLEAR_RAW(*np); + goto string_end; } break; @@ -8055,7 +8083,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, case TK_ANYCHAR_ANYTIME: *np = node_new_anychar(); CHECK_NULL_RETURN_MEMERR(*np); - qn = node_new_quantifier(0, REPEAT_INFINITE, 0); + qn = node_new_quantifier(0, INFINITE_REPEAT, 0); CHECK_NULL_RETURN_MEMERR(qn); NODE_BODY(qn) = *np; *np = qn; @@ -8158,6 +8186,10 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (is_invalid_quantifier_target(*tp)) return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID; + parse_depth++; + if (parse_depth > ParseDepthLimit) + return ONIGERR_PARSE_DEPTH_LIMIT_OVER; + qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper, r == TK_INTERVAL); CHECK_NULL_RETURN_MEMERR(qn); diff --git a/src/regparse.h b/src/regparse.h index b7a2867..231f7b5 100644 --- a/src/regparse.h +++ b/src/regparse.h @@ -66,11 +66,11 @@ enum GimmickType { #endif }; -enum BodyEmpty { - BODY_IS_NOT_EMPTY = 0, - BODY_IS_EMPTY = 1, - BODY_IS_EMPTY_MEM = 2, - BODY_IS_EMPTY_REC = 3 +enum BodyEmptyType { + BODY_IS_NOT_EMPTY = 0, + BODY_IS_EMPTY_POSSIBILITY = 1, + BODY_IS_EMPTY_POSSIBILITY_MEM = 2, + BODY_IS_EMPTY_POSSIBILITY_REC = 3 }; typedef struct { @@ -101,7 +101,7 @@ typedef struct { int lower; int upper; int greedy; - enum BodyEmpty empty_info; + enum BodyEmptyType emptiness; struct _Node* head_exact; struct _Node* next_head_exact; int is_refered; /* include called node. don't eliminate even if {0} */ @@ -252,10 +252,6 @@ typedef struct _Node { #define NODE_BIT_CALL NODE_TYPE2BIT(NODE_CALL) #define NODE_BIT_GIMMICK NODE_TYPE2BIT(NODE_GIMMICK) -#define NODE_IS_SIMPLE_TYPE(node) \ - ((NODE_TYPE2BIT(NODE_TYPE(node)) & \ - (NODE_BIT_STRING | NODE_BIT_CCLASS | NODE_BIT_CTYPE | NODE_BIT_BACKREF)) != 0) - #define NODE_TYPE(node) ((node)->u.base.node_type) #define NODE_SET_TYPE(node, ntype) (node)->u.base.node_type = (ntype) @@ -314,7 +310,7 @@ typedef struct _Node { #define NODE_ST_CLEN_FIXED (1<<2) #define NODE_ST_MARK1 (1<<3) #define NODE_ST_MARK2 (1<<4) -#define NODE_ST_STOP_BT_SIMPLE_REPEAT (1<<5) +#define NODE_ST_STRICT_REAL_REPEAT (1<<5) #define NODE_ST_RECURSION (1<<6) #define NODE_ST_CALLED (1<<7) #define NODE_ST_ADDR_FIXED (1<<8) @@ -357,8 +353,8 @@ typedef struct _Node { #define NODE_IS_SUPER(node) ((NODE_STATUS(node) & NODE_ST_SUPER) != 0) #define NODE_IS_PROHIBIT_RECURSION(node) \ ((NODE_STATUS(node) & NODE_ST_PROHIBIT_RECURSION) != 0) -#define NODE_IS_STOP_BT_SIMPLE_REPEAT(node) \ - ((NODE_STATUS(node) & NODE_ST_STOP_BT_SIMPLE_REPEAT) != 0) +#define NODE_IS_STRICT_REAL_REPEAT(node) \ + ((NODE_STATUS(node) & NODE_ST_STRICT_REAL_REPEAT) != 0) #define NODE_BODY(node) ((node)->u.base.body) #define NODE_QUANT_BODY(node) ((node)->body) diff --git a/src/utf16_be.c b/src/utf16_be.c index 22bf74d..b66d868 100644 --- a/src/utf16_be.c +++ b/src/utf16_be.c @@ -2,7 +2,7 @@ utf16_be.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -103,7 +103,25 @@ utf16be_mbc_enc_len(const UChar* p) static int is_valid_mbc_string(const UChar* s, const UChar* end) { - return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_UTF16_BE, s, end); + while (s < end) { + int len = utf16be_mbc_enc_len(s); + if (len == 4) { + if (s + 2 >= end) + return FALSE; + if (! UTF16_IS_SURROGATE_SECOND(*(s+2))) + return FALSE; + } + else + if (UTF16_IS_SURROGATE_SECOND(*s)) + return FALSE; + + s += len; + } + + if (s != end) + return FALSE; + else + return TRUE; } static int @@ -146,7 +164,15 @@ utf16be_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED) static int utf16be_code_to_mbclen(OnigCodePoint code) { - return (code > 0xffff ? 4 : 2); + if (code > 0xffff) { + if (code > 0x10ffff) + return ONIGERR_INVALID_CODE_POINT_VALUE; + else + return 4; + } + else { + return 2; + } } static int @@ -243,7 +269,8 @@ utf16be_left_adjust_char_head(const UChar* start, const UChar* s) s--; } - if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1) + if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1 && + UTF16_IS_SURROGATE_FIRST(*(s-2))) s -= 2; return (UChar* )s; diff --git a/src/utf16_le.c b/src/utf16_le.c index 4b231c6..cdc74b0 100644 --- a/src/utf16_le.c +++ b/src/utf16_le.c @@ -2,7 +2,7 @@ utf16_le.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -95,7 +95,15 @@ static const int EncLen_UTF16[] = { static int utf16le_code_to_mbclen(OnigCodePoint code) { - return (code > 0xffff ? 4 : 2); + if (code > 0xffff) { + if (code > 0x10ffff) + return ONIGERR_INVALID_CODE_POINT_VALUE; + else + return 4; + } + else { + return 2; + } } static int @@ -110,7 +118,16 @@ is_valid_mbc_string(const UChar* p, const UChar* end) const UChar* end1 = end - 1; while (p < end1) { - p += utf16le_mbc_enc_len(p); + int len = utf16le_mbc_enc_len(p); + if (len == 4) { + if (p + 3 < end && ! UTF16_IS_SURROGATE_SECOND(*(p + 3))) + return FALSE; + } + else + if (UTF16_IS_SURROGATE_SECOND(*(p + 1))) + return FALSE; + + p += len; } if (p != end) @@ -252,7 +269,8 @@ utf16le_left_adjust_char_head(const UChar* start, const UChar* s) s--; } - if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1) + if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1 && + UTF16_IS_SURROGATE_FIRST(*(s-1))) s -= 2; return (UChar* )s; diff --git a/test/test_utf8.c b/test/test_utf8.c index bab6b0d..2338526 100644 --- a/test/test_utf8.c +++ b/test/test_utf8.c @@ -1202,10 +1202,23 @@ extern int main(int argc, char* argv[]) x2("a{3,2}b", "aab", 0, 3); x2("a{3,2}?", "", 0, 0); /* == (?:a{3,2})?*/ x2("a{2,3}+a", "aaa", 0, 3); /* == (?:a{2,3})+*/ + x2("[\\x{0}-\\x{7fffffff}]", "a", 0, 1); + x2("[\\x{7f}-\\x{7fffffff}]", "\xe5\xae\xb6", 0, 3); + + n(" \xfd", ""); /* https://bugs.php.net/bug.php?id=77370 */ + /* can't use \xfc00.. because compiler error: hex escape sequence out of range */ + n("()0\\xfc00000\\xfc00000\\xfc00000\xfc", ""); /* https://bugs.php.net/bug.php?id=77371 */ + x2("000||0\xfa", "0", 0, 0); /* https://bugs.php.net/bug.php?id=77381 */ + e("(?i)000000000000000000000\xf0", "", ONIGERR_INVALID_CODE_POINT_VALUE); /* https://bugs.php.net/bug.php?id=77382 */ + n("0000\\\xf5", "0"); /* https://bugs.php.net/bug.php?id=77385 */ + n("(?i)FFF00000000000000000\xfd", ""); /* https://bugs.php.net/bug.php?id=77394 */ + x2("\\p{Common}", "\xe3\x8b\xbf", 0, 3); /* U+32FF */ x2("\\p{In_Enclosed_CJK_Letters_and_Months}", "\xe3\x8b\xbf", 0, 3); /* U+32FF */ + e("\\x{7fffffff}", "", ONIGERR_TOO_BIG_WIDE_CHAR_VALUE); + e("[\\x{7fffffff}]", "", ONIGERR_INVALID_CODE_POINT_VALUE); e("\\u040", "@", ONIGERR_INVALID_CODE_POINT_VALUE); e("(?\\g)", "zzzz", ONIGERR_NEVER_ENDING_RECURSION); e("(?<=(?>abc))", "abc", ONIGERR_INVALID_LOOK_BEHIND_PATTERN); diff --git a/test/testu.c b/test/testu.c index 4b053e5..397da95 100644 --- a/test/testu.c +++ b/test/testu.c @@ -116,28 +116,13 @@ static void xx(char* pattern, char* str, int from, int to, int mem, int not) #else regex_t* reg; - OnigCompileInfo ci; OnigErrorInfo einfo; uconv(pattern, cpat, ulen(pattern)); uconv(str, cstr, ulen(str)); -#if 0 r = onig_new(®, (UChar* )pattern, (UChar* )(pattern + ulen(pattern)), ONIG_OPTION_DEFAULT, ENC, ONIG_SYNTAX_DEFAULT, &einfo); -#else - ci.num_of_elements = 5; - ci.pattern_enc = ENC; - ci.target_enc = ENC; - ci.syntax = ONIG_SYNTAX_DEFAULT; - ci.option = ONIG_OPTION_DEFAULT; - ci.case_fold_flag = ONIGENC_CASE_FOLD_DEFAULT; - - r = onig_new_deluxe(®, (UChar* )pattern, - (UChar* )(pattern + ulen(pattern)), - &ci, &einfo); -#endif - if (r) { char s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str((UChar* )s, r, &einfo); -- cgit v1.2.3