diff options
44 files changed, 1351 insertions, 450 deletions
| @@ -32,6 +32,7 @@ Makefile.in  m4/*.m4  /coverage  /coverage.info +/fuzzers  # src/  /src/CaseFolding.txt @@ -62,3 +63,5 @@ m4/*.m4  /sample/count  /sample/bug_fix  /sample/log* + +/harnesses/utf16*.dict diff --git a/CMakeLists.txt b/CMakeLists.txt index f3eca6b..c59bfe3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,28 +1,19 @@  cmake_minimum_required(VERSION 3.1) -project(oniguruma VERSION 6.9.2) +project(oniguruma +  VERSION 6.9.3 +  LANGUAGES C)  set(PACKAGE onig)  set(PACKAGE_VERSION ${PROJECT_VERSION})  option(BUILD_SHARED_LIBS "Build shared libraries" ON)  option(ENABLE_POSIX_API  "Include POSIX API" ON) - -set(USE_CRNL_AS_LINE_TERMINATOR 0) -set(VERSION ${PACKAGE_VERSION}) -  if(MSVC) -  # Force to always compile with W4 -  if(CMAKE_CXX_FLAGS MATCHES "/W[0-4]") -    string(REGEX REPLACE "/W[0-4]" "/W4" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") -  else() -    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4") -  endif() -elseif(CMAKE_COMPILER_IS_GNUCXX) -  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") -elseif(CMAKE_COMPILER_IS_GNUCC) -  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall") +  option(MSVC_STATIC_RUNTIME "Build with static runtime" OFF)  endif() +set(USE_CRNL_AS_LINE_TERMINATOR 0) +set(VERSION ${PACKAGE_VERSION})  include(CheckCSourceCompiles)  include(CheckIncludeFiles) @@ -73,6 +64,26 @@ target_include_directories(onig PUBLIC    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src>    $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>) +if(MSVC) +  target_compile_options(onig PRIVATE +	#/W4 +	) +  if(MSVC_STATIC_RUNTIME) +	target_compile_options(onig PRIVATE +	  $<$<CONFIG:Release>:/MT> +	  $<$<CONFIG:Debug>:/MTd> +	  $<$<CONFIG:MinSizeRel>:/MT> +	  $<$<CONFIG:RelWithDebgInfo>:/MTd> +	  ) +	target_compile_definitions(onig PUBLIC -DONIG_STATIC) +  endif() +elseif(CMAKE_COMPILER_IS_GNUCC) +  target_compile_options(onig PRIVATE +	-Wall +	) +endif() + +  # Installation (https://github.com/forexample/package-example)  # Introduce variables: @@ -1,5 +1,19 @@  
History +2019/08/06: Version 6.9.3 (security fix release) + +2019/07/30: add ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE +2019/07/29: add STK_PREC_READ_START/END stack type +2019/07/29: Fix #147: Stack Exhaustion Problem caused by some parsing functions +2019/07/11: add a dictionary file for libfuzzer +2019/07/07: add harnesses directory +2019/07/05-2019/07/29: fix many problems found by libfuzzer programs +2019/06/27: deprecate onig_new_deluxe() +2019/06/27: Fix CVE-2019-13224: don't allow different encodings for onig_new_deluxe() +2019/06/27: Fix CVE-2019-13225: problem in converting if-then-else pattern + +2019/05/07: Version 6.9.2 (same as Release Candidate 3) +  2019/04/23: Release Candidate 3 for 6.9.2  2019/04/23: add doc/SYNTAX.md into distribution file  2019/04/09: Release Candidate 2 for 6.9.2 diff --git a/Makefile.am b/Makefile.am index 6045eae..a0bbc7b 100644 --- a/Makefile.am +++ b/Makefile.am @@ -39,6 +39,12 @@ pkgconfig_DATA = oniguruma.pc  all-test:  	cd test; make test +sanitize: +	make clean +	./configure CC=clang CFLAGS="-O -g -fsanitize=address" +	make +	make all-test +  cov:  	make lcov-clear  	cd test; make CFLAGS="--coverage" test @@ -27,46 +27,55 @@ Supported character encodings:  * doc/SYNTAX.md: contributed by seanofw -New feature of version 6.9.2 ------------------------------------ +Version 6.9.3 (security fix release) +------------------------------------ +* Fixed CVE-2019-13224 +* Fixed CVE-2019-13225 +* Fixed many problems (found by libfuzzer programs) + + +Version 6.9.2 (Reiwa) +--------------------- + +* add doc/SYNTAX.md  * Update Unicode version 12.1.0 -* NEW: Unicode Text Segment mode option (?y{g}) (?y{w}) +* NEW: Unicode Text Segment mode option (?y{g}) (?y{w})  (*original)    g: Extended Grapheme Cluster mode / w: Word mode    (Unicode Standard Annex #29 [http://unicode.org/reports/tr29/]) -New feature of version 6.9.1 --------------------------- +Version 6.9.1 +-------------  * Speed improvement (* especially UTF-8) -New feature 
of version 6.9.0 --------------------------- +Version 6.9.0 +-------------  * Update Unicode version 11.0.0  * NEW: add Emoji properties -New feature of version 6.8.2 --------------------------- +Version 6.8.2 +-------------  * Fix: #80 UChar in header causes issue  * NEW API: onig_set_callout_user_data_of_match_param()  (* omission in 6.8.0)  * add doc/CALLOUTS.API and doc/CALLOUTS.API.ja -New feature of version 6.8.1 --------------------------- +Version 6.8.1 +-------------  * Update shared library version to 5.0.0 for API incompatible changes from 6.7.1 -New feature of version 6.8.0 --------------------------- +Version 6.8.0 +-------------  * Retry-limit-in-match function enabled by default  * NEW: configure option --enable-posix-api=no  (* enabled by default) @@ -77,14 +86,14 @@ New feature of version 6.8.0  * Examples of Callouts program: [callout.c](sample/callout.c), [count.c](sample/count.c), [echo.c](sample/echo.c) -New feature of version 6.7.1 --------------------------- +Version 6.7.1 +-------------  * NEW: Mechanism of retry-limit-in-match (* disabled by default) -New feature of version 6.7.0 --------------------------- +Version 6.7.0 +-------------  * NEW: hexadecimal codepoint \uHHHH  * NEW: add ONIG_SYNTAX_ONIGURUMA (== ONIG_SYNTAX_DEFAULT) @@ -92,8 +101,8 @@ New feature of version 6.7.0  * Reduced size of object file -New feature of version 6.6.0 --------------------------- +Version 6.6.0 +-------------  * NEW: ASCII only mode options for character type/property (?WDSP)  * NEW: Extended Grapheme Cluster boundary \y, \Y @@ -101,8 +110,8 @@ New feature of version 6.6.0  * Range-clear (Absent-clear) operator restores previous range in retractions. 
-New feature of version 6.5.0 --------------------------- +Version 6.5.0 +-------------  * NEW: \K (keep)  * NEW: \R (general newline) \N (no newline) @@ -114,16 +123,16 @@ New feature of version 6.5.0  * NEW: Absent stopper (?~|absent)     (*original) -New feature of version 6.4.0 --------------------------- +Version 6.4.0 +-------------  * Fix fatal problem of endless repeat on Windows  * NEW: call zero (call the total regexp) \g<0>  * NEW: relative backref/call by positive number \k<+n>, \g<+n> -New feature of version 6.3.0 --------------------------- +Version 6.3.0 +-------------  * NEW: octal codepoint \o{.....}  * Fixed CVE-2017-9224 @@ -134,20 +143,20 @@ New feature of version 6.3.0  * Fixed CVE-2017-9229 -New feature of version 6.1.2 --------------------------- +Version 6.1.2 +-------------  * allow word bound, word begin and word end in look-behind.  * NEW option: ONIG_OPTION_CHECK_VALIDITY_OF_STRING -New feature of version 6.1 --------------------------- +Version 6.1 +-----------  * improved doc/RE  * NEW API: onig_scan() -New feature of version 6.0 --------------------------- +Version 6.0 +-----------  * Update Unicode 8.0 Property/Case-folding  * NEW API: onig_unicode_define_user_property() diff --git a/build_harnesses.sh b/build_harnesses.sh new file mode 100755 index 0000000..54dc9ff --- /dev/null +++ b/build_harnesses.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +make clean +autoreconf -vfi + +# build the library with ASAN +#NO_LINK="-fsanitize=fuzzer-no-link" +NO_LINK="" +./configure CC=clang LD=clang CFLAGS="-g -fsanitize=address -fno-omit-frame-pointer $NO_LINK" LDFLAGS="-g -fsanitize=address -fno-omit-frame-pointer $NO_LINK" +make -j4 + +OUT=`pwd`/fuzzers +mkdir -p $OUT +LIBFUZZER_FLAGS="-fsanitize=fuzzer,address -fno-omit-frame-pointer" +#LIBS="src/.libs/libonig.a" +LIBS="src/.libs/libonig.a /usr/local/lib/libLLVMFuzzerMain.a" + +CFLAGS="-Isrc -g $LIBFUZZER_FLAGS" + +# Libfuzzer builds +clang++ contributed/libfuzzer-onig.cpp $LIBS $CFLAGS -o 
$OUT/libfuzzer-onig +clang harnesses/syntax-harness.c $LIBS $CFLAGS -o $OUT/syntax-libfuzzer +clang harnesses/encode-harness.c $LIBS $CFLAGS -o $OUT/encode-libfuzzer +clang harnesses/deluxe-encode-harness.c $LIBS $CFLAGS -o $OUT/deluxe-encode-libfuzzer + +clang -DUTF16_BE harnesses/encode-harness.c $LIBS $CFLAGS -o $OUT/utf16-be-libfuzzer +clang -DUTF16_LE harnesses/encode-harness.c $LIBS $CFLAGS -o $OUT/utf16-le-libfuzzer +clang -DWITH_READ_MAIN harnesses/encode-harness.c src/.libs/libonig.a $CFLAGS -o $OUT/main-encode +clang -DWITH_READ_MAIN -DUTF16_LE harnesses/encode-harness.c src/.libs/libonig.a $CFLAGS -o $OUT/main-utf16-le +clang -DWITH_READ_MAIN -DUTF16_BE harnesses/encode-harness.c src/.libs/libonig.a $CFLAGS -o $OUT/main-utf16-be +clang -DWITH_READ_MAIN harnesses/deluxe-encode-harness.c $LIBS $CFLAGS -o $OUT/main-deluxe-encode diff --git a/configure.ac b/configure.ac index 010a0d8..62c9fa5 100644 --- a/configure.ac +++ b/configure.ac @@ -1,5 +1,5 @@  dnl Process this file with autoconf to produce a configure script. 
-AC_INIT(onig, 6.9.2) +AC_INIT(onig, 6.9.3)  AC_CONFIG_MACRO_DIR([m4]) diff --git a/contributed/libfuzzer-onig.cpp b/contributed/libfuzzer-onig.cpp index e137b73..526c826 100644 --- a/contributed/libfuzzer-onig.cpp +++ b/contributed/libfuzzer-onig.cpp @@ -29,6 +29,8 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size)  #ifdef FULL_TEST    onig_initialize(&enc, 1); +  onig_set_retry_limit_in_match(120); +  onig_set_parse_depth_limit(120);  #endif    if (onig_new(&reg, Data, Data + Size, ONIG_OPTION_DEFAULT, enc, diff --git a/debian/watch b/debian/watch index 8a7b475..2f0e85f 100644 --- a/debian/watch +++ b/debian/watch @@ -4,4 +4,4 @@ dversionmangle=s/\+(debian|dfsg|ds|deb)\d*$//,\  uversionmangle=s/(\d)[_\.\-\+]?((RC|rc|pre|dev|beta|alpha)\d*)$/$1~$2/;s/RC/rc/;s/\-/\./g;s/\_/\./g,\  filenamemangle=s/(?:.*?)?(?:rel|v|oniguruma|ONIGURUMA)?[\-\_]?(\d\S+)\.(tgz|tbz|txz|(?:tar\.(?:gz|bz2|xz)))/oniguruma-$1.$2/ \  https://github.com/kkos/oniguruma/tags \ -(?:.*?/)?(?:rel|v|oniguruma|ONIGURUMA)?[\-\_]?(\d\S+)\.(?:tgz|tbz|txz|(?:tar\.(?:gz|bz2|xz))) \ +(?:.*?/)?(?:rel|v|oniguruma|ONIGURUMA)?[\-\_]?(\d\S+)\.(?:tgz|tbz|txz|(?:tar\.(?:gz|bz2|xz)))  @@ -1,4 +1,4 @@ -Oniguruma API  Version 6.9.2  2019/03/25 +Oniguruma API  Version 6.9.3  2019/07/06  #include <oniguruma.h> @@ -168,6 +168,9 @@ Oniguruma API  Version 6.9.2  2019/03/25  # int onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end,                        OnigCompileInfo* ci, OnigErrorInfo* einfo) +  This function is deprecated, and it does not allow the case where +  the encoding of pattern and target is different. +    Create a regex object.    This function is deluxe version of onig_new(). @@ -299,6 +302,7 @@ Oniguruma API  Version 6.9.2  2019/03/25                     const UChar* range, OnigRegion* region, OnigOptionType option)    Search string and return search result and matching region. +  Do not pass invalid byte string in the regex character encoding.    
normal return: match position offset (i.e.  p - str >= 0)    not found:     ONIG_MISMATCH (< 0) @@ -323,15 +327,19 @@ Oniguruma API  Version 6.9.2  2019/03/25                     const UChar* start, const UChar* range, OnigRegion* region,                     OnigOptionType option, OnigMatchParam* mp) -   arguments -   1-7:  same as onig_search() -   8 mp: match parameter values (match_stack_limit, retry_limit_in_match) +  Search string and return search result and matching region. +  Do not pass invalid byte string in the regex character encoding. + +  arguments +  1-7:  same as onig_search() +  8 mp: match parameter values (match_stack_limit, retry_limit_in_match)  # int onig_match(regex_t* reg, const UChar* str, const UChar* end, const UChar* at,                   OnigRegion* region, OnigOptionType option)    Match string and return result and matching region. +  Do not pass invalid byte string in the regex character encoding.    normal return: match length  (>= 0)    not match:     ONIG_MISMATCH ( < 0) @@ -353,6 +361,9 @@ Oniguruma API  Version 6.9.2  2019/03/25                              const UChar* at, OnigRegion* region,                              OnigOptionType option, OnigMatchParam* mp) +  Match string and return result and matching region. +  Do not pass invalid byte string in the regex character encoding. +     arguments     1-6:  same as onig_match()     7 mp: match parameter values (match_stack_limit, retry_limit_in_match) @@ -364,6 +375,7 @@ Oniguruma API  Version 6.9.2  2019/03/25                  void* callback_arg)    Scan string and callback with matching region. +  Do not pass invalid byte string in the regex character encoding.    normal return: number of matching times    error:         error code @@ -611,14 +623,20 @@ Oniguruma API  Version 6.9.2  2019/03/25  # int onigenc_strlen(OnigEncoding enc, const UChar* s, const UChar* end) + +  Return number of characters in the string. 
+ +  # int onigenc_strlen_null(OnigEncoding enc, const UChar* s)    Return number of characters in the string. +  Do not pass invalid byte string in the character encoding.  # int onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s)    Return number of bytes in the string. +  Do not pass invalid byte string in the character encoding.  # int onig_set_default_syntax(OnigSyntaxType* syntax) @@ -1,4 +1,4 @@ -鬼車インターフェース Version 6.9.2   2019/03/29 +鬼車インターフェース Version 6.9.3   2019/07/06  #include <oniguruma.h> @@ -167,6 +167,9 @@  # int onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end,                        OnigCompileInfo* ci, OnigErrorInfo* einfo) +  この関数は廃止予定。 +  パターンと対象文字列の文字エンコーディングが異なる場合を許さなくなった。 +    正規表現オブジェクト(regex)を作成する。    この関数は、onig_new()のデラックス版。 @@ -298,6 +301,7 @@                     const UChar* range, OnigRegion* region, OnigOptionType option)    正規表現で文字列を検索し、検索結果とマッチ領域を返す。 +  正規表現オブジェクトの文字エンコーディングで、検索文字列として不正な文字列を渡してはいけない。    正常終了戻り値: マッチ位置 (p - str >= 0)    検索失敗:       ONIG_MISMATCH (< 0) @@ -322,6 +326,9 @@                     const UChar* start, const UChar* range, OnigRegion* region,                     OnigOptionType option, OnigMatchParam* mp) +  正規表現で文字列を検索し、検索結果とマッチ領域を返す。 +  正規表現オブジェクトの文字エンコーディングで、検索文字列として不正な文字列を渡してはいけない。 +    引数    1-7:  onig_search()と同じ    8 mp: マッチパラメタ値 (match_stack_limit, retry_limit_in_match) @@ -331,6 +338,7 @@                   const UChar* at, OnigRegion* region, OnigOptionType option)    文字列の指定位置でマッチングを行い、結果とマッチ領域を返す。 +  正規表現オブジェクトの文字エンコーディングで、検索文字列として不正な文字列を渡してはいけない。    正常終了戻り値: マッチしたバイト長 (>= 0)    not match:      ONIG_MISMATCH      ( < 0) @@ -352,6 +360,9 @@                              const UChar* at, OnigRegion* region,                              OnigOptionType option, OnigMatchParam* mp) +  文字列の指定位置でマッチングを行い、結果とマッチ領域を返す。 +  正規表現オブジェクトの文字エンコーディングで、検索文字列として不正な文字列を渡してはいけない。 +    引数    1-6:  onig_match()と同じ    7 mp: マッチパラメタ値 (match_stack_limit, retry_limit_in_match) @@ -363,6 +374,7 
@@                  void* callback_arg)    正規表現で文字列をスキャンして、マッチングする毎にコールバック関数を呼び出す。 +  正規表現オブジェクトの文字エンコーディングで、検索文字列として不正な文字列を渡してはいけない。    正常終了: マッチ回数 (0回も含める)    エラー:   エラーコード (< 0) @@ -616,14 +628,20 @@  # int onigenc_strlen(OnigEncoding enc, const UChar* s, const UChar* end) + +  文字列の文字数を返す。 + +  # int onigenc_strlen_null(OnigEncoding enc, const UChar* s)    文字列の文字数を返す。 +  文字エンコーディングに対して、不正な文字列を渡してはいけない。  # int onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s)    文字列のバイト数を返す。 +  文字エンコーディングに対して、不正な文字列を渡してはいけない。  # int onig_set_default_syntax(OnigSyntaxType* syntax) diff --git a/doc/UNICODE_PROPERTIES b/doc/UNICODE_PROPERTIES index 1148b4d..ff2a6ce 100644 --- a/doc/UNICODE_PROPERTIES +++ b/doc/UNICODE_PROPERTIES @@ -1,4 +1,4 @@ -Unicode Properties (from Unicode Version: 12.0.0) +Unicode Properties (from Unicode Version: 12.1.0)   15: ASCII_Hex_Digit   16: Adlam diff --git a/harnesses/ascii_compatible.dict b/harnesses/ascii_compatible.dict new file mode 100644 index 0000000..820bf47 --- /dev/null +++ b/harnesses/ascii_compatible.dict @@ -0,0 +1,111 @@ +# First-pass fuzzing dictionary for Oniguruma by Mark Griffin +"\\o{17777777777}" +"\\777" +"\\u" +"\\uFFFF" +"\\xFF" +"\\x{70000000}" +"\\C-" +"\\M-\\C-" +"\\X" +"\\p{" +"\\p{^" +"}" +"]" +")" +"\\n" +"\\r" +"\\R" +"\\W" +"\\w" +"\\s" +"\\S" +"\\d" +"\\O" +"\\X" +"\\b" +"\\y" +"\\Y" +"\\A" +"\\z" +"\\K" +"\\G" +"\\p{Print}" +"\\p{ASCII}" +"\\p{Alnum}" +"{0,2}" +"{3,}" +"{,3}" +"{5}" +"{4,2}" +"??" +"*?" +"+?" +"*+" +"{1,3}+" +"(?>" +"\\B" +"(?y{" +"[abcd1-9]" +"[\\w\\d" +"[\\p{Alphabetic}" +"[\\P{Arabic}" +"[\\x{ffff}" +"[a-w&&" +"[^" +"[:graph:]" +"[^:cntrl:]" +"(?i:" +"(?i)" +"(?m:" +"(?x:" +"(?W:" +"(?y-:" +"(?y{w}:" +"(?P:" +"(?#" +"(?:" +"(?=" +"(?!" +"(?<=" +"(?<!" +"(?>" +"(?<name>" +"(?{" +"(?{....}[x])" +"(?{.}[x]>)" +"(?{{{.}}})" +"(?~" +"(?~a)" +"(?~|a|.*)" +"(?~|(?:a|b))" +"(?~|)" +"(?(.) 
|.)" +"(?('-n'))" +"(?(n+0))" +"(?(n+1))" +"(?(n-1))" +"(?(<name+0>))" +"(?(<name+1>))" +"(?(<name-1>))" +"(*ERROR{-2000})" +"(*COUNT[tag]{X})" +"\\1" +"\\2" +"\\k<name>" +"\\k<1>" +"\\k<2>" +"\\k<-1>" +"\\k<-2>" +"\\k<name+0>" +"\\k<name+1>" +"\\k<name-1>" +"\\g<-1>" +"\\g<name>" +"name" +"(?<name>a|b\\g<name>c)" +"(?-i:\\g<name>)" +"\\N{name}" +"\\p{Hiragana}" +"\\p{Katakana}" +"\\p{Emoji}" diff --git a/harnesses/deluxe-encode-harness.c b/harnesses/deluxe-encode-harness.c new file mode 100644 index 0000000..e1f84a5 --- /dev/null +++ b/harnesses/deluxe-encode-harness.c @@ -0,0 +1,239 @@ +/* + * deluxe-encode-harness.c + * contributed by Mark Griffin + */ +#include <stdio.h> +#include "oniguruma.h" + +#include <stdlib.h> +#include <string.h> + +#define DEFAULT_LIMIT 120 +typedef unsigned char uint8_t; + +static int +search(regex_t* reg, unsigned char* str, unsigned char* end) +{ +  int r; +  unsigned char *start, *range; +  OnigRegion *region; + +  region = onig_region_new(); + +  start = str; +  range = end; +  r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE); +  if (r >= 0) { +    int i; + +    fprintf(stdout, "match at %d  (%s)\n", r, +            ONIGENC_NAME(onig_get_encoding(reg))); +    for (i = 0; i < region->num_regs; i++) { +      fprintf(stdout, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]); +    } +  } +  else if (r == ONIG_MISMATCH) { +    fprintf(stdout, "search fail (%s)\n", +            ONIGENC_NAME(onig_get_encoding(reg))); +  } +  else { /* error */ +    char s[ONIG_MAX_ERROR_MESSAGE_LEN]; +    onig_error_code_to_str((UChar* )s, r); +    fprintf(stdout, "ERROR: %s\n", s); +    fprintf(stdout, "  (%s)\n", ONIGENC_NAME(onig_get_encoding(reg))); +    onig_region_free(region, 1 /* 1:free self, 0:free contents only */); +    return -1; +  } + +  onig_region_free(region, 1 /* 1:free self, 0:free contents only */); +  return 0; +} + +static int +exec(OnigEncoding enc, OnigOptionType options, +     char* apattern, char* 
apattern_end, char* astr, char* astr_end) +{ +  int r; +  regex_t* reg; +  OnigErrorInfo einfo; +  UChar* pattern = (UChar* )apattern; +  UChar* str     = (UChar* )astr; +  UChar* pattern_end = (UChar* )apattern_end; +  unsigned char *end = (unsigned char* )astr_end; + +  onig_initialize(&enc, 1); +  onig_set_retry_limit_in_match(DEFAULT_LIMIT); +  onig_set_parse_depth_limit(DEFAULT_LIMIT); + +  r = onig_new(&reg, pattern, pattern_end, +               options, enc, ONIG_SYNTAX_DEFAULT, &einfo); +  if (r != ONIG_NORMAL) { +    char s[ONIG_MAX_ERROR_MESSAGE_LEN]; +    onig_error_code_to_str((UChar* )s, r, &einfo); +    fprintf(stdout, "ERROR: %s\n", s); +    onig_end(); +    return -1; +  } + +  r = search(reg, str, end); + +  onig_free(reg); +  onig_end(); +  return 0; +} + +static OnigCaseFoldType CF = ONIGENC_CASE_FOLD_MIN; + +static int +exec_deluxe(OnigEncoding pattern_enc, OnigEncoding str_enc, +            OnigOptionType options, char* apattern, char* apattern_end, +            char* astr, char* astr_end) +{ +  int r; +  regex_t* reg; +  OnigCompileInfo ci; +  OnigErrorInfo einfo; +  UChar* pattern = (UChar* )apattern; +  UChar* str     = (UChar* )astr; +  UChar* pattern_end = (UChar* )apattern_end; +  unsigned char* end = (unsigned char* )astr_end; + +  onig_initialize(&str_enc, 1); +  onig_set_retry_limit_in_match(DEFAULT_LIMIT); +  onig_set_parse_depth_limit(DEFAULT_LIMIT); + +  ci.num_of_elements = 5; +  ci.pattern_enc = pattern_enc; +  ci.target_enc  = str_enc; +  ci.syntax      = ONIG_SYNTAX_DEFAULT; +  ci.option      = options; +  ci.case_fold_flag  = CF; + +  r = onig_new_deluxe(&reg, pattern, pattern_end, &ci, &einfo); +  if (r != ONIG_NORMAL) { +    char s[ONIG_MAX_ERROR_MESSAGE_LEN]; +    onig_error_code_to_str((UChar* )s, r, &einfo); +    fprintf(stdout, "ERROR: %s\n", s); +    onig_end(); +    return -1; +  } + +  if (onigenc_is_valid_mbc_string(str_enc, str, end) != 0) { +    r = search(reg, str, end); +  } + +  onig_free(reg); +  onig_end(); +  return 
0; +} + +#define PATTERN_SIZE 48  +#define NUM_CONTROL_BYTES 1 +#define MIN_STR_SIZE  2 +int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) +{ +  int r; +  size_t remaining_size; +  unsigned char *data; +  unsigned char pat_encoding_choice; +  unsigned char str_encoding_choice; +  unsigned char *pattern; +  unsigned char *str; +  unsigned char *pattern_end; +  unsigned char *str_end; +  unsigned int num_encodings; +  OnigEncodingType *pattern_enc; +  OnigEncodingType *str_enc; + +  OnigEncodingType *encodings[] = { +    ONIG_ENCODING_ASCII, +    ONIG_ENCODING_ISO_8859_1, +    ONIG_ENCODING_ISO_8859_2, +    ONIG_ENCODING_ISO_8859_3, +    ONIG_ENCODING_ISO_8859_4, +    ONIG_ENCODING_ISO_8859_5, +    ONIG_ENCODING_ISO_8859_6, +    ONIG_ENCODING_ISO_8859_7, +    ONIG_ENCODING_ISO_8859_8, +    ONIG_ENCODING_ISO_8859_9, +    ONIG_ENCODING_ISO_8859_10, +    ONIG_ENCODING_ISO_8859_11, +    ONIG_ENCODING_ISO_8859_13, +    ONIG_ENCODING_ISO_8859_14, +    ONIG_ENCODING_ISO_8859_15, +    ONIG_ENCODING_ISO_8859_16, +    ONIG_ENCODING_UTF8, +    ONIG_ENCODING_UTF16_BE, +    ONIG_ENCODING_UTF16_LE, +    ONIG_ENCODING_UTF32_BE, +    ONIG_ENCODING_UTF32_LE, +    ONIG_ENCODING_EUC_JP, +    ONIG_ENCODING_EUC_TW, +    ONIG_ENCODING_EUC_KR, +    ONIG_ENCODING_EUC_CN, +    ONIG_ENCODING_SJIS, +    //ONIG_ENCODING_KOI8, +    ONIG_ENCODING_KOI8_R, +    ONIG_ENCODING_CP1251, +    ONIG_ENCODING_BIG5, +    ONIG_ENCODING_GB18030, +  }; + +  if (Size <= (NUM_CONTROL_BYTES + PATTERN_SIZE + MIN_STR_SIZE)) +    return 0; +  if (Size > 0x1000) +    return 0; + +  remaining_size = Size; +  data = (unsigned char *)(Data); + +  // pull off bytes to switch off +  pat_encoding_choice = data[0]; +  data++; +  remaining_size--; +  str_encoding_choice = data[0]; +  data++; +  remaining_size--; + +  // copy first PATTERN_SIZE bytes off to be the pattern +  pattern = (unsigned char *)malloc(PATTERN_SIZE+4); +  memset(pattern, 0, PATTERN_SIZE+4); +  memcpy(pattern, data, PATTERN_SIZE); +  
pattern_end = pattern + PATTERN_SIZE; +  data += PATTERN_SIZE; +  remaining_size -= PATTERN_SIZE; + +  str = (unsigned char*)malloc(remaining_size+4); +  memset(str, 0, remaining_size+4); +  memcpy(str, data, remaining_size); +  str_end = str + remaining_size; + +  num_encodings = sizeof(encodings) / sizeof(encodings[0]); +  pattern_enc = encodings[pat_encoding_choice % num_encodings]; +  str_enc = encodings[str_encoding_choice % num_encodings]; + +  r = exec_deluxe(pattern_enc, str_enc, ONIG_OPTION_NONE, (char *)pattern, (char *)pattern_end, (char *)str, (char *)str_end); + +  free(pattern); +  free(str); + +  return r; +} + + +#ifdef WITH_READ_MAIN + +#include <unistd.h> + +extern int main(int argc, char* argv[]) +{ +  size_t n; +  uint8_t Data[10000]; + +  n = read(0, Data, sizeof(Data)); +  fprintf(stdout, "n: %ld\n", n); +  LLVMFuzzerTestOneInput(Data, n); + +  return 0; +} +#endif /* WITH_READ_MAIN */ diff --git a/harnesses/dict_conv.py b/harnesses/dict_conv.py new file mode 100644 index 0000000..f721293 --- /dev/null +++ b/harnesses/dict_conv.py @@ -0,0 +1,72 @@ +# -*- coding: utf-8 -*- +# dict_conv.py  (Python3 script) + +import sys + +ENC_UTF16_BE = 1 +ENC_UTF16_LE = 2 + +def add_char(enc, s, c): +  if enc == ENC_UTF16_BE: +    s += "\\x00" + +  s += c +  if enc == ENC_UTF16_LE: +    s += "\\x00" + +  return s + +def conv(enc, s): +  n = len(s) +  r = "" +  i = 0 +  while i < n: +    c = s[i] +    if c == '\\': +      c = s[i+1] +      if c == '\\' or c == '"': +        r = add_char(enc, r, "\\" + c) +        i += 2 +        continue +      else: +        raise("Unknown escape {0}".format(s)) + +    r = add_char(enc, r, c) +    i += 1 + +  return r + +def main(enc): +  print("# This file was generated by dict_conv.py.") +  for line in sys.stdin: +    s = line.strip() +    if s[0] == '#': +      print(s) +      continue + +    if s[0] == '"' and s[-1] == '"': +      s = conv(enc, s[1:-1]) +      print("\"{0}\"".format(s)) +    else: +      raise("Invalid 
format {0}".format(s)) + +def usage(argv): +  raise RuntimeError("Usage: python {0} utf16_be/utf16_le".format(argv[0])) + + +if __name__ == "__main__": +  argv = sys.argv +  argc = len(argv) + +  if argc >= 2: +    s = argv[1] +    if s == 'utf16_be': +      enc = ENC_UTF16_BE +    elif s == 'utf16_le': +      enc = ENC_UTF16_LE +    else: +      usage(argv) +  else: +    usage(argv) + +  main(enc) diff --git a/harnesses/encode-harness.c b/harnesses/encode-harness.c new file mode 100644 index 0000000..e57fd4f --- /dev/null +++ b/harnesses/encode-harness.c @@ -0,0 +1,170 @@ +/* + * encode-harness.c + * contributed by Mark Griffin + */ +#include <stdio.h> +#include "oniguruma.h" + +#include <stdlib.h> +#include <string.h> + +#define PARSE_DEPTH_LIMIT   120 +#define RETRY_LIMIT        4000 + +typedef unsigned char uint8_t; + +static int +search(regex_t* reg, unsigned char* str, unsigned char* end) +{ +  int r; +  unsigned char *start, *range; +  OnigRegion *region; + +  region = onig_region_new(); + +  start = str; +  range = end; +  r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE); +  if (r >= 0) { +    int i; + +    fprintf(stdout, "match at %d  (%s)\n", r, +            ONIGENC_NAME(onig_get_encoding(reg))); +    for (i = 0; i < region->num_regs; i++) { +      fprintf(stdout, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]); +    } +  } +  else if (r == ONIG_MISMATCH) { +    fprintf(stdout, "search fail (%s)\n", +            ONIGENC_NAME(onig_get_encoding(reg))); +  } +  else { /* error */ +    char s[ONIG_MAX_ERROR_MESSAGE_LEN]; +    onig_error_code_to_str((UChar* )s, r); +    fprintf(stdout, "ERROR: %s\n", s); +    fprintf(stdout, "  (%s)\n", ONIGENC_NAME(onig_get_encoding(reg))); +    onig_region_free(region, 1 /* 1:free self, 0:free contents only */); +    return -1; +  } + +  onig_region_free(region, 1 /* 1:free self, 0:free contents only */); +  return 0; +} + +static int +exec(OnigEncoding enc, OnigOptionType options, +     char* 
apattern, char* apattern_end, char* astr, UChar* end) +{ +  int r; +  regex_t* reg; +  OnigErrorInfo einfo; +  UChar* pattern = (UChar* )apattern; +  UChar* str     = (UChar* )astr; +  UChar* pattern_end = (UChar* )apattern_end; + +  onig_initialize(&enc, 1); +  onig_set_retry_limit_in_match(RETRY_LIMIT); +  onig_set_parse_depth_limit(PARSE_DEPTH_LIMIT); + +  r = onig_new(&reg, pattern, pattern_end, +               options, enc, ONIG_SYNTAX_DEFAULT, &einfo); +  if (r != ONIG_NORMAL) { +    char s[ONIG_MAX_ERROR_MESSAGE_LEN]; +    onig_error_code_to_str((UChar* )s, r, &einfo); +    fprintf(stdout, "ERROR: %s\n", s); +    onig_end(); +    return -1; +  } + +  if (onigenc_is_valid_mbc_string(enc, str, end) != 0) { +    r = search(reg, str, end); +  } + +  onig_free(reg); +  onig_end(); +  return 0; +} + +#define PATTERN_SIZE 32 +#define NUM_CONTROL_BYTES 1 +#define MIN_STR_SIZE  1 +int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) +{ +  if (Size <= (NUM_CONTROL_BYTES + PATTERN_SIZE + MIN_STR_SIZE)) +    return 0; +  if (Size > 0x1000) +    return 0; + +  unsigned char *pattern_end; +  unsigned char *str_null_end; + +  size_t remaining_size = Size; +  unsigned char *data = (unsigned char *)(Data); + +  // pull off one byte to switch off +  unsigned char encoding_choice = data[0]; +  data++; +  remaining_size--; + +  // copy first PATTERN_SIZE bytes off to be the pattern +  unsigned char *pattern = (unsigned char *)malloc(PATTERN_SIZE+4); +  memset(pattern, 0, PATTERN_SIZE+4); +  memcpy(pattern, data, PATTERN_SIZE); +  pattern_end = pattern + PATTERN_SIZE; +  data += PATTERN_SIZE; +  remaining_size -= PATTERN_SIZE; + +  unsigned char *str = (unsigned char*)malloc(remaining_size+4); +  memset(str, 0, remaining_size+4); +  memcpy(str, data, remaining_size); +  str_null_end = str + remaining_size; + +  int r; +  OnigEncodingType *encodings[] = { +	  ONIG_ENCODING_SJIS, +	  ONIG_ENCODING_EUC_JP, +	  ONIG_ENCODING_CP1251, +	  ONIG_ENCODING_ISO_8859_1, +	  
ONIG_ENCODING_UTF8, +    ONIG_ENCODING_KOI8_R, +    ONIG_ENCODING_BIG5 +  }; + +  OnigEncodingType *enc; + +#ifdef UTF16_BE +  enc = ONIG_ENCODING_UTF16_BE; +#else +#ifdef UTF16_LE +  enc = ONIG_ENCODING_UTF16_LE; +#else +  int num_encodings = sizeof(encodings)/sizeof(encodings[0]); +  enc = encodings[encoding_choice % num_encodings]; +#endif +#endif + +  r = exec(enc, ONIG_OPTION_NONE, (char *)pattern, (char *)pattern_end, +           (char *)str, str_null_end); + +  free(pattern); +  free(str); + +  return r; +} + +#ifdef WITH_READ_MAIN + +#include <unistd.h> + +extern int main(int argc, char* argv[]) +{ +  size_t n; +  uint8_t Data[10000]; + +  n = read(0, Data, sizeof(Data)); +  fprintf(stdout, "n: %ld\n", n); +  LLVMFuzzerTestOneInput(Data, n); + +  return 0; +} +#endif /* WITH_READ_MAIN */ diff --git a/harnesses/syntax-harness.c b/harnesses/syntax-harness.c new file mode 100644 index 0000000..0fb3587 --- /dev/null +++ b/harnesses/syntax-harness.c @@ -0,0 +1,120 @@ +/* + * syntax-harness.c + * contributed by Mark Griffin + */ +#include <stdio.h> +#include <string.h> +#include "oniguruma.h" + +#include <stdlib.h> + +#define DEFAULT_LIMIT 120 +typedef unsigned char uint8_t; + +extern int exec(OnigSyntaxType* syntax, char* apattern, char* astr) +{ +  int r; +  unsigned char *start, *range, *end; +  regex_t* reg; +  OnigErrorInfo einfo; +  OnigRegion *region; +  UChar* pattern = (UChar* )apattern; +  UChar* str     = (UChar* )astr; + +  r = onig_new(&reg, pattern, pattern + strlen((char* )pattern), +               ONIG_OPTION_DEFAULT, ONIG_ENCODING_ASCII, syntax, &einfo); +  if (r != ONIG_NORMAL) { +    char s[ONIG_MAX_ERROR_MESSAGE_LEN]; +    onig_error_code_to_str((UChar* )s, r, &einfo); +    fprintf(stdout, "ERROR: %s\n", s); +    return -1; +  } + +  region = onig_region_new(); + +  end   = str + strlen((char* )str); +  start = str; +  range = end; +  r = onig_search(reg, str, end, start, range, region, ONIG_OPTION_NONE); +  if (r >= 0) { +    int i; + +    
fprintf(stdout, "match at %d\n", r); +    for (i = 0; i < region->num_regs; i++) { +      fprintf(stdout, "%d: (%d-%d)\n", i, region->beg[i], region->end[i]); +    } +  } +  else if (r == ONIG_MISMATCH) { +    fprintf(stdout, "search fail\n"); +  } +  else { /* error */ +    char s[ONIG_MAX_ERROR_MESSAGE_LEN]; +    onig_error_code_to_str((UChar* )s, r); +    fprintf(stdout, "ERROR: %s\n", s); +    onig_region_free(region, 1 /* 1:free self, 0:free contents only */); +    onig_free(reg); +    return -1; +  } + +  onig_region_free(region, 1 /* 1:free self, 0:free contents only */); +  onig_free(reg); +  return 0; +} + +#define PATTERN_SIZE 64 +#define NUM_CONTROL_BYTES 1 +#define MIN_STR_SIZE  1 +int LLVMFuzzerTestOneInput(const uint8_t * Data, size_t Size) +{ +  if (Size <= (NUM_CONTROL_BYTES + PATTERN_SIZE + MIN_STR_SIZE)) +    return 0; +  if (Size > 0x1000) +    return 0; +  size_t remaining_size = Size; +  unsigned char *data = (unsigned char *)(Data); + +  // pull off one byte to switch syntax choice +  unsigned char syntax_choice = data[0]; +  data++; +  remaining_size--; + +  // copy first PATTERN_SIZE bytes off to be the pattern +  unsigned char *pattern = (unsigned char *)malloc(PATTERN_SIZE+1); +  memset(pattern, 0, PATTERN_SIZE+1); +  memcpy(pattern, data, PATTERN_SIZE); +  data += PATTERN_SIZE; +  remaining_size -= PATTERN_SIZE; + +  unsigned char *str = (unsigned char*)malloc(remaining_size+1); +  memset(str, 0, remaining_size+1); +  memcpy(str, data, remaining_size); +   +  OnigEncoding use_encs[] = { ONIG_ENCODING_ASCII }; +  onig_initialize(use_encs, sizeof(use_encs)/sizeof(use_encs[0])); + +  onig_set_retry_limit_in_match(DEFAULT_LIMIT); +  onig_set_parse_depth_limit(DEFAULT_LIMIT); + +  OnigSyntaxType *syntaxes[] = { +    ONIG_SYNTAX_POSIX_EXTENDED, +    ONIG_SYNTAX_EMACS, +    ONIG_SYNTAX_GREP, +    ONIG_SYNTAX_GNU_REGEX, +    ONIG_SYNTAX_JAVA, +    ONIG_SYNTAX_PERL_NG, +    ONIG_SYNTAX_RUBY, +    ONIG_SYNTAX_ONIGURUMA, +  };  +  OnigSyntaxType 
*syntax = syntaxes[syntax_choice % 8]; +   +  int r; +  r = exec(syntax, (char *)pattern, (char *)str); +  // r = exec(ONIG_SYNTAX_JAVA, "\\p{XDigit}\\P{XDigit}[a-c&&b-g]", "bgc"); + +  onig_end(); + +  free(pattern); +  free(str); + +  return 0; +} @@ -8,7 +8,7 @@  <h1>Oniguruma</h1> (<a href="index_ja.html">Japanese</a>)  <p> -(c) K.Kosako, updated at: 2018/12/06 +(c) K.Kosako, updated at: 2019/08/05  </p>  <dl> @@ -16,6 +16,8 @@  <dt><b>What's new</b>  </font>  <ul> +<li>2019/08/06: Version 6.9.3 released.</li> +<li>2019/05/07: Version 6.9.2 released.</li>  <li>2018/12/11: Version 6.9.1 released.</li>  <li>2018/09/03: Version 6.9.0 released.</li>  <li>2018/04/17: Version 6.8.2 released.</li> diff --git a/index_ja.html b/index_ja.html index 0ada788..6b75c6c 100644 --- a/index_ja.html +++ b/index_ja.html @@ -8,7 +8,7 @@  <h1>鬼車</h1>  <p> -(c) K.Kosako, 最終更新: 2018/12/06 +(c) K.Kosako, 最終更新: 2019/08/05  </p>  <dl> @@ -16,6 +16,8 @@  <dt><b>更新情報</b>  </font>  <ul> +<li>2019/08/06: Version 6.9.3 リリース</li> +<li>2019/05/07: Version 6.9.2 リリース</li>  <li>2018/12/11: Version 6.9.1 リリース</li>  <li>2018/09/03: Version 6.9.0 リリース</li>  <li>2018/04/17: Version 6.8.2 リリース</li> diff --git a/sample/bug_fix.c b/sample/bug_fix.c index 81c2784..3f60c5b 100644 --- a/sample/bug_fix.c +++ b/sample/bug_fix.c @@ -4,8 +4,6 @@  #include <stdio.h>  #include "oniguruma.h" -static OnigCaseFoldType CF = ONIGENC_CASE_FOLD_MIN; -  static int  search(regex_t* reg, unsigned char* str, unsigned char* end)  { @@ -36,6 +34,7 @@ search(regex_t* reg, unsigned char* str, unsigned char* end)      onig_error_code_to_str((UChar* )s, r);      fprintf(stderr, "ERROR: %s\n", s);      fprintf(stderr, "  (%s)\n", ONIGENC_NAME(onig_get_encoding(reg))); +    onig_region_free(region, 1 /* 1:free self, 0:free contents only */);      return -1;    } @@ -44,45 +43,6 @@ search(regex_t* reg, unsigned char* str, unsigned char* end)  }  static int -exec_deluxe(OnigEncoding pattern_enc, OnigEncoding str_enc, -            
OnigOptionType options, char* apattern, char* astr) -{ -  int r; -  unsigned char *end; -  regex_t* reg; -  OnigCompileInfo ci; -  OnigErrorInfo einfo; -  UChar* pattern = (UChar* )apattern; -  UChar* str     = (UChar* )astr; - -  onig_initialize(&str_enc, 1); - -  ci.num_of_elements = 5; -  ci.pattern_enc = pattern_enc; -  ci.target_enc  = str_enc; -  ci.syntax      = ONIG_SYNTAX_DEFAULT; -  ci.option      = options; -  ci.case_fold_flag  = CF; - -  r = onig_new_deluxe(®, pattern, -                      pattern + onigenc_str_bytelen_null(pattern_enc, pattern), -                      &ci, &einfo); -  if (r != ONIG_NORMAL) { -    char s[ONIG_MAX_ERROR_MESSAGE_LEN]; -    onig_error_code_to_str((UChar* )s, r, &einfo); -    fprintf(stderr, "ERROR: %s\n", s); -    return -1; -  } - -  end = str + onigenc_str_bytelen_null(str_enc, str); -  r = search(reg, str, end); - -  onig_free(reg); -  onig_end(); -  return 0; -} - -static int  exec(OnigEncoding enc, OnigOptionType options, char* apattern, char* astr)  {    int r; @@ -92,8 +52,6 @@ exec(OnigEncoding enc, OnigOptionType options, char* apattern, char* astr)    UChar* pattern = (UChar* )apattern;    UChar* str     = (UChar* )astr; -  onig_initialize(&enc, 1); -    r = onig_new(®, pattern,                 pattern + onigenc_str_bytelen_null(enc, pattern),                 options, enc, ONIG_SYNTAX_DEFAULT, &einfo); @@ -108,7 +66,6 @@ exec(OnigEncoding enc, OnigOptionType options, char* apattern, char* astr)    r = search(reg, str, end);    onig_free(reg); -  onig_end();    return 0;  } @@ -116,16 +73,21 @@ exec(OnigEncoding enc, OnigOptionType options, char* apattern, char* astr)  extern int main(int argc, char* argv[])  { +  OnigEncoding use_encs[1]; + +  use_encs[0] = ONIG_ENCODING_UTF8; +  onig_initialize(use_encs, 1); +    /* fix ignore case in look-behind       commit: 3340ec2cc5627172665303fe248c9793354d2251 */ -  exec_deluxe(ONIG_ENCODING_UTF8, ONIG_ENCODING_UTF8, -              ONIG_OPTION_IGNORECASE, -             
 "(?<=\305\211)a", "\312\274na"); /* \u{0149}a  \u{02bc}na */ +  exec(ONIG_ENCODING_UTF8, ONIG_OPTION_IGNORECASE, +       "(?<=\305\211)a", "\312\274na"); /* \u{0149}a  \u{02bc}na */    exec(ONIG_ENCODING_UTF8, ONIG_OPTION_NONE, "(\\2)(\\1)", "aa"); /* fail. */    exec(ONIG_ENCODING_UTF8, ONIG_OPTION_FIND_LONGEST,         "a*", "aa aaa aaaa aaaaa "); /* match 12-17 */ +  onig_end();    return 0;  } diff --git a/sample/crnl.c b/sample/crnl.c index 3ad1210..bfa563e 100644 --- a/sample/crnl.c +++ b/sample/crnl.c @@ -65,6 +65,8 @@ x(int no, char* pattern_arg, char* str_arg,      char s[ONIG_MAX_ERROR_MESSAGE_LEN];      onig_error_code_to_str(s, r);      fprintf(stderr, "ERROR: %s\n", s); +    onig_region_free(region, 1 /* 1:free self, 0:free contents only */); +    onig_free(reg);      return -1;    } diff --git a/sample/encode.c b/sample/encode.c index 8a03ab8..c5d4771 100644 --- a/sample/encode.c +++ b/sample/encode.c @@ -34,6 +34,7 @@ search(regex_t* reg, unsigned char* str, unsigned char* end)      onig_error_code_to_str((UChar* )s, r);      fprintf(stderr, "ERROR: %s\n", s);      fprintf(stderr, "  (%s)\n", ONIGENC_NAME(onig_get_encoding(reg))); +    onig_region_free(region, 1 /* 1:free self, 0:free contents only */);      return -1;    } @@ -72,55 +73,6 @@ exec(OnigEncoding enc, OnigOptionType options,    return 0;  } -static OnigCaseFoldType CF = ONIGENC_CASE_FOLD_MIN; - -#if 0 -static void -set_case_fold(OnigCaseFoldType cf) -{ -  CF = cf; -} -#endif - -static int -exec_deluxe(OnigEncoding pattern_enc, OnigEncoding str_enc, -            OnigOptionType options, char* apattern, char* astr) -{ -  int r; -  unsigned char *end; -  regex_t* reg; -  OnigCompileInfo ci; -  OnigErrorInfo einfo; -  UChar* pattern = (UChar* )apattern; -  UChar* str     = (UChar* )astr; - -  onig_initialize(&str_enc, 1); - -  ci.num_of_elements = 5; -  ci.pattern_enc = pattern_enc; -  ci.target_enc  = str_enc; -  ci.syntax      = ONIG_SYNTAX_DEFAULT; -  ci.option      = options; -  
ci.case_fold_flag  = CF; - -  r = onig_new_deluxe(®, pattern, -                      pattern + onigenc_str_bytelen_null(pattern_enc, pattern), -                      &ci, &einfo); -  if (r != ONIG_NORMAL) { -    char s[ONIG_MAX_ERROR_MESSAGE_LEN]; -    onig_error_code_to_str((UChar* )s, r, &einfo); -    fprintf(stderr, "ERROR: %s\n", s); -    return -1; -  } - -  end = str + onigenc_str_bytelen_null(str_enc, str); -  r = search(reg, str, end); - -  onig_free(reg); -  onig_end(); -  return 0; -} -  extern int main(int argc, char* argv[])  {    int r; @@ -196,39 +148,6 @@ extern int main(int argc, char* argv[])    r = exec(ONIG_ENCODING_ISO_8859_1, ONIG_OPTION_IGNORECASE,             "is", "iss"); -  r = exec_deluxe(ONIG_ENCODING_ASCII, ONIG_ENCODING_UTF16_BE, -                  ONIG_OPTION_NONE, "a+", -                  "\000b\000a\000a\000a\000c\000c\000\000"); - -  r = exec_deluxe(ONIG_ENCODING_ASCII, ONIG_ENCODING_UTF16_LE, -                  ONIG_OPTION_NONE, "a+", -                  "b\000a\000a\000a\000a\000c\000\000\000"); - -  r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_LE, -                  ONIG_OPTION_NONE, -                  "\000b\000a\000a\000a\000c\000c\000\000", -                  "x\000b\000a\000a\000a\000c\000c\000\000\000"); - -  r = exec_deluxe(ONIG_ENCODING_ISO_8859_1, ONIG_ENCODING_UTF16_BE, -                  ONIG_OPTION_IGNORECASE, -                  "\337", "\000S\000S\000\000"); - -  r = exec_deluxe(ONIG_ENCODING_ISO_8859_1, ONIG_ENCODING_UTF16_BE, -                  ONIG_OPTION_IGNORECASE, -                  "SS", "\000\337\000\000"); - -  r = exec_deluxe(ONIG_ENCODING_ISO_8859_1, ONIG_ENCODING_UTF16_LE, -                  ONIG_OPTION_IGNORECASE, -                  "\337", "S\000S\000\000\000"); - -  r = exec_deluxe(ONIG_ENCODING_ISO_8859_1, ONIG_ENCODING_UTF32_BE, -                  ONIG_OPTION_IGNORECASE, -                  "SS", "\000\000\000\337\000\000\000\000"); - -  r = exec_deluxe(ONIG_ENCODING_ISO_8859_1, 
ONIG_ENCODING_UTF32_LE, -                  ONIG_OPTION_IGNORECASE, -                  "\337", "S\000\000\000S\000\000\000\000\000\000\000"); -    r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_NONE,             "\000[\000[\000:\000a\000l\000n\000u\000m\000:\000]\000]\000+\000\000",             "\000#\002\120\000a\000Z\012\077\012\076\012\075\000\000"); @@ -242,44 +161,34 @@ extern int main(int argc, char* argv[])    r = exec(ONIG_ENCODING_GB18030, ONIG_OPTION_IGNORECASE,             "(Aa\\d)+", "BaA5Aa0234"); -  r = exec_deluxe(ONIG_ENCODING_ISO_8859_1, ONIG_ENCODING_UTF16_BE, -                  ONIG_OPTION_NONE, -                  "^\\P{Hiragana}\\p{^Hiragana}(\\p{Hiragana}+)$", -                  "\060\100\060\240\060\101\060\102\060\226\060\237\000\000"); - -  r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, -                  ONIG_OPTION_IGNORECASE, -                  "\000[\000\337\000]\000\000", "\000S\000S\000\000"); +  r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, +           "\000[\000\337\000]\000\000", "\000S\000S\000\000"); -  r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, -                  ONIG_OPTION_IGNORECASE, -                  "\000[\000\337\000]\000\000", "\000s\000S\000\000"); +  r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, +           "\000[\000\337\000]\000\000", "\000s\000S\000\000"); -  r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, -                  ONIG_OPTION_IGNORECASE, -                  "\000^\000[\000\001\000-\377\375\000]\000$\000\000", -                  "\000s\000S\000\000"); +  r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, +           "\000^\000[\000\001\000-\377\375\000]\000$\000\000", +           "\000s\000S\000\000"); -  r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, -                  ONIG_OPTION_IGNORECASE, -                  "\000S\000S\000\000", -                  "\000S\000T\000\337\000\000"); +  r = 
exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, +           "\000S\000S\000\000", +           "\000S\000T\000\337\000\000"); -  r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, -                  ONIG_OPTION_IGNORECASE, -                  "\000S\000T\000S\000S\000\000", -                  "\000S\000t\000s\000S\000\000"); +  r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, +           "\000S\000T\000S\000S\000\000", +           "\000S\000t\000s\000S\000\000");    {      UChar pat[]  = { 0x1f, 0xfc, 0x00, 0x00 };      UChar str1[] = { 0x21, 0x26, 0x1f, 0xbe, 0x00, 0x00 };      UChar str2[] = { 0x1f, 0xf3, 0x00, 0x00 }; -    r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, -                    ONIG_OPTION_IGNORECASE, (char* )pat, (char* )str1); +    r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, +             (char* )pat, (char* )str1); -    r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, -                    ONIG_OPTION_IGNORECASE, (char* )pat, (char* )str2); +    r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, +             (char* )pat, (char* )str2);    }  #if 0 @@ -287,17 +196,14 @@ extern int main(int argc, char* argv[])    set_case_fold(ONIGENC_CASE_FOLD_TURKISH_AZERI); -  r = exec_deluxe(ONIG_ENCODING_UTF8, ONIG_ENCODING_UTF8, -                  ONIG_OPTION_IGNORECASE, -                  "Ii", "\304\261\304\260"); +  r = exec(ONIG_ENCODING_UTF8, ONIG_ENCODING_UTF8, ONIG_OPTION_IGNORECASE, +           "Ii", "\304\261\304\260"); -  r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, -                  ONIG_OPTION_IGNORECASE, -                  "\000I\000i\000\000", "\001\061\001\060\000\000"); +  r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, +           "\000I\000i\000\000", "\001\061\001\060\000\000"); -  r = exec_deluxe(ONIG_ENCODING_UTF16_BE, ONIG_ENCODING_UTF16_BE, -                  ONIG_OPTION_IGNORECASE, -                  "\001\061\001\060\000\000", 
"\000I\000i\000\000"); +  r = exec(ONIG_ENCODING_UTF16_BE, ONIG_OPTION_IGNORECASE, +           "\001\061\001\060\000\000", "\000I\000i\000\000");    set_case_fold(ONIGENC_CASE_FOLD_MIN);  #endif diff --git a/sample/listcap.c b/sample/listcap.c index e0fe23a..a73f7d4 100644 --- a/sample/listcap.c +++ b/sample/listcap.c @@ -69,6 +69,8 @@ extern int ex(unsigned char* str, unsigned char* pattern,    else { /* error */      char s[ONIG_MAX_ERROR_MESSAGE_LEN];      onig_error_code_to_str((UChar* )s, r); +    onig_region_free(region, 1 /* 1:free self, 0:free contents only */); +    onig_free(reg);      return -1;    } diff --git a/sample/names.c b/sample/names.c index a838056..9b1eb24 100644 --- a/sample/names.c +++ b/sample/names.c @@ -65,6 +65,9 @@ extern int main(int argc, char* argv[])    else { /* error */      char s[ONIG_MAX_ERROR_MESSAGE_LEN];      onig_error_code_to_str((UChar* )s, r); +    onig_region_free(region, 1 /* 1:free self, 0:free contents only */); +    onig_free(reg); +    onig_end();      return -1;    } diff --git a/sample/posix.c b/sample/posix.c index 35ccb68..c555936 100644 --- a/sample/posix.c +++ b/sample/posix.c @@ -49,6 +49,7 @@ extern int main(int argc, char* argv[])      regerror(r, ®, buf, sizeof(buf));      fprintf(stderr, "ERROR: %s\n", buf);      regfree(®); +    onig_end();      return -1;    }    x(®, pattern, (UChar* )"aaabbbbd"); @@ -60,6 +61,7 @@ extern int main(int argc, char* argv[])      regerror(r, ®, buf, sizeof(buf));      fprintf(stderr, "ERROR: %s\n", buf);      regfree(®); +    onig_end();      return -1;    }    x(®, pattern, (UChar* )"a+b{2,7}d?|uuu"); @@ -71,6 +73,7 @@ extern int main(int argc, char* argv[])      regerror(r, ®, buf, sizeof(buf));      fprintf(stderr, "ERROR: %s\n", buf);      regfree(®); +    onig_end();      return -1;    }    x(®, pattern, (UChar* )"aaaabbbbbbd"); @@ -83,6 +86,7 @@ extern int main(int argc, char* argv[])      regerror(r, ®, buf, sizeof(buf));      fprintf(stderr, "ERROR: %s\n", buf);   
   regfree(®); +    onig_end();      return -1;    }    x(®, pattern, (UChar* )"aaabbbbd)"); @@ -93,6 +97,7 @@ extern int main(int argc, char* argv[])      regerror(r, ®, buf, sizeof(buf));      fprintf(stderr, "ERROR: %s\n", buf);      regfree(®); +    onig_end();      return -1;    }    x(®, pattern, (UChar* )"a\nb\n"); diff --git a/sample/scan.c b/sample/scan.c index ad5ae74..4039e46 100644 --- a/sample/scan.c +++ b/sample/scan.c @@ -36,6 +36,7 @@ scan(regex_t* reg, unsigned char* str, unsigned char* end)      char s[ONIG_MAX_ERROR_MESSAGE_LEN];      onig_error_code_to_str((OnigUChar* )s, r);      fprintf(stderr, "ERROR: %s\n", s); +    onig_region_free(region, 1 /* 1:free self, 0:free contents only */);      return -1;    } @@ -63,6 +64,7 @@ exec(OnigEncoding enc, OnigOptionType options, char* apattern, char* astr)      char s[ONIG_MAX_ERROR_MESSAGE_LEN];      onig_error_code_to_str((OnigUChar* )s, r, &einfo);      fprintf(stderr, "ERROR: %s\n", s); +    onig_end();      return -1;    } diff --git a/sample/simple.c b/sample/simple.c index 95110b8..5a14042 100644 --- a/sample/simple.c +++ b/sample/simple.c @@ -49,6 +49,9 @@ extern int main(int argc, char* argv[])      char s[ONIG_MAX_ERROR_MESSAGE_LEN];      onig_error_code_to_str((UChar* )s, r);      fprintf(stderr, "ERROR: %s\n", s); +    onig_region_free(region, 1 /* 1:free self, 0:free contents only */); +    onig_free(reg); +    onig_end();      return -1;    } diff --git a/sample/sql.c b/sample/sql.c index 8e95f70..1602ac9 100644 --- a/sample/sql.c +++ b/sample/sql.c @@ -42,6 +42,7 @@ extern int main(int argc, char* argv[])      char s[ONIG_MAX_ERROR_MESSAGE_LEN];      onig_error_code_to_str((UChar* )s, r, &einfo);      fprintf(stderr, "ERROR: %s\n", s); +    onig_end();      return -1;    } @@ -66,6 +67,9 @@ extern int main(int argc, char* argv[])      char s[ONIG_MAX_ERROR_MESSAGE_LEN];      onig_error_code_to_str((UChar* )s, r);      fprintf(stderr, "ERROR: %s\n", s); +    onig_region_free(region, 1 /* 
1:free self, 0:free contents only */); +    onig_free(reg); +    onig_end();      return -1;    } diff --git a/sample/syntax.c b/sample/syntax.c index e292079..e034608 100644 --- a/sample/syntax.c +++ b/sample/syntax.c @@ -45,6 +45,8 @@ extern int exec(OnigSyntaxType* syntax, char* apattern, char* astr)      char s[ONIG_MAX_ERROR_MESSAGE_LEN];      onig_error_code_to_str((UChar* )s, r);      fprintf(stderr, "ERROR: %s\n", s); +    onig_region_free(region, 1 /* 1:free self, 0:free contents only */); +    onig_free(reg);      return -1;    } diff --git a/sample/user_property.c b/sample/user_property.c index 8b2abd2..d52adc0 100644 --- a/sample/user_property.c +++ b/sample/user_property.c @@ -40,6 +40,7 @@ main(int argc, char* argv[])      char s[ONIG_MAX_ERROR_MESSAGE_LEN];      onig_error_code_to_str((UChar* )s, r);      fprintf(stderr, "ERROR: %s\n", s); +    onig_end();      return -1;    } @@ -52,6 +53,7 @@ main(int argc, char* argv[])      char s[ONIG_MAX_ERROR_MESSAGE_LEN];      onig_error_code_to_str((UChar* )s, r, &einfo);      fprintf(stderr, "onig_new: ERROR: %s\n", s); +    onig_end();      return -1;    } @@ -76,6 +78,9 @@ main(int argc, char* argv[])      char s[ONIG_MAX_ERROR_MESSAGE_LEN];      onig_error_code_to_str((UChar* )s, r);      fprintf(stderr, "ERROR: %s\n", s); +    onig_region_free(region, 1 /* 1:free self, 0:free contents only */); +    onig_free(reg); +    onig_end();      return -1;    } diff --git a/src/gb18030.c b/src/gb18030.c index 7654432..8d415b0 100644 --- a/src/gb18030.c +++ b/src/gb18030.c @@ -2,7 +2,7 @@    gb18030.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2005-2018  KUBO Takehiro <kubo AT jiubao DOT org> + * Copyright (c) 2005-2019  KUBO Takehiro <kubo AT jiubao DOT org>   *                          K.Kosako <sndgk393 AT ybb DOT ne DOT jp>   * All rights reserved.   
* @@ -67,11 +67,11 @@ gb18030_mbc_enc_len(const UChar* p)  {    if (GB18030_MAP[*p] != CM)      return 1; +    p++;    if (GB18030_MAP[*p] == C4)      return 4; -  if (GB18030_MAP[*p] == C1) -    return 1; /* illegal sequence */ +    return 2;  } diff --git a/src/oniguruma.h b/src/oniguruma.h index f6aa5ba..90cf2d9 100644 --- a/src/oniguruma.h +++ b/src/oniguruma.h @@ -36,9 +36,9 @@ extern "C" {  #define ONIGURUMA  #define ONIGURUMA_VERSION_MAJOR   6  #define ONIGURUMA_VERSION_MINOR   9 -#define ONIGURUMA_VERSION_TEENY   2 +#define ONIGURUMA_VERSION_TEENY   3 -#define ONIGURUMA_VERSION_INT     60902 +#define ONIGURUMA_VERSION_INT     60903  #ifndef P_  #if defined(__STDC__) || defined(_WIN32) @@ -52,6 +52,7 @@ extern "C" {  # define PV_(args) args  #endif +#ifndef ONIG_STATIC  #ifndef ONIG_EXTERN  #if defined(_WIN32) && !defined(__GNUC__)  #if defined(ONIGURUMA_EXPORT) @@ -65,6 +66,9 @@ extern "C" {  #ifndef ONIG_EXTERN  #define ONIG_EXTERN   extern  #endif +#else +#define ONIG_EXTERN   extern +#endif  /* PART: character encoding */ @@ -517,6 +521,7 @@ ONIG_EXTERN OnigSyntaxType*   OnigDefaultSyntax;  #define ONIG_SYN_BACKSLASH_ESCAPE_IN_CC          (1U<<21) /* [..\w..] etc.. 
*/  #define ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC         (1U<<22)  #define ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC     (1U<<23) /* [0-9-a]=[0-9\-a] */ +#define ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC (1U<<26)  /* syntax (behavior) warning */  #define ONIG_SYN_WARN_CC_OP_NOT_ESCAPED          (1U<<24) /* [,-,] */  #define ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT    (1U<<25) /* (?:a*)+ */ @@ -766,6 +771,8 @@ int onig_init P_((void));  ONIG_EXTERN  int onig_error_code_to_str PV_((OnigUChar* s, int err_code, ...));  ONIG_EXTERN +int onig_is_error_code_needs_param PV_((int code)); +ONIG_EXTERN  void onig_set_warn_func P_((OnigWarnFunc f));  ONIG_EXTERN  void onig_set_verb_warn_func P_((OnigWarnFunc f)); diff --git a/src/regcomp.c b/src/regcomp.c index c2c04a4..b96c793 100644 --- a/src/regcomp.c +++ b/src/regcomp.c @@ -599,12 +599,34 @@ select_str_opcode(int mb_len, int str_len, int ignore_case)  }  static int -compile_tree_empty_check(Node* node, regex_t* reg, int empty_info, ScanEnv* env) +is_strict_real_node(Node* node) +{ +  switch (NODE_TYPE(node)) { +  case NODE_STRING: +    { +      StrNode* sn = STR_(node); +      return (sn->end != sn->s); +    } +    break; + +  case NODE_CCLASS: +  case NODE_CTYPE: +    return 1; +    break; + +  default: +    return 0; +    break; +  } +} + +static int +compile_tree_empty_check(Node* node, regex_t* reg, int emptiness, ScanEnv* env)  {    int r;    int saved_num_null_check = reg->num_null_check; -  if (empty_info != BODY_IS_NOT_EMPTY) { +  if (emptiness != BODY_IS_NOT_EMPTY) {      r = add_op(reg, OP_EMPTY_CHECK_START);      if (r != 0) return r;      COP(reg)->empty_check_start.mem = reg->num_null_check; /* NULL CHECK ID */ @@ -614,12 +636,12 @@ compile_tree_empty_check(Node* node, regex_t* reg, int empty_info, ScanEnv* env)    r = compile_tree(node, reg, env);    if (r != 0) return r; -  if (empty_info != BODY_IS_NOT_EMPTY) { -    if (empty_info == BODY_IS_EMPTY) +  if (emptiness != BODY_IS_NOT_EMPTY) { +    if (emptiness == 
BODY_IS_EMPTY_POSSIBILITY)        r = add_op(reg, OP_EMPTY_CHECK_END); -    else if (empty_info == BODY_IS_EMPTY_MEM) +    else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_MEM)        r = add_op(reg, OP_EMPTY_CHECK_END_MEMST); -    else if (empty_info == BODY_IS_EMPTY_REC) +    else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_REC)        r = add_op(reg, OP_EMPTY_CHECK_END_MEMST_PUSH);      if (r != 0) return r; @@ -895,12 +917,12 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper)    }    p[id].lower = lower; -  p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper); +  p[id].upper = (IS_INFINITE_REPEAT(upper) ? 0x7fffffff : upper);    return 0;  }  static int -compile_range_repeat_node(QuantNode* qn, int target_len, int empty_info, +compile_range_repeat_node(QuantNode* qn, int target_len, int emptiness,                            regex_t* reg, ScanEnv* env)  {    int r; @@ -915,7 +937,7 @@ compile_range_repeat_node(QuantNode* qn, int target_len, int empty_info,    r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper);    if (r != 0) return r; -  r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); +  r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env);    if (r != 0) return r;    if ( @@ -937,7 +959,7 @@ compile_range_repeat_node(QuantNode* qn, int target_len, int empty_info,  static int  is_anychar_infinite_greedy(QuantNode* qn)  { -  if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) && +  if (qn->greedy && IS_INFINITE_REPEAT(qn->upper) &&        NODE_IS_ANYCHAR(NODE_QUANT_BODY(qn)))      return 1;    else @@ -951,8 +973,8 @@ static int  compile_length_quantifier_node(QuantNode* qn, regex_t* reg)  {    int len, mod_tlen; -  int infinite = IS_REPEAT_INFINITE(qn->upper); -  enum BodyEmpty empty_info = qn->empty_info; +  int infinite = IS_INFINITE_REPEAT(qn->upper); +  enum BodyEmptyType emptiness = qn->emptiness;    int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);    if (tlen < 0) return 
tlen; @@ -969,10 +991,9 @@ compile_length_quantifier_node(QuantNode* qn, regex_t* reg)      }    } -  if (empty_info == BODY_IS_NOT_EMPTY) -    mod_tlen = tlen; -  else -    mod_tlen = tlen + (SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END); +  mod_tlen = tlen; +  if (emptiness != BODY_IS_NOT_EMPTY) +    mod_tlen += SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END;    if (infinite &&        (qn->lower <= 1 || @@ -1026,8 +1047,8 @@ static int  compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env)  {    int i, r, mod_tlen; -  int infinite = IS_REPEAT_INFINITE(qn->upper); -  enum BodyEmpty empty_info = qn->empty_info; +  int infinite = IS_INFINITE_REPEAT(qn->upper); +  enum BodyEmptyType emptiness = qn->emptiness;    int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);    if (tlen < 0) return tlen; @@ -1055,10 +1076,9 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env)      }    } -  if (empty_info == BODY_IS_NOT_EMPTY) -    mod_tlen = tlen; -  else -    mod_tlen = tlen + (SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END); +  mod_tlen = tlen; +  if (emptiness != BODY_IS_NOT_EMPTY) +    mod_tlen += SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END;    if (infinite &&        (qn->lower <= 1 || @@ -1096,7 +1116,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env)          COP(reg)->push_or_jump_exact1.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP;          COP(reg)->push_or_jump_exact1.c    = STR_(qn->head_exact)->s[0]; -        r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); +        r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env);          if (r != 0) return r;          addr = -(mod_tlen + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1); @@ -1109,7 +1129,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env)          COP(reg)->push_if_peek_next.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP;          COP(reg)->push_if_peek_next.c    = 
STR_(qn->next_head_exact)->s[0]; -        r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); +        r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env);          if (r != 0) return r;          addr = -(mod_tlen + (int )SIZE_OP_PUSH_IF_PEEK_NEXT); @@ -1119,7 +1139,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env)          if (r != 0) return r;          COP(reg)->push.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP; -        r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); +        r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env);          if (r != 0) return r;          addr = -(mod_tlen + (int )SIZE_OP_PUSH); @@ -1134,7 +1154,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env)        if (r != 0) return r;        COP(reg)->jump.addr = mod_tlen + SIZE_INC_OP; -      r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); +      r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env);        if (r != 0) return r;        r = add_op(reg, OP_PUSH); @@ -1188,7 +1208,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env)      r = compile_tree(NODE_QUANT_BODY(qn), reg, env);    }    else { -    r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg, env); +    r = compile_range_repeat_node(qn, mod_tlen, emptiness, reg, env);    }    return r;  } @@ -1273,7 +1293,7 @@ compile_length_bag_node(BagNode* node, regex_t* reg)      break;    case BAG_STOP_BACKTRACK: -    if (NODE_IS_STOP_BT_SIMPLE_REPEAT(node)) { +    if (NODE_IS_STRICT_REAL_REPEAT(node)) {        int v;        QuantNode* qn; @@ -1307,8 +1327,9 @@ compile_length_bag_node(BagNode* node, regex_t* reg)          len += tlen;        } +      len += SIZE_OP_JUMP + SIZE_OP_ATOMIC_END; +        if (IS_NOT_NULL(Else)) { -        len += SIZE_OP_JUMP;          tlen = compile_length_tree(Else, reg);          if (tlen < 0) return tlen;          len 
+= tlen; @@ -1423,7 +1444,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env)      break;    case BAG_STOP_BACKTRACK: -    if (NODE_IS_STOP_BT_SIMPLE_REPEAT(node)) { +    if (NODE_IS_STRICT_REAL_REPEAT(node)) {        QuantNode* qn = QUANT_(NODE_BAG_BODY(node));        r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);        if (r != 0) return r; @@ -1455,7 +1476,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env)    case BAG_IF_ELSE:      { -      int cond_len, then_len, jump_len; +      int cond_len, then_len, else_len, jump_len;        Node* cond = NODE_BAG_BODY(node);        Node* Then = node->te.Then;        Node* Else = node->te.Else; @@ -1472,8 +1493,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env)        else          then_len = 0; -      jump_len = cond_len + then_len + SIZE_OP_ATOMIC_END; -      if (IS_NOT_NULL(Else)) jump_len += SIZE_OP_JUMP; +      jump_len = cond_len + then_len + SIZE_OP_ATOMIC_END + SIZE_OP_JUMP;        r = add_op(reg, OP_PUSH);        if (r != 0) return r; @@ -1490,11 +1510,20 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env)        }        if (IS_NOT_NULL(Else)) { -        int else_len = compile_length_tree(Else, reg); -        r = add_op(reg, OP_JUMP); -        if (r != 0) return r; -        COP(reg)->jump.addr = else_len + SIZE_INC_OP; +        else_len = compile_length_tree(Else, reg); +        if (else_len < 0) return else_len; +      } +      else +        else_len = 0; + +      r = add_op(reg, OP_JUMP); +      if (r != 0) return r; +      COP(reg)->jump.addr = SIZE_OP_ATOMIC_END + else_len + SIZE_INC_OP; +      r = add_op(reg, OP_ATOMIC_END); +      if (r != 0) return r; + +      if (IS_NOT_NULL(Else)) {          r = compile_tree(Else, reg, env);        }      } @@ -3035,7 +3064,7 @@ tree_max_len(Node* node, ScanEnv* env)        if (qn->upper != 0) {          len = tree_max_len(NODE_BODY(node), env);          if (len != 0) { -          if (! 
IS_REPEAT_INFINITE(qn->upper)) +          if (! IS_INFINITE_REPEAT(qn->upper))              len = distance_multiply(len, qn->upper);            else              len = INFINITE_LEN; @@ -3581,7 +3610,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg)    type = NODE_TYPE(node);    if (type == NODE_QUANT) {      QuantNode* qn = QUANT_(node); -    if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) { +    if (qn->greedy && IS_INFINITE_REPEAT(qn->upper)) {  #ifdef USE_QUANT_PEEK_NEXT        Node* n = get_head_value_node(next_node, 1, reg);        /* '\0': for UTF-16BE etc... */ @@ -3591,7 +3620,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg)  #endif        /* automatic posseivation a*b ==> (?>a*)b */        if (qn->lower <= 1) { -        if (NODE_IS_SIMPLE_TYPE(NODE_BODY(node))) { +        if (is_strict_real_node(NODE_BODY(node))) {            Node *x, *y;            x = get_head_value_node(NODE_BODY(node), 0, reg);            if (IS_NOT_NULL(x)) { @@ -3599,7 +3628,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg)              if (IS_NOT_NULL(y) && is_exclusive(x, y, reg)) {                Node* en = onig_node_new_bag(BAG_STOP_BACKTRACK);                CHECK_NULL_RETURN_MEMERR(en); -              NODE_STATUS_ADD(en, STOP_BT_SIMPLE_REPEAT); +              NODE_STATUS_ADD(en, STRICT_REAL_REPEAT);                swap_node(node, en);                NODE_BODY(node) = en;              } @@ -4001,11 +4030,11 @@ expand_case_fold_string(Node* node, regex_t* reg, int state)    return r;  } -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT -static enum BodyEmpty +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT +static enum BodyEmptyType  quantifiers_memory_node_info(Node* node)  { -  int r = BODY_IS_EMPTY; +  int r = BODY_IS_EMPTY_POSSIBILITY;    switch (NODE_TYPE(node)) {    case NODE_LIST: @@ -4022,7 +4051,7 @@ quantifiers_memory_node_info(Node* node)  #ifdef USE_CALL    case NODE_CALL:      if (NODE_IS_RECURSION(node)) { -      return 
BODY_IS_EMPTY_REC; /* tiny version */ +      return BODY_IS_EMPTY_POSSIBILITY_REC; /* tiny version */      }      else        r = quantifiers_memory_node_info(NODE_BODY(node)); @@ -4044,9 +4073,9 @@ quantifiers_memory_node_info(Node* node)        switch (en->type) {        case BAG_MEMORY:          if (NODE_IS_RECURSION(node)) { -          return BODY_IS_EMPTY_REC; +          return BODY_IS_EMPTY_POSSIBILITY_REC;          } -        return BODY_IS_EMPTY_MEM; +        return BODY_IS_EMPTY_POSSIBILITY_MEM;          break;        case BAG_OPTION: @@ -4083,7 +4112,7 @@ quantifiers_memory_node_info(Node* node)    return r;  } -#endif /* USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT */ +#endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */  #ifdef USE_CALL @@ -4351,7 +4380,7 @@ setup_called_state_call(Node* node, int state)      {        QuantNode* qn = QUANT_(node); -      if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 2) +      if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)          state |= IN_REAL_REPEAT;        if (qn->lower != qn->upper)          state |= IN_VAR_REPEAT; @@ -4468,7 +4497,7 @@ setup_called_state(Node* node, int state)      {        QuantNode* qn = QUANT_(node); -      if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 2) +      if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)          state |= IN_REAL_REPEAT;        if (qn->lower != qn->upper)          state |= IN_VAR_REPEAT; @@ -4600,24 +4629,24 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env)      NODE_STATUS_ADD(node, IN_MULTI_ENTRY);    } -  if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) { +  if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 1) {      d = tree_min_len(body, env);      if (d == 0) { -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT -      qn->empty_info = quantifiers_memory_node_info(body); -      if (qn->empty_info == BODY_IS_EMPTY_REC) { +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT +      qn->emptiness = 
quantifiers_memory_node_info(body); +      if (qn->emptiness == BODY_IS_EMPTY_POSSIBILITY_REC) {          if (NODE_TYPE(body) == NODE_BAG &&              BAG_(body)->type == BAG_MEMORY) {            MEM_STATUS_ON(env->bt_mem_end, BAG_(body)->m.regnum);          }        }  #else -      qn->empty_info = BODY_IS_EMPTY; +      qn->emptiness = BODY_IS_EMPTY_POSSIBILITY;  #endif      }    } -  if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 2) +  if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)      state |= IN_REAL_REPEAT;    if (qn->lower != qn->upper)      state |= IN_VAR_REPEAT; @@ -4628,7 +4657,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env)    /* expand string */  #define EXPAND_STRING_MAX_LENGTH  100    if (NODE_TYPE(body) == NODE_STRING) { -    if (!IS_REPEAT_INFINITE(qn->lower) && qn->lower == qn->upper && +    if (!IS_INFINITE_REPEAT(qn->lower) && qn->lower == qn->upper &&          qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) {        int len = NODE_STRING_LEN(body);        StrNode* sn = STR_(body); @@ -4646,7 +4675,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env)      }    } -  if (qn->greedy && (qn->empty_info == BODY_IS_NOT_EMPTY)) { +  if (qn->greedy && (qn->emptiness == BODY_IS_NOT_EMPTY)) {      if (NODE_TYPE(body) == NODE_QUANT) {        QuantNode* tqn = QUANT_(body);        if (IS_NOT_NULL(tqn->head_exact)) { @@ -4663,7 +4692,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env)  }  /* setup_tree does the following work. - 1. check empty loop. (set qn->empty_info) + 1. check empty loop. (set qn->emptiness)   2. expand ignore-case in char class.   3. set memory status bit flags. (reg->mem_stats)   4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact]. 
@@ -4752,10 +4781,10 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)            r = setup_tree(target, reg, state, env);            if (NODE_TYPE(target) == NODE_QUANT) {              QuantNode* tqn = QUANT_(target); -            if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 && +            if (IS_INFINITE_REPEAT(tqn->upper) && tqn->lower <= 1 &&                  tqn->greedy != 0) {  /* (?>a*), a*+ etc... */ -              if (NODE_IS_SIMPLE_TYPE(NODE_BODY(target))) -                NODE_STATUS_ADD(node, STOP_BT_SIMPLE_REPEAT); +              if (is_strict_real_node(NODE_BODY(target))) +                NODE_STATUS_ADD(node, STRICT_REAL_REPEAT);              }            }          } @@ -5752,7 +5781,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env)            opt->sm.reach_end = 0;        } -      if (IS_REPEAT_INFINITE(qn->upper)) { +      if (IS_INFINITE_REPEAT(qn->upper)) {          if (env->mmd.max == 0 &&              NODE_IS_ANYCHAR(NODE_BODY(node)) && qn->greedy != 0) {            if (IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn), env))) @@ -6672,6 +6701,7 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)    }    else {      len = ONIGENC_CODE_TO_MBCLEN(enc, code); +    if (len < 0) return 0;    }    return onig_is_code_in_cc_len(len, code, cc);  } diff --git a/src/regenc.c b/src/regenc.c index 6376565..9fab721 100644 --- a/src/regenc.c +++ b/src/regenc.c @@ -853,6 +853,8 @@ onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigCaseFoldType flag,  extern int  onigenc_mb2_code_to_mbclen(OnigCodePoint code)  { +  if ((code & (~0xffff)) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; +    if ((code & 0xff00) != 0) return 2;    else return 1;  } diff --git a/src/regerror.c b/src/regerror.c index 7564827..e6d1806 100644 --- a/src/regerror.c +++ b/src/regerror.c @@ -257,6 +257,23 @@ static int to_ascii(OnigEncoding enc, UChar *s, UChar *end,  } +extern int +onig_is_error_code_needs_param(int code) +{ +  
switch (code) { +  case ONIGERR_UNDEFINED_NAME_REFERENCE: +  case ONIGERR_UNDEFINED_GROUP_REFERENCE: +  case ONIGERR_MULTIPLEX_DEFINED_NAME: +  case ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL: +  case ONIGERR_INVALID_GROUP_NAME: +  case ONIGERR_INVALID_CHAR_IN_GROUP_NAME: +  case ONIGERR_INVALID_CHAR_PROPERTY_NAME: +    return 1; +  default: +    return 0; +  } +} +  /* for ONIG_MAX_ERROR_MESSAGE_LEN */  #define MAX_ERROR_PAR_LEN   30 diff --git a/src/regexec.c b/src/regexec.c index 6618996..f957b75 100644 --- a/src/regexec.c +++ b/src/regexec.c @@ -980,6 +980,8 @@ onig_region_copy(OnigRegion* to, OnigRegion* from)  #define STK_CALL_FRAME             0x0400  #define STK_RETURN                 0x0500  #define STK_SAVE_VAL               0x0600 +#define STK_PREC_READ_START        0x0700 +#define STK_PREC_READ_END          0x0800  /* stack type check mask */  #define STK_MASK_POP_USED          STK_ALT_FLAG @@ -1544,8 +1546,8 @@ stack_double(int is_alloca, char** arg_alloc_base,  #define STACK_PUSH_ALT(pat,s,sprev)       STACK_PUSH(STK_ALT,pat,s,sprev)  #define STACK_PUSH_SUPER_ALT(pat,s,sprev) STACK_PUSH(STK_SUPER_ALT,pat,s,sprev) -#define STACK_PUSH_POS(s,sprev) \ -  STACK_PUSH(STK_TO_VOID_START,(Operation* )0,s,sprev) +#define STACK_PUSH_PREC_READ_START(s,sprev) \ +  STACK_PUSH(STK_PREC_READ_START,(Operation* )0,s,sprev)  #define STACK_PUSH_ALT_PREC_READ_NOT(pat,s,sprev) \    STACK_PUSH(STK_ALT_PREC_READ_NOT,pat,s,sprev)  #define STACK_PUSH_TO_VOID_START        STACK_PUSH_TYPE(STK_TO_VOID_START) @@ -1887,6 +1889,27 @@ stack_double(int is_alloca, char** arg_alloc_base,    }\  } while(0) +#define STACK_GET_PREC_READ_START(k) do {\ +  int level = 0;\ +  k = stk;\ +  while (1) {\ +    k--;\ +    STACK_BASE_CHECK(k, "STACK_GET_PREC_READ_START");\ +    if (IS_TO_VOID_TARGET(k)) {\ +      k->type = STK_VOID;\ +    }\ +    else if (k->type == STK_PREC_READ_START) {\ +      if (level == 0) {\ +        break;\ +      }\ +      level--;\ +    }\ +    else if (k->type == 
STK_PREC_READ_END) {\ +      level++;\ +    }\ +  }\ +} while(0) +  #define STACK_EMPTY_CHECK(isnull,sid,s) do {\    StackType* k = stk;\    while (1) {\ @@ -1913,7 +1936,7 @@ stack_double(int is_alloca, char** arg_alloc_base,    }\  } while (0) -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT  #define STACK_EMPTY_CHECK_MEM(isnull,sid,s,reg) do {\    StackType* k = stk;\    while (1) {\ @@ -1927,9 +1950,10 @@ stack_double(int is_alloca, char** arg_alloc_base,          }\          else {\            UChar* endp;\ +          int level = 0;\            (isnull) = 1;\            while (k < stk) {\ -            if (k->type == STK_MEM_START) {\ +            if (k->type == STK_MEM_START && level == 0) {\                STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\                if (endp == 0) {\                  (isnull) = 0; break;\ @@ -1941,6 +1965,12 @@ stack_double(int is_alloca, char** arg_alloc_base,                  (isnull) = -1; /* empty, but position changed */ \                }\              }\ +            else if (k->type == STK_PREC_READ_START) {\ +              level++;\ +            }\ +            else if (k->type == STK_PREC_READ_END) {\ +              level--;\ +            }\              k++;\            }\            break;\ @@ -1965,10 +1995,11 @@ stack_double(int is_alloca, char** arg_alloc_base,            }\            else {\              UChar* endp;\ +            int prec_level = 0;\              (isnull) = 1;\              while (k < stk) {\                if (k->type == STK_MEM_START) {\ -                if (level == 0) {\ +                if (level == 0 && prec_level == 0) {\                    STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\                    if (endp == 0) {\                      (isnull) = 0; break;\ @@ -1987,6 +2018,12 @@ stack_double(int is_alloca, char** arg_alloc_base,                else if (k->type == STK_EMPTY_CHECK_END) {\                  if (k->zid 
== (sid)) level--;\                }\ +              else if (k->type == STK_PREC_READ_START) {\ +                prec_level++;\ +              }\ +              else if (k->type == STK_PREC_READ_END) {\ +                prec_level--;\ +              }\                k++;\              }\              break;\ @@ -2023,7 +2060,7 @@ stack_double(int is_alloca, char** arg_alloc_base,      }\    }\  } while(0) -#endif /* USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT */ +#endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */  #define STACK_GET_REPEAT(sid, k) do {\    int level = 0;\ @@ -2968,6 +3005,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,        NEXT_OUT;      CASE_OP(CCLASS_MB) +      DATA_ENSURE(1);        if (! ONIGENC_IS_MBC_HEAD(encode, s)) goto fail;      cclass_mb: @@ -3441,11 +3479,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,                  ? STACK_AT(mem_end_stk[mem])->u.mem.pstr                  : (UChar* )((void* )mem_end_stk[mem]));          n = (int )(pend - pstart); -        DATA_ENSURE(n); -        sprev = s; -        STRING_CMP(pstart, s, n); -        while (sprev + (len = enclen(encode, sprev)) < s) -          sprev += len; +        if (n != 0) { +          DATA_ENSURE(n); +          sprev = s; +          STRING_CMP(s, pstart, n); +          while (sprev + (len = enclen(encode, sprev)) < s) +            sprev += len; +        }        }        INC_OP;        JUMP_OUT; @@ -3468,11 +3508,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,                  ? 
STACK_AT(mem_end_stk[mem])->u.mem.pstr                  : (UChar* )((void* )mem_end_stk[mem]));          n = (int )(pend - pstart); -        DATA_ENSURE(n); -        sprev = s; -        STRING_CMP_IC(case_fold_flag, pstart, &s, n); -        while (sprev + (len = enclen(encode, sprev)) < s) -          sprev += len; +        if (n != 0) { +          DATA_ENSURE(n); +          sprev = s; +          STRING_CMP_IC(case_fold_flag, pstart, &s, n); +          while (sprev + (len = enclen(encode, sprev)) < s) +            sprev += len; +        }        }        INC_OP;        JUMP_OUT; @@ -3498,15 +3540,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,                    ? STACK_AT(mem_end_stk[mem])->u.mem.pstr                    : (UChar* )((void* )mem_end_stk[mem]));            n = (int )(pend - pstart); -          DATA_ENSURE(n); -          sprev = s; -          swork = s; -          STRING_CMP_VALUE(pstart, swork, n, is_fail); -          if (is_fail) continue; -          s = swork; -          while (sprev + (len = enclen(encode, sprev)) < s) -            sprev += len; - +          if (n != 0) { +            DATA_ENSURE(n); +            sprev = s; +            swork = s; +            STRING_CMP_VALUE(swork, pstart, n, is_fail); +            if (is_fail) continue; +            s = swork; +            while (sprev + (len = enclen(encode, sprev)) < s) +              sprev += len; +          }            break; /* success */          }          if (i == tlen) goto fail; @@ -3535,15 +3578,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,                    ? 
STACK_AT(mem_end_stk[mem])->u.mem.pstr                    : (UChar* )((void* )mem_end_stk[mem]));            n = (int )(pend - pstart); -          DATA_ENSURE(n); -          sprev = s; -          swork = s; -          STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail); -          if (is_fail) continue; -          s = swork; -          while (sprev + (len = enclen(encode, sprev)) < s) -            sprev += len; - +          if (n != 0) { +            DATA_ENSURE(n); +            sprev = s; +            swork = s; +            STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail); +            if (is_fail) continue; +            s = swork; +            while (sprev + (len = enclen(encode, sprev)) < s) +              sprev += len; +          }            break; /* success */          }          if (i == tlen) goto fail; @@ -3560,6 +3604,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,          int len;          int level;          MemNumType* mems; +        UChar* ssave;          n = 0;        backref_with_level: @@ -3567,10 +3612,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,          tlen  = p->backref_general.num;          mems = tlen == 1 ? 
&(p->backref_general.n1) : p->backref_general.ns; -        sprev = s; +        ssave = s;          if (backref_match_at_nested_level(reg, stk, stk_base, n,                      case_fold_flag, level, (int )tlen, mems, &s, end)) { -          if (sprev < end) { +          if (ssave != s) { +            sprev = ssave;              while (sprev + (len = enclen(encode, sprev)) < s)                sprev += len;            } @@ -3658,7 +3704,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,        }        JUMP_OUT; -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT      CASE_OP(EMPTY_CHECK_END_MEMST)        {          int is_empty; @@ -3683,7 +3729,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,          int is_empty;          mem = p->empty_check_end.mem;  /* mem: null check id */ -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT          STACK_EMPTY_CHECK_MEM_REC(is_empty, mem, s, reg);  #else          STACK_EMPTY_CHECK_REC(is_empty, mem, s); @@ -3851,14 +3897,15 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,        goto repeat_inc_ng;      CASE_OP(PREC_READ_START) -      STACK_PUSH_POS(s, sprev); +      STACK_PUSH_PREC_READ_START(s, sprev);        INC_OP;        JUMP_OUT;      CASE_OP(PREC_READ_END) -      STACK_EXEC_TO_VOID(stkp); +      STACK_GET_PREC_READ_START(stkp);        s     = stkp->u.state.pstr;        sprev = stkp->u.state.pstr_prev; +      STACK_PUSH(STK_PREC_READ_END,0,0,0);        INC_OP;        JUMP_OUT; @@ -5443,6 +5490,9 @@ onig_builtin_error(OnigCalloutArgs* args, void* user_data ARG_UNUSED)    if (n >= 0) {      n = ONIGERR_INVALID_CALLOUT_BODY;    } +  else if (onig_is_error_code_needs_param(n)) { +    n = ONIGERR_INVALID_CALLOUT_BODY; +  }    return n;  } diff --git a/src/regext.c b/src/regext.c index fa4b360..965c793 100644 --- a/src/regext.c +++ b/src/regext.c @@ -29,6 +29,7 @@  #include 
"regint.h" +#if 0  static void  conv_ext0be32(const UChar* s, const UChar* end, UChar* conv)  { @@ -158,6 +159,7 @@ conv_encoding(OnigEncoding from, OnigEncoding to, const UChar* s, const UChar* e    return ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION;  } +#endif  extern int  onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end, @@ -169,9 +171,7 @@ onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end,    if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL;    if (ci->pattern_enc != ci->target_enc) { -    r = conv_encoding(ci->pattern_enc, ci->target_enc, pattern, pattern_end, -                      &cpat, &cpat_end); -    if (r != 0) return r; +    return ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION;    }    else {      cpat     = (UChar* )pattern; diff --git a/src/regint.h b/src/regint.h index 56767e8..38389a1 100644 --- a/src/regint.h +++ b/src/regint.h @@ -63,7 +63,7 @@  #define USE_CALL  #define USE_CALLOUT  #define USE_BACKREF_WITH_LEVEL        /* \k<name+n>, \k<name-n> */ -#define USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT    /* /(?:()|())*\2/ */ +#define USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT     /* /(?:()|())*\2/ */  #define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE     /* /\n$/ =~ "\n" */  #define USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR  #define USE_RETRY_LIMIT_IN_MATCH @@ -348,8 +348,8 @@ typedef unsigned int  MemStatusType;  #define DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag) \    ((case_fold_flag) & ~INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) -#define REPEAT_INFINITE         -1 -#define IS_REPEAT_INFINITE(n)   ((n) == REPEAT_INFINITE) +#define INFINITE_REPEAT         -1 +#define IS_INFINITE_REPEAT(n)   ((n) == INFINITE_REPEAT)  /* bitset */  #define BITS_PER_BYTE      8 diff --git a/src/regparse.c b/src/regparse.c index f1deea3..7f8b1a9 100644 --- a/src/regparse.c +++ b/src/regparse.c @@ -77,6 +77,7 @@ OnigSyntaxType OnigSyntaxOniguruma = {        ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |        
ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |        ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY | +      ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC |        ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |        ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )    , ONIG_OPTION_NONE @@ -1093,6 +1094,35 @@ onig_name_to_group_numbers(regex_t* reg, const UChar* name,    return e->back_num;  } +static int +name_to_group_numbers(ScanEnv* env, const UChar* name, const UChar* name_end, +                      int** nums) +{ +  regex_t* reg; +  NameEntry* e; + +  reg = env->reg; +  e = name_find(reg, name, name_end); + +  if (IS_NULL(e)) { +    onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE, +                                   (UChar* )name, (UChar* )name_end); +    return ONIGERR_UNDEFINED_NAME_REFERENCE; +  } + +  switch (e->back_num) { +  case 0: +    break; +  case 1: +    *nums = &(e->back_ref1); +    break; +  default: +    *nums = e->back_refs; +    break; +  } +  return e->back_num; +} +  extern int  onig_name_to_backref_number(regex_t* reg, const UChar* name,                              const UChar* name_end, OnigRegion *region) @@ -1869,8 +1899,8 @@ callout_tag_table_new(CalloutTagTable** rt)  }  static int -callout_tag_entry_raw(CalloutTagTable* t, UChar* name, UChar* name_end, -                      CalloutTagVal entry_val) +callout_tag_entry_raw(ScanEnv* env, CalloutTagTable* t, UChar* name, +                      UChar* name_end, CalloutTagVal entry_val)  {    int r;    CalloutTagVal val; @@ -1879,8 +1909,11 @@ callout_tag_entry_raw(CalloutTagTable* t, UChar* name, UChar* name_end,      return ONIGERR_INVALID_CALLOUT_TAG_NAME;    val = callout_tag_find(t, name, name_end); -  if (val >= 0) +  if (val >= 0) { +    onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME, +                                   name, name_end);      return ONIGERR_MULTIPLEX_DEFINED_NAME; +  }    r = onig_st_insert_strend(t, name, name_end, (HashDataType )entry_val);    if (r < 
0) return r; @@ -1909,7 +1942,7 @@ ext_ensure_tag_table(regex_t* reg)  }  static int -callout_tag_entry(regex_t* reg, UChar* name, UChar* name_end, +callout_tag_entry(ScanEnv* env, regex_t* reg, UChar* name, UChar* name_end,                    CalloutTagVal entry_val)  {    int r; @@ -1921,7 +1954,7 @@ callout_tag_entry(regex_t* reg, UChar* name, UChar* name_end,    ext = onig_get_regex_ext(reg);    CHECK_NULL_RETURN_MEMERR(ext); -  r = callout_tag_entry_raw(ext->tag_table, name, name_end, entry_val); +  r = callout_tag_entry_raw(env, ext->tag_table, name, name_end, entry_val);    e = onig_reg_callout_list_at(reg, (int )entry_val);    CHECK_NULL_RETURN_MEMERR(e); @@ -2391,10 +2424,10 @@ node_new_quantifier(int lower, int upper, int by_number)    CHECK_NULL_RETURN(node);    NODE_SET_TYPE(node, NODE_QUANT); -  QUANT_(node)->lower      = lower; -  QUANT_(node)->upper      = upper; -  QUANT_(node)->greedy     = 1; -  QUANT_(node)->empty_info = BODY_IS_NOT_EMPTY; +  QUANT_(node)->lower           = lower; +  QUANT_(node)->upper           = upper; +  QUANT_(node)->greedy          = 1; +  QUANT_(node)->emptiness       = BODY_IS_NOT_EMPTY;    QUANT_(node)->head_exact      = NULL_NODE;    QUANT_(node)->next_head_exact = NULL_NODE;    QUANT_(node)->is_refered      = 0; @@ -2694,7 +2727,7 @@ make_text_segment(Node** node, ScanEnv* env)    ns[0] = x;    ns[1] = NULL_NODE; -  x = node_new_quantifier(0, REPEAT_INFINITE, 1); +  x = node_new_quantifier(0, INFINITE_REPEAT, 1);    if (IS_NULL(x)) goto err;    NODE_BODY(x) = ns[0]; @@ -3044,7 +3077,7 @@ make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter,      if (expr == NULL_NODE) {        /* default expr \O* */ -      quant = node_new_quantifier(0, REPEAT_INFINITE, 0); +      quant = node_new_quantifier(0, INFINITE_REPEAT, 0);        if (IS_NULL(quant)) goto err0;        r = node_new_true_anychar(&body, env); @@ -3086,7 +3119,7 @@ make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter,   
 if (r != 0) goto err;    possessive = 1; -  r = make_absent_engine(&ns[2], id1, absent, ns[3], 0, REPEAT_INFINITE, +  r = make_absent_engine(&ns[2], id1, absent, ns[3], 0, INFINITE_REPEAT,                           possessive, is_range_cutter, env);    if (r != 0) goto err; @@ -3236,10 +3269,18 @@ node_new_empty(void)  static Node*  node_new_str_raw_char(UChar c)  { +  int i;    UChar p[1]; +  Node* node;    p[0] = c; -  return node_new_str_raw(p, p + 1); +  node = node_new_str_raw(p, p + 1); + +  /* clear buf tail */ +  for (i = 1; i < NODE_STRING_BUF_SIZE; i++) +    STR_(node)->buf[i] = '\0'; + +  return node;  }  static Node* @@ -3275,24 +3316,6 @@ str_node_can_be_split(Node* node, OnigEncoding enc)    return 0;  } -#ifdef USE_PAD_TO_SHORT_BYTE_CHAR -static int -node_str_head_pad(StrNode* sn, int num, UChar val) -{ -  UChar buf[NODE_STRING_BUF_SIZE]; -  int i, len; - -  len = sn->end - sn->s; -  onig_strcpy(buf, sn->s, sn->end); -  onig_strcpy(&(sn->s[num]), buf, buf + len); -  sn->end += num; - -  for (i = 0; i < num; i++) { -    sn->s[i] = val; -  } -} -#endif -  extern int  onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)  { @@ -3877,19 +3900,19 @@ quantifier_type_num(QuantNode* q)    if (q->greedy) {      if (q->lower == 0) {        if (q->upper == 1) return 0; -      else if (IS_REPEAT_INFINITE(q->upper)) return 1; +      else if (IS_INFINITE_REPEAT(q->upper)) return 1;      }      else if (q->lower == 1) { -      if (IS_REPEAT_INFINITE(q->upper)) return 2; +      if (IS_INFINITE_REPEAT(q->upper)) return 2;      }    }    else {      if (q->lower == 0) {        if (q->upper == 1) return 3; -      else if (IS_REPEAT_INFINITE(q->upper)) return 4; +      else if (IS_INFINITE_REPEAT(q->upper)) return 4;      }      else if (q->lower == 1) { -      if (IS_REPEAT_INFINITE(q->upper)) return 5; +      if (IS_INFINITE_REPEAT(q->upper)) return 5;      }    }    return -1; @@ -3926,8 +3949,8 @@ onig_reduce_nested_quantifier(Node* pnode, 
Node* cnode)    pnum = quantifier_type_num(p);    cnum = quantifier_type_num(c);    if (pnum < 0 || cnum < 0) { -    if ((p->lower == p->upper) && ! IS_REPEAT_INFINITE(p->upper)) { -      if ((c->lower == c->upper) && ! IS_REPEAT_INFINITE(c->upper)) { +    if ((p->lower == p->upper) && ! IS_INFINITE_REPEAT(p->upper)) { +      if ((c->lower == c->upper) && ! IS_INFINITE_REPEAT(c->upper)) {          int n = onig_positive_int_multiply(p->lower, c->lower);          if (n >= 0) {            p->lower = p->upper = n; @@ -3946,11 +3969,11 @@ onig_reduce_nested_quantifier(Node* pnode, Node* cnode)      break;    case RQ_A:      NODE_BODY(pnode) = NODE_BODY(cnode); -    p->lower  = 0;  p->upper = REPEAT_INFINITE;  p->greedy = 1; +    p->lower  = 0;  p->upper = INFINITE_REPEAT;  p->greedy = 1;      break;    case RQ_AQ:      NODE_BODY(pnode) = NODE_BODY(cnode); -    p->lower  = 0;  p->upper = REPEAT_INFINITE;  p->greedy = 0; +    p->lower  = 0;  p->upper = INFINITE_REPEAT;  p->greedy = 0;      break;    case RQ_QQ:      NODE_BODY(pnode) = NODE_BODY(cnode); @@ -3959,13 +3982,13 @@ onig_reduce_nested_quantifier(Node* pnode, Node* cnode)    case RQ_P_QQ:      NODE_BODY(pnode) = cnode;      p->lower  = 0;  p->upper = 1;  p->greedy = 0; -    c->lower  = 1;  c->upper = REPEAT_INFINITE;  c->greedy = 1; +    c->lower  = 1;  c->upper = INFINITE_REPEAT;  c->greedy = 1;      return ;      break;    case RQ_PQ_Q:      NODE_BODY(pnode) = cnode;      p->lower  = 0;  p->upper = 1;  p->greedy = 1; -    c->lower  = 1;  c->upper = REPEAT_INFINITE;  c->greedy = 0; +    c->lower  = 1;  c->upper = INFINITE_REPEAT;  c->greedy = 0;      return ;      break;    case RQ_ASIS: @@ -4158,7 +4181,7 @@ fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env)      if (p == prev) {        if (non_low != 0)          goto invalid; -      up = REPEAT_INFINITE;  /* {n,} : {n,infinite} */ +      up = INFINITE_REPEAT;  /* {n,} : {n,infinite} */      }    }    else { @@ -4178,7 +4201,7 @@ 
fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env)    }    if (c != '}') goto invalid; -  if (!IS_REPEAT_INFINITE(up) && low > up) { +  if (!IS_INFINITE_REPEAT(up) && low > up) {      /* {n,m}+ supported case */      if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL))        return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE; @@ -4959,7 +4982,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)        if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;        tok->type = TK_REPEAT;        tok->u.repeat.lower = 0; -      tok->u.repeat.upper = REPEAT_INFINITE; +      tok->u.repeat.upper = INFINITE_REPEAT;        goto greedy_check;        break; @@ -4967,7 +4990,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)        if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;        tok->type = TK_REPEAT;        tok->u.repeat.lower = 1; -      tok->u.repeat.upper = REPEAT_INFINITE; +      tok->u.repeat.upper = INFINITE_REPEAT;        goto greedy_check;        break; @@ -5358,10 +5381,8 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)              tok->u.backref.ref1 = back_num;            }            else { -            num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs); +            num = name_to_group_numbers(env, prev, name_end, &backs);              if (num <= 0) { -              onig_scan_env_set_error_string(env, -                        ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);                return ONIGERR_UNDEFINED_NAME_REFERENCE;              }              if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { @@ -5514,7 +5535,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)  #endif        tok->type = TK_REPEAT;        tok->u.repeat.lower = 0; -      tok->u.repeat.upper = REPEAT_INFINITE; +      tok->u.repeat.upper = INFINITE_REPEAT;        goto greedy_check;        break; @@ -5525,7 +5546,7 
@@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)  #endif        tok->type = TK_REPEAT;        tok->u.repeat.lower = 1; -      tok->u.repeat.upper = REPEAT_INFINITE; +      tok->u.repeat.upper = INFINITE_REPEAT;        goto greedy_check;        break; @@ -5608,7 +5629,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)                tok->u.call.gnum      = 0;                tok->u.call.name      = p;                PINC; -              if (! PPEEK_IS(')')) return ONIGERR_INVALID_GROUP_NAME; +              if (! PPEEK_IS(')')) return ONIGERR_UNDEFINED_GROUP_OPTION;                tok->u.call.name_end  = p;                break; @@ -6249,6 +6270,7 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env)    env->parse_depth++;    if (env->parse_depth > ParseDepthLimit)      return ONIGERR_PARSE_DEPTH_LIMIT_OVER; +    prev_cc = (CClassNode* )NULL;    r = fetch_token_in_cc(tok, src, end, env);    if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) { @@ -6301,10 +6323,11 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env)      case TK_RAW_BYTE:        /* tok->base != 0 : octal or hexadec. */        if (! 
ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) { +        int i, j;          UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];          UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;          UChar* psave = p; -        int i, base = tok->base; +        int base = tok->base;          buf[0] = tok->u.c;          for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) { @@ -6322,6 +6345,9 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env)            goto err;          } +        /* clear buf tail */ +        for (j = i; j < ONIGENC_CODE_TO_MBC_MAXLEN; j++) buf[j] = '\0'; +          len = enclen(env->enc, buf);          if (i < len) {            r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; @@ -6359,8 +6385,13 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env)      val_entry:        len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);        if (len < 0) { -        r = len; -        goto err; +        if (state != CCS_RANGE || +            ! IS_SYNTAX_BV(env->syntax, +                           ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC) || +            v < 0x100 || ONIGENC_MBC_MAXLEN(env->enc) == 1) { +          r = len; +          goto err; +        }        }        in_type = (len == 1 ? 
CCV_SB : CCV_CODE_POINT);      val_entry2: @@ -6673,7 +6704,7 @@ parse_callout_of_contents(Node** np, int cterm, UChar** src, UChar* end, ScanEnv    }    if (tag_start != tag_end) { -    r = callout_tag_entry(env->reg, tag_start, tag_end, num); +    r = callout_tag_entry(env, env->reg, tag_start, tag_end, num);      if (r != ONIG_NORMAL) return r;    } @@ -6994,7 +7025,7 @@ parse_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* en    }    if (tag_start != tag_end) { -    r = callout_tag_entry(env->reg, tag_start, tag_end, num); +    r = callout_tag_entry(env, env->reg, tag_start, tag_end, num);      if (r != ONIG_NORMAL) return r;    } @@ -7271,10 +7302,8 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,              int num;              int* backs; -            num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs); +            num = name_to_group_numbers(env, prev, name_end, &backs);              if (num <= 0) { -              onig_scan_env_set_error_string(env, -                        ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);                return ONIGERR_UNDEFINED_NAME_REFERENCE;              }              if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) { @@ -7414,6 +7443,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,        }        break; +#ifdef USE_CAPTURE_HISTORY      case '@':        if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {          if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { @@ -7441,6 +7471,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,          return ONIGERR_UNDEFINED_GROUP_OPTION;        }        break; +#endif  #ifdef USE_POSIXLINE_OPTION      case 'p': @@ -7688,7 +7719,7 @@ set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)        if (targetq_num >= 0 && nestq_num < 0) {          if (targetq_num == 1 || targetq_num == 2) { /* * or + */         
   /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */ -          if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) { +          if (! IS_INFINITE_REPEAT(qn->upper) && qn->upper > 1 && qn->greedy) {              qn->upper = (qn->lower == 0 ? 1 : qn->lower);            }          } @@ -7826,14 +7857,18 @@ static int  parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end,            ScanEnv* env, int group_head)  { -  int r, len, group = 0; +  int r, len, group;    Node* qn;    Node** tp; +  unsigned int parse_depth; +  group = 0;    *np = NULL;    if (tok->type == (enum TokenSyms )term)      goto end_of_token; +  parse_depth = env->parse_depth; +    switch (tok->type) {    case TK_ALT:    case TK_EOT: @@ -7914,36 +7949,29 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end,        len = 1;        while (1) {          if (len >= ONIGENC_MBC_MINLEN(env->enc)) { -          if (len == enclen(env->enc, STR_(*np)->s)) {/* should not enclen_end() */ +          if (len == enclen(env->enc, STR_(*np)->s)) {              r = fetch_token(tok, src, end, env); -            NODE_STRING_CLEAR_RAW(*np); -            goto string_end; +            goto tk_raw_byte_end;            }          }          r = fetch_token(tok, src, end, env);          if (r < 0) return r; -        if (r != TK_RAW_BYTE) { -          /* Don't use this, it is wrong for little endian encodings. 
*/ -#ifdef USE_PAD_TO_SHORT_BYTE_CHAR -          int rem; -          if (len < ONIGENC_MBC_MINLEN(env->enc)) { -            rem = ONIGENC_MBC_MINLEN(env->enc) - len; -            (void )node_str_head_pad(STR_(*np), rem, (UChar )0); -            if (len + rem == enclen(env->enc, STR_(*np)->s)) { -              NODE_STRING_CLEAR_RAW(*np); -              goto string_end; -            } -          } -#endif +        if (r != TK_RAW_BYTE)            return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; -        }          r = node_str_cat_char(*np, (UChar )tok->u.c);          if (r < 0) return r;          len++;        } + +    tk_raw_byte_end: +      if (! ONIGENC_IS_VALID_MBC_STRING(env->enc, STR_(*np)->s, STR_(*np)->end)) +        return ONIGERR_INVALID_WIDE_CHAR_VALUE; + +      NODE_STRING_CLEAR_RAW(*np); +      goto string_end;      }      break; @@ -8055,7 +8083,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end,    case TK_ANYCHAR_ANYTIME:      *np = node_new_anychar();      CHECK_NULL_RETURN_MEMERR(*np); -    qn = node_new_quantifier(0, REPEAT_INFINITE, 0); +    qn = node_new_quantifier(0, INFINITE_REPEAT, 0);      CHECK_NULL_RETURN_MEMERR(qn);      NODE_BODY(qn) = *np;      *np = qn; @@ -8158,6 +8186,10 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end,        if (is_invalid_quantifier_target(*tp))          return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID; +      parse_depth++; +      if (parse_depth > ParseDepthLimit) +        return ONIGERR_PARSE_DEPTH_LIMIT_OVER; +        qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,                                 r == TK_INTERVAL);        CHECK_NULL_RETURN_MEMERR(qn); diff --git a/src/regparse.h b/src/regparse.h index b7a2867..231f7b5 100644 --- a/src/regparse.h +++ b/src/regparse.h @@ -66,11 +66,11 @@ enum GimmickType {  #endif  }; -enum BodyEmpty { -  BODY_IS_NOT_EMPTY = 0, -  BODY_IS_EMPTY     = 1, -  BODY_IS_EMPTY_MEM = 2, -  BODY_IS_EMPTY_REC = 3 +enum 
BodyEmptyType { +  BODY_IS_NOT_EMPTY             = 0, +  BODY_IS_EMPTY_POSSIBILITY     = 1, +  BODY_IS_EMPTY_POSSIBILITY_MEM = 2, +  BODY_IS_EMPTY_POSSIBILITY_REC = 3  };  typedef struct { @@ -101,7 +101,7 @@ typedef struct {    int lower;    int upper;    int greedy; -  enum BodyEmpty empty_info; +  enum BodyEmptyType emptiness;    struct _Node* head_exact;    struct _Node* next_head_exact;    int is_refered;     /* include called node. don't eliminate even if {0} */ @@ -252,10 +252,6 @@ typedef struct _Node {  #define NODE_BIT_CALL       NODE_TYPE2BIT(NODE_CALL)  #define NODE_BIT_GIMMICK    NODE_TYPE2BIT(NODE_GIMMICK) -#define NODE_IS_SIMPLE_TYPE(node) \ -  ((NODE_TYPE2BIT(NODE_TYPE(node)) & \ -    (NODE_BIT_STRING | NODE_BIT_CCLASS | NODE_BIT_CTYPE | NODE_BIT_BACKREF)) != 0) -  #define NODE_TYPE(node)             ((node)->u.base.node_type)  #define NODE_SET_TYPE(node, ntype)   (node)->u.base.node_type = (ntype) @@ -314,7 +310,7 @@ typedef struct _Node {  #define NODE_ST_CLEN_FIXED            (1<<2)  #define NODE_ST_MARK1                 (1<<3)  #define NODE_ST_MARK2                 (1<<4) -#define NODE_ST_STOP_BT_SIMPLE_REPEAT (1<<5) +#define NODE_ST_STRICT_REAL_REPEAT    (1<<5)  #define NODE_ST_RECURSION             (1<<6)  #define NODE_ST_CALLED                (1<<7)  #define NODE_ST_ADDR_FIXED            (1<<8) @@ -357,8 +353,8 @@ typedef struct _Node {  #define NODE_IS_SUPER(node)           ((NODE_STATUS(node) & NODE_ST_SUPER)        != 0)  #define NODE_IS_PROHIBIT_RECURSION(node) \      ((NODE_STATUS(node) & NODE_ST_PROHIBIT_RECURSION) != 0) -#define NODE_IS_STOP_BT_SIMPLE_REPEAT(node) \ -    ((NODE_STATUS(node) & NODE_ST_STOP_BT_SIMPLE_REPEAT) != 0) +#define NODE_IS_STRICT_REAL_REPEAT(node) \ +    ((NODE_STATUS(node) & NODE_ST_STRICT_REAL_REPEAT) != 0)  #define NODE_BODY(node)           ((node)->u.base.body)  #define NODE_QUANT_BODY(node)     ((node)->body) diff --git a/src/utf16_be.c b/src/utf16_be.c index 22bf74d..b66d868 100644 --- a/src/utf16_be.c +++ 
b/src/utf16_be.c @@ -2,7 +2,7 @@    utf16_be.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2018  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -103,7 +103,25 @@ utf16be_mbc_enc_len(const UChar* p)  static int  is_valid_mbc_string(const UChar* s, const UChar* end)  { -  return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_UTF16_BE, s, end); +  while (s < end) { +    int len = utf16be_mbc_enc_len(s); +    if (len == 4) { +      if (s + 2 >= end) +        return FALSE; +      if (! UTF16_IS_SURROGATE_SECOND(*(s+2))) +        return FALSE; +    } +    else +      if (UTF16_IS_SURROGATE_SECOND(*s)) +        return FALSE; + +    s += len; +  } + +  if (s != end) +    return FALSE; +  else +    return TRUE;  }  static int @@ -146,7 +164,15 @@ utf16be_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)  static int  utf16be_code_to_mbclen(OnigCodePoint code)  { -  return (code > 0xffff ? 
4 : 2); +  if (code > 0xffff) { +    if (code > 0x10ffff) +      return ONIGERR_INVALID_CODE_POINT_VALUE; +    else +      return 4; +  } +  else { +    return 2; +  }  }  static int @@ -243,7 +269,8 @@ utf16be_left_adjust_char_head(const UChar* start, const UChar* s)      s--;    } -  if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1) +  if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1 && +      UTF16_IS_SURROGATE_FIRST(*(s-2)))      s -= 2;    return (UChar* )s; diff --git a/src/utf16_le.c b/src/utf16_le.c index 4b231c6..cdc74b0 100644 --- a/src/utf16_le.c +++ b/src/utf16_le.c @@ -2,7 +2,7 @@    utf16_le.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2018  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -95,7 +95,15 @@ static const int EncLen_UTF16[] = {  static int  utf16le_code_to_mbclen(OnigCodePoint code)  { -  return (code > 0xffff ? 4 : 2); +  if (code > 0xffff) { +    if (code > 0x10ffff) +      return ONIGERR_INVALID_CODE_POINT_VALUE; +    else +      return 4; +  } +  else { +    return 2; +  }  }  static int @@ -110,7 +118,16 @@ is_valid_mbc_string(const UChar* p, const UChar* end)    const UChar* end1 = end - 1;    while (p < end1) { -    p += utf16le_mbc_enc_len(p); +    int len = utf16le_mbc_enc_len(p); +    if (len == 4) { +      if (p + 3 < end && ! 
UTF16_IS_SURROGATE_SECOND(*(p + 3))) +        return FALSE; +    } +    else +      if (UTF16_IS_SURROGATE_SECOND(*(p + 1))) +        return FALSE; + +    p += len;    }    if (p != end) @@ -252,7 +269,8 @@ utf16le_left_adjust_char_head(const UChar* start, const UChar* s)      s--;    } -  if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1) +  if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1 && +      UTF16_IS_SURROGATE_FIRST(*(s-1)))      s -= 2;    return (UChar* )s; diff --git a/test/test_utf8.c b/test/test_utf8.c index bab6b0d..2338526 100644 --- a/test/test_utf8.c +++ b/test/test_utf8.c @@ -1202,10 +1202,23 @@ extern int main(int argc, char* argv[])    x2("a{3,2}b", "aab", 0, 3);    x2("a{3,2}?", "", 0, 0);     /* == (?:a{3,2})?*/    x2("a{2,3}+a", "aaa", 0, 3); /* == (?:a{2,3})+*/ +  x2("[\\x{0}-\\x{7fffffff}]", "a", 0, 1); +  x2("[\\x{7f}-\\x{7fffffff}]", "\xe5\xae\xb6", 0, 3); + +  n("   \xfd", ""); /* https://bugs.php.net/bug.php?id=77370 */ +  /* can't use \xfc00.. because compiler error: hex escape sequence out of range */ +  n("()0\\xfc00000\\xfc00000\\xfc00000\xfc", ""); /* https://bugs.php.net/bug.php?id=77371 */ +  x2("000||0\xfa", "0", 0, 0); /* https://bugs.php.net/bug.php?id=77381 */ +  e("(?i)000000000000000000000\xf0", "", ONIGERR_INVALID_CODE_POINT_VALUE); /* https://bugs.php.net/bug.php?id=77382 */ +  n("0000\\\xf5", "0"); /* https://bugs.php.net/bug.php?id=77385 */ +  n("(?i)FFF00000000000000000\xfd", ""); /* https://bugs.php.net/bug.php?id=77394 */ +    x2("\\p{Common}", "\xe3\x8b\xbf", 0, 3);   /* U+32FF */    x2("\\p{In_Enclosed_CJK_Letters_and_Months}", "\xe3\x8b\xbf", 0, 3); /* U+32FF */ +  e("\\x{7fffffff}", "", ONIGERR_TOO_BIG_WIDE_CHAR_VALUE); +  e("[\\x{7fffffff}]", "", ONIGERR_INVALID_CODE_POINT_VALUE);    e("\\u040", "@", ONIGERR_INVALID_CODE_POINT_VALUE);    e("(?<abc>\\g<abc>)", "zzzz", ONIGERR_NEVER_ENDING_RECURSION);    e("(?<=(?>abc))", "abc", ONIGERR_INVALID_LOOK_BEHIND_PATTERN); diff --git a/test/testu.c 
b/test/testu.c index 4b053e5..397da95 100644 --- a/test/testu.c +++ b/test/testu.c @@ -116,28 +116,13 @@ static void xx(char* pattern, char* str, int from, int to, int mem, int not)  #else    regex_t* reg; -  OnigCompileInfo ci;    OnigErrorInfo einfo;    uconv(pattern, cpat, ulen(pattern));    uconv(str,     cstr, ulen(str)); -#if 0    r = onig_new(&reg, (UChar* )pattern, (UChar* )(pattern + ulen(pattern)),                 ONIG_OPTION_DEFAULT, ENC, ONIG_SYNTAX_DEFAULT, &einfo); -#else -  ci.num_of_elements = 5; -  ci.pattern_enc = ENC; -  ci.target_enc  = ENC; -  ci.syntax      = ONIG_SYNTAX_DEFAULT; -  ci.option      = ONIG_OPTION_DEFAULT; -  ci.case_fold_flag = ONIGENC_CASE_FOLD_DEFAULT; - -  r = onig_new_deluxe(&reg, (UChar* )pattern, -          (UChar* )(pattern + ulen(pattern)), -          &ci, &einfo); -#endif -    if (r) {      char s[ONIG_MAX_ERROR_MESSAGE_LEN];      onig_error_code_to_str((UChar* )s, r, &einfo); | 
