diff options
author | Jörg Frings-Fürst <debian@jff.email> | 2020-11-08 10:58:30 +0100 |
---|---|---|
committer | Jörg Frings-Fürst <debian@jff.email> | 2020-11-08 10:58:30 +0100 |
commit | fc9ba4264eafbb5a6ec0f3cc4cd2e1964c9b8fcf (patch) | |
tree | 465568f63d306c1de9bb8b934b08ee4f033050ad /src | |
parent | 6b986090d954dbac91bbb3c43ce7c3328c91a780 (diff) |
New upstream version 6.9.6upstream/6.9.6
Diffstat (limited to 'src')
-rw-r--r-- | src/Makefile.am | 25 | ||||
-rw-r--r-- | src/Makefile.windows | 2 | ||||
-rw-r--r-- | src/big5.c | 13 | ||||
-rw-r--r-- | src/config.h.cmake.in | 6 | ||||
-rw-r--r-- | src/euc_jp.c | 29 | ||||
-rw-r--r-- | src/euc_kr.c | 13 | ||||
-rw-r--r-- | src/euc_tw.c | 19 | ||||
-rw-r--r-- | src/gb18030.c | 24 | ||||
-rwxr-xr-x | src/make_property.sh | 2 | ||||
-rwxr-xr-x | src/make_unicode_property.sh | 2 | ||||
-rwxr-xr-x | src/make_unicode_property_data.py | 4 | ||||
-rw-r--r-- | src/onigposix.h | 41 | ||||
-rw-r--r-- | src/oniguruma.h | 15 | ||||
-rw-r--r-- | src/regcomp.c | 310 | ||||
-rw-r--r-- | src/regenc.c | 6 | ||||
-rw-r--r-- | src/regerror.c | 2 | ||||
-rw-r--r-- | src/regexec.c | 592 | ||||
-rw-r--r-- | src/regint.h | 42 | ||||
-rw-r--r-- | src/regparse.c | 315 | ||||
-rw-r--r-- | src/regparse.h | 4 | ||||
-rw-r--r-- | src/regposerr.c | 28 | ||||
-rw-r--r-- | src/regposix.c | 94 | ||||
-rw-r--r-- | src/sjis.c | 14 | ||||
-rw-r--r-- | src/st.c | 8 | ||||
-rw-r--r-- | src/unicode.c | 13 | ||||
-rw-r--r-- | src/utf16_be.c | 2 | ||||
-rw-r--r-- | src/utf16_le.c | 2 | ||||
-rw-r--r-- | src/utf32_be.c | 7 | ||||
-rw-r--r-- | src/utf32_le.c | 7 |
29 files changed, 1106 insertions, 535 deletions
diff --git a/src/Makefile.am b/src/Makefile.am index 36c2222..44a4167 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -11,10 +11,15 @@ posix_headers = onigposix.h if ENABLE_POSIX_API posix_sources = regposix.c regposerr.c include_HEADERS += $(posix_headers) +AM_CFLAGS += -DUSE_POSIX_API else posix_sources = endif +if ENABLE_BINARY_COMPATIBLE_POSIX_API +AM_CFLAGS += -DUSE_BINARY_COMPATIBLE_POSIX_API +endif + lib_LTLIBRARIES = $(libname) @@ -45,13 +50,29 @@ libonig_la_SOURCES = regint.h regparse.h regenc.h st.h \ gb18030.c koi8_r.c cp1251.c \ onig_init.c -libonig_la_LDFLAGS = -version-info $(LTVERSION) - EXTRA_DIST = koi8.c mktable.c \ unicode_fold_data.c unicode_property_data.c \ unicode_property_data_posix.c \ unicode_egcb_data.c unicode_wb_data.c + +libonig_la_LDFLAGS = $(EXTRA_LIBONIG_LDFLAGS) -version-info $(LTVERSION) + +if USE_LIBONIG_DEF_FILE + +libonig_la_LDFLAGS += -Wl,--output-def,$(LIBONIG_DEF_FILE) + +install-data-hook: + echo "$(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL_DATA) $(LIBONIG_DEF_FILE) $(DESTDIR)$(libdir)"; \ + $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=install $(INSTALL_DATA) $(LIBONIG_DEF_FILE) $(DESTDIR)$(libdir) || exit 1 + +uninstall-hook: + echo " $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f '$(DESTDIR)$(libdir)/$(LIBONIG_DEF_FILE)'"; \ + $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=uninstall rm -f "$(DESTDIR)$(libdir)/$(LIBONIG_DEF_FILE)" + +endif + + dll: $(CXX) -shared -Wl,--output-def,libonig.def -o libonig.dll *.o \ $(LIBS) diff --git a/src/Makefile.windows b/src/Makefile.windows index 90ebf28..11d6fd8 100644 --- a/src/Makefile.windows +++ b/src/Makefile.windows @@ -18,7 +18,7 @@ LINKFLAGS = -link -incremental:no -pdb:none INSTALL = install -c CP = copy CC = cl -DEFS = -DHAVE_CONFIG_H +DEFS = -DHAVE_CONFIG_H -DUSE_POSIX_API -DUSE_BINARY_COMPATIBLE_POSIX_API subdirs = @@ -2,7 +2,7 @@ big5.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -58,8 +58,15 @@ static int big5_code_to_mbclen(OnigCodePoint code) { if ((code & (~0xffff)) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; - if ((code & 0xff00) != 0) return 2; - if (EncLen_BIG5[(int )(code & 0xff)] == 1) return 1; + + if ((code & 0xff00) != 0) { + if (EncLen_BIG5[(int )(code >> 8) & 0xff] == 2) + return 2; + } + else { + if (EncLen_BIG5[(int )(code & 0xff)] == 1) + return 1; + } return ONIGERR_INVALID_CODE_POINT_VALUE; } diff --git a/src/config.h.cmake.in b/src/config.h.cmake.in index 60db86c..c213a09 100644 --- a/src/config.h.cmake.in +++ b/src/config.h.cmake.in @@ -43,6 +43,12 @@ /* The size of `long', as computed by sizeof. */ #cmakedefine SIZEOF_LONG ${SIZEOF_LONG} +/* The size of `long long', as computed by sizeof. */ +#cmakedefine SIZEOF_LONG_LONG ${SIZEOF_LONG_LONG} + +/* The size of `void*', as computed by sizeof. */ +#cmakedefine SIZEOF_VOIDP ${SIZEOF_VOIDP} + /* Define if enable CR+NL as line terminator */ #cmakedefine USE_CRNL_AS_LINE_TERMINATOR ${USE_CRNL_AS_LINE_TERMINATOR} diff --git a/src/euc_jp.c b/src/euc_jp.c index 640b3e3..bfe91bf 100644 --- a/src/euc_jp.c +++ b/src/euc_jp.c @@ -2,7 +2,7 @@ euc_jp.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -114,10 +114,20 @@ static int code_to_mbclen(OnigCodePoint code) { if (ONIGENC_IS_CODE_ASCII(code)) return 1; - else if ((code & 0xff0000) != 0) return 3; - else if ((code & 0xff00) != 0) return 2; - else - return ONIGERR_INVALID_CODE_POINT_VALUE; + else if ((code & 0xff0000) != 0) { + if (EncLen_EUCJP[(int )(code >> 16) & 0xff] == 3) + return 3; + } + else if ((code & 0xff00) != 0) { + if (EncLen_EUCJP[(int )(code >> 8) & 0xff] == 2) + return 2; + } + else if (code < 256) { + if (EncLen_EUCJP[(int )(code & 0xff)] == 1) + return 1; + } + + return ONIGERR_INVALID_CODE_POINT_VALUE; } static int @@ -125,8 +135,13 @@ code_to_mbc(OnigCodePoint code, UChar *buf) { UChar *p = buf; - if ((code & 0xff0000) != 0) *p++ = (UChar )(((code >> 16) & 0xff)); - if ((code & 0xff00) != 0) *p++ = (UChar )(((code >> 8) & 0xff)); + if ((code & 0xff0000) != 0) { + *p++ = (UChar )(((code >> 16) & 0xff)); + *p++ = (UChar )(((code >> 8) & 0xff)); + } + else if ((code & 0xff00) != 0) + *p++ = (UChar )(((code >> 8) & 0xff)); + *p++ = (UChar )(code & 0xff); #if 1 diff --git a/src/euc_kr.c b/src/euc_kr.c index 7fa50af..b0e9fbf 100644 --- a/src/euc_kr.c +++ b/src/euc_kr.c @@ -2,7 +2,7 @@ euc_kr.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -58,8 +58,15 @@ static int euckr_code_to_mbclen(OnigCodePoint code) { if ((code & (~0xffff)) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; - if ((code & 0xff00) != 0) return 2; - if (EncLen_EUCKR[(int )(code & 0xff)] == 1) return 1; + + if ((code & 0xff00) != 0) { + if (EncLen_EUCKR[(int )(code >> 8) & 0xff] == 2) + return 2; + } + else { + if (EncLen_EUCKR[(int )(code & 0xff)] == 1) + return 1; + } return ONIGERR_INVALID_CODE_POINT_VALUE; } diff --git a/src/euc_tw.c b/src/euc_tw.c index 8e72b97..99dc5ec 100644 --- a/src/euc_tw.c +++ b/src/euc_tw.c @@ -2,7 +2,7 @@ euc_tw.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -57,15 +57,22 @@ euctw_mbc_enc_len(const UChar* p) static int euctw_code_to_mbclen(OnigCodePoint code) { - if ((code & 0xff000000) != 0) return 4; - else if ((code & 0xff0000) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; - else if ((code & 0xff00) != 0) return 2; + if ((code & 0xff000000) != 0) { + if (EncLen_EUCTW[(int )(code >> 24) & 0xff] == 4) + return 4; + } + else if ((code & 0xff0000) != 0) + return ONIGERR_INVALID_CODE_POINT_VALUE; + else if ((code & 0xff00) != 0) { + if (EncLen_EUCTW[(int )(code >> 8) & 0xff] == 2) + return 2; + } else { if (EncLen_EUCTW[(int )(code & 0xff)] == 1) return 1; - - return ONIGERR_INVALID_CODE_POINT_VALUE; } + + return ONIGERR_INVALID_CODE_POINT_VALUE; } static int diff --git a/src/gb18030.c b/src/gb18030.c index 1385a7f..7409d3e 100644 --- a/src/gb18030.c +++ b/src/gb18030.c @@ -89,15 +89,25 @@ gb18030_mbc_enc_len(const UChar* p) static int gb18030_code_to_mbclen(OnigCodePoint code) { - if ((code & 0xff000000) != 0) return 4; - else if ((code & 0xff0000) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; - else if ((code & 0xff00) != 0) return 2; + if ((code & 0xff000000) != 0) { + if (GB18030_MAP[(int )(code >> 24) & 0xff] == CM) + if (GB18030_MAP[(int )(code >> 16) & 0xff] == C4) + return 4; + } + else if ((code & 0xff0000) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; + else if ((code & 0xff00) != 0) { + if (GB18030_MAP[(int )(code >> 8) & 0xff] == CM) { + char c = GB18030_MAP[(int )code & 0xff]; + if (c == CM || c == C2) + return 2; + } + } else { - if (GB18030_MAP[(int )(code & 0xff)] == CM) - return ONIGERR_INVALID_CODE_POINT_VALUE; - - return 1; + if (GB18030_MAP[(int )(code & 0xff)] != CM) + return 1; } + + return ONIGERR_INVALID_CODE_POINT_VALUE; } static int diff --git a/src/make_property.sh b/src/make_property.sh index e5f1244..1c5e0f5 100755 --- a/src/make_property.sh +++ b/src/make_property.sh @@ -1,7 +1,7 @@ #!/bin/sh GPERF=gperf -SED=gsed +SED=sed TMP1=gperf1.tmp TMP2=gperf2.tmp diff --git a/src/make_unicode_property.sh b/src/make_unicode_property.sh index 5129376..ff7dc62 100755 --- a/src/make_unicode_property.sh +++ b/src/make_unicode_property.sh @@ -1,7 +1,7 @@ #!/bin/sh GPERF=gperf -SED=gsed +SED=sed NAME=unicode_property_data TMP1=gperf1.tmp diff --git a/src/make_unicode_property_data.py b/src/make_unicode_property_data.py index 285c462..d1b3377 100755 --- a/src/make_unicode_property_data.py +++ b/src/make_unicode_property_data.py @@ -1,7 +1,7 @@ #!/usr/bin/python # -*- coding: utf-8 -*- # make_unicode_property_data.py -# Copyright (c) 2016-2019 K.Kosako +# Copyright (c) 2016-2020 K.Kosako import sys import re @@ -405,7 +405,7 @@ def set_max_prop_name(name): def entry_prop_name(name, index): set_max_prop_name(name) if OUTPUT_LIST_MODE and index >= len(POSIX_LIST): - print >> UPF, "%3d: %s" % (index, name) + print >> UPF, "%s" % (name) def entry_and_print_prop_and_index(name, index): entry_prop_name(name, index) diff --git a/src/onigposix.h b/src/onigposix.h index 37e09ea..3514f80 100644 --- a/src/onigposix.h +++ b/src/onigposix.h @@ -74,19 +74,19 @@ extern "C" { #define REG_POSIX_ENCODING_UTF16_LE 5 -typedef int regoff_t; +typedef int onig_posix_regoff_t; typedef struct { - regoff_t rm_so; - regoff_t rm_eo; -} regmatch_t; + onig_posix_regoff_t rm_so; + onig_posix_regoff_t rm_eo; +} onig_posix_regmatch_t; /* POSIX regex_t */ typedef struct { void* onig; /* Oniguruma regex_t* */ size_t re_nsub; int comp_options; -} regex_t; +} onig_posix_regex_t; #ifndef P_ @@ -160,16 +160,31 @@ ONIG_EXTERN int onig_end P_((void)); #endif /* ONIGURUMA_H */ -ONIG_EXTERN int regcomp P_((regex_t* reg, const char* pat, int options)); -ONIG_EXTERN int regexec P_((regex_t* reg, const char* str, size_t nmatch, regmatch_t* matches, int options)); -ONIG_EXTERN void regfree P_((regex_t* reg)); -ONIG_EXTERN size_t regerror P_((int code, const regex_t* reg, char* buf, size_t size)); +ONIG_EXTERN int onig_posix_regcomp P_((onig_posix_regex_t* reg, const char* pat, int options)); +ONIG_EXTERN int onig_posix_regexec P_((onig_posix_regex_t* reg, const char* str, size_t nmatch, onig_posix_regmatch_t* matches, int options)); +ONIG_EXTERN void onig_posix_regfree P_((onig_posix_regex_t* reg)); +ONIG_EXTERN size_t onig_posix_regerror P_((int code, const onig_posix_regex_t* reg, char* buf, size_t size)); /* extended API */ -ONIG_EXTERN void reg_set_encoding P_((int enc)); -ONIG_EXTERN int reg_name_to_group_numbers P_((regex_t* reg, const unsigned char* name, const unsigned char* name_end, int** nums)); -ONIG_EXTERN int reg_foreach_name P_((regex_t* reg, int (*func)(const unsigned char*, const unsigned char*,int,int*,regex_t*,void*), void* arg)); -ONIG_EXTERN int reg_number_of_names P_((regex_t* reg)); +ONIG_EXTERN void onig_posix_reg_set_encoding P_((int enc)); +ONIG_EXTERN int onig_posix_reg_name_to_group_numbers P_((onig_posix_regex_t* reg, const unsigned char* name, const unsigned char* name_end, int** nums)); +ONIG_EXTERN int onig_posix_reg_foreach_name P_((onig_posix_regex_t* reg, int (*func)(const unsigned char*, const unsigned char*,int,int*,onig_posix_regex_t*,void*), void* arg)); +ONIG_EXTERN int onig_posix_reg_number_of_names P_((onig_posix_regex_t* reg)); + + +/* aliases */ +#define regex_t onig_posix_regex_t +#define regmatch_t onig_posix_regmatch_t +#define regoff_t onig_posix_regoff_t + +#define regcomp onig_posix_regcomp +#define regexec onig_posix_regexec +#define regfree onig_posix_regfree +#define regerror onig_posix_regerror +#define reg_set_encoding onig_posix_reg_set_encoding +#define reg_name_to_group_numbers onig_posix_reg_name_to_group_numbers +#define reg_foreach_name onig_posix_reg_foreach_name +#define reg_number_of_names onig_posix_reg_number_of_names #ifdef __cplusplus } diff --git a/src/oniguruma.h b/src/oniguruma.h index 15f6ef0..d983fc9 100644 --- a/src/oniguruma.h +++ b/src/oniguruma.h @@ -36,9 +36,9 @@ extern "C" { #define ONIGURUMA #define ONIGURUMA_VERSION_MAJOR 6 #define ONIGURUMA_VERSION_MINOR 9 -#define ONIGURUMA_VERSION_TEENY 5 +#define ONIGURUMA_VERSION_TEENY 6 -#define ONIGURUMA_VERSION_INT 60905 +#define ONIGURUMA_VERSION_INT 60906 #ifndef P_ #if defined(__STDC__) || defined(_WIN32) @@ -395,8 +395,12 @@ typedef unsigned int OnigOptionType; #define ONIG_OPTION_POSIX_IS_ASCII (ONIG_OPTION_SPACE_IS_ASCII << 1) #define ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER (ONIG_OPTION_POSIX_IS_ASCII << 1) #define ONIG_OPTION_TEXT_SEGMENT_WORD (ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER << 1) +/* options (search time) */ +#define ONIG_OPTION_NOT_BEGIN_STRING (ONIG_OPTION_TEXT_SEGMENT_WORD << 1) +#define ONIG_OPTION_NOT_END_STRING (ONIG_OPTION_NOT_BEGIN_STRING << 1) +#define ONIG_OPTION_NOT_BEGIN_POSITION (ONIG_OPTION_NOT_END_STRING << 1) -#define ONIG_OPTION_MAXBIT ONIG_OPTION_TEXT_SEGMENT_WORD /* limit */ +#define ONIG_OPTION_MAXBIT ONIG_OPTION_NOT_BEGIN_POSITION #define ONIG_OPTION_ON(options,regopt) ((options) |= (regopt)) #define ONIG_OPTION_OFF(options,regopt) ((options) &= ~(regopt)) @@ -561,6 +565,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; #define ONIGERR_PARSE_DEPTH_LIMIT_OVER -16 #define ONIGERR_RETRY_LIMIT_IN_MATCH_OVER -17 #define ONIGERR_RETRY_LIMIT_IN_SEARCH_OVER -18 +#define ONIGERR_SUBEXP_CALL_LIMIT_IN_SEARCH_OVER -19 #define ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED -21 #define ONIGERR_SPECIFIED_ENCODING_CANT_CONVERT_TO_WIDE_CHAR -22 #define ONIGERR_FAIL_TO_INITIALIZE -23 @@ -919,6 +924,10 @@ int onig_set_capture_num_limit P_((int num)); ONIG_EXTERN int onig_set_parse_depth_limit P_((unsigned int depth)); ONIG_EXTERN +unsigned long onig_get_subexp_call_limit_in_search P_((void)); +ONIG_EXTERN +int onig_set_subexp_call_limit_in_search P_((unsigned long n)); +ONIG_EXTERN int onig_get_subexp_call_max_nest_level P_((void)); ONIG_EXTERN int onig_set_subexp_call_max_nest_level P_((int level)); diff --git a/src/regcomp.c b/src/regcomp.c index 4d5b78f..dd2b328 100644 --- a/src/regcomp.c +++ b/src/regcomp.c @@ -133,6 +133,7 @@ ops_init(regex_t* reg, int init_alloc_size) size = sizeof(Operation) * init_alloc_size; p = (Operation* )xrealloc(reg->ops, size); CHECK_NULL_RETURN_MEMERR(p); + reg->ops = p; #ifdef USE_DIRECT_THREADED_CODE { enum OpCode* cp; @@ -144,13 +145,12 @@ ops_init(regex_t* reg, int init_alloc_size) #endif } else { - p = (Operation* )0; + reg->ops = (Operation* )0; #ifdef USE_DIRECT_THREADED_CODE reg->ocs = (enum OpCode* )0; #endif } - reg->ops = p; reg->ops_curr = 0; /* !!! not yet done ops_new() */ reg->ops_alloc = init_alloc_size; reg->ops_used = 0; @@ -176,6 +176,7 @@ ops_expand(regex_t* reg, int n) size = sizeof(Operation) * n; p = (Operation* )xrealloc(reg->ops, size); CHECK_NULL_RETURN_MEMERR(p); + reg->ops = p; #ifdef USE_DIRECT_THREADED_CODE size = sizeof(enum OpCode) * n; @@ -184,7 +185,6 @@ ops_expand(regex_t* reg, int n) reg->ocs = cp; #endif - reg->ops = p; reg->ops_alloc = n; if (reg->ops_used == 0) reg->ops_curr = 0; @@ -265,10 +265,12 @@ ops_free(regex_t* reg) case OP_BACKREF1: case OP_BACKREF2: case OP_BACKREF_N: case OP_BACKREF_N_IC: break; case OP_BACKREF_MULTI: case OP_BACKREF_MULTI_IC: + case OP_BACKREF_CHECK: +#ifdef USE_BACKREF_WITH_LEVEL case OP_BACKREF_WITH_LEVEL: case OP_BACKREF_WITH_LEVEL_IC: - case OP_BACKREF_CHECK: case OP_BACKREF_CHECK_WITH_LEVEL: +#endif if (op->backref_general.num != 1) xfree(op->backref_general.ns); break; @@ -631,7 +633,7 @@ mmcl_add(MinMaxCharLen* to, MinMaxCharLen* add) to->min = distance_add(to->min, add->min); to->max = distance_add(to->max, add->max); - to->min_is_sure = add->min_is_sure != 0 && to->min_is_sure != 0; + to->min_is_sure = add->min_is_sure != FALSE && to->min_is_sure != FALSE; } static void @@ -656,8 +658,11 @@ static void mmcl_alt_merge(MinMaxCharLen* to, MinMaxCharLen* alt) { if (to->min > alt->min) { - to->min = alt->min; - if (alt->min_is_sure != 0) + to->min = alt->min; + to->min_is_sure = alt->min_is_sure; + } + else if (to->min == alt->min) { + if (alt->min_is_sure != FALSE) to->min_is_sure = TRUE; } @@ -840,7 +845,7 @@ node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env, en->min_char_len = ci->min; en->max_char_len = ci->max; NODE_STATUS_ADD(node, FIXED_CLEN); - if (ci->min_is_sure != 0) + if (ci->min_is_sure != FALSE) NODE_STATUS_ADD(node, FIXED_CLEN_MIN_SURE); } } @@ -882,15 +887,15 @@ node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env, } break; - case NODE_ANCHOR: + case NODE_GIMMICK: mmcl_set(ci, 0); - /* can't optimize look-behind if anchor exists. */ - ci->min_is_sure = FALSE; break; - case NODE_GIMMICK: + case NODE_ANCHOR: zero: mmcl_set(ci, 0); + /* can't optimize look-behind if anchor exists. */ + ci->min_is_sure = FALSE; break; case NODE_BACKREF: @@ -1082,6 +1087,9 @@ compile_call(CallNode* node, regex_t* reg, ScanEnv* env) if (r != 0) return r; COP(reg)->call.addr = 0; /* dummy addr. */ +#ifdef ONIG_DEBUG_MATCH_COUNTER + COP(reg)->call.called_mem = node->called_gnum; +#endif offset = COP_CURR_OFFSET_BYTES(reg, call.addr); r = unset_addr_list_add(env->unset_addr_list, offset, NODE_CALL_BODY(node)); @@ -1822,7 +1830,6 @@ compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) COP(reg)->memory_end.num = node->m.regnum; if (NODE_IS_CALLED(node)) { - if (r != 0) return r; r = add_op(reg, OP_RETURN); } #else @@ -2764,7 +2771,7 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env) static int make_named_capture_number_map(Node** plink, GroupNumMap* map, int* counter) { - int r = 0; + int r; Node* node = *plink; switch (NODE_TYPE(node)) { @@ -2772,17 +2779,17 @@ make_named_capture_number_map(Node** plink, GroupNumMap* map, int* counter) case NODE_ALT: do { r = make_named_capture_number_map(&(NODE_CAR(node)), map, counter); - } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); + } while (r >= 0 && IS_NOT_NULL(node = NODE_CDR(node))); + if (r < 0) return r; break; case NODE_QUANT: { Node** ptarget = &(NODE_BODY(node)); - Node* old = *ptarget; r = make_named_capture_number_map(ptarget, map, counter); - if (r != 0) return r; - if (*ptarget != old && NODE_TYPE(*ptarget) == NODE_QUANT) { - r = onig_reduce_nested_quantifier(node); + if (r < 0) return r; + if (r == 1 && NODE_TYPE(*ptarget) == NODE_QUANT) { + return onig_reduce_nested_quantifier(node); } } break; @@ -2796,41 +2803,48 @@ make_named_capture_number_map(Node** plink, GroupNumMap* map, int* counter) map[en->m.regnum].new_val = *counter; en->m.regnum = *counter; r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter); + if (r < 0) return r; } else { *plink = NODE_BODY(node); NODE_BODY(node) = NULL_NODE; onig_node_free(node); r = make_named_capture_number_map(plink, map, counter); + if (r < 0) return r; + return 1; } } else if (en->type == BAG_IF_ELSE) { r = make_named_capture_number_map(&(NODE_BAG_BODY(en)), map, counter); - if (r != 0) return r; + if (r < 0) return r; if (IS_NOT_NULL(en->te.Then)) { r = make_named_capture_number_map(&(en->te.Then), map, counter); - if (r != 0) return r; + if (r < 0) return r; } if (IS_NOT_NULL(en->te.Else)) { r = make_named_capture_number_map(&(en->te.Else), map, counter); - if (r != 0) return r; + if (r < 0) return r; } } - else + else { r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter); + if (r < 0) return r; + } } break; case NODE_ANCHOR: - if (IS_NOT_NULL(NODE_BODY(node))) + if (IS_NOT_NULL(NODE_BODY(node))) { r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter); + if (r < 0) return r; + } break; default: break; } - return r; + return 0; } static int @@ -2982,7 +2996,7 @@ disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env) } counter = 0; r = make_named_capture_number_map(root, map, &counter); - if (r != 0) return r; + if (r < 0) return r; r = renumber_backref_traverse(*root, map); if (r != 0) return r; @@ -3546,7 +3560,9 @@ check_node_in_look_behind(Node* node, int not, int* used) if (r != 0) break; if (en->type == BAG_MEMORY) { - if (NODE_IS_BACKREF(node) || NODE_IS_CALLED(node)) *used = TRUE; + if (NODE_IS_BACKREF(node) || NODE_IS_CALLED(node) + || NODE_IS_REFERENCED(node)) + *used = TRUE; } else if (en->type == BAG_IF_ELSE) { if (IS_NOT_NULL(en->te.Then)) { @@ -3978,6 +3994,7 @@ set_empty_repeat_node_trav(Node* node, Node* empty, ScanEnv* env) { BagNode* en = BAG_(node); + r = 0; if (en->type == BAG_MEMORY) { if (NODE_IS_BACKREF(node)) { if (IS_NOT_NULL(empty)) @@ -4484,7 +4501,7 @@ remove_from_list(Node* prev, Node* a) } static int -reduce_string_list(Node* node) +reduce_string_list(Node* node, OnigEncoding enc) { int r = 0; @@ -4515,43 +4532,70 @@ reduce_string_list(Node* node) } } else { - prev = NULL_NODE; + if (IS_NOT_NULL(prev)) { +#ifdef USE_CHECK_VALIDITY_OF_STRING_IN_TREE + StrNode* sn = STR_(prev); + if (! ONIGENC_IS_VALID_MBC_STRING(enc, sn->s, sn->end)) + return ONIGERR_INVALID_WIDE_CHAR_VALUE; +#endif + prev = NULL_NODE; + } + r = reduce_string_list(curr, enc); + if (r != 0) return r; prev_node = node; } node = next_node; } while (r == 0 && IS_NOT_NULL(node)); + +#ifdef USE_CHECK_VALIDITY_OF_STRING_IN_TREE + if (IS_NOT_NULL(prev)) { + StrNode* sn = STR_(prev); + if (! ONIGENC_IS_VALID_MBC_STRING(enc, sn->s, sn->end)) + return ONIGERR_INVALID_WIDE_CHAR_VALUE; + } +#endif } break; case NODE_ALT: do { - r = reduce_string_list(NODE_CAR(node)); + r = reduce_string_list(NODE_CAR(node), enc); } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); break; +#ifdef USE_CHECK_VALIDITY_OF_STRING_IN_TREE + case NODE_STRING: + { + StrNode* sn = STR_(node); + if (! ONIGENC_IS_VALID_MBC_STRING(enc, sn->s, sn->end)) + return ONIGERR_INVALID_WIDE_CHAR_VALUE; + } + break; +#endif + case NODE_ANCHOR: if (IS_NULL(NODE_BODY(node))) break; /* fall */ case NODE_QUANT: - r = reduce_string_list(NODE_BODY(node)); + r = reduce_string_list(NODE_BODY(node), enc); break; case NODE_BAG: { BagNode* en = BAG_(node); - r = reduce_string_list(NODE_BODY(node)); + r = reduce_string_list(NODE_BODY(node), enc); if (r != 0) return r; if (en->type == BAG_IF_ELSE) { if (IS_NOT_NULL(en->te.Then)) { - r = reduce_string_list(en->te.Then); + r = reduce_string_list(en->te.Then, enc); if (r != 0) return r; } if (IS_NOT_NULL(en->te.Else)) { - r = reduce_string_list(en->te.Else); + r = reduce_string_list(en->te.Else, enc); if (r != 0) return r; } } @@ -4723,7 +4767,7 @@ tune_look_behind(Node* node, regex_t* reg, int state, ScanEnv* env) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; } - if (ci.min == 0 && ci.min_is_sure != 0 && used == FALSE) { + if (ci.min == 0 && ci.min_is_sure != FALSE && used == FALSE) { if (an->type == ANCR_LOOK_BEHIND_NOT) r = onig_node_reset_fail(node); else @@ -4779,18 +4823,23 @@ tune_look_behind(Node* node, regex_t* reg, int state, ScanEnv* env) static int tune_next(Node* node, Node* next_node, regex_t* reg) { + int called; NodeType type; + called = FALSE; + retry: type = NODE_TYPE(node); if (type == NODE_QUANT) { QuantNode* qn = QUANT_(node); if (qn->greedy && IS_INFINITE_REPEAT(qn->upper)) { #ifdef USE_QUANT_PEEK_NEXT - Node* n = get_tree_head_literal(next_node, 1, reg); - /* '\0': for UTF-16BE etc... */ - if (IS_NOT_NULL(n) && STR_(n)->s[0] != '\0') { - qn->next_head_exact = n; + if (called == FALSE) { + Node* n = get_tree_head_literal(next_node, 1, reg); + /* '\0': for UTF-16BE etc... */ + if (IS_NOT_NULL(n) && STR_(n)->s[0] != '\0') { + qn->next_head_exact = n; + } } #endif /* automatic posseivation a*b ==> (?>a*)b */ @@ -4815,6 +4864,8 @@ tune_next(Node* node, Node* next_node, regex_t* reg) else if (type == NODE_BAG) { BagNode* en = BAG_(node); if (en->type == BAG_MEMORY) { + if (NODE_IS_CALLED(node)) + called = TRUE; node = NODE_BODY(node); goto retry; } @@ -4999,17 +5050,18 @@ unravel_cf_look_behind_add(Node** rlist, Node** rsn, { int r, i, found; - found = 0; + found = FALSE; for (i = 0; i < n; i++) { OnigCaseFoldCodeItem* item = items + i; if (item->byte_len == one_len) { if (item->code_len == 1) { - found = 1; + found = TRUE; + break; } } } - if (found == 0) { + if (found == FALSE) { r = unravel_cf_string_add(rlist, rsn, s, s + one_len, 0 /* flag */); } else { @@ -5073,6 +5125,7 @@ unravel_case_fold_string(Node* node, regex_t* reg, int state) one_len = (OnigLen )enclen(enc, p); if (n == 0) { q = p + one_len; + if (q > end) q = end; r = unravel_cf_string_add(&list, &sn, p, q, 0 /* flag */); if (r != 0) goto err; } @@ -5221,12 +5274,12 @@ quantifiers_memory_node_info(Node* node) __inline #endif static int -tune_call_node_call(CallNode* cn, ScanEnv* env, int state) +check_call_reference(CallNode* cn, ScanEnv* env, int state) { MemEnv* mem_env = SCANENV_MEMENV(env); if (cn->by_number != 0) { - int gnum = cn->group_num; + int gnum = cn->called_gnum; if (env->num_named > 0 && IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && @@ -5241,12 +5294,14 @@ tune_call_node_call(CallNode* cn, ScanEnv* env, int state) } set_call_attr: - NODE_CALL_BODY(cn) = mem_env[cn->group_num].mem_node; + NODE_CALL_BODY(cn) = mem_env[cn->called_gnum].mem_node; if (IS_NULL(NODE_CALL_BODY(cn))) { onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end); return ONIGERR_UNDEFINED_NAME_REFERENCE; } + + NODE_STATUS_ADD(NODE_CALL_BODY(cn), REFERENCED); } else { int *refs; @@ -5263,7 +5318,7 @@ tune_call_node_call(CallNode* cn, ScanEnv* env, int state) return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL; } else { - cn->group_num = refs[0]; + cn->called_gnum = refs[0]; goto set_call_attr; } } @@ -5396,7 +5451,7 @@ tune_call(Node* node, ScanEnv* env, int state) CALL_(node)->entry_count--; } - r = tune_call_node_call(CALL_(node), env, state); + r = check_call_reference(CALL_(node), env, state); break; default: @@ -6187,8 +6242,10 @@ concat_opt_exact(OptStr* to, OptStr* add, OnigEncoding enc) r = 1; /* 1:full */ break; } - for (j = 0; j < len && p < end; j++) + for (j = 0; j < len && p < end; j++) { + /* coverity[overrun-local] */ to->s[i++] = *p++; + } } to->len = i; @@ -6210,8 +6267,10 @@ concat_opt_exact_str(OptStr* to, UChar* s, UChar* end, OnigEncoding enc) for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) { len = enclen(enc, p); if (i + len > OPT_EXACT_MAXLEN) break; - for (j = 0; j < len && p < end; j++) + for (j = 0; j < len && p < end; j++) { + /* coverity[overrun-local] */ to->s[i++] = *p++; + } } to->len = i; @@ -7229,19 +7288,10 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, else reg->ops_used = 0; - reg->string_pool = 0; - reg->string_pool_end = 0; - reg->num_mem = 0; - reg->num_repeat = 0; - reg->num_empty_check = 0; - reg->repeat_range_alloc = 0; - reg->repeat_range = (RepeatRange* )NULL; - reg->empty_status_mem = 0; - r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env); if (r != 0) goto err; - r = reduce_string_list(root); + r = reduce_string_list(root, reg->enc); if (r != 0) goto err; /* mixed use named group and no-named group */ @@ -7653,6 +7703,134 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) return onig_is_code_in_cc_len(len, code, cc); } +typedef struct { + int prec_read; + int look_behind; + int backref_with_level; + int call; +} SlowElementCount; + +static int +node_detect_can_be_slow(Node* node, SlowElementCount* ct) +{ + int r; + + r = 0; + switch (NODE_TYPE(node)) { + case NODE_LIST: + case NODE_ALT: + do { + r = node_detect_can_be_slow(NODE_CAR(node), ct); + if (r != 0) return r; + } while (IS_NOT_NULL(node = NODE_CDR(node))); + break; + + case NODE_QUANT: + r = node_detect_can_be_slow(NODE_BODY(node), ct); + break; + + case NODE_ANCHOR: + switch (ANCHOR_(node)->type) { + case ANCR_PREC_READ: + case ANCR_PREC_READ_NOT: + ct->prec_read++; + break; + case ANCR_LOOK_BEHIND: + case ANCR_LOOK_BEHIND_NOT: + ct->look_behind++; + break; + default: + break; + } + + if (ANCHOR_HAS_BODY(ANCHOR_(node))) + r = node_detect_can_be_slow(NODE_BODY(node), ct); + break; + + case NODE_BAG: + { + BagNode* en = BAG_(node); + + r = node_detect_can_be_slow(NODE_BODY(node), ct); + if (r != 0) return r; + + if (en->type == BAG_IF_ELSE) { + if (IS_NOT_NULL(en->te.Then)) { + r = node_detect_can_be_slow(en->te.Then, ct); + if (r != 0) return r; + } + if (IS_NOT_NULL(en->te.Else)) { + r = node_detect_can_be_slow(en->te.Else, ct); + if (r != 0) return r; + } + } + } + break; + +#ifdef USE_BACKREF_WITH_LEVEL + case NODE_BACKREF: + if (NODE_IS_NEST_LEVEL(node)) + ct->backref_with_level++; + break; +#endif + +#ifdef USE_CALL + case NODE_CALL: + ct->call++; + break; +#endif + + default: + break; + } + + return r; +} + +extern int +onig_detect_can_be_slow_pattern(const UChar* pattern, + const UChar* pattern_end, OnigOptionType option, OnigEncoding enc, + OnigSyntaxType* syntax) +{ + int r; + regex_t* reg; + Node* root; + ScanEnv scan_env; + SlowElementCount count; + + reg = (regex_t* )xmalloc(sizeof(regex_t)); + if (IS_NULL(reg)) return ONIGERR_MEMORY; + + r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax); + if (r != 0) { + xfree(reg); + return r; + } + + root = 0; + r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env); + if (r == 0) { + count.prec_read = 0; + count.look_behind = 0; + count.backref_with_level = 0; + count.call = 0; + + r = node_detect_can_be_slow(root, &count); + if (r == 0) { + int n = count.prec_read + count.look_behind + + count.backref_with_level + count.call; + r = n; + } + } + + if (IS_NOT_NULL(scan_env.mem_env_dynamic)) + xfree(scan_env.mem_env_dynamic); + + onig_node_free(root); + onig_free(reg); + return r; +} + #ifdef ONIG_DEBUG_PARSE @@ -7734,14 +7912,18 @@ print_indent_tree(FILE* f, Node* node, int indent) break; case NODE_CCLASS: +#define CCLASS_MBUF_MAX_OUTPUT_NUM 10 + fprintf(f, "<cclass:%p>", node); if (IS_NCCLASS_NOT(CCLASS_(node))) fputs(" not", f); if (CCLASS_(node)->mbuf) { BBuf* bbuf = CCLASS_(node)->mbuf; - for (i = 0; i < bbuf->used; i++) { + fprintf(f, " mbuf(%u) ", bbuf->used); + for (i = 0; i < bbuf->used && i < CCLASS_MBUF_MAX_OUTPUT_NUM; i++) { if (i > 0) fprintf(f, ","); fprintf(f, "%0x", bbuf->p[i]); } + if (i < bbuf->used) fprintf(f, "..."); } break; @@ -7822,6 +8004,11 @@ print_indent_tree(FILE* f, Node* node, int indent) if (i > 0) fputs(", ", f); fprintf(f, "%d", p[i]); } +#ifdef USE_BACKREF_WITH_LEVEL + if (NODE_IS_NEST_LEVEL(node)) { + fprintf(f, ", level: %d", br->nest_level); + } +#endif } break; @@ -7830,6 +8017,7 @@ print_indent_tree(FILE* f, Node* node, int indent) { CallNode* cn = CALL_(node); fprintf(f, "<call:%p>", node); + fprintf(f, " num: %d, name", cn->called_gnum); p_string(f, cn->name_end - cn->name, cn->name); } break; @@ -7881,6 +8069,8 @@ print_indent_tree(FILE* f, Node* node, int indent) fprintf(f, "memory:%d", BAG_(node)->m.regnum); if (NODE_IS_CALLED(node)) fprintf(f, ", called"); + else if (NODE_IS_REFERENCED(node)) + fprintf(f, ", referenced"); if (NODE_IS_FIXED_ADDR(node)) fprintf(f, ", fixed-addr"); break; diff --git a/src/regenc.c b/src/regenc.c index dbfbc89..27e4549 100644 --- a/src/regenc.c +++ b/src/regenc.c @@ -263,12 +263,12 @@ onigenc_strlen_null(OnigEncoding enc, const UChar* s) extern int onigenc_str_bytelen_null(OnigEncoding enc, const UChar* s) { - UChar* start = (UChar* )s; - UChar* p = (UChar* )s; + const UChar* start = s; + const UChar* p = s; while (1) { if (*p == '\0') { - UChar* q; + const UChar* q; int len = ONIGENC_MBC_MINLEN(enc); if (len == 1) return (int )(p - start); diff --git a/src/regerror.c b/src/regerror.c index 58bc7fd..dc1c8b6 100644 --- a/src/regerror.c +++ b/src/regerror.c @@ -56,6 +56,8 @@ onig_error_code_to_format(int code) p = "retry-limit-in-match over"; break; case ONIGERR_RETRY_LIMIT_IN_SEARCH_OVER: p = "retry-limit-in-search over"; break; + case ONIGERR_SUBEXP_CALL_LIMIT_IN_SEARCH_OVER: + p = "subexp-call-limit-in-search over"; break; case ONIGERR_TYPE_BUG: p = "undefined type (bug)"; break; case ONIGERR_PARSER_BUG: diff --git a/src/regexec.c b/src/regexec.c index 1b6895d..bb6b474 100644 --- a/src/regexec.c +++ b/src/regexec.c @@ -46,15 +46,15 @@ #define CHECK_INTERRUPT_IN_MATCH -#define STACK_MEM_START(reg, i) \ - (MEM_STATUS_AT((reg)->push_mem_start, (i)) != 0 ? \ - STACK_AT(mem_start_stk[i])->u.mem.pstr : (UChar* )((void* )(mem_start_stk[i]))) +#define STACK_MEM_START(reg, idx) \ + (MEM_STATUS_AT((reg)->push_mem_start, (idx)) != 0 ? \ + STACK_AT(mem_start_stk[idx].i)->u.mem.pstr : mem_start_stk[idx].s) -#define STACK_MEM_END(reg, i) \ - (MEM_STATUS_AT((reg)->push_mem_end, (i)) != 0 ? \ - STACK_AT(mem_end_stk[i])->u.mem.pstr : (UChar* )((void* )(mem_end_stk[i]))) +#define STACK_MEM_END(reg, idx) \ + (MEM_STATUS_AT((reg)->push_mem_end, (idx)) != 0 ? \ + STACK_AT(mem_end_stk[idx].i)->u.mem.pstr : mem_end_stk[idx].s) -static int forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, UChar* range, UChar** low, UChar** high, UChar** low_prev); +static int forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, UChar* range, UChar** low, UChar** high); static int search_in_range(regex_t* reg, const UChar* str, const UChar* end, const UChar* start, const UChar* range, /* match range */ const UChar* data_range, /* subject string range */ OnigRegion* region, OnigOptionType option, OnigMatchParam* mp); @@ -170,6 +170,9 @@ typedef struct { int best_len; /* for ONIG_OPTION_FIND_LONGEST */ UChar* best_s; #endif +#ifdef USE_CALL + unsigned long subexp_call_in_search_counter; +#endif } MatchArg; @@ -1057,8 +1060,6 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) /** stack **/ -#define INVALID_STACK_INDEX -1 - #define STK_ALT_FLAG 0x0001 /* stack type */ @@ -1099,7 +1100,15 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) #define STK_MASK_TO_VOID_TARGET 0x100e #define STK_MASK_MEM_END_OR_MARK 0x8000 /* MEM_END or MEM_END_MARK */ -typedef intptr_t StackIndex; +typedef ptrdiff_t StackIndex; + +#define INVALID_STACK_INDEX ((StackIndex )-1) + +typedef union { + StackIndex i; + UChar* s; +} StkPtrType; + typedef struct _StackType { unsigned int type; @@ -1108,7 +1117,6 @@ typedef struct _StackType { struct { Operation* pcode; /* byte code position */ UChar* pstr; /* string position */ - UChar* pstr_prev; /* previous char position of pstr */ } state; struct { int count; @@ -1119,8 +1127,8 @@ typedef struct _StackType { struct { UChar *pstr; /* start/end position */ /* Following information is set, if this stack type is MEM-START */ - StackIndex prev_start; /* prev. info (for backtrack "(...)*" ) */ - StackIndex prev_end; /* prev. info (for backtrack "(...)*" ) */ + StkPtrType prev_start; /* prev. info (for backtrack "(...)*" ) */ + StkPtrType prev_end; /* prev. info (for backtrack "(...)*" ) */ } mem; struct { UChar *pstr; /* start position */ @@ -1166,8 +1174,8 @@ struct OnigCalloutArgsStruct { MatchArg* msa; StackType* stk_base; StackType* stk; - StackIndex* mem_start_stk; - StackIndex* mem_end_stk; + StkPtrType* mem_start_stk; + StkPtrType* mem_end_stk; }; #endif @@ -1178,7 +1186,7 @@ struct OnigCalloutArgsStruct { #define UPDATE_FOR_STACK_REALLOC do{\ repeat_stk = (StackIndex* )alloc_base;\ empty_check_stk = (StackIndex* )(repeat_stk + reg->num_repeat);\ - mem_start_stk = (StackIndex* )(empty_check_stk + reg->num_empty_check);\ + mem_start_stk = (StkPtrType* )(empty_check_stk + reg->num_empty_check);\ mem_end_stk = mem_start_stk + num_mem + 1;\ } while(0) @@ -1194,7 +1202,7 @@ struct OnigCalloutArgsStruct { #define PTR_NUM_SIZE(reg) (((reg)->num_mem + 1) * 2) #define UPDATE_FOR_STACK_REALLOC do{\ - mem_start_stk = (StackIndex* )alloc_base;\ + mem_start_stk = (StkPtrType* )alloc_base;\ mem_end_stk = mem_start_stk + num_mem + 1;\ } while(0) @@ -1218,8 +1226,12 @@ struct OnigCalloutArgsStruct { #endif #if defined(USE_CALL) +#define SUBEXP_CALL_IN_MATCH_ARG_INIT(msa,mpv) \ + (msa).subexp_call_in_search_counter = 0; + #define POP_CALL else if (stk->type == STK_RETURN) {subexp_call_nest_counter++;} else if (stk->type == STK_CALL_FRAME) {subexp_call_nest_counter--;} #else +#define SUBEXP_CALL_IN_MATCH_ARG_INIT(msa,mpv) #define POP_CALL #endif @@ -1231,6 +1243,7 @@ struct OnigCalloutArgsStruct { (msa).start = (arg_start);\ (msa).match_stack_limit = (mpv)->match_stack_limit;\ RETRY_IN_MATCH_ARG_INIT(msa,mpv)\ + SUBEXP_CALL_IN_MATCH_ARG_INIT(msa,mpv)\ (msa).mp = mpv;\ (msa).best_len = ONIG_MISMATCH;\ (msa).ptr_num = PTR_NUM_SIZE(reg);\ @@ -1243,6 +1256,7 @@ struct OnigCalloutArgsStruct { (msa).start = (arg_start);\ (msa).match_stack_limit = (mpv)->match_stack_limit;\ RETRY_IN_MATCH_ARG_INIT(msa,mpv)\ + SUBEXP_CALL_IN_MATCH_ARG_INIT(msa,mpv)\ (msa).mp = mpv;\ (msa).ptr_num = PTR_NUM_SIZE(reg);\ } while(0) @@ -1258,27 +1272,27 @@ struct OnigCalloutArgsStruct { is_alloca = 0;\ alloc_base = msa->stack_p;\ stk_base = (StackType* )(alloc_base\ - + (sizeof(StackIndex) * msa->ptr_num));\ + + (sizeof(StkPtrType) * msa->ptr_num));\ stk = stk_base;\ stk_end = stk_base + msa->stack_n;\ }\ else if (msa->ptr_num > ALLOCA_PTR_NUM_LIMIT) {\ is_alloca = 0;\ - alloc_base = (char* )xmalloc(sizeof(StackIndex) * msa->ptr_num\ + alloc_base = (char* )xmalloc(sizeof(StkPtrType) * msa->ptr_num\ + sizeof(StackType) * (stack_num));\ CHECK_NULL_RETURN_MEMERR(alloc_base);\ stk_base = (StackType* )(alloc_base\ - + (sizeof(StackIndex) * msa->ptr_num));\ + + (sizeof(StkPtrType) * msa->ptr_num));\ stk = stk_base;\ stk_end = stk_base + (stack_num);\ }\ else {\ is_alloca = 1;\ - alloc_base = (char* )xalloca(sizeof(StackIndex) * msa->ptr_num\ + alloc_base = (char* )xalloca(sizeof(StkPtrType) * msa->ptr_num\ + sizeof(StackType) * (stack_num));\ CHECK_NULL_RETURN_MEMERR(alloc_base);\ stk_base = (StackType* )(alloc_base\ - + (sizeof(StackIndex) * msa->ptr_num));\ + + (sizeof(StkPtrType) * msa->ptr_num));\ stk = stk_base;\ stk_end = stk_base + (stack_num);\ }\ @@ -1288,7 +1302,7 @@ struct OnigCalloutArgsStruct { #define STACK_SAVE(msa,is_alloca,alloc_base) do{\ (msa)->stack_n = (int )(stk_end - stk_base);\ if ((is_alloca) != 0) {\ - size_t size = sizeof(StackIndex) * (msa)->ptr_num\ + size_t size = sizeof(StkPtrType) * (msa)->ptr_num\ + sizeof(StackType) * (msa)->stack_n;\ (msa)->stack_p = xmalloc(size);\ CHECK_NULL_RETURN_MEMERR((msa)->stack_p);\ @@ -1373,6 +1387,24 @@ onig_set_retry_limit_in_search(unsigned long n) #endif } +#ifdef USE_CALL +static unsigned long SubexpCallLimitInSearch = DEFAULT_SUBEXP_CALL_LIMIT_IN_SEARCH; + +extern unsigned long +onig_get_subexp_call_limit_in_search(void) +{ + return SubexpCallLimitInSearch; +} + +extern int +onig_set_subexp_call_limit_in_search(unsigned long n) +{ + SubexpCallLimitInSearch = n; + return 0; +} + +#endif + #ifdef USE_CALLOUT static OnigCalloutFunc DefaultProgressCallout; static OnigCalloutFunc DefaultRetractionCallout; @@ -1637,9 +1669,9 @@ stack_double(int* is_alloca, char** arg_alloc_base, stk = *arg_stk; n = (unsigned int )(stk_end - stk_base); - size = sizeof(StackIndex) * msa->ptr_num + sizeof(StackType) * n; + size = sizeof(StkPtrType) * msa->ptr_num + sizeof(StackType) * n; n *= 2; - new_size = sizeof(StackIndex) * msa->ptr_num + sizeof(StackType) * n; + new_size = sizeof(StkPtrType) * msa->ptr_num + sizeof(StackType) * n; if (*is_alloca != 0) { new_alloc_base = (char* )xmalloc(new_size); if (IS_NULL(new_alloc_base)) { @@ -1669,7 +1701,7 @@ stack_double(int* is_alloca, char** arg_alloc_base, used = (int )(stk - stk_base); *arg_alloc_base = alloc_base; *arg_stk_base = (StackType* )(alloc_base - + (sizeof(StackIndex) * msa->ptr_num)); + + (sizeof(StkPtrType) * msa->ptr_num)); *arg_stk = *arg_stk_base + used; *arg_stk_end = *arg_stk_base + n; return 0; @@ -1694,22 +1726,20 @@ stack_double(int* is_alloca, char** arg_alloc_base, #define IS_TO_VOID_TARGET(stk) (((stk)->type & STK_MASK_TO_VOID_TARGET) != 0) -#define STACK_PUSH(stack_type,pat,s,sprev) do {\ +#define STACK_PUSH(stack_type,pat,s) do {\ STACK_ENSURE(1);\ stk->type = (stack_type);\ stk->u.state.pcode = (pat);\ stk->u.state.pstr = (s);\ - stk->u.state.pstr_prev = (sprev);\ STACK_INC;\ } while(0) -#define STACK_PUSH_WITH_ZID(stack_type,pat,s,sprev,id) do {\ +#define STACK_PUSH_WITH_ZID(stack_type,pat,s,id) do {\ STACK_ENSURE(1);\ stk->type = (stack_type);\ stk->zid = (int )(id);\ stk->u.state.pcode = (pat);\ stk->u.state.pstr = (s);\ - stk->u.state.pstr_prev = (sprev);\ STACK_INC;\ } while(0) @@ -1724,7 +1754,6 @@ stack_double(int* is_alloca, char** arg_alloc_base, stk->type = (stack_type);\ stk->u.state.pcode = (pat);\ stk->u.state.pstr = s;\ - stk->u.state.pstr_prev = sprev;\ STACK_INC;\ } while (0) #else @@ -1735,10 +1764,9 @@ stack_double(int* is_alloca, char** arg_alloc_base, } while (0) #endif -#define STACK_PUSH_ALT(pat,s,sprev) STACK_PUSH(STK_ALT,pat,s,sprev) -#define STACK_PUSH_SUPER_ALT(pat,s,sprev) STACK_PUSH(STK_SUPER_ALT,pat,s,sprev) -#define STACK_PUSH_ALT_WITH_ZID(pat,s,sprev,id) \ - STACK_PUSH_WITH_ZID(STK_ALT,pat,s,sprev,id) +#define STACK_PUSH_ALT(pat,s) STACK_PUSH(STK_ALT,pat,s) +#define STACK_PUSH_SUPER_ALT(pat,s) STACK_PUSH(STK_SUPER_ALT,pat,s) +#define STACK_PUSH_ALT_WITH_ZID(pat,s,id) STACK_PUSH_WITH_ZID(STK_ALT,pat,s,id) #if 0 #define STACK_PUSH_REPEAT(sid, pat) do {\ @@ -1767,8 +1795,8 @@ stack_double(int* is_alloca, char** arg_alloc_base, stk->u.mem.pstr = (s);\ stk->u.mem.prev_start = mem_start_stk[mnum];\ stk->u.mem.prev_end = mem_end_stk[mnum];\ - mem_start_stk[mnum] = GET_STACK_INDEX(stk);\ - mem_end_stk[mnum] = INVALID_STACK_INDEX;\ + mem_start_stk[mnum].i = GET_STACK_INDEX(stk);\ + mem_end_stk[mnum].i = INVALID_STACK_INDEX;\ STACK_INC;\ } while(0) @@ -1779,7 +1807,7 @@ stack_double(int* is_alloca, char** arg_alloc_base, stk->u.mem.pstr = (s);\ stk->u.mem.prev_start = mem_start_stk[mnum];\ stk->u.mem.prev_end = mem_end_stk[mnum];\ - mem_end_stk[mnum] = GET_STACK_INDEX(stk);\ + mem_end_stk[mnum].i = GET_STACK_INDEX(stk);\ STACK_INC;\ } while(0) @@ -1861,12 +1889,11 @@ stack_double(int* is_alloca, char** arg_alloc_base, STACK_INC;\ } while(0) -#define STACK_PUSH_MARK_WITH_POS(sid, s, sprev) do {\ +#define STACK_PUSH_MARK_WITH_POS(sid, s) do {\ STACK_ENSURE(1);\ stk->type = STK_MARK;\ stk->zid = (sid);\ stk->u.val.v = (UChar* )(s);\ - stk->u.val.v2 = (sprev);\ STACK_INC;\ } while(0) @@ -1885,7 +1912,6 @@ stack_double(int* is_alloca, char** arg_alloc_base, stk->zid = (sid);\ stk->u.val.type = (stype);\ stk->u.val.v = (UChar* )(sval);\ - stk->u.val.v2 = sprev;\ STACK_INC;\ } while(0) @@ -1932,7 +1958,6 @@ stack_double(int* is_alloca, char** arg_alloc_base, && k->zid == (sid)) {\ if (level == 0) {\ (sval) = k->u.val.v;\ - sprev = k->u.val.v2;\ break;\ }\ }\ @@ -2135,14 +2160,14 @@ stack_double(int* is_alloca, char** arg_alloc_base, } while(0) #define STACK_MEM_START_GET_PREV_END_ADDR(k /* STK_MEM_START*/, reg, addr) do {\ - if (k->u.mem.prev_end == INVALID_STACK_INDEX) {\ + if (k->u.mem.prev_end.i == INVALID_STACK_INDEX) {\ (addr) = 0;\ }\ else {\ if (MEM_STATUS_AT((reg)->push_mem_end, k->zid))\ - (addr) = STACK_AT(k->u.mem.prev_end)->u.mem.pstr;\ + (addr) = STACK_AT(k->u.mem.prev_end.i)->u.mem.pstr;\ else\ - (addr) = (UChar* )k->u.mem.prev_end;\ + (addr) = k->u.mem.prev_end.s;\ }\ } while (0) @@ -2163,7 +2188,7 @@ stack_double(int* is_alloca, char** arg_alloc_base, if (endp == 0) {\ (isnull) = 0; break;\ }\ - else if (STACK_AT(k->u.mem.prev_start)->u.mem.pstr != endp) {\ + else if (STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr != endp) {\ (isnull) = 0; break;\ }\ else if (endp != s) {\ @@ -2199,7 +2224,7 @@ stack_double(int* is_alloca, char** arg_alloc_base, if (endp == 0) {\ (isnull) = 0; break;\ }\ - else if (STACK_AT(k->u.mem.prev_start)->u.mem.pstr != endp) { \ + else if (STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr != endp) { \ (isnull) = 0; break;\ }\ else if (endp != s) {\ @@ -2362,6 +2387,10 @@ static int string_cmp_ic(OnigEncoding enc, int case_fold_flag, p1++; p2++; } + if (s2 >= end2) { + if (s1 < end1) return 0; + else break; + } } *ps2 = s2; @@ -2390,7 +2419,7 @@ static int string_cmp_ic(OnigEncoding enc, int case_fold_flag, #define ON_STR_END(s) ((s) == end) #define DATA_ENSURE_CHECK1 (s < right_range) #define DATA_ENSURE_CHECK(n) (s + (n) <= right_range) -#define DATA_ENSURE(n) if (s + (n) > right_range) goto fail +#define DATA_ENSURE(n) if (right_range - s < (n)) goto fail #define INIT_RIGHT_RANGE right_range = (UChar* )in_right_range @@ -2632,9 +2661,9 @@ typedef struct { #define BYTECODE_INTERPRETER_START GOTO_OP; #define BYTECODE_INTERPRETER_END -#define CASE_OP(x) L_##x: SOP_IN(OP_##x); sbegin = s; MATCH_DEBUG_OUT(0) +#define CASE_OP(x) L_##x: SOP_IN(OP_##x); MATCH_DEBUG_OUT(0) #define DEFAULT_OP /* L_DEFAULT: */ -#define NEXT_OP sprev = sbegin; JUMP_OP +#define NEXT_OP JUMP_OP #define JUMP_OP GOTO_OP #ifdef USE_DIRECT_THREADED_CODE #define GOTO_OP goto *(p->opaddr) @@ -2648,9 +2677,8 @@ typedef struct { #define BYTECODE_INTERPRETER_START \ while (1) {\ MATCH_DEBUG_OUT(0)\ - sbegin = s;\ switch (p->opcode) { -#define BYTECODE_INTERPRETER_END } sprev = sbegin; } +#define BYTECODE_INTERPRETER_END } } #define CASE_OP(x) case OP_##x: SOP_IN(OP_##x); #define DEFAULT_OP default: #define NEXT_OP break @@ -2718,12 +2746,22 @@ typedef struct { best_len = err_code; goto match_at_end;\ } while(0) +#define MATCH_COUNTER_OUT(title) do {\ + int i;\ + fprintf(DBGFP, "%s (%ld): retry limit: %8lu, subexp_call: %8lu\n", (title), (sstart - str), retry_in_match_counter, msa->subexp_call_in_search_counter); \ + fprintf(DBGFP, " ");\ + for (i = 0; i < MAX_SUBEXP_CALL_COUNTERS; i++) {\ + fprintf(DBGFP, " %6lu", subexp_call_counters[i]);\ + }\ + fprintf(DBGFP, "\n");\ + fflush(DBGFP);\ +} while (0) + /* match data(str - end) from position (sstart). */ -/* if sstart == str then set sprev to NULL. */ static int match_at(regex_t* reg, const UChar* str, const UChar* end, - const UChar* in_right_range, const UChar* sstart, UChar* sprev, + const UChar* in_right_range, const UChar* sstart, MatchArg* msa) { @@ -2782,10 +2820,14 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_BACKREF_N_IC, &&L_BACKREF_MULTI, &&L_BACKREF_MULTI_IC, +#ifdef USE_BACKREF_WITH_LEVEL &&L_BACKREF_WITH_LEVEL, &&L_BACKREF_WITH_LEVEL_IC, +#endif &&L_BACKREF_CHECK, +#ifdef USE_BACKREF_WITH_LEVEL &&L_BACKREF_CHECK_WITH_LEVEL, +#endif &&L_MEM_START, &&L_MEM_START_PUSH, &&L_MEM_END_PUSH, @@ -2838,13 +2880,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, LengthType tlen, tlen2; MemNumType mem; RelAddrType addr; - UChar *s, *ps, *sbegin; + UChar *s, *ps; UChar *right_range; int is_alloca; char *alloc_base; StackType *stk_base, *stk, *stk_end; StackType *stkp; /* used as any purpose. */ - StackIndex *mem_start_stk, *mem_end_stk; + StkPtrType *mem_start_stk, *mem_end_stk; UChar* keep; #ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR @@ -2858,6 +2900,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #ifdef USE_CALLOUT int of; #endif +#ifdef ONIG_DEBUG_MATCH_COUNTER +#define MAX_SUBEXP_CALL_COUNTERS 9 + unsigned long subexp_call_counters[MAX_SUBEXP_CALL_COUNTERS]; +#endif Operation* p = reg->ops; OnigOptionType option = reg->options; @@ -2872,6 +2918,12 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, static unsigned int counter = 1; #endif +#ifdef ONIG_DEBUG_MATCH_COUNTER + for (i = 0; i < MAX_SUBEXP_CALL_COUNTERS; i++) { + subexp_call_counters[i] = 0; + } +#endif + #ifdef USE_DIRECT_THREADED_CODE if (IS_NULL(msa)) { for (i = 0; i < reg->ops_used; i++) { @@ -2903,12 +2955,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, STACK_INIT(INIT_MATCH_STACK_SIZE); UPDATE_FOR_STACK_REALLOC; for (i = 1; i <= num_mem; i++) { - mem_start_stk[i] = mem_end_stk[i] = INVALID_STACK_INDEX; + mem_start_stk[i].i = mem_end_stk[i].i = INVALID_STACK_INDEX; } #ifdef ONIG_DEBUG_MATCH - fprintf(DBGFP, "match_at: str: %p, end: %p, start: %p, sprev: %p\n", - str, end, sstart, sprev); + fprintf(DBGFP, "match_at: str: %p, end: %p, start: %p\n", str, end, sstart); fprintf(DBGFP, "size: %d, start offset: %d\n", (int )(end - str), (int )(sstart - str)); #endif @@ -2932,24 +2983,27 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (n > msa->best_len) { msa->best_len = n; msa->best_s = (UChar* )sstart; + goto set_region; } else goto end_best_len; } #endif best_len = n; + + set_region: region = msa->region; if (region) { if (keep > s) keep = s; -#ifdef USE_POSIX_API_REGION_OPTION +#ifdef USE_POSIX_API if (OPTON_POSIX_REGION(msa->options)) { posix_regmatch_t* rmt = (posix_regmatch_t* )region; rmt[0].rm_so = (regoff_t )(keep - str); rmt[0].rm_eo = (regoff_t )(s - str); for (i = 1; i <= num_mem; i++) { - if (mem_end_stk[i] != INVALID_STACK_INDEX) { + if (mem_end_stk[i].i != INVALID_STACK_INDEX) { rmt[i].rm_so = (regoff_t )(STACK_MEM_START(reg, i) - str); rmt[i].rm_eo = (regoff_t )(STACK_MEM_END(reg, i) - str); } @@ -2959,11 +3013,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } } else { -#endif /* USE_POSIX_API_REGION_OPTION */ +#endif /* USE_POSIX_API */ region->beg[0] = (int )(keep - str); region->end[0] = (int )(s - str); for (i = 1; i <= num_mem; i++) { - if (mem_end_stk[i] != INVALID_STACK_INDEX) { + if (mem_end_stk[i].i != INVALID_STACK_INDEX) { region->beg[i] = (int )(STACK_MEM_START(reg, i) - str); region->end[i] = (int )(STACK_MEM_END(reg, i) - str); } @@ -2996,7 +3050,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (r < 0) MATCH_AT_ERROR_RETURN(r); } #endif /* USE_CAPTURE_HISTORY */ -#ifdef USE_POSIX_API_REGION_OPTION +#ifdef USE_POSIX_API } /* else OPTON_POSIX_REGION() */ #endif } /* if (region) */ @@ -3012,8 +3066,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, best_len = ONIG_MISMATCH; goto fail; /* for retry */ } - if (OPTON_FIND_LONGEST(option) && DATA_ENSURE_CHECK1) { - goto fail; /* for retry */ + if (OPTON_FIND_LONGEST(option)) { + if (s >= in_right_range && msa->best_s == sstart) + best_len = msa->best_len; + else + goto fail; /* for retry */ } } @@ -3034,7 +3091,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*ps != *s) goto fail; ps++; s++; if (*ps != *s) goto fail; - sprev = s; s++; INC_OP; JUMP_OUT; @@ -3047,7 +3103,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*ps != *s) goto fail; ps++; s++; if (*ps != *s) goto fail; - sprev = s; s++; INC_OP; JUMP_OUT; @@ -3062,7 +3117,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*ps != *s) goto fail; ps++; s++; if (*ps != *s) goto fail; - sprev = s; s++; INC_OP; JUMP_OUT; @@ -3079,7 +3133,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*ps != *s) goto fail; ps++; s++; if (*ps != *s) goto fail; - sprev = s; s++; INC_OP; JUMP_OUT; @@ -3091,7 +3144,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, while (tlen-- > 0) { if (*ps++ != *s++) goto fail; } - sprev = s - 1; INC_OP; JUMP_OUT; @@ -3112,7 +3164,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, ps++; s++; if (*ps != *s) goto fail; ps++; s++; - sprev = s; if (*ps != *s) goto fail; ps++; s++; if (*ps != *s) goto fail; @@ -3131,7 +3182,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, ps++; s++; if (*ps != *s) goto fail; ps++; s++; - sprev = s; if (*ps != *s) goto fail; ps++; s++; if (*ps != *s) goto fail; @@ -3149,7 +3199,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*ps != *s) goto fail; ps++; s++; } - sprev = s - 2; INC_OP; JUMP_OUT; @@ -3165,7 +3214,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*ps != *s) goto fail; ps++; s++; } - sprev = s - 3; INC_OP; JUMP_OUT; @@ -3179,7 +3227,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*ps != *s) goto fail; ps++; s++; } - sprev = s - tlen; INC_OP; JUMP_OUT; @@ -3295,11 +3342,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, CASE_OP(ANYCHAR_STAR) INC_OP; while (DATA_ENSURE_CHECK1) { - STACK_PUSH_ALT(p, s, sprev); + STACK_PUSH_ALT(p, s); n = enclen(encode, s); DATA_ENSURE(n); if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail; - sprev = s; s += n; } JUMP_OUT; @@ -3307,15 +3353,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, CASE_OP(ANYCHAR_ML_STAR) INC_OP; while (DATA_ENSURE_CHECK1) { - STACK_PUSH_ALT(p, s, sprev); + STACK_PUSH_ALT(p, s); n = enclen(encode, s); if (n > 1) { DATA_ENSURE(n); - sprev = s; s += n; } else { - sprev = s; s++; } } @@ -3329,12 +3373,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; while (DATA_ENSURE_CHECK1) { if (c == *s) { - STACK_PUSH_ALT(p, s, sprev); + STACK_PUSH_ALT(p, s); } n = enclen(encode, s); DATA_ENSURE(n); if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail; - sprev = s; s += n; } } @@ -3348,16 +3391,14 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; while (DATA_ENSURE_CHECK1) { if (c == *s) { - STACK_PUSH_ALT(p, s, sprev); + STACK_PUSH_ALT(p, s); } n = enclen(encode, s); if (n > 1) { DATA_ENSURE(n); - sprev = s; s += n; } else { - sprev = s; s++; } } @@ -3410,14 +3451,17 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (! IS_MBC_WORD_ASCII_MODE(encode, s, end, mode)) goto fail; } - else if (ON_STR_END(s)) { - if (! IS_MBC_WORD_ASCII_MODE(encode, sprev, end, mode)) - goto fail; - } else { - if (IS_MBC_WORD_ASCII_MODE(encode, s, end, mode) - == IS_MBC_WORD_ASCII_MODE(encode, sprev, end, mode)) - goto fail; + UChar* sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s); + if (ON_STR_END(s)) { + if (! IS_MBC_WORD_ASCII_MODE(encode, sprev, end, mode)) + goto fail; + } + else { + if (IS_MBC_WORD_ASCII_MODE(encode, s, end, mode) + == IS_MBC_WORD_ASCII_MODE(encode, sprev, end, mode)) + goto fail; + } } } INC_OP; @@ -3432,14 +3476,17 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (DATA_ENSURE_CHECK1 && IS_MBC_WORD_ASCII_MODE(encode, s, end, mode)) goto fail; } - else if (ON_STR_END(s)) { - if (IS_MBC_WORD_ASCII_MODE(encode, sprev, end, mode)) - goto fail; - } else { - if (IS_MBC_WORD_ASCII_MODE(encode, s, end, mode) - != IS_MBC_WORD_ASCII_MODE(encode, sprev, end, mode)) - goto fail; + UChar* sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s); + if (ON_STR_END(s)) { + if (IS_MBC_WORD_ASCII_MODE(encode, sprev, end, mode)) + goto fail; + } + else { + if (IS_MBC_WORD_ASCII_MODE(encode, s, end, mode) + != IS_MBC_WORD_ASCII_MODE(encode, sprev, end, mode)) + goto fail; + } } } INC_OP; @@ -3452,7 +3499,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, mode = p->word_boundary.mode; if (DATA_ENSURE_CHECK1 && IS_MBC_WORD_ASCII_MODE(encode, s, end, mode)) { - if (ON_STR_BEGIN(s) || !IS_MBC_WORD_ASCII_MODE(encode, sprev, end, mode)) { + UChar* sprev; + if (ON_STR_BEGIN(s)) { + INC_OP; + JUMP_OUT; + } + sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s); + if (! IS_MBC_WORD_ASCII_MODE(encode, sprev, end, mode)) { INC_OP; JUMP_OUT; } @@ -3465,10 +3518,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, ModeType mode; mode = p->word_boundary.mode; - if (!ON_STR_BEGIN(s) && IS_MBC_WORD_ASCII_MODE(encode, sprev, end, mode)) { - if (ON_STR_END(s) || ! IS_MBC_WORD_ASCII_MODE(encode, s, end, mode)) { - INC_OP; - JUMP_OUT; + if (! ON_STR_BEGIN(s)) { + UChar* sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s); + if (IS_MBC_WORD_ASCII_MODE(encode, sprev, end, mode)) { + if (ON_STR_END(s) || ! IS_MBC_WORD_ASCII_MODE(encode, s, end, mode)) { + INC_OP; + JUMP_OUT; + } } } } @@ -3478,6 +3534,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, CASE_OP(TEXT_SEGMENT_BOUNDARY) { int is_break; + UChar* sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s); switch (p->text_segment_boundary.type) { case EXTENDED_GRAPHEME_CLUSTER_BOUNDARY: @@ -3507,12 +3564,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, CASE_OP(BEGIN_BUF) if (! ON_STR_BEGIN(s)) goto fail; + if (OPTON_NOTBOL(msa->options)) goto fail; + if (OPTON_NOT_BEGIN_STRING(msa->options)) goto fail; INC_OP; JUMP_OUT; CASE_OP(END_BUF) if (! ON_STR_END(s)) goto fail; + if (OPTON_NOTEOL(msa->options)) goto fail; + if (OPTON_NOT_END_STRING(msa->options)) goto fail; INC_OP; JUMP_OUT; @@ -3523,15 +3584,19 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, INC_OP; JUMP_OUT; } - else if (ONIGENC_IS_MBC_NEWLINE(encode, sprev, end) && !ON_STR_END(s)) { - INC_OP; - JUMP_OUT; + else if (! ON_STR_END(s)) { + UChar* sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s); + if (ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) { + INC_OP; + JUMP_OUT; + } } goto fail; CASE_OP(END_LINE) if (ON_STR_END(s)) { #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE + UChar* sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s); if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) { #endif if (OPTON_NOTEOL(msa->options)) goto fail; @@ -3556,9 +3621,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, CASE_OP(SEMI_END_BUF) if (ON_STR_END(s)) { #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE + UChar* sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s); if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) { #endif if (OPTON_NOTEOL(msa->options)) goto fail; + if (OPTON_NOT_END_STRING(msa->options)) goto fail; INC_OP; JUMP_OUT; #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE @@ -3567,6 +3634,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } else if (ONIGENC_IS_MBC_NEWLINE(encode, s, end) && ON_STR_END(s + enclen(encode, s))) { + if (OPTON_NOTEOL(msa->options)) goto fail; + if (OPTON_NOT_END_STRING(msa->options)) goto fail; INC_OP; JUMP_OUT; } @@ -3575,6 +3644,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, UChar* ss = s + enclen(encode, s); ss += enclen(encode, ss); if (ON_STR_END(ss)) { + if (OPTON_NOTEOL(msa->options)) goto fail; + if (OPTON_NOT_END_STRING(msa->options)) goto fail; INC_OP; JUMP_OUT; } @@ -3586,6 +3657,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, switch (p->check_position.type) { case CHECK_POSITION_SEARCH_START: if (s != msa->start) goto fail; + if (OPTON_NOT_BEGIN_POSITION(msa->options)) goto fail; break; case CHECK_POSITION_CURRENT_RIGHT_RANGE: if (s != right_range) goto fail; @@ -3604,7 +3676,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, CASE_OP(MEM_START) mem = p->memory_start.num; - mem_start_stk[mem] = (StackIndex )((void* )s); + mem_start_stk[mem].s = s; INC_OP; JUMP_OUT; @@ -3616,7 +3688,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, CASE_OP(MEM_END) mem = p->memory_end.num; - mem_end_stk[mem] = (StackIndex )((void* )s); + mem_end_stk[mem].s = s; INC_OP; JUMP_OUT; @@ -3629,20 +3701,20 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, STACK_GET_MEM_START(mem, stkp); /* should be before push mem-end. */ si = GET_STACK_INDEX(stkp); STACK_PUSH_MEM_END(mem, s); - mem_start_stk[mem] = si; + mem_start_stk[mem].i = si; INC_OP; JUMP_OUT; } CASE_OP(MEM_END_REC) mem = p->memory_end.num; - mem_end_stk[mem] = (StackIndex )((void* )s); + mem_end_stk[mem].s = s; STACK_GET_MEM_START(mem, stkp); if (MEM_STATUS_AT(reg->push_mem_start, mem)) - mem_start_stk[mem] = GET_STACK_INDEX(stkp); + mem_start_stk[mem].i = GET_STACK_INDEX(stkp); else - mem_start_stk[mem] = (StackIndex )((void* )stkp->u.mem.pstr); + mem_start_stk[mem].s = stkp->u.mem.pstr; STACK_PUSH_MEM_END_MARK(mem); INC_OP; @@ -3661,21 +3733,17 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, mem = p->backref_n.n1; backref: { - int len; UChar *pstart, *pend; - if (mem_end_stk[mem] == INVALID_STACK_INDEX) goto fail; - if (mem_start_stk[mem] == INVALID_STACK_INDEX) goto fail; + if (mem_end_stk[mem].i == INVALID_STACK_INDEX) goto fail; + if (mem_start_stk[mem].i == INVALID_STACK_INDEX) goto fail; pstart = STACK_MEM_START(reg, mem); pend = STACK_MEM_END(reg, mem); n = (int )(pend - pstart); if (n != 0) { DATA_ENSURE(n); - sprev = s; STRING_CMP(s, pstart, n); - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; } } INC_OP; @@ -3684,21 +3752,17 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, CASE_OP(BACKREF_N_IC) mem = p->backref_n.n1; { - int len; UChar *pstart, *pend; - if (mem_end_stk[mem] == INVALID_STACK_INDEX) goto fail; - if (mem_start_stk[mem] == INVALID_STACK_INDEX) goto fail; + if (mem_end_stk[mem].i == INVALID_STACK_INDEX) goto fail; + if (mem_start_stk[mem].i == INVALID_STACK_INDEX) goto fail; pstart = STACK_MEM_START(reg, mem); pend = STACK_MEM_END(reg, mem); n = (int )(pend - pstart); if (n != 0) { DATA_ENSURE(n); - sprev = s; STRING_CMP_IC(case_fold_flag, pstart, &s, n); - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; } } INC_OP; @@ -3706,28 +3770,25 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, CASE_OP(BACKREF_MULTI) { - int len, is_fail; + int is_fail; UChar *pstart, *pend, *swork; tlen = p->backref_general.num; for (i = 0; i < tlen; i++) { mem = tlen == 1 ? p->backref_general.n1 : p->backref_general.ns[i]; - if (mem_end_stk[mem] == INVALID_STACK_INDEX) continue; - if (mem_start_stk[mem] == INVALID_STACK_INDEX) continue; + if (mem_end_stk[mem].i == INVALID_STACK_INDEX) continue; + if (mem_start_stk[mem].i == INVALID_STACK_INDEX) continue; pstart = STACK_MEM_START(reg, mem); pend = STACK_MEM_END(reg, mem); n = (int )(pend - pstart); if (n != 0) { DATA_ENSURE(n); - sprev = s; swork = s; STRING_CMP_VALUE(swork, pstart, n, is_fail); if (is_fail) continue; s = swork; - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; } break; /* success */ } @@ -3738,28 +3799,25 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, CASE_OP(BACKREF_MULTI_IC) { - int len, is_fail; + int is_fail; UChar *pstart, *pend, *swork; tlen = p->backref_general.num; for (i = 0; i < tlen; i++) { mem = tlen == 1 ? p->backref_general.n1 : p->backref_general.ns[i]; - if (mem_end_stk[mem] == INVALID_STACK_INDEX) continue; - if (mem_start_stk[mem] == INVALID_STACK_INDEX) continue; + if (mem_end_stk[mem].i == INVALID_STACK_INDEX) continue; + if (mem_start_stk[mem].i == INVALID_STACK_INDEX) continue; pstart = STACK_MEM_START(reg, mem); pend = STACK_MEM_END(reg, mem); n = (int )(pend - pstart); if (n != 0) { DATA_ENSURE(n); - sprev = s; swork = s; STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail); if (is_fail) continue; s = swork; - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; } break; /* success */ } @@ -3774,10 +3832,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, goto backref_with_level; CASE_OP(BACKREF_WITH_LEVEL) { - int len; int level; MemNumType* mems; - UChar* ssave; n = 0; backref_with_level: @@ -3785,17 +3841,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, tlen = p->backref_general.num; mems = tlen == 1 ? &(p->backref_general.n1) : p->backref_general.ns; - ssave = s; - if (backref_match_at_nested_level(reg, stk, stk_base, n, - case_fold_flag, level, (int )tlen, mems, &s, end)) { - if (ssave != s) { - sprev = ssave; - while (sprev + (len = enclen(encode, sprev)) < s) - sprev += len; - } - } - else + if (! backref_match_at_nested_level(reg, stk, stk_base, n, + case_fold_flag, level, (int )tlen, mems, &s, end)) { goto fail; + } } INC_OP; JUMP_OUT; @@ -3810,8 +3859,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, for (i = 0; i < tlen; i++) { mem = mems[i]; - if (mem_end_stk[mem] == INVALID_STACK_INDEX) continue; - if (mem_start_stk[mem] == INVALID_STACK_INDEX) continue; + if (mem_end_stk[mem].i == INVALID_STACK_INDEX) continue; + if (mem_start_stk[mem].i == INVALID_STACK_INDEX) continue; break; /* success */ } if (i == tlen) goto fail; @@ -3928,13 +3977,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, CASE_OP(PUSH) addr = p->push.addr; - STACK_PUSH_ALT(p + addr, s, sprev); + STACK_PUSH_ALT(p + addr, s); INC_OP; JUMP_OUT; CASE_OP(PUSH_SUPER) addr = p->push.addr; - STACK_PUSH_SUPER_ALT(p + addr, s, sprev); + STACK_PUSH_SUPER_ALT(p + addr, s); INC_OP; JUMP_OUT; @@ -3956,7 +4005,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, addr = p->push_or_jump_exact1.addr; c = p->push_or_jump_exact1.c; if (DATA_ENSURE_CHECK1 && c == *s) { - STACK_PUSH_ALT(p + addr, s, sprev); + STACK_PUSH_ALT(p + addr, s); INC_OP; JUMP_OUT; } @@ -3972,9 +4021,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, addr = p->push_if_peek_next.addr; c = p->push_if_peek_next.c; if (DATA_ENSURE_CHECK1 && c == *s) { - STACK_PUSH_ALT(p + addr, s, sprev); - INC_OP; - JUMP_OUT; + STACK_PUSH_ALT(p + addr, s); } } INC_OP; @@ -3986,7 +4033,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, STACK_PUSH_REPEAT_INC(mem, 0); if (reg->repeat_range[mem].lower == 0) { - STACK_PUSH_ALT(p + addr, s, sprev); + STACK_PUSH_ALT(p + addr, s); } INC_OP; JUMP_OUT; @@ -3997,7 +4044,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, STACK_PUSH_REPEAT_INC(mem, 0); if (reg->repeat_range[mem].lower == 0) { - STACK_PUSH_ALT(p + 1, s, sprev); + STACK_PUSH_ALT(p + 1, s); p += addr; } else @@ -4014,7 +4061,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } else if (n >= reg->repeat_range[mem].lower) { INC_OP; - STACK_PUSH_ALT(p, s, sprev); + STACK_PUSH_ALT(p, s); p = reg->repeat_range[mem].u.pcode; } else { @@ -4033,7 +4080,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } else { if (n >= reg->repeat_range[mem].lower) { - STACK_PUSH_ALT(reg->repeat_range[mem].u.pcode, s, sprev); + STACK_PUSH_ALT(reg->repeat_range[mem].u.pcode, s); INC_OP; } else { @@ -4047,6 +4094,21 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (subexp_call_nest_counter == SubexpCallMaxNestLevel) goto fail; subexp_call_nest_counter++; + + if (SubexpCallLimitInSearch != 0) { + msa->subexp_call_in_search_counter++; +#ifdef ONIG_DEBUG_MATCH_COUNTER + if (p->call.called_mem < MAX_SUBEXP_CALL_COUNTERS) + subexp_call_counters[p->call.called_mem]++; + if (msa->subexp_call_in_search_counter % 1000 == 0) + MATCH_COUNTER_OUT("CALL"); +#endif + if (msa->subexp_call_in_search_counter > + SubexpCallLimitInSearch) { + MATCH_AT_ERROR_RETURN(ONIGERR_SUBEXP_CALL_LIMIT_IN_SEARCH_OVER); + } + } + addr = p->call.addr; INC_OP; STACK_PUSH_CALL_FRAME(p); p = reg->ops + addr; @@ -4070,7 +4132,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, for (tlen = p->move.n; tlen > 0; tlen--) { len = enclen(encode, s); - sprev = s; s += len; if (s > end) goto fail; if (s == end) { @@ -4079,7 +4140,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } } } - sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s); INC_OP; JUMP_OUT; @@ -4088,10 +4148,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (tlen != 0) { s = (UChar* )ONIGENC_STEP_BACK(encode, str, s, (int )tlen); if (IS_NULL(s)) goto fail; - sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s); } if (p->step_back_start.remaining != 0) { - STACK_PUSH_ALT_WITH_ZID(p + 1, s, sprev, p->step_back_start.remaining); + STACK_PUSH_ALT_WITH_ZID(p + 1, s, p->step_back_start.remaining); p += p->step_back_start.addr; } else @@ -4103,9 +4162,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (tlen != INFINITE_LEN) tlen--; s = (UChar* )ONIGENC_STEP_BACK(encode, str, s, 1); if (IS_NULL(s)) goto fail; - sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s); if (tlen != 0) { - STACK_PUSH_ALT_WITH_ZID(p, s, sprev, (int )tlen); + STACK_PUSH_ALT_WITH_ZID(p, s, (int )tlen); } INC_OP; JUMP_OUT; @@ -4114,8 +4172,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, mem = p->cut_to_mark.id; /* mem: mark id */ STACK_TO_VOID_TO_MARK(stkp, mem); if (p->cut_to_mark.restore_pos != 0) { - s = stkp->u.val.v; - sprev = stkp->u.val.v2; + s = stkp->u.val.v; } INC_OP; JUMP_OUT; @@ -4123,7 +4180,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, CASE_OP(MARK) mem = p->mark.id; /* mem: mark id */ if (p->mark.save_pos != 0) - STACK_PUSH_MARK_WITH_POS(mem, s, sprev); + STACK_PUSH_MARK_WITH_POS(mem, s); else STACK_PUSH_MARK(mem); @@ -4275,9 +4332,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, fail: #endif STACK_POP; - p = stk->u.state.pcode; - s = stk->u.state.pstr; - sprev = stk->u.state.pstr_prev; + p = stk->u.state.pcode; + s = stk->u.state.pstr; CHECK_RETRY_LIMIT_IN_MATCH; JUMP_OUT; @@ -4290,6 +4346,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (msa->retry_limit_in_search != 0) { msa->retry_limit_in_search_counter += retry_in_match_counter; } + +#ifdef ONIG_DEBUG_MATCH_COUNTER + MATCH_COUNTER_OUT("END"); +#endif + STACK_SAVE(msa, is_alloca, alloc_base); return best_len; } @@ -4324,12 +4385,11 @@ typedef struct { int state; /* value of enum SearchRangeStatus */ UChar* low; UChar* high; - UChar* low_prev; UChar* sch_range; } SearchRange; #define REGSET_MATCH_AND_RETURN_CHECK(upper_range) \ - r = match_at(reg, str, end, (upper_range), s, prev, msas + i); \ + r = match_at(reg, str, end, (upper_range), s, msas + i); \ if (r != ONIG_MISMATCH) {\ if (r >= 0) {\ goto match;\ @@ -4345,8 +4405,8 @@ regset_search_body_position_lead(OnigRegSet* set, OnigOptionType option, MatchArg* msas, int* rmatch_pos) { int r, n, i; - UChar *s, *prev; - UChar *low, *high, *low_prev; + UChar *s; + UChar *low, *high; UChar* sch_range; regex_t* reg; OnigEncoding enc; @@ -4354,12 +4414,7 @@ regset_search_body_position_lead(OnigRegSet* set, n = set->n; enc = set->enc; - s = (UChar* )start; - if (s > str) - prev = onigenc_get_prev_char_head(enc, str, s); - else - prev = (UChar* )NULL; sr = (SearchRange* )xmalloc(sizeof(*sr) * n); CHECK_NULL_RETURN_MEMERR(sr); @@ -4375,18 +4430,16 @@ regset_search_body_position_lead(OnigRegSet* set, else sch_range = (UChar* )end; - if (forward_search(reg, str, end, s, sch_range, &low, &high, &low_prev)) { + if (forward_search(reg, str, end, s, sch_range, &low, &high)) { sr[i].state = SRS_LOW_HIGH; sr[i].low = low; sr[i].high = high; - sr[i].low_prev = low_prev; sr[i].sch_range = sch_range; } } else { sch_range = (UChar* )end; - if (forward_search(reg, str, end, s, sch_range, - &low, &high, (UChar** )NULL)) { + if (forward_search(reg, str, end, s, sch_range, &low, &high)) { goto total_active; } } @@ -4396,7 +4449,6 @@ regset_search_body_position_lead(OnigRegSet* set, sr[i].state = SRS_ALL_RANGE; sr[i].low = s; sr[i].high = (UChar* )range; - sr[i].low_prev = prev; } } @@ -4412,10 +4464,9 @@ regset_search_body_position_lead(OnigRegSet* set, if (s < sr[i].low) continue; if (s >= sr[i].high) { if (forward_search(set->rs[i].reg, str, end, s, sr[i].sch_range, - &low, &high, &low_prev) != 0) { + &low, &high) != 0) { sr[i].low = low; sr[i].high = high; - sr[i].low_prev = low_prev; if (s < low) continue; } else { @@ -4436,16 +4487,13 @@ regset_search_body_position_lead(OnigRegSet* set, for (i = 0; i < n; i++) { if (sr[i].state == SRS_LOW_HIGH && low > sr[i].low) { low = sr[i].low; - low_prev = sr[i].low_prev; } } if (low == range) break; s = low; - prev = low_prev; } else { - prev = s; s += enclen(enc, s); } } while (1); @@ -4459,10 +4507,9 @@ regset_search_body_position_lead(OnigRegSet* set, if (s < sr[i].low) continue; if (s >= sr[i].high) { if (forward_search(set->rs[i].reg, str, end, s, sr[i].sch_range, - &low, &high, &low_prev) != 0) { + &low, &high) != 0) { sr[i].low = low; sr[i].high = high; - /* sr[i].low_prev = low_prev; */ if (s < low) continue; } else { @@ -4483,7 +4530,6 @@ regset_search_body_position_lead(OnigRegSet* set, if (set->anychar_inf != 0) prev_is_newline = ONIGENC_IS_MBC_NEWLINE(set->enc, s, end); - prev = s; s += enclen(enc, s); } while (1); } @@ -4552,7 +4598,7 @@ onig_regset_search_with_param(OnigRegSet* set, { int r; int i; - UChar *s, *prev; + UChar *s; regex_t* reg; OnigEncoding enc; OnigRegion* region; @@ -4654,7 +4700,6 @@ onig_regset_search_with_param(OnigRegSet* set, else if (str == end) { /* empty string */ start = end = str; s = (UChar* )start; - prev = (UChar* )NULL; msas = (MatchArg* )xmalloc(sizeof(*msas) * set->n); CHECK_NULL_RETURN_MEMERR(msas); @@ -4669,7 +4714,7 @@ onig_regset_search_with_param(OnigRegSet* set, /* Can't use REGSET_MATCH_AND_RETURN_CHECK() because r must be set regex index (i) */ - r = match_at(reg, str, end, end, s, prev, msas + i); + r = match_at(reg, str, end, end, s, msas + i); if (r != ONIG_MISMATCH) { if (r >= 0) { r = i; @@ -4814,7 +4859,7 @@ slow_search_backward(OnigEncoding enc, UChar* target, UChar* target_end, else s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, adjust_text, s); - while (s >= text) { + while (PTR_GE(s, text)) { if (*s == *target) { p = s + 1; t = target + 1; @@ -4855,7 +4900,7 @@ sunday_quick_search_step_forward(regex_t* reg, tail = target_end - 1; tlen1 = (int )(tail - target); end = text_range; - if (end + tlen1 > text_end) + if (tlen1 > text_end - end) end = text_end - tlen1; map_offset = reg->map_offset; @@ -4893,15 +4938,38 @@ sunday_quick_search(regex_t* reg, const UChar* target, const UChar* target_end, const UChar *s, *t, *p, *end; const UChar *tail; int map_offset; - - end = text_range + (target_end - target); - if (end > text_end) - end = text_end; + ptrdiff_t target_len; map_offset = reg->map_offset; tail = target_end - 1; - s = text + (tail - target); + target_len = target_end - target; + if (target_len > text_end - text_range) { + end = text_end; + if (target_len > text_end - text) + return (UChar* )NULL; + } + else { + end = text_range + target_len; + } + + s = text + target_len - 1; + +#ifdef USE_STRICT_POINTER_ADDRESS + if (s < end) { + while (TRUE) { + p = s; + t = tail; + while (*p == *t) { + if (t == target) return (UChar* )p; + p--; t--; + } + if (text_end - s <= map_offset) break; + if (reg->map[*(s + map_offset)] >= end - s) break; + s += reg->map[*(s + map_offset)]; + } + } +#else while (s < end) { p = s; t = tail; @@ -4909,9 +4977,10 @@ sunday_quick_search(regex_t* reg, const UChar* target, const UChar* target_end, if (t == target) return (UChar* )p; p--; t--; } - if (s + map_offset >= text_end) break; + if (text_end - s <= map_offset) break; s += reg->map[*(s + map_offset)]; } +#endif return (UChar* )NULL; } @@ -4937,7 +5006,7 @@ map_search_backward(OnigEncoding enc, UChar map[], { const UChar *s = text_start; - while (s >= text) { + while (PTR_GE(s, text)) { if (map[*s]) return (UChar* )s; s = onigenc_get_prev_char_head(enc, adjust_text, s); @@ -4963,13 +5032,16 @@ onig_match_with_param(regex_t* reg, const UChar* str, const UChar* end, OnigMatchParam* mp) { int r; - UChar *prev; MatchArg msa; +#ifndef USE_POSIX_API + if (OPTON_POSIX_REGION(option)) return ONIGERR_INVALID_ARGUMENT; +#endif + ADJUST_MATCH_PARAM(reg, mp); MATCH_ARG_INIT(msa, reg, option, region, at, mp); if (region -#ifdef USE_POSIX_API_REGION_OPTION +#ifdef USE_POSIX_API && !OPTON_POSIX_REGION(option) #endif ) { @@ -4986,8 +5058,14 @@ onig_match_with_param(regex_t* reg, const UChar* str, const UChar* end, } } - prev = (UChar* )onigenc_get_prev_char_head(reg->enc, str, at); - r = match_at(reg, str, end, end, at, prev, &msa); + r = match_at(reg, str, end, end, at, &msa); +#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE + if (OPTON_FIND_LONGEST(option) && r == ONIG_MISMATCH) { + if (msa.best_len >= 0) { + r = msa.best_len; + } + } +#endif } end: @@ -4997,7 +5075,7 @@ onig_match_with_param(regex_t* reg, const UChar* str, const UChar* end, static int forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, - UChar* range, UChar** low, UChar** high, UChar** low_prev) + UChar* range, UChar** low, UChar** high) { UChar *p, *pprev = (UChar* )NULL; @@ -5081,33 +5159,18 @@ forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, } if (reg->dist_max == 0) { - *low = p; - if (low_prev) { - if (*low > start) - *low_prev = onigenc_get_prev_char_head(reg->enc, start, p); - else - *low_prev = onigenc_get_prev_char_head(reg->enc, - (pprev ? pprev : str), p); - } + *low = p; *high = p; } else { if (reg->dist_max != INFINITE_LEN) { if (p - str < reg->dist_max) { *low = (UChar* )str; - if (low_prev) - *low_prev = onigenc_get_prev_char_head(reg->enc, str, *low); } else { *low = p - reg->dist_max; if (*low > start) { - *low = onigenc_get_right_adjust_char_head_with_prev(reg->enc, start, - *low, (const UChar** )low_prev); - } - else { - if (low_prev) - *low_prev = onigenc_get_prev_char_head(reg->enc, - (pprev ? pprev : str), *low); + *low = onigenc_get_right_adjust_char_head(reg->enc, start, *low); } } } @@ -5263,7 +5326,7 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, OnigOptionType option, OnigMatchParam* mp) { int r; - UChar *s, *prev; + UChar *s; MatchArg msa; const UChar *orig_start = start; @@ -5275,8 +5338,15 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, ADJUST_MATCH_PARAM(reg, mp); +#ifndef USE_POSIX_API + if (OPTON_POSIX_REGION(option)) { + r = ONIGERR_INVALID_ARGUMENT; + goto finish_no_msa; + } +#endif + if (region -#ifdef USE_POSIX_API_REGION_OPTION +#ifdef USE_POSIX_API && ! OPTON_POSIX_REGION(option) #endif ) { @@ -5294,27 +5364,14 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, } -#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE #define MATCH_AND_RETURN_CHECK(upper_range) \ - r = match_at(reg, str, end, (upper_range), s, prev, &msa); \ - if (r != ONIG_MISMATCH) {\ - if (r >= 0) {\ - if (! OPTON_FIND_LONGEST(reg->options)) {\ - goto match;\ - }\ - }\ - else goto finish; /* error */ \ - } -#else -#define MATCH_AND_RETURN_CHECK(upper_range) \ - r = match_at(reg, str, end, (upper_range), s, prev, &msa); \ + r = match_at(reg, str, end, (upper_range), s, &msa);\ if (r != ONIG_MISMATCH) {\ if (r >= 0) {\ goto match;\ }\ else goto finish; /* error */ \ } -#endif /* USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE */ /* anchor optimize: resume search range */ @@ -5422,7 +5479,6 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, if (reg->threshold_len == 0) { start = end = str = address_for_empty_string; s = (UChar* )start; - prev = (UChar* )NULL; MATCH_ARG_INIT(msa, reg, option, region, start, mp); MATCH_AND_RETURN_CHECK(end); @@ -5440,13 +5496,8 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, s = (UChar* )start; if (range > start) { /* forward search */ - if (s > str) - prev = onigenc_get_prev_char_head(reg->enc, str, s); - else - prev = (UChar* )NULL; - if (reg->optimize != OPTIMIZE_NONE) { - UChar *sch_range, *low, *high, *low_prev; + UChar *sch_range, *low, *high; if (reg->dist_max != 0) { if (reg->dist_max == INFINITE_LEN) @@ -5467,27 +5518,27 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, if (reg->dist_max != INFINITE_LEN) { do { - if (! forward_search(reg, str, end, s, sch_range, &low, &high, - &low_prev)) goto mismatch; + if (! forward_search(reg, str, end, s, sch_range, &low, &high)) + goto mismatch; if (s < low) { s = low; - prev = low_prev; } while (s <= high) { MATCH_AND_RETURN_CHECK(data_range); - prev = s; s += enclen(reg->enc, s); } } while (s < range); goto mismatch; } else { /* check only. */ - if (! forward_search(reg, str, end, s, sch_range, &low, &high, - (UChar** )NULL)) goto mismatch; + if (! forward_search(reg, str, end, s, sch_range, &low, &high)) + goto mismatch; if ((reg->anchor & ANCR_ANYCHAR_INF) != 0 && (reg->anchor & (ANCR_LOOK_BEHIND | ANCR_PREC_READ_NOT)) == 0) { do { + UChar* prev; + MATCH_AND_RETURN_CHECK(data_range); prev = s; s += enclen(reg->enc, s); @@ -5504,7 +5555,6 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, do { MATCH_AND_RETURN_CHECK(data_range); - prev = s; s += enclen(reg->enc, s); } while (s < range); @@ -5549,12 +5599,11 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, if (s > high) s = high; - while (s >= low) { - prev = onigenc_get_prev_char_head(reg->enc, str, s); + while (PTR_GE(s, low)) { MATCH_AND_RETURN_CHECK(orig_start); - s = prev; + s = onigenc_get_prev_char_head(reg->enc, str, s); } - } while (s >= range); + } while (PTR_GE(s, range)); goto mismatch; } else { /* check only. */ @@ -5566,10 +5615,9 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, } do { - prev = onigenc_get_prev_char_head(reg->enc, str, s); MATCH_AND_RETURN_CHECK(orig_start); - s = prev; - } while (s >= range); + s = onigenc_get_prev_char_head(reg->enc, str, s); + } while (PTR_GE(s, range)); } mismatch: @@ -5589,7 +5637,7 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end, /* If result is mismatch and no FIND_NOT_EMPTY option, then the region is not set in match_at(). */ if (OPTON_FIND_NOT_EMPTY(reg->options) && region -#ifdef USE_POSIX_API_REGION_OPTION +#ifdef USE_POSIX_API && !OPTON_POSIX_REGION(option) #endif ) { @@ -5952,7 +6000,7 @@ extern int onig_init_for_match_at(regex_t* reg) { return match_at(reg, (const UChar* )NULL, (const UChar* )NULL, - (const UChar* )NULL, (const UChar* )NULL, (UChar* )NULL, + (const UChar* )NULL, (const UChar* )NULL, (MatchArg* )NULL); } #endif @@ -6139,8 +6187,8 @@ onig_get_capture_range_in_callout(OnigCalloutArgs* a, int mem_num, int* begin, i const UChar* str; StackType* stk_base; int i; - StackIndex* mem_start_stk; - StackIndex* mem_end_stk; + StkPtrType* mem_start_stk; + StkPtrType* mem_end_stk; i = mem_num; reg = a->regex; @@ -6150,7 +6198,7 @@ onig_get_capture_range_in_callout(OnigCalloutArgs* a, int mem_num, int* begin, i mem_end_stk = a->mem_end_stk; if (i > 0) { - if (a->mem_end_stk[i] != INVALID_STACK_INDEX) { + if (a->mem_end_stk[i].i != INVALID_STACK_INDEX) { *begin = (int )(STACK_MEM_START(reg, i) - str); *end = (int )(STACK_MEM_END(reg, i) - str); } diff --git a/src/regint.h b/src/regint.h index 04ebe0a..74a5c61 100644 --- a/src/regint.h +++ b/src/regint.h @@ -34,6 +34,7 @@ /* #define ONIG_DEBUG_COMPILE */ /* #define ONIG_DEBUG_SEARCH */ /* #define ONIG_DEBUG_MATCH */ +/* #define ONIG_DEBUG_MATCH_COUNTER */ /* #define ONIG_DONT_OPTIMIZE */ /* for byte-code statistical data. */ @@ -41,7 +42,7 @@ #if defined(ONIG_DEBUG_PARSE) || defined(ONIG_DEBUG_MATCH) || \ defined(ONIG_DEBUG_SEARCH) || defined(ONIG_DEBUG_COMPILE) || \ - defined(ONIG_DEBUG_STATISTICS) + defined(ONIG_DEBUG_MATCH_COUNTER) || defined(ONIG_DEBUG_STATISTICS) #ifndef ONIG_DEBUG #define ONIG_DEBUG #define DBGFP stderr @@ -70,23 +71,29 @@ #endif /* internal config */ +#define USE_CHECK_VALIDITY_OF_STRING_IN_TREE #define USE_OP_PUSH_OR_JUMP_EXACT #define USE_QUANT_PEEK_NEXT #define USE_ST_LIBRARY #define USE_TIMEOFDAY +#define USE_STRICT_POINTER_ADDRESS +#define USE_STRICT_POINTER_COMPARISON #define USE_WORD_BEGIN_END /* "\<", "\>" */ #define USE_CAPTURE_HISTORY #define USE_VARIABLE_META_CHARS -#define USE_POSIX_API_REGION_OPTION #define USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE /* #define USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR */ +/* enabled by configure --enable-posix-api=yes */ +/* #define USE_POSIX_API */ + +#define DEFAULT_PARSE_DEPTH_LIMIT 4096 #define INIT_MATCH_STACK_SIZE 160 #define DEFAULT_MATCH_STACK_LIMIT_SIZE 0 /* unlimited */ #define DEFAULT_RETRY_LIMIT_IN_MATCH 10000000 #define DEFAULT_RETRY_LIMIT_IN_SEARCH 0 /* unlimited */ -#define DEFAULT_PARSE_DEPTH_LIMIT 4096 +#define DEFAULT_SUBEXP_CALL_LIMIT_IN_SEARCH 0 /* unlimited */ #define DEFAULT_SUBEXP_CALL_MAX_NEST_LEVEL 20 @@ -181,6 +188,12 @@ #define CHECK_NULL_RETURN_MEMERR(p) if (IS_NULL(p)) return ONIGERR_MEMORY #define NULL_UCHARP ((UChar* )0) +#ifdef USE_STRICT_POINTER_COMPARISON +#define PTR_GE(p,q) ((p) != NULL && (p) >= (q)) +#else +#define PTR_GE(p,q) (p) >= (q) +#endif + #ifndef ONIG_INT_MAX #define ONIG_INT_MAX INT_MAX #endif @@ -255,11 +268,22 @@ #ifdef _WIN32 -#if defined(_MSC_VER) && (_MSC_VER < 1300) +#ifdef _MSC_VER + +#if _MSC_VER < 1300 typedef int intptr_t; typedef unsigned int uintptr_t; #endif + +#if _MSC_VER < 1600 +typedef __int32 int32_t; +typedef unsigned __int32 uint32_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +#endif + #endif +#endif /* _WIN32 */ #if SIZEOF_VOIDP == SIZEOF_LONG typedef unsigned long hash_data_type; @@ -378,6 +402,9 @@ typedef unsigned int MemStatusType; #define OPTON_POSIX_REGION(option) ((option) & ONIG_OPTION_POSIX_REGION) #define OPTON_CHECK_VALIDITY_OF_STRING(option) ((option) & \ ONIG_OPTION_CHECK_VALIDITY_OF_STRING) +#define OPTON_NOT_BEGIN_STRING(option) ((option) & ONIG_OPTION_NOT_BEGIN_STRING) +#define OPTON_NOT_END_STRING(option) ((option) & ONIG_OPTION_NOT_END_STRING) +#define OPTON_NOT_BEGIN_POSITION(option) ((option) & ONIG_OPTION_NOT_BEGIN_POSITION) #define DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag) \ ((case_fold_flag) & ~INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) @@ -562,10 +589,14 @@ enum OpCode { OP_BACKREF_N_IC, OP_BACKREF_MULTI, OP_BACKREF_MULTI_IC, +#ifdef USE_BACKREF_WITH_LEVEL OP_BACKREF_WITH_LEVEL, /* \k<xxx+n>, \k<xxx-n> */ OP_BACKREF_WITH_LEVEL_IC, /* \k<xxx+n>, \k<xxx-n> */ +#endif OP_BACKREF_CHECK, /* (?(n)), (?('name')) */ +#ifdef USE_BACKREF_WITH_LEVEL OP_BACKREF_CHECK_WITH_LEVEL, /* (?(n-level)), (?('name-level')) */ +#endif OP_MEM_START, OP_MEM_START_PUSH, /* push back-tracker to stack */ OP_MEM_END_PUSH, /* push back-tracker to stack */ @@ -891,6 +922,9 @@ typedef struct { } update_var; struct { AbsAddrType addr; +#ifdef ONIG_DEBUG_MATCH_COUNTER + MemNumType called_mem; +#endif } call; #ifdef USE_CALLOUT struct { diff --git a/src/regparse.c b/src/regparse.c index cc015a7..dd2824b 100644 --- a/src/regparse.c +++ b/src/regparse.c @@ -290,7 +290,7 @@ bbuf_clone(BBuf** rto, BBuf* from) CHECK_NULL_RETURN_MEMERR(to); r = BB_INIT(to, from->alloc); if (r != 0) { - xfree(to->p); + bbuf_free(to); *rto = 0; return r; } @@ -303,6 +303,8 @@ static int backref_rel_to_abs(int rel_no, ScanEnv* env) { if (rel_no > 0) { + if (rel_no > ONIG_INT_MAX - env->num_mem) + return ONIGERR_INVALID_BACKREF; return env->num_mem + rel_no; } else { @@ -437,6 +439,7 @@ strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end int capa) { UChar* r; + ptrdiff_t dest_delta = dest_end - dest; if (dest) r = (UChar* )xrealloc(dest, capa + 1); @@ -444,7 +447,7 @@ strcat_capa(UChar* dest, UChar* dest_end, const UChar* src, const UChar* src_end r = (UChar* )xmalloc(capa + 1); CHECK_NULL_RETURN(r); - onig_strcpy(r + (dest_end - dest), src, src_end); + onig_strcpy(r + dest_delta, src, src_end); return r; } @@ -1294,7 +1297,9 @@ static int i_free_callout_name_entry(st_callout_name_key* key, CalloutNameEntry* e, void* arg ARG_UNUSED) { - xfree(e->name); + if (IS_NOT_NULL(e)) { + xfree(e->name); + } /*xfree(key->s); */ /* is same as e->name */ xfree(key); xfree(e); @@ -2502,7 +2507,7 @@ node_new_call(UChar* name, UChar* name_end, int gnum, int by_number) CALL_(node)->by_number = by_number; CALL_(node)->name = name; CALL_(node)->name_end = name_end; - CALL_(node)->group_num = gnum; + CALL_(node)->called_gnum = gnum; CALL_(node)->entry_count = 1; return node; } @@ -3135,7 +3140,6 @@ make_absent_tree_for_simple_one_char_repeat(Node** node, Node* absent, Node* qua lower = QUANT_(quant)->lower; upper = QUANT_(quant)->upper; - onig_node_free(quant); r = node_new_save_gimmick(&ns[0], SAVE_RIGHT_RANGE, env); if (r != 0) goto err; @@ -3202,9 +3206,9 @@ make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter, simple: r = make_absent_tree_for_simple_one_char_repeat(node, absent, quant, body, possessive, env); + onig_node_free(quant); if (r != 0) { ns[4] = NULL_NODE; - onig_node_free(quant); onig_node_free(body); goto err; } @@ -3708,21 +3712,24 @@ get_next_code_point(UChar** src, UChar* end, int base, OnigEncoding enc, int in_ while (! PEND) { PFETCH(c); - if (! IS_CODE_POINT_DIVIDE(c)) break; - } - if (IS_CODE_POINT_DIVIDE(c)) - return ONIGERR_INVALID_CODE_POINT_VALUE; - - if (c == '}') { - *src = p; - return 1; /* end of sequence */ - } - else if (c == '-' && in_cc == TRUE) { - *src = p; - return 2; /* range */ + if (! IS_CODE_POINT_DIVIDE(c)) { + if (c == '}') { + *src = p; + return 1; /* end of sequence */ + } + else if (c == '-' && in_cc == TRUE) { + *src = p; + return 2; /* range */ + } + PUNFETCH; + break; + } + else { + if (PEND) + return ONIGERR_INVALID_CODE_POINT_VALUE; + } } - PUNFETCH; r = scan_number_of_base(&p, end, 1, enc, rcode, base); if (r != 0) return r; @@ -3873,13 +3880,17 @@ not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf) to = data[i*2+1]; if (pre <= from - 1) { r = add_code_range_to_buf(pbuf, pre, from - 1); - if (r != 0) return r; + if (r != 0) { + bbuf_free(*pbuf); + return r; + } } if (to == ~((OnigCodePoint )0)) break; pre = to + 1; } if (to < ~((OnigCodePoint )0)) { r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0)); + if (r != 0) bbuf_free(*pbuf); } return r; } @@ -4564,7 +4575,7 @@ fetch_interval(UChar** src, UChar* end, PToken* tok, ScanEnv* env) /* \M-, \C-, \c, or \... */ static int -fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val) +fetch_escaped_value_raw(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val) { int v; OnigCodePoint c; @@ -4583,7 +4594,7 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val) if (PEND) return ONIGERR_END_PATTERN_AT_META; PFETCH_S(c); if (c == MC_ESC(env->syntax)) { - v = fetch_escaped_value(&p, end, env, &c); + v = fetch_escaped_value_raw(&p, end, env, &c); if (v < 0) return v; } c = ((c & 0xff) | 0x80); @@ -4612,7 +4623,7 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val) } else { if (c == MC_ESC(env->syntax)) { - v = fetch_escaped_value(&p, end, env, &c); + v = fetch_escaped_value_raw(&p, end, env, &c); if (v < 0) return v; } c &= 0x9f; @@ -4634,6 +4645,21 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val) return 0; } +static int +fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val) +{ + int r; + int len; + + r = fetch_escaped_value_raw(src, end, env, val); + if (r != 0) return r; + + len = ONIGENC_CODE_TO_MBCLEN(env->enc, *val); + if (len < 0) return len; + + return 0; +} + static int fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env); static OnigCodePoint @@ -5192,7 +5218,7 @@ fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state) else { int curr_state; - curr_state = (state == CS_RANGE) ? CPS_EMPTY : CPS_START; + curr_state = (state == CS_RANGE) ? CPS_EMPTY : CPS_START; r = check_code_point_sequence_cc(p, end, tok->base_num, enc, curr_state); if (r < 0) return r; @@ -6372,7 +6398,7 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) } static int -parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) +prs_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) { #define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20 #define POSIX_BRACKET_NAME_MIN_LEN 4 @@ -6481,7 +6507,7 @@ fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) } static int -parse_char_property(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) +prs_char_property(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) { int r, ctype; CClassNode* cc; @@ -6617,7 +6643,7 @@ code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped, } static int -parse_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) +prs_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) { int r, neg, len, fetched, and_start; OnigCodePoint in_code, curr_code; @@ -6715,6 +6741,7 @@ parse_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) p = psave; for (i = 1; i < len; i++) { r = fetch_token_cc(tok, &p, end, env, CS_COMPLETE); + if (r < 0) goto err; } fetched = 0; } @@ -6759,7 +6786,7 @@ parse_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) break; case TK_CC_POSIX_BRACKET_OPEN: - r = parse_posix_bracket(cc, &p, end, env); + r = prs_posix_bracket(cc, &p, end, env); if (r < 0) goto err; if (r == 1) { /* is not POSIX bracket */ CC_ESC_WARN(env, (UChar* )"["); @@ -6869,7 +6896,7 @@ parse_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) } state = CS_COMPLETE; - r = parse_cc(&anode, tok, &p, end, env); + r = prs_cc(&anode, tok, &p, end, env); if (r != 0) { onig_node_free(anode); goto cc_open_err; @@ -6967,14 +6994,14 @@ parse_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) return r; } -static int parse_alts(Node** top, PToken* tok, int term, - UChar** src, UChar* end, ScanEnv* env, int group_head); +static int prs_alts(Node** top, PToken* tok, int term, + UChar** src, UChar* end, ScanEnv* env, int group_head); #ifdef USE_CALLOUT /* (?{...}[tag][+-]) (?{{...}}[tag][+-]) */ static int -parse_callout_of_contents(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env) +prs_callout_of_contents(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env) { int r; int i; @@ -7078,18 +7105,18 @@ parse_callout_of_contents(Node** np, int cterm, UChar** src, UChar* end, ScanEnv contents = onigenc_strdup(enc, code_start, code_end); CHECK_NULL_RETURN_MEMERR(contents); - r = node_new_callout(np, ONIG_CALLOUT_OF_CONTENTS, num, ONIG_NON_NAME_ID, env); - if (r != 0) { - xfree(contents); - return r; - } - e = onig_reg_callout_list_at(env->reg, num); if (IS_NULL(e)) { xfree(contents); return ONIGERR_MEMORY; } + r = node_new_callout(np, ONIG_CALLOUT_OF_CONTENTS, num, ONIG_NON_NAME_ID, env); + if (r != 0) { + xfree(contents); + return r; + } + e->of = ONIG_CALLOUT_OF_CONTENTS; e->in = in; e->name_id = ONIG_NON_NAME_ID; @@ -7101,7 +7128,7 @@ parse_callout_of_contents(Node** np, int cterm, UChar** src, UChar* end, ScanEnv } static long -parse_long(OnigEncoding enc, UChar* s, UChar* end, int sign_on, long max, long* rl) +prs_long(OnigEncoding enc, UChar* s, UChar* end, int sign_on, long max, long* rl) { long v; long d; @@ -7137,10 +7164,27 @@ parse_long(OnigEncoding enc, UChar* s, UChar* end, int sign_on, long max, long* return ONIG_NORMAL; } +static void +clear_callout_args(int n, unsigned int types[], OnigValue vals[]) +{ + int i; + + for (i = 0; i < n; i++) { + switch (types[i]) { + case ONIG_TYPE_STRING: + if (IS_NOT_NULL(vals[i].s.start)) + xfree(vals[i].s.start); + break; + default: + break; + } + } +} + static int -parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, - int max_arg_num, unsigned int types[], OnigValue vals[], - ScanEnv* env) +prs_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, + int max_arg_num, unsigned int types[], OnigValue vals[], + ScanEnv* env) { #define MAX_CALLOUT_ARG_BYTE_LENGTH 128 @@ -7168,7 +7212,10 @@ parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, bufend = buf; s = e = p; while (1) { - if (PEND) return ONIGERR_INVALID_CALLOUT_PATTERN; + if (PEND) { + r = ONIGERR_INVALID_CALLOUT_PATTERN; + goto err_clear; + } e = p; PFETCH_S(c); @@ -7196,8 +7243,10 @@ parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, add_char: if (skip_mode == FALSE) { clen = p - e; - if (bufend + clen > buf + MAX_CALLOUT_ARG_BYTE_LENGTH) - return ONIGERR_INVALID_CALLOUT_ARG; /* too long argument */ + if (bufend + clen > buf + MAX_CALLOUT_ARG_BYTE_LENGTH) { + r = ONIGERR_INVALID_CALLOUT_ARG; /* too long argument */ + goto err_clear; + } xmemcpy(bufend, e, clen); bufend += clen; @@ -7208,15 +7257,17 @@ parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, } if (cn != 0) { - if (max_arg_num >= 0 && n >= max_arg_num) - return ONIGERR_INVALID_CALLOUT_ARG; + if (max_arg_num >= 0 && n >= max_arg_num) { + r = ONIGERR_INVALID_CALLOUT_ARG; + goto err_clear; + } if (skip_mode == FALSE) { if ((types[n] & ONIG_TYPE_LONG) != 0) { int fixed = 0; if (cn > 0) { long rl; - r = parse_long(enc, buf, bufend, 1, LONG_MAX, &rl); + r = prs_long(enc, buf, bufend, 1, LONG_MAX, &rl); if (r == ONIG_NORMAL) { vals[n].l = rl; fixed = 1; @@ -7226,8 +7277,10 @@ parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, if (fixed == 0) { types[n] = (types[n] & ~ONIG_TYPE_LONG); - if (types[n] == ONIG_TYPE_VOID) - return ONIGERR_INVALID_CALLOUT_ARG; + if (types[n] == ONIG_TYPE_VOID) { + r = ONIGERR_INVALID_CALLOUT_ARG; + goto err_clear; + } } } @@ -7236,22 +7289,29 @@ parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, break; case ONIG_TYPE_CHAR: - if (cn != 1) return ONIGERR_INVALID_CALLOUT_ARG; + if (cn != 1) { + r = ONIGERR_INVALID_CALLOUT_ARG; + goto err_clear; + } vals[n].c = ONIGENC_MBC_TO_CODE(enc, buf, bufend); break; case ONIG_TYPE_STRING: { UChar* rs = onigenc_strdup(enc, buf, bufend); - CHECK_NULL_RETURN_MEMERR(rs); + if (IS_NULL(rs)) { + r = ONIGERR_MEMORY; goto err_clear; + } vals[n].s.start = rs; vals[n].s.end = rs + (e - s); } break; case ONIG_TYPE_TAG: - if (eesc != 0 || ! is_allowed_callout_tag_name(enc, s, e)) - return ONIGERR_INVALID_CALLOUT_TAG_NAME; + if (eesc != 0 || ! is_allowed_callout_tag_name(enc, s, e)) { + r = ONIGERR_INVALID_CALLOUT_TAG_NAME; + goto err_clear; + } vals[n].s.start = s; vals[n].s.end = e; @@ -7259,7 +7319,8 @@ parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, case ONIG_TYPE_VOID: case ONIG_TYPE_POINTER: - return ONIGERR_PARSER_BUG; + r = ONIGERR_PARSER_BUG; + goto err_clear; break; } } @@ -7270,15 +7331,23 @@ parse_callout_args(int skip_mode, int cterm, UChar** src, UChar* end, if (c == cterm) break; } - if (c != cterm) return ONIGERR_INVALID_CALLOUT_PATTERN; + if (c != cterm) { + r = ONIGERR_INVALID_CALLOUT_PATTERN; + goto err_clear; + } *src = p; return n; + + err_clear: + if (skip_mode == FALSE) + clear_callout_args(n, types, vals); + return r; } /* (*name[TAG]) (*name[TAG]{a,b,..}) */ static int -parse_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env) +prs_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env) { int r; int i; @@ -7343,7 +7412,7 @@ parse_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* en /* read for single check only */ save = p; - arg_num = parse_callout_args(TRUE, '}', &p, end, -1, NULL, NULL, env); + arg_num = prs_callout_args(TRUE, '}', &p, end, -1, NULL, NULL, env); if (arg_num < 0) return arg_num; is_not_single = PPEEK_IS(cterm) ? 0 : 1; @@ -7357,10 +7426,13 @@ parse_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* en types[i] = get_callout_arg_type_by_name_id(name_id, i); } - arg_num = parse_callout_args(FALSE, '}', &p, end, max_arg_num, types, vals, env); + arg_num = prs_callout_args(FALSE, '}', &p, end, max_arg_num, types, vals, env); if (arg_num < 0) return arg_num; - if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; + if (PEND) { + r = ONIGERR_END_PATTERN_IN_GROUP; + goto err_clear; + } PFETCH_S(c); } else { @@ -7379,32 +7451,40 @@ parse_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* en in = onig_get_callout_in_by_name_id(name_id); opt_arg_num = get_callout_opt_arg_num_by_name_id(name_id); - if (arg_num > max_arg_num || arg_num < (max_arg_num - opt_arg_num)) - return ONIGERR_INVALID_CALLOUT_ARG; + if (arg_num > max_arg_num || arg_num < (max_arg_num - opt_arg_num)) { + r = ONIGERR_INVALID_CALLOUT_ARG; + goto err_clear; + } - if (c != cterm) - return ONIGERR_INVALID_CALLOUT_PATTERN; + if (c != cterm) { + r = ONIGERR_INVALID_CALLOUT_PATTERN; + goto err_clear; + } r = reg_callout_list_entry(env, &num); - if (r != 0) return r; + if (r != 0) goto err_clear; ext = onig_get_regex_ext(env->reg); - CHECK_NULL_RETURN_MEMERR(ext); + if (IS_NULL(ext)) { + r = ONIGERR_MEMORY; goto err_clear; + } if (IS_NULL(ext->pattern)) { r = onig_ext_set_pattern(env->reg, env->pattern, env->pattern_end); - if (r != ONIG_NORMAL) return r; + if (r != ONIG_NORMAL) goto err_clear; } if (tag_start != tag_end) { r = callout_tag_entry(env, env->reg, tag_start, tag_end, num); - if (r != ONIG_NORMAL) return r; + if (r != ONIG_NORMAL) goto err_clear; } - r = node_new_callout(&node, ONIG_CALLOUT_OF_NAME, num, name_id, env); - if (r != ONIG_NORMAL) return r; - e = onig_reg_callout_list_at(env->reg, num); - CHECK_NULL_RETURN_MEMERR(e); + if (IS_NULL(e)) { + r = ONIGERR_MEMORY; goto err_clear; + } + + r = node_new_callout(&node, ONIG_CALLOUT_OF_NAME, num, name_id, env); + if (r != ONIG_NORMAL) goto err_clear; e->of = ONIG_CALLOUT_OF_NAME; e->in = in; @@ -7425,12 +7505,16 @@ parse_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* en *np = node; *src = p; return 0; + + err_clear: + clear_callout_args(arg_num, types, vals); + return r; } #endif static int -parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, - ScanEnv* env) +prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, + ScanEnv* env) { int r, num; Node *target; @@ -7457,7 +7541,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, group: r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_alts(np, tok, term, &p, end, env, FALSE); + r = prs_alts(np, tok, term, &p, end, env, FALSE); if (r < 0) return r; *src = p; return 1; /* group */ @@ -7554,7 +7638,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_alts(&absent, tok, term, &p, end, env, TRUE); + r = prs_alts(&absent, tok, term, &p, end, env, TRUE); if (r < 0) { onig_node_free(absent); return r; @@ -7600,7 +7684,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS)) return ONIGERR_UNDEFINED_GROUP_OPTION; - r = parse_callout_of_contents(np, ')', &p, end, env); + r = prs_callout_of_contents(np, ')', &p, end, env); if (r != 0) return r; goto end; @@ -7620,10 +7704,12 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (IS_CODE_DIGIT_ASCII(enc, c) || c == '-' || c == '+' || c == '<' || c == '\'') { - UChar* name_end; - int back_num; +#ifdef USE_BACKREF_WITH_LEVEL int exist_level; int level; +#endif + UChar* name_end; + int back_num; enum REF_NUM num_type; int is_enclosed; @@ -7631,8 +7717,8 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (! is_enclosed) PUNFETCH; prev = p; - exist_level = 0; #ifdef USE_BACKREF_WITH_LEVEL + exist_level = 0; name_end = NULL_UCHARP; /* no need. escape gcc warning. */ r = fetch_name_with_level( (OnigCodePoint )(is_enclosed != 0 ? c : '('), @@ -7709,7 +7795,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, /* condition part is callouts of contents: (?(?{...})THEN|ELSE) */ condition_is_checker = 0; PFETCH(c); - r = parse_callout_of_contents(&condition, ')', &p, end, env); + r = prs_callout_of_contents(&condition, ')', &p, end, env); if (r != 0) return r; goto end_condition; } @@ -7719,7 +7805,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, else if (c == '*' && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME)) { condition_is_checker = 0; - r = parse_callout_of_name(&condition, ')', &p, end, env); + r = prs_callout_of_name(&condition, ')', &p, end, env); if (r != 0) return r; goto end_condition; } @@ -7730,7 +7816,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, condition_is_checker = 0; r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_alts(&condition, tok, term, &p, end, env, FALSE); + r = prs_alts(&condition, tok, term, &p, end, env, FALSE); if (r < 0) { onig_node_free(condition); return r; @@ -7773,7 +7859,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, onig_node_free(condition); return r; } - r = parse_alts(&target, tok, term, &p, end, env, TRUE); + r = prs_alts(&target, tok, term, &p, end, env, TRUE); if (r < 0) { onig_node_free(condition); onig_node_free(target); @@ -7949,7 +8035,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, env->options = option; r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_alts(&target, tok, term, &p, end, env, FALSE); + r = prs_alts(&target, tok, term, &p, end, env, FALSE); env->options = prev; if (r < 0) { onig_node_free(target); @@ -7976,7 +8062,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, else if (c == '*' && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME)) { PINC; - r = parse_callout_of_name(np, ')', &p, end, env); + r = prs_callout_of_name(np, ')', &p, end, env); if (r != 0) return r; goto end; @@ -7996,7 +8082,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, CHECK_NULL_RETURN_MEMERR(*np); r = fetch_token(tok, &p, end, env); if (r < 0) return r; - r = parse_alts(&target, tok, term, &p, end, env, FALSE); + r = prs_alts(&target, tok, term, &p, end, env, FALSE); if (r < 0) { onig_node_free(target); return r; @@ -8006,7 +8092,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (NODE_TYPE(*np) == NODE_BAG) { if (BAG_(*np)->type == BAG_MEMORY) { - /* Don't move this to previous of parse_alts() */ + /* Don't move this to previous of prs_alts() */ r = scan_env_set_mem_node(env, BAG_(*np)->m.regnum, *np); if (r != 0) return r; } @@ -8285,8 +8371,8 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) } static int -parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, - ScanEnv* env, int group_head) +prs_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, + ScanEnv* env, int group_head) { int r, len, group; Node* qn; @@ -8311,7 +8397,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, break; case TK_SUBEXP_OPEN: - r = parse_bag(np, tok, TK_SUBEXP_CLOSE, src, end, env); + r = prs_bag(np, tok, TK_SUBEXP_CLOSE, src, end, env); if (r < 0) return r; if (r == 1) { /* group */ if (group_head == 0) @@ -8341,7 +8427,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, env->options = BAG_(*np)->o.options; r = fetch_token(tok, src, end, env); if (r < 0) return r; - r = parse_alts(&target, tok, term, src, end, env, FALSE); + r = prs_alts(&target, tok, term, src, end, env, FALSE); env->options = prev; if (r < 0) { onig_node_free(target); @@ -8419,8 +8505,9 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, case TK_CODE_POINT: { UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; - len = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf); + len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.code); if (len < 0) return len; + len = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf); #ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG *np = node_new_str_crude(buf, buf + len, env->options); #else @@ -8465,7 +8552,12 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, *np = node_new_cclass(); CHECK_NULL_RETURN_MEMERR(*np); cc = CCLASS_(*np); - add_ctype_to_cc(cc, tok->u.prop.ctype, FALSE, env); + r = add_ctype_to_cc(cc, tok->u.prop.ctype, FALSE, env); + if (r != 0) { + onig_node_free(*np); + *np = NULL_NODE; + return r; + } if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); } break; @@ -8478,7 +8570,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, break; case TK_CHAR_PROPERTY: - r = parse_char_property(np, tok, src, end, env); + r = prs_char_property(np, tok, src, end, env); if (r != 0) return r; break; @@ -8486,7 +8578,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, { CClassNode* cc; - r = parse_cc(np, tok, src, end, env); + r = prs_cc(np, tok, src, end, env); if (r != 0) return r; cc = CCLASS_(*np); @@ -8685,8 +8777,8 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, } static int -parse_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, - ScanEnv* env, int group_head) +prs_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, + ScanEnv* env, int group_head) { int r; Node *node, **headp; @@ -8694,7 +8786,7 @@ parse_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, *top = NULL; INC_PARSE_DEPTH(env->parse_depth); - r = parse_exp(&node, tok, term, src, end, env, group_head); + r = prs_exp(&node, tok, term, src, end, env, group_head); if (r < 0) { onig_node_free(node); return r; @@ -8712,7 +8804,7 @@ parse_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, headp = &(NODE_CDR(*top)); while (r != TK_EOT && r != term && r != TK_ALT) { - r = parse_exp(&node, tok, term, src, end, env, FALSE); + r = prs_exp(&node, tok, term, src, end, env, FALSE); if (r < 0) { onig_node_free(node); return r; @@ -8736,8 +8828,8 @@ parse_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */ static int -parse_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end, - ScanEnv* env, int group_head) +prs_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end, + ScanEnv* env, int group_head) { int r; Node *node, **headp; @@ -8747,7 +8839,7 @@ parse_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end, INC_PARSE_DEPTH(env->parse_depth); save_options = env->options; - r = parse_branch(&node, tok, term, src, end, env, group_head); + r = prs_branch(&node, tok, term, src, end, env, group_head); if (r < 0) { onig_node_free(node); return r; @@ -8767,7 +8859,7 @@ parse_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end, while (r == TK_ALT) { r = fetch_token(tok, src, end, env); if (r < 0) return r; - r = parse_branch(&node, tok, term, src, end, env, FALSE); + r = prs_branch(&node, tok, term, src, end, env, FALSE); if (r < 0) { onig_node_free(node); return r; @@ -8800,7 +8892,7 @@ parse_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end, } static int -parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) +prs_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) { int r; PToken tok; @@ -8808,7 +8900,7 @@ parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) ptoken_init(&tok); r = fetch_token(&tok, src, end, env); if (r < 0) return r; - r = parse_alts(top, &tok, TK_EOT, src, end, env, FALSE); + r = prs_alts(top, &tok, TK_EOT, src, end, env, FALSE); if (r < 0) return r; return 0; @@ -8846,6 +8938,15 @@ onig_parse_tree(Node** root, const UChar* pattern, const UChar* end, RegexExt* ext; #endif + reg->string_pool = 0; + reg->string_pool_end = 0; + reg->num_mem = 0; + reg->num_repeat = 0; + reg->num_empty_check = 0; + reg->repeat_range_alloc = 0; + reg->repeat_range = (RepeatRange* )NULL; + reg->empty_status_mem = 0; + names_clear(reg); scan_env_clear(env); @@ -8863,7 +8964,7 @@ onig_parse_tree(Node** root, const UChar* pattern, const UChar* end, return ONIGERR_INVALID_WIDE_CHAR_VALUE; p = (UChar* )pattern; - r = parse_regexp(root, &p, (UChar* )end, env); + r = prs_regexp(root, &p, (UChar* )end, env); if (r != 0) return r; #ifdef USE_CALL diff --git a/src/regparse.h b/src/regparse.h index 979e982..c60a42d 100644 --- a/src/regparse.h +++ b/src/regparse.h @@ -163,7 +163,7 @@ typedef struct { struct _Node* body; /* to BagNode : BAG_MEMORY */ int by_number; - int group_num; + int called_gnum; UChar* name; UChar* name_end; int entry_count; @@ -339,6 +339,7 @@ typedef struct { #define NODE_ST_TEXT_SEGMENT_WORD (1<<23) #define NODE_ST_ABSENT_WITH_SIDE_EFFECTS (1<<24) /* stopper or clear */ #define NODE_ST_FIXED_CLEN_MIN_SURE (1<<25) +#define NODE_ST_REFERENCED (1<<26) #define NODE_STATUS(node) (((Node* )node)->u.base.status) @@ -374,6 +375,7 @@ typedef struct { #define NODE_IS_TEXT_SEGMENT_WORD(node) ((NODE_STATUS(node) & NODE_ST_TEXT_SEGMENT_WORD) != 0) #define NODE_IS_ABSENT_WITH_SIDE_EFFECTS(node) ((NODE_STATUS(node) & NODE_ST_ABSENT_WITH_SIDE_EFFECTS) != 0) #define NODE_IS_FIXED_CLEN_MIN_SURE(node) ((NODE_STATUS(node) & NODE_ST_FIXED_CLEN_MIN_SURE) != 0) +#define NODE_IS_REFERENCED(node) ((NODE_STATUS(node) & NODE_ST_REFERENCED) != 0) #define NODE_PARENT(node) ((node)->u.base.parent) #define NODE_BODY(node) ((node)->u.base.body) diff --git a/src/regposerr.c b/src/regposerr.c index 12d95a9..e5b7899 100644 --- a/src/regposerr.c +++ b/src/regposerr.c @@ -37,6 +37,18 @@ #include "config.h" #include "onigposix.h" +#undef regex_t +#undef regmatch_t +#undef regoff_t +#undef regcomp +#undef regexec +#undef regfree +#undef regerror +#undef reg_set_encoding +#undef reg_name_to_group_numbers +#undef reg_foreach_name +#undef reg_number_of_names + #ifndef ONIG_NO_STANDARD_C_HEADERS #include <string.h> #include <stdio.h> @@ -92,10 +104,9 @@ static char* ESTRING[] = { }; - extern size_t -regerror(int posix_ecode, const regex_t* reg ARG_UNUSED, char* buf, - size_t size) +onig_posix_regerror(int posix_ecode, const onig_posix_regex_t* reg ARG_UNUSED, + char* buf, size_t size) { char* s; char tbuf[35]; @@ -121,3 +132,14 @@ regerror(int posix_ecode, const regex_t* reg ARG_UNUSED, char* buf, } return len; } + +#ifdef USE_BINARY_COMPATIBLE_POSIX_API + +extern size_t +regerror(int posix_ecode, const onig_posix_regex_t* reg ARG_UNUSED, + char* buf, size_t size) +{ + return onig_posix_regerror(posix_ecode, reg, buf, size); +} + +#endif diff --git a/src/regposix.c b/src/regposix.c index 4e523a4..497ba02 100644 --- a/src/regposix.c +++ b/src/regposix.c @@ -33,6 +33,18 @@ #include "onigposix.h" +#undef regex_t +#undef regmatch_t +#undef regoff_t +#undef regcomp +#undef regexec +#undef regfree +#undef regerror +#undef reg_set_encoding +#undef reg_name_to_group_numbers +#undef reg_foreach_name +#undef reg_number_of_names + #define ONIG_C(reg) ((onig_regex_t* )((reg)->onig)) #define PONIG_C(reg) ((onig_regex_t** )(&(reg)->onig)) @@ -64,6 +76,7 @@ onig2posix_error_code(int code) { ONIGERR_MATCH_STACK_LIMIT_OVER, REG_EONIG_INTERNAL }, { ONIGERR_RETRY_LIMIT_IN_MATCH_OVER, REG_EONIG_INTERNAL }, { ONIGERR_RETRY_LIMIT_IN_SEARCH_OVER, REG_EONIG_INTERNAL }, + { ONIGERR_SUBEXP_CALL_LIMIT_IN_SEARCH_OVER, REG_EONIG_INTERNAL }, { ONIGERR_TYPE_BUG, REG_EONIG_INTERNAL }, { ONIGERR_PARSER_BUG, REG_EONIG_INTERNAL }, { ONIGERR_STACK_BUG, REG_EONIG_INTERNAL }, @@ -144,7 +157,7 @@ onig2posix_error_code(int code) } extern int -regcomp(regex_t* reg, const char* pattern, int posix_options) +onig_posix_regcomp(onig_posix_regex_t* reg, const char* pattern, int posix_options) { int r, len; OnigSyntaxType* syntax = OnigDefaultSyntax; @@ -178,12 +191,12 @@ regcomp(regex_t* reg, const char* pattern, int posix_options) } extern int -regexec(regex_t* reg, const char* str, size_t nmatch, - regmatch_t pmatch[], int posix_options) +onig_posix_regexec(onig_posix_regex_t* reg, const char* str, size_t nmatch, + onig_posix_regmatch_t pmatch[], int posix_options) { int r, i, len; UChar* end; - regmatch_t* pm; + onig_posix_regmatch_t* pm; OnigOptionType options; options = ONIG_OPTION_POSIX_REGION; @@ -191,11 +204,11 @@ regexec(regex_t* reg, const char* str, size_t nmatch, if ((posix_options & REG_NOTEOL) != 0) options |= ONIG_OPTION_NOTEOL; if (nmatch == 0 || (reg->comp_options & REG_NOSUB) != 0) { - pm = (regmatch_t* )NULL; + pm = (onig_posix_regmatch_t* )NULL; nmatch = 0; } else if ((int )nmatch < ONIG_C(reg)->num_mem + 1) { - pm = (regmatch_t* )xmalloc(sizeof(regmatch_t) + pm = (onig_posix_regmatch_t* )xmalloc(sizeof(onig_posix_regmatch_t) * (ONIG_C(reg)->num_mem + 1)); if (pm == NULL) return REG_ESPACE; @@ -212,7 +225,7 @@ regexec(regex_t* reg, const char* str, size_t nmatch, if (r >= 0) { r = 0; /* Match */ if (pm != pmatch && pm != NULL) { - xmemcpy(pmatch, pm, sizeof(regmatch_t) * nmatch); + xmemcpy(pmatch, pm, sizeof(onig_posix_regmatch_t) * nmatch); } } else if (r == ONIG_MISMATCH) { @@ -236,7 +249,7 @@ regexec(regex_t* reg, const char* str, size_t nmatch, } extern void -regfree(regex_t* reg) +onig_posix_regfree(onig_posix_regex_t* reg) { onig_free(ONIG_C(reg)); reg->onig = (void* )0; @@ -244,7 +257,7 @@ regfree(regex_t* reg) extern void -reg_set_encoding(int mb_code) +onig_posix_reg_set_encoding(int mb_code) { OnigEncoding enc; @@ -279,15 +292,15 @@ reg_set_encoding(int mb_code) } extern int -reg_name_to_group_numbers(regex_t* reg, +onig_posix_reg_name_to_group_numbers(onig_posix_regex_t* reg, const unsigned char* name, const unsigned char* name_end, int** nums) { return onig_name_to_group_numbers(ONIG_C(reg), name, name_end, nums); } typedef struct { - int (*func)(const unsigned char*, const unsigned char*,int,int*,regex_t*,void*); - regex_t* reg; + int (*func)(const unsigned char*, const unsigned char*,int,int*,onig_posix_regex_t*,void*); + onig_posix_regex_t* reg; void* arg; } i_wrap; @@ -301,8 +314,8 @@ i_wrapper(const UChar* name, const UChar* name_end, int ng, int* gs, } extern int -reg_foreach_name(regex_t* reg, - int (*func)(const unsigned char*, const unsigned char*,int,int*,regex_t*,void*), +onig_posix_reg_foreach_name(onig_posix_regex_t* reg, + int (*func)(const unsigned char*, const unsigned char*,int,int*,onig_posix_regex_t*,void*), void* arg) { i_wrap warg; @@ -315,7 +328,58 @@ reg_foreach_name(regex_t* reg, } extern int -reg_number_of_names(regex_t* reg) +onig_posix_reg_number_of_names(onig_posix_regex_t* reg) { return onig_number_of_names(ONIG_C(reg)); } + + +#ifdef USE_BINARY_COMPATIBLE_POSIX_API + +extern int +regcomp(onig_posix_regex_t* reg, const char* pattern, int posix_options) +{ + return onig_posix_regcomp(reg, pattern, posix_options); +} + +extern int +regexec(onig_posix_regex_t* reg, const char* str, size_t nmatch, + onig_posix_regmatch_t pmatch[], int posix_options) +{ + return onig_posix_regexec(reg, str, nmatch, pmatch, posix_options); +} + +extern void +regfree(onig_posix_regex_t* reg) +{ + onig_posix_regfree(reg); +} + +extern void +reg_set_encoding(int mb_code) +{ + onig_posix_reg_set_encoding(mb_code); +} + +extern int +reg_name_to_group_numbers(onig_posix_regex_t* reg, + const unsigned char* name, const unsigned char* name_end, int** nums) +{ + return onig_posix_reg_name_to_group_numbers(reg, name, name_end, nums); +} + +extern int +reg_foreach_name(onig_posix_regex_t* reg, + int (*func)(const unsigned char*, const unsigned char*,int,int*,onig_posix_regex_t*,void*), + void* arg) +{ + return onig_posix_reg_foreach_name(reg, func, arg); +} + +extern int +reg_number_of_names(onig_posix_regex_t* reg) +{ + return onig_posix_reg_number_of_names(reg); +} + +#endif /* USE_BINARY_COMPATIBLE_POSIX_API */ @@ -2,7 +2,7 @@ sjis.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -113,13 +113,15 @@ static int code_to_mbclen(OnigCodePoint code) { if (code < 256) { - return EncLen_SJIS[(int )code] == 1; + if (EncLen_SJIS[(int )code] == 1) + return 1; } - else if (code <= 0xffff) { - return 2; + else if (code < 0x10000) { + if (EncLen_SJIS[(int )(code >> 8) & 0xff] == 2) + return 2; } - else - return ONIGERR_INVALID_CODE_POINT_VALUE; + + return ONIGERR_INVALID_CODE_POINT_VALUE; } static OnigCodePoint @@ -151,6 +151,7 @@ st_init_table_with_size(type, size) #endif size = new_size(size); /* round up to prime number */ + if (size <= 0) return 0; tbl = alloc(st_table); if (tbl == 0) return 0; @@ -318,10 +319,13 @@ rehash(table) register st_table *table; { register st_table_entry *ptr, *next, **new_bins; - int i, old_num_bins = table->num_bins, new_num_bins; + int i, new_num_bins, old_num_bins; unsigned int hash_val; - new_num_bins = new_size(old_num_bins+1); + old_num_bins = table->num_bins; + new_num_bins = new_size(old_num_bins + 1); + if (new_num_bins <= 0) return ; + new_bins = (st_table_entry**)Calloc(new_num_bins, sizeof(st_table_entry*)); if (new_bins == 0) { return ; diff --git a/src/unicode.c b/src/unicode.c index 080da74..6703d4b 100644 --- a/src/unicode.c +++ b/src/unicode.c @@ -387,15 +387,15 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, for (i = 0; i < ncs[0]; i++) { for (j = 0; j < ncs[1]; j++) { for (k = 0; k < ncs[2]; k++) { + if (cs[0][i] == orig_codes[0] && cs[1][j] == orig_codes[1] && + cs[2][k] == orig_codes[2]) + continue; + items[n].byte_len = lens[2]; items[n].code_len = 3; items[n].code[0] = cs[0][i]; items[n].code[1] = cs[1][j]; items[n].code[2] = cs[2][k]; - if (items[n].code[0] == orig_codes[0] && - items[n].code[1] == orig_codes[1] && - items[n].code[2] == orig_codes[2]) - continue; n++; } } @@ -431,13 +431,12 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, for (i = 0; i < ncs[0]; i++) { for (j = 0; j < ncs[1]; j++) { + if (cs[0][i] == orig_codes[0] && cs[1][j] == orig_codes[1]) + continue; items[n].byte_len = lens[1]; items[n].code_len = 2; items[n].code[0] = cs[0][i]; items[n].code[1] = cs[1][j]; - if (items[n].code[0] == orig_codes[0] && - items[n].code[1] == orig_codes[1]) - continue; n++; } } diff --git a/src/utf16_be.c b/src/utf16_be.c index d237b93..5014e18 100644 --- a/src/utf16_be.c +++ b/src/utf16_be.c @@ -2,7 +2,7 @@ utf16_be.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/utf16_le.c b/src/utf16_le.c index f14d263..35ceb3c 100644 --- a/src/utf16_le.c +++ b/src/utf16_le.c @@ -2,7 +2,7 @@ utf16_le.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/utf32_be.c b/src/utf32_be.c index bdd3db7..31bd98b 100644 --- a/src/utf32_be.c +++ b/src/utf32_be.c @@ -2,7 +2,7 @@ utf32_be.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -67,7 +67,10 @@ utf32be_is_mbc_newline(const UChar* p, const UChar* end) static OnigCodePoint utf32be_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED) { - return (OnigCodePoint )(((p[0] * 256 + p[1]) * 256 + p[2]) * 256 + p[3]); + OnigCodePoint code; + + code = (OnigCodePoint )((((p[0] & 0x7f) * 256 + p[1]) * 256 + p[2]) * 256 + p[3]); + return code; } static int diff --git a/src/utf32_le.c b/src/utf32_le.c index 473ab74..f50cab7 100644 --- a/src/utf32_le.c +++ b/src/utf32_le.c @@ -2,7 +2,7 @@ utf32_le.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2019 K.Kosako + * Copyright (c) 2002-2020 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -67,7 +67,10 @@ utf32le_is_mbc_newline(const UChar* p, const UChar* end) static OnigCodePoint utf32le_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED) { - return (OnigCodePoint )(((p[3] * 256 + p[2]) * 256 + p[1]) * 256 + p[0]); + OnigCodePoint code; + + code = (OnigCodePoint )((((p[3] & 0x7f) * 256 + p[2]) * 256 + p[1]) * 256 + p[0]); + return code; } static int |