diff options
Diffstat (limited to 'src/regparse.c')
-rw-r--r-- | src/regparse.c | 179 |
1 files changed, 105 insertions, 74 deletions
diff --git a/src/regparse.c b/src/regparse.c index e06d9d2..6be8366 100644 --- a/src/regparse.c +++ b/src/regparse.c @@ -26,7 +26,6 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ - #include "regparse.h" #include "st.h" @@ -97,6 +96,14 @@ extern void onig_set_verb_warn_func(OnigWarnFunc f) onig_verb_warn = f; } +extern void +onig_warning(const char* s) +{ + if (onig_warn == onig_null_warn) return ; + + (*onig_warn)(s); +} + static void bbuf_free(BBuf* bbuf) { @@ -957,6 +964,9 @@ scan_env_add_mem_entry(ScanEnv* env) Node** p; need = env->num_mem + 1; + if (need > ONIG_MAX_CAPTURE_NUM) + return ONIGERR_TOO_MANY_CAPTURES; + if (need >= SCANENV_MEMNODES_SIZE) { if (env->mem_alloc <= need) { if (IS_NULL(env->mem_nodes_dynamic)) { @@ -1987,8 +1997,8 @@ or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) return 0; } -static int -conv_backslash_value(int c, ScanEnv* env) +static OnigCodePoint +conv_backslash_value(OnigCodePoint c, ScanEnv* env) { if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) { switch (c) { @@ -2259,7 +2269,7 @@ fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) if (p == prev) { if (non_low != 0) - goto invalid; + goto invalid; up = REPEAT_INFINITE; /* {n,} : {n,infinite} */ } } @@ -2291,15 +2301,17 @@ fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) return r; /* 0: normal {n,m}, 2: fixed {n} */ invalid: - if (syn_allow) + if (syn_allow) { + *src = p; return 1; /* OK */ + } else return ONIGERR_INVALID_REPEAT_RANGE_PATTERN; } /* \M-, \C-, \c, or \... */ static int -fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) +fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val) { int v; OnigCodePoint c; @@ -2318,9 +2330,8 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) if (PEND) return ONIGERR_END_PATTERN_AT_META; PFETCH_S(c); if (c == MC_ESC(env->syntax)) { - v = fetch_escaped_value(&p, end, env); + v = fetch_escaped_value(&p, end, env, &c); if (v < 0) return v; - c = (OnigCodePoint )v; } c = ((c & 0xff) | 0x80); } @@ -2348,9 +2359,8 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) } else { if (c == MC_ESC(env->syntax)) { - v = fetch_escaped_value(&p, end, env); + v = fetch_escaped_value(&p, end, env, &c); if (v < 0) return v; - c = (OnigCodePoint )v; } c &= 0x9f; } @@ -2367,7 +2377,8 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) } *src = p; - return c; + *val = c; + return 0; } static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env); @@ -2463,6 +2474,10 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, int level; int flag = (c == '-' ? -1 : 1); + if (PEND) { + r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + goto end; + } PFETCH(c); if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err; PUNFETCH; @@ -2471,9 +2486,11 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, *rlevel = (level * flag); exist_level = 1; - PFETCH(c); - if (c == end_code) - goto end; + if (!PEND) { + PFETCH(c); + if (c == end_code) + goto end; + } } err: @@ -2880,6 +2897,8 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case 'p': case 'P': + if (PEND) break; + c2 = PPEEK; if (c2 == '{' && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { @@ -2887,7 +2906,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->type = TK_CHAR_PROPERTY; tok->u.prop.not = (c == 'P' ? 1 : 0); - if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { + if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { PFETCH(c2); if (c2 == '^') { tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0); @@ -2903,25 +2922,25 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) prev = p; if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { - PINC; - num = scan_unsigned_hexadecimal_number(&p, end, 8, enc); - if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; - if (!PEND) { + PINC; + num = scan_unsigned_hexadecimal_number(&p, end, 8, enc); + if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; + if (!PEND) { c2 = PPEEK; if (ONIGENC_IS_CODE_XDIGIT(enc, c2)) return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; } - if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) { - PINC; - tok->type = TK_CODE_POINT; - tok->base = 16; - tok->u.code = (OnigCodePoint )num; - } - else { - /* can't read nothing or invalid format */ - p = prev; - } + if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) { + PINC; + tok->type = TK_CODE_POINT; + tok->base = 16; + tok->u.code = (OnigCodePoint )num; + } + else { + /* can't read nothing or invalid format */ + p = prev; + } } else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { num = scan_unsigned_hexadecimal_number(&p, end, 2, enc); @@ -2969,10 +2988,10 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) default: PUNFETCH; - num = fetch_escaped_value(&p, end, env); + num = fetch_escaped_value(&p, end, env, &c2); if (num < 0) return num; - if (tok->u.c != num) { - tok->u.code = (OnigCodePoint )num; + if (tok->u.c != c2) { + tok->u.code = c2; tok->type = TK_CODE_POINT; } break; @@ -3332,7 +3351,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (num < 0) return ONIGERR_TOO_BIG_NUMBER; if (p == prev) { /* can't read nothing. */ num = 0; /* but, it's not error */ - } + } tok->type = TK_RAW_BYTE; tok->base = 8; tok->u.c = num; @@ -3344,7 +3363,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) #ifdef USE_NAMED_GROUP case 'k': - if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) { + if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) { PFETCH(c); if (c == '<' || c == '\'') { UChar* name_end; @@ -3417,7 +3436,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) #ifdef USE_SUBEXP_CALL case 'g': - if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) { + if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) { PFETCH(c); if (c == '<' || c == '\'') { int gnum; @@ -3446,13 +3465,14 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case 'p': case 'P': - if (PPEEK_IS('{') && + if (!PEND && PPEEK_IS('{') && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { PINC; tok->type = TK_CHAR_PROPERTY; tok->u.prop.not = (c == 'P' ? 1 : 0); - if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { + if (!PEND && + IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { PFETCH(c); if (c == '^') { tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0); @@ -3464,16 +3484,20 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) break; default: - PUNFETCH; - num = fetch_escaped_value(&p, end, env); - if (num < 0) return num; - /* set_raw: */ - if (tok->u.c != num) { - tok->type = TK_CODE_POINT; - tok->u.code = (OnigCodePoint )num; - } - else { /* string */ - p = tok->backp + enclen(enc, tok->backp); + { + OnigCodePoint c2; + + PUNFETCH; + num = fetch_escaped_value(&p, end, env, &c2); + if (num < 0) return num; + /* set_raw: */ + if (tok->u.c != c2) { + tok->type = TK_CODE_POINT; + tok->u.code = c2; + } + else { /* string */ + p = tok->backp + enclen(enc, tok->backp); + } } break; } @@ -3548,10 +3572,10 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) if (r < 0) return r; /* error */ if (r == 0) goto greedy_check; else if (r == 2) { /* {n} */ - if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) - goto possessive_check; + if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY)) + goto possessive_check; - goto greedy_check; + goto greedy_check; } /* r == 1 : normal char */ break; @@ -3562,10 +3586,10 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) break; case '(': - if (PPEEK_IS('?') && + if (!PEND && PPEEK_IS('?') && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { PINC; - if (PPEEK_IS('#')) { + if (!PEND && PPEEK_IS('#')) { PFETCH(c); while (1) { if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; @@ -3612,7 +3636,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case ']': if (*src > env->pattern) /* /].../ is allowed. */ - CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]"); + CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]"); break; case '#': @@ -3975,8 +3999,9 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v, switch (*state) { case CCS_VALUE: - if (*type == CCV_SB) + if (*type == CCV_SB) { BITSET_SET_BIT(cc->bs, (int )(*vs)); + } else if (*type == CCV_CODE_POINT) { r = add_code_range(&(cc->mbuf), env, *vs, *vs); if (r < 0) return r; @@ -3989,13 +4014,13 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v, if (*vs > 0xff || v > 0xff) return ONIGERR_INVALID_CODE_POINT_VALUE; - if (*vs > v) { - if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) - goto ccs_range_end; - else - return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; - } - bitset_set_range(cc->bs, (int )*vs, (int )v); + if (*vs > v) { + if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) + goto ccs_range_end; + else + return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; + } + bitset_set_range(cc->bs, (int )*vs, (int )v); } else { r = add_code_range(&(cc->mbuf), env, *vs, v); @@ -4006,15 +4031,15 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v, #if 0 if (intype == CCV_CODE_POINT && *type == CCV_SB) { #endif - if (*vs > v) { - if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) - goto ccs_range_end; - else - return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; - } - bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff)); - r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v); - if (r < 0) return r; + if (*vs > v) { + if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) + goto ccs_range_end; + else + return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; + } + bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff)); + r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v); + if (r < 0) return r; #if 0 } else @@ -4110,6 +4135,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, fetched = 0; switch (r) { case TK_CHAR: + any_char_in: len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c); if (len > 1) { in_type = CCV_CODE_POINT; @@ -4119,7 +4145,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, goto err; } else { - sb_char: + /* sb_char: */ in_type = CCV_SB; } v = (OnigCodePoint )tok->u.c; @@ -4265,7 +4291,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, } else if (state == CCS_RANGE) { CC_ESC_WARN(env, (UChar* )"-"); - goto sb_char; /* [!--x] is allowed */ + goto any_char_in; /* [!--x] is allowed */ } else { /* CCS_COMPLETE */ r = fetch_token_in_cc(tok, &p, end, env); @@ -4279,7 +4305,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) { CC_ESC_WARN(env, (UChar* )"-"); - goto sb_char; /* [0-9-a] is allowed as [0-9\-a] */ + goto any_char_in; /* [0-9-a] is allowed as [0-9\-a] */ } r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS; goto err; @@ -4452,6 +4478,7 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, #endif case '<': /* look behind (?<=...), (?<!...) */ + if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; PFETCH(c); if (c == '=') *np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND); @@ -4924,7 +4951,7 @@ parse_exp(Node** np, OnigToken* tok, int term, len = 1; while (1) { if (len >= ONIGENC_MBC_MINLEN(env->enc)) { - if (len == enclen(env->enc, NSTR(*np)->s)) { + if (len == enclen(env->enc, NSTR(*np)->s)) {//should not enclen_end() r = fetch_token(tok, src, end, env); NSTRING_CLEAR_RAW(*np); goto string_end; @@ -5300,6 +5327,10 @@ onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end, env->reg = reg; *root = NULL; + + if (! ONIGENC_IS_VALID_MBC_STRING(env->enc, pattern, end)) + return ONIGERR_INVALID_WIDE_CHAR_VALUE; + p = (UChar* )pattern; r = parse_regexp(root, &p, (UChar* )end, env); reg->num_mem = env->num_mem; |