summaryrefslogtreecommitdiff
path: root/src/regparse.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/regparse.c')
-rw-r--r--src/regparse.c179
1 files changed, 105 insertions, 74 deletions
diff --git a/src/regparse.c b/src/regparse.c
index e06d9d2..6be8366 100644
--- a/src/regparse.c
+++ b/src/regparse.c
@@ -26,7 +26,6 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
-
#include "regparse.h"
#include "st.h"
@@ -97,6 +96,14 @@ extern void onig_set_verb_warn_func(OnigWarnFunc f)
onig_verb_warn = f;
}
+extern void
+onig_warning(const char* s)
+{
+ if (onig_warn == onig_null_warn) return ;
+
+ (*onig_warn)(s);
+}
+
static void
bbuf_free(BBuf* bbuf)
{
@@ -957,6 +964,9 @@ scan_env_add_mem_entry(ScanEnv* env)
Node** p;
need = env->num_mem + 1;
+ if (need > ONIG_MAX_CAPTURE_NUM)
+ return ONIGERR_TOO_MANY_CAPTURES;
+
if (need >= SCANENV_MEMNODES_SIZE) {
if (env->mem_alloc <= need) {
if (IS_NULL(env->mem_nodes_dynamic)) {
@@ -1987,8 +1997,8 @@ or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
return 0;
}
-static int
-conv_backslash_value(int c, ScanEnv* env)
+static OnigCodePoint
+conv_backslash_value(OnigCodePoint c, ScanEnv* env)
{
if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {
switch (c) {
@@ -2259,7 +2269,7 @@ fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
if (p == prev) {
if (non_low != 0)
- goto invalid;
+ goto invalid;
up = REPEAT_INFINITE; /* {n,} : {n,infinite} */
}
}
@@ -2291,15 +2301,17 @@ fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
return r; /* 0: normal {n,m}, 2: fixed {n} */
invalid:
- if (syn_allow)
+ if (syn_allow) {
+ *src = p;
return 1; /* OK */
+ }
else
return ONIGERR_INVALID_REPEAT_RANGE_PATTERN;
}
/* \M-, \C-, \c, or \... */
static int
-fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
+fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val)
{
int v;
OnigCodePoint c;
@@ -2318,9 +2330,8 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
if (PEND) return ONIGERR_END_PATTERN_AT_META;
PFETCH_S(c);
if (c == MC_ESC(env->syntax)) {
- v = fetch_escaped_value(&p, end, env);
+ v = fetch_escaped_value(&p, end, env, &c);
if (v < 0) return v;
- c = (OnigCodePoint )v;
}
c = ((c & 0xff) | 0x80);
}
@@ -2348,9 +2359,8 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
}
else {
if (c == MC_ESC(env->syntax)) {
- v = fetch_escaped_value(&p, end, env);
+ v = fetch_escaped_value(&p, end, env, &c);
if (v < 0) return v;
- c = (OnigCodePoint )v;
}
c &= 0x9f;
}
@@ -2367,7 +2377,8 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
}
*src = p;
- return c;
+ *val = c;
+ return 0;
}
static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
@@ -2463,6 +2474,10 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
int level;
int flag = (c == '-' ? -1 : 1);
+ if (PEND) {
+ r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
+ goto end;
+ }
PFETCH(c);
if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err;
PUNFETCH;
@@ -2471,9 +2486,11 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,
*rlevel = (level * flag);
exist_level = 1;
- PFETCH(c);
- if (c == end_code)
- goto end;
+ if (!PEND) {
+ PFETCH(c);
+ if (c == end_code)
+ goto end;
+ }
}
err:
@@ -2880,6 +2897,8 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
case 'p':
case 'P':
+ if (PEND) break;
+
c2 = PPEEK;
if (c2 == '{' &&
IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
@@ -2887,7 +2906,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
tok->type = TK_CHAR_PROPERTY;
tok->u.prop.not = (c == 'P' ? 1 : 0);
- if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
+ if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
PFETCH(c2);
if (c2 == '^') {
tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
@@ -2903,25 +2922,25 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
prev = p;
if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
- PINC;
- num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
- if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
- if (!PEND) {
+ PINC;
+ num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
+ if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
+ if (!PEND) {
c2 = PPEEK;
if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
}
- if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) {
- PINC;
- tok->type = TK_CODE_POINT;
- tok->base = 16;
- tok->u.code = (OnigCodePoint )num;
- }
- else {
- /* can't read nothing or invalid format */
- p = prev;
- }
+ if (p > prev + enclen(enc, prev) && !PEND && (PPEEK_IS('}'))) {
+ PINC;
+ tok->type = TK_CODE_POINT;
+ tok->base = 16;
+ tok->u.code = (OnigCodePoint )num;
+ }
+ else {
+ /* can't read nothing or invalid format */
+ p = prev;
+ }
}
else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
@@ -2969,10 +2988,10 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
default:
PUNFETCH;
- num = fetch_escaped_value(&p, end, env);
+ num = fetch_escaped_value(&p, end, env, &c2);
if (num < 0) return num;
- if (tok->u.c != num) {
- tok->u.code = (OnigCodePoint )num;
+ if (tok->u.c != c2) {
+ tok->u.code = c2;
tok->type = TK_CODE_POINT;
}
break;
@@ -3332,7 +3351,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
if (p == prev) { /* can't read nothing. */
num = 0; /* but, it's not error */
- }
+ }
tok->type = TK_RAW_BYTE;
tok->base = 8;
tok->u.c = num;
@@ -3344,7 +3363,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
#ifdef USE_NAMED_GROUP
case 'k':
- if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
+ if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) {
PFETCH(c);
if (c == '<' || c == '\'') {
UChar* name_end;
@@ -3417,7 +3436,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
#ifdef USE_SUBEXP_CALL
case 'g':
- if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
+ if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) {
PFETCH(c);
if (c == '<' || c == '\'') {
int gnum;
@@ -3446,13 +3465,14 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
case 'p':
case 'P':
- if (PPEEK_IS('{') &&
+ if (!PEND && PPEEK_IS('{') &&
IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
PINC;
tok->type = TK_CHAR_PROPERTY;
tok->u.prop.not = (c == 'P' ? 1 : 0);
- if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
+ if (!PEND &&
+ IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
PFETCH(c);
if (c == '^') {
tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
@@ -3464,16 +3484,20 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
break;
default:
- PUNFETCH;
- num = fetch_escaped_value(&p, end, env);
- if (num < 0) return num;
- /* set_raw: */
- if (tok->u.c != num) {
- tok->type = TK_CODE_POINT;
- tok->u.code = (OnigCodePoint )num;
- }
- else { /* string */
- p = tok->backp + enclen(enc, tok->backp);
+ {
+ OnigCodePoint c2;
+
+ PUNFETCH;
+ num = fetch_escaped_value(&p, end, env, &c2);
+ if (num < 0) return num;
+ /* set_raw: */
+ if (tok->u.c != c2) {
+ tok->type = TK_CODE_POINT;
+ tok->u.code = c2;
+ }
+ else { /* string */
+ p = tok->backp + enclen(enc, tok->backp);
+ }
}
break;
}
@@ -3548,10 +3572,10 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
if (r < 0) return r; /* error */
if (r == 0) goto greedy_check;
else if (r == 2) { /* {n} */
- if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
- goto possessive_check;
+ if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
+ goto possessive_check;
- goto greedy_check;
+ goto greedy_check;
}
/* r == 1 : normal char */
break;
@@ -3562,10 +3586,10 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
break;
case '(':
- if (PPEEK_IS('?') &&
+ if (!PEND && PPEEK_IS('?') &&
IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
PINC;
- if (PPEEK_IS('#')) {
+ if (!PEND && PPEEK_IS('#')) {
PFETCH(c);
while (1) {
if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
@@ -3612,7 +3636,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
case ']':
if (*src > env->pattern) /* /].../ is allowed. */
- CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
+ CLOSE_BRACKET_WITHOUT_ESC_WARN(env, (UChar* )"]");
break;
case '#':
@@ -3975,8 +3999,9 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
switch (*state) {
case CCS_VALUE:
- if (*type == CCV_SB)
+ if (*type == CCV_SB) {
BITSET_SET_BIT(cc->bs, (int )(*vs));
+ }
else if (*type == CCV_CODE_POINT) {
r = add_code_range(&(cc->mbuf), env, *vs, *vs);
if (r < 0) return r;
@@ -3989,13 +4014,13 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
if (*vs > 0xff || v > 0xff)
return ONIGERR_INVALID_CODE_POINT_VALUE;
- if (*vs > v) {
- if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
- goto ccs_range_end;
- else
- return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
- }
- bitset_set_range(cc->bs, (int )*vs, (int )v);
+ if (*vs > v) {
+ if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
+ goto ccs_range_end;
+ else
+ return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
+ }
+ bitset_set_range(cc->bs, (int )*vs, (int )v);
}
else {
r = add_code_range(&(cc->mbuf), env, *vs, v);
@@ -4006,15 +4031,15 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
#if 0
if (intype == CCV_CODE_POINT && *type == CCV_SB) {
#endif
- if (*vs > v) {
- if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
- goto ccs_range_end;
- else
- return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
- }
- bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
- r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
- if (r < 0) return r;
+ if (*vs > v) {
+ if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
+ goto ccs_range_end;
+ else
+ return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
+ }
+ bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
+ r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
+ if (r < 0) return r;
#if 0
}
else
@@ -4110,6 +4135,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
fetched = 0;
switch (r) {
case TK_CHAR:
+ any_char_in:
len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c);
if (len > 1) {
in_type = CCV_CODE_POINT;
@@ -4119,7 +4145,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
goto err;
}
else {
- sb_char:
+ /* sb_char: */
in_type = CCV_SB;
}
v = (OnigCodePoint )tok->u.c;
@@ -4265,7 +4291,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
}
else if (state == CCS_RANGE) {
CC_ESC_WARN(env, (UChar* )"-");
- goto sb_char; /* [!--x] is allowed */
+ goto any_char_in; /* [!--x] is allowed */
}
else { /* CCS_COMPLETE */
r = fetch_token_in_cc(tok, &p, end, env);
@@ -4279,7 +4305,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) {
CC_ESC_WARN(env, (UChar* )"-");
- goto sb_char; /* [0-9-a] is allowed as [0-9\-a] */
+ goto any_char_in; /* [0-9-a] is allowed as [0-9\-a] */
}
r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS;
goto err;
@@ -4452,6 +4478,7 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
#endif
case '<': /* look behind (?<=...), (?<!...) */
+ if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
PFETCH(c);
if (c == '=')
*np = onig_node_new_anchor(ANCHOR_LOOK_BEHIND);
@@ -4924,7 +4951,7 @@ parse_exp(Node** np, OnigToken* tok, int term,
len = 1;
while (1) {
if (len >= ONIGENC_MBC_MINLEN(env->enc)) {
- if (len == enclen(env->enc, NSTR(*np)->s)) {
+ if (len == enclen(env->enc, NSTR(*np)->s)) {//should not enclen_end()
r = fetch_token(tok, src, end, env);
NSTRING_CLEAR_RAW(*np);
goto string_end;
@@ -5300,6 +5327,10 @@ onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end,
env->reg = reg;
*root = NULL;
+
+ if (! ONIGENC_IS_VALID_MBC_STRING(env->enc, pattern, end))
+ return ONIGERR_INVALID_WIDE_CHAR_VALUE;
+
p = (UChar* )pattern;
r = parse_regexp(root, &p, (UChar* )end, env);
reg->num_mem = env->num_mem;