diff options
Diffstat (limited to 'src/regcomp.c')
| -rw-r--r-- | src/regcomp.c | 310 | 
1 files changed, 250 insertions, 60 deletions
| diff --git a/src/regcomp.c b/src/regcomp.c index 4d5b78f..dd2b328 100644 --- a/src/regcomp.c +++ b/src/regcomp.c @@ -133,6 +133,7 @@ ops_init(regex_t* reg, int init_alloc_size)      size = sizeof(Operation) * init_alloc_size;      p = (Operation* )xrealloc(reg->ops, size);      CHECK_NULL_RETURN_MEMERR(p); +    reg->ops = p;  #ifdef USE_DIRECT_THREADED_CODE      {        enum OpCode* cp; @@ -144,13 +145,12 @@ ops_init(regex_t* reg, int init_alloc_size)  #endif    }    else { -    p  = (Operation* )0; +    reg->ops = (Operation* )0;  #ifdef USE_DIRECT_THREADED_CODE      reg->ocs = (enum OpCode* )0;  #endif    } -  reg->ops = p;    reg->ops_curr  = 0; /* !!! not yet done ops_new() */    reg->ops_alloc = init_alloc_size;    reg->ops_used  = 0; @@ -176,6 +176,7 @@ ops_expand(regex_t* reg, int n)    size = sizeof(Operation) * n;    p = (Operation* )xrealloc(reg->ops, size);    CHECK_NULL_RETURN_MEMERR(p); +  reg->ops = p;  #ifdef USE_DIRECT_THREADED_CODE    size = sizeof(enum OpCode) * n; @@ -184,7 +185,6 @@ ops_expand(regex_t* reg, int n)    reg->ocs = cp;  #endif -  reg->ops = p;    reg->ops_alloc = n;    if (reg->ops_used == 0)      reg->ops_curr = 0; @@ -265,10 +265,12 @@ ops_free(regex_t* reg)      case OP_BACKREF1: case OP_BACKREF2: case OP_BACKREF_N: case OP_BACKREF_N_IC:        break;      case OP_BACKREF_MULTI:      case OP_BACKREF_MULTI_IC: +    case OP_BACKREF_CHECK: +#ifdef USE_BACKREF_WITH_LEVEL      case OP_BACKREF_WITH_LEVEL:      case OP_BACKREF_WITH_LEVEL_IC: -    case OP_BACKREF_CHECK:      case OP_BACKREF_CHECK_WITH_LEVEL: +#endif        if (op->backref_general.num != 1)          xfree(op->backref_general.ns);        break; @@ -631,7 +633,7 @@ mmcl_add(MinMaxCharLen* to, MinMaxCharLen* add)    to->min = distance_add(to->min, add->min);    to->max = distance_add(to->max, add->max); -  to->min_is_sure = add->min_is_sure != 0 && to->min_is_sure != 0; +  to->min_is_sure = add->min_is_sure != FALSE && to->min_is_sure != FALSE;  }  static void @@ -656,8 +658,11 @@ static void  mmcl_alt_merge(MinMaxCharLen* to, MinMaxCharLen* alt)  {    if (to->min > alt->min) { -    to->min = alt->min; -    if (alt->min_is_sure != 0) +    to->min         = alt->min; +    to->min_is_sure = alt->min_is_sure; +  } +  else if (to->min == alt->min) { +    if (alt->min_is_sure != FALSE)        to->min_is_sure = TRUE;    } @@ -840,7 +845,7 @@ node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env,              en->min_char_len = ci->min;              en->max_char_len = ci->max;              NODE_STATUS_ADD(node, FIXED_CLEN); -            if (ci->min_is_sure != 0) +            if (ci->min_is_sure != FALSE)                NODE_STATUS_ADD(node, FIXED_CLEN_MIN_SURE);            }          } @@ -882,15 +887,15 @@ node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env,      }      break; -  case NODE_ANCHOR: +  case NODE_GIMMICK:      mmcl_set(ci, 0); -    /* can't optimize look-behind if anchor exists. */ -    ci->min_is_sure = FALSE;      break; -  case NODE_GIMMICK: +  case NODE_ANCHOR:    zero:      mmcl_set(ci, 0); +    /* can't optimize look-behind if anchor exists. */ +    ci->min_is_sure = FALSE;      break;    case NODE_BACKREF: @@ -1082,6 +1087,9 @@ compile_call(CallNode* node, regex_t* reg, ScanEnv* env)    if (r != 0) return r;    COP(reg)->call.addr = 0; /* dummy addr. */ +#ifdef ONIG_DEBUG_MATCH_COUNTER +  COP(reg)->call.called_mem = node->called_gnum; +#endif    offset = COP_CURR_OFFSET_BYTES(reg, call.addr);    r = unset_addr_list_add(env->unset_addr_list, offset, NODE_CALL_BODY(node)); @@ -1822,7 +1830,6 @@ compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env)    COP(reg)->memory_end.num = node->m.regnum;    if (NODE_IS_CALLED(node)) { -    if (r != 0) return r;      r = add_op(reg, OP_RETURN);    }  #else @@ -2764,7 +2771,7 @@ compile_tree(Node* node, regex_t* reg, ScanEnv* env)  static int  make_named_capture_number_map(Node** plink, GroupNumMap* map, int* counter)  { -  int r = 0; +  int r;    Node* node = *plink;    switch (NODE_TYPE(node)) { @@ -2772,17 +2779,17 @@ make_named_capture_number_map(Node** plink, GroupNumMap* map, int* counter)    case NODE_ALT:      do {        r = make_named_capture_number_map(&(NODE_CAR(node)), map, counter); -    } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node))); +    } while (r >= 0 && IS_NOT_NULL(node = NODE_CDR(node))); +    if (r < 0) return r;      break;    case NODE_QUANT:      {        Node** ptarget = &(NODE_BODY(node)); -      Node*  old = *ptarget;        r = make_named_capture_number_map(ptarget, map, counter); -      if (r != 0) return r; -      if (*ptarget != old && NODE_TYPE(*ptarget) == NODE_QUANT) { -        r = onig_reduce_nested_quantifier(node); +      if (r < 0) return r; +      if (r == 1 && NODE_TYPE(*ptarget) == NODE_QUANT) { +        return onig_reduce_nested_quantifier(node);        }      }      break; @@ -2796,41 +2803,48 @@ make_named_capture_number_map(Node** plink, GroupNumMap* map, int* counter)            map[en->m.regnum].new_val = *counter;            en->m.regnum = *counter;            r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter); +          if (r < 0) return r;          }          else {            *plink = NODE_BODY(node);            NODE_BODY(node) = NULL_NODE;            onig_node_free(node);            r = make_named_capture_number_map(plink, map, counter); +          if (r < 0) return r; +          return 1;          }        }        else if (en->type == BAG_IF_ELSE) {          r = make_named_capture_number_map(&(NODE_BAG_BODY(en)), map, counter); -        if (r != 0) return r; +        if (r < 0) return r;          if (IS_NOT_NULL(en->te.Then)) {            r = make_named_capture_number_map(&(en->te.Then), map, counter); -          if (r != 0) return r; +          if (r < 0) return r;          }          if (IS_NOT_NULL(en->te.Else)) {            r = make_named_capture_number_map(&(en->te.Else), map, counter); -          if (r != 0) return r; +          if (r < 0) return r;          }        } -      else +      else {          r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter); +        if (r < 0) return r; +      }      }      break;    case NODE_ANCHOR: -    if (IS_NOT_NULL(NODE_BODY(node))) +    if (IS_NOT_NULL(NODE_BODY(node))) {        r = make_named_capture_number_map(&(NODE_BODY(node)), map, counter); +      if (r < 0) return r; +    }      break;    default:      break;    } -  return r; +  return 0;  }  static int @@ -2982,7 +2996,7 @@ disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)    }    counter = 0;    r = make_named_capture_number_map(root, map, &counter); -  if (r != 0) return r; +  if (r < 0) return r;    r = renumber_backref_traverse(*root, map);    if (r != 0) return r; @@ -3546,7 +3560,9 @@ check_node_in_look_behind(Node* node, int not, int* used)        if (r != 0) break;        if (en->type == BAG_MEMORY) { -        if (NODE_IS_BACKREF(node) || NODE_IS_CALLED(node)) *used = TRUE; +        if (NODE_IS_BACKREF(node) || NODE_IS_CALLED(node) +         || NODE_IS_REFERENCED(node)) +          *used = TRUE;        }        else if (en->type == BAG_IF_ELSE) {          if (IS_NOT_NULL(en->te.Then)) { @@ -3978,6 +3994,7 @@ set_empty_repeat_node_trav(Node* node, Node* empty, ScanEnv* env)      {        BagNode* en = BAG_(node); +      r = 0;        if (en->type == BAG_MEMORY) {          if (NODE_IS_BACKREF(node)) {            if (IS_NOT_NULL(empty)) @@ -4484,7 +4501,7 @@ remove_from_list(Node* prev, Node* a)  }  static int -reduce_string_list(Node* node) +reduce_string_list(Node* node, OnigEncoding enc)  {    int r = 0; @@ -4515,43 +4532,70 @@ reduce_string_list(Node* node)            }          }          else { -          prev = NULL_NODE; +          if (IS_NOT_NULL(prev)) { +#ifdef USE_CHECK_VALIDITY_OF_STRING_IN_TREE +            StrNode* sn = STR_(prev); +            if (! ONIGENC_IS_VALID_MBC_STRING(enc, sn->s, sn->end)) +              return ONIGERR_INVALID_WIDE_CHAR_VALUE; +#endif +            prev = NULL_NODE; +          } +          r = reduce_string_list(curr, enc); +          if (r != 0) return r;            prev_node = node;          }          node = next_node;        } while (r == 0 && IS_NOT_NULL(node)); + +#ifdef USE_CHECK_VALIDITY_OF_STRING_IN_TREE +      if (IS_NOT_NULL(prev)) { +        StrNode* sn = STR_(prev); +        if (! ONIGENC_IS_VALID_MBC_STRING(enc, sn->s, sn->end)) +          return ONIGERR_INVALID_WIDE_CHAR_VALUE; +      } +#endif      }      break;    case NODE_ALT:      do { -      r = reduce_string_list(NODE_CAR(node)); +      r = reduce_string_list(NODE_CAR(node), enc);      } while (r == 0 && IS_NOT_NULL(node = NODE_CDR(node)));      break; +#ifdef USE_CHECK_VALIDITY_OF_STRING_IN_TREE +  case NODE_STRING: +    { +      StrNode* sn = STR_(node); +      if (! ONIGENC_IS_VALID_MBC_STRING(enc, sn->s, sn->end)) +        return ONIGERR_INVALID_WIDE_CHAR_VALUE; +    } +    break; +#endif +    case NODE_ANCHOR:      if (IS_NULL(NODE_BODY(node)))        break;      /* fall */    case NODE_QUANT: -    r = reduce_string_list(NODE_BODY(node)); +    r = reduce_string_list(NODE_BODY(node), enc);      break;    case NODE_BAG:      {        BagNode* en = BAG_(node); -      r = reduce_string_list(NODE_BODY(node)); +      r = reduce_string_list(NODE_BODY(node), enc);        if (r != 0) return r;        if (en->type == BAG_IF_ELSE) {          if (IS_NOT_NULL(en->te.Then)) { -          r = reduce_string_list(en->te.Then); +          r = reduce_string_list(en->te.Then, enc);            if (r != 0) return r;          }          if (IS_NOT_NULL(en->te.Else)) { -          r = reduce_string_list(en->te.Else); +          r = reduce_string_list(en->te.Else, enc);            if (r != 0) return r;          }        } @@ -4723,7 +4767,7 @@ tune_look_behind(Node* node, regex_t* reg, int state, ScanEnv* env)        return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;      } -    if (ci.min == 0 && ci.min_is_sure != 0 && used == FALSE) { +    if (ci.min == 0 && ci.min_is_sure != FALSE && used == FALSE) {        if (an->type == ANCR_LOOK_BEHIND_NOT)          r = onig_node_reset_fail(node);        else @@ -4779,18 +4823,23 @@ tune_look_behind(Node* node, regex_t* reg, int state, ScanEnv* env)  static int  tune_next(Node* node, Node* next_node, regex_t* reg)  { +  int called;    NodeType type; +  called = FALSE; +   retry:    type = NODE_TYPE(node);    if (type == NODE_QUANT) {      QuantNode* qn = QUANT_(node);      if (qn->greedy && IS_INFINITE_REPEAT(qn->upper)) {  #ifdef USE_QUANT_PEEK_NEXT -      Node* n = get_tree_head_literal(next_node, 1, reg); -      /* '\0': for UTF-16BE etc... */ -      if (IS_NOT_NULL(n) && STR_(n)->s[0] != '\0') { -        qn->next_head_exact = n; +      if (called == FALSE) { +        Node* n = get_tree_head_literal(next_node, 1, reg); +        /* '\0': for UTF-16BE etc... */ +        if (IS_NOT_NULL(n) && STR_(n)->s[0] != '\0') { +          qn->next_head_exact = n; +        }        }  #endif        /* automatic posseivation a*b ==> (?>a*)b */ @@ -4815,6 +4864,8 @@ tune_next(Node* node, Node* next_node, regex_t* reg)    else if (type == NODE_BAG) {      BagNode* en = BAG_(node);      if (en->type == BAG_MEMORY) { +      if (NODE_IS_CALLED(node)) +        called = TRUE;        node = NODE_BODY(node);        goto retry;      } @@ -4999,17 +5050,18 @@ unravel_cf_look_behind_add(Node** rlist, Node** rsn,  {    int r, i, found; -  found = 0; +  found = FALSE;    for (i = 0; i < n; i++) {      OnigCaseFoldCodeItem* item = items + i;      if (item->byte_len == one_len) {        if (item->code_len == 1) { -        found = 1; +        found = TRUE; +        break;        }      }    } -  if (found == 0) { +  if (found == FALSE) {      r = unravel_cf_string_add(rlist, rsn, s, s + one_len, 0 /* flag */);    }    else { @@ -5073,6 +5125,7 @@ unravel_case_fold_string(Node* node, regex_t* reg, int state)      one_len = (OnigLen )enclen(enc, p);      if (n == 0) {        q = p + one_len; +      if (q > end) q = end;        r = unravel_cf_string_add(&list, &sn, p, q, 0 /* flag */);        if (r != 0) goto err;      } @@ -5221,12 +5274,12 @@ quantifiers_memory_node_info(Node* node)  __inline  #endif  static int -tune_call_node_call(CallNode* cn, ScanEnv* env, int state) +check_call_reference(CallNode* cn, ScanEnv* env, int state)  {    MemEnv* mem_env = SCANENV_MEMENV(env);    if (cn->by_number != 0) { -    int gnum = cn->group_num; +    int gnum = cn->called_gnum;      if (env->num_named > 0 &&          IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && @@ -5241,12 +5294,14 @@ tune_call_node_call(CallNode* cn, ScanEnv* env, int state)      }    set_call_attr: -    NODE_CALL_BODY(cn) = mem_env[cn->group_num].mem_node; +    NODE_CALL_BODY(cn) = mem_env[cn->called_gnum].mem_node;      if (IS_NULL(NODE_CALL_BODY(cn))) {        onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE,                                       cn->name, cn->name_end);        return ONIGERR_UNDEFINED_NAME_REFERENCE;      } + +    NODE_STATUS_ADD(NODE_CALL_BODY(cn), REFERENCED);    }    else {      int *refs; @@ -5263,7 +5318,7 @@ tune_call_node_call(CallNode* cn, ScanEnv* env, int state)        return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;      }      else { -      cn->group_num = refs[0]; +      cn->called_gnum = refs[0];        goto set_call_attr;      }    } @@ -5396,7 +5451,7 @@ tune_call(Node* node, ScanEnv* env, int state)        CALL_(node)->entry_count--;      } -    r = tune_call_node_call(CALL_(node), env, state); +    r = check_call_reference(CALL_(node), env, state);      break;    default: @@ -6187,8 +6242,10 @@ concat_opt_exact(OptStr* to, OptStr* add, OnigEncoding enc)        r = 1; /* 1:full */        break;      } -    for (j = 0; j < len && p < end; j++) +    for (j = 0; j < len && p < end; j++) { +      /* coverity[overrun-local] */        to->s[i++] = *p++; +    }    }    to->len = i; @@ -6210,8 +6267,10 @@ concat_opt_exact_str(OptStr* to, UChar* s, UChar* end, OnigEncoding enc)    for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {      len = enclen(enc, p);      if (i + len > OPT_EXACT_MAXLEN) break; -    for (j = 0; j < len && p < end; j++) +    for (j = 0; j < len && p < end; j++) { +      /* coverity[overrun-local] */        to->s[i++] = *p++; +    }    }    to->len = i; @@ -7229,19 +7288,10 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,    else      reg->ops_used = 0; -  reg->string_pool        = 0; -  reg->string_pool_end    = 0; -  reg->num_mem            = 0; -  reg->num_repeat         = 0; -  reg->num_empty_check    = 0; -  reg->repeat_range_alloc = 0; -  reg->repeat_range       = (RepeatRange* )NULL; -  reg->empty_status_mem   = 0; -    r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env);    if (r != 0) goto err; -  r = reduce_string_list(root); +  r = reduce_string_list(root, reg->enc);    if (r != 0) goto err;    /* mixed use named group and no-named group */ @@ -7653,6 +7703,134 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)    return onig_is_code_in_cc_len(len, code, cc);  } +typedef struct { +  int prec_read; +  int look_behind; +  int backref_with_level; +  int call; +} SlowElementCount; + +static int +node_detect_can_be_slow(Node* node, SlowElementCount* ct) +{ +  int r; + +  r = 0; +  switch (NODE_TYPE(node)) { +  case NODE_LIST: +  case NODE_ALT: +    do { +      r = node_detect_can_be_slow(NODE_CAR(node), ct); +      if (r != 0) return r; +    } while (IS_NOT_NULL(node = NODE_CDR(node))); +    break; + +  case NODE_QUANT: +    r = node_detect_can_be_slow(NODE_BODY(node), ct); +    break; + +  case NODE_ANCHOR: +    switch (ANCHOR_(node)->type) { +    case ANCR_PREC_READ: +    case ANCR_PREC_READ_NOT: +      ct->prec_read++; +      break; +    case ANCR_LOOK_BEHIND: +    case ANCR_LOOK_BEHIND_NOT: +      ct->look_behind++; +      break; +    default: +      break; +    } + +    if (ANCHOR_HAS_BODY(ANCHOR_(node))) +      r = node_detect_can_be_slow(NODE_BODY(node), ct); +    break; + +  case NODE_BAG: +    { +      BagNode* en = BAG_(node); + +      r = node_detect_can_be_slow(NODE_BODY(node), ct); +      if (r != 0) return r; + +      if (en->type == BAG_IF_ELSE) { +        if (IS_NOT_NULL(en->te.Then)) { +          r = node_detect_can_be_slow(en->te.Then, ct); +          if (r != 0) return r; +        } +        if (IS_NOT_NULL(en->te.Else)) { +          r = node_detect_can_be_slow(en->te.Else, ct); +          if (r != 0) return r; +        } +      } +    } +    break; + +#ifdef USE_BACKREF_WITH_LEVEL +  case NODE_BACKREF: +    if (NODE_IS_NEST_LEVEL(node)) +      ct->backref_with_level++; +    break; +#endif + +#ifdef USE_CALL +  case NODE_CALL: +    ct->call++; +    break; +#endif + +  default: +    break; +  } + +  return r; +} + +extern int +onig_detect_can_be_slow_pattern(const UChar* pattern, +  const UChar* pattern_end, OnigOptionType option, OnigEncoding enc, +  OnigSyntaxType* syntax) +{ +  int r; +  regex_t* reg; +  Node* root; +  ScanEnv scan_env; +  SlowElementCount count; + +  reg = (regex_t* )xmalloc(sizeof(regex_t)); +  if (IS_NULL(reg)) return ONIGERR_MEMORY; + +  r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax); +  if (r != 0) { +    xfree(reg); +    return r; +  } + +  root = 0; +  r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env); +  if (r == 0) { +    count.prec_read          = 0; +    count.look_behind        = 0; +    count.backref_with_level = 0; +    count.call               = 0; + +    r = node_detect_can_be_slow(root, &count); +    if (r == 0) { +      int n = count.prec_read + count.look_behind +            + count.backref_with_level + count.call; +      r = n; +    } +  } + +  if (IS_NOT_NULL(scan_env.mem_env_dynamic)) +    xfree(scan_env.mem_env_dynamic); + +  onig_node_free(root); +  onig_free(reg); +  return r; +} +  #ifdef ONIG_DEBUG_PARSE @@ -7734,14 +7912,18 @@ print_indent_tree(FILE* f, Node* node, int indent)      break;    case NODE_CCLASS: +#define CCLASS_MBUF_MAX_OUTPUT_NUM   10 +      fprintf(f, "<cclass:%p>", node);      if (IS_NCCLASS_NOT(CCLASS_(node))) fputs(" not", f);      if (CCLASS_(node)->mbuf) {        BBuf* bbuf = CCLASS_(node)->mbuf; -      for (i = 0; i < bbuf->used; i++) { +      fprintf(f, " mbuf(%u) ", bbuf->used); +      for (i = 0; i < bbuf->used && i < CCLASS_MBUF_MAX_OUTPUT_NUM; i++) {          if (i > 0) fprintf(f, ",");          fprintf(f, "%0x", bbuf->p[i]);        } +      if (i < bbuf->used) fprintf(f, "...");      }      break; @@ -7822,6 +8004,11 @@ print_indent_tree(FILE* f, Node* node, int indent)          if (i > 0) fputs(", ", f);          fprintf(f, "%d", p[i]);        } +#ifdef USE_BACKREF_WITH_LEVEL +      if (NODE_IS_NEST_LEVEL(node)) { +        fprintf(f, ", level: %d", br->nest_level); +      } +#endif      }      break; @@ -7830,6 +8017,7 @@ print_indent_tree(FILE* f, Node* node, int indent)      {        CallNode* cn = CALL_(node);        fprintf(f, "<call:%p>", node); +      fprintf(f, " num: %d, name", cn->called_gnum);        p_string(f, cn->name_end - cn->name, cn->name);      }      break; @@ -7881,6 +8069,8 @@ print_indent_tree(FILE* f, Node* node, int indent)        fprintf(f, "memory:%d", BAG_(node)->m.regnum);        if (NODE_IS_CALLED(node))          fprintf(f, ", called"); +      else if (NODE_IS_REFERENCED(node)) +        fprintf(f, ", referenced");        if (NODE_IS_FIXED_ADDR(node))          fprintf(f, ", fixed-addr");        break; | 
