diff options
| author | Jörg Frings-Fürst <debian@jff.email> | 2019-08-07 09:32:54 +0200 | 
|---|---|---|
| committer | Jörg Frings-Fürst <debian@jff.email> | 2019-08-07 09:32:54 +0200 | 
| commit | 1fb4b2b100d76cfa362cd021760b7cc0038cf55d (patch) | |
| tree | 2443bfdda69965757d8ce335cda1a28bb7327834 /src | |
| parent | b134093d75235a90f09ff591137aed9dbdad6e89 (diff) | |
| parent | 40f3d0030e6e98bcb02d6523e5ee48497dec49a6 (diff) | |
Update upstream source from tag 'upstream/6.9.3'
Update to upstream version '6.9.3'
with Debian dir 0b54db06b48ebf22a6090f21e4dcc045a1085e11
Diffstat (limited to 'src')
| -rw-r--r-- | src/gb18030.c | 6 | ||||
| -rw-r--r-- | src/oniguruma.h | 11 | ||||
| -rw-r--r-- | src/regcomp.c | 156 | ||||
| -rw-r--r-- | src/regenc.c | 2 | ||||
| -rw-r--r-- | src/regerror.c | 17 | ||||
| -rw-r--r-- | src/regexec.c | 130 | ||||
| -rw-r--r-- | src/regext.c | 6 | ||||
| -rw-r--r-- | src/regint.h | 6 | ||||
| -rw-r--r-- | src/regparse.c | 190 | ||||
| -rw-r--r-- | src/regparse.h | 22 | ||||
| -rw-r--r-- | src/utf16_be.c | 35 | ||||
| -rw-r--r-- | src/utf16_le.c | 26 | 
12 files changed, 393 insertions, 214 deletions
| diff --git a/src/gb18030.c b/src/gb18030.c index 7654432..8d415b0 100644 --- a/src/gb18030.c +++ b/src/gb18030.c @@ -2,7 +2,7 @@    gb18030.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2005-2018  KUBO Takehiro <kubo AT jiubao DOT org> + * Copyright (c) 2005-2019  KUBO Takehiro <kubo AT jiubao DOT org>   *                          K.Kosako <sndgk393 AT ybb DOT ne DOT jp>   * All rights reserved.   * @@ -67,11 +67,11 @@ gb18030_mbc_enc_len(const UChar* p)  {    if (GB18030_MAP[*p] != CM)      return 1; +    p++;    if (GB18030_MAP[*p] == C4)      return 4; -  if (GB18030_MAP[*p] == C1) -    return 1; /* illegal sequence */ +    return 2;  } diff --git a/src/oniguruma.h b/src/oniguruma.h index f6aa5ba..90cf2d9 100644 --- a/src/oniguruma.h +++ b/src/oniguruma.h @@ -36,9 +36,9 @@ extern "C" {  #define ONIGURUMA  #define ONIGURUMA_VERSION_MAJOR   6  #define ONIGURUMA_VERSION_MINOR   9 -#define ONIGURUMA_VERSION_TEENY   2 +#define ONIGURUMA_VERSION_TEENY   3 -#define ONIGURUMA_VERSION_INT     60902 +#define ONIGURUMA_VERSION_INT     60903  #ifndef P_  #if defined(__STDC__) || defined(_WIN32) @@ -52,6 +52,7 @@ extern "C" {  # define PV_(args) args  #endif +#ifndef ONIG_STATIC  #ifndef ONIG_EXTERN  #if defined(_WIN32) && !defined(__GNUC__)  #if defined(ONIGURUMA_EXPORT) @@ -65,6 +66,9 @@ extern "C" {  #ifndef ONIG_EXTERN  #define ONIG_EXTERN   extern  #endif +#else +#define ONIG_EXTERN   extern +#endif  /* PART: character encoding */ @@ -517,6 +521,7 @@ ONIG_EXTERN OnigSyntaxType*   OnigDefaultSyntax;  #define ONIG_SYN_BACKSLASH_ESCAPE_IN_CC          (1U<<21) /* [..\w..] etc.. */  #define ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC         (1U<<22)  #define ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC     (1U<<23) /* [0-9-a]=[0-9\-a] */ +#define ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC (1U<<26)  /* syntax (behavior) warning */  #define ONIG_SYN_WARN_CC_OP_NOT_ESCAPED          (1U<<24) /* [,-,] */  #define ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT    (1U<<25) /* (?:a*)+ */ @@ -766,6 +771,8 @@ int onig_init P_((void));  ONIG_EXTERN  int onig_error_code_to_str PV_((OnigUChar* s, int err_code, ...));  ONIG_EXTERN +int onig_is_error_code_needs_param PV_((int code)); +ONIG_EXTERN  void onig_set_warn_func P_((OnigWarnFunc f));  ONIG_EXTERN  void onig_set_verb_warn_func P_((OnigWarnFunc f)); diff --git a/src/regcomp.c b/src/regcomp.c index c2c04a4..b96c793 100644 --- a/src/regcomp.c +++ b/src/regcomp.c @@ -599,12 +599,34 @@ select_str_opcode(int mb_len, int str_len, int ignore_case)  }  static int -compile_tree_empty_check(Node* node, regex_t* reg, int empty_info, ScanEnv* env) +is_strict_real_node(Node* node) +{ +  switch (NODE_TYPE(node)) { +  case NODE_STRING: +    { +      StrNode* sn = STR_(node); +      return (sn->end != sn->s); +    } +    break; + +  case NODE_CCLASS: +  case NODE_CTYPE: +    return 1; +    break; + +  default: +    return 0; +    break; +  } +} + +static int +compile_tree_empty_check(Node* node, regex_t* reg, int emptiness, ScanEnv* env)  {    int r;    int saved_num_null_check = reg->num_null_check; -  if (empty_info != BODY_IS_NOT_EMPTY) { +  if (emptiness != BODY_IS_NOT_EMPTY) {      r = add_op(reg, OP_EMPTY_CHECK_START);      if (r != 0) return r;      COP(reg)->empty_check_start.mem = reg->num_null_check; /* NULL CHECK ID */ @@ -614,12 +636,12 @@ compile_tree_empty_check(Node* node, regex_t* reg, int empty_info, ScanEnv* env)    r = compile_tree(node, reg, env);    if (r != 0) return r; -  if (empty_info != BODY_IS_NOT_EMPTY) { -    if (empty_info == BODY_IS_EMPTY) +  if (emptiness != BODY_IS_NOT_EMPTY) { +    if (emptiness == BODY_IS_EMPTY_POSSIBILITY)        r = add_op(reg, OP_EMPTY_CHECK_END); -    else if (empty_info == BODY_IS_EMPTY_MEM) +    else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_MEM)        r = add_op(reg, OP_EMPTY_CHECK_END_MEMST); -    else if (empty_info == BODY_IS_EMPTY_REC) +    else if (emptiness == BODY_IS_EMPTY_POSSIBILITY_REC)        r = add_op(reg, OP_EMPTY_CHECK_END_MEMST_PUSH);      if (r != 0) return r; @@ -895,12 +917,12 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper)    }    p[id].lower = lower; -  p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper); +  p[id].upper = (IS_INFINITE_REPEAT(upper) ? 0x7fffffff : upper);    return 0;  }  static int -compile_range_repeat_node(QuantNode* qn, int target_len, int empty_info, +compile_range_repeat_node(QuantNode* qn, int target_len, int emptiness,                            regex_t* reg, ScanEnv* env)  {    int r; @@ -915,7 +937,7 @@ compile_range_repeat_node(QuantNode* qn, int target_len, int empty_info,    r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper);    if (r != 0) return r; -  r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); +  r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env);    if (r != 0) return r;    if ( @@ -937,7 +959,7 @@ compile_range_repeat_node(QuantNode* qn, int target_len, int empty_info,  static int  is_anychar_infinite_greedy(QuantNode* qn)  { -  if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) && +  if (qn->greedy && IS_INFINITE_REPEAT(qn->upper) &&        NODE_IS_ANYCHAR(NODE_QUANT_BODY(qn)))      return 1;    else @@ -951,8 +973,8 @@ static int  compile_length_quantifier_node(QuantNode* qn, regex_t* reg)  {    int len, mod_tlen; -  int infinite = IS_REPEAT_INFINITE(qn->upper); -  enum BodyEmpty empty_info = qn->empty_info; +  int infinite = IS_INFINITE_REPEAT(qn->upper); +  enum BodyEmptyType emptiness = qn->emptiness;    int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);    if (tlen < 0) return tlen; @@ -969,10 +991,9 @@ compile_length_quantifier_node(QuantNode* qn, regex_t* reg)      }    } -  if (empty_info == BODY_IS_NOT_EMPTY) -    mod_tlen = tlen; -  else -    mod_tlen = tlen + (SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END); +  mod_tlen = tlen; +  if (emptiness != BODY_IS_NOT_EMPTY) +    mod_tlen += SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END;    if (infinite &&        (qn->lower <= 1 || @@ -1026,8 +1047,8 @@ static int  compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env)  {    int i, r, mod_tlen; -  int infinite = IS_REPEAT_INFINITE(qn->upper); -  enum BodyEmpty empty_info = qn->empty_info; +  int infinite = IS_INFINITE_REPEAT(qn->upper); +  enum BodyEmptyType emptiness = qn->emptiness;    int tlen = compile_length_tree(NODE_QUANT_BODY(qn), reg);    if (tlen < 0) return tlen; @@ -1055,10 +1076,9 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env)      }    } -  if (empty_info == BODY_IS_NOT_EMPTY) -    mod_tlen = tlen; -  else -    mod_tlen = tlen + (SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END); +  mod_tlen = tlen; +  if (emptiness != BODY_IS_NOT_EMPTY) +    mod_tlen += SIZE_OP_EMPTY_CHECK_START + SIZE_OP_EMPTY_CHECK_END;    if (infinite &&        (qn->lower <= 1 || @@ -1096,7 +1116,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env)          COP(reg)->push_or_jump_exact1.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP;          COP(reg)->push_or_jump_exact1.c    = STR_(qn->head_exact)->s[0]; -        r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); +        r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env);          if (r != 0) return r;          addr = -(mod_tlen + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1); @@ -1109,7 +1129,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env)          COP(reg)->push_if_peek_next.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP;          COP(reg)->push_if_peek_next.c    = STR_(qn->next_head_exact)->s[0]; -        r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); +        r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env);          if (r != 0) return r;          addr = -(mod_tlen + (int )SIZE_OP_PUSH_IF_PEEK_NEXT); @@ -1119,7 +1139,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env)          if (r != 0) return r;          COP(reg)->push.addr = SIZE_INC_OP + mod_tlen + SIZE_OP_JUMP; -        r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); +        r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env);          if (r != 0) return r;          addr = -(mod_tlen + (int )SIZE_OP_PUSH); @@ -1134,7 +1154,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env)        if (r != 0) return r;        COP(reg)->jump.addr = mod_tlen + SIZE_INC_OP; -      r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, empty_info, env); +      r = compile_tree_empty_check(NODE_QUANT_BODY(qn), reg, emptiness, env);        if (r != 0) return r;        r = add_op(reg, OP_PUSH); @@ -1188,7 +1208,7 @@ compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env)      r = compile_tree(NODE_QUANT_BODY(qn), reg, env);    }    else { -    r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg, env); +    r = compile_range_repeat_node(qn, mod_tlen, emptiness, reg, env);    }    return r;  } @@ -1273,7 +1293,7 @@ compile_length_bag_node(BagNode* node, regex_t* reg)      break;    case BAG_STOP_BACKTRACK: -    if (NODE_IS_STOP_BT_SIMPLE_REPEAT(node)) { +    if (NODE_IS_STRICT_REAL_REPEAT(node)) {        int v;        QuantNode* qn; @@ -1307,8 +1327,9 @@ compile_length_bag_node(BagNode* node, regex_t* reg)          len += tlen;        } +      len += SIZE_OP_JUMP + SIZE_OP_ATOMIC_END; +        if (IS_NOT_NULL(Else)) { -        len += SIZE_OP_JUMP;          tlen = compile_length_tree(Else, reg);          if (tlen < 0) return tlen;          len += tlen; @@ -1423,7 +1444,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env)      break;    case BAG_STOP_BACKTRACK: -    if (NODE_IS_STOP_BT_SIMPLE_REPEAT(node)) { +    if (NODE_IS_STRICT_REAL_REPEAT(node)) {        QuantNode* qn = QUANT_(NODE_BAG_BODY(node));        r = compile_tree_n_times(NODE_QUANT_BODY(qn), qn->lower, reg, env);        if (r != 0) return r; @@ -1455,7 +1476,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env)    case BAG_IF_ELSE:      { -      int cond_len, then_len, jump_len; +      int cond_len, then_len, else_len, jump_len;        Node* cond = NODE_BAG_BODY(node);        Node* Then = node->te.Then;        Node* Else = node->te.Else; @@ -1472,8 +1493,7 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env)        else          then_len = 0; -      jump_len = cond_len + then_len + SIZE_OP_ATOMIC_END; -      if (IS_NOT_NULL(Else)) jump_len += SIZE_OP_JUMP; +      jump_len = cond_len + then_len + SIZE_OP_ATOMIC_END + SIZE_OP_JUMP;        r = add_op(reg, OP_PUSH);        if (r != 0) return r; @@ -1490,11 +1510,20 @@ compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env)        }        if (IS_NOT_NULL(Else)) { -        int else_len = compile_length_tree(Else, reg); -        r = add_op(reg, OP_JUMP); -        if (r != 0) return r; -        COP(reg)->jump.addr = else_len + SIZE_INC_OP; +        else_len = compile_length_tree(Else, reg); +        if (else_len < 0) return else_len; +      } +      else +        else_len = 0; + +      r = add_op(reg, OP_JUMP); +      if (r != 0) return r; +      COP(reg)->jump.addr = SIZE_OP_ATOMIC_END + else_len + SIZE_INC_OP; +      r = add_op(reg, OP_ATOMIC_END); +      if (r != 0) return r; + +      if (IS_NOT_NULL(Else)) {          r = compile_tree(Else, reg, env);        }      } @@ -3035,7 +3064,7 @@ tree_max_len(Node* node, ScanEnv* env)        if (qn->upper != 0) {          len = tree_max_len(NODE_BODY(node), env);          if (len != 0) { -          if (! IS_REPEAT_INFINITE(qn->upper)) +          if (! IS_INFINITE_REPEAT(qn->upper))              len = distance_multiply(len, qn->upper);            else              len = INFINITE_LEN; @@ -3581,7 +3610,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg)    type = NODE_TYPE(node);    if (type == NODE_QUANT) {      QuantNode* qn = QUANT_(node); -    if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) { +    if (qn->greedy && IS_INFINITE_REPEAT(qn->upper)) {  #ifdef USE_QUANT_PEEK_NEXT        Node* n = get_head_value_node(next_node, 1, reg);        /* '\0': for UTF-16BE etc... */ @@ -3591,7 +3620,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg)  #endif        /* automatic posseivation a*b ==> (?>a*)b */        if (qn->lower <= 1) { -        if (NODE_IS_SIMPLE_TYPE(NODE_BODY(node))) { +        if (is_strict_real_node(NODE_BODY(node))) {            Node *x, *y;            x = get_head_value_node(NODE_BODY(node), 0, reg);            if (IS_NOT_NULL(x)) { @@ -3599,7 +3628,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg)              if (IS_NOT_NULL(y) && is_exclusive(x, y, reg)) {                Node* en = onig_node_new_bag(BAG_STOP_BACKTRACK);                CHECK_NULL_RETURN_MEMERR(en); -              NODE_STATUS_ADD(en, STOP_BT_SIMPLE_REPEAT); +              NODE_STATUS_ADD(en, STRICT_REAL_REPEAT);                swap_node(node, en);                NODE_BODY(node) = en;              } @@ -4001,11 +4030,11 @@ expand_case_fold_string(Node* node, regex_t* reg, int state)    return r;  } -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT -static enum BodyEmpty +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT +static enum BodyEmptyType  quantifiers_memory_node_info(Node* node)  { -  int r = BODY_IS_EMPTY; +  int r = BODY_IS_EMPTY_POSSIBILITY;    switch (NODE_TYPE(node)) {    case NODE_LIST: @@ -4022,7 +4051,7 @@ quantifiers_memory_node_info(Node* node)  #ifdef USE_CALL    case NODE_CALL:      if (NODE_IS_RECURSION(node)) { -      return BODY_IS_EMPTY_REC; /* tiny version */ +      return BODY_IS_EMPTY_POSSIBILITY_REC; /* tiny version */      }      else        r = quantifiers_memory_node_info(NODE_BODY(node)); @@ -4044,9 +4073,9 @@ quantifiers_memory_node_info(Node* node)        switch (en->type) {        case BAG_MEMORY:          if (NODE_IS_RECURSION(node)) { -          return BODY_IS_EMPTY_REC; +          return BODY_IS_EMPTY_POSSIBILITY_REC;          } -        return BODY_IS_EMPTY_MEM; +        return BODY_IS_EMPTY_POSSIBILITY_MEM;          break;        case BAG_OPTION: @@ -4083,7 +4112,7 @@ quantifiers_memory_node_info(Node* node)    return r;  } -#endif /* USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT */ +#endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */  #ifdef USE_CALL @@ -4351,7 +4380,7 @@ setup_called_state_call(Node* node, int state)      {        QuantNode* qn = QUANT_(node); -      if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 2) +      if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)          state |= IN_REAL_REPEAT;        if (qn->lower != qn->upper)          state |= IN_VAR_REPEAT; @@ -4468,7 +4497,7 @@ setup_called_state(Node* node, int state)      {        QuantNode* qn = QUANT_(node); -      if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 2) +      if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)          state |= IN_REAL_REPEAT;        if (qn->lower != qn->upper)          state |= IN_VAR_REPEAT; @@ -4600,24 +4629,24 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env)      NODE_STATUS_ADD(node, IN_MULTI_ENTRY);    } -  if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) { +  if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 1) {      d = tree_min_len(body, env);      if (d == 0) { -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT -      qn->empty_info = quantifiers_memory_node_info(body); -      if (qn->empty_info == BODY_IS_EMPTY_REC) { +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT +      qn->emptiness = quantifiers_memory_node_info(body); +      if (qn->emptiness == BODY_IS_EMPTY_POSSIBILITY_REC) {          if (NODE_TYPE(body) == NODE_BAG &&              BAG_(body)->type == BAG_MEMORY) {            MEM_STATUS_ON(env->bt_mem_end, BAG_(body)->m.regnum);          }        }  #else -      qn->empty_info = BODY_IS_EMPTY; +      qn->emptiness = BODY_IS_EMPTY_POSSIBILITY;  #endif      }    } -  if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 2) +  if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 2)      state |= IN_REAL_REPEAT;    if (qn->lower != qn->upper)      state |= IN_VAR_REPEAT; @@ -4628,7 +4657,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env)    /* expand string */  #define EXPAND_STRING_MAX_LENGTH  100    if (NODE_TYPE(body) == NODE_STRING) { -    if (!IS_REPEAT_INFINITE(qn->lower) && qn->lower == qn->upper && +    if (!IS_INFINITE_REPEAT(qn->lower) && qn->lower == qn->upper &&          qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) {        int len = NODE_STRING_LEN(body);        StrNode* sn = STR_(body); @@ -4646,7 +4675,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env)      }    } -  if (qn->greedy && (qn->empty_info == BODY_IS_NOT_EMPTY)) { +  if (qn->greedy && (qn->emptiness == BODY_IS_NOT_EMPTY)) {      if (NODE_TYPE(body) == NODE_QUANT) {        QuantNode* tqn = QUANT_(body);        if (IS_NOT_NULL(tqn->head_exact)) { @@ -4663,7 +4692,7 @@ setup_quant(Node* node, regex_t* reg, int state, ScanEnv* env)  }  /* setup_tree does the following work. - 1. check empty loop. (set qn->empty_info) + 1. check empty loop. (set qn->emptiness)   2. expand ignore-case in char class.   3. set memory status bit flags. (reg->mem_stats)   4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact]. @@ -4752,10 +4781,10 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)            r = setup_tree(target, reg, state, env);            if (NODE_TYPE(target) == NODE_QUANT) {              QuantNode* tqn = QUANT_(target); -            if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 && +            if (IS_INFINITE_REPEAT(tqn->upper) && tqn->lower <= 1 &&                  tqn->greedy != 0) {  /* (?>a*), a*+ etc... */ -              if (NODE_IS_SIMPLE_TYPE(NODE_BODY(target))) -                NODE_STATUS_ADD(node, STOP_BT_SIMPLE_REPEAT); +              if (is_strict_real_node(NODE_BODY(target))) +                NODE_STATUS_ADD(node, STRICT_REAL_REPEAT);              }            }          } @@ -5752,7 +5781,7 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env)            opt->sm.reach_end = 0;        } -      if (IS_REPEAT_INFINITE(qn->upper)) { +      if (IS_INFINITE_REPEAT(qn->upper)) {          if (env->mmd.max == 0 &&              NODE_IS_ANYCHAR(NODE_BODY(node)) && qn->greedy != 0) {            if (IS_MULTILINE(CTYPE_OPTION(NODE_QUANT_BODY(qn), env))) @@ -6672,6 +6701,7 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)    }    else {      len = ONIGENC_CODE_TO_MBCLEN(enc, code); +    if (len < 0) return 0;    }    return onig_is_code_in_cc_len(len, code, cc);  } diff --git a/src/regenc.c b/src/regenc.c index 6376565..9fab721 100644 --- a/src/regenc.c +++ b/src/regenc.c @@ -853,6 +853,8 @@ onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigCaseFoldType flag,  extern int  onigenc_mb2_code_to_mbclen(OnigCodePoint code)  { +  if ((code & (~0xffff)) != 0) return ONIGERR_INVALID_CODE_POINT_VALUE; +    if ((code & 0xff00) != 0) return 2;    else return 1;  } diff --git a/src/regerror.c b/src/regerror.c index 7564827..e6d1806 100644 --- a/src/regerror.c +++ b/src/regerror.c @@ -257,6 +257,23 @@ static int to_ascii(OnigEncoding enc, UChar *s, UChar *end,  } +extern int +onig_is_error_code_needs_param(int code) +{ +  switch (code) { +  case ONIGERR_UNDEFINED_NAME_REFERENCE: +  case ONIGERR_UNDEFINED_GROUP_REFERENCE: +  case ONIGERR_MULTIPLEX_DEFINED_NAME: +  case ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL: +  case ONIGERR_INVALID_GROUP_NAME: +  case ONIGERR_INVALID_CHAR_IN_GROUP_NAME: +  case ONIGERR_INVALID_CHAR_PROPERTY_NAME: +    return 1; +  default: +    return 0; +  } +} +  /* for ONIG_MAX_ERROR_MESSAGE_LEN */  #define MAX_ERROR_PAR_LEN   30 diff --git a/src/regexec.c b/src/regexec.c index 6618996..f957b75 100644 --- a/src/regexec.c +++ b/src/regexec.c @@ -980,6 +980,8 @@ onig_region_copy(OnigRegion* to, OnigRegion* from)  #define STK_CALL_FRAME             0x0400  #define STK_RETURN                 0x0500  #define STK_SAVE_VAL               0x0600 +#define STK_PREC_READ_START        0x0700 +#define STK_PREC_READ_END          0x0800  /* stack type check mask */  #define STK_MASK_POP_USED          STK_ALT_FLAG @@ -1544,8 +1546,8 @@ stack_double(int is_alloca, char** arg_alloc_base,  #define STACK_PUSH_ALT(pat,s,sprev)       STACK_PUSH(STK_ALT,pat,s,sprev)  #define STACK_PUSH_SUPER_ALT(pat,s,sprev) STACK_PUSH(STK_SUPER_ALT,pat,s,sprev) -#define STACK_PUSH_POS(s,sprev) \ -  STACK_PUSH(STK_TO_VOID_START,(Operation* )0,s,sprev) +#define STACK_PUSH_PREC_READ_START(s,sprev) \ +  STACK_PUSH(STK_PREC_READ_START,(Operation* )0,s,sprev)  #define STACK_PUSH_ALT_PREC_READ_NOT(pat,s,sprev) \    STACK_PUSH(STK_ALT_PREC_READ_NOT,pat,s,sprev)  #define STACK_PUSH_TO_VOID_START        STACK_PUSH_TYPE(STK_TO_VOID_START) @@ -1887,6 +1889,27 @@ stack_double(int is_alloca, char** arg_alloc_base,    }\  } while(0) +#define STACK_GET_PREC_READ_START(k) do {\ +  int level = 0;\ +  k = stk;\ +  while (1) {\ +    k--;\ +    STACK_BASE_CHECK(k, "STACK_GET_PREC_READ_START");\ +    if (IS_TO_VOID_TARGET(k)) {\ +      k->type = STK_VOID;\ +    }\ +    else if (k->type == STK_PREC_READ_START) {\ +      if (level == 0) {\ +        break;\ +      }\ +      level--;\ +    }\ +    else if (k->type == STK_PREC_READ_END) {\ +      level++;\ +    }\ +  }\ +} while(0) +  #define STACK_EMPTY_CHECK(isnull,sid,s) do {\    StackType* k = stk;\    while (1) {\ @@ -1913,7 +1936,7 @@ stack_double(int is_alloca, char** arg_alloc_base,    }\  } while (0) -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT  #define STACK_EMPTY_CHECK_MEM(isnull,sid,s,reg) do {\    StackType* k = stk;\    while (1) {\ @@ -1927,9 +1950,10 @@ stack_double(int is_alloca, char** arg_alloc_base,          }\          else {\            UChar* endp;\ +          int level = 0;\            (isnull) = 1;\            while (k < stk) {\ -            if (k->type == STK_MEM_START) {\ +            if (k->type == STK_MEM_START && level == 0) {\                STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\                if (endp == 0) {\                  (isnull) = 0; break;\ @@ -1941,6 +1965,12 @@ stack_double(int is_alloca, char** arg_alloc_base,                  (isnull) = -1; /* empty, but position changed */ \                }\              }\ +            else if (k->type == STK_PREC_READ_START) {\ +              level++;\ +            }\ +            else if (k->type == STK_PREC_READ_END) {\ +              level--;\ +            }\              k++;\            }\            break;\ @@ -1965,10 +1995,11 @@ stack_double(int is_alloca, char** arg_alloc_base,            }\            else {\              UChar* endp;\ +            int prec_level = 0;\              (isnull) = 1;\              while (k < stk) {\                if (k->type == STK_MEM_START) {\ -                if (level == 0) {\ +                if (level == 0 && prec_level == 0) {\                    STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\                    if (endp == 0) {\                      (isnull) = 0; break;\ @@ -1987,6 +2018,12 @@ stack_double(int is_alloca, char** arg_alloc_base,                else if (k->type == STK_EMPTY_CHECK_END) {\                  if (k->zid == (sid)) level--;\                }\ +              else if (k->type == STK_PREC_READ_START) {\ +                prec_level++;\ +              }\ +              else if (k->type == STK_PREC_READ_END) {\ +                prec_level--;\ +              }\                k++;\              }\              break;\ @@ -2023,7 +2060,7 @@ stack_double(int is_alloca, char** arg_alloc_base,      }\    }\  } while(0) -#endif /* USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT */ +#endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */  #define STACK_GET_REPEAT(sid, k) do {\    int level = 0;\ @@ -2968,6 +3005,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,        NEXT_OUT;      CASE_OP(CCLASS_MB) +      DATA_ENSURE(1);        if (! ONIGENC_IS_MBC_HEAD(encode, s)) goto fail;      cclass_mb: @@ -3441,11 +3479,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,                  ? STACK_AT(mem_end_stk[mem])->u.mem.pstr                  : (UChar* )((void* )mem_end_stk[mem]));          n = (int )(pend - pstart); -        DATA_ENSURE(n); -        sprev = s; -        STRING_CMP(pstart, s, n); -        while (sprev + (len = enclen(encode, sprev)) < s) -          sprev += len; +        if (n != 0) { +          DATA_ENSURE(n); +          sprev = s; +          STRING_CMP(s, pstart, n); +          while (sprev + (len = enclen(encode, sprev)) < s) +            sprev += len; +        }        }        INC_OP;        JUMP_OUT; @@ -3468,11 +3508,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,                  ? STACK_AT(mem_end_stk[mem])->u.mem.pstr                  : (UChar* )((void* )mem_end_stk[mem]));          n = (int )(pend - pstart); -        DATA_ENSURE(n); -        sprev = s; -        STRING_CMP_IC(case_fold_flag, pstart, &s, n); -        while (sprev + (len = enclen(encode, sprev)) < s) -          sprev += len; +        if (n != 0) { +          DATA_ENSURE(n); +          sprev = s; +          STRING_CMP_IC(case_fold_flag, pstart, &s, n); +          while (sprev + (len = enclen(encode, sprev)) < s) +            sprev += len; +        }        }        INC_OP;        JUMP_OUT; @@ -3498,15 +3540,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,                    ? STACK_AT(mem_end_stk[mem])->u.mem.pstr                    : (UChar* )((void* )mem_end_stk[mem]));            n = (int )(pend - pstart); -          DATA_ENSURE(n); -          sprev = s; -          swork = s; -          STRING_CMP_VALUE(pstart, swork, n, is_fail); -          if (is_fail) continue; -          s = swork; -          while (sprev + (len = enclen(encode, sprev)) < s) -            sprev += len; - +          if (n != 0) { +            DATA_ENSURE(n); +            sprev = s; +            swork = s; +            STRING_CMP_VALUE(swork, pstart, n, is_fail); +            if (is_fail) continue; +            s = swork; +            while (sprev + (len = enclen(encode, sprev)) < s) +              sprev += len; +          }            break; /* success */          }          if (i == tlen) goto fail; @@ -3535,15 +3578,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,                    ? STACK_AT(mem_end_stk[mem])->u.mem.pstr                    : (UChar* )((void* )mem_end_stk[mem]));            n = (int )(pend - pstart); -          DATA_ENSURE(n); -          sprev = s; -          swork = s; -          STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail); -          if (is_fail) continue; -          s = swork; -          while (sprev + (len = enclen(encode, sprev)) < s) -            sprev += len; - +          if (n != 0) { +            DATA_ENSURE(n); +            sprev = s; +            swork = s; +            STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail); +            if (is_fail) continue; +            s = swork; +            while (sprev + (len = enclen(encode, sprev)) < s) +              sprev += len; +          }            break; /* success */          }          if (i == tlen) goto fail; @@ -3560,6 +3604,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,          int len;          int level;          MemNumType* mems; +        UChar* ssave;          n = 0;        backref_with_level: @@ -3567,10 +3612,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,          tlen  = p->backref_general.num;          mems = tlen == 1 ? &(p->backref_general.n1) : p->backref_general.ns; -        sprev = s; +        ssave = s;          if (backref_match_at_nested_level(reg, stk, stk_base, n,                      case_fold_flag, level, (int )tlen, mems, &s, end)) { -          if (sprev < end) { +          if (ssave != s) { +            sprev = ssave;              while (sprev + (len = enclen(encode, sprev)) < s)                sprev += len;            } @@ -3658,7 +3704,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,        }        JUMP_OUT; -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT      CASE_OP(EMPTY_CHECK_END_MEMST)        {          int is_empty; @@ -3683,7 +3729,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,          int is_empty;          mem = p->empty_check_end.mem;  /* mem: null check id */ -#ifdef USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT          STACK_EMPTY_CHECK_MEM_REC(is_empty, mem, s, reg);  #else          STACK_EMPTY_CHECK_REC(is_empty, mem, s); @@ -3851,14 +3897,15 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,        goto repeat_inc_ng;      CASE_OP(PREC_READ_START) -      STACK_PUSH_POS(s, sprev); +      STACK_PUSH_PREC_READ_START(s, sprev);        INC_OP;        JUMP_OUT;      CASE_OP(PREC_READ_END) -      STACK_EXEC_TO_VOID(stkp); +      STACK_GET_PREC_READ_START(stkp);        s     = stkp->u.state.pstr;        sprev = stkp->u.state.pstr_prev; +      STACK_PUSH(STK_PREC_READ_END,0,0,0);        INC_OP;        JUMP_OUT; @@ -5443,6 +5490,9 @@ onig_builtin_error(OnigCalloutArgs* args, void* user_data ARG_UNUSED)    if (n >= 0) {      n = ONIGERR_INVALID_CALLOUT_BODY;    } +  else if (onig_is_error_code_needs_param(n)) { +    n = ONIGERR_INVALID_CALLOUT_BODY; +  }    return n;  } diff --git a/src/regext.c b/src/regext.c index fa4b360..965c793 100644 --- a/src/regext.c +++ b/src/regext.c @@ -29,6 +29,7 @@  #include "regint.h" +#if 0  static void  conv_ext0be32(const UChar* s, const UChar* end, UChar* conv)  { @@ -158,6 +159,7 @@ conv_encoding(OnigEncoding from, OnigEncoding to, const UChar* s, const UChar* e    return ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION;  } +#endif  extern int  onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end, @@ -169,9 +171,7 @@ onig_new_deluxe(regex_t** reg, const UChar* pattern, const UChar* pattern_end,    if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL;    if (ci->pattern_enc != ci->target_enc) { -    r = conv_encoding(ci->pattern_enc, ci->target_enc, pattern, pattern_end, -                      &cpat, &cpat_end); -    if (r != 0) return r; +    return ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION;    }    else {      cpat     = (UChar* )pattern; diff --git a/src/regint.h b/src/regint.h index 56767e8..38389a1 100644 --- a/src/regint.h +++ b/src/regint.h @@ -63,7 +63,7 @@  #define USE_CALL  #define USE_CALLOUT  #define USE_BACKREF_WITH_LEVEL        /* \k<name+n>, \k<name-n> */ -#define USE_INSISTENT_CHECK_CAPTURES_IN_EMPTY_REPEAT    /* /(?:()|())*\2/ */ +#define USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT     /* /(?:()|())*\2/ */  #define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE     /* /\n$/ =~ "\n" */  #define USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR  #define USE_RETRY_LIMIT_IN_MATCH @@ -348,8 +348,8 @@ typedef unsigned int  MemStatusType;  #define DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag) \    ((case_fold_flag) & ~INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) -#define REPEAT_INFINITE         -1 -#define IS_REPEAT_INFINITE(n)   ((n) == REPEAT_INFINITE) +#define INFINITE_REPEAT         -1 +#define IS_INFINITE_REPEAT(n)   ((n) == INFINITE_REPEAT)  /* bitset */  #define BITS_PER_BYTE      8 diff --git a/src/regparse.c b/src/regparse.c index f1deea3..7f8b1a9 100644 --- a/src/regparse.c +++ b/src/regparse.c @@ -77,6 +77,7 @@ OnigSyntaxType OnigSyntaxOniguruma = {        ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |        ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |        ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY | +      ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC |        ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |        ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )    , ONIG_OPTION_NONE @@ -1093,6 +1094,35 @@ onig_name_to_group_numbers(regex_t* reg, const UChar* name,    return e->back_num;  } +static int +name_to_group_numbers(ScanEnv* env, const UChar* name, const UChar* name_end, +                      int** nums) +{ +  regex_t* reg; +  NameEntry* e; + +  reg = env->reg; +  e = name_find(reg, name, name_end); + +  if (IS_NULL(e)) { +    onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE, +                                   (UChar* )name, (UChar* )name_end); +    return ONIGERR_UNDEFINED_NAME_REFERENCE; +  } + +  switch (e->back_num) { +  case 0: +    break; +  case 1: +    *nums = &(e->back_ref1); +    break; +  default: +    *nums = e->back_refs; +    break; +  } +  return e->back_num; +} +  extern int  onig_name_to_backref_number(regex_t* reg, const UChar* name,                              const UChar* name_end, OnigRegion *region) @@ -1869,8 +1899,8 @@ callout_tag_table_new(CalloutTagTable** rt)  }  static int -callout_tag_entry_raw(CalloutTagTable* t, UChar* name, UChar* name_end, -                      CalloutTagVal entry_val) +callout_tag_entry_raw(ScanEnv* env, CalloutTagTable* t, UChar* name, +                      UChar* name_end, CalloutTagVal entry_val)  {    int r;    CalloutTagVal val; @@ -1879,8 +1909,11 @@ callout_tag_entry_raw(CalloutTagTable* t, UChar* name, UChar* name_end,      return ONIGERR_INVALID_CALLOUT_TAG_NAME;    val = callout_tag_find(t, name, name_end); -  if (val >= 0) +  if (val >= 0) { +    onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME, +                                   name, name_end);      return ONIGERR_MULTIPLEX_DEFINED_NAME; +  }    r = onig_st_insert_strend(t, name, name_end, (HashDataType )entry_val);    if (r < 0) return r; @@ -1909,7 +1942,7 @@ ext_ensure_tag_table(regex_t* reg)  }  static int -callout_tag_entry(regex_t* reg, UChar* name, UChar* name_end, +callout_tag_entry(ScanEnv* env, regex_t* reg, UChar* name, UChar* name_end,                    CalloutTagVal entry_val)  {    int r; @@ -1921,7 +1954,7 @@ callout_tag_entry(regex_t* reg, UChar* name, UChar* name_end,    ext = onig_get_regex_ext(reg);    CHECK_NULL_RETURN_MEMERR(ext); -  r = callout_tag_entry_raw(ext->tag_table, name, name_end, entry_val); +  r = callout_tag_entry_raw(env, ext->tag_table, name, name_end, entry_val);    e = onig_reg_callout_list_at(reg, (int )entry_val);    CHECK_NULL_RETURN_MEMERR(e); @@ -2391,10 +2424,10 @@ node_new_quantifier(int lower, int upper, int by_number)    CHECK_NULL_RETURN(node);    NODE_SET_TYPE(node, NODE_QUANT); -  QUANT_(node)->lower      = lower; -  QUANT_(node)->upper      = upper; -  QUANT_(node)->greedy     = 1; -  QUANT_(node)->empty_info = BODY_IS_NOT_EMPTY; +  QUANT_(node)->lower           = lower; +  QUANT_(node)->upper           = upper; +  QUANT_(node)->greedy          = 1; +  QUANT_(node)->emptiness       = BODY_IS_NOT_EMPTY;    QUANT_(node)->head_exact      = NULL_NODE;    QUANT_(node)->next_head_exact = NULL_NODE;    QUANT_(node)->is_refered      = 0; @@ -2694,7 +2727,7 @@ make_text_segment(Node** node, ScanEnv* env)    ns[0] = x;    ns[1] = NULL_NODE; -  x = node_new_quantifier(0, REPEAT_INFINITE, 1); +  x = node_new_quantifier(0, INFINITE_REPEAT, 1);    if (IS_NULL(x)) goto err;    NODE_BODY(x) = ns[0]; @@ -3044,7 +3077,7 @@ make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter,      if (expr == NULL_NODE) {        /* default expr \O* */ -      quant = node_new_quantifier(0, REPEAT_INFINITE, 0); +      quant = node_new_quantifier(0, INFINITE_REPEAT, 0);        if (IS_NULL(quant)) goto err0;        r = node_new_true_anychar(&body, env); @@ -3086,7 +3119,7 @@ make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter,    if (r != 0) goto err;    possessive = 1; -  r = make_absent_engine(&ns[2], id1, absent, ns[3], 0, REPEAT_INFINITE, +  r = make_absent_engine(&ns[2], id1, absent, ns[3], 0, INFINITE_REPEAT,                           possessive, is_range_cutter, env);    if (r != 0) goto err; @@ -3236,10 +3269,18 @@ node_new_empty(void)  static Node*  node_new_str_raw_char(UChar c)  { +  int i;    UChar p[1]; +  Node* node;    p[0] = c; -  return node_new_str_raw(p, p + 1); +  node = node_new_str_raw(p, p + 1); + +  /* clear buf tail */ +  for (i = 1; i < NODE_STRING_BUF_SIZE; i++) +    STR_(node)->buf[i] = '\0'; + +  return node;  }  static Node* @@ -3275,24 +3316,6 @@ str_node_can_be_split(Node* node, OnigEncoding enc)    return 0;  } -#ifdef USE_PAD_TO_SHORT_BYTE_CHAR -static int -node_str_head_pad(StrNode* sn, int num, UChar val) -{ -  UChar buf[NODE_STRING_BUF_SIZE]; -  int i, len; - -  len = sn->end - sn->s; -  onig_strcpy(buf, sn->s, sn->end); -  onig_strcpy(&(sn->s[num]), buf, buf + len); -  sn->end += num; - -  for (i = 0; i < num; i++) { -    sn->s[i] = val; -  } -} -#endif -  extern int  onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc)  { @@ -3877,19 +3900,19 @@ quantifier_type_num(QuantNode* q)    if (q->greedy) {      if (q->lower == 0) {        if (q->upper == 1) return 0; -      else if (IS_REPEAT_INFINITE(q->upper)) return 1; +      else if (IS_INFINITE_REPEAT(q->upper)) return 1;      }      else if (q->lower == 1) { -      if (IS_REPEAT_INFINITE(q->upper)) return 2; +      if (IS_INFINITE_REPEAT(q->upper)) return 2;      }    }    else {      if (q->lower == 0) {        if (q->upper == 1) return 3; -      else if (IS_REPEAT_INFINITE(q->upper)) return 4; +      else if (IS_INFINITE_REPEAT(q->upper)) return 4;      }      else if (q->lower == 1) { -      if (IS_REPEAT_INFINITE(q->upper)) return 5; +      if (IS_INFINITE_REPEAT(q->upper)) return 5;      }    }    return -1; @@ -3926,8 +3949,8 @@ onig_reduce_nested_quantifier(Node* pnode, Node* cnode)    pnum = quantifier_type_num(p);    cnum = quantifier_type_num(c);    if (pnum < 0 || cnum < 0) { -    if ((p->lower == p->upper) && ! IS_REPEAT_INFINITE(p->upper)) { -      if ((c->lower == c->upper) && ! IS_REPEAT_INFINITE(c->upper)) { +    if ((p->lower == p->upper) && ! IS_INFINITE_REPEAT(p->upper)) { +      if ((c->lower == c->upper) && ! IS_INFINITE_REPEAT(c->upper)) {          int n = onig_positive_int_multiply(p->lower, c->lower);          if (n >= 0) {            p->lower = p->upper = n; @@ -3946,11 +3969,11 @@ onig_reduce_nested_quantifier(Node* pnode, Node* cnode)      break;    case RQ_A:      NODE_BODY(pnode) = NODE_BODY(cnode); -    p->lower  = 0;  p->upper = REPEAT_INFINITE;  p->greedy = 1; +    p->lower  = 0;  p->upper = INFINITE_REPEAT;  p->greedy = 1;      break;    case RQ_AQ:      NODE_BODY(pnode) = NODE_BODY(cnode); -    p->lower  = 0;  p->upper = REPEAT_INFINITE;  p->greedy = 0; +    p->lower  = 0;  p->upper = INFINITE_REPEAT;  p->greedy = 0;      break;    case RQ_QQ:      NODE_BODY(pnode) = NODE_BODY(cnode); @@ -3959,13 +3982,13 @@ onig_reduce_nested_quantifier(Node* pnode, Node* cnode)    case RQ_P_QQ:      NODE_BODY(pnode) = cnode;      p->lower  = 0;  p->upper = 1;  p->greedy = 0; -    c->lower  = 1;  c->upper = REPEAT_INFINITE;  c->greedy = 1; +    c->lower  = 1;  c->upper = INFINITE_REPEAT;  c->greedy = 1;      return ;      break;    case RQ_PQ_Q:      NODE_BODY(pnode) = cnode;      p->lower  = 0;  p->upper = 1;  p->greedy = 1; -    c->lower  = 1;  c->upper = REPEAT_INFINITE;  c->greedy = 0; +    c->lower  = 1;  c->upper = INFINITE_REPEAT;  c->greedy = 0;      return ;      break;    case RQ_ASIS: @@ -4158,7 +4181,7 @@ fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env)      if (p == prev) {        if (non_low != 0)          goto invalid; -      up = REPEAT_INFINITE;  /* {n,} : {n,infinite} */ +      up = INFINITE_REPEAT;  /* {n,} : {n,infinite} */      }    }    else { @@ -4178,7 +4201,7 @@ fetch_interval_quantifier(UChar** src, UChar* end, PToken* tok, ScanEnv* env)    }    if (c != '}') goto invalid; -  if (!IS_REPEAT_INFINITE(up) && low > up) { +  if (!IS_INFINITE_REPEAT(up) && low > up) {      /* {n,m}+ supported case */      if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL))        return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE; @@ -4959,7 +4982,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)        if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break;        tok->type = TK_REPEAT;        tok->u.repeat.lower = 0; -      tok->u.repeat.upper = REPEAT_INFINITE; +      tok->u.repeat.upper = INFINITE_REPEAT;        goto greedy_check;        break; @@ -4967,7 +4990,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)        if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break;        tok->type = TK_REPEAT;        tok->u.repeat.lower = 1; -      tok->u.repeat.upper = REPEAT_INFINITE; +      tok->u.repeat.upper = INFINITE_REPEAT;        goto greedy_check;        break; @@ -5358,10 +5381,8 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)              tok->u.backref.ref1 = back_num;            }            else { -            num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs); +            num = name_to_group_numbers(env, prev, name_end, &backs);              if (num <= 0) { -              onig_scan_env_set_error_string(env, -                        ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);                return ONIGERR_UNDEFINED_NAME_REFERENCE;              }              if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { @@ -5514,7 +5535,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)  #endif        tok->type = TK_REPEAT;        tok->u.repeat.lower = 0; -      tok->u.repeat.upper = REPEAT_INFINITE; +      tok->u.repeat.upper = INFINITE_REPEAT;        goto greedy_check;        break; @@ -5525,7 +5546,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)  #endif        tok->type = TK_REPEAT;        tok->u.repeat.lower = 1; -      tok->u.repeat.upper = REPEAT_INFINITE; +      tok->u.repeat.upper = INFINITE_REPEAT;        goto greedy_check;        break; @@ -5608,7 +5629,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)                tok->u.call.gnum      = 0;                tok->u.call.name      = p;                PINC; -              if (! PPEEK_IS(')')) return ONIGERR_INVALID_GROUP_NAME; +              if (! PPEEK_IS(')')) return ONIGERR_UNDEFINED_GROUP_OPTION;                tok->u.call.name_end  = p;                break; @@ -6249,6 +6270,7 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env)    env->parse_depth++;    if (env->parse_depth > ParseDepthLimit)      return ONIGERR_PARSE_DEPTH_LIMIT_OVER; +    prev_cc = (CClassNode* )NULL;    r = fetch_token_in_cc(tok, src, end, env);    if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) { @@ -6301,10 +6323,11 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env)      case TK_RAW_BYTE:        /* tok->base != 0 : octal or hexadec. */        if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) { +        int i, j;          UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];          UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;          UChar* psave = p; -        int i, base = tok->base; +        int base = tok->base;          buf[0] = tok->u.c;          for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) { @@ -6322,6 +6345,9 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env)            goto err;          } +        /* clear buf tail */ +        for (j = i; j < ONIGENC_CODE_TO_MBC_MAXLEN; j++) buf[j] = '\0'; +          len = enclen(env->enc, buf);          if (i < len) {            r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; @@ -6359,8 +6385,13 @@ parse_char_class(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env)      val_entry:        len = ONIGENC_CODE_TO_MBCLEN(env->enc, v);        if (len < 0) { -        r = len; -        goto err; +        if (state != CCS_RANGE || +            ! IS_SYNTAX_BV(env->syntax, +                           ONIG_SYN_ALLOW_INVALID_CODE_END_OF_RANGE_IN_CC) || +            v < 0x100 || ONIGENC_MBC_MAXLEN(env->enc) == 1) { +          r = len; +          goto err; +        }        }        in_type = (len == 1 ? CCV_SB : CCV_CODE_POINT);      val_entry2: @@ -6673,7 +6704,7 @@ parse_callout_of_contents(Node** np, int cterm, UChar** src, UChar* end, ScanEnv    }    if (tag_start != tag_end) { -    r = callout_tag_entry(env->reg, tag_start, tag_end, num); +    r = callout_tag_entry(env, env->reg, tag_start, tag_end, num);      if (r != ONIG_NORMAL) return r;    } @@ -6994,7 +7025,7 @@ parse_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* en    }    if (tag_start != tag_end) { -    r = callout_tag_entry(env->reg, tag_start, tag_end, num); +    r = callout_tag_entry(env, env->reg, tag_start, tag_end, num);      if (r != ONIG_NORMAL) return r;    } @@ -7271,10 +7302,8 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,              int num;              int* backs; -            num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs); +            num = name_to_group_numbers(env, prev, name_end, &backs);              if (num <= 0) { -              onig_scan_env_set_error_string(env, -                        ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end);                return ONIGERR_UNDEFINED_NAME_REFERENCE;              }              if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) { @@ -7414,6 +7443,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,        }        break; +#ifdef USE_CAPTURE_HISTORY      case '@':        if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) {          if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { @@ -7441,6 +7471,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,          return ONIGERR_UNDEFINED_GROUP_OPTION;        }        break; +#endif  #ifdef USE_POSIXLINE_OPTION      case 'p': @@ -7688,7 +7719,7 @@ set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env)        if (targetq_num >= 0 && nestq_num < 0) {          if (targetq_num == 1 || targetq_num == 2) { /* * or + */            /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */ -          if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) { +          if (! IS_INFINITE_REPEAT(qn->upper) && qn->upper > 1 && qn->greedy) {              qn->upper = (qn->lower == 0 ? 1 : qn->lower);            }          } @@ -7826,14 +7857,18 @@ static int  parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end,            ScanEnv* env, int group_head)  { -  int r, len, group = 0; +  int r, len, group;    Node* qn;    Node** tp; +  unsigned int parse_depth; +  group = 0;    *np = NULL;    if (tok->type == (enum TokenSyms )term)      goto end_of_token; +  parse_depth = env->parse_depth; +    switch (tok->type) {    case TK_ALT:    case TK_EOT: @@ -7914,36 +7949,29 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end,        len = 1;        while (1) {          if (len >= ONIGENC_MBC_MINLEN(env->enc)) { -          if (len == enclen(env->enc, STR_(*np)->s)) {/* should not enclen_end() */ +          if (len == enclen(env->enc, STR_(*np)->s)) {              r = fetch_token(tok, src, end, env); -            NODE_STRING_CLEAR_RAW(*np); -            goto string_end; +            goto tk_raw_byte_end;            }          }          r = fetch_token(tok, src, end, env);          if (r < 0) return r; -        if (r != TK_RAW_BYTE) { -          /* Don't use this, it is wrong for little endian encodings. */ -#ifdef USE_PAD_TO_SHORT_BYTE_CHAR -          int rem; -          if (len < ONIGENC_MBC_MINLEN(env->enc)) { -            rem = ONIGENC_MBC_MINLEN(env->enc) - len; -            (void )node_str_head_pad(STR_(*np), rem, (UChar )0); -            if (len + rem == enclen(env->enc, STR_(*np)->s)) { -              NODE_STRING_CLEAR_RAW(*np); -              goto string_end; -            } -          } -#endif +        if (r != TK_RAW_BYTE)            return ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; -        }          r = node_str_cat_char(*np, (UChar )tok->u.c);          if (r < 0) return r;          len++;        } + +    tk_raw_byte_end: +      if (! ONIGENC_IS_VALID_MBC_STRING(env->enc, STR_(*np)->s, STR_(*np)->end)) +        return ONIGERR_INVALID_WIDE_CHAR_VALUE; + +      NODE_STRING_CLEAR_RAW(*np); +      goto string_end;      }      break; @@ -8055,7 +8083,7 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end,    case TK_ANYCHAR_ANYTIME:      *np = node_new_anychar();      CHECK_NULL_RETURN_MEMERR(*np); -    qn = node_new_quantifier(0, REPEAT_INFINITE, 0); +    qn = node_new_quantifier(0, INFINITE_REPEAT, 0);      CHECK_NULL_RETURN_MEMERR(qn);      NODE_BODY(qn) = *np;      *np = qn; @@ -8158,6 +8186,10 @@ parse_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end,        if (is_invalid_quantifier_target(*tp))          return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID; +      parse_depth++; +      if (parse_depth > ParseDepthLimit) +        return ONIGERR_PARSE_DEPTH_LIMIT_OVER; +        qn = node_new_quantifier(tok->u.repeat.lower, tok->u.repeat.upper,                                 r == TK_INTERVAL);        CHECK_NULL_RETURN_MEMERR(qn); diff --git a/src/regparse.h b/src/regparse.h index b7a2867..231f7b5 100644 --- a/src/regparse.h +++ b/src/regparse.h @@ -66,11 +66,11 @@ enum GimmickType {  #endif  }; -enum BodyEmpty { -  BODY_IS_NOT_EMPTY = 0, -  BODY_IS_EMPTY     = 1, -  BODY_IS_EMPTY_MEM = 2, -  BODY_IS_EMPTY_REC = 3 +enum BodyEmptyType { +  BODY_IS_NOT_EMPTY             = 0, +  BODY_IS_EMPTY_POSSIBILITY     = 1, +  BODY_IS_EMPTY_POSSIBILITY_MEM = 2, +  BODY_IS_EMPTY_POSSIBILITY_REC = 3  };  typedef struct { @@ -101,7 +101,7 @@ typedef struct {    int lower;    int upper;    int greedy; -  enum BodyEmpty empty_info; +  enum BodyEmptyType emptiness;    struct _Node* head_exact;    struct _Node* next_head_exact;    int is_refered;     /* include called node. don't eliminate even if {0} */ @@ -252,10 +252,6 @@ typedef struct _Node {  #define NODE_BIT_CALL       NODE_TYPE2BIT(NODE_CALL)  #define NODE_BIT_GIMMICK    NODE_TYPE2BIT(NODE_GIMMICK) -#define NODE_IS_SIMPLE_TYPE(node) \ -  ((NODE_TYPE2BIT(NODE_TYPE(node)) & \ -    (NODE_BIT_STRING | NODE_BIT_CCLASS | NODE_BIT_CTYPE | NODE_BIT_BACKREF)) != 0) -  #define NODE_TYPE(node)             ((node)->u.base.node_type)  #define NODE_SET_TYPE(node, ntype)   (node)->u.base.node_type = (ntype) @@ -314,7 +310,7 @@ typedef struct _Node {  #define NODE_ST_CLEN_FIXED            (1<<2)  #define NODE_ST_MARK1                 (1<<3)  #define NODE_ST_MARK2                 (1<<4) -#define NODE_ST_STOP_BT_SIMPLE_REPEAT (1<<5) +#define NODE_ST_STRICT_REAL_REPEAT    (1<<5)  #define NODE_ST_RECURSION             (1<<6)  #define NODE_ST_CALLED                (1<<7)  #define NODE_ST_ADDR_FIXED            (1<<8) @@ -357,8 +353,8 @@ typedef struct _Node {  #define NODE_IS_SUPER(node)           ((NODE_STATUS(node) & NODE_ST_SUPER)        != 0)  #define NODE_IS_PROHIBIT_RECURSION(node) \      ((NODE_STATUS(node) & NODE_ST_PROHIBIT_RECURSION) != 0) -#define NODE_IS_STOP_BT_SIMPLE_REPEAT(node) \ -    ((NODE_STATUS(node) & NODE_ST_STOP_BT_SIMPLE_REPEAT) != 0) +#define NODE_IS_STRICT_REAL_REPEAT(node) \ +    ((NODE_STATUS(node) & NODE_ST_STRICT_REAL_REPEAT) != 0)  #define NODE_BODY(node)           ((node)->u.base.body)  #define NODE_QUANT_BODY(node)     ((node)->body) diff --git a/src/utf16_be.c b/src/utf16_be.c index 22bf74d..b66d868 100644 --- a/src/utf16_be.c +++ b/src/utf16_be.c @@ -2,7 +2,7 @@    utf16_be.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2018  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -103,7 +103,25 @@ utf16be_mbc_enc_len(const UChar* p)  static int  is_valid_mbc_string(const UChar* s, const UChar* end)  { -  return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_UTF16_BE, s, end); +  while (s < end) { +    int len = utf16be_mbc_enc_len(s); +    if (len == 4) { +      if (s + 2 >= end) +        return FALSE; +      if (! UTF16_IS_SURROGATE_SECOND(*(s+2))) +        return FALSE; +    } +    else +      if (UTF16_IS_SURROGATE_SECOND(*s)) +        return FALSE; + +    s += len; +  } + +  if (s != end) +    return FALSE; +  else +    return TRUE;  }  static int @@ -146,7 +164,15 @@ utf16be_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)  static int  utf16be_code_to_mbclen(OnigCodePoint code)  { -  return (code > 0xffff ? 4 : 2); +  if (code > 0xffff) { +    if (code > 0x10ffff) +      return ONIGERR_INVALID_CODE_POINT_VALUE; +    else +      return 4; +  } +  else { +    return 2; +  }  }  static int @@ -243,7 +269,8 @@ utf16be_left_adjust_char_head(const UChar* start, const UChar* s)      s--;    } -  if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1) +  if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1 && +      UTF16_IS_SURROGATE_FIRST(*(s-2)))      s -= 2;    return (UChar* )s; diff --git a/src/utf16_le.c b/src/utf16_le.c index 4b231c6..cdc74b0 100644 --- a/src/utf16_le.c +++ b/src/utf16_le.c @@ -2,7 +2,7 @@    utf16_le.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2018  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2002-2019  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -95,7 +95,15 @@ static const int EncLen_UTF16[] = {  static int  utf16le_code_to_mbclen(OnigCodePoint code)  { -  return (code > 0xffff ? 4 : 2); +  if (code > 0xffff) { +    if (code > 0x10ffff) +      return ONIGERR_INVALID_CODE_POINT_VALUE; +    else +      return 4; +  } +  else { +    return 2; +  }  }  static int @@ -110,7 +118,16 @@ is_valid_mbc_string(const UChar* p, const UChar* end)    const UChar* end1 = end - 1;    while (p < end1) { -    p += utf16le_mbc_enc_len(p); +    int len = utf16le_mbc_enc_len(p); +    if (len == 4) { +      if (p + 3 < end && ! UTF16_IS_SURROGATE_SECOND(*(p + 3))) +        return FALSE; +    } +    else +      if (UTF16_IS_SURROGATE_SECOND(*(p + 1))) +        return FALSE; + +    p += len;    }    if (p != end) @@ -252,7 +269,8 @@ utf16le_left_adjust_char_head(const UChar* start, const UChar* s)      s--;    } -  if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1) +  if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1 && +      UTF16_IS_SURROGATE_FIRST(*(s-1)))      s -= 2;    return (UChar* )s; | 
