diff options
| author | Jörg Frings-Fürst <debian@jff.email> | 2021-04-26 17:45:59 +0200 | 
|---|---|---|
| committer | Jörg Frings-Fürst <debian@jff.email> | 2021-04-26 17:45:59 +0200 | 
| commit | ddebf6f9bc11c3a23c5b3b3598fb913c328e2352 (patch) | |
| tree | 585328f4ed04955626c3d2cac5db64f1726260ea /src | |
| parent | 77a04959299aa252579a98655e626d1b8f5f9f34 (diff) | |
| parent | f5b2920f12628bb7a0fb8b13097533878a1a9936 (diff) | |
Merge branch 'feature/upstream' into develop
Diffstat (limited to 'src')
| -rw-r--r-- | src/Makefile.windows | 39 | ||||
| -rw-r--r-- | src/cp1251.c | 10 | ||||
| -rw-r--r-- | src/gb18030.c | 46 | ||||
| -rw-r--r-- | src/iso8859_1.c | 96 | ||||
| -rw-r--r-- | src/iso8859_10.c | 8 | ||||
| -rw-r--r-- | src/iso8859_13.c | 8 | ||||
| -rw-r--r-- | src/iso8859_14.c | 8 | ||||
| -rw-r--r-- | src/iso8859_15.c | 8 | ||||
| -rw-r--r-- | src/iso8859_16.c | 8 | ||||
| -rw-r--r-- | src/iso8859_2.c | 8 | ||||
| -rw-r--r-- | src/iso8859_3.c | 8 | ||||
| -rw-r--r-- | src/iso8859_4.c | 8 | ||||
| -rw-r--r-- | src/iso8859_5.c | 10 | ||||
| -rw-r--r-- | src/iso8859_7.c | 10 | ||||
| -rw-r--r-- | src/iso8859_9.c | 8 | ||||
| -rw-r--r-- | src/koi8.c | 10 | ||||
| -rw-r--r-- | src/koi8_r.c | 8 | ||||
| -rw-r--r-- | src/oniguruma.h | 32 | ||||
| -rw-r--r-- | src/regcomp.c | 1105 | ||||
| -rw-r--r-- | src/regenc.c | 19 | ||||
| -rw-r--r-- | src/regenc.h | 16 | ||||
| -rw-r--r-- | src/regerror.c | 6 | ||||
| -rw-r--r-- | src/regexec.c | 417 | ||||
| -rw-r--r-- | src/regint.h | 93 | ||||
| -rw-r--r-- | src/regparse.c | 408 | ||||
| -rw-r--r-- | src/regparse.h | 25 | ||||
| -rw-r--r-- | src/regposix.c | 4 | ||||
| -rw-r--r-- | src/regsyntax.c | 31 | ||||
| -rw-r--r-- | src/unicode.c | 95 | 
29 files changed, 1658 insertions, 894 deletions
| diff --git a/src/Makefile.windows b/src/Makefile.windows index 11d6fd8..b637772 100644 --- a/src/Makefile.windows +++ b/src/Makefile.windows @@ -2,8 +2,9 @@  product_name = oniguruma -TEST_DIR = $(ONIG_DIR)/../test -WIN_DIR  = $(ONIG_DIR)/../windows +TEST_DIR   = $(ONIG_DIR)/../test +SAMPLE_DIR = $(ONIG_DIR)/../sample +WIN_DIR    = $(ONIG_DIR)/../windows  CPPFLAGS =  CFLAGS = -O2 -nologo /W3 @@ -15,6 +16,8 @@ ARDLL = cl  ARDLL_FLAGS = -nologo -LD $(LINKFLAGS) -dll  LINKFLAGS = -link -incremental:no -pdb:none +SAMPLE_CFLAGS = $(CFLAGS) /I$(ONIG_DIR) +  INSTALL = install -c  CP      = copy  CC = cl @@ -89,11 +92,6 @@ makeargs = $(MFLAGS) CPPFLAGS='$(CPPFLAGS)' CFLAGS='$(CFLAGS)' CC='$(CC)'  # targets  default: all -setup: -	$(CP) ..\win32\config.h config.h -	$(CP) ..\win32\testc.c  testc.c - -  all: $(libname) $(dllname)  $(libname): $(libobjs) $(encobjs) @@ -155,7 +153,7 @@ $(BUILD_DIR)/unicode_fold1_key.obj: $(ONIG_DIR)/unicode_fold1_key.c $(ONIG_DIR)/  $(BUILD_DIR)/unicode_fold2_key.obj: $(ONIG_DIR)/unicode_fold2_key.c $(ONIG_DIR)/regenc.h $(BUILD_DIR)/config.h  $(BUILD_DIR)/unicode_fold3_key.obj: $(ONIG_DIR)/unicode_fold3_key.c $(ONIG_DIR)/regenc.h $(BUILD_DIR)/config.h -all-test: test_syntax test_regset test_utf8 testc testp testu +all-test: test_syntax test_regset test_utf8 test_options test_back testc testp testu  test_syntax: $(TEST_DIR)/test_syntax.c $(libname)  	$(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern /utf-8 $(TEST_DIR)/test_syntax.c $(libname) @@ -166,6 +164,12 @@ test_regset: $(TEST_DIR)/test_regset.c $(libname)  test_utf8: $(TEST_DIR)/test_utf8.c $(libname)  	$(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern /utf-8 $(TEST_DIR)/test_utf8.c $(libname) +test_options: $(TEST_DIR)/test_options.c $(libname) +	$(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern /utf-8 $(TEST_DIR)/test_options.c $(libname) + +test_back: $(TEST_DIR)/test_back.c $(libname) +	$(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern /utf-8 $(TEST_DIR)/test_back.c $(libname) +  testc: $(WIN_DIR)/testc.c $(libname)  	$(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern $(WIN_DIR)/testc.c $(libname) @@ -176,14 +180,17 @@ testu: $(TEST_DIR)/testu.c $(libname)  	$(CC) -nologo /Fe:$@ /I. /I$(ONIG_DIR) /DONIG_EXTERN=extern $(TEST_DIR)/testu.c $(libname)  clean: -	del $(BUILD_DIR)\*.obj $(BUILD_DIR)\*.lib $(BUILD_DIR)\*.exp $(BUILD_DIR)\*.dll $(BUILD_DIR)\test_regset.exe $(BUILD_DIR)\test_syntax.exe $(BUILD_DIR)\test_utf8.exe $(BUILD_DIR)\testp.exe $(BUILD_DIR)\testc.exe $(BUILD_DIR)\testu.exe +	del $(BUILD_DIR)\*.obj $(BUILD_DIR)\*.lib $(BUILD_DIR)\*.exp $(BUILD_DIR)\*.dll $(BUILD_DIR)\test_regset.exe $(BUILD_DIR)\test_syntax.exe $(BUILD_DIR)\test_utf8.exe $(BUILD_DIR)\test_options.exe $(BUILD_DIR)\test_back.exe $(BUILD_DIR)\testp.exe $(BUILD_DIR)\testc.exe $(BUILD_DIR)\testu.exe  samples: all -	$(CC) $(CFLAGS) -I. /Fe:simple  $(ONIG_DIR)\sample\simple.c  $(dlllib) -	$(CC) $(CFLAGS) -I. /Fe:posix   $(ONIG_DIR)\sample\posix.c   $(dlllib) -	$(CC) $(CFLAGS) -I. /Fe:names   $(ONIG_DIR)\sample\names.c   $(dlllib) -	$(CC) $(CFLAGS) -I. /Fe:listcap $(ONIG_DIR)\sample\listcap.c $(dlllib) -	$(CC) $(CFLAGS) -I. /Fe:sql     $(ONIG_DIR)\sample\sql.c     $(dlllib) -	$(CC) $(CFLAGS) -I. /Fe:encode  $(ONIG_DIR)\sample\encode.c  $(dlllib) -	$(CC) $(CFLAGS) -I. /Fe:syntax  $(ONIG_DIR)\sample\syntax.c  $(dlllib) +	$(CC) $(SAMPLE_CFLAGS) /Fe:simple  $(SAMPLE_DIR)\simple.c  $(dlllib) +	$(CC) $(SAMPLE_CFLAGS) /Fe:posix   $(SAMPLE_DIR)\posix.c   $(dlllib) +	$(CC) $(SAMPLE_CFLAGS) /Fe:names   $(SAMPLE_DIR)\names.c   $(dlllib) +	$(CC) $(SAMPLE_CFLAGS) /Fe:listcap $(SAMPLE_DIR)\listcap.c $(dlllib) +	$(CC) $(SAMPLE_CFLAGS) /Fe:sql     $(SAMPLE_DIR)\sql.c     $(dlllib) +	$(CC) $(SAMPLE_CFLAGS) /Fe:encode  $(SAMPLE_DIR)\encode.c  $(dlllib) +	$(CC) $(SAMPLE_CFLAGS) /Fe:syntax  $(SAMPLE_DIR)\syntax.c  $(dlllib) +	$(CC) $(SAMPLE_CFLAGS) /Fe:count   $(SAMPLE_DIR)\count.c   $(dlllib) +	$(CC) $(SAMPLE_CFLAGS) /Fe:regset  $(SAMPLE_DIR)\regset.c  $(dlllib) +	$(CC) $(SAMPLE_CFLAGS) /Fe:callback_each_match $(SAMPLE_DIR)\callback_each_match.c  $(dlllib) diff --git a/src/cp1251.c b/src/cp1251.c index fa20780..36b36f6 100644 --- a/src/cp1251.c +++ b/src/cp1251.c @@ -2,7 +2,7 @@    cp1251.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2006-2019  Byte      <byte AT mail DOT kna DOT ru> + * Copyright (c) 2006-2020  Byte      <byte AT mail DOT kna DOT ru>   *                          K.Kosako   * All rights reserved.   * @@ -105,12 +105,16 @@ static const unsigned short EncCP1251_CtypeTable[256] = {  };  static int -cp1251_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, +cp1251_mbc_case_fold(OnigCaseFoldType flag,               const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower)  {    const UChar* p = *pp; -  *lower = ENC_CP1251_TO_LOWER_CASE(*p); +  if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) +    *lower = ENC_CP1251_TO_LOWER_CASE(*p); +  else +    *lower = *p; +    (*pp)++;    return 1;  } diff --git a/src/gb18030.c b/src/gb18030.c index 7409d3e..1da19b4 100644 --- a/src/gb18030.c +++ b/src/gb18030.c @@ -30,9 +30,11 @@  #include "regenc.h" -#if 1 +/* #define DEBUG_GB18030 */ -#define DEBUG_GB18030(arg) +#ifndef DEBUG_GB18030 + +#define DEBUG_OUT(arg)  #else @@ -43,7 +45,7 @@  /* for printf() */  #include "regint.h" -#define DEBUG_GB18030(arg) printf arg +#define DEBUG_OUT(arg) printf arg  #endif @@ -177,8 +179,8 @@ gb18030_is_code_ctype(OnigCodePoint code, unsigned int ctype)  }  enum state { -  S_START, -  S_one_C2, +  S_START = 0, +  S_one_C2 = 1,    S_one_C4,    S_one_CM, @@ -210,15 +212,43 @@ enum state {    S_odd_CM_even_C4CM,  }; +#ifdef DEBUG_GB18030 +static char* StateNames[] = { +  "S_START", +  "S_one_C2", +  "S_one_C4", +  "S_one_CM", +  "S_odd_CM_one_CX", +  "S_even_CM_one_CX", +  "S_one_CMC4", +  "S_odd_CMC4", +  "S_one_C4_odd_CMC4", +  "S_even_CMC4", +  "S_one_C4_even_CMC4", +  "S_odd_CM_odd_CMC4", +  "S_even_CM_odd_CMC4", +  "S_odd_CM_even_CMC4", +  "S_even_CM_even_CMC4", +  "S_odd_C4CM", +  "S_one_CM_odd_C4CM", +  "S_even_C4CM", +  "S_one_CM_even_C4CM", +  "S_even_CM_odd_C4CM", +  "S_odd_CM_odd_C4CM", +  "S_even_CM_even_C4CM", +  "S_odd_CM_even_C4CM" +}; +#endif +  static UChar*  gb18030_left_adjust_char_head(const UChar* start, const UChar* s)  {    const UChar *p;    enum state state = S_START; -  DEBUG_GB18030(("----------------\n")); +  DEBUG_OUT(("----------------\n"));    for (p = s; p >= start; p--) { -    DEBUG_GB18030(("state %d --(%02x)-->\n", state, *p)); +    DEBUG_OUT(("%5d: state %-19s (0x%02x)->\n", (int )(p - start), StateNames[state], *p));      switch (state) {      case S_START:        switch (GB18030_MAP[*p]) { @@ -499,7 +529,7 @@ gb18030_left_adjust_char_head(const UChar* start, const UChar* s)      }    } -  DEBUG_GB18030(("state %d\n", state)); +  DEBUG_OUT(("state %-19s\n", StateNames[state]));    switch (state) {    case S_START:             return (UChar *)(s - 0);    case S_one_C2:            return (UChar *)(s - 0); diff --git a/src/iso8859_1.c b/src/iso8859_1.c index d75509e..2013e75 100644 --- a/src/iso8859_1.c +++ b/src/iso8859_1.c @@ -2,7 +2,7 @@    iso8859_1.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2019  K.Kosako + * Copyright (c) 2002-2020  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -114,7 +114,7 @@ apply_all_case_fold(OnigCaseFoldType flag,  }  static int -get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED, +get_case_fold_codes_by_str(OnigCaseFoldType flag,                             const OnigUChar* p, const OnigUChar* end,                             OnigCaseFoldCodeItem items[])  { @@ -123,7 +123,8 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,    if (0x41 <= *p && *p <= 0x5a) {      if (*p == LARGE_S && end > p + 1 -        && (*(p+1) == LARGE_S || *(p+1) == SMALL_S)) { /* SS */ +        && (*(p+1) == LARGE_S || *(p+1) == SMALL_S) +        && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { /* SS */      ss_combination:        items[0].byte_len = 2;        items[0].code_len = 1; @@ -152,7 +153,8 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,    }    else if (0x61 <= *p && *p <= 0x7a) {      if (*p == SMALL_S && end > p + 1 -        && (*(p+1) == SMALL_S || *(p+1) == LARGE_S)) { /* ss */ +        && (*(p+1) == SMALL_S || *(p+1) == LARGE_S) +        && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { /* ss */        goto ss_combination;      } @@ -161,56 +163,58 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag ARG_UNUSED,      items[0].code[0] = (OnigCodePoint )(*p - 0x20);      return 1;    } -  else if (0xc0 <= *p && *p <= 0xcf) { -    items[0].byte_len = 1; -    items[0].code_len = 1; -    items[0].code[0] = (OnigCodePoint )(*p + 0x20); -    return 1; -  } -  else if (0xd0 <= *p && *p <= 0xdf) { -    if (*p == 0xdf) { +  else if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) { +    if (0xc0 <= *p && *p <= 0xcf) {        items[0].byte_len = 1; -      items[0].code_len = 2; -      items[0].code[0] = (OnigCodePoint )'s'; -      items[0].code[1] = (OnigCodePoint )'s'; +      items[0].code_len = 1; +      items[0].code[0] = (OnigCodePoint )(*p + 0x20); +      return 1; +    } +    else if (0xd0 <= *p && *p <= 0xdf) { +      if (*p == 0xdf) { +        items[0].byte_len = 1; +        items[0].code_len = 2; +        items[0].code[0] = (OnigCodePoint )'s'; +        items[0].code[1] = (OnigCodePoint )'s'; -      items[1].byte_len = 1; -      items[1].code_len = 2; -      items[1].code[0] = (OnigCodePoint )'S'; -      items[1].code[1] = (OnigCodePoint )'S'; +        items[1].byte_len = 1; +        items[1].code_len = 2; +        items[1].code[0] = (OnigCodePoint )'S'; +        items[1].code[1] = (OnigCodePoint )'S'; -      items[2].byte_len = 1; -      items[2].code_len = 2; -      items[2].code[0] = (OnigCodePoint )'s'; -      items[2].code[1] = (OnigCodePoint )'S'; +        items[2].byte_len = 1; +        items[2].code_len = 2; +        items[2].code[0] = (OnigCodePoint )'s'; +        items[2].code[1] = (OnigCodePoint )'S'; -      items[3].byte_len = 1; -      items[3].code_len = 2; -      items[3].code[0] = (OnigCodePoint )'S'; -      items[3].code[1] = (OnigCodePoint )'s'; +        items[3].byte_len = 1; +        items[3].code_len = 2; +        items[3].code[0] = (OnigCodePoint )'S'; +        items[3].code[1] = (OnigCodePoint )'s'; -      return 4; -    } -    else if (*p != 0xd7) { -      items[0].byte_len = 1; -      items[0].code_len = 1; -      items[0].code[0] = (OnigCodePoint )(*p + 0x20); -      return 1; +        return 4; +      } +      else if (*p != 0xd7) { +        items[0].byte_len = 1; +        items[0].code_len = 1; +        items[0].code[0] = (OnigCodePoint )(*p + 0x20); +        return 1; +      }      } -  } -  else if (0xe0 <= *p && *p <= 0xef) { -    items[0].byte_len = 1; -    items[0].code_len = 1; -    items[0].code[0] = (OnigCodePoint )(*p - 0x20); -    return 1; -  } -  else if (0xf0 <= *p && *p <= 0xfe) { -    if (*p != 0xf7) { +    else if (0xe0 <= *p && *p <= 0xef) {        items[0].byte_len = 1;        items[0].code_len = 1;        items[0].code[0] = (OnigCodePoint )(*p - 0x20);        return 1;      } +    else if (0xf0 <= *p && *p <= 0xfe) { +      if (*p != 0xf7) { +        items[0].byte_len = 1; +        items[0].code_len = 1; +        items[0].code[0] = (OnigCodePoint )(*p - 0x20); +        return 1; +      } +    }    }    return 0; @@ -229,7 +233,11 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp,      return 2;    } -  *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p); +  if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) +    *lower = ONIGENC_ISO_8859_1_TO_LOWER_CASE(*p); +  else +    *lower = *p; +    (*pp)++;    return 1;  } diff --git a/src/iso8859_10.c b/src/iso8859_10.c index e98cffb..e4bf599 100644 --- a/src/iso8859_10.c +++ b/src/iso8859_10.c @@ -2,7 +2,7 @@    iso8859_10.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2019  K.Kosako + * Copyright (c) 2002-2020  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag,      return 2;    } -  *lower = ENC_ISO_8859_10_TO_LOWER_CASE(*p); +  if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) +    *lower = ENC_ISO_8859_10_TO_LOWER_CASE(*p); +  else +    *lower = *p; +    (*pp)++;    return 1;  } diff --git a/src/iso8859_13.c b/src/iso8859_13.c index 2bd460f..dbf747f 100644 --- a/src/iso8859_13.c +++ b/src/iso8859_13.c @@ -2,7 +2,7 @@    iso8859_13.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2019  K.Kosako + * Copyright (c) 2002-2020  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag,      return 2;    } -  *lower = ENC_ISO_8859_13_TO_LOWER_CASE(*p); +  if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) +    *lower = ENC_ISO_8859_13_TO_LOWER_CASE(*p); +  else +    *lower = *p; +    (*pp)++;    return 1;  } diff --git a/src/iso8859_14.c b/src/iso8859_14.c index 5030b55..a6d6b71 100644 --- a/src/iso8859_14.c +++ b/src/iso8859_14.c @@ -2,7 +2,7 @@    iso8859_14.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2019  K.Kosako + * Copyright (c) 2002-2020  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag,      return 2;    } -  *lower = ENC_ISO_8859_14_TO_LOWER_CASE(*p); +  if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) +    *lower = ENC_ISO_8859_14_TO_LOWER_CASE(*p); +  else +    *lower = *p; +    (*pp)++;    return 1; /* return byte length of converted char to lower */  } diff --git a/src/iso8859_15.c b/src/iso8859_15.c index f32c3de..0bb6b12 100644 --- a/src/iso8859_15.c +++ b/src/iso8859_15.c @@ -2,7 +2,7 @@    iso8859_15.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2019  K.Kosako + * Copyright (c) 2002-2020  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag,      return 2;    } -  *lower = ENC_ISO_8859_15_TO_LOWER_CASE(*p); +  if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) +    *lower = ENC_ISO_8859_15_TO_LOWER_CASE(*p); +  else +    *lower = *p; +    (*pp)++;    return 1; /* return byte length of converted char to lower */  } diff --git a/src/iso8859_16.c b/src/iso8859_16.c index 22a653a..bfd0a5b 100644 --- a/src/iso8859_16.c +++ b/src/iso8859_16.c @@ -2,7 +2,7 @@    iso8859_16.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2019  K.Kosako + * Copyright (c) 2002-2020  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag,      return 2;    } -  *lower = ENC_ISO_8859_16_TO_LOWER_CASE(*p); +  if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) +    *lower = ENC_ISO_8859_16_TO_LOWER_CASE(*p); +  else +    *lower = *p; +    (*pp)++;    return 1; /* return byte length of converted char to lower */  } diff --git a/src/iso8859_2.c b/src/iso8859_2.c index dc3d0a1..d08140e 100644 --- a/src/iso8859_2.c +++ b/src/iso8859_2.c @@ -2,7 +2,7 @@    iso8859_2.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2019  K.Kosako + * Copyright (c) 2002-2020  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag,      return 2;    } -  *lower = ENC_ISO_8859_2_TO_LOWER_CASE(*p); +  if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) +    *lower = ENC_ISO_8859_2_TO_LOWER_CASE(*p); +  else +    *lower = *p; +    (*pp)++;    return 1; /* return byte length of converted char to lower */  } diff --git a/src/iso8859_3.c b/src/iso8859_3.c index 49dc6b2..69b96fd 100644 --- a/src/iso8859_3.c +++ b/src/iso8859_3.c @@ -2,7 +2,7 @@    iso8859_3.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2019  K.Kosako + * Copyright (c) 2002-2020  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag, const UChar** pp,      return 2;    } -  *lower = ENC_ISO_8859_3_TO_LOWER_CASE(*p); +  if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) +    *lower = ENC_ISO_8859_3_TO_LOWER_CASE(*p); +  else +    *lower = *p; +    (*pp)++;    return 1;  } diff --git a/src/iso8859_4.c b/src/iso8859_4.c index f3f6ba9..949b7a1 100644 --- a/src/iso8859_4.c +++ b/src/iso8859_4.c @@ -2,7 +2,7 @@    iso8859_4.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2019  K.Kosako + * Copyright (c) 2002-2020  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag,      return 2;    } -  *lower = ENC_ISO_8859_4_TO_LOWER_CASE(*p); +  if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) +    *lower = ENC_ISO_8859_4_TO_LOWER_CASE(*p); +  else +    *lower = *p; +    (*pp)++;    return 1; /* return byte length of converted char to lower */  } diff --git a/src/iso8859_5.c b/src/iso8859_5.c index a5f587c..9e5d418 100644 --- a/src/iso8859_5.c +++ b/src/iso8859_5.c @@ -2,7 +2,7 @@    iso8859_5.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2019  K.Kosako + * Copyright (c) 2002-2020  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -104,12 +104,16 @@ static const unsigned short EncISO_8859_5_CtypeTable[256] = {  };  static int -mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, +mbc_case_fold(OnigCaseFoldType flag,                const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower)  {    const UChar* p = *pp; -  *lower = ENC_ISO_8859_5_TO_LOWER_CASE(*p); +  if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) +    *lower = ENC_ISO_8859_5_TO_LOWER_CASE(*p); +  else +    *lower = *p; +    (*pp)++;    return 1;  } diff --git a/src/iso8859_7.c b/src/iso8859_7.c index 018efac..07b1360 100644 --- a/src/iso8859_7.c +++ b/src/iso8859_7.c @@ -2,7 +2,7 @@    iso8859_7.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2019  K.Kosako + * Copyright (c) 2002-2020  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -104,12 +104,16 @@ static const unsigned short EncISO_8859_7_CtypeTable[256] = {  };  static int -mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, +mbc_case_fold(OnigCaseFoldType flag,                const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower)  {    const UChar* p = *pp; -  *lower = ENC_ISO_8859_7_TO_LOWER_CASE(*p); +  if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) +    *lower = ENC_ISO_8859_7_TO_LOWER_CASE(*p); +  else +    *lower = *p; +    (*pp)++;    return 1;  } diff --git a/src/iso8859_9.c b/src/iso8859_9.c index 1f9bdea..6f205e5 100644 --- a/src/iso8859_9.c +++ b/src/iso8859_9.c @@ -2,7 +2,7 @@    iso8859_9.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2019  K.Kosako + * Copyright (c) 2002-2020  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -116,7 +116,11 @@ mbc_case_fold(OnigCaseFoldType flag,      return 2;    } -  *lower = ENC_ISO_8859_9_TO_LOWER_CASE(*p); +  if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) +    *lower = ENC_ISO_8859_9_TO_LOWER_CASE(*p); +  else +    *lower = *p; +    (*pp)++;    return 1;  } @@ -2,7 +2,7 @@    koi8.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2019  K.Kosako + * Copyright (c) 2002-2020  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -105,12 +105,16 @@ static const unsigned short EncKOI8_CtypeTable[256] = {  static int -koi8_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED, +koi8_mbc_case_fold(OnigCaseFoldType flag,                     const UChar** pp, const UChar* end ARG_UNUSED, UChar* lower)  {    const UChar* p = *pp; -  *lower = ENC_KOI8_TO_LOWER_CASE(*p); +  if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) +    *lower = ENC_KOI8_TO_LOWER_CASE(*p); +  else +    *lower = *p; +    (*pp)++;    return 1;  } diff --git a/src/koi8_r.c b/src/koi8_r.c index c77302f..31cc870 100644 --- a/src/koi8_r.c +++ b/src/koi8_r.c @@ -2,7 +2,7 @@    koi8_r.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2019  K.Kosako + * Copyright (c) 2002-2020  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -109,7 +109,11 @@ koi8_r_mbc_case_fold(OnigCaseFoldType flag ARG_UNUSED,  {    const UChar* p = *pp; -  *lower = ENC_KOI8_R_TO_LOWER_CASE(*p); +  if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(*p)) +    *lower = ENC_KOI8_R_TO_LOWER_CASE(*p); +  else +    *lower = *p; +    (*pp)++;    return 1;  } diff --git a/src/oniguruma.h b/src/oniguruma.h index d983fc9..a7b9d8f 100644 --- a/src/oniguruma.h +++ b/src/oniguruma.h @@ -4,7 +4,7 @@    oniguruma.h - Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2020  K.Kosako + * Copyright (c) 2002-2021  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -36,9 +36,9 @@ extern "C" {  #define ONIGURUMA  #define ONIGURUMA_VERSION_MAJOR   6  #define ONIGURUMA_VERSION_MINOR   9 -#define ONIGURUMA_VERSION_TEENY   6 +#define ONIGURUMA_VERSION_TEENY   7 -#define ONIGURUMA_VERSION_INT     60906 +#define ONIGURUMA_VERSION_INT     60907  #ifndef P_  #if defined(__STDC__) || defined(_WIN32) @@ -91,6 +91,7 @@ typedef unsigned int OnigCaseFoldType; /* case fold flag */  ONIG_EXTERN OnigCaseFoldType OnigDefaultCaseFoldFlag; +#define ONIGENC_CASE_FOLD_ASCII_ONLY            (1)  /* #define ONIGENC_CASE_FOLD_HIRAGANA_KATAKANA  (1<<1) */  /* #define ONIGENC_CASE_FOLD_KATAKANA_WIDTH     (1<<2) */  #define ONIGENC_CASE_FOLD_TURKISH_AZERI         (1<<20) @@ -387,9 +388,9 @@ typedef unsigned int        OnigOptionType;  #define ONIG_OPTION_NOTEOL                    (ONIG_OPTION_NOTBOL << 1)  #define ONIG_OPTION_POSIX_REGION              (ONIG_OPTION_NOTEOL << 1)  #define ONIG_OPTION_CHECK_VALIDITY_OF_STRING  (ONIG_OPTION_POSIX_REGION << 1) -/* #define ONIG_OPTION_CRLF_AS_LINE_SEPARATOR    (ONIG_OPTION_CHECK_VALIDITY_OF_STRING << 1) */  /* options (compile time) */ -#define ONIG_OPTION_WORD_IS_ASCII        (ONIG_OPTION_CHECK_VALIDITY_OF_STRING << 4) +#define ONIG_OPTION_IGNORECASE_IS_ASCII  (ONIG_OPTION_CHECK_VALIDITY_OF_STRING << 3) +#define ONIG_OPTION_WORD_IS_ASCII        (ONIG_OPTION_IGNORECASE_IS_ASCII << 1)  #define ONIG_OPTION_DIGIT_IS_ASCII       (ONIG_OPTION_WORD_IS_ASCII << 1)  #define ONIG_OPTION_SPACE_IS_ASCII       (ONIG_OPTION_DIGIT_IS_ASCII << 1)  #define ONIG_OPTION_POSIX_IS_ASCII       (ONIG_OPTION_SPACE_IS_ASCII << 1) @@ -399,8 +400,9 @@ typedef unsigned int        OnigOptionType;  #define ONIG_OPTION_NOT_BEGIN_STRING     (ONIG_OPTION_TEXT_SEGMENT_WORD << 1)  #define ONIG_OPTION_NOT_END_STRING       (ONIG_OPTION_NOT_BEGIN_STRING << 1)  #define ONIG_OPTION_NOT_BEGIN_POSITION   (ONIG_OPTION_NOT_END_STRING << 1) +#define ONIG_OPTION_CALLBACK_EACH_MATCH  (ONIG_OPTION_NOT_BEGIN_POSITION << 1) -#define ONIG_OPTION_MAXBIT               ONIG_OPTION_NOT_BEGIN_POSITION +#define ONIG_OPTION_MAXBIT               ONIG_OPTION_CALLBACK_EACH_MATCH  #define ONIG_OPTION_ON(options,regopt)      ((options) |= (regopt))  #define ONIG_OPTION_OFF(options,regopt)     ((options) &= ~(regopt)) @@ -425,6 +427,7 @@ ONIG_EXTERN OnigSyntaxType OnigSyntaxJava;  ONIG_EXTERN OnigSyntaxType OnigSyntaxPerl;  ONIG_EXTERN OnigSyntaxType OnigSyntaxPerl_NG;  ONIG_EXTERN OnigSyntaxType OnigSyntaxRuby; +ONIG_EXTERN OnigSyntaxType OnigSyntaxPython;  ONIG_EXTERN OnigSyntaxType OnigSyntaxOniguruma;  /* predefined syntaxes (see regsyntax.c) */ @@ -438,6 +441,7 @@ ONIG_EXTERN OnigSyntaxType OnigSyntaxOniguruma;  #define ONIG_SYNTAX_PERL               (&OnigSyntaxPerl)  #define ONIG_SYNTAX_PERL_NG            (&OnigSyntaxPerl_NG)  #define ONIG_SYNTAX_RUBY               (&OnigSyntaxRuby) +#define ONIG_SYNTAX_PYTHON             (&OnigSyntaxPython)  #define ONIG_SYNTAX_ONIGURUMA          (&OnigSyntaxOniguruma)  /* default syntax */ @@ -510,6 +514,7 @@ ONIG_EXTERN OnigSyntaxType*   OnigDefaultSyntax;  #define ONIG_SYN_OP2_QMARK_BRACE_CALLOUT_CONTENTS (1U<<28) /* (?{...}) (?{{...}}) */  #define ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME      (1U<<29) /* (*name) (*name{a,..}) */  #define ONIG_SYN_OP2_OPTION_ONIGURUMA           (1U<<30) /* (?imxWDSPy) */ +#define ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME       (1U<<31) /* (?P<name>...) (?P=name) */  /* syntax (behavior) */  #define ONIG_SYN_CONTEXT_INDEP_ANCHORS           (1U<<31) /* not implemented */ @@ -525,6 +530,7 @@ ONIG_EXTERN OnigSyntaxType*   OnigDefaultSyntax;  #define ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY   (1U<<9)  /* a{n}?=(?:a{n})? */  #define ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH (1U<<10) /* ..(?i)...|... */  #define ONIG_SYN_VARIABLE_LEN_LOOK_BEHIND        (1U<<11)  /* (?<=a+|..) */ +#define ONIG_SYN_PYTHON                          (1U<<12)  /* \UHHHHHHHH */  /* syntax (behavior) in char class [...] */  #define ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC      (1U<<20) /* [^...] */ @@ -548,8 +554,10 @@ ONIG_EXTERN OnigSyntaxType*   OnigDefaultSyntax;  /* error codes */  #define ONIG_IS_PATTERN_ERROR(ecode)   ((ecode) <= -100 && (ecode) > -1000) +  /* normal return */  #define ONIG_NORMAL                                            0 +#define ONIG_VALUE_IS_NOT_SET                                  1  #define ONIG_MISMATCH                                         -1  #define ONIG_NO_SUPPORT_CONFIG                                -2  #define ONIG_ABORT                                            -3 @@ -607,6 +615,7 @@ ONIG_EXTERN OnigSyntaxType*   OnigDefaultSyntax;  #define ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED         -209  #define ONIGERR_TOO_MANY_CAPTURES                            -210  #define ONIGERR_TOO_LONG_WIDE_CHAR_VALUE                     -212 +#define ONIGERR_UNDEFINED_OPERATOR                           -213  #define ONIGERR_EMPTY_GROUP_NAME                             -214  #define ONIGERR_INVALID_GROUP_NAME                           -215  #define ONIGERR_INVALID_CHAR_IN_GROUP_NAME                   -216 @@ -633,6 +642,7 @@ ONIG_EXTERN OnigSyntaxType*   OnigDefaultSyntax;  #define ONIGERR_INVALID_COMBINATION_OF_OPTIONS               -403  #define ONIGERR_TOO_MANY_USER_DEFINED_OBJECTS                -404  #define ONIGERR_TOO_LONG_PROPERTY_NAME                       -405 +#define ONIGERR_VERY_INEFFICIENT_PATTERN                     -406  #define ONIGERR_LIBRARY_IS_NOT_INITIALIZED                   -500  /* errors related to thread */ @@ -717,6 +727,8 @@ typedef struct {    OnigCaseFoldType   case_fold_flag;  } OnigCompileInfo; +typedef int (*OnigCallbackEachMatchFunc)(const OnigUChar* str, const OnigUChar* end, const OnigUChar* match_start, OnigRegion* region, void* user_data); +  /* types for callout */  typedef enum { @@ -940,6 +952,12 @@ const char* onig_version P_((void));  ONIG_EXTERN  const char* onig_copyright P_((void)); +/* for callback each match */ +ONIG_EXTERN +OnigCallbackEachMatchFunc onig_get_callback_each_match P_((void)); +ONIG_EXTERN +int onig_set_callback_each_match P_((OnigCallbackEachMatchFunc f)); +  /* for OnigMatchParam */  ONIG_EXTERN  OnigMatchParam* onig_new_match_param P_((void)); @@ -981,6 +999,8 @@ ONIG_EXTERN  int onig_get_callout_data_by_tag P_((OnigRegex reg, OnigMatchParam* mp, const OnigUChar* tag, const OnigUChar* tag_end, int slot, OnigType* type, OnigValue* val));  ONIG_EXTERN  int onig_set_callout_data_by_tag P_((OnigRegex reg, OnigMatchParam* mp, const OnigUChar* tag, const OnigUChar* tag_end, int slot, OnigType type, OnigValue* val)); +ONIG_EXTERN +int onig_get_callout_data_by_tag_dont_clear_old P_((regex_t* reg, OnigMatchParam* mp, const OnigUChar* tag, const OnigUChar* tag_end, int slot, OnigType* type, OnigValue* val));  /* used in callout functions */  ONIG_EXTERN diff --git a/src/regcomp.c b/src/regcomp.c index dd2b328..d80551d 100644 --- a/src/regcomp.c +++ b/src/regcomp.c @@ -2,7 +2,7 @@    regcomp.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2020  K.Kosako + * Copyright (c) 2002-2021  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -31,6 +31,9 @@  #define OPS_INIT_SIZE  8 +#define NODE_IS_REAL_IGNORECASE(node) \ +  (NODE_IS_IGNORECASE(node) && !NODE_STRING_IS_CRUDE(node)) +  typedef struct {    OnigLen min;    OnigLen max; @@ -44,7 +47,7 @@ typedef struct {  OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN; -static OnigLen node_min_byte_len(Node* node, ScanEnv* env); +static OnigLen node_min_byte_len(Node* node, ParseEnv* env);  #if 0  typedef struct { @@ -129,27 +132,22 @@ ops_init(regex_t* reg, int init_alloc_size)    Operation* p;    size_t size; -  if (init_alloc_size > 0) { -    size = sizeof(Operation) * init_alloc_size; -    p = (Operation* )xrealloc(reg->ops, size); -    CHECK_NULL_RETURN_MEMERR(p); -    reg->ops = p; +  if (init_alloc_size <= 0) +    return ONIGERR_PARSER_BUG; + +  size = sizeof(Operation) * init_alloc_size; +  p = (Operation* )xrealloc(reg->ops, size); +  CHECK_NULL_RETURN_MEMERR(p); +  reg->ops = p;  #ifdef USE_DIRECT_THREADED_CODE -    { -      enum OpCode* cp; -      size = sizeof(enum OpCode) * init_alloc_size; -      cp = (enum OpCode* )xrealloc(reg->ocs, size); -      CHECK_NULL_RETURN_MEMERR(cp); -      reg->ocs = cp; -    } -#endif +  { +    enum OpCode* cp; +    size = sizeof(enum OpCode) * init_alloc_size; +    cp = (enum OpCode* )xrealloc(reg->ocs, size); +    CHECK_NULL_RETURN_MEMERR(cp); +    reg->ocs = cp;    } -  else { -    reg->ops = (Operation* )0; -#ifdef USE_DIRECT_THREADED_CODE -    reg->ocs = (enum OpCode* )0;  #endif -  }    reg->ops_curr  = 0; /* !!! not yet done ops_new() */    reg->ops_alloc = init_alloc_size; @@ -159,19 +157,16 @@ ops_init(regex_t* reg, int init_alloc_size)  }  static int -ops_expand(regex_t* reg, int n) +ops_resize(regex_t* reg, int n)  { -#define MIN_OPS_EXPAND_SIZE   4 -  #ifdef USE_DIRECT_THREADED_CODE    enum OpCode* cp;  #endif    Operation* p;    size_t size; -  if (n <= 0) n = MIN_OPS_EXPAND_SIZE; - -  n += reg->ops_alloc; +  if (n == reg->ops_alloc) return ONIG_NORMAL; +  if (n <= 0) return ONIGERR_PARSER_BUG;    size = sizeof(Operation) * n;    p = (Operation* )xrealloc(reg->ops, size); @@ -197,10 +192,8 @@ ops_expand(regex_t* reg, int n)  static int  ops_new(regex_t* reg)  { -  int r; -    if (reg->ops_used >= reg->ops_alloc) { -    r = ops_expand(reg, reg->ops_alloc); +    int r = ops_resize(reg, reg->ops_alloc << 1);      if (r != ONIG_NORMAL) return r;    } @@ -669,6 +662,8 @@ mmcl_alt_merge(MinMaxCharLen* to, MinMaxCharLen* alt)    if (to->max < alt->max) to->max = alt->max;  } +#ifndef ONIG_DONT_OPTIMIZE +  static int  mml_is_equal(MinMaxLen* a, MinMaxLen* b)  { @@ -709,9 +704,11 @@ mml_alt_merge(MinMaxLen* to, MinMaxLen* alt)    if (to->max < alt->max) to->max = alt->max;  } +#endif +  /* fixed size pattern node only */  static int -node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env, +node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ParseEnv* env,                 int level)  {    MinMaxCharLen tci; @@ -768,7 +765,8 @@ node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env,        StrNode* sn = STR_(node);        UChar *s = sn->s; -      if (NODE_IS_IGNORECASE(node) && ! NODE_STRING_IS_CRUDE(node)) { +      if (NODE_IS_REAL_IGNORECASE(node) && +          CASE_FOLD_IS_NOT_ASCII_ONLY(env->case_fold_flag)) {          /* Such a case is possible.             ex. /(?i)(?<=\1)(a)/             Backref node refer to capture group, but it doesn't tune yet. @@ -917,7 +915,7 @@ node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env,      {        int i;        int* backs; -      MemEnv* mem_env = SCANENV_MEMENV(env); +      MemEnv* mem_env = PARSEENV_MEMENV(env);        BackRefNode* br = BACKREF_(node);        backs = BACKREFS_P(br); @@ -943,7 +941,7 @@ node_char_len1(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env,  }  static int -node_char_len(Node* node, regex_t* reg, MinMaxCharLen* ci, ScanEnv* env) +node_char_len(Node* node, regex_t* reg, MinMaxCharLen* ci, ParseEnv* env)  {    return node_char_len1(node, reg, ci, env, 0);  } @@ -967,7 +965,7 @@ add_op(regex_t* reg, int opcode)  }  static int compile_length_tree(Node* node, regex_t* reg); -static int compile_tree(Node* node, regex_t* reg, ScanEnv* env); +static int compile_tree(Node* node, regex_t* reg, ParseEnv* env);  #define IS_NEED_STR_LEN_OP(op) \ @@ -1035,7 +1033,7 @@ is_strict_real_node(Node* node)  }  static int -compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ScanEnv* env) +compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ParseEnv* env)  {    int r;    int saved_num_empty_check; @@ -1060,14 +1058,20 @@ compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ScanEnv* env)      if (emptiness == BODY_MAY_BE_EMPTY)        r = add_op(reg, OP_EMPTY_CHECK_END);      else if (emptiness == BODY_MAY_BE_EMPTY_MEM) { -      if (NODE_IS_EMPTY_STATUS_CHECK(qn) != 0) +      if (NODE_IS_EMPTY_STATUS_CHECK(qn) != 0 && qn->empty_status_mem != 0) {          r = add_op(reg, OP_EMPTY_CHECK_END_MEMST); +        if (r != 0) return r; +        COP(reg)->empty_check_end.empty_status_mem = qn->empty_status_mem; +      }        else          r = add_op(reg, OP_EMPTY_CHECK_END);      }  #ifdef USE_CALL -    else if (emptiness == BODY_MAY_BE_EMPTY_REC) +    else if (emptiness == BODY_MAY_BE_EMPTY_REC) {        r = add_op(reg, OP_EMPTY_CHECK_END_MEMST_PUSH); +      if (r != 0) return r; +      COP(reg)->empty_check_end.empty_status_mem = qn->empty_status_mem; +    }  #endif      if (r != 0) return r; @@ -1078,7 +1082,7 @@ compile_quant_body_with_empty_check(QuantNode* qn, regex_t* reg, ScanEnv* env)  #ifdef USE_CALL  static int -compile_call(CallNode* node, regex_t* reg, ScanEnv* env) +compile_call(CallNode* node, regex_t* reg, ParseEnv* env)  {    int r;    int offset; @@ -1098,7 +1102,7 @@ compile_call(CallNode* node, regex_t* reg, ScanEnv* env)  #endif  static int -compile_tree_n_times(Node* node, int n, regex_t* reg, ScanEnv* env) +compile_tree_n_times(Node* node, int n, regex_t* reg, ParseEnv* env)  {    int i, r; @@ -1356,7 +1360,7 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper, int ops_index)  static int  compile_range_repeat_node(QuantNode* qn, int target_len, int emptiness, -                          regex_t* reg, ScanEnv* env) +                          regex_t* reg, ParseEnv* env)  {    int r;    int num_repeat = reg->num_repeat++; @@ -1469,7 +1473,7 @@ compile_length_quantifier_node(QuantNode* qn, regex_t* reg)  }  static int -compile_quantifier_node(QuantNode* qn, regex_t* reg, ScanEnv* env) +compile_quantifier_node(QuantNode* qn, regex_t* reg, ParseEnv* env)  {    int i, r, mod_tlen;    int infinite = IS_INFINITE_REPEAT(qn->upper); @@ -1649,7 +1653,7 @@ compile_length_option_node(BagNode* node, regex_t* reg)  }  static int -compile_option_node(BagNode* node, regex_t* reg, ScanEnv* env) +compile_option_node(BagNode* node, regex_t* reg, ParseEnv* env)  {    int r; @@ -1765,7 +1769,7 @@ compile_length_bag_node(BagNode* node, regex_t* reg)  }  static int -compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env) +compile_bag_memory_node(BagNode* node, regex_t* reg, ParseEnv* env)  {    int r; @@ -1845,7 +1849,7 @@ compile_bag_memory_node(BagNode* node, regex_t* reg, ScanEnv* env)  }  static int -compile_bag_node(BagNode* node, regex_t* reg, ScanEnv* env) +compile_bag_node(BagNode* node, regex_t* reg, ParseEnv* env)  {    int r, len; @@ -2036,7 +2040,7 @@ compile_length_anchor_node(AnchorNode* node, regex_t* reg)  }  static int -compile_anchor_look_behind_node(AnchorNode* node, regex_t* reg, ScanEnv* env) +compile_anchor_look_behind_node(AnchorNode* node, regex_t* reg, ParseEnv* env)  {    int r; @@ -2150,7 +2154,7 @@ compile_anchor_look_behind_node(AnchorNode* node, regex_t* reg, ScanEnv* env)  static int  compile_anchor_look_behind_not_node(AnchorNode* node, regex_t* reg, -                                    ScanEnv* env) +                                    ParseEnv* env)  {    int r;    int len; @@ -2279,7 +2283,7 @@ compile_anchor_look_behind_not_node(AnchorNode* node, regex_t* reg,  }  static int -compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) +compile_anchor_node(AnchorNode* node, regex_t* reg, ParseEnv* env)  {    int r, len;    enum OpCode op; @@ -2573,7 +2577,7 @@ compile_length_tree(Node* node, regex_t* reg)  }  static int -compile_tree(Node* node, regex_t* reg, ScanEnv* env) +compile_tree(Node* node, regex_t* reg, ParseEnv* env)  {    int n, len, pos, r = 0; @@ -2983,7 +2987,7 @@ numbered_ref_check(Node* node)  }  static int -disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env) +disable_noname_group_capture(Node** root, regex_t* reg, ParseEnv* env)  {    int r, i, pos, counter;    MemStatusType loc; @@ -3003,7 +3007,7 @@ disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)    for (i = 1, pos = 1; i <= env->num_mem; i++) {      if (map[i].new_val > 0) { -      SCANENV_MEMENV(env)[pos] = SCANENV_MEMENV(env)[i]; +      PARSEENV_MEMENV(env)[pos] = PARSEENV_MEMENV(env)[i];        pos++;      }    } @@ -3285,8 +3289,7 @@ get_tree_head_literal(Node* node, int exact, regex_t* reg)        if (sn->end <= sn->s)          break; -      if (exact == 0 || -          ! NODE_IS_IGNORECASE(node) || NODE_STRING_IS_CRUDE(node)) { +      if (exact == 0 || !NODE_IS_REAL_IGNORECASE(node)) {          n = node;        }      } @@ -3381,7 +3384,7 @@ get_tree_tail_literal(Node* node, Node** rnode, regex_t* reg)          break;        } -      if (NODE_IS_IGNORECASE(node) && ! NODE_STRING_IS_CRUDE(node)) { +      if (NODE_IS_REAL_IGNORECASE(node)) {          r = GET_VALUE_NONE;          break;        } @@ -3601,7 +3604,7 @@ check_node_in_look_behind(Node* node, int not, int* used)  }  static OnigLen -node_min_byte_len(Node* node, ScanEnv* env) +node_min_byte_len(Node* node, ParseEnv* env)  {    OnigLen len;    OnigLen tmin; @@ -3612,7 +3615,7 @@ node_min_byte_len(Node* node, ScanEnv* env)      if (! NODE_IS_CHECKER(node)) {        int i;        int* backs; -      MemEnv* mem_env = SCANENV_MEMENV(env); +      MemEnv* mem_env = PARSEENV_MEMENV(env);        BackRefNode* br = BACKREF_(node);        if (NODE_IS_RECURSION(node)) break; @@ -3629,10 +3632,8 @@ node_min_byte_len(Node* node, ScanEnv* env)    case NODE_CALL:      {        Node* t = NODE_BODY(node); -      if (NODE_IS_RECURSION(node)) { -        if (NODE_IS_FIXED_MIN(t)) -          len = BAG_(t)->min_len; -      } +      if (NODE_IS_FIXED_MIN(t)) +        len = BAG_(t)->min_len;        else          len = node_min_byte_len(t, env);      } @@ -3742,143 +3743,8 @@ node_min_byte_len(Node* node, ScanEnv* env)    return len;  } -static OnigLen -node_max_byte_len(Node* node, ScanEnv* env) -{ -  OnigLen len; -  OnigLen tmax; - -  len = 0; -  switch (NODE_TYPE(node)) { -  case NODE_LIST: -    do { -      tmax = node_max_byte_len(NODE_CAR(node), env); -      len = distance_add(len, tmax); -    } while (IS_NOT_NULL(node = NODE_CDR(node))); -    break; - -  case NODE_ALT: -    do { -      tmax = node_max_byte_len(NODE_CAR(node), env); -      if (len < tmax) len = tmax; -    } while (IS_NOT_NULL(node = NODE_CDR(node))); -    break; - -  case NODE_STRING: -    { -      StrNode* sn = STR_(node); -      len = (OnigLen )(sn->end - sn->s); -    } -    break; - -  case NODE_CTYPE: -  case NODE_CCLASS: -    len = ONIGENC_MBC_MAXLEN_DIST(env->enc); -    break; - -  case NODE_BACKREF: -    if (! NODE_IS_CHECKER(node)) { -      int i; -      int* backs; -      MemEnv* mem_env = SCANENV_MEMENV(env); -      BackRefNode* br = BACKREF_(node); -      if (NODE_IS_RECURSION(node)) { -#ifdef USE_BACKREF_WITH_LEVEL -        if (NODE_IS_NEST_LEVEL(node)) { -          len = INFINITE_LEN; -        } -#endif -        break; -      } -      backs = BACKREFS_P(br); -      for (i = 0; i < br->back_num; i++) { -        tmax = node_max_byte_len(mem_env[backs[i]].mem_node, env); -        if (len < tmax) len = tmax; -      } -    } -    break; - -#ifdef USE_CALL -  case NODE_CALL: -    if (! NODE_IS_RECURSION(node)) -      len = node_max_byte_len(NODE_BODY(node), env); -    else -      len = INFINITE_LEN; -    break; -#endif - -  case NODE_QUANT: -    { -      QuantNode* qn = QUANT_(node); - -      if (qn->upper != 0) { -        len = node_max_byte_len(NODE_BODY(node), env); -        if (len != 0) { -          if (! IS_INFINITE_REPEAT(qn->upper)) -            len = distance_multiply(len, qn->upper); -          else -            len = INFINITE_LEN; -        } -      } -    } -    break; - -  case NODE_BAG: -    { -      BagNode* en = BAG_(node); -      switch (en->type) { -      case BAG_MEMORY: -        if (NODE_IS_FIXED_MAX(node)) -          len = en->max_len; -        else { -          if (NODE_IS_MARK1(node)) -            len = INFINITE_LEN; -          else { -            NODE_STATUS_ADD(node, MARK1); -            len = node_max_byte_len(NODE_BODY(node), env); -            NODE_STATUS_REMOVE(node, MARK1); - -            en->max_len = len; -            NODE_STATUS_ADD(node, FIXED_MAX); -          } -        } -        break; - -      case BAG_OPTION: -      case BAG_STOP_BACKTRACK: -        len = node_max_byte_len(NODE_BODY(node), env); -        break; -      case BAG_IF_ELSE: -        { -          OnigLen tlen, elen; - -          len = node_max_byte_len(NODE_BODY(node), env); -          if (IS_NOT_NULL(en->te.Then)) { -            tlen = node_max_byte_len(en->te.Then, env); -            len = distance_add(len, tlen); -          } -          if (IS_NOT_NULL(en->te.Else)) -            elen = node_max_byte_len(en->te.Else, env); -          else elen = 0; - -          if (elen > len) len = elen; -        } -        break; -      } -    } -    break; - -  case NODE_ANCHOR: -  case NODE_GIMMICK: -  default: -    break; -  } - -  return len; -} -  static int -check_backrefs(Node* node, ScanEnv* env) +check_backrefs(Node* node, ParseEnv* env)  {    int r; @@ -3923,7 +3789,7 @@ check_backrefs(Node* node, ScanEnv* env)        int i;        BackRefNode* br = BACKREF_(node);        int* backs = BACKREFS_P(br); -      MemEnv* mem_env = SCANENV_MEMENV(env); +      MemEnv* mem_env = PARSEENV_MEMENV(env);        for (i = 0; i < br->back_num; i++) {          if (backs[i] > env->num_mem) @@ -3944,7 +3810,7 @@ check_backrefs(Node* node, ScanEnv* env)  }  static int -set_empty_repeat_node_trav(Node* node, Node* empty, ScanEnv* env) +set_empty_repeat_node_trav(Node* node, Node* empty, ParseEnv* env)  {    int r; @@ -3998,7 +3864,7 @@ set_empty_repeat_node_trav(Node* node, Node* empty, ScanEnv* env)        if (en->type == BAG_MEMORY) {          if (NODE_IS_BACKREF(node)) {            if (IS_NOT_NULL(empty)) -            SCANENV_MEMENV(env)[en->m.regnum].empty_repeat_node = empty; +            PARSEENV_MEMENV(env)[en->m.regnum].empty_repeat_node = empty;          }        }        else if (en->type == BAG_IF_ELSE) { @@ -4034,7 +3900,7 @@ is_ancestor_node(Node* node, Node* me)  }  static void -set_empty_status_check_trav(Node* node, ScanEnv* env) +set_empty_status_check_trav(Node* node, ParseEnv* env)  {    switch (NODE_TYPE(node)) {    case NODE_LIST: @@ -4078,14 +3944,14 @@ set_empty_status_check_trav(Node* node, ScanEnv* env)      {        int i;        int* backs; -      MemEnv* mem_env = SCANENV_MEMENV(env); +      MemEnv* mem_env = PARSEENV_MEMENV(env);        BackRefNode* br = BACKREF_(node);        backs = BACKREFS_P(br);        for (i = 0; i < br->back_num; i++) {          Node* ernode = mem_env[backs[i]].empty_repeat_node;          if (IS_NOT_NULL(ernode)) {            if (! is_ancestor_node(ernode, node)) { -            MEM_STATUS_LIMIT_ON(env->reg->empty_status_mem, backs[i]); +            MEM_STATUS_LIMIT_ON(QUANT_(ernode)->empty_status_mem, backs[i]);              NODE_STATUS_ADD(ernode, EMPTY_STATUS_CHECK);              NODE_STATUS_ADD(mem_env[backs[i]].mem_node, EMPTY_STATUS_CHECK);            } @@ -4150,7 +4016,7 @@ set_parent_node_trav(Node* node, Node* parent)  #define RECURSION_INFINITE     (1<<2)  static int -infinite_recursive_call_check(Node* node, ScanEnv* env, int head) +infinite_recursive_call_check(Node* node, ParseEnv* env, int head)  {    int ret;    int r = 0; @@ -4191,6 +4057,8 @@ infinite_recursive_call_check(Node* node, ScanEnv* env, int head)      break;    case NODE_QUANT: +    if (QUANT_(node)->upper == 0) break; +      r = infinite_recursive_call_check(NODE_BODY(node), env, head);      if (r < 0) return r;      if ((r & RECURSION_MUST) != 0) { @@ -4265,7 +4133,7 @@ infinite_recursive_call_check(Node* node, ScanEnv* env, int head)  }  static int -infinite_recursive_call_check_trav(Node* node, ScanEnv* env) +infinite_recursive_call_check_trav(Node* node, ParseEnv* env)  {    int r; @@ -4403,7 +4271,7 @@ recursive_call_check(Node* node)  #define FOUND_CALLED_NODE    1  static int -recursive_call_check_trav(Node* node, ScanEnv* env, int state) +recursive_call_check_trav(Node* node, ParseEnv* env, int state)  {    int r = 0; @@ -4443,19 +4311,21 @@ recursive_call_check_trav(Node* node, ScanEnv* env, int state)        BagNode* en = BAG_(node);        if (en->type == BAG_MEMORY) { -        if (NODE_IS_CALLED(node) || (state & IN_RECURSION) != 0) { +        if (NODE_IS_CALLED(node)) { +          r = FOUND_CALLED_NODE; +          goto check_recursion; +        } +        else if ((state & IN_RECURSION) != 0) { +        check_recursion:            if (! NODE_IS_RECURSION(node)) {              NODE_STATUS_ADD(node, MARK1); -            r = recursive_call_check(NODE_BODY(node)); -            if (r != 0) { +            ret = recursive_call_check(NODE_BODY(node)); +            if (ret != 0) {                NODE_STATUS_ADD(node, RECURSION);                MEM_STATUS_ON(env->backtrack_mem, en->m.regnum);              }              NODE_STATUS_REMOVE(node, MARK1);            } - -          if (NODE_IS_CALLED(node)) -            r = FOUND_CALLED_NODE;          }        } @@ -4616,8 +4486,9 @@ reduce_string_list(Node* node, OnigEncoding enc)  #define IN_VAR_REPEAT   (1<<3)  #define IN_ZERO_REPEAT  (1<<4)  #define IN_MULTI_ENTRY  (1<<5) -#define IN_LOOK_BEHIND  (1<<6) - +#define IN_PREC_READ    (1<<6) +#define IN_LOOK_BEHIND  (1<<7) +#define IN_PEEK         (1<<8)  /* divide different length alternatives in look-behind.    (?<=A|B) ==> (?<=A)|(?<=B) @@ -4706,7 +4577,7 @@ list_reduce_in_look_behind(Node* node)  }  static int -alt_reduce_in_look_behind(Node* node, regex_t* reg, ScanEnv* env) +alt_reduce_in_look_behind(Node* node, regex_t* reg, ParseEnv* env)  {    int r; @@ -4725,10 +4596,10 @@ alt_reduce_in_look_behind(Node* node, regex_t* reg, ScanEnv* env)    return r;  } -static int tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env); +static int tune_tree(Node* node, regex_t* reg, int state, ParseEnv* env);  static int -tune_look_behind(Node* node, regex_t* reg, int state, ScanEnv* env) +tune_look_behind(Node* node, regex_t* reg, int state, ParseEnv* env)  {    int r;    int state1; @@ -5183,7 +5054,7 @@ unravel_case_fold_string(Node* node, regex_t* reg, int state)    return r;  } -#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT  static enum BodyEmptyType  quantifiers_memory_node_info(Node* node)  { @@ -5265,7 +5136,7 @@ quantifiers_memory_node_info(Node* node)    return r;  } -#endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */ +#endif /* USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT */  #ifdef USE_CALL @@ -5274,9 +5145,9 @@ quantifiers_memory_node_info(Node* node)  __inline  #endif  static int -check_call_reference(CallNode* cn, ScanEnv* env, int state) +check_call_reference(CallNode* cn, ParseEnv* env, int state)  { -  MemEnv* mem_env = SCANENV_MEMENV(env); +  MemEnv* mem_env = PARSEENV_MEMENV(env);    if (cn->by_number != 0) {      int gnum = cn->called_gnum; @@ -5393,7 +5264,7 @@ tune_call2_call(Node* node)  }  static int -tune_call(Node* node, ScanEnv* env, int state) +tune_call(Node* node, ParseEnv* env, int state)  {    int r; @@ -5539,6 +5410,8 @@ tune_called_state_call(Node* node, int state)          state |= IN_REAL_REPEAT;        if (qn->lower != qn->upper)          state |= IN_VAR_REPEAT; +      if ((state & IN_PEEK) != 0) +        NODE_STATUS_ADD(node, INPEEK);        tune_called_state_call(NODE_QUANT_BODY(qn), state);      } @@ -5551,10 +5424,12 @@ tune_called_state_call(Node* node, int state)        switch (an->type) {        case ANCR_PREC_READ_NOT:        case ANCR_LOOK_BEHIND_NOT: -        state |= IN_NOT; -        /* fall */ +        state |= (IN_NOT | IN_PEEK); +        tune_called_state_call(NODE_ANCHOR_BODY(an), state); +        break;        case ANCR_PREC_READ:        case ANCR_LOOK_BEHIND: +        state |= IN_PEEK;          tune_called_state_call(NODE_ANCHOR_BODY(an), state);          break;        default: @@ -5597,6 +5472,11 @@ tune_called_state_call(Node* node, int state)      break;    case NODE_CALL: +    if ((state & IN_PEEK) != 0) +      NODE_STATUS_ADD(node, INPEEK); +    if ((state & IN_REAL_REPEAT) != 0) +      NODE_STATUS_ADD(node, IN_REAL_REPEAT); +      tune_called_state_call(NODE_BODY(node), state);      break; @@ -5620,6 +5500,11 @@ tune_called_state(Node* node, int state)  #ifdef USE_CALL    case NODE_CALL: +    if ((state & IN_PEEK) != 0) +      NODE_STATUS_ADD(node, INPEEK); +    if ((state & IN_REAL_REPEAT) != 0) +      NODE_STATUS_ADD(node, IN_REAL_REPEAT); +      tune_called_state_call(node, state);      break;  #endif @@ -5659,6 +5544,8 @@ tune_called_state(Node* node, int state)          state |= IN_REAL_REPEAT;        if (qn->lower != qn->upper)          state |= IN_VAR_REPEAT; +      if ((state & IN_PEEK) != 0) +        NODE_STATUS_ADD(node, INPEEK);        tune_called_state(NODE_QUANT_BODY(qn), state);      } @@ -5671,10 +5558,12 @@ tune_called_state(Node* node, int state)        switch (an->type) {        case ANCR_PREC_READ_NOT:        case ANCR_LOOK_BEHIND_NOT: -        state |= IN_NOT; -        /* fall */ +        state |= (IN_NOT | IN_PEEK); +        tune_called_state(NODE_ANCHOR_BODY(an), state); +        break;        case ANCR_PREC_READ:        case ANCR_LOOK_BEHIND: +        state |= IN_PEEK;          tune_called_state(NODE_ANCHOR_BODY(an), state);          break;        default: @@ -5700,17 +5589,18 @@ tune_called_state(Node* node, int state)  __inline  #endif  static int -tune_anchor(Node* node, regex_t* reg, int state, ScanEnv* env) +tune_anchor(Node* node, regex_t* reg, int state, ParseEnv* env)  {    int r;    AnchorNode* an = ANCHOR_(node);    switch (an->type) {    case ANCR_PREC_READ: -    r = tune_tree(NODE_ANCHOR_BODY(an), reg, state, env); +    r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_PREC_READ), env);      break;    case ANCR_PREC_READ_NOT: -    r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_NOT), env); +    r = tune_tree(NODE_ANCHOR_BODY(an), reg, (state | IN_PREC_READ | IN_NOT), +                  env);      break;    case ANCR_LOOK_BEHIND: @@ -5730,7 +5620,7 @@ tune_anchor(Node* node, regex_t* reg, int state, ScanEnv* env)  __inline  #endif  static int -tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env) +tune_quant(Node* node, regex_t* reg, int state, ParseEnv* env)  {    int r;    QuantNode* qn = QUANT_(node); @@ -5746,7 +5636,7 @@ tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env)    if (IS_INFINITE_REPEAT(qn->upper) || qn->upper >= 1) {      OnigLen d = node_min_byte_len(body, env);      if (d == 0) { -#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT        qn->emptiness = quantifiers_memory_node_info(body);  #else        qn->emptiness = BODY_MAY_BE_EMPTY; @@ -5807,7 +5697,7 @@ tune_quant(Node* node, regex_t* reg, int state, ScanEnv* env)   6. expand repeated string.   */  static int -tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env) +tune_tree(Node* node, regex_t* reg, int state, ParseEnv* env)  {    int r = 0; @@ -5832,7 +5722,7 @@ tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env)      break;    case NODE_STRING: -    if (NODE_IS_IGNORECASE(node) && ! NODE_STRING_IS_CRUDE(node)) { +    if (NODE_IS_REAL_IGNORECASE(node)) {        r = unravel_case_fold_string(node, reg, state);      }      break; @@ -5918,6 +5808,9 @@ tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env)      break;    case NODE_QUANT: +    if ((state & (IN_PREC_READ | IN_LOOK_BEHIND)) != 0) +      NODE_STATUS_ADD(node, INPEEK); +      r = tune_quant(node, reg, state, env);      break; @@ -5938,6 +5831,7 @@ tune_tree(Node* node, regex_t* reg, int state, ScanEnv* env)    return r;  } +#ifndef ONIG_DONT_OPTIMIZE  static int  set_sunday_quick_search_or_bmh_skip_table(regex_t* reg, int case_expand,                                            UChar* s, UChar* end, @@ -6007,6 +5901,7 @@ set_sunday_quick_search_or_bmh_skip_table(regex_t* reg, int case_expand,    return 0;  } +#endif  #define OPT_EXACT_MAXLEN   24 @@ -6019,7 +5914,7 @@ typedef struct {    MinMaxLen        mm;    OnigEncoding     enc;    OnigCaseFoldType case_fold_flag; -  ScanEnv*         scan_env; +  ParseEnv*        scan_env;  } OptEnv;  typedef struct { @@ -6052,6 +5947,8 @@ typedef struct {  } OptNode; +#ifndef ONIG_DONT_OPTIMIZE +  static int  map_position_value(OnigEncoding enc, int i)  { @@ -6540,6 +6437,140 @@ alt_merge_node_opt_info(OptNode* to, OptNode* add, OptEnv* env)    mml_alt_merge(&to->len, &add->len);  } +static OnigLen +node_max_byte_len(Node* node, ParseEnv* env) +{ +  OnigLen len; +  OnigLen tmax; + +  len = 0; +  switch (NODE_TYPE(node)) { +  case NODE_LIST: +    do { +      tmax = node_max_byte_len(NODE_CAR(node), env); +      len = distance_add(len, tmax); +    } while (IS_NOT_NULL(node = NODE_CDR(node))); +    break; + +  case NODE_ALT: +    do { +      tmax = node_max_byte_len(NODE_CAR(node), env); +      if (len < tmax) len = tmax; +    } while (IS_NOT_NULL(node = NODE_CDR(node))); +    break; + +  case NODE_STRING: +    { +      StrNode* sn = STR_(node); +      len = (OnigLen )(sn->end - sn->s); +    } +    break; + +  case NODE_CTYPE: +  case NODE_CCLASS: +    len = ONIGENC_MBC_MAXLEN_DIST(env->enc); +    break; + +  case NODE_BACKREF: +    if (! NODE_IS_CHECKER(node)) { +      int i; +      int* backs; +      MemEnv* mem_env = PARSEENV_MEMENV(env); +      BackRefNode* br = BACKREF_(node); +      if (NODE_IS_RECURSION(node)) { +#ifdef USE_BACKREF_WITH_LEVEL +        if (NODE_IS_NEST_LEVEL(node)) { +          len = INFINITE_LEN; +        } +#endif +        break; +      } +      backs = BACKREFS_P(br); +      for (i = 0; i < br->back_num; i++) { +        tmax = node_max_byte_len(mem_env[backs[i]].mem_node, env); +        if (len < tmax) len = tmax; +      } +    } +    break; + +#ifdef USE_CALL +  case NODE_CALL: +    if (! NODE_IS_RECURSION(node)) +      len = node_max_byte_len(NODE_BODY(node), env); +    else +      len = INFINITE_LEN; +    break; +#endif + +  case NODE_QUANT: +    { +      QuantNode* qn = QUANT_(node); + +      if (qn->upper != 0) { +        len = node_max_byte_len(NODE_BODY(node), env); +        if (len != 0) { +          if (! IS_INFINITE_REPEAT(qn->upper)) +            len = distance_multiply(len, qn->upper); +          else +            len = INFINITE_LEN; +        } +      } +    } +    break; + +  case NODE_BAG: +    { +      BagNode* en = BAG_(node); +      switch (en->type) { +      case BAG_MEMORY: +        if (NODE_IS_FIXED_MAX(node)) +          len = en->max_len; +        else { +          if (NODE_IS_MARK1(node)) +            len = INFINITE_LEN; +          else { +            NODE_STATUS_ADD(node, MARK1); +            len = node_max_byte_len(NODE_BODY(node), env); +            NODE_STATUS_REMOVE(node, MARK1); + +            en->max_len = len; +            NODE_STATUS_ADD(node, FIXED_MAX); +          } +        } +        break; + +      case BAG_OPTION: +      case BAG_STOP_BACKTRACK: +        len = node_max_byte_len(NODE_BODY(node), env); +        break; +      case BAG_IF_ELSE: +        { +          OnigLen tlen, elen; + +          len = node_max_byte_len(NODE_BODY(node), env); +          if (IS_NOT_NULL(en->te.Then)) { +            tlen = node_max_byte_len(en->te.Then, env); +            len = distance_add(len, tlen); +          } +          if (IS_NOT_NULL(en->te.Else)) +            elen = node_max_byte_len(en->te.Else, env); +          else elen = 0; + +          if (elen > len) len = elen; +        } +        break; +      } +    } +    break; + +  case NODE_ANCHOR: +  case NODE_GIMMICK: +  default: +    break; +  } + +  return len; +}  #define MAX_NODE_OPT_INFO_REF_COUNT    5 @@ -6822,22 +6853,22 @@ optimize_nodes(Node* node, OptNode* opt, OptEnv* env)          {            OptEnv nenv; -          copy_opt_env(&nenv, env); -          r = optimize_nodes(NODE_BAG_BODY(en), &xo, &nenv); -          if (r == 0) { -            mml_add(&nenv.mm, &xo.len); -            concat_left_node_opt_info(enc, opt, &xo); -            if (IS_NOT_NULL(en->te.Then)) { -              r = optimize_nodes(en->te.Then, &xo, &nenv); -              if (r == 0) { -                concat_left_node_opt_info(enc, opt, &xo); +          if (IS_NOT_NULL(en->te.Else)) { +            copy_opt_env(&nenv, env); +            r = optimize_nodes(NODE_BAG_BODY(en), &xo, &nenv); +            if (r == 0) { +              mml_add(&nenv.mm, &xo.len); +              concat_left_node_opt_info(enc, opt, &xo); +              if (IS_NOT_NULL(en->te.Then)) { +                r = optimize_nodes(en->te.Then, &xo, &nenv); +                if (r == 0) { +                  concat_left_node_opt_info(enc, opt, &xo); +                }                } -            } -            if (IS_NOT_NULL(en->te.Else)) { -              r = optimize_nodes(en->te.Else, &xo, env); -              if (r == 0) -                alt_merge_node_opt_info(opt, &xo, env); +                r = optimize_nodes(en->te.Else, &xo, env); +                if (r == 0) +                  alt_merge_node_opt_info(opt, &xo, env);              }            }          } @@ -6930,7 +6961,7 @@ static void print_optimize_info(FILE* f, regex_t* reg);  #endif  static int -set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) +set_optimize_info_from_tree(Node* node, regex_t* reg, ParseEnv* scan_env)  {    int r;    OptNode opt; @@ -6985,6 +7016,7 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)  #endif    return r;  } +#endif /* ONIG_DONT_OPTIMIZE */  static void  clear_optimize_info(regex_t* reg) @@ -7031,14 +7063,43 @@ static void print_enc_string(FILE* fp, OnigEncoding enc,        s++;      }    } +} -  fprintf(fp, "/\n"); +static void +print_options(FILE* fp, OnigOptionType o) +{ +  if ((o & ONIG_OPTION_IGNORECASE) != 0)      fprintf(fp, " IGNORECASE"); +  if ((o & ONIG_OPTION_EXTEND) != 0)          fprintf(fp, " EXTEND"); +  if ((o & ONIG_OPTION_MULTILINE) != 0)       fprintf(fp, " MULTILINE"); +  if ((o & ONIG_OPTION_SINGLELINE) != 0)      fprintf(fp, " SINGLELINE"); +  if ((o & ONIG_OPTION_FIND_LONGEST) != 0)    fprintf(fp, " FIND_LONGEST"); +  if ((o & ONIG_OPTION_FIND_NOT_EMPTY) != 0)  fprintf(fp, " FIND_NOT_EMPTY"); +  if ((o & ONIG_OPTION_NEGATE_SINGLELINE) != 0)  fprintf(fp, " NEGATE_SINGLELINE"); +  if ((o & ONIG_OPTION_DONT_CAPTURE_GROUP) != 0) fprintf(fp, " DONT_CAPTURE_GROUP"); +  if ((o & ONIG_OPTION_CAPTURE_GROUP) != 0)   fprintf(fp, " CAPTURE_GROUP"); +  if ((o & ONIG_OPTION_NOTBOL) != 0)          fprintf(fp, " NOTBOL"); +  if ((o & ONIG_OPTION_NOTEOL) != 0)          fprintf(fp, " NOTEOL"); +  if ((o & ONIG_OPTION_POSIX_REGION) != 0)    fprintf(fp, " POSIX_REGION"); +  if ((o & ONIG_OPTION_CHECK_VALIDITY_OF_STRING) != 0) fprintf(fp, " CHECK_VALIDITY_OF_STRING"); +  if ((o & ONIG_OPTION_IGNORECASE_IS_ASCII) != 0) fprintf(fp, " IGNORECASE_IS_ASCII"); +  if ((o & ONIG_OPTION_WORD_IS_ASCII) != 0)   fprintf(fp, " WORD_IS_ASCII"); +  if ((o & ONIG_OPTION_DIGIT_IS_ASCII) != 0)  fprintf(fp, " DIGIT_IS_ASCII"); +  if ((o & ONIG_OPTION_SPACE_IS_ASCII) != 0)  fprintf(fp, " SPACE_IS_ASCII"); +  if ((o & ONIG_OPTION_POSIX_IS_ASCII) != 0)  fprintf(fp, " POSIX_IS_ASCII"); +  if ((o & ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER) != 0) fprintf(fp, " TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER"); +  if ((o & ONIG_OPTION_TEXT_SEGMENT_WORD) != 0) fprintf(fp, " TEXT_SEGMENT_WORD"); +  if ((o & ONIG_OPTION_NOT_BEGIN_STRING) != 0) fprintf(fp, " NOT_BIGIN_STRING"); +  if ((o & ONIG_OPTION_NOT_END_STRING) != 0)   fprintf(fp, " NOT_END_STRING"); +  if ((o & ONIG_OPTION_NOT_BEGIN_POSITION) != 0) fprintf(fp, " NOT_BEGIN_POSITION"); +  if ((o & ONIG_OPTION_CALLBACK_EACH_MATCH) != 0) fprintf(fp, " CALLBACK_EACH_MATCH");  }  #endif /* ONIG_DEBUG */  #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH) +#ifndef ONIG_DONT_OPTIMIZE +  static void  print_distance_range(FILE* f, OnigLen a, OnigLen b)  { @@ -7161,7 +7222,8 @@ print_optimize_info(FILE* f, regex_t* reg)      }    }  } -#endif +#endif /* ONIG_DONT_OPTIMIZE */ +#endif /* defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH) */  extern RegexExt* @@ -7259,93 +7321,150 @@ static void print_tree P_((FILE* f, Node* node));  extern int onig_init_for_match_at(regex_t* reg); -extern int -onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, -             OnigErrorInfo* einfo) -{ -  int r; -  Node*  root; -  ScanEnv  scan_env; +static int parse_and_tune(regex_t* reg, const UChar* pattern, +  const UChar* pattern_end, ParseEnv *scan_env, Node** rroot, +  OnigErrorInfo* einfo  #ifdef USE_CALL -  UnsetAddrList  uslist = {0}; +  , UnsetAddrList* uslist  #endif +) +{ +  int r; +  Node* root; -  root = 0; +  root = NULL_NODE;    if (IS_NOT_NULL(einfo)) {      einfo->enc = reg->enc;      einfo->par = (UChar* )NULL;    } -#ifdef ONIG_DEBUG -  fprintf(DBGFP, "\nPATTERN: /"); -  print_enc_string(DBGFP, reg->enc, pattern, pattern_end); -#endif - -  if (reg->ops_alloc == 0) { -    r = ops_init(reg, OPS_INIT_SIZE); -    if (r != 0) goto end; -  } -  else -    reg->ops_used = 0; - -  r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env); +  r = onig_parse_tree(&root, pattern, pattern_end, reg, scan_env);    if (r != 0) goto err;    r = reduce_string_list(root, reg->enc);    if (r != 0) goto err;    /* mixed use named group and no-named group */ -  if (scan_env.num_named > 0 && -      IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && +  if (scan_env->num_named > 0 && +      IS_SYNTAX_BV(scan_env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&        ! OPTON_CAPTURE_GROUP(reg->options)) { -    if (scan_env.num_named != scan_env.num_mem) -      r = disable_noname_group_capture(&root, reg, &scan_env); +    if (scan_env->num_named != scan_env->num_mem) +      r = disable_noname_group_capture(&root, reg, scan_env);      else        r = numbered_ref_check(root);      if (r != 0) goto err;    } -  r = check_backrefs(root, &scan_env); +  r = check_backrefs(root, scan_env);    if (r != 0) goto err;  #ifdef USE_CALL -  if (scan_env.num_call > 0) { -    r = unset_addr_list_init(&uslist, scan_env.num_call); +  if (scan_env->num_call > 0) { +    r = unset_addr_list_init(uslist, scan_env->num_call);      if (r != 0) goto err; -    scan_env.unset_addr_list = &uslist; -    r = tune_call(root, &scan_env, 0); +    scan_env->unset_addr_list = uslist; +    r = tune_call(root, scan_env, 0);      if (r != 0) goto err_unset;      r = tune_call2(root);      if (r != 0) goto err_unset; -    r = recursive_call_check_trav(root, &scan_env, 0); +    r = recursive_call_check_trav(root, scan_env, 0);      if (r  < 0) goto err_unset; -    r = infinite_recursive_call_check_trav(root, &scan_env); +    r = infinite_recursive_call_check_trav(root, scan_env);      if (r != 0) goto err_unset;      tune_called_state(root, 0);    } -  reg->num_call = scan_env.num_call; +  reg->num_call = scan_env->num_call;  #endif  #ifdef ONIG_DEBUG_PARSE -  fprintf(DBGFP, "MAX PARSE DEPTH: %d\n", scan_env.max_parse_depth); -  fprintf(DBGFP, "TREE (parsed)\n"); -  print_tree(DBGFP, root); -  fprintf(DBGFP, "\n"); +  fprintf(DBGFP, "MAX PARSE DEPTH: %d\n", scan_env->max_parse_depth);  #endif -  r = tune_tree(root, reg, 0, &scan_env); -  if (r != 0) goto err_unset; +  r = tune_tree(root, reg, 0, scan_env); +  if (r != 0) { +#ifdef ONIG_DEBUG_PARSE +    fprintf(DBGFP, "TREE (error in tune)\n"); +    print_tree(DBGFP, root); +    fprintf(DBGFP, "\n"); +#endif +    goto err_unset; +  } -  if (scan_env.backref_num != 0) { +  if (scan_env->backref_num != 0) {      set_parent_node_trav(root, NULL_NODE); -    r = set_empty_repeat_node_trav(root, NULL_NODE, &scan_env); +    r = set_empty_repeat_node_trav(root, NULL_NODE, scan_env);      if (r != 0) goto err_unset; -    set_empty_status_check_trav(root, &scan_env); +    set_empty_status_check_trav(root, scan_env);    } +  *rroot = root; +  return r; + + err_unset: +#ifdef USE_CALL +  if (scan_env->num_call > 0) { +    unset_addr_list_end(uslist); +  } +#endif + err: +  if (IS_NOT_NULL(scan_env->error)) { +    if (IS_NOT_NULL(einfo)) { +      einfo->par     = scan_env->error; +      einfo->par_end = scan_env->error_end; +    } +  } + +  onig_node_free(root); +  if (IS_NOT_NULL(scan_env->mem_env_dynamic)) +    xfree(scan_env->mem_env_dynamic); + +  *rroot = NULL_NODE; +  return r; +} + +extern int +onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, +             OnigErrorInfo* einfo) +{ +  int r; +  Node* root; +  ParseEnv scan_env; +#ifdef USE_CALL +  UnsetAddrList uslist = {0}; +#endif + +#ifdef ONIG_DEBUG +  fprintf(DBGFP, "\nPATTERN: /"); +  print_enc_string(DBGFP, reg->enc, pattern, pattern_end); +  fprintf(DBGFP, "/\n"); +  fprintf(DBGFP, "OPTIONS:"); +  print_options(DBGFP, reg->options); +  fprintf(DBGFP, "\n"); +#endif + +  if (reg->ops_alloc == 0) { +    r = ops_init(reg, OPS_INIT_SIZE); +    if (r != 0) { +      if (IS_NOT_NULL(einfo)) { +        einfo->enc = reg->enc; +        einfo->par = (UChar* )NULL; +      } +      return r; +    } +  } +  else +    reg->ops_used = 0; + +  r = parse_and_tune(reg, pattern, pattern_end, &scan_env, &root, einfo +#ifdef USE_CALL +                     , &uslist +#endif +                    ); +  if (r != 0) return r; +  #ifdef ONIG_DEBUG_PARSE    fprintf(DBGFP, "TREE (after tune)\n");    print_tree(DBGFP, root); @@ -7377,7 +7496,14 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,    clear_optimize_info(reg);  #ifndef ONIG_DONT_OPTIMIZE    r = set_optimize_info_from_tree(root, reg, &scan_env); -  if (r != 0) goto err_unset; +  if (r != 0)  { +#ifdef USE_CALL +    if (scan_env.num_call > 0) { +      unset_addr_list_end(&uslist); +    } +#endif +    goto err; +  }  #endif    if (IS_NOT_NULL(scan_env.mem_env_dynamic)) { @@ -7407,6 +7533,9 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,      }  #endif +    r = ops_resize(reg, reg->ops_used); +    if (r != ONIG_NORMAL) goto err; +      set_addr_in_repeat_range(reg);      if ((reg->push_mem_end != 0) @@ -7449,15 +7578,8 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,    onig_init_for_match_at(reg);  #endif - end:    return r; - err_unset: -#ifdef USE_CALL -  if (scan_env.num_call > 0) { -    unset_addr_list_end(&uslist); -  } -#endif   err:    if (IS_NOT_NULL(scan_env.error)) {      if (IS_NOT_NULL(einfo)) { @@ -7513,6 +7635,12 @@ onig_reg_init(regex_t* reg, OnigOptionType option, OnigCaseFoldType case_fold_fl    else      option |= syntax->options; +  if ((option & ONIG_OPTION_IGNORECASE_IS_ASCII) != 0) { +    case_fold_flag &= ~(INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR | +                        ONIGENC_CASE_FOLD_TURKISH_AZERI); +    case_fold_flag |= ONIGENC_CASE_FOLD_ASCII_ONLY; +  } +    (reg)->enc            = enc;    (reg)->options        = option;    (reg)->syntax         = syntax; @@ -7703,15 +7831,145 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)    return onig_is_code_in_cc_len(len, code, cc);  } + +#define MANY_REPEAT_OF_ANYCHAR   20 + +typedef enum { +  MJ_NO     = 0, +  MJ_YES    = 1, +  MJ_IGNORE = 2, +} MJ_RESULT; + +static MJ_RESULT +mostly_just_anychar(Node* node, int in_reluctant) +{ +  MJ_RESULT r; + +  r = MJ_NO; +  switch (NODE_TYPE(node)) { +  case NODE_LIST: +    { +      int found = FALSE; +      do { +        r = mostly_just_anychar(NODE_CAR(node), in_reluctant); +        if (r == MJ_NO) break; +        if (r == MJ_YES) found = TRUE; +      } while (IS_NOT_NULL(node = NODE_CDR(node))); +      if (r == MJ_IGNORE) { +        if (found == TRUE) r = MJ_YES; +      } +    } +    break; + +  case NODE_ALT: +    r = MJ_IGNORE; +    do { +      r = mostly_just_anychar(NODE_CAR(node), in_reluctant); +      if (r == MJ_YES) break; +    } while (IS_NOT_NULL(node = NODE_CDR(node))); +    break; + +  case NODE_QUANT: +    { +      QuantNode* qn = QUANT_(node); + +      if (qn->upper == 0) +        r = MJ_IGNORE; +      else { +        if (in_reluctant == FALSE) { +          if (qn->greedy != 0 && +              (! IS_INFINITE_REPEAT(qn->upper) && +               qn->upper <= MANY_REPEAT_OF_ANYCHAR)) { +            in_reluctant = TRUE; +          } +        } +        r = mostly_just_anychar(NODE_BODY(node), in_reluctant); +      } +    } +    break; + +  case NODE_ANCHOR: +    switch (ANCHOR_(node)->type) { +    case ANCR_PREC_READ: +    case ANCR_PREC_READ_NOT: +    case ANCR_LOOK_BEHIND: +    case ANCR_LOOK_BEHIND_NOT: +    case ANCR_TEXT_SEGMENT_BOUNDARY: /* \y */ +      r = MJ_IGNORE; +      break; +    default: +      break; +    } +    break; + +  case NODE_BAG: +    { +      BagNode* en = BAG_(node); + +      if (en->type == BAG_IF_ELSE) { +        if (IS_NOT_NULL(en->te.Then)) { +          r = mostly_just_anychar(en->te.Then, in_reluctant); +          if (r == MJ_YES) break; +        } +        if (IS_NOT_NULL(en->te.Else)) { +          r = mostly_just_anychar(en->te.Else, in_reluctant); +        } +      } +      else { +        r = mostly_just_anychar(NODE_BODY(node), in_reluctant); +      } +    } +    break; + +  case NODE_CTYPE: +    if (CTYPE_(node)->ctype == CTYPE_ANYCHAR) +      r = MJ_YES; +    else +      r = MJ_NO; +    break; + +  case NODE_STRING: +    if (NODE_STRING_LEN(node) == 0) { +      r = MJ_IGNORE; +      break; +    } +    /* fall */ +  case NODE_CCLASS: +    r = MJ_NO; +    break; + +#ifdef USE_CALL +  case NODE_CALL: +    /* ignore call */ +#endif +  case NODE_BACKREF: +  case NODE_GIMMICK: +    r = MJ_IGNORE; +    break; + +  default: +    break; +  } + +  return r; +} + +#define MAX_CALLS_IN_DETECT   10 +  typedef struct {    int prec_read;    int look_behind; +  int backref;    int backref_with_level;    int call; +  int anychar_reluctant_many; +  int empty_check_nest_level; +  int max_empty_check_nest_level; +  int heavy_element;  } SlowElementCount;  static int -node_detect_can_be_slow(Node* node, SlowElementCount* ct) +detect_can_be_slow(Node* node, SlowElementCount* ct, int ncall, int calls[])  {    int r; @@ -7720,13 +7978,45 @@ node_detect_can_be_slow(Node* node, SlowElementCount* ct)    case NODE_LIST:    case NODE_ALT:      do { -      r = node_detect_can_be_slow(NODE_CAR(node), ct); +      r = detect_can_be_slow(NODE_CAR(node), ct, ncall, calls);        if (r != 0) return r;      } while (IS_NOT_NULL(node = NODE_CDR(node)));      break;    case NODE_QUANT: -    r = node_detect_can_be_slow(NODE_BODY(node), ct); +    { +      int prev_heavy_element; +      QuantNode* qn; +      Node* body; + +      qn = QUANT_(node); +      body = NODE_BODY(node); + +      if (qn->emptiness != BODY_IS_NOT_EMPTY) { +        prev_heavy_element = ct->heavy_element; +        ct->empty_check_nest_level++; +        if (ct->empty_check_nest_level > ct->max_empty_check_nest_level) +          ct->max_empty_check_nest_level = ct->empty_check_nest_level; +      } +      else if (IS_INFINITE_REPEAT(qn->upper) || +               qn->upper > MANY_REPEAT_OF_ANYCHAR) { +        MJ_RESULT mr = mostly_just_anychar(body, (qn->greedy == 0)); +        if (mr == MJ_YES) +          ct->anychar_reluctant_many++; +      } + +      r = detect_can_be_slow(body, ct, ncall, calls); + +      if (qn->emptiness != BODY_IS_NOT_EMPTY) { +        if (NODE_IS_INPEEK(node)) { +          if (ct->empty_check_nest_level > 2) { +            if (prev_heavy_element == ct->heavy_element) +              ct->heavy_element++; +          } +        } +        ct->empty_check_nest_level--; +      } +    }      break;    case NODE_ANCHOR: @@ -7744,23 +8034,23 @@ node_detect_can_be_slow(Node* node, SlowElementCount* ct)      }      if (ANCHOR_HAS_BODY(ANCHOR_(node))) -      r = node_detect_can_be_slow(NODE_BODY(node), ct); +      r = detect_can_be_slow(NODE_BODY(node), ct, ncall, calls);      break;    case NODE_BAG:      {        BagNode* en = BAG_(node); -      r = node_detect_can_be_slow(NODE_BODY(node), ct); +      r = detect_can_be_slow(NODE_BODY(node), ct, ncall, calls);        if (r != 0) return r;        if (en->type == BAG_IF_ELSE) {          if (IS_NOT_NULL(en->te.Then)) { -          r = node_detect_can_be_slow(en->te.Then, ct); +          r = detect_can_be_slow(en->te.Then, ct, ncall, calls);            if (r != 0) return r;          }          if (IS_NOT_NULL(en->te.Else)) { -          r = node_detect_can_be_slow(en->te.Else, ct); +          r = detect_can_be_slow(en->te.Else, ct, ncall, calls);            if (r != 0) return r;          }        } @@ -7771,12 +8061,44 @@ node_detect_can_be_slow(Node* node, SlowElementCount* ct)    case NODE_BACKREF:      if (NODE_IS_NEST_LEVEL(node))        ct->backref_with_level++; +    else +      ct->backref++;      break;  #endif  #ifdef USE_CALL    case NODE_CALL: -    ct->call++; +    { +      int i; +      int found; +      int gnum; + +      gnum = CALL_(node)->called_gnum; +      ct->call++; + +      if (NODE_IS_RECURSION(node) && NODE_IS_INPEEK(node) && +          NODE_IS_IN_REAL_REPEAT(node)) { +         ct->heavy_element += 10; +      } + +      found = FALSE; +      for (i = 0; i < ncall; i++) { +        if (gnum == calls[i]) { +          found = TRUE; +          break; +        } +      } + +      if (! found) { +        if (ncall + 1 < MAX_CALLS_IN_DETECT) { +          calls[ncall] = gnum; +          r = detect_can_be_slow(NODE_BODY(node), ct, ncall + 1, calls); +        } +        else { +          ct->heavy_element++; +        } +      } +    }      break;  #endif @@ -7795,8 +8117,12 @@ onig_detect_can_be_slow_pattern(const UChar* pattern,    int r;    regex_t* reg;    Node* root; -  ScanEnv scan_env; +  ParseEnv scan_env;    SlowElementCount count; +  int calls[MAX_CALLS_IN_DETECT]; +#ifdef USE_CALL +  UnsetAddrList  uslist = {0}; +#endif    reg = (regex_t* )xmalloc(sizeof(regex_t));    if (IS_NULL(reg)) return ONIGERR_MEMORY; @@ -7807,25 +8133,44 @@ onig_detect_can_be_slow_pattern(const UChar* pattern,      return r;    } -  root = 0; -  r = onig_parse_tree(&root, pattern, pattern_end, reg, &scan_env); +  r = parse_and_tune(reg, pattern, pattern_end, &scan_env, &root, NULL +#ifdef USE_CALL +                     , &uslist +#endif +                    ); +  if (r != 0) goto err; + +#ifdef USE_CALL +  if (scan_env.num_call > 0) { +    unset_addr_list_end(&uslist); +  } +#endif + +  count.prec_read          = 0; +  count.look_behind        = 0; +  count.backref            = 0; +  count.backref_with_level = 0; +  count.call               = 0; +  count.anychar_reluctant_many     = 0; +  count.empty_check_nest_level     = 0; +  count.max_empty_check_nest_level = 0; +  count.heavy_element = 0; + +  r = detect_can_be_slow(root, &count, 0, calls);    if (r == 0) { -    count.prec_read          = 0; -    count.look_behind        = 0; -    count.backref_with_level = 0; -    count.call               = 0; - -    r = node_detect_can_be_slow(root, &count); -    if (r == 0) { -      int n = count.prec_read + count.look_behind -            + count.backref_with_level + count.call; -      r = n; -    } +    int n = count.prec_read + count.look_behind +          + count.backref + count.backref_with_level + count.call +          + count.anychar_reluctant_many; +    if (count.heavy_element != 0) +      n += count.heavy_element * 10; + +    r = n;    }    if (IS_NOT_NULL(scan_env.mem_env_dynamic))      xfree(scan_env.mem_env_dynamic); + err:    onig_node_free(root);    onig_free(reg);    return r; @@ -7853,6 +8198,8 @@ Indent(FILE* f, int indent)  static void  print_indent_tree(FILE* f, Node* node, int indent)  { +  static char* emptiness_name[] = { "", " empty", " empty_mem", " empty_rec" }; +    int i;    NodeType type;    UChar* p; @@ -8019,69 +8366,83 @@ print_indent_tree(FILE* f, Node* node, int indent)        fprintf(f, "<call:%p>", node);        fprintf(f, " num: %d, name", cn->called_gnum);        p_string(f, cn->name_end - cn->name, cn->name); +      if (NODE_IS_RECURSION(node)) fprintf(f, ", recursion"); +      if (NODE_IS_INPEEK(node))    fprintf(f, ", in-peek"); +      if (NODE_IS_IN_REAL_REPEAT(node)) fprintf(f, ", in-real-repeat");      }      break;  #endif    case NODE_QUANT: -    fprintf(f, "<quantifier:%p>{%d,%d}%s%s\n", node, -            QUANT_(node)->lower, QUANT_(node)->upper, -            (QUANT_(node)->greedy ? "" : "?"), -            QUANT_(node)->include_referred == 0 ? "" : " referred"); -    print_indent_tree(f, NODE_BODY(node), indent + add); +    { +      fprintf(f, "<quantifier:%p>{%d,%d}%s%s%s", node, +              QUANT_(node)->lower, QUANT_(node)->upper, +              (QUANT_(node)->greedy ? "" : "?"), +              QUANT_(node)->include_referred == 0 ? "" : " referred", +              emptiness_name[QUANT_(node)->emptiness]); +      if (NODE_IS_INPEEK(node)) fprintf(f, ", in-peek"); +      fprintf(f, "\n"); +      print_indent_tree(f, NODE_BODY(node), indent + add); +    }      break;    case NODE_BAG: -    fprintf(f, "<bag:%p> ", node); -    if (BAG_(node)->type == BAG_IF_ELSE) { -      Node* Then; -      Node* Else; -      BagNode* bn; - -      bn = BAG_(node); -      fprintf(f, "if-else\n"); -      print_indent_tree(f, NODE_BODY(node), indent + add); +    { +      BagNode* bn = BAG_(node); +      fprintf(f, "<bag:%p> ", node); +      if (bn->type == BAG_IF_ELSE) { +        Node* Then; +        Node* Else; + +        fprintf(f, "if-else\n"); +        print_indent_tree(f, NODE_BODY(node), indent + add); + +        Then = bn->te.Then; +        Else = bn->te.Else; +        if (IS_NULL(Then)) { +          Indent(f, indent + add); +          fprintf(f, "THEN empty\n"); +        } +        else +          print_indent_tree(f, Then, indent + add); -      Then = bn->te.Then; -      Else = bn->te.Else; -      if (IS_NULL(Then)) { -        Indent(f, indent + add); -        fprintf(f, "THEN empty\n"); +        if (IS_NULL(Else)) { +          Indent(f, indent + add); +          fprintf(f, "ELSE empty\n"); +        } +        else +          print_indent_tree(f, Else, indent + add);        } -      else -        print_indent_tree(f, Then, indent + add); +      else { +        switch (bn->type) { +        case BAG_OPTION: +          fprintf(f, "option:%d", bn->o.options); +          break; +        case BAG_MEMORY: +          fprintf(f, "memory:%d", bn->m.regnum); +          if (NODE_IS_CALLED(node)) { +            fprintf(f, ", called"); +            if (NODE_IS_RECURSION(node)) +              fprintf(f, ", recursion"); +          } +          else if (NODE_IS_REFERENCED(node)) +            fprintf(f, ", referenced"); -      if (IS_NULL(Else)) { -        Indent(f, indent + add); -        fprintf(f, "ELSE empty\n"); +          if (NODE_IS_FIXED_ADDR(node)) +            fprintf(f, ", fixed-addr"); +          if ((bn->m.called_state & IN_PEEK) != 0) +            fprintf(f, ", in-peek"); +          break; +        case BAG_STOP_BACKTRACK: +          fprintf(f, "stop-bt"); +          break; +        default: +          break; +        } +        fprintf(f, "\n"); +        print_indent_tree(f, NODE_BODY(node), indent + add);        } -      else -        print_indent_tree(f, Else, indent + add); - -      break;      } - -    switch (BAG_(node)->type) { -    case BAG_OPTION: -      fprintf(f, "option:%d", BAG_(node)->o.options); -      break; -    case BAG_MEMORY: -      fprintf(f, "memory:%d", BAG_(node)->m.regnum); -      if (NODE_IS_CALLED(node)) -        fprintf(f, ", called"); -      else if (NODE_IS_REFERENCED(node)) -        fprintf(f, ", referenced"); -      if (NODE_IS_FIXED_ADDR(node)) -        fprintf(f, ", fixed-addr"); -      break; -    case BAG_STOP_BACKTRACK: -      fprintf(f, "stop-bt"); -      break; -    default: -      break; -    } -    fprintf(f, "\n"); -    print_indent_tree(f, NODE_BODY(node), indent + add);      break;    case NODE_GIMMICK: diff --git a/src/regenc.c b/src/regenc.c index 27e4549..84afd1e 100644 --- a/src/regenc.c +++ b/src/regenc.c @@ -2,7 +2,7 @@    regenc.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2019  K.Kosako + * Copyright (c) 2002-2020  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -569,6 +569,9 @@ onigenc_apply_all_case_fold_with_map(int map_size,    r = onigenc_ascii_apply_all_case_fold(flag, f, arg);    if (r != 0) return r; +  if (CASE_FOLD_IS_ASCII_ONLY(flag)) +    return 0; +    for (i = 0; i < map_size; i++) {      code = map[i].to;      r = (*f)(map[i].from, &code, 1, arg); @@ -588,7 +591,7 @@ onigenc_apply_all_case_fold_with_map(int map_size,  extern int  onigenc_get_case_fold_codes_by_str_with_map(int map_size,      const OnigPairCaseFoldCodes map[], -    int ess_tsett_flag, OnigCaseFoldType flag ARG_UNUSED, +    int ess_tsett_flag, OnigCaseFoldType flag,      const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])  {    int i, j, n; @@ -596,7 +599,8 @@ onigenc_get_case_fold_codes_by_str_with_map(int map_size,    if (0x41 <= *p && *p <= 0x5a) { /* A - Z */      if (*p == LARGE_S && ess_tsett_flag != 0 && end > p + 1 -        && (*(p+1) == LARGE_S || *(p+1) == SMALL_S)) { /* SS */ +        && (*(p+1) == LARGE_S || *(p+1) == SMALL_S) /* SS */ +        && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) {      ss_combination:        items[0].byte_len = 2;        items[0].code_len = 1; @@ -625,7 +629,8 @@ onigenc_get_case_fold_codes_by_str_with_map(int map_size,    }    else if (0x61 <= *p && *p <= 0x7a) { /* a - z */      if (*p == SMALL_S && ess_tsett_flag != 0 && end > p + 1 -        && (*(p+1) == SMALL_S || *(p+1) == LARGE_S)) { +        && (*(p+1) == SMALL_S || *(p+1) == LARGE_S) +        && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) {        goto ss_combination;      } @@ -634,7 +639,8 @@ onigenc_get_case_fold_codes_by_str_with_map(int map_size,      items[0].code[0] = (OnigCodePoint )(*p - 0x20);      return 1;    } -  else if (*p == 0xdf && ess_tsett_flag != 0) { +  else if (*p == 0xdf && ess_tsett_flag != 0 +           && CASE_FOLD_IS_NOT_ASCII_ONLY(flag)) {      items[0].byte_len = 1;      items[0].code_len = 2;      items[0].code[0] = (OnigCodePoint )'s'; @@ -660,6 +666,9 @@ onigenc_get_case_fold_codes_by_str_with_map(int map_size,    else {      int i; +    if (CASE_FOLD_IS_ASCII_ONLY(flag)) +      return 0; +      for (i = 0; i < map_size; i++) {        if (*p == map[i].from) {          items[0].byte_len = 1; diff --git a/src/regenc.h b/src/regenc.h index d183b97..d0b447d 100644 --- a/src/regenc.h +++ b/src/regenc.h @@ -142,6 +142,10 @@ struct PropertyNameCtype {  #define ENC_GET_SKIP_OFFSET(enc) \    (((enc)->flag & ENC_FLAG_SKIP_OFFSET_MASK)>>2) +#define CASE_FOLD_IS_ASCII_ONLY(flag) \ +  (((flag) & ONIGENC_CASE_FOLD_ASCII_ONLY) != 0) +#define CASE_FOLD_IS_NOT_ASCII_ONLY(flag) \ +  (((flag) & ONIGENC_CASE_FOLD_ASCII_ONLY) == 0)  /* for encoding system implementation (internal) */  extern int onigenc_end(void); @@ -202,12 +206,12 @@ extern int onigenc_wb_is_break_position P_((OnigEncoding enc, UChar* p, UChar* p  #define FOLDS1_UNFOLDS_NUM(i)  (OnigUnicodeFolds1[(i)+1])  #define FOLDS2_UNFOLDS_NUM(i)  (OnigUnicodeFolds2[(i)+2])  #define FOLDS3_UNFOLDS_NUM(i)  (OnigUnicodeFolds3[(i)+3]) -#define FOLDS1_UNFOLDS(i)      (OnigUnicodeFolds1 + (i) + 2) -#define FOLDS2_UNFOLDS(i)      (OnigUnicodeFolds2 + (i) + 3) -#define FOLDS3_UNFOLDS(i)      (OnigUnicodeFolds3 + (i) + 4) -#define FOLDS1_NEXT_INDEX(i)   ((i) + 2 + OnigUnicodeFolds1[(i)+1]) -#define FOLDS2_NEXT_INDEX(i)   ((i) + 3 + OnigUnicodeFolds2[(i)+2]) -#define FOLDS3_NEXT_INDEX(i)   ((i) + 4 + OnigUnicodeFolds3[(i)+3]) +#define FOLDS1_UNFOLDS(i)      (FOLDS1_FOLD(i) + 2) +#define FOLDS2_UNFOLDS(i)      (FOLDS2_FOLD(i) + 3) +#define FOLDS3_UNFOLDS(i)      (FOLDS3_FOLD(i) + 4) +#define FOLDS1_NEXT_INDEX(i)   ((i) + 2 + FOLDS1_UNFOLDS_NUM(i)) +#define FOLDS2_NEXT_INDEX(i)   ((i) + 3 + FOLDS2_UNFOLDS_NUM(i)) +#define FOLDS3_NEXT_INDEX(i)   ((i) + 4 + FOLDS3_UNFOLDS_NUM(i))  #define FOLDS_FOLD_ADDR_BUK(buk, addr) do {\    if ((buk)->fold_len == 1)\ diff --git a/src/regerror.c b/src/regerror.c index dc1c8b6..18a5bdd 100644 --- a/src/regerror.c +++ b/src/regerror.c @@ -2,7 +2,7 @@    regerror.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2020  K.Kosako + * Copyright (c) 2002-2021  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -146,6 +146,8 @@ onig_error_code_to_format(int code)      p = "too big wide-char value"; break;    case ONIGERR_TOO_LONG_WIDE_CHAR_VALUE:      p = "too long wide-char value"; break; +  case ONIGERR_UNDEFINED_OPERATOR: +    p = "undefined operator"; break;    case ONIGERR_INVALID_CODE_POINT_VALUE:      p = "invalid code point value"; break;    case ONIGERR_EMPTY_GROUP_NAME: @@ -190,6 +192,8 @@ onig_error_code_to_format(int code)      p = "not supported encoding combination"; break;    case ONIGERR_INVALID_COMBINATION_OF_OPTIONS:      p = "invalid combination of options"; break; +  case ONIGERR_VERY_INEFFICIENT_PATTERN: +    p = "very inefficient pattern"; break;    case ONIGERR_LIBRARY_IS_NOT_INITIALIZED:      p = "library is not initialized"; break; diff --git a/src/regexec.c b/src/regexec.c index bb6b474..a3cf60a 100644 --- a/src/regexec.c +++ b/src/regexec.c @@ -2,7 +2,7 @@    regexec.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2020  K.Kosako + * Copyright (c) 2002-2021  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -54,6 +54,13 @@    (MEM_STATUS_AT((reg)->push_mem_end, (idx)) != 0 ? \     STACK_AT(mem_end_stk[idx].i)->u.mem.pstr : mem_end_stk[idx].s) +#ifdef _MSC_VER +#define DIST_CAST(d)   (size_t )(d) +#else +#define DIST_CAST(d)   (d) +#endif + +  static int forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start, UChar* range, UChar** low, UChar** high);  static int @@ -76,11 +83,12 @@ struct OnigMatchParamStruct {    unsigned long   retry_limit_in_match;    unsigned long   retry_limit_in_search;  #endif + +  void*           callout_user_data; /* used in callback each match */  #ifdef USE_CALLOUT    OnigCalloutFunc progress_callout_of_contents;    OnigCalloutFunc retraction_callout_of_contents;    int             match_at_call_counter; -  void*           callout_user_data;    CalloutData*    callout_data;    int             callout_data_alloc_num;  #endif @@ -143,12 +151,8 @@ onig_set_retraction_callout_of_match_param(OnigMatchParam* param, OnigCalloutFun  extern int  onig_set_callout_user_data_of_match_param(OnigMatchParam* param, void* user_data)  { -#ifdef USE_CALLOUT    param->callout_user_data = user_data;    return ONIG_NORMAL; -#else -  return ONIG_NO_SUPPORT_CONFIG; -#endif  } @@ -873,6 +877,23 @@ onig_get_capture_tree(OnigRegion* region)  }  #endif /* USE_CAPTURE_HISTORY */ + +static OnigCallbackEachMatchFunc CallbackEachMatch; + +extern OnigCallbackEachMatchFunc +onig_get_callback_each_match(void) +{ +  return CallbackEachMatch; +} + +extern int +onig_set_callback_each_match(OnigCallbackEachMatchFunc f) +{ +  CallbackEachMatch = f; +  return ONIG_NORMAL; +} + +  extern void  onig_region_clear(OnigRegion* region)  { @@ -1238,7 +1259,7 @@ struct OnigCalloutArgsStruct {  #ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE  #define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start, mpv) do { \    (msa).stack_p  = (void* )0;\ -  (msa).options  = (arg_option);\ +  (msa).options  = (arg_option)|(reg)->options;\    (msa).region   = (arg_region);\    (msa).start    = (arg_start);\    (msa).match_stack_limit  = (mpv)->match_stack_limit;\ @@ -1251,7 +1272,7 @@ struct OnigCalloutArgsStruct {  #else  #define MATCH_ARG_INIT(msa, reg, arg_option, arg_region, arg_start, mpv) do { \    (msa).stack_p  = (void* )0;\ -  (msa).options  = (arg_option);\ +  (msa).options  = (arg_option)|(reg)->options;\    (msa).region   = (arg_region);\    (msa).start    = (arg_start);\    (msa).match_stack_limit  = (mpv)->match_stack_limit;\ @@ -1405,6 +1426,7 @@ onig_set_subexp_call_limit_in_search(unsigned long n)  #endif +  #ifdef USE_CALLOUT  static OnigCalloutFunc DefaultProgressCallout;  static OnigCalloutFunc DefaultRetractionCallout; @@ -1452,11 +1474,12 @@ onig_initialize_match_param(OnigMatchParam* mp)    mp->retry_limit_in_search = RetryLimitInSearch;  #endif +  mp->callout_user_data = 0; +  #ifdef USE_CALLOUT    mp->progress_callout_of_contents   = DefaultProgressCallout;    mp->retraction_callout_of_contents = DefaultRetractionCallout;    mp->match_at_call_counter  = 0; -  mp->callout_user_data      = 0;    mp->callout_data           = 0;    mp->callout_data_alloc_num = 0;  #endif @@ -1532,13 +1555,26 @@ onig_get_callout_data_dont_clear_old(regex_t* reg, OnigMatchParam* mp,    t = d->slot[slot].type;    if (IS_NOT_NULL(type)) *type = t;    if (IS_NOT_NULL(val))  *val  = d->slot[slot].val; -  return (t == ONIG_TYPE_VOID ? 1 : ONIG_NORMAL); +  return (t == ONIG_TYPE_VOID ? ONIG_VALUE_IS_NOT_SET : ONIG_NORMAL); +} + +extern int +onig_get_callout_data_by_tag_dont_clear_old(regex_t* reg, +  OnigMatchParam* mp, const UChar* tag, const UChar* tag_end, int slot, +  OnigType* type, OnigValue* val) +{ +  int num; + +  num = onig_get_callout_num_by_tag(reg, tag, tag_end); +  if (num < 0)  return num; +  if (num == 0) return ONIGERR_INVALID_CALLOUT_TAG_NAME; + +  return onig_get_callout_data_dont_clear_old(reg, mp, num, slot, type, val);  }  extern int -onig_get_callout_data_by_callout_args_self_dont_clear_old(OnigCalloutArgs* args, -                                                          int slot, OnigType* type, -                                                          OnigValue* val) +onig_get_callout_data_by_callout_args_self_dont_clear_old( +  OnigCalloutArgs* args, int slot, OnigType* type, OnigValue* val)  {    return onig_get_callout_data_dont_clear_old(args->regex, args->msa->mp,                                                args->num, slot, type, val); @@ -1563,7 +1599,7 @@ onig_get_callout_data(regex_t* reg, OnigMatchParam* mp,    t = d->slot[slot].type;    if (IS_NOT_NULL(type)) *type = t;    if (IS_NOT_NULL(val))  *val  = d->slot[slot].val; -  return (t == ONIG_TYPE_VOID ? 1 : ONIG_NORMAL); +  return (t == ONIG_TYPE_VOID ? ONIG_VALUE_IS_NOT_SET : ONIG_NORMAL);  }  extern int @@ -2171,65 +2207,90 @@ stack_double(int* is_alloca, char** arg_alloc_base,    }\  } while (0) -#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT -#define STACK_EMPTY_CHECK_MEM(isnull, sid, s, reg) do {\ -  StackType* k;\ -  GET_EMPTY_CHECK_START(sid, k);\ -  if (k->u.empty_check.pstr != (s)) {\ +#ifdef USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT +#define STACK_EMPTY_CHECK_MEM(isnull, sid, empty_status_mem, s, reg) do {\ +  StackType* klow;\ +  GET_EMPTY_CHECK_START(sid, klow);\ +  if (klow->u.empty_check.pstr != (s)) {\ +  stack_empty_check_mem_not_empty:\      (isnull) = 0;\    }\    else {\ -    UChar* endp;\ +    StackType *k, *kk;\ +    MemStatusType ms = (empty_status_mem);\      (isnull) = 1;\ -    while (k < stk) {\ -      if (k->type == STK_MEM_START &&\ -        MEM_STATUS_LIMIT_AT((reg)->empty_status_mem, k->zid)) {\ -        STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\ -        if (endp == 0) {\ -          (isnull) = 0; break;\ -        }\ -        else if (STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr != endp) {\ -          (isnull) = 0; break;\ -        }\ -        else if (endp != s) {\ -          (isnull) = -1; /* empty, but position changed */ \ +    k = stk;\ +    while (k > klow) {\ +      k--;\ +      if (k->type == STK_MEM_END && MEM_STATUS_LIMIT_AT(ms, k->zid)) {\ +        kk = klow;\ +        while (kk < k) {\ +          if (kk->type == STK_MEM_START && kk->zid == k->zid) {\ +            if (kk->u.mem.prev_end.i == INVALID_STACK_INDEX || \ +                ((STACK_AT(kk->u.mem.prev_end.i)->u.mem.pstr != k->u.mem.pstr || STACK_AT(kk->u.mem.prev_start.i)->u.mem.pstr != STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr) && (STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr != k->u.mem.pstr || STACK_AT(kk->u.mem.prev_start.i)->u.mem.pstr != STACK_AT(kk->u.mem.prev_end.i)->u.mem.pstr))) {\ +              goto stack_empty_check_mem_not_empty;\ +            }\ +            else {\ +              ms &= ~((MemStatusType )1 << k->zid);\ +              break;\ +            }\ +          }\ +          kk++;\          }\ +        if (ms == 0) break;\        }\ -      k++;\      }\    }\  } while(0) -#define STACK_EMPTY_CHECK_MEM_REC(isnull,sid,s,reg) do {\ +#define STACK_EMPTY_CHECK_MEM_REC(isnull,sid,empty_status_mem,s,reg) do {\    int level = 0;\ -  StackType* k = stk;\ +  StackType* klow = stk;\    while (1) {\ -    k--;\ -    STACK_BASE_CHECK(k, "STACK_EMPTY_CHECK_MEM_REC");\ -    if (k->type == STK_EMPTY_CHECK_START) {\ -      if (k->zid == (sid)) {\ +    klow--;\ +    STACK_BASE_CHECK(klow, "STACK_EMPTY_CHECK_MEM_REC");\ +    if (klow->type == STK_EMPTY_CHECK_START) {\ +      if (klow->zid == (sid)) {\          if (level == 0) {\ -          if (k->u.empty_check.pstr != (s)) {\ +          if (klow->u.empty_check.pstr != (s)) {\ +          stack_empty_check_mem_rec_not_empty:\              (isnull) = 0;\              break;\            }\            else {\ -            UChar* endp;\ +            StackType *k, *kk;\ +            MemStatusType ms;\              (isnull) = 1;\ -            while (k < stk) {\ -              if (k->type == STK_MEM_START) {\ -                if (level == 0 && \ -                  MEM_STATUS_LIMIT_AT((reg)->empty_status_mem, k->zid) !=0) {\ -                  STACK_MEM_START_GET_PREV_END_ADDR(k, reg, endp);\ -                  if (endp == 0) {\ -                    (isnull) = 0; break;\ -                  }\ -                  else if (STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr != endp) { \ -                    (isnull) = 0; break;\ -                  }\ -                  else if (endp != s) {\ -                    (isnull) = -1; /* empty, but position changed */\ +            if ((empty_status_mem) == 0) break;\ +            ms = (empty_status_mem);\ +            k = stk;\ +            while (k > klow) {\ +              k--;\ +              if (k->type == STK_MEM_END) {\ +                if (level == 0 && MEM_STATUS_LIMIT_AT(ms, k->zid)) {\ +                  kk = klow;\ +                  kk++;\ +                  while (kk < k) {\ +                    if (kk->type == STK_MEM_START && kk->zid == k->zid) {\ +                      if (kk->u.mem.prev_end.i == INVALID_STACK_INDEX || \ +                          ((STACK_AT(kk->u.mem.prev_end.i)->u.mem.pstr != k->u.mem.pstr || STACK_AT(kk->u.mem.prev_start.i)->u.mem.pstr != STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr) && (STACK_AT(k->u.mem.prev_start.i)->u.mem.pstr != k->u.mem.pstr || STACK_AT(kk->u.mem.prev_start.i)->u.mem.pstr != STACK_AT(kk->u.mem.prev_end.i)->u.mem.pstr))) {\ +                        goto stack_empty_check_mem_rec_not_empty;\ +                      }\ +                      else {\ +                        ms &= ~((MemStatusType )1 << k->zid);\ +                        break;\ +                      }\ +                    }\ +                    else if (kk->type == STK_EMPTY_CHECK_START) {\ +                      if (kk->zid == (sid)) level++;\ +                    }\ +                    else if (kk->type == STK_EMPTY_CHECK_END) {\ +                      if (kk->zid == (sid)) level--;\ +                    }\ +                    kk++;\                    }\ +                  level = 0;\ +                  if (ms == 0) break;\                  }\                }\                else if (k->type == STK_EMPTY_CHECK_START) {\ @@ -2238,7 +2299,6 @@ stack_double(int* is_alloca, char** arg_alloc_base,                else if (k->type == STK_EMPTY_CHECK_END) {\                  if (k->zid == (sid)) level--;\                }\ -              k++;\              }\              break;\            }\ @@ -2248,8 +2308,8 @@ stack_double(int* is_alloca, char** arg_alloc_base,          }\        }\      }\ -    else if (k->type == STK_EMPTY_CHECK_END) {\ -      if (k->zid == (sid)) level++;\ +    else if (klow->type == STK_EMPTY_CHECK_END) {\ +      if (klow->zid == (sid)) level++;\      }\    }\  } while(0) @@ -2274,7 +2334,7 @@ stack_double(int* is_alloca, char** arg_alloc_base,      }\    }\  } while(0) -#endif /* USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT */ +#endif /* USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT */  #define STACK_GET_REPEAT_COUNT_SEARCH(sid, c) do {\    StackType* k = stk;\ @@ -2888,6 +2948,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,    StackType *stkp; /* used as any purpose. */    StkPtrType *mem_start_stk, *mem_end_stk;    UChar* keep; +  OnigRegion* region;  #ifdef USE_REPEAT_AND_EMPTY_CHECK_LOCAL_VAR    StackIndex *repeat_stk; @@ -2905,8 +2966,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,    unsigned long subexp_call_counters[MAX_SUBEXP_CALL_COUNTERS];  #endif +  OnigOptionType options;    Operation* p = reg->ops; -  OnigOptionType option = reg->options;    OnigEncoding encode = reg->enc;    OnigCaseFoldType case_fold_flag = reg->case_fold_flag; @@ -2936,6 +2997,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,    }  #endif +  options = msa->options; +  #ifdef USE_CALLOUT    msa->mp->match_at_call_counter++;  #endif @@ -2976,102 +3039,113 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,    BYTECODE_INTERPRETER_START {      CASE_OP(END)        n = (int )(s - sstart); +      if (n == 0 && OPTON_FIND_NOT_EMPTY(options)) { +        best_len = ONIG_MISMATCH; +        goto fail; /* for retry */ +      } +        if (n > best_len) { -        OnigRegion* region;  #ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE -        if (OPTON_FIND_LONGEST(option)) { +        if (OPTON_FIND_LONGEST(options)) {            if (n > msa->best_len) {              msa->best_len = n;              msa->best_s   = (UChar* )sstart; -            goto set_region;            } -          else -            goto end_best_len; +          else { +            if (s >= in_right_range && msa->best_s == sstart) { +              best_len = msa->best_len; /* end of find */ +            } +            else { +              SOP_OUT; +              goto fail; /* for retry */ +            } +          }          } -#endif +        else { +          best_len = n; +        } +#else          best_len = n; +#endif +      } -      set_region: -        region = msa->region; -        if (region) { -          if (keep > s) keep = s; +      /* set region */ +      region = msa->region; +      if (region) { +        if (keep > s) keep = s;  #ifdef USE_POSIX_API -          if (OPTON_POSIX_REGION(msa->options)) { -            posix_regmatch_t* rmt = (posix_regmatch_t* )region; - -            rmt[0].rm_so = (regoff_t )(keep - str); -            rmt[0].rm_eo = (regoff_t )(s    - str); -            for (i = 1; i <= num_mem; i++) { -              if (mem_end_stk[i].i != INVALID_STACK_INDEX) { -                rmt[i].rm_so = (regoff_t )(STACK_MEM_START(reg, i) - str); -                rmt[i].rm_eo = (regoff_t )(STACK_MEM_END(reg, i)   - str); -              } -              else { -                rmt[i].rm_so = rmt[i].rm_eo = ONIG_REGION_NOTPOS; -              } +        if (OPTON_POSIX_REGION(options)) { +          posix_regmatch_t* rmt = (posix_regmatch_t* )region; + +          rmt[0].rm_so = (regoff_t )(keep - str); +          rmt[0].rm_eo = (regoff_t )(s    - str); +          for (i = 1; i <= num_mem; i++) { +            if (mem_end_stk[i].i != INVALID_STACK_INDEX) { +              rmt[i].rm_so = (regoff_t )(STACK_MEM_START(reg, i) - str); +              rmt[i].rm_eo = (regoff_t )(STACK_MEM_END(reg, i)   - str); +            } +            else { +              rmt[i].rm_so = rmt[i].rm_eo = ONIG_REGION_NOTPOS;              }            } -          else { +        } +        else {  #endif /* USE_POSIX_API */ -            region->beg[0] = (int )(keep - str); -            region->end[0] = (int )(s    - str); -            for (i = 1; i <= num_mem; i++) { -              if (mem_end_stk[i].i != INVALID_STACK_INDEX) { -                region->beg[i] = (int )(STACK_MEM_START(reg, i) - str); -                region->end[i] = (int )(STACK_MEM_END(reg, i)   - str); -              } -              else { -                region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; -              } +          region->beg[0] = (int )(keep - str); +          region->end[0] = (int )(s    - str); +          for (i = 1; i <= num_mem; i++) { +            if (mem_end_stk[i].i != INVALID_STACK_INDEX) { +              region->beg[i] = (int )(STACK_MEM_START(reg, i) - str); +              region->end[i] = (int )(STACK_MEM_END(reg, i)   - str); +            } +            else { +              region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS;              } +          }  #ifdef USE_CAPTURE_HISTORY -            if (reg->capture_history != 0) { -              int r; -              OnigCaptureTreeNode* node; +          if (reg->capture_history != 0) { +            OnigCaptureTreeNode* node; -              if (IS_NULL(region->history_root)) { -                region->history_root = node = history_node_new(); -                CHECK_NULL_RETURN_MEMERR(node); -              } -              else { -                node = region->history_root; -                history_tree_clear(node); -              } +            if (IS_NULL(region->history_root)) { +              region->history_root = node = history_node_new(); +              CHECK_NULL_RETURN_MEMERR(node); +            } +            else { +              node = region->history_root; +              history_tree_clear(node); +            } -              node->group = 0; -              node->beg   = (int )(keep - str); -              node->end   = (int )(s    - str); +            node->group = 0; +            node->beg   = (int )(keep - str); +            node->end   = (int )(s    - str); -              stkp = stk_base; -              r = make_capture_history_tree(region->history_root, &stkp, -                                            stk, (UChar* )str, reg); -              if (r < 0) MATCH_AT_ERROR_RETURN(r); -            } +            stkp = stk_base; +            i = make_capture_history_tree(region->history_root, &stkp, +                                          stk, (UChar* )str, reg); +            if (i < 0) MATCH_AT_ERROR_RETURN(i); +          }  #endif /* USE_CAPTURE_HISTORY */  #ifdef USE_POSIX_API -          } /* else OPTON_POSIX_REGION() */ +        } /* else OPTON_POSIX_REGION() */  #endif -        } /* if (region) */ -      } /* n > best_len */ +      } /* if (region) */ -#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE -    end_best_len: -#endif        SOP_OUT; -      if (OPTON_FIND_CONDITION(option)) { -        if (OPTON_FIND_NOT_EMPTY(option) && s == sstart) { +      if (OPTON_CALLBACK_EACH_MATCH(options) && +          IS_NOT_NULL(CallbackEachMatch)) { +        i = CallbackEachMatch(str, end, sstart, region, +                              msa->mp->callout_user_data); +        if (i < 0) MATCH_AT_ERROR_RETURN(i); + +#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE +        if (! OPTON_FIND_LONGEST(options)) +#endif            best_len = ONIG_MISMATCH; -          goto fail; /* for retry */ -        } -        if (OPTON_FIND_LONGEST(option)) { -          if (s >= in_right_range && msa->best_s == sstart) -            best_len = msa->best_len; -          else -            goto fail; /* for retry */ -        } + +        goto fail;        }        /* default behavior: return first-matching result. */ @@ -3564,23 +3638,23 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,      CASE_OP(BEGIN_BUF)        if (! ON_STR_BEGIN(s)) goto fail; -      if (OPTON_NOTBOL(msa->options)) goto fail; -      if (OPTON_NOT_BEGIN_STRING(msa->options)) goto fail; +      if (OPTON_NOTBOL(options)) goto fail; +      if (OPTON_NOT_BEGIN_STRING(options)) goto fail;        INC_OP;        JUMP_OUT;      CASE_OP(END_BUF)        if (! ON_STR_END(s)) goto fail; -      if (OPTON_NOTEOL(msa->options)) goto fail; -      if (OPTON_NOT_END_STRING(msa->options)) goto fail; +      if (OPTON_NOTEOL(options)) goto fail; +      if (OPTON_NOT_END_STRING(options)) goto fail;        INC_OP;        JUMP_OUT;      CASE_OP(BEGIN_LINE)        if (ON_STR_BEGIN(s)) { -        if (OPTON_NOTBOL(msa->options)) goto fail; +        if (OPTON_NOTBOL(options)) goto fail;          INC_OP;          JUMP_OUT;        } @@ -3599,7 +3673,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,          UChar* sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s);          if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) {  #endif -          if (OPTON_NOTEOL(msa->options)) goto fail; +          if (OPTON_NOTEOL(options)) goto fail;            INC_OP;            JUMP_OUT;  #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE @@ -3624,8 +3698,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,          UChar* sprev = (UChar* )onigenc_get_prev_char_head(encode, str, s);          if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) {  #endif -          if (OPTON_NOTEOL(msa->options)) goto fail; -          if (OPTON_NOT_END_STRING(msa->options)) goto fail; +          if (OPTON_NOTEOL(options)) goto fail; +          if (OPTON_NOT_END_STRING(options)) goto fail;            INC_OP;            JUMP_OUT;  #ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE @@ -3634,8 +3708,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,        }        else if (ONIGENC_IS_MBC_NEWLINE(encode, s, end) &&                 ON_STR_END(s + enclen(encode, s))) { -        if (OPTON_NOTEOL(msa->options)) goto fail; -        if (OPTON_NOT_END_STRING(msa->options)) goto fail; +        if (OPTON_NOTEOL(options)) goto fail; +        if (OPTON_NOT_END_STRING(options)) goto fail;          INC_OP;          JUMP_OUT;        } @@ -3644,8 +3718,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,          UChar* ss = s + enclen(encode, s);          ss += enclen(encode, ss);          if (ON_STR_END(ss)) { -          if (OPTON_NOTEOL(msa->options)) goto fail; -          if (OPTON_NOT_END_STRING(msa->options)) goto fail; +          if (OPTON_NOTEOL(options)) goto fail; +          if (OPTON_NOT_END_STRING(options)) goto fail;            INC_OP;            JUMP_OUT;          } @@ -3657,7 +3731,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,        switch (p->check_position.type) {        case CHECK_POSITION_SEARCH_START:          if (s != msa->start) goto fail; -        if (OPTON_NOT_BEGIN_POSITION(msa->options)) goto fail; +        if (OPTON_NOT_BEGIN_POSITION(options)) goto fail;          break;        case CHECK_POSITION_CURRENT_RIGHT_RANGE:          if (s != right_range) goto fail; @@ -3924,13 +3998,13 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,        }        JUMP_OUT; -#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT +#ifdef USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT      CASE_OP(EMPTY_CHECK_END_MEMST)        {          int is_empty;          mem = p->empty_check_end.mem;  /* mem: null check id */ -        STACK_EMPTY_CHECK_MEM(is_empty, mem, s, reg); +        STACK_EMPTY_CHECK_MEM(is_empty, mem, p->empty_check_end.empty_status_mem, s, reg);          INC_OP;          if (is_empty) {  #ifdef ONIG_DEBUG_MATCH @@ -3949,8 +4023,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,          int is_empty;          mem = p->empty_check_end.mem;  /* mem: null check id */ -#ifdef USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT -        STACK_EMPTY_CHECK_MEM_REC(is_empty, mem, s, reg); +#ifdef USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT +        STACK_EMPTY_CHECK_MEM_REC(is_empty, mem, p->empty_check_end.empty_status_mem, s, reg);  #else          STACK_EMPTY_CHECK_REC(is_empty, mem, s);  #endif @@ -4109,6 +4183,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,          }        } +#ifdef ONIG_DEBUG_CALL +      fprintf(DBGFP, "CALL: id:%d, at:%ld, level:%lu\n", p->call.called_mem, s - str, subexp_call_nest_counter); +#endif        addr = p->call.addr;        INC_OP; STACK_PUSH_CALL_FRAME(p);        p = reg->ops + addr; @@ -4425,7 +4502,7 @@ regset_search_body_position_lead(OnigRegSet* set,      sr[i].state = SRS_DEAD;      if (reg->optimize != OPTIMIZE_NONE) {        if (reg->dist_max != INFINITE_LEN) { -        if (end - range > reg->dist_max) +        if (DIST_CAST(end - range) > reg->dist_max)            sch_range = (UChar* )range + reg->dist_max;          else            sch_range = (UChar* )end; @@ -4609,7 +4686,7 @@ onig_regset_search_with_param(OnigRegSet* set,    if (set->n == 0)      return ONIG_MISMATCH; -  if (OPTON_POSIX_REGION(option)) +  if (OPTON_POSIX_REGION(option) || OPTON_CALLBACK_EACH_MATCH(option))      return ONIGERR_INVALID_ARGUMENT;    r = 0; @@ -4884,7 +4961,7 @@ sunday_quick_search_step_forward(regex_t* reg,                                   const UChar* text_range)  {    const UChar *s, *se, *t, *p, *end; -  const UChar *tail; +  const UChar *tail, *next;    int skip, tlen1;    int map_offset;    OnigEncoding enc; @@ -4921,9 +4998,11 @@ sunday_quick_search_step_forward(regex_t* reg,        s += enclen(enc, s);      } while ((s - t) < skip && s < end);  #else -    s += skip; -    if (s < end) -      s = onigenc_get_right_adjust_char_head(enc, text, s); +    next = s + skip; +    if (next < end) +      s = onigenc_get_right_adjust_char_head(enc, s, next); +    else +      break;  #endif    } @@ -5086,7 +5165,7 @@ forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start,    p = start;    if (reg->dist_min != 0) { -    if (end - p <= reg->dist_min) +    if (DIST_CAST(end - p) <= reg->dist_min)        return 0; /* fail */      if (ONIGENC_IS_SINGLEBYTE(reg->enc)) { @@ -5119,7 +5198,7 @@ forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start,    }    if (p && p < range) { -    if (p - start < reg->dist_min) { +    if (DIST_CAST(p - start) < reg->dist_min) {      retry_gate:        pprev = p;        p += enclen(reg->enc, p); @@ -5164,7 +5243,7 @@ forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start,      }      else {        if (reg->dist_max != INFINITE_LEN) { -        if (p - str < reg->dist_max) { +        if (DIST_CAST(p - str) < reg->dist_max) {            *low = (UChar* )str;          }          else { @@ -5175,7 +5254,7 @@ forward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* start,          }        }        /* no needs to adjust *high, *high is used as range check only */ -      if (p - str < reg->dist_min) +      if (DIST_CAST(p - str) < reg->dist_min)          *high = (UChar* )str;        else          *high = p - reg->dist_min; @@ -5260,13 +5339,13 @@ backward_search(regex_t* reg, const UChar* str, const UChar* end, UChar* s,      }      if (reg->dist_max != INFINITE_LEN) { -      if (p - str < reg->dist_max) +      if (DIST_CAST(p - str) < reg->dist_max)          *low = (UChar* )str;        else          *low = p - reg->dist_max;        if (reg->dist_min != 0) { -        if (p - str < reg->dist_min) +        if (DIST_CAST(p - str) < reg->dist_min)            *high = (UChar* )str;          else            *high = p - reg->dist_min; @@ -5410,13 +5489,13 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end,        if (range > start) {          if (reg->anc_dist_max != INFINITE_LEN && -            min_semi_end - start > reg->anc_dist_max) { +            DIST_CAST(min_semi_end - start) > reg->anc_dist_max) {            start = min_semi_end - reg->anc_dist_max;            if (start < end)              start = onigenc_get_right_adjust_char_head(reg->enc, str, start);          } -        if (max_semi_end - (range - 1) < reg->anc_dist_min) { -          if (max_semi_end - str + 1 < reg->anc_dist_min) +        if (DIST_CAST(max_semi_end - (range - 1)) < reg->anc_dist_min) { +          if (DIST_CAST(max_semi_end - str + 1) < reg->anc_dist_min)              goto mismatch_no_msa;            else              range = max_semi_end - reg->anc_dist_min + 1; @@ -5428,11 +5507,11 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end,        }        else {          if (reg->anc_dist_max != INFINITE_LEN && -            min_semi_end - range > reg->anc_dist_max) { +            DIST_CAST(min_semi_end - range) > reg->anc_dist_max) {            range = min_semi_end - reg->anc_dist_max;          } -        if (max_semi_end - start < reg->anc_dist_min) { -          if (max_semi_end - str < reg->anc_dist_min) +        if (DIST_CAST(max_semi_end - start) < reg->anc_dist_min) { +          if (DIST_CAST(max_semi_end - str) < reg->anc_dist_min)              goto mismatch_no_msa;            else {              start = max_semi_end - reg->anc_dist_min; @@ -5503,7 +5582,7 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end,          if (reg->dist_max == INFINITE_LEN)            sch_range = (UChar* )end;          else { -          if ((end - range) < reg->dist_max) +          if (DIST_CAST(end - range) < reg->dist_max)              sch_range = (UChar* )end;            else {              sch_range = (UChar* )range + reg->dist_max; @@ -5579,14 +5658,14 @@ search_in_range(regex_t* reg, const UChar* str, const UChar* end,        else          adjrange = (UChar* )end; -      if (end - range > reg->dist_min) +      if (DIST_CAST(end - range) > reg->dist_min)          min_range = range + reg->dist_min;        else          min_range = end;        if (reg->dist_max != INFINITE_LEN) {          do { -          if (end - s > reg->dist_max) +          if (DIST_CAST(end - s) > reg->dist_max)              sch_start = s + reg->dist_max;            else {              sch_start = onigenc_get_prev_char_head(reg->enc, str, end); @@ -5887,8 +5966,10 @@ onig_regset_add(OnigRegSet* set, regex_t* reg)  {    OnigRegion* region; +#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE    if (OPTON_FIND_LONGEST(reg->options))      return ONIGERR_INVALID_ARGUMENT; +#endif    if (set->n != 0 && reg->enc != set->enc)      return ONIGERR_INVALID_ARGUMENT; @@ -5933,8 +6014,10 @@ onig_regset_replace(OnigRegSet* set, int at, regex_t* reg)      set->n--;    }    else { +#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE      if (OPTON_FIND_LONGEST(reg->options))        return ONIGERR_INVALID_ARGUMENT; +#endif      if (set->n > 1 && reg->enc != set->enc)        return ONIGERR_INVALID_ARGUMENT; @@ -6573,7 +6656,7 @@ onig_builtin_monitor(OnigCalloutArgs* args, void* user_data)      tag_len = tag_end - tag_start;      if (tag_len >= sizeof(buf)) tag_len = sizeof(buf) - 1; -    for (i = 0; i < tag_len; i++) buf[i] = tag_start[i]; +    for (i = 0; i < (int )tag_len; i++) buf[i] = tag_start[i];      buf[tag_len] = '\0';    } diff --git a/src/regint.h b/src/regint.h index 74a5c61..9856a96 100644 --- a/src/regint.h +++ b/src/regint.h @@ -4,7 +4,7 @@    regint.h -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2020  K.Kosako + * Copyright (c) 2002-2021  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -35,6 +35,7 @@  /* #define ONIG_DEBUG_SEARCH */  /* #define ONIG_DEBUG_MATCH */  /* #define ONIG_DEBUG_MATCH_COUNTER */ +/* #define ONIG_DEBUG_CALL */  /* #define ONIG_DONT_OPTIMIZE */  /* for byte-code statistical data. */ @@ -42,7 +43,8 @@  #if defined(ONIG_DEBUG_PARSE) || defined(ONIG_DEBUG_MATCH) || \      defined(ONIG_DEBUG_SEARCH) || defined(ONIG_DEBUG_COMPILE) || \ -    defined(ONIG_DEBUG_MATCH_COUNTER) || defined(ONIG_DEBUG_STATISTICS) +    defined(ONIG_DEBUG_MATCH_COUNTER) || defined(ONIG_DEBUG_CALL) || \ +    defined(ONIG_DEBUG_STATISTICS)  #ifndef ONIG_DEBUG  #define ONIG_DEBUG  #define DBGFP   stderr @@ -61,7 +63,7 @@  #define USE_CALL  #define USE_CALLOUT  #define USE_BACKREF_WITH_LEVEL        /* \k<name+n>, \k<name-n> */ -#define USE_STUBBORN_CHECK_CAPTURES_IN_EMPTY_REPEAT     /* /(?:()|())*\2/ */ +#define USE_RIGID_CHECK_CAPTURES_IN_EMPTY_REPEAT        /* /(?:()|())*\2/ */  #define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE     /* /\n$/ =~ "\n" */  #define USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR  #define USE_RETRY_LIMIT @@ -388,10 +390,10 @@ typedef unsigned int  MemStatusType;    (IS_CODE_DIGIT_ASCII(enc,code) ? DIGITVAL(code) \     : (ONIGENC_IS_CODE_UPPER(enc,code) ? (code) - 'A' + 10 : (code) - 'a' + 10)) +#define OPTON_CALLBACK_EACH_MATCH(option) \ +        ((option) & ONIG_OPTION_CALLBACK_EACH_MATCH)  #define OPTON_FIND_LONGEST(option)   ((option) & ONIG_OPTION_FIND_LONGEST)  #define OPTON_FIND_NOT_EMPTY(option) ((option) & ONIG_OPTION_FIND_NOT_EMPTY) -#define OPTON_FIND_CONDITION(option) ((option) & \ -          (ONIG_OPTION_FIND_LONGEST | ONIG_OPTION_FIND_NOT_EMPTY))  #define OPTON_NEGATE_SINGLELINE(option) ((option) & \                                            ONIG_OPTION_NEGATE_SINGLELINE)  #define OPTON_DONT_CAPTURE_GROUP(option) ((option) & \ @@ -406,8 +408,6 @@ typedef unsigned int  MemStatusType;  #define OPTON_NOT_END_STRING(option)      ((option) & ONIG_OPTION_NOT_END_STRING)  #define OPTON_NOT_BEGIN_POSITION(option)  ((option) & ONIG_OPTION_NOT_BEGIN_POSITION) -#define DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag) \ -  ((case_fold_flag) & ~INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR)  #define INFINITE_REPEAT         -1  #define IS_INFINITE_REPEAT(n)   ((n) == INFINITE_REPEAT) @@ -437,81 +437,6 @@ typedef Bits*     BitSetRef;  #define BITSET_CLEAR_BIT(bs, pos)   BS_ROOM(bs,pos) &= ~(BS_BIT(pos))  #define BITSET_INVERT_BIT(bs, pos)  BS_ROOM(bs,pos) ^= BS_BIT(pos) -/* bytes buffer */ -typedef struct _BBuf { -  UChar* p; -  unsigned int used; -  unsigned int alloc; -} BBuf; - -#define BB_INIT(buf,size)    bbuf_init((BBuf* )(buf), (size)) - -#define BB_EXPAND(buf,low) do{\ -  do { (buf)->alloc *= 2; } while ((buf)->alloc < (unsigned int )low);\ -  (buf)->p = (UChar* )xrealloc((buf)->p, (buf)->alloc);\ -  if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\ -} while (0) - -#define BB_ENSURE_SIZE(buf,size) do{\ -  unsigned int new_alloc = (buf)->alloc;\ -  while (new_alloc < (unsigned int )(size)) { new_alloc *= 2; }\ -  if ((buf)->alloc != new_alloc) {\ -    (buf)->p = (UChar* )xrealloc((buf)->p, new_alloc);\ -    if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\ -    (buf)->alloc = new_alloc;\ -  }\ -} while (0) - -#define BB_WRITE(buf,pos,bytes,n) do{\ -  int used = (pos) + (n);\ -  if ((buf)->alloc < (unsigned int )used) BB_EXPAND((buf),used);\ -  xmemcpy((buf)->p + (pos), (bytes), (n));\ -  if ((buf)->used < (unsigned int )used) (buf)->used = used;\ -} while (0) - -#define BB_WRITE1(buf,pos,byte) do{\ -  int used = (pos) + 1;\ -  if ((buf)->alloc < (unsigned int )used) BB_EXPAND((buf),used);\ -  (buf)->p[(pos)] = (byte);\ -  if ((buf)->used < (unsigned int )used) (buf)->used = used;\ -} while (0) - -#define BB_ADD(buf,bytes,n)       BB_WRITE((buf),(buf)->used,(bytes),(n)) -#define BB_ADD1(buf,byte)         BB_WRITE1((buf),(buf)->used,(byte)) -#define BB_GET_ADD_ADDRESS(buf)   ((buf)->p + (buf)->used) -#define BB_GET_OFFSET_POS(buf)    ((buf)->used) - -/* from < to */ -#define BB_MOVE_RIGHT(buf,from,to,n) do {\ -  if ((unsigned int )((to)+(n)) > (buf)->alloc) BB_EXPAND((buf),(to) + (n));\ -  xmemmove((buf)->p + (to), (buf)->p + (from), (n));\ -  if ((unsigned int )((to)+(n)) > (buf)->used) (buf)->used = (to) + (n);\ -} while (0) - -/* from > to */ -#define BB_MOVE_LEFT(buf,from,to,n) do {\ -  xmemmove((buf)->p + (to), (buf)->p + (from), (n));\ -} while (0) - -/* from > to */ -#define BB_MOVE_LEFT_REDUCE(buf,from,to) do {\ -  xmemmove((buf)->p + (to), (buf)->p + (from), (buf)->used - (from));\ -  (buf)->used -= (from - to);\ -} while (0) - -#define BB_INSERT(buf,pos,bytes,n) do {\ -  if (pos >= (buf)->used) {\ -    BB_WRITE(buf,pos,bytes,n);\ -  }\ -  else {\ -    BB_MOVE_RIGHT((buf),(pos),(pos) + (n),((buf)->used - (pos)));\ -    xmemcpy((buf)->p + (pos), (bytes), (n));\ -  }\ -} while (0) - -#define BB_GET_BYTE(buf, pos) (buf)->p[(pos)] - -  /* has body */  #define ANCR_PREC_READ        (1<<0)  #define ANCR_PREC_READ_NOT    (1<<1) @@ -884,6 +809,7 @@ typedef struct {      } empty_check_start;      struct {        MemNumType mem; +      MemStatusType empty_status_mem;      } empty_check_end; /* EMPTY_CHECK_END, EMPTY_CHECK_END_MEMST, EMPTY_CHECK_END_MEMST_PUSH */      struct {        RelAddrType addr; @@ -922,7 +848,7 @@ typedef struct {      } update_var;      struct {        AbsAddrType addr; -#ifdef ONIG_DEBUG_MATCH_COUNTER +#if defined(ONIG_DEBUG_MATCH_COUNTER) || defined(ONIG_DEBUG_CALL)        MemNumType called_mem;  #endif      } call; @@ -977,7 +903,6 @@ struct re_pattern_buffer {    MemStatusType  capture_history;  /* (?@...) flag (1-31) */    MemStatusType  push_mem_start;   /* need backtrack flag */    MemStatusType  push_mem_end;     /* need backtrack flag */ -  MemStatusType  empty_status_mem;    int            stack_pop_level;    int            repeat_range_alloc;    RepeatRange*   repeat_range; diff --git a/src/regparse.c b/src/regparse.c index dd2824b..938a569 100644 --- a/src/regparse.c +++ b/src/regparse.c @@ -2,7 +2,7 @@    regparse.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2020  K.Kosako + * Copyright (c) 2002-2021  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -159,6 +159,75 @@ OnigSyntaxType OnigSyntaxRuby = {  OnigSyntaxType*  OnigDefaultSyntax = ONIG_SYNTAX_ONIGURUMA; + +#define BB_INIT(buf,size)    bbuf_init((BBuf* )(buf), (size)) + +#define BB_EXPAND(buf,low) do{\ +  do { (buf)->alloc *= 2; } while ((buf)->alloc < (unsigned int )low);\ +  (buf)->p = (UChar* )xrealloc((buf)->p, (buf)->alloc);\ +  if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\ +} while (0) + +#define BB_ENSURE_SIZE(buf,size) do{\ +  unsigned int new_alloc = (buf)->alloc;\ +  while (new_alloc < (unsigned int )(size)) { new_alloc *= 2; }\ +  if ((buf)->alloc != new_alloc) {\ +    (buf)->p = (UChar* )xrealloc((buf)->p, new_alloc);\ +    if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\ +    (buf)->alloc = new_alloc;\ +  }\ +} while (0) + +#define BB_WRITE(buf,pos,bytes,n) do{\ +  int used = (pos) + (n);\ +  if ((buf)->alloc < (unsigned int )used) BB_EXPAND((buf),used);\ +  xmemcpy((buf)->p + (pos), (bytes), (n));\ +  if ((buf)->used < (unsigned int )used) (buf)->used = used;\ +} while (0) + +#define BB_WRITE1(buf,pos,byte) do{\ +  int used = (pos) + 1;\ +  if ((buf)->alloc < (unsigned int )used) BB_EXPAND((buf),used);\ +  (buf)->p[(pos)] = (byte);\ +  if ((buf)->used < (unsigned int )used) (buf)->used = used;\ +} while (0) + +#define BB_ADD(buf,bytes,n)       BB_WRITE((buf),(buf)->used,(bytes),(n)) +#define BB_ADD1(buf,byte)         BB_WRITE1((buf),(buf)->used,(byte)) +#define BB_GET_ADD_ADDRESS(buf)   ((buf)->p + (buf)->used) +#define BB_GET_OFFSET_POS(buf)    ((buf)->used) + +/* from < to */ +#define BB_MOVE_RIGHT(buf,from,to,n) do {\ +  if ((unsigned int )((to)+(n)) > (buf)->alloc) BB_EXPAND((buf),(to) + (n));\ +  xmemmove((buf)->p + (to), (buf)->p + (from), (n));\ +  if ((unsigned int )((to)+(n)) > (buf)->used) (buf)->used = (to) + (n);\ +} while (0) + +/* from > to */ +#define BB_MOVE_LEFT(buf,from,to,n) do {\ +  xmemmove((buf)->p + (to), (buf)->p + (from), (n));\ +} while (0) + +/* from > to */ +#define BB_MOVE_LEFT_REDUCE(buf,from,to) do {\ +  xmemmove((buf)->p + (to), (buf)->p + (from), (buf)->used - (from));\ +  (buf)->used -= (from - to);\ +} while (0) + +#define BB_INSERT(buf,pos,bytes,n) do {\ +  if (pos >= (buf)->used) {\ +    BB_WRITE(buf,pos,bytes,n);\ +  }\ +  else {\ +    BB_MOVE_RIGHT((buf),(pos),(pos) + (n),((buf)->used - (pos)));\ +    xmemcpy((buf)->p + (pos), (bytes), (n));\ +  }\ +} while (0) + +#define BB_GET_BYTE(buf, pos) (buf)->p[(pos)] + +  typedef enum {    CS_VALUE,    CS_RANGE, @@ -300,7 +369,7 @@ bbuf_clone(BBuf** rto, BBuf* from)  }  static int -backref_rel_to_abs(int rel_no, ScanEnv* env) +backref_rel_to_abs(int rel_no, ParseEnv* env)  {    if (rel_no > 0) {      if (rel_no > ONIG_INT_MAX - env->num_mem) @@ -981,7 +1050,7 @@ onig_number_of_names(regex_t* reg)  #endif /* else USE_ST_LIBRARY */  static int -name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) +name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ParseEnv* env)  {    int r;    int alloc; @@ -1115,7 +1184,7 @@ onig_name_to_group_numbers(regex_t* reg, const UChar* name,  }  static int -name_to_group_numbers(ScanEnv* env, const UChar* name, const UChar* name_end, +name_to_group_numbers(ParseEnv* env, const UChar* name, const UChar* name_end,                        int** nums)  {    regex_t* reg; @@ -1920,7 +1989,7 @@ callout_tag_table_new(CalloutTagTable** rt)  }  static int -callout_tag_entry_raw(ScanEnv* env, CalloutTagTable* t, UChar* name, +callout_tag_entry_raw(ParseEnv* env, CalloutTagTable* t, UChar* name,                        UChar* name_end, CalloutTagVal entry_val)  {    int r; @@ -1963,7 +2032,7 @@ ext_ensure_tag_table(regex_t* reg)  }  static int -callout_tag_entry(ScanEnv* env, regex_t* reg, UChar* name, UChar* name_end, +callout_tag_entry(ParseEnv* env, regex_t* reg, UChar* name, UChar* name_end,                    CalloutTagVal entry_val)  {    int r; @@ -1988,10 +2057,10 @@ callout_tag_entry(ScanEnv* env, regex_t* reg, UChar* name, UChar* name_end,  #endif /* USE_CALLOUT */ -#define INIT_SCANENV_MEMENV_ALLOC_SIZE   16 +#define INIT_PARSEENV_MEMENV_ALLOC_SIZE   16  static void -scan_env_clear(ScanEnv* env) +scan_env_clear(ParseEnv* env)  {    MEM_STATUS_CLEAR(env->cap_history);    MEM_STATUS_CLEAR(env->backtrack_mem); @@ -2024,7 +2093,7 @@ scan_env_clear(ScanEnv* env)  }  static int -scan_env_add_mem_entry(ScanEnv* env) +scan_env_add_mem_entry(ParseEnv* env)  {    int i, need, alloc;    MemEnv* p; @@ -2033,10 +2102,10 @@ scan_env_add_mem_entry(ScanEnv* env)    if (need > MaxCaptureNum && MaxCaptureNum != 0)      return ONIGERR_TOO_MANY_CAPTURES; -  if (need >= SCANENV_MEMENV_SIZE) { +  if (need >= PARSEENV_MEMENV_SIZE) {      if (env->mem_alloc <= need) {        if (IS_NULL(env->mem_env_dynamic)) { -        alloc = INIT_SCANENV_MEMENV_ALLOC_SIZE; +        alloc = INIT_PARSEENV_MEMENV_ALLOC_SIZE;          p = (MemEnv* )xmalloc(sizeof(MemEnv) * alloc);          CHECK_NULL_RETURN_MEMERR(p);          xmemcpy(p, env->mem_env_static, sizeof(env->mem_env_static)); @@ -2062,10 +2131,10 @@ scan_env_add_mem_entry(ScanEnv* env)  }  static int -scan_env_set_mem_node(ScanEnv* env, int num, Node* node) +scan_env_set_mem_node(ParseEnv* env, int num, Node* node)  {    if (env->num_mem >= num) -    SCANENV_MEMENV(env)[num].mem_node = node; +    PARSEENV_MEMENV(env)[num].mem_node = node;    else      return ONIGERR_PARSER_BUG;    return 0; @@ -2285,7 +2354,7 @@ node_new_anychar(OnigOptionType options)  }  static int -node_new_no_newline(Node** node, ScanEnv* env) +node_new_no_newline(Node** node, ParseEnv* env)  {    Node* n; @@ -2425,7 +2494,7 @@ node_new_backref(int back_num, int* backrefs, int by_name,  #ifdef USE_BACKREF_WITH_LEVEL                   int exist_level, int nest_level,  #endif -                 ScanEnv* env) +                 ParseEnv* env)  {    int i;    Node* node; @@ -2451,7 +2520,7 @@ node_new_backref(int back_num, int* backrefs, int by_name,    for (i = 0; i < back_num; i++) {      if (backrefs[i] <= env->num_mem && -        IS_NULL(SCANENV_MEMENV(env)[backrefs[i]].mem_node)) { +        IS_NULL(PARSEENV_MEMENV(env)[backrefs[i]].mem_node)) {        NODE_STATUS_ADD(node, RECURSION);   /* /...(\1).../ */        break;      } @@ -2481,7 +2550,7 @@ node_new_backref_checker(int back_num, int* backrefs, int by_name,  #ifdef USE_BACKREF_WITH_LEVEL                           int exist_level, int nest_level,  #endif -                         ScanEnv* env) +                         ParseEnv* env)  {    Node* node; @@ -2527,6 +2596,7 @@ node_new_quantifier(int lower, int upper, int by_number)    QUANT_(node)->head_exact       = NULL_NODE;    QUANT_(node)->next_head_exact  = NULL_NODE;    QUANT_(node)->include_referred = 0; +  QUANT_(node)->empty_status_mem = 0;    if (by_number != 0)      NODE_STATUS_ADD(node, BY_NUMBER); @@ -2640,7 +2710,7 @@ node_set_fail(Node* node)  }  static int -node_new_fail(Node** node, ScanEnv* env) +node_new_fail(Node** node, ParseEnv* env)  {    *node = node_new();    CHECK_NULL_RETURN_MEMERR(*node); @@ -2656,7 +2726,7 @@ onig_node_reset_fail(Node* node)  }  static int -node_new_save_gimmick(Node** node, enum SaveType save_type, ScanEnv* env) +node_new_save_gimmick(Node** node, enum SaveType save_type, ParseEnv* env)  {    int id; @@ -2675,7 +2745,7 @@ node_new_save_gimmick(Node** node, enum SaveType save_type, ScanEnv* env)  static int  node_new_update_var_gimmick(Node** node, enum UpdateVarType update_var_type, -                            int id, ScanEnv* env) +                            int id, ParseEnv* env)  {    *node = node_new();    CHECK_NULL_RETURN_MEMERR(*node); @@ -2689,7 +2759,7 @@ node_new_update_var_gimmick(Node** node, enum UpdateVarType update_var_type,  }  static int -node_new_keep(Node** node, ScanEnv* env) +node_new_keep(Node** node, ParseEnv* env)  {    int r; @@ -2743,7 +2813,7 @@ onig_reg_callout_list_at(regex_t* reg, int num)  }  static int -reg_callout_list_entry(ScanEnv* env, int* rnum) +reg_callout_list_entry(ParseEnv* env, int* rnum)  {  #define INIT_CALLOUT_LIST_NUM  3 @@ -2795,7 +2865,7 @@ reg_callout_list_entry(ScanEnv* env, int* rnum)  static int  node_new_callout(Node** node, OnigCalloutOf callout_of, int num, int id, -                 ScanEnv* env) +                 ParseEnv* env)  {    *node = node_new();    CHECK_NULL_RETURN_MEMERR(*node); @@ -2811,7 +2881,7 @@ node_new_callout(Node** node, OnigCalloutOf callout_of, int num, int id,  #endif  static int -make_text_segment(Node** node, ScanEnv* env) +make_text_segment(Node** node, ParseEnv* env)  {    int r;    int i; @@ -2868,7 +2938,7 @@ make_text_segment(Node** node, ScanEnv* env)  static int  make_absent_engine(Node** node, int pre_save_right_id, Node* absent,                     Node* step_one, int lower, int upper, int possessive, -                   int is_range_cutter, ScanEnv* env) +                   int is_range_cutter, ParseEnv* env)  {    int r;    int i; @@ -2950,7 +3020,7 @@ make_absent_engine(Node** node, int pre_save_right_id, Node* absent,  static int  make_absent_tail(Node** node1, Node** node2, int pre_save_right_id, -                 ScanEnv* env) +                 ParseEnv* env)  {    int r;    int id; @@ -2998,7 +3068,7 @@ make_absent_tail(Node** node1, Node** node2, int pre_save_right_id,  }  static int -make_range_clear(Node** node, ScanEnv* env) +make_range_clear(Node** node, ParseEnv* env)  {    int r;    int id; @@ -3057,7 +3127,7 @@ make_range_clear(Node** node, ScanEnv* env)  static int  is_simple_one_char_repeat(Node* node, Node** rquant, Node** rbody, -                          int* is_possessive, ScanEnv* env) +                          int* is_possessive, ParseEnv* env)  {    Node* quant;    Node* body; @@ -3123,8 +3193,8 @@ is_simple_one_char_repeat(Node* node, Node** rquant, Node** rbody,  }  static int -make_absent_tree_for_simple_one_char_repeat(Node** node, Node* absent, Node* quant, -                                            Node* body, int possessive, ScanEnv* env) +make_absent_tree_for_simple_one_char_repeat(Node** node, +  Node* absent, Node* quant, Node* body, int possessive, ParseEnv* env)  {    int r;    int i; @@ -3171,7 +3241,7 @@ make_absent_tree_for_simple_one_char_repeat(Node** node, Node* absent, Node* qua  static int  make_absent_tree(Node** node, Node* absent, Node* expr, int is_range_cutter, -                 ScanEnv* env) +                 ParseEnv* env)  {    int r;    int i; @@ -3844,7 +3914,7 @@ add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to)  }  static int -add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to) +add_code_range(BBuf** pbuf, ParseEnv* env, OnigCodePoint from, OnigCodePoint to)  {    if (from > to) {      if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) @@ -4172,7 +4242,7 @@ or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)  }  static OnigCodePoint -conv_backslash_value(OnigCodePoint c, ScanEnv* env) +conv_backslash_value(OnigCodePoint c, ParseEnv* env)  {    if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) {      switch (c) { @@ -4258,10 +4328,10 @@ enum ReduceType {    RQ_ASIS = 0, /* as is */    RQ_DEL  = 1, /* delete parent */    RQ_A,        /* to '*'    */ +  RQ_P,        /* to '+'    */    RQ_AQ,       /* to '*?'   */    RQ_QQ,       /* to '??'   */    RQ_P_QQ,     /* to '+)??' */ -  RQ_PQ_Q      /* to '+?)?' */  };  static enum ReduceType ReduceTypeTable[6][6] = { @@ -4270,7 +4340,7 @@ static enum ReduceType ReduceTypeTable[6][6] = {    {RQ_A,    RQ_A,    RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL},  /* '+'  */    {RQ_DEL,  RQ_AQ,   RQ_AQ,  RQ_DEL,  RQ_AQ,   RQ_AQ},   /* '??' */    {RQ_DEL,  RQ_DEL,  RQ_DEL, RQ_DEL,  RQ_DEL,  RQ_DEL},  /* '*?' */ -  {RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ,   RQ_AQ,   RQ_DEL}   /* '+?' */ +  {RQ_ASIS, RQ_A,    RQ_P,   RQ_AQ,   RQ_AQ,   RQ_DEL}   /* '+?' */  };  extern int @@ -4309,6 +4379,11 @@ onig_reduce_nested_quantifier(Node* pnode)      p->lower  = 0;  p->upper = INFINITE_REPEAT;  p->greedy = 1;      goto remove_cnode;      break; +  case RQ_P: +    NODE_BODY(pnode) = NODE_BODY(cnode); +    p->lower  = 1;  p->upper = INFINITE_REPEAT;  p->greedy = 1; +    goto remove_cnode; +    break;    case RQ_AQ:      NODE_BODY(pnode) = NODE_BODY(cnode);      p->lower  = 0;  p->upper = INFINITE_REPEAT;  p->greedy = 0; @@ -4323,10 +4398,6 @@ onig_reduce_nested_quantifier(Node* pnode)      p->lower  = 0;  p->upper = 1;  p->greedy = 0;      c->lower  = 1;  c->upper = INFINITE_REPEAT;  c->greedy = 1;      break; -  case RQ_PQ_Q: -    p->lower  = 0;  p->upper = 1;  p->greedy = 1; -    c->lower  = 1;  c->upper = INFINITE_REPEAT;  c->greedy = 0; -    break;    case RQ_ASIS:      break;    } @@ -4340,7 +4411,7 @@ onig_reduce_nested_quantifier(Node* pnode)  }  static int -node_new_general_newline(Node** node, ScanEnv* env) +node_new_general_newline(Node** node, ParseEnv* env)  {    int r;    int dlen, alen; @@ -4472,7 +4543,7 @@ ptoken_init(PToken* tok)  }  static int -fetch_interval(UChar** src, UChar* end, PToken* tok, ScanEnv* env) +fetch_interval(UChar** src, UChar* end, PToken* tok, ParseEnv* env)  {    int low, up, syn_allow, non_low = 0;    int r = 0; @@ -4575,7 +4646,8 @@ fetch_interval(UChar** src, UChar* end, PToken* tok, ScanEnv* env)  /* \M-, \C-, \c, or \... */  static int -fetch_escaped_value_raw(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val) +fetch_escaped_value_raw(UChar** src, UChar* end, ParseEnv* env, +                        OnigCodePoint* val)  {    int v;    OnigCodePoint c; @@ -4646,7 +4718,7 @@ fetch_escaped_value_raw(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* va  }  static int -fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val) +fetch_escaped_value(UChar** src, UChar* end, ParseEnv* env, OnigCodePoint* val)  {    int r;    int len; @@ -4660,7 +4732,7 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val)    return 0;  } -static int fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env); +static int fetch_token(PToken* tok, UChar** src, UChar* end, ParseEnv* env);  static OnigCodePoint  get_name_end_code_point(OnigCodePoint start) @@ -4691,7 +4763,7 @@ enum REF_NUM {  */  static int  fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, -                      UChar** rname_end, ScanEnv* env, +                      UChar** rname_end, ParseEnv* env,                        int* rback_num, int* rlevel, enum REF_NUM* num_type)  {    int r, sign, exist_level; @@ -4825,7 +4897,7 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end,  */  static int  fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, -           UChar** rname_end, ScanEnv* env, int* rback_num, +           UChar** rname_end, ParseEnv* env, int* rback_num,             enum REF_NUM* num_type, int is_ref)  {    int r, sign; @@ -4957,7 +5029,7 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end,  }  static void -CC_ESC_WARN(ScanEnv* env, UChar *c) +CC_ESC_WARN(ParseEnv* env, UChar *c)  {    if (onig_warn == onig_null_warn) return ; @@ -4973,7 +5045,7 @@ CC_ESC_WARN(ScanEnv* env, UChar *c)  }  static void -CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c) +CLOSE_BRACKET_WITHOUT_ESC_WARN(ParseEnv* env, UChar* c)  {    if (onig_warn == onig_null_warn) return ; @@ -5054,11 +5126,12 @@ str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,  }  static int -fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state) +fetch_token_cc(PToken* tok, UChar** src, UChar* end, ParseEnv* env, int state)  {    int r;    OnigCodePoint code;    OnigCodePoint c, c2; +  int mindigits, maxdigits;    OnigSyntaxType* syn = env->syntax;    OnigEncoding enc = env->enc;    UChar* prev; @@ -5247,10 +5320,11 @@ fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state)      case 'u':        if (PEND) break; -        prev = p;        if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { -        r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code); +        mindigits = maxdigits = 4; +      u_hex_digits: +        r = scan_hexadecimal_number(&p, end, mindigits, maxdigits, enc, &code);          if (r < 0) return r;          if (p == prev) {  /* can't read nothing. */            code = 0; /* but, it's not error */ @@ -5261,6 +5335,15 @@ fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state)        }        break; +    case 'U': +      if (PEND) break; +      prev = p; +      if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) { +        mindigits = maxdigits = 8; +        goto u_hex_digits; +      } +      break; +      case '0':      case '1': case '2': case '3': case '4': case '5': case '6': case '7':        if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { @@ -5327,15 +5410,22 @@ fetch_token_cc(PToken* tok, UChar** src, UChar* end, ScanEnv* env, int state)  }  static int -fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env) +fetch_token(PToken* tok, UChar** src, UChar* end, ParseEnv* env)  {    int r;    OnigCodePoint code;    OnigCodePoint c; -  OnigEncoding enc = env->enc; -  OnigSyntaxType* syn = env->syntax; +  int mindigits, maxdigits;    UChar* prev; -  UChar* p = *src; +  int allow_num; +  OnigEncoding enc; +  OnigSyntaxType* syn; +  UChar* p; + +  enc = env->enc; +  syn = env->syntax; +  p = *src; +    PFETCH_READY;    if (tok->code_point_continue != 0) { @@ -5574,12 +5664,20 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)        break;      case 'Z': -      if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; -      tok->type = TK_ANCHOR; -      tok->u.subtype = ANCR_SEMI_END_BUF; +      if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) { +        goto end_buf; +      } +      else { +        if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; +        tok->type = TK_ANCHOR; +        tok->u.subtype = ANCR_SEMI_END_BUF; +      }        break;      case 'z': +      if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) +        return ONIGERR_UNDEFINED_OPERATOR; +        if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;      end_buf:        tok->type = TK_ANCHOR; @@ -5668,10 +5766,11 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)      case 'u':        if (PEND) break; -        prev = p; +      mindigits = maxdigits = 4;        if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { -        r = scan_hexadecimal_number(&p, end, 4, 4, enc, &code); +    u_hex_digits: +        r = scan_hexadecimal_number(&p, end, mindigits, maxdigits, enc, &code);          if (r < 0) return r;          if (p == prev) {  /* can't read nothing. */            code = 0; /* but, it's not error */ @@ -5682,6 +5781,15 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)        }        break; +    case 'U': +      if (PEND) break; +      prev = p; +      if (IS_SYNTAX_BV(syn, ONIG_SYN_PYTHON)) { +        mindigits = maxdigits = 8; +        goto u_hex_digits; +      } +      break; +      case '1': case '2': case '3': case '4':      case '5': case '6': case '7': case '8': case '9':        PUNFETCH; @@ -5694,7 +5802,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)        if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&            (r <= env->num_mem || r <= 9)) { /* This spec. from GNU regex */          if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { -          if (r > env->num_mem || IS_NULL(SCANENV_MEMENV(env)[r].mem_node)) +          if (r > env->num_mem || IS_NULL(PARSEENV_MEMENV(env)[r].mem_node))              return ONIGERR_INVALID_BACKREF;          } @@ -5743,6 +5851,9 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)            int back_num;            enum REF_NUM num_type; +          allow_num = 1; + +        backref_start:            prev = p;  #ifdef USE_BACKREF_WITH_LEVEL @@ -5757,6 +5868,8 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)            if (r < 0) return r;            if (num_type != IS_NOT_NUM) { +            if (allow_num == 0) return ONIGERR_INVALID_BACKREF; +              if (num_type == IS_REL_NUM) {                back_num = backref_rel_to_abs(back_num, env);              } @@ -5765,7 +5878,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)              if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) {                if (back_num > env->num_mem || -                  IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node)) +                  IS_NULL(PARSEENV_MEMENV(env)[back_num].mem_node))                  return ONIGERR_INVALID_BACKREF;              }              tok->type = TK_BACKREF; @@ -5782,7 +5895,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)                int i;                for (i = 0; i < num; i++) {                  if (backs[i] > env->num_mem || -                    IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node)) +                    IS_NULL(PARSEENV_MEMENV(env)[backs[i]].mem_node))                    return ONIGERR_INVALID_BACKREF;                }              } @@ -5813,12 +5926,17 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)            UChar* name_end;            enum REF_NUM num_type; +          allow_num = 1; + +        call_start:            prev = p;            r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env,                           &gnum, &num_type, TRUE);            if (r < 0) return r;            if (num_type != IS_NOT_NUM) { +            if (allow_num == 0) return ONIGERR_UNDEFINED_GROUP_REFERENCE; +              if (num_type == IS_REL_NUM) {                gnum = backref_rel_to_abs(gnum, env);                if (gnum < 0) { @@ -5975,6 +6093,7 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)      case '(':        if (!PEND && PPEEK_IS('?') &&            IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { +        prev = p;          PINC;          if (! PEND) {            c = PPEEK; @@ -6062,11 +6181,35 @@ fetch_token(PToken* tok, UChar** src, UChar* end, ScanEnv* env)                break;              }            } +          else if (c == 'P' && +                   IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME)) { +            PINC; /* skip 'P' */ +            if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; +            PFETCH(c); +            allow_num = 0; +            if (c == '=') { +              c = '('; +              goto backref_start; +            } +            else if (c == '>') { +#ifdef USE_CALL +              c = '('; +              goto call_start; +#else +              return ONIGERR_UNDEFINED_OPERATOR; +#endif +            } +            else { +              p = prev; +              goto lparen_qmark_end2; +            } +          }          }        lparen_qmark_end:          PUNFETCH;        } +    lparen_qmark_end2:        if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break;        tok->type = TK_SUBEXP_OPEN;        break; @@ -6295,7 +6438,7 @@ add_ctype_to_cc_by_range_limit(CClassNode* cc, int ctype ARG_UNUSED, int not,  }  static int -add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) +add_ctype_to_cc(CClassNode* cc, int ctype, int not, ParseEnv* env)  {    int c, r;    int ascii_mode; @@ -6398,7 +6541,7 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)  }  static int -prs_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) +prs_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ParseEnv* env)  {  #define POSIX_BRACKET_CHECK_LIMIT_LENGTH  20  #define POSIX_BRACKET_NAME_MIN_LEN         4 @@ -6472,7 +6615,7 @@ prs_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)  }  static int -fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) +fetch_char_property_to_ctype(UChar** src, UChar* end, ParseEnv* env)  {    int r;    OnigCodePoint c; @@ -6507,7 +6650,8 @@ fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)  }  static int -prs_char_property(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) +prs_char_property(Node** np, PToken* tok, UChar** src, UChar* end, +                  ParseEnv* env)  {    int r, ctype;    CClassNode* cc; @@ -6528,7 +6672,7 @@ prs_char_property(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env)  static int  cc_cprop_next(CClassNode* cc, OnigCodePoint* pcode, CVAL* val, CSTATE* state, -              ScanEnv* env) +              ParseEnv* env)  {    int r; @@ -6552,7 +6696,7 @@ cc_cprop_next(CClassNode* cc, OnigCodePoint* pcode, CVAL* val, CSTATE* state,  static int  cc_char_next(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to,               int* from_raw, int to_raw, CVAL intype, CVAL* type, -             CSTATE* state, ScanEnv* env) +             CSTATE* state, ParseEnv* env)  {    int r; @@ -6621,7 +6765,7 @@ cc_char_next(CClassNode* cc, OnigCodePoint *from, OnigCodePoint to,  static int  code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped, -                 ScanEnv* env) +                 ParseEnv* env)  {    int in_esc;    OnigCodePoint code; @@ -6643,7 +6787,7 @@ code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,  }  static int -prs_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env) +prs_cc(Node** np, PToken* tok, UChar** src, UChar* end, ParseEnv* env)  {    int r, neg, len, fetched, and_start;    OnigCodePoint in_code, curr_code; @@ -6995,13 +7139,14 @@ prs_cc(Node** np, PToken* tok, UChar** src, UChar* end, ScanEnv* env)  }  static int prs_alts(Node** top, PToken* tok, int term, -                    UChar** src, UChar* end, ScanEnv* env, int group_head); +                    UChar** src, UChar* end, ParseEnv* env, int group_head);  #ifdef USE_CALLOUT  /* (?{...}[tag][+-]) (?{{...}}[tag][+-]) */  static int -prs_callout_of_contents(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env) +prs_callout_of_contents(Node** np, int cterm, UChar** src, UChar* end, +                        ParseEnv* env)  {    int r;    int i; @@ -7184,7 +7329,7 @@ clear_callout_args(int n, unsigned int types[], OnigValue vals[])  static int  prs_callout_args(int skip_mode, int cterm, UChar** src, UChar* end,                   int max_arg_num, unsigned int types[], OnigValue vals[], -                 ScanEnv* env) +                 ParseEnv* env)  {  #define MAX_CALLOUT_ARG_BYTE_LENGTH   128 @@ -7347,7 +7492,8 @@ prs_callout_args(int skip_mode, int cterm, UChar** src, UChar* end,  /* (*name[TAG]) (*name[TAG]{a,b,..}) */  static int -prs_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env) +prs_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, +                    ParseEnv* env)  {    int r;    int i; @@ -7514,7 +7660,7 @@ prs_callout_of_name(Node** np, int cterm, UChar** src, UChar* end, ScanEnv* env)  static int  prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, -        ScanEnv* env) +        ParseEnv* env)  {    int r, num;    Node *target; @@ -7747,7 +7893,7 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,              if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) {                if (back_num > env->num_mem || -                  IS_NULL(SCANENV_MEMENV(env)[back_num].mem_node)) +                  IS_NULL(PARSEENV_MEMENV(env)[back_num].mem_node))                  return ONIGERR_INVALID_BACKREF;              } @@ -7769,7 +7915,7 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,                int i;                for (i = 0; i < num; i++) {                  if (backs[i] > env->num_mem || -                    IS_NULL(SCANENV_MEMENV(env)[backs[i]].mem_node)) +                    IS_NULL(PARSEENV_MEMENV(env)[backs[i]].mem_node))                    return ONIGERR_INVALID_BACKREF;                }              } @@ -7932,12 +8078,26 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,        break;  #endif +    case 'P': +      if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME)) { +        if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; +        PFETCH(c); +        if (c == '<') goto named_group1; + +        return ONIGERR_UNDEFINED_GROUP_OPTION; +      } +      /* else fall */ +    case 'W': case 'D': case 'S': +    case 'y': +      if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) +        return ONIGERR_UNDEFINED_GROUP_OPTION; +      /* else fall */ +  #ifdef USE_POSIXLINE_OPTION      case 'p':  #endif +    case 'a':      case '-': case 'i': case 'm': case 's': case 'x': -    case 'W': case 'D': case 'S': case 'P': -    case 'y':        {          int neg = 0; @@ -7974,10 +8134,26 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,              OPTION_NEGATE(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg);              break;  #endif -          case 'W': OPTION_NEGATE(option, ONIG_OPTION_WORD_IS_ASCII, neg); break; -          case 'D': OPTION_NEGATE(option, ONIG_OPTION_DIGIT_IS_ASCII, neg); break; -          case 'S': OPTION_NEGATE(option, ONIG_OPTION_SPACE_IS_ASCII, neg); break; -          case 'P': OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg); break; +          case 'W': +            if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) +              return ONIGERR_UNDEFINED_GROUP_OPTION; +            OPTION_NEGATE(option, ONIG_OPTION_WORD_IS_ASCII, neg); +            break; +          case 'D': +            if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) +              return ONIGERR_UNDEFINED_GROUP_OPTION; +            OPTION_NEGATE(option, ONIG_OPTION_DIGIT_IS_ASCII, neg); +            break; +          case 'S': +            if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) +              return ONIGERR_UNDEFINED_GROUP_OPTION; +            OPTION_NEGATE(option, ONIG_OPTION_SPACE_IS_ASCII, neg); +            break; +          case 'P': +            if (! IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_ONIGURUMA)) +              return ONIGERR_UNDEFINED_GROUP_OPTION; +            OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg); +            break;            case 'y': /* y{g}, y{w} */              { @@ -8016,8 +8192,15 @@ prs_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,                PFETCH(c);                if (c != '}')                  return ONIGERR_UNDEFINED_GROUP_OPTION; -              break;              } /* case 'y' */ +            break; + +          case 'a': +            if (! IS_SYNTAX_BV(env->syntax, ONIG_SYN_PYTHON)) +              return ONIGERR_UNDEFINED_GROUP_OPTION; + +            OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg); +            break;            default:              return ONIGERR_UNDEFINED_GROUP_OPTION; @@ -8112,7 +8295,7 @@ static const char* ReduceQStr[] = {  };  static int -assign_quantifier_body(Node* qnode, Node* target, int group, ScanEnv* env) +assign_quantifier_body(Node* qnode, Node* target, int group, ParseEnv* env)  {    QuantNode* qn; @@ -8260,35 +8443,38 @@ onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc,  }  typedef struct { -  ScanEnv*    env; +  ParseEnv*   env;    CClassNode* cc;    Node*       alt_root;    Node**      ptail;  } IApplyCaseFoldArg;  static int -i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg) +i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, +                  void* arg)  {    IApplyCaseFoldArg* iarg; -  ScanEnv* env; +  ParseEnv* env; +  OnigEncoding enc;    CClassNode* cc;    iarg = (IApplyCaseFoldArg* )arg;    env = iarg->env;    cc  = iarg->cc; +  enc = env->enc;    if (to_len == 1) { -    int is_in = onig_is_code_in_cc(env->enc, from, cc); +    int is_in = onig_is_code_in_cc(enc, from, cc);  #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS      if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||          (is_in == 0 &&  IS_NCCLASS_NOT(cc))) { -      ADD_CODE_INTO_CC(cc, *to, env->enc); +      ADD_CODE_INTO_CC(cc, *to, enc);      }  #else      if (is_in != 0) { -      if (ONIGENC_MBC_MINLEN(env->enc) > 1 || -          ONIGENC_CODE_TO_MBCLEN(env->enc, *to) != 1) { -        if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc); +      if (ONIGENC_MBC_MINLEN(enc) > 1 || +          ONIGENC_CODE_TO_MBCLEN(enc, *to) != 1) { +        if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, enc);          add_code_range(&(cc->mbuf), env, *to, *to);        }        else { @@ -8305,7 +8491,7 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg)      int r, i, len;      UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; -    if (onig_is_code_in_cc(env->enc, from, cc) +    if (onig_is_code_in_cc(enc, from, cc)  #ifdef CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS          && !IS_NCCLASS_NOT(cc)  #endif @@ -8320,8 +8506,9 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg)          Node* csnode;          CClassNode* cs_cc; -        index = onigenc_unicode_fold1_key(&to[i]); -        if (index >= 0) { +        index = 0; +        if (ONIGENC_IS_UNICODE_ENCODING(enc) && +            (index = onigenc_unicode_fold1_key(&to[i])) >= 0) {            csnode = node_new_cclass();            cs_cc = CCLASS_(csnode);            if (IS_NULL(csnode)) { @@ -8332,18 +8519,22 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg)            m = FOLDS1_UNFOLDS_NUM(index);            for (j = 0; j < m; j++) {              code = FOLDS1_UNFOLDS(index)[j]; -            ADD_CODE_INTO_CC(cs_cc, code, env->enc); +            ADD_CODE_INTO_CC(cs_cc, code, enc);            } -          ADD_CODE_INTO_CC(cs_cc, to[i], env->enc); +          ADD_CODE_INTO_CC(cs_cc, to[i], enc);            ns[n++] = csnode;          }          else { -          len = ONIGENC_CODE_TO_MBC(env->enc, to[i], buf); +          len = ONIGENC_CODE_TO_MBC(enc, to[i], buf);            if (n == 0 || NODE_TYPE(ns[n-1]) != NODE_STRING) {              csnode = node_new_str(buf, buf + len);              if (IS_NULL(csnode)) goto err_free_ns; -            NODE_STRING_SET_CASE_EXPANDED(csnode); +            if (index == 0) +              NODE_STATUS_ADD(csnode, IGNORECASE); +            else +              NODE_STRING_SET_CASE_EXPANDED(csnode); +              ns[n++] = csnode;            }            else { @@ -8372,7 +8563,7 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], int to_len, void* arg)  static int  prs_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end, -        ScanEnv* env, int group_head) +        ParseEnv* env, int group_head)  {    int r, len, group;    Node* qn; @@ -8778,7 +8969,7 @@ prs_exp(Node** np, PToken* tok, int term, UChar** src, UChar* end,  static int  prs_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end, -           ScanEnv* env, int group_head) +           ParseEnv* env, int group_head)  {    int r;    Node *node, **headp; @@ -8829,7 +9020,7 @@ prs_branch(Node** top, PToken* tok, int term, UChar** src, UChar* end,  /* term_tok: TK_EOT or TK_SUBEXP_CLOSE */  static int  prs_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end, -         ScanEnv* env, int group_head) +         ParseEnv* env, int group_head)  {    int r;    Node *node, **headp; @@ -8892,7 +9083,7 @@ prs_alts(Node** top, PToken* tok, int term, UChar** src, UChar* end,  }  static int -prs_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) +prs_regexp(Node** top, UChar** src, UChar* end, ParseEnv* env)  {    int r;    PToken tok; @@ -8908,7 +9099,7 @@ prs_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env)  #ifdef USE_CALL  static int -make_call_zero_body(Node* node, ScanEnv* env, Node** rnode) +make_call_zero_body(Node* node, ParseEnv* env, Node** rnode)  {    int r; @@ -8930,7 +9121,7 @@ make_call_zero_body(Node* node, ScanEnv* env, Node** rnode)  extern int  onig_parse_tree(Node** root, const UChar* pattern, const UChar* end, -                regex_t* reg, ScanEnv* env) +                regex_t* reg, ParseEnv* env)  {    int r;    UChar* p; @@ -8945,7 +9136,6 @@ onig_parse_tree(Node** root, const UChar* pattern, const UChar* end,    reg->num_empty_check    = 0;    reg->repeat_range_alloc = 0;    reg->repeat_range       = (RepeatRange* )NULL; -  reg->empty_status_mem   = 0;    names_clear(reg); @@ -8990,7 +9180,7 @@ onig_parse_tree(Node** root, const UChar* pattern, const UChar* end,  }  extern void -onig_scan_env_set_error_string(ScanEnv* env, int ecode ARG_UNUSED, +onig_scan_env_set_error_string(ParseEnv* env, int ecode ARG_UNUSED,                                 UChar* arg, UChar* arg_end)  {    env->error     = arg; diff --git a/src/regparse.h b/src/regparse.h index c60a42d..8875f78 100644 --- a/src/regparse.h +++ b/src/regparse.h @@ -4,7 +4,7 @@    regparse.h -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2020  K.Kosako + * Copyright (c) 2002-2021  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -73,6 +73,14 @@ enum BodyEmptyType {    BODY_MAY_BE_EMPTY_REC = 3  }; +/* bytes buffer */ +typedef struct _BBuf { +  UChar* p; +  unsigned int used; +  unsigned int alloc; +} BBuf; + +  struct _Node;  typedef struct { @@ -110,6 +118,7 @@ typedef struct {    struct _Node* head_exact;    struct _Node* next_head_exact;    int include_referred;  /* include called node. don't eliminate even if {0} */ +  MemStatusType empty_status_mem;  } QuantNode;  typedef struct { @@ -340,6 +349,7 @@ typedef struct {  #define NODE_ST_ABSENT_WITH_SIDE_EFFECTS (1<<24)  /* stopper or clear */  #define NODE_ST_FIXED_CLEN_MIN_SURE (1<<25)  #define NODE_ST_REFERENCED          (1<<26) +#define NODE_ST_INPEEK              (1<<27)  #define NODE_STATUS(node)           (((Node* )node)->u.base.status) @@ -376,6 +386,7 @@ typedef struct {  #define NODE_IS_ABSENT_WITH_SIDE_EFFECTS(node)  ((NODE_STATUS(node) & NODE_ST_ABSENT_WITH_SIDE_EFFECTS) != 0)  #define NODE_IS_FIXED_CLEN_MIN_SURE(node)  ((NODE_STATUS(node) & NODE_ST_FIXED_CLEN_MIN_SURE) != 0)  #define NODE_IS_REFERENCED(node)      ((NODE_STATUS(node) & NODE_ST_REFERENCED) != 0) +#define NODE_IS_INPEEK(node)          ((NODE_STATUS(node) & NODE_ST_INPEEK) != 0)  #define NODE_PARENT(node)         ((node)->u.base.parent)  #define NODE_BODY(node)           ((node)->u.base.body) @@ -384,8 +395,8 @@ typedef struct {  #define NODE_CALL_BODY(node)      ((node)->body)  #define NODE_ANCHOR_BODY(node)    ((node)->body) -#define SCANENV_MEMENV_SIZE  8 -#define SCANENV_MEMENV(senv) \ +#define PARSEENV_MEMENV_SIZE  8 +#define PARSEENV_MEMENV(senv) \   (IS_NOT_NULL((senv)->mem_env_dynamic) ? \      (senv)->mem_env_dynamic : (senv)->mem_env_static) @@ -424,7 +435,7 @@ typedef struct {    int              num_mem;    int              num_named;    int              mem_alloc; -  MemEnv           mem_env_static[SCANENV_MEMENV_SIZE]; +  MemEnv           mem_env_static[PARSEENV_MEMENV_SIZE];    MemEnv*          mem_env_dynamic;    int              backref_num;    int              keep_num; @@ -439,14 +450,14 @@ typedef struct {  #ifdef ONIG_DEBUG_PARSE    unsigned int     max_parse_depth;  #endif -} ScanEnv; +} ParseEnv;  extern int    onig_renumber_name_table P_((regex_t* reg, GroupNumMap* map));  extern int    onig_strncmp P_((const UChar* s1, const UChar* s2, int n));  extern void   onig_strcpy P_((UChar* dest, const UChar* src, const UChar* end)); -extern void   onig_scan_env_set_error_string P_((ScanEnv* env, int ecode, UChar* arg, UChar* arg_end)); +extern void   onig_scan_env_set_error_string P_((ParseEnv* env, int ecode, UChar* arg, UChar* arg_end));  extern int    onig_reduce_nested_quantifier P_((Node* pnode));  extern int    onig_node_copy(Node** rcopy, Node* from);  extern int    onig_node_str_cat P_((Node* node, const UChar* s, const UChar* end)); @@ -460,7 +471,7 @@ extern Node*  onig_node_new_str P_((const UChar* s, const UChar* end));  extern Node*  onig_node_new_list P_((Node* left, Node* right));  extern Node*  onig_node_new_alt P_((Node* left, Node* right));  extern int    onig_names_free P_((regex_t* reg)); -extern int    onig_parse_tree P_((Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ScanEnv* env)); +extern int    onig_parse_tree P_((Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ParseEnv* env));  extern int    onig_free_shared_cclass_table P_((void));  extern int    onig_is_code_in_cc P_((OnigEncoding enc, OnigCodePoint code, CClassNode* cc));  extern int    onig_new_cclass_with_code_list(Node** rnode, OnigEncoding enc, int n, OnigCodePoint codes[]); diff --git a/src/regposix.c b/src/regposix.c index 497ba02..494446f 100644 --- a/src/regposix.c +++ b/src/regposix.c @@ -2,7 +2,7 @@    regposix.c - Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2020  K.Kosako + * Copyright (c) 2002-2021  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -120,6 +120,7 @@ onig2posix_error_code(int code)      { ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED,       REG_BADPAT  },      { ONIGERR_TOO_BIG_WIDE_CHAR_VALUE,                    REG_EONIG_BADWC },      { ONIGERR_TOO_LONG_WIDE_CHAR_VALUE,                   REG_EONIG_BADWC }, +    { ONIGERR_UNDEFINED_OPERATOR,                         REG_BADPAT  },      { ONIGERR_INVALID_CODE_POINT_VALUE,                   REG_EONIG_BADWC },      { ONIGERR_EMPTY_GROUP_NAME,                           REG_BADPAT },      { ONIGERR_INVALID_GROUP_NAME,                         REG_BADPAT }, @@ -141,6 +142,7 @@ onig2posix_error_code(int code)      { ONIGERR_INVALID_CALLOUT_TAG_NAME,                   REG_BADPAT },      { ONIGERR_INVALID_CALLOUT_ARG,                        REG_BADPAT },      { ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION,         REG_EONIG_BADARG }, +    { ONIGERR_VERY_INEFFICIENT_PATTERN,                   REG_BADPAT },      { ONIGERR_LIBRARY_IS_NOT_INITIALIZED,                 REG_EONIG_INTERNAL }    }; diff --git a/src/regsyntax.c b/src/regsyntax.c index 984aac6..8e1c313 100644 --- a/src/regsyntax.c +++ b/src/regsyntax.c @@ -2,7 +2,7 @@    regsyntax.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2020  K.Kosako + * Copyright (c) 2002-2021  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -240,6 +240,35 @@ OnigSyntaxType OnigSyntaxPerl_NG = {    }  }; +/* Python 3.9 */ +OnigSyntaxType OnigSyntaxPython = { +  (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | +     ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 | +     ONIG_SYN_OP_ESC_CONTROL_CHARS | +     ONIG_SYN_OP_ESC_C_CONTROL ) +   & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) +  , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT | ONIG_SYN_OP2_OPTION_PERL | +      ONIG_SYN_OP2_QMARK_LPAREN_IF_ELSE | +      ONIG_SYN_OP2_ASTERISK_CALLOUT_NAME    | +      ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY  | +      ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT | +      ONIG_SYN_OP2_QMARK_CAPITAL_P_NAME       | +      ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP | +      ONIG_SYN_OP2_ESC_V_VTAB | ONIG_SYN_OP2_ESC_U_HEX4 ) +  , ( SYN_GNU_REGEX_BV | ONIG_SYN_ISOLATED_OPTION_CONTINUE_BRANCH | +      ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV | ONIG_SYN_PYTHON ) +  , ONIG_OPTION_SINGLELINE +  , +  { +      (OnigCodePoint )'\\'                       /* esc */ +    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.'  */ +    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*'  */ +    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ +    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ +    , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ +  } +}; +  extern int diff --git a/src/unicode.c b/src/unicode.c index 6703d4b..efe5f73 100644 --- a/src/unicode.c +++ b/src/unicode.c @@ -2,7 +2,7 @@    unicode.c -  Oniguruma (regular expression library)  **********************************************************************/  /*- - * Copyright (c) 2002-2019  K.Kosako + * Copyright (c) 2002-2020  K.Kosako   * All rights reserved.   *   * Redistribution and use in source and binary forms, with or without @@ -77,9 +77,8 @@ static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable[256] = {  #include "unicode_fold_data.c"  extern int -onigenc_unicode_mbc_case_fold(OnigEncoding enc, -    OnigCaseFoldType flag ARG_UNUSED, const UChar** pp, const UChar* end, -    UChar* fold) +onigenc_unicode_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag, +    const UChar** pp, const UChar* end, UChar* fold)  {    const struct ByUnfoldKey* buk; @@ -104,23 +103,27 @@ onigenc_unicode_mbc_case_fold(OnigEncoding enc,    }  #endif -  buk = onigenc_unicode_unfold_key(code); -  if (buk != 0) { -    if (buk->fold_len == 1) { -      return ONIGENC_CODE_TO_MBC(enc, *FOLDS1_FOLD(buk->index), fold); -    } -    else { -      OnigCodePoint* addr; - -      FOLDS_FOLD_ADDR_BUK(buk, addr); -      rlen = 0; -      for (i = 0; i < buk->fold_len; i++) { -        OnigCodePoint c = addr[i]; -        len = ONIGENC_CODE_TO_MBC(enc, c, fold); -        fold += len; -        rlen += len; +  if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || ONIGENC_IS_ASCII_CODE(code)) { +    buk = onigenc_unicode_unfold_key(code); +    if (buk != 0) { +      if (buk->fold_len == 1) { +        if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || +            ONIGENC_IS_ASCII_CODE(*FOLDS1_FOLD(buk->index))) +          return ONIGENC_CODE_TO_MBC(enc, *FOLDS1_FOLD(buk->index), fold); +      } +      else { +        OnigCodePoint* addr; + +        FOLDS_FOLD_ADDR_BUK(buk, addr); +        rlen = 0; +        for (i = 0; i < buk->fold_len; i++) { +          OnigCodePoint c = addr[i]; +          len = ONIGENC_CODE_TO_MBC(enc, c, fold); +          fold += len; +          rlen += len; +        } +        return rlen;        } -      return rlen;      }    } @@ -131,16 +134,22 @@ onigenc_unicode_mbc_case_fold(OnigEncoding enc,  }  static int -apply_case_fold1(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg) +apply_case_fold1(OnigCaseFoldType flag, int from, int to, +                 OnigApplyAllCaseFoldFunc f, void* arg)  {    int i, j, k, n, r;    for (i = from; i < to; ) {      OnigCodePoint fold = *FOLDS1_FOLD(i); +    if (CASE_FOLD_IS_ASCII_ONLY(flag) && ! ONIGENC_IS_ASCII_CODE(fold)) break; +      n = FOLDS1_UNFOLDS_NUM(i);      for (j = 0; j < n; j++) {        OnigCodePoint unfold = FOLDS1_UNFOLDS(i)[j]; +      if (CASE_FOLD_IS_ASCII_ONLY(flag) && ! ONIGENC_IS_ASCII_CODE(unfold)) +        continue; +        r = (*f)(fold, &unfold, 1, arg);        if (r != 0) return r;        r = (*f)(unfold, &fold, 1, arg); @@ -148,6 +157,9 @@ apply_case_fold1(int from, int to, OnigApplyAllCaseFoldFunc f, void* arg)        for (k = 0; k < j; k++) {          OnigCodePoint unfold2 = FOLDS1_UNFOLDS(i)[k]; +        if (CASE_FOLD_IS_ASCII_ONLY(flag) && +            ! ONIGENC_IS_ASCII_CODE(unfold2)) continue; +          r = (*f)(unfold, &unfold2, 1, arg);          if (r != 0) return r;          r = (*f)(unfold2, &unfold, 1, arg); @@ -225,7 +237,7 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,  {    int r; -  r = apply_case_fold1(0, FOLDS1_NORMAL_END_INDEX, f, arg); +  r = apply_case_fold1(flag, 0, FOLDS1_NORMAL_END_INDEX, f, arg);    if (r != 0) return r;  #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI @@ -246,7 +258,7 @@ onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag,    }    else {  #endif -    r = apply_case_fold1(FOLDS1_NORMAL_END_INDEX, FOLDS1_END_INDEX, f, arg); +    r = apply_case_fold1(flag, FOLDS1_NORMAL_END_INDEX, FOLDS1_END_INDEX, f, arg);      if (r != 0) return r;  #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI    } @@ -288,6 +300,9 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,    n = 0;    code = ONIGENC_MBC_TO_CODE(enc, p, end); +  if (CASE_FOLD_IS_ASCII_ONLY(flag)) { +    if (! ONIGENC_IS_ASCII_CODE(code)) return n; +  }    len = enclen(enc, p);  #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI @@ -449,19 +464,26 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,    if (buk1 != 0) {      if (buk1->fold_len == 1) {        int un; -      items[0].byte_len = lens[0]; -      items[0].code_len = 1; -      items[0].code[0]  = *FOLDS1_FOLD(buk1->index); -      n++; + +      if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || +          ONIGENC_IS_ASCII_CODE(*FOLDS1_FOLD(buk1->index))) { +        items[0].byte_len = lens[0]; +        items[0].code_len = 1; +        items[0].code[0]  = *FOLDS1_FOLD(buk1->index); +        n++; +      }        un = FOLDS1_UNFOLDS_NUM(buk1->index);        for (i = 0; i < un; i++) {          OnigCodePoint unfold = FOLDS1_UNFOLDS(buk1->index)[i];          if (unfold != orig_codes[0]) { -          items[n].byte_len = lens[0]; -          items[n].code_len = 1; -          items[n].code[0]  = unfold; -          n++; +          if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag) || +              ONIGENC_IS_ASCII_CODE(unfold)) { +            items[n].byte_len = lens[0]; +            items[n].code_len = 1; +            items[n].code[0]  = unfold; +            n++; +          }          }        }      } @@ -548,10 +570,13 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,      if (index >= 0) {        int m = FOLDS1_UNFOLDS_NUM(index);        for (i = 0; i < m; i++) { -        items[n].byte_len = lens[0]; -        items[n].code_len = 1; -        items[n].code[0]  = FOLDS1_UNFOLDS(index)[i]; -        n++; +        code = FOLDS1_UNFOLDS(index)[i]; +        if (CASE_FOLD_IS_NOT_ASCII_ONLY(flag)||ONIGENC_IS_ASCII_CODE(code)) { +          items[n].byte_len = lens[0]; +          items[n].code_len = 1; +          items[n].code[0]  = code; +          n++; +        }        }      }    } | 
