diff options
Diffstat (limited to 'src/utf8.c')
-rw-r--r-- | src/utf8.c | 36 |
1 files changed, 32 insertions, 4 deletions
@@ -29,7 +29,7 @@ #include "regenc.h" -#define USE_INVALID_CODE_SCHEME +//#define USE_INVALID_CODE_SCHEME #ifdef USE_INVALID_CODE_SCHEME /* virtual codepoint values for invalid encoding byte 0xfe and 0xff */ @@ -39,6 +39,7 @@ #endif #define utf8_islead(c) ((UChar )((c) & 0xc0) != 0x80) +#define utf8_istail(c) ((UChar )((c) & 0xc0) == 0x80) static const int EncLen_UTF8[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -66,6 +67,30 @@ mbc_enc_len(const UChar* p) } static int +is_valid_mbc_string(const UChar* p, const UChar* end) +{ + int i, len; + + while (p < end) { + if (! utf8_islead(*p)) + return FALSE; + + len = mbc_enc_len(p++); + if (len > 1) { + for (i = 1; i < len; i++) { + if (p == end) + return FALSE; + + if (! utf8_istail(*p++)) + return FALSE; + } + } + } + + return TRUE; +} + +static int is_mbc_newline(const UChar* p, const UChar* end) { if (p < end) { @@ -91,12 +116,14 @@ is_mbc_newline(const UChar* p, const UChar* end) } static OnigCodePoint -mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED) +mbc_to_code(const UChar* p, const UChar* end) { int c, len; OnigCodePoint n; - len = enclen(ONIG_ENCODING_UTF8, p); + len = mbc_enc_len(p); + if (len > end - p) len = end - p; + c = *p++; if (len > 1) { len--; @@ -303,5 +330,6 @@ OnigEncodingType OnigEncodingUTF8 = { left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, NULL, /* init */ - NULL /* is_initialized */ + NULL, /* is_initialized */ + is_valid_mbc_string }; |