summary refs log tree commit diff
path: root/src/utf8.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/utf8.c')
-rw-r--r-- src/utf8.c 36
1 files changed, 32 insertions, 4 deletions
diff --git a/src/utf8.c b/src/utf8.c
index b78e7eb..219b7ea 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -29,7 +29,7 @@
#include "regenc.h"
-#define USE_INVALID_CODE_SCHEME
+//#define USE_INVALID_CODE_SCHEME
#ifdef USE_INVALID_CODE_SCHEME
/* virtual codepoint values for invalid encoding byte 0xfe and 0xff */
@@ -39,6 +39,7 @@
#endif
#define utf8_islead(c) ((UChar )((c) & 0xc0) != 0x80)
+#define utf8_istail(c) ((UChar )((c) & 0xc0) == 0x80)
static const int EncLen_UTF8[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -66,6 +67,30 @@ mbc_enc_len(const UChar* p)
}
/*
 * is_valid_mbc_string: structural validity check over the byte range [p, end).
 *
 * Walks the range one character at a time. Each character must begin with a
 * byte that utf8_islead() accepts, followed by (mbc_enc_len(lead) - 1) bytes
 * that utf8_istail() accepts, all of them located before `end`.
 *
 * Returns TRUE when the whole range is consumed this way (including the
 * empty range, where p >= end on entry), FALSE at the first violation.
 *
 * NOTE(review): only the lead/tail bit patterns and the length reported by
 * mbc_enc_len() are checked here; nothing in this function inspects the
 * decoded value, so overlong forms or out-of-range code points do not appear
 * to be rejected -- confirm whether callers rely on that.
 */
static int
+is_valid_mbc_string(const UChar* p, const UChar* end)
+{
+ int i, len;
+
+ while (p < end) {
+ if (! utf8_islead(*p)) /* a continuation byte (10xxxxxx) cannot start a character */
+ return FALSE;
+
+ len = mbc_enc_len(p++); /* length is taken from the lead byte; p then points at the first tail byte */
+ if (len > 1) {
+ for (i = 1; i < len; i++) { /* expect exactly len - 1 tail bytes */
+ if (p == end) /* input ends in the middle of a multibyte sequence */
+ return FALSE;
+
+ if (! utf8_istail(*p++)) /* macro uses its argument once, so p advances exactly one byte */
+ return FALSE;
+ }
+ }
+ }
+
+ return TRUE; /* every byte in [p, end) was consumed as part of a complete sequence */
+}
+
+static int
is_mbc_newline(const UChar* p, const UChar* end)
{
if (p < end) {
@@ -91,12 +116,14 @@ is_mbc_newline(const UChar* p, const UChar* end)
}
static OnigCodePoint
-mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)
+mbc_to_code(const UChar* p, const UChar* end)
{
int c, len;
OnigCodePoint n;
- len = enclen(ONIG_ENCODING_UTF8, p);
+ len = mbc_enc_len(p);
+ if (len > end - p) len = end - p;
+
c = *p++;
if (len > 1) {
len--;
@@ -303,5 +330,6 @@ OnigEncodingType OnigEncodingUTF8 = {
left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match,
NULL, /* init */
- NULL /* is_initialized */
+ NULL, /* is_initialized */
+ is_valid_mbc_string
};