1 files changed, 32 insertions, 4 deletions
diff --git a/src/utf8.c b/src/utf8.c
index b78e7eb..219b7ea 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -29,7 +29,7 @@
 
 #include "regenc.h"
 
-#define USE_INVALID_CODE_SCHEME
+/* #define USE_INVALID_CODE_SCHEME */
 
 #ifdef USE_INVALID_CODE_SCHEME
 /* virtual codepoint values for invalid encoding byte 0xfe and 0xff */
@@ -39,6 +39,7 @@
 #endif
 
 #define utf8_islead(c)     ((UChar )((c) & 0xc0) != 0x80)
+#define utf8_istail(c)     ((UChar )((c) & 0xc0) == 0x80)
 
 static const int EncLen_UTF8[] = {
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -66,6 +67,30 @@ mbc_enc_len(const UChar* p)
 }
 
 static int
+is_valid_mbc_string(const UChar* p, const UChar* end)
+{
+  /* TRUE iff [p, end) splits into whole lead + tail byte sequences. */
+  int need;
+
+  while (p < end) {
+    /* Each character must open with a non-continuation byte. */
+    if (utf8_istail(*p))
+      return FALSE;
+
+    /* mbc_enc_len() gives the length; the rest must be 10xxxxxx bytes. */
+    for (need = mbc_enc_len(p++) - 1; need > 0; need--) {
+      if (p == end)
+        return FALSE;  /* truncated sequence */
+
+      if (utf8_islead(*p++))
+        return FALSE;
+    }
+  }
+
+  return TRUE;
+}
+
+static int
 is_mbc_newline(const UChar* p, const UChar* end)
 {
   if (p < end) {
@@ -91,12 +116,14 @@ is_mbc_newline(const UChar* p, const UChar* end)
 }
 
 static OnigCodePoint
-mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)
+mbc_to_code(const UChar* p, const UChar* end)
 {
   int c, len;
   OnigCodePoint n;
 
-  len = enclen(ONIG_ENCODING_UTF8, p);
+  len = mbc_enc_len(p);
+  if (len > end - p) len = end - p;
+
   c = *p++;
   if (len > 1) {
     len--;
@@ -303,5 +330,6 @@ OnigEncodingType OnigEncodingUTF8 = {
   left_adjust_char_head,
   onigenc_always_true_is_allowed_reverse_match,
   NULL, /* init */
-  NULL  /* is_initialized */
+  NULL, /* is_initialized */
+  is_valid_mbc_string
 };