summaryrefslogtreecommitdiff
path: root/src/make_unicode_fold_data.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/make_unicode_fold_data.py')
-rwxr-xr-xsrc/make_unicode_fold_data.py221
1 files changed, 185 insertions, 36 deletions
diff --git a/src/make_unicode_fold_data.py b/src/make_unicode_fold_data.py
index 55d5b88..0e6c635 100755
--- a/src/make_unicode_fold_data.py
+++ b/src/make_unicode_fold_data.py
@@ -1,7 +1,7 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# make_unicode_fold_data.py
-# Copyright (c) 2016-2019 K.Kosako
+# Copyright (c) 2016-2020 K.Kosako
import sys
import re
@@ -28,6 +28,35 @@ UNFOLDS = {}
TURKISH_UNFOLDS = {}
LOCALE_UNFOLDS = {}
+# License header, written as a C block comment, that this script copies
+# verbatim into its output: output_fold_source() prints it, and the gperf
+# heads built in output_gperf_unfold_key() / output_gperf_fold_key()
+# concatenate it.  The trailing .strip() removes the newlines that surround
+# the text inside the triple-quoted literal.
+COPYRIGHT = '''
+/*-
+ * Copyright (c) 2017-2020 K.Kosako
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+'''.strip()
+
+
class Entry:
def __init__(self, fold):
self.fold = fold
@@ -55,6 +84,11 @@ def form3bytes(x):
x2 = (x>>16) & 0xff
return "\\x%02x\\x%02x\\x%02x" % (x2, x1, x0)
+def enc_len(code, encode):
+    """Return the number of bytes produced by encoding one code point.
+
+    code   -- integer code point.
+    encode -- codec name handed to the character's encode() method
+              (for example 'utf-8').
+
+    NOTE: a codec that emits a byte order mark on every call (plain
+    'utf-16' does) has that mark included in the returned length.
+    """
+    try:
+        to_char = unichr  # Python 2
+    except NameError:
+        to_char = chr     # Python 3 has no unichr; chr covers all code points
+    return len(to_char(code).encode(encode))
+
def check_version_info(s):
m = VERSION_REG.match(s)
if m is not None:
@@ -231,6 +265,8 @@ def output_macros(f, name):
def output_fold_source(f, out_comment):
print >> f, "/* This file was generated by make_unicode_fold_data.py. */"
+ print >> f, COPYRIGHT
+ print >> f, "\n"
print >> f, '#include "regenc.h"'
print >> f, ''
if VERSION_INFO[0] < 0:
@@ -244,42 +280,11 @@ def output_fold_source(f, out_comment):
output_fold_data(f, DataName, out_comment)
-HEAD = '''
-%{
-/* This gperf source file was generated by make_unicode_fold_data.py */
+def output_gperf_unfold_key(f):
+ head = "%{\n/* This gperf source file was generated by make_unicode_fold_data.py */\n\n" + COPYRIGHT + """\
-/*-
- * Copyright (c) 2017-2019 K.Kosako
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-#include <string.h>
-#include "regenc.h"
+#include "regint.h"
%}
-'''.strip()
-
-def output_gperf_unfold_key(f):
- head = HEAD + """\
struct ByUnfoldKey {
OnigCodePoint code;
@@ -299,7 +304,10 @@ struct ByUnfoldKey {
print >> f, '%%'
def output_gperf_fold_key(f, key_len):
- head = HEAD + """\
+ head = "%{\n/* This gperf source file was generated by make_unicode_fold_data.py */\n\n" + COPYRIGHT + """\
+
+#include "regint.h"
+%}
short int
%%
@@ -324,6 +332,138 @@ def output_gperf_source():
with open(GPERF_FOLD_KEY_FILES[i-1], 'w') as f:
output_gperf_fold_key(f, i)
+def unfolds_byte_length_check(encode):
+    """Report unfold entries whose encoded key is longer than their fold.
+
+    Every entry of UNFOLDS is visited in (fold_len, index) order.  The
+    encoded byte length of the unfold code point is compared with the summed
+    encoded byte length of its fold sequence, and one line is written to
+    stderr for each entry where the unfold side is the longer one.
+
+    encode -- codec name passed through to enc_len().
+    """
+    # One-argument key function: tuple parameters in a lambda are
+    # Python 2-only syntax.
+    entries = sorted(UNFOLDS.items(),
+                     key=lambda item: (item[1].fold_len, item[1].index))
+    for unfold, e in entries:
+        key_len = enc_len(unfold, encode)
+        fold_len = sum(enc_len(c, encode) for c in e.fold)
+        if key_len > fold_len:
+            sfolds = ' '.join("0x%06x" % c for c in e.fold)
+            s = "%s byte length: %d > %d: 0x%06x => %s" % (encode, key_len, fold_len, unfold, sfolds)
+            sys.stderr.write(s + "\n")
+
+def double_fold_check():
+    """Report fold code points that are themselves keys of UNFOLDS.
+
+    Entries are visited in (fold_len, index) order.  For each code point in
+    an entry's fold sequence that is also a key of UNFOLDS, one
+    "double folds" line naming both mappings is written to stderr.
+    """
+    # One-argument key function: tuple parameters in a lambda are
+    # Python 2-only syntax.
+    entries = sorted(UNFOLDS.items(),
+                     key=lambda item: (item[1].fold_len, item[1].index))
+    for unfold, e in entries:
+        for code in e.fold:
+            e2 = UNFOLDS.get(code)
+            if e2 is None:
+                continue
+            s = "double folds: 0x%06x => %s, 0x%06x => %s" % (unfold, e.fold, code, e2.fold)
+            sys.stderr.write(s + "\n")
+
+def unfold_is_multi_code_folds_head_check():
+    """Report unfold code points that also start a 2- or 3-code fold.
+
+    Unfolds are visited in (fold_len, index) order.  For each one, a line
+    is written to stderr for every entry of UNFOLDS whose fold has length
+    2 or 3 and begins with that code point; matches among the 2-code folds
+    are listed before matches among the 3-code folds.
+    """
+    items = UNFOLDS.items()
+
+    # Index the multi-code folds by their first code point once, instead of
+    # rescanning all of them for every unfold.  Lists keep the order in
+    # which the entries were met, so the report order is unchanged.
+    heads = {2: {}, 3: {}}
+    for _, e in items:
+        if e.fold_len in heads:
+            heads[e.fold_len].setdefault(e.fold[0], []).append(e.fold)
+
+    entries = sorted(items, key=lambda item: (item[1].fold_len, item[1].index))
+    for unfold, _ in entries:
+        for fold_len in (2, 3):
+            for fold in heads[fold_len].get(unfold, ()):
+                s = "unfold 0x%06x is multi-code fold head in %s" % (unfold, fold)
+                sys.stderr.write(s + "\n")
+
+def make_one_folds(l):
+    """Group the unfolds of single-code folds by their fold code point.
+
+    l -- iterable of (unfold, entry) pairs; entries whose fold_len is not 1
+         are ignored.
+
+    Returns a dict mapping each fold code point to the list of unfold code
+    points that fold to it, in the order they were met in l.
+    """
+    grouped = {}
+    for unfold, e in l:
+        if e.fold_len == 1:
+            grouped.setdefault(e.fold[0], []).append(unfold)
+    return grouped
+
+def make_foldn_heads(l, fold_len, one_folds):
+    """Index the folds of one length by their first code point.
+
+    l         -- iterable of (unfold, entry) pairs.
+    fold_len  -- only entries with exactly this fold_len are indexed.
+    one_folds -- mapping as returned by make_one_folds().
+
+    Returns a dict mapping the first fold code point to a pair
+    (entry, one_folds.get(first code point)).  When several entries share
+    the same first code point, the one met last in l is kept.
+    """
+    heads = {}
+    for _, e in l:
+        if e.fold_len == fold_len:
+            head = e.fold[0]
+            heads[head] = (e, one_folds.get(head))
+    return heads
+
+def _fold_expansion_num(e, one_folds, fold_len):
+    """Count the expansions of an entry whose fold has fold_len codes.
+
+    Each fold position contributes the fold code itself plus every
+    single-code unfold listed for it in one_folds; the per-position counts
+    are multiplied together and the entry's own unfold count is added.
+    """
+    combos = 1
+    for i in range(fold_len):
+        alternatives = one_folds.get(e.fold[i])
+        combos *= 1 + (len(alternatives) if alternatives is not None else 0)
+    return len(e.unfolds) + combos
+
+def fold2_expansion_num(e, one_folds):
+    """Return the expansion count for an entry with a 2-code fold.
+
+    e         -- entry providing .fold (at least 2 codes) and .unfolds.
+    one_folds -- mapping as returned by make_one_folds().
+    """
+    return _fold_expansion_num(e, one_folds, 2)
+
+def fold3_expansion_num(e, one_folds):
+    """Return the expansion count for an entry with a 3-code fold.
+
+    e         -- entry providing .fold (at least 3 codes) and .unfolds.
+    one_folds -- mapping as returned by make_one_folds().
+    """
+    return _fold_expansion_num(e, one_folds, 3)
+
+def get_all_folds_expansion_num(x, one_folds, fold2_heads, fold3_heads):
+    """Return the expansion count for the UNFOLDS entry keyed by x.
+
+    x           -- unfold code point; must be a key of UNFOLDS.
+    one_folds   -- mapping as returned by make_one_folds().
+    fold2_heads -- mapping as returned by make_foldn_heads(..., 2, ...).
+    fold3_heads -- mapping as returned by make_foldn_heads(..., 3, ...).
+
+    For 2- and 3-code folds the result is the matching foldN_expansion_num().
+    For a single-code fold it is the largest of: the entry's unfold count
+    plus one, and the expansion counts of the 2- and 3-code folds headed by
+    that fold code, when such folds exist.
+
+    Raises RuntimeError when fold_len is not 1, 2 or 3.
+    """
+    e = UNFOLDS[x]
+    if e.fold_len == 2:
+        return fold2_expansion_num(e, one_folds)
+    if e.fold_len == 3:
+        return fold3_expansion_num(e, one_folds)
+    if e.fold_len != 1:
+        raise RuntimeError("Invalid fold_len %d" % (e.fold_len))
+
+    head = e.fold[0]
+    candidates = [len(e.unfolds) + 1]  # +1: fold
+    hit = fold2_heads.get(head)
+    if hit is not None:
+        e2, _ = hit
+        candidates.append(fold2_expansion_num(e2, one_folds))
+    hit = fold3_heads.get(head)
+    if hit is not None:
+        e3, _ = hit
+        candidates.append(fold3_expansion_num(e3, one_folds))
+    return max(candidates)
+
+def get_all_folds_expansion_max_num():
+    """Find the largest expansion count over all of UNFOLDS.
+
+    Returns a pair (nmax, max_unfold): the largest value returned by
+    get_all_folds_expansion_num() and the first unfold code point, in
+    (fold_len, index) order, that reaches it.  The result is (0, None) when
+    no entry yields a count above zero.
+    """
+    items = UNFOLDS.items()
+    one_folds = make_one_folds(items)
+    fold2_heads = make_foldn_heads(items, 2, one_folds)
+    fold3_heads = make_foldn_heads(items, 3, one_folds)
+    # One-argument key function: tuple parameters in a lambda are
+    # Python 2-only syntax.
+    entries = sorted(items, key=lambda item: (item[1].fold_len, item[1].index))
+    nmax = 0
+    max_unfold = None
+    for unfold, _ in entries:
+        n = get_all_folds_expansion_num(unfold, one_folds, fold2_heads, fold3_heads)
+        if nmax < n:  # strict: the earliest maximum wins
+            nmax = n
+            max_unfold = unfold
+
+    return (nmax, max_unfold)
## main ##
with open(SOURCE_FILE, 'r') as f:
@@ -335,3 +475,12 @@ out_comment = True
output_fold_source(sys.stdout, out_comment)
output_gperf_source()
+
+# Consistency checks over UNFOLDS; each one reports its findings on stderr.
+# The byte-length checks are left disabled.
+#unfolds_byte_length_check('utf-8')
+#unfolds_byte_length_check('utf-16')
+double_fold_check()
+unfold_is_multi_code_folds_head_check()
+
+# Disabled diagnostic: report the largest expansion count and the code point
+# that produces it.
+#max_num, max_code = get_all_folds_expansion_max_num()
+#max_num -= 1 # remove self
+#print >> sys.stderr, "max expansion: 0x%06x: %d" % (max_code, max_num)