diff options
Diffstat (limited to 'src/make_unicode_fold_data.py')
-rwxr-xr-x | src/make_unicode_fold_data.py | 221 |
1 files changed, 185 insertions, 36 deletions
diff --git a/src/make_unicode_fold_data.py b/src/make_unicode_fold_data.py index 55d5b88..0e6c635 100755 --- a/src/make_unicode_fold_data.py +++ b/src/make_unicode_fold_data.py @@ -1,7 +1,7 @@ #!/usr/bin/python # -*- coding: utf-8 -*- # make_unicode_fold_data.py -# Copyright (c) 2016-2019 K.Kosako +# Copyright (c) 2016-2020 K.Kosako import sys import re @@ -28,6 +28,35 @@ UNFOLDS = {} TURKISH_UNFOLDS = {} LOCALE_UNFOLDS = {} +COPYRIGHT = ''' +/*- + * Copyright (c) 2017-2020 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +'''.strip() + + class Entry: def __init__(self, fold): self.fold = fold @@ -55,6 +84,11 @@ def form3bytes(x): x2 = (x>>16) & 0xff return "\\x%02x\\x%02x\\x%02x" % (x2, x1, x0) +def enc_len(code, encode): + u = unichr(code) + s = u.encode(encode) + return len(s) + def check_version_info(s): m = VERSION_REG.match(s) if m is not None: @@ -231,6 +265,8 @@ def output_macros(f, name): def output_fold_source(f, out_comment): print >> f, "/* This file was generated by make_unicode_fold_data.py. */" + print >> f, COPYRIGHT + print >> f, "\n" print >> f, '#include "regenc.h"' print >> f, '' if VERSION_INFO[0] < 0: @@ -244,42 +280,11 @@ def output_fold_source(f, out_comment): output_fold_data(f, DataName, out_comment) -HEAD = ''' -%{ -/* This gperf source file was generated by make_unicode_fold_data.py */ +def output_gperf_unfold_key(f): + head = "%{\n/* This gperf source file was generated by make_unicode_fold_data.py */\n\n" + COPYRIGHT + """\ -/*- - * Copyright (c) 2017-2019 K.Kosako - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ -#include <string.h> -#include "regenc.h" +#include "regint.h" %} -'''.strip() - -def output_gperf_unfold_key(f): - head = HEAD + """\ struct ByUnfoldKey { OnigCodePoint code; @@ -299,7 +304,10 @@ struct ByUnfoldKey { print >> f, '%%' def output_gperf_fold_key(f, key_len): - head = HEAD + """\ + head = "%{\n/* This gperf source file was generated by make_unicode_fold_data.py */\n\n" + COPYRIGHT + """\ + +#include "regint.h" +%} short int %% @@ -324,6 +332,138 @@ def output_gperf_source(): with open(GPERF_FOLD_KEY_FILES[i-1], 'w') as f: output_gperf_fold_key(f, i) +def unfolds_byte_length_check(encode): + l = UNFOLDS.items() + sl = sorted(l, key=lambda (k,e):(e.fold_len, e.index)) + for unfold, e in sl: + key_len = enc_len(unfold, encode) + fold_len = sum(map(lambda c: enc_len(c, encode), e.fold)) + if key_len > fold_len: + sfolds = ' '.join(map(lambda c: "0x%06x" % c, e.fold)) + s = "%s byte length: %d > %d: 0x%06x => %s" % (encode, key_len, fold_len, unfold, sfolds) + print >> sys.stderr, s + +def double_fold_check(): + l = UNFOLDS.items() + sl = sorted(l, key=lambda (k,e):(e.fold_len, e.index)) + for unfold, e in sl: + for f in e.fold: + #print >> sys.stderr, ("check 0x%06x" % f) + e2 = UNFOLDS.get(f) + if e2 is not None: + s = "double folds: 0x%06x => %s, 0x%06x => %s" % (unfold, e.fold, f, e2.fold) + print >> sys.stderr, s + +def unfold_is_multi_code_folds_head_check(): + l = UNFOLDS.items() + l2 = filter(lambda (k,e):e.fold_len == 2, l) + l3 = filter(lambda (k,e):e.fold_len == 3, l) + sl = sorted(l, key=lambda (k,e):(e.fold_len, e.index)) + for unfold, _ in sl: + for k, e in l2: + if e.fold[0] == unfold: + s = "unfold 0x%06x is multi-code fold head in %s" % (unfold, e.fold) + print >> sys.stderr, s + for k, e in l3: + if e.fold[0] == unfold: + s = "unfold 0x%06x is multi-code fold head in %s" % (unfold, e.fold) + print >> sys.stderr, s + +def make_one_folds(l): + h = {} + for unfold, e in l: + if e.fold_len != 1: + continue + fold = e.fold[0] + unfolds = h.get(fold) + if unfolds is None: + unfolds = [unfold] + h[fold] = unfolds + else: + unfolds.append(unfold) + + return h + +def make_foldn_heads(l, fold_len, one_folds): + h = {} + for unfold, e in l: + if e.fold_len != fold_len: + continue + unfolds = one_folds.get(e.fold[0]) + h[e.fold[0]] = (e, unfolds) + + return h + +def fold2_expansion_num(e, one_folds): + n = len(e.unfolds) + n0 = 1 + u0 = one_folds.get(e.fold[0]) + if u0 is not None: + n0 += len(u0) + n1 = 1 + u1 = one_folds.get(e.fold[1]) + if u1 is not None: + n1 += len(u1) + n += (n0 * n1) + return n + +def fold3_expansion_num(e, one_folds): + n = len(e.unfolds) + n0 = 1 + u0 = one_folds.get(e.fold[0]) + if u0 is not None: + n0 += len(u0) + n1 = 1 + u1 = one_folds.get(e.fold[1]) + if u1 is not None: + n1 += len(u1) + n2 = 1 + u2 = one_folds.get(e.fold[2]) + if u2 is not None: + n2 += len(u2) + n += (n0 * n1 * n2) + return n + +def get_all_folds_expansion_num(x, one_folds, fold2_heads, fold3_heads): + e = UNFOLDS[x] + n = 0 + if e.fold_len == 1: + n1 = len(e.unfolds) + 1 # +1: fold + fx = e.fold[0] + r = fold2_heads.get(fx) + n2 = n3 = 0 + if r is not None: + e2, _ = r + n2 = fold2_expansion_num(e2, one_folds) + r = fold3_heads.get(fx) + if r is not None: + e3, _ = r + n3 = fold3_expansion_num(e3, one_folds) + n = max(n1, n2, n3) + elif e.fold_len == 2: + n = fold2_expansion_num(e, one_folds) + elif e.fold_len == 3: + n = fold3_expansion_num(e, one_folds) + else: + raise RuntimeError("Invalid fold_len %d" % (e.fold_len)) + + return n + +def get_all_folds_expansion_max_num(): + l = UNFOLDS.items() + one_folds = make_one_folds(l) + fold2_heads = make_foldn_heads(l, 2, one_folds) + fold3_heads = make_foldn_heads(l, 3, one_folds) + sl = sorted(l, key=lambda (k,e):(e.fold_len, e.index)) + nmax = 0 + max_unfold = None + for unfold, e in sl: + n = get_all_folds_expansion_num(unfold, one_folds, fold2_heads, fold3_heads) + if nmax < n: + nmax = n + max_unfold = unfold + + return (nmax, max_unfold) ## main ## with open(SOURCE_FILE, 'r') as f: @@ -335,3 +475,12 @@ out_comment = True output_fold_source(sys.stdout, out_comment) output_gperf_source() + +#unfolds_byte_length_check('utf-8') +#unfolds_byte_length_check('utf-16') +double_fold_check() +unfold_is_multi_code_folds_head_check() + +#max_num, max_code = get_all_folds_expansion_max_num() +#max_num -= 1 # remove self +#print >> sys.stderr, "max expansion: 0x%06x: %d" % (max_code, max_num) |