diff options
Diffstat (limited to 'src/make_unicode_fold_data.py')
-rwxr-xr-x | src/make_unicode_fold_data.py | 306 |
1 files changed, 306 insertions, 0 deletions
diff --git a/src/make_unicode_fold_data.py b/src/make_unicode_fold_data.py new file mode 100755 index 0000000..3f7d416 --- /dev/null +++ b/src/make_unicode_fold_data.py @@ -0,0 +1,306 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +import sys +import re +import codecs + +SOURCE_FILE = 'CaseFolding.txt' +GPERF_UNFOLD_KEY_FILE = 'unicode_unfold_key.gperf' +GPERF_FOLD_KEY_FILES = ['unicode_fold1_key.gperf', 'unicode_fold2_key.gperf', 'unicode_fold3_key.gperf'] + + +DataName = 'OnigUnicodeFolds' + +ENCODING = 'utf-8' + +LINE_REG = re.compile("([0-9A-F]{1,6}); (.); ([0-9A-F]{1,6})(?: ([0-9A-F]{1,6}))?(?: ([0-9A-F]{1,6}))?;(?:\s*#\s*)(.*)") +VERSION_REG = re.compile("#.*-(\d\.\d\.\d)\.txt") + +VERSION_INFO = None + +FOLDS = {} +TURKISH_FOLDS = {} +LOCALE_FOLDS = {} + +UNFOLDS = {} +TURKISH_UNFOLDS = {} +LOCALE_UNFOLDS = {} + +class Entry: + def __init__(self, fold): + self.fold = fold + self.unfolds = [] + self.fold_len = len(fold) + self.index = -1 + self.comment = None + +def fold_key(fold): + sfold = map(lambda i: "%06x" % i, fold) + return ':'.join(sfold) + +def form16(x, size): + form = "0x%06x" if x > 0xffff else "0x%04x" + s = form % x + rem = size - len(s) + if rem > 0: + s = ' ' * rem + s + + return s + +def form3bytes(x): + x0 = x & 0xff + x1 = (x>>8) & 0xff + x2 = (x>>16) & 0xff + return "\\x%02x\\x%02x\\x%02x" % (x2, x1, x0) + +def check_version_info(s): + global VERSION_INFO + if VERSION_INFO is None: + m = VERSION_REG.match(s) + if m is not None: + VERSION_INFO = m.group(1) + +def parse_line(s): + if len(s) == 0: + return False + if s[0] == '#': + check_version_info(s) + return False + + m = LINE_REG.match(s) + if m is None: + print >> sys.stderr, s.encode(ENCODING) + sys.exit(-1) + + s_unfold = m.group(1) + s_type = m.group(2) + s_fold = m.group(3) + comment = m.group(6) + + if s_type == 'S': + return False; + + unfold = int(s_unfold, 16) + f1 = int(s_fold, 16) + fold = [f1] + if m.group(4) is not None: + f2 = int(m.group(4), 16) + fold.append(f2) + if m.group(5) is not None: + f3 = int(m.group(5), 16) + fold.append(f3) + + if s_type == 'T': + dic = TURKISH_FOLDS + undic = TURKISH_UNFOLDS + else: + dic = FOLDS + undic = UNFOLDS + + key = fold_key(fold) + e = dic.get(key, None) + if e is None: + e = Entry(fold) + e.comment = comment + dic[key] = e + + e.unfolds.append(unfold) + + if undic.get(unfold, None) is not None: + print >> sys.stderr, ("unfold dup: 0x%04x %s\n" % (unfold, s_type)) + undic[unfold] = e + + return True + +def parse_file(f): + line = f.readline() + while line: + s = line.strip() + parse_line(s) + line = f.readline() + +def make_locale(): + for unfold, te in TURKISH_UNFOLDS.items(): + e = UNFOLDS.get(unfold, None) + if e is None: + continue + + fkey = fold_key(e.fold) + if len(e.unfolds) == 1: + del FOLDS[fkey] + else: + e.unfolds.remove(unfold) + e = Entry(e.fold) + e.unfolds.append(unfold) + + LOCALE_FOLDS[fkey] = e + LOCALE_UNFOLDS[unfold] = e + del UNFOLDS[unfold] + +def output_typedef(f): + s = """\ +typedef unsigned long OnigCodePoint; +""" + print >> f, s + +def divide_by_fold_len(d): + l = d.items() + l1 = filter(lambda (k,e):e.fold_len == 1, l) + l2 = filter(lambda (k,e):e.fold_len == 2, l) + l3 = filter(lambda (k,e):e.fold_len == 3, l) + sl1 = sorted(l1, key=lambda (k,e):k) + sl2 = sorted(l2, key=lambda (k,e):k) + sl3 = sorted(l3, key=lambda (k,e):k) + return (sl1, sl2, sl3) + +def output_comment(f, s): + f.write(" /* %s */" % s) + +def output_data_n1(f, n, fn, c, out_comment): + for k, e in fn: + e.index = c + if out_comment and n > 1 and e.comment is not None: + output_comment(f, e.comment) + print >> f, '' + + f.write(' ') + f.write("/*%4d*/ " % c) + for i in range(0, n): + s = form16(e.fold[i], 8) + f.write(" %s," % s) + + usize = len(e.unfolds) + f.write(" %d," % usize) + for u in e.unfolds: + s = form16(u, 8) + f.write(" %s," % s) + + if out_comment and n == 1 and e.comment is not None: + if len(e.comment) < 35: + s = e.comment + else: + s = e.comment[0:33] + '..' + + output_comment(f, s) + + f.write("\n") + c += n + 1 + usize + + return c + +def output_data_n(f, name, n, fn, lfn, out_comment): + print >> f, "OnigCodePoint %s%d[] = {" % (name, n) + c = 0 + c = output_data_n1(f, n, fn, c, out_comment) + print >> f, "#define FOLDS%d_NORMAL_END_INDEX %d" % (n, c) + print >> f, " /* ----- LOCALE ----- */" + c = output_data_n1(f, n, lfn, c, out_comment) + print >> f, "#define FOLDS%d_END_INDEX %d" % (n, c) + print >> f, "};" + +def output_fold_data(f, name, out_comment): + f1, f2, f3 = divide_by_fold_len(FOLDS) + lf1, lf2, lf3 = divide_by_fold_len(LOCALE_FOLDS) + + output_data_n(f, name, 1, f1, lf1, out_comment) + print >> f, '' + output_data_n(f, name, 2, f2, lf2, out_comment) + print >> f, '' + output_data_n(f, name, 3, f3, lf3, out_comment) + print >> f, '' + +def output_macros(f, name): + print >> f, "#define FOLDS1_FOLD(i) (%s1 + (i))" % name + print >> f, "#define FOLDS2_FOLD(i) (%s2 + (i))" % name + print >> f, "#define FOLDS3_FOLD(i) (%s3 + (i))" % name + + print >> f, "#define FOLDS1_UNFOLDS_NUM(i) %s1[(i)+1]" % name + print >> f, "#define FOLDS2_UNFOLDS_NUM(i) %s2[(i)+2]" % name + print >> f, "#define FOLDS3_UNFOLDS_NUM(i) %s3[(i)+3]" % name + + print >> f, "#define FOLDS1_UNFOLDS(i) (%s1 + (i) + 2)" % name + print >> f, "#define FOLDS2_UNFOLDS(i) (%s2 + (i) + 3)" % name + print >> f, "#define FOLDS3_UNFOLDS(i) (%s3 + (i) + 4)" % name + + print >> f, "#define FOLDS1_NEXT_INDEX(i) ((i) + 2 + %s1[(i)+1])" % name + print >> f, "#define FOLDS2_NEXT_INDEX(i) ((i) + 3 + %s1[(i)+2])" % name + print >> f, "#define FOLDS3_NEXT_INDEX(i) ((i) + 4 + %s1[(i)+3])" % name + +def output_fold_source(f, out_comment): + print >> f, "/* This file was generated by make_unicode_fold_data.py. */" + print >> f, '#include "regenc.h"' + print >> f, '' + if VERSION_INFO is not None: + print "#define CASEFOLD_VERSION %s" % re.sub(r'[\.-]', '_', VERSION_INFO) + print '' + #output_macros(f, DataName) + print >> f, '' + #output_typedef(f) + output_fold_data(f, DataName, out_comment) + +def output_gperf_unfold_key(f): + head = """\ +%{ +/* This gperf source file was generated by make_unicode_fold_data.py */ +#include <string.h> +#include "regenc.h" +%} +struct ByUnfoldKey { + OnigCodePoint code; + int index; + int fold_len; +}; +%% +""" + f.write(head) + UNFOLDS.update(LOCALE_UNFOLDS) + l = UNFOLDS.items() + sl = sorted(l, key=lambda (k,e):(e.fold_len, e.index)) + for k, e in sl: + f.write('"%s", /*0x%04x*/ %4d, %d\n' % + (form3bytes(k), k, e.index, e.fold_len)) + + print >> f, '%%' + +def output_gperf_fold_key(f, key_len): + head = """\ +%{ +/* This gperf source file was generated by make_unicode_fold_data.py */ +#include <string.h> +#include "regenc.h" +%} +int +%% +""" + f.write(head) + l = FOLDS.items() + l = filter(lambda (k,e):e.fold_len == key_len, l) + sl = sorted(l, key=lambda (k,e):e.index) + for k, e in sl: + skey = ''.join(map(lambda i: form3bytes(i), e.fold)) + f.write('"%s", %4d\n' % (skey, e.index)) + + print >> f, '%%' + +def output_gperf_source(): + with open(GPERF_UNFOLD_KEY_FILE, 'w') as f: + output_gperf_unfold_key(f) + + FOLDS.update(LOCALE_FOLDS) + + for i in range(1, 4): + with open(GPERF_FOLD_KEY_FILES[i-1], 'w') as f: + output_gperf_fold_key(f, i) + + +## main ## +with open(SOURCE_FILE, 'r') as f: + parse_file(f) + +make_locale() + +out_comment = True +output_fold_source(sys.stdout, out_comment) + +output_gperf_source() |