diff options
| author | Jörg Frings-Fürst <debian@jff-webhosting.net> | 2016-05-10 05:15:59 +0200 | 
|---|---|---|
| committer | Jörg Frings-Fürst <debian@jff-webhosting.net> | 2016-05-10 05:15:59 +0200 | 
| commit | e706cbe5496e1829d1ddbe4d5bb0a6728204e510 (patch) | |
| tree | c72d1848ac6aef07703848d0ffbe80f1336a81cd /src/make_unicode_fold_data.py | |
| parent | 69ab3addbc2dbbc90c311b2845cd25a2159435cd (diff) | |
| parent | 5e01a4852b31d537307994248869caf38b4023cc (diff) | |
new upstream release
Diffstat (limited to 'src/make_unicode_fold_data.py')
| -rwxr-xr-x | src/make_unicode_fold_data.py | 306 | 
1 files changed, 306 insertions, 0 deletions
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Generate Unicode case-folding tables from CaseFolding.txt.

Reads SOURCE_FILE, writes a C source file (the ``OnigUnicodeFolds1..3``
arrays) to standard output, and writes four gperf input files: one keyed by
unfolded code point and one per fold length (1, 2 and 3 code points).

The code avoids version-specific syntax so it runs unchanged under both
Python 2 and Python 3.
"""

import sys
import re
import codecs

SOURCE_FILE = 'CaseFolding.txt'
GPERF_UNFOLD_KEY_FILE = 'unicode_unfold_key.gperf'
GPERF_FOLD_KEY_FILES  = ['unicode_fold1_key.gperf', 'unicode_fold2_key.gperf', 'unicode_fold3_key.gperf']


DataName = 'OnigUnicodeFolds'

ENCODING = 'utf-8'

# One data line: "<code>; <status>; <fold1> [<fold2> [<fold3>]]; # <comment>"
LINE_REG = re.compile(r"([0-9A-F]{1,6}); (.); ([0-9A-F]{1,6})(?: ([0-9A-F]{1,6}))?(?: ([0-9A-F]{1,6}))?;(?:\s*#\s*)(.*)")
# Header comment naming the data file, e.g. "# CaseFolding-10.0.0.txt".
# Each version component may have more than one digit.
VERSION_REG  = re.compile(r"#.*-(\d+\.\d+\.\d+)\.txt")

VERSION_INFO = None

# fold key (see fold_key) -> Entry
FOLDS = {}
TURKISH_FOLDS = {}
LOCALE_FOLDS  = {}

# unfolded code point (int) -> Entry
UNFOLDS = {}
TURKISH_UNFOLDS = {}
LOCALE_UNFOLDS  = {}


class Entry(object):
    """One fold target together with every code point that folds to it."""

    def __init__(self, fold):
        self.fold = fold            # list of 1-3 code points (ints)
        self.unfolds = []           # code points that fold to `fold`
        self.fold_len = len(fold)
        self.index = -1             # offset in the emitted array; set by output_data_n1
        self.comment = None         # trailing comment of the first source line seen


def _writeln(f, s=''):
    """Write `s` followed by a newline to the file-like object `f`."""
    f.write(s + "\n")


def fold_key(fold):
    """Return the dictionary key for a fold: 6-digit hex values joined by ':'."""
    return ':'.join("%06x" % i for i in fold)


def form16(x, size):
    """Format `x` as a 0x-prefixed hex literal, left-padded with spaces to `size`.

    Values above 0xffff use six hex digits, all others four.
    """
    form = "0x%06x" if x > 0xffff else "0x%04x"
    s = form % x
    rem = size - len(s)
    if rem > 0:
        s = ' ' * rem + s

    return s


def form3bytes(x):
    """Return the low 24 bits of `x` as three C ``\\xNN`` escapes, high byte first."""
    x0 = x & 0xff
    x1 = (x >> 8) & 0xff
    x2 = (x >> 16) & 0xff
    return "\\x%02x\\x%02x\\x%02x" % (x2, x1, x0)


def check_version_info(s):
    """Record the version from comment line `s` in VERSION_INFO (first match wins)."""
    global VERSION_INFO
    if VERSION_INFO is None:
        m = VERSION_REG.match(s)
        if m is not None:
            VERSION_INFO = m.group(1)


def parse_line(s):
    """Parse one stripped line of the source file and record its mapping.

    Returns True when a mapping was recorded; False for empty lines, comment
    lines and status 'S' lines, which are skipped.  A non-comment line that
    does not match LINE_REG is echoed to stderr and terminates the program
    via sys.exit(-1).
    """
    if not s:
        return False
    if s[0] == '#':
        check_version_info(s)
        return False

    m = LINE_REG.match(s)
    if m is None:
        # Echo the offending line as-is; re-encoding it would fail on
        # non-ASCII byte strings under Python 2.
        sys.stderr.write(s + "\n")
        sys.exit(-1)

    s_unfold = m.group(1)
    s_type   = m.group(2)
    s_fold   = m.group(3)
    comment  = m.group(6)

    if s_type == 'S':
        return False

    unfold = int(s_unfold, 16)
    fold = [int(s_fold, 16)]
    if m.group(4) is not None:
        fold.append(int(m.group(4), 16))
        if m.group(5) is not None:
            fold.append(int(m.group(5), 16))

    # Status 'T' lines are kept apart from all other mappings.
    if s_type == 'T':
        dic   = TURKISH_FOLDS
        undic = TURKISH_UNFOLDS
    else:
        dic   = FOLDS
        undic = UNFOLDS

    key = fold_key(fold)
    e = dic.get(key)
    if e is None:
        e = Entry(fold)
        e.comment = comment
        dic[key] = e

    e.unfolds.append(unfold)

    if undic.get(unfold) is not None:
        # Warn only; the later mapping replaces the earlier one below.
        _writeln(sys.stderr, "unfold dup: 0x%04x %s\n" % (unfold, s_type))
    undic[unfold] = e

    return True


def parse_file(f):
    """Feed every line of the open text file `f` to parse_line."""
    for line in f:
        parse_line(line.strip())


def make_locale():
    """Move mappings that also have a 'T' variant into the LOCALE_* tables.

    For each code point present in TURKISH_UNFOLDS that also has a regular
    mapping in UNFOLDS, the regular mapping is removed from UNFOLDS (and from
    its FOLDS entry) and re-registered in LOCALE_FOLDS / LOCALE_UNFOLDS.
    """
    for unfold in TURKISH_UNFOLDS:
        e = UNFOLDS.get(unfold)
        if e is None:
            continue

        fkey = fold_key(e.fold)
        if len(e.unfolds) == 1:
            # The entry holds only this code point: move it wholesale.
            del FOLDS[fkey]
        else:
            # The entry is shared: detach this code point into a fresh entry.
            e.unfolds.remove(unfold)
            e = Entry(e.fold)
            e.unfolds.append(unfold)

        LOCALE_FOLDS[fkey] = e
        LOCALE_UNFOLDS[unfold] = e
        del UNFOLDS[unfold]


def output_typedef(f):
    """Write the OnigCodePoint typedef, followed by a blank line, to `f`."""
    s = """\
typedef unsigned long OnigCodePoint;
"""
    _writeln(f, s)


def divide_by_fold_len(d):
    """Split the (key, Entry) items of `d` by fold length.

    Returns a 3-tuple of lists for fold lengths 1, 2 and 3, each sorted by key.
    """
    def select(n):
        items = [(k, e) for k, e in d.items() if e.fold_len == n]
        return sorted(items, key=lambda item: item[0])

    return (select(1), select(2), select(3))


def output_comment(f, s):
    """Write `s` to `f` wrapped in a C comment, preceded by a space."""
    f.write(" /* %s */" % s)


def output_data_n1(f, n, fn, c, out_comment):
    """Write one array row per entry in `fn` and assign each entry's index.

    f           -- output file object
    n           -- fold length of every entry in `fn`
    fn          -- list of (key, Entry) pairs
    c           -- array offset of the first row
    out_comment -- when true, include each entry's comment

    Each row holds the `n` fold code points, the number of unfolds, then the
    unfolds themselves.  Returns the array offset following the last row.
    """
    for k, e in fn:
        e.index = c
        if out_comment and n > 1 and e.comment is not None:
            # Multi-code-point folds get their comment on a line of its own.
            output_comment(f, e.comment)
            _writeln(f, '')

        f.write(' ')
        f.write("/*%4d*/ " % c)
        for i in range(0, n):
            f.write(" %s," % form16(e.fold[i], 8))

        usize = len(e.unfolds)
        f.write("  %d," % usize)
        for u in e.unfolds:
            f.write(" %s," % form16(u, 8))

        if out_comment and n == 1 and e.comment is not None:
            # Trailing comment, shortened to 35 characters at most.
            if len(e.comment) < 35:
                s = e.comment
            else:
                s = e.comment[0:33] + '..'

            output_comment(f, s)

        f.write("\n")
        c += n + 1 + usize

    return c


def output_data_n(f, name, n, fn, lfn, out_comment):
    """Write the C array `<name><n>`: regular entries `fn`, then locale entries `lfn`.

    The two *_END_INDEX defines are written inside the initializer, directly
    after the rows they delimit.
    """
    _writeln(f, "OnigCodePoint %s%d[] = {" % (name, n))
    c = 0
    c = output_data_n1(f, n,  fn, c, out_comment)
    _writeln(f, "#define FOLDS%d_NORMAL_END_INDEX   %d" % (n, c))
    _writeln(f, " /* ----- LOCALE ----- */")
    c = output_data_n1(f, n, lfn, c, out_comment)
    _writeln(f, "#define FOLDS%d_END_INDEX   %d" % (n, c))
    _writeln(f, "};")


def output_fold_data(f, name, out_comment):
    """Write the three fold arrays (fold lengths 1, 2 and 3) to `f`."""
    f1, f2, f3 = divide_by_fold_len(FOLDS)
    lf1, lf2, lf3 = divide_by_fold_len(LOCALE_FOLDS)

    output_data_n(f, name, 1, f1, lf1, out_comment)
    _writeln(f, '')
    output_data_n(f, name, 2, f2, lf2, out_comment)
    _writeln(f, '')
    output_data_n(f, name, 3, f3, lf3, out_comment)
    _writeln(f, '')


def output_macros(f, name):
    """Write C accessor macros for the three `<name>N` arrays to `f`.

    For table N a row at offset i is laid out as N fold code points, the
    unfold count at i+N, then the unfolds; every macro for table N therefore
    indexes `<name>N`.
    """
    _writeln(f, "#define FOLDS1_FOLD(i)         (%s1 + (i))" % name)
    _writeln(f, "#define FOLDS2_FOLD(i)         (%s2 + (i))" % name)
    _writeln(f, "#define FOLDS3_FOLD(i)         (%s3 + (i))" % name)

    _writeln(f, "#define FOLDS1_UNFOLDS_NUM(i)  %s1[(i)+1]" % name)
    _writeln(f, "#define FOLDS2_UNFOLDS_NUM(i)  %s2[(i)+2]" % name)
    _writeln(f, "#define FOLDS3_UNFOLDS_NUM(i)  %s3[(i)+3]" % name)

    _writeln(f, "#define FOLDS1_UNFOLDS(i)      (%s1 + (i) + 2)" % name)
    _writeln(f, "#define FOLDS2_UNFOLDS(i)      (%s2 + (i) + 3)" % name)
    _writeln(f, "#define FOLDS3_UNFOLDS(i)      (%s3 + (i) + 4)" % name)

    # The count must come from the same table as the row being skipped.
    _writeln(f, "#define FOLDS1_NEXT_INDEX(i)   ((i) + 2 + %s1[(i)+1])" % name)
    _writeln(f, "#define FOLDS2_NEXT_INDEX(i)   ((i) + 3 + %s2[(i)+2])" % name)
    _writeln(f, "#define FOLDS3_NEXT_INDEX(i)   ((i) + 4 + %s3[(i)+3])" % name)


def output_fold_source(f, out_comment):
    """Write the complete generated C source to `f`.

    The CASEFOLD_VERSION define is written only when a version line was seen
    while parsing.
    """
    _writeln(f, "/* This file was generated by make_unicode_fold_data.py. */")
    _writeln(f, '#include "regenc.h"')
    _writeln(f, '')
    if VERSION_INFO is not None:
        _writeln(f, "#define CASEFOLD_VERSION  %s" % re.sub(r'[\.-]', '_', VERSION_INFO))
        _writeln(f, '')
    # NOTE: output_macros(f, DataName) is intentionally not called here.
    _writeln(f, '')
    # NOTE: output_typedef(f) is intentionally not called here.
    output_fold_data(f, DataName, out_comment)


def output_gperf_unfold_key(f):
    """Write the gperf input keyed by unfolded code point to `f`.

    Side effect: merges LOCALE_UNFOLDS into UNFOLDS.  Relies on Entry.index
    having been assigned by a preceding output_fold_data call.
    """
    head = """\
%{
/* This gperf source file was generated by make_unicode_fold_data.py */
#include <string.h>
#include "regenc.h"
%}
struct ByUnfoldKey {
  OnigCodePoint code;
  int   index;
  int   fold_len;
};
%%
"""
    f.write(head)
    UNFOLDS.update(LOCALE_UNFOLDS)
    sl = sorted(UNFOLDS.items(),
                key=lambda item: (item[1].fold_len, item[1].index))
    for k, e in sl:
        f.write('"%s", /*0x%04x*/ %4d, %d\n' %
                (form3bytes(k), k, e.index, e.fold_len))

    _writeln(f, '%%')


def output_gperf_fold_key(f, key_len):
    """Write the gperf input for folds of `key_len` code points to `f`.

    Keys are the fold code points as concatenated 3-byte escapes; the value
    is the entry's array index.
    """
    head = """\
%{
/* This gperf source file was generated by make_unicode_fold_data.py */
#include <string.h>
#include "regenc.h"
%}
int
%%
"""
    f.write(head)
    selected = [(k, e) for k, e in FOLDS.items() if e.fold_len == key_len]
    sl = sorted(selected, key=lambda item: item[1].index)
    for k, e in sl:
        skey = ''.join(form3bytes(i) for i in e.fold)
        f.write('"%s", %4d\n' % (skey, e.index))

    _writeln(f, '%%')


def output_gperf_source():
    """Write all four gperf input files.

    Side effect: merges LOCALE_FOLDS into FOLDS before the per-length fold
    key files are written.
    """
    with open(GPERF_UNFOLD_KEY_FILE, 'w') as f:
        output_gperf_unfold_key(f)

    FOLDS.update(LOCALE_FOLDS)

    for i in range(1, 4):
        with open(GPERF_FOLD_KEY_FILES[i-1], 'w') as f:
            output_gperf_fold_key(f, i)


def main():
    """Parse SOURCE_FILE, print the C source to stdout, write the gperf files."""
    with open(SOURCE_FILE, 'r') as f:
        parse_file(f)

    make_locale()

    out_comment = True
    # Must run before output_gperf_source: it assigns the Entry.index values.
    output_fold_source(sys.stdout, out_comment)

    output_gperf_source()


## main ##
if __name__ == '__main__':
    main()
