diff options
| author | Jörg Frings-Fürst <debian@jff-webhosting.net> | 2016-05-10 05:15:59 +0200 | 
|---|---|---|
| committer | Jörg Frings-Fürst <debian@jff-webhosting.net> | 2016-05-10 05:15:59 +0200 | 
| commit | e706cbe5496e1829d1ddbe4d5bb0a6728204e510 (patch) | |
| tree | c72d1848ac6aef07703848d0ffbe80f1336a81cd /src/make_unicode_property_data.py | |
| parent | 69ab3addbc2dbbc90c311b2845cd25a2159435cd (diff) | |
| parent | 5e01a4852b31d537307994248869caf38b4023cc (diff) | |
new upstream release
Diffstat (limited to 'src/make_unicode_property_data.py')
| -rwxr-xr-x | src/make_unicode_property_data.py | 545 | 
1 files changed, 545 insertions, 0 deletions
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Generate Unicode property code-range tables (make_unicode_property_data.py).

The script reads UnicodeData.txt, DerivedCoreProperties.txt, Scripts.txt,
PropList.txt, PropertyAliases.txt, PropertyValueAliases.txt and Blocks.txt
from the current directory and prints C ``OnigCodePoint`` range tables plus
a ``%%``-delimited "name, index" list on stdout.  Unless ``-posix`` is
given it also writes the list of property names to ``UNICODE_PROPERTIES``.

All ``print`` calls take a single pre-formatted string so that the code
behaves identically under Python 2 and Python 3.
"""

import sys
import re

# Property names that are emitted first and always (POSIX bracket classes).
POSIX_LIST = [
    'NEWLINE', 'Alpha', 'Blank', 'Cntrl', 'Digit', 'Graph', 'Lower',
    'Print', 'Punct', 'Space', 'Upper', 'XDigit', 'Word', 'Alnum', 'ASCII'
]

MAX_CODE_POINT = 0x10ffff

# "<..., First>" / "<..., Last>" name fields of UnicodeData.txt range pairs.
UD_FIRST_REG = re.compile(r"<.+,\s*First>")
UD_LAST_REG  = re.compile(r"<.+,\s*Last>")
# "# Total code points:" trailer that closes one property section.
PR_TOTAL_REG = re.compile(r"#\s*Total\s+code\s+points:")
# "XXXX[..YYYY] ; Name" data line; the ".." separator is matched literally.
PR_LINE_REG  = re.compile(r"([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\w+)")
PA_LINE_REG  = re.compile(r"(\w+)\s*;\s*(\w+)")
PVA_LINE_REG = re.compile(r"(sc|gc)\s*;\s*(\w+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
BL_LINE_REG  = re.compile(r"([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)\s*;\s*(.*)")
# "# <File>-<major>.<minor>.<update>.txt" header; multi-digit parts allowed
# so that versions such as 10.0.0 are recognised as well.
VERSION_REG  = re.compile(r"#\s*.*-(\d+\.\d+\.\d+)\.txt")

VERSION_INFO = None
DIC  = { }
KDIC = { }
PropIndex = { }
PROPERTY_NAME_MAX_LEN = 0


def normalize_prop_name(name):
    """Return *name* lower-cased with spaces and underscores removed."""
    return re.sub(r'[ _]', '', name).lower()


def fix_block_name(name):
    """Return the block identifier: 'In_' + *name* with runs of '-'/' ' as '_'."""
    return 'In_' + re.sub(r'[- ]+', '_', name)


def check_version_info(s):
    """Set the global VERSION_INFO when *s* matches VERSION_REG."""
    global VERSION_INFO
    m = VERSION_REG.match(s)
    if m is not None:
        VERSION_INFO = m.group(1)


def print_ranges(ranges):
    """Print every (start, end) pair as hex, followed by the pair count."""
    for (start, end) in ranges:
        print("0x%06x, 0x%06x" % (start, end))

    print(len(ranges))


def print_prop_and_index(prop, i):
    """Print one 'name, index' line and record the index in PropIndex."""
    print("%-35s %3d" % (prop + ',', i))
    PropIndex[prop] = i


# Maps an already printed property name to its range list, so that a later
# property with identical ranges can be emitted as a #define alias.
print_cache = { }


def print_property(prop, data, desc):
    """Print the C table ``CR_<prop>`` for the range list *data*.

    If an earlier property was printed with an equal range list, only a
    ``#define CR_<prop> CR_<earlier>`` line is emitted instead of a table.
    *desc* is copied into the leading comment.
    """
    print('')
    print("/* PROPERTY: '%s': %s */" % (prop, desc))

    prev_prop = dic_find_by_value(print_cache, data)
    if prev_prop is not None:
        print("#define CR_%s CR_%s" % (prop, prev_prop))
        return

    print_cache[prop] = data
    print("static const OnigCodePoint")
    print("CR_%s[] = { %d," % (prop, len(data)))
    for (start, end) in data:
        print("0x%04x, 0x%04x," % (start, end))

    print("}; /* END of CR_%s */" % prop)


def dic_find_by_value(dic, v):
    """Return a key of *dic* whose value equals *v*, or None if there is none."""
    for key, val in dic.items():
        if val == v:
            return key

    return None


def normalize_ranges(in_ranges, sort=False):
    """Merge overlapping or adjacent (start, end) ranges into a new list.

    When *sort* is true the input is sorted first; otherwise it is walked
    in the given order (callers are expected to pass ordered data then).
    """
    if sort:
        ranges = sorted(in_ranges)
    else:
        ranges = in_ranges

    r = []
    prev = None
    for (start, end) in ranges:
        # Explicit None test: Python 3 refuses to order None against int,
        # Python 2 silently treated None as smaller than every number.
        if prev is not None and prev >= start - 1:
            (pstart, pend) = r.pop()
            end = max(pend, end)
            start = pstart

        r.append((start, end))
        prev = end

    return r


def inverse_ranges(in_ranges):
    """Return the code points in [0, MAX_CODE_POINT] not covered by *in_ranges*.

    NOTE(review): the walk assumes *in_ranges* is sorted and non-overlapping.
    """
    r = []
    prev = 0x000000
    for (start, end) in in_ranges:
        if prev < start:
            r.append((prev, start - 1))

        prev = end + 1

    # '<=' so that a gap consisting only of MAX_CODE_POINT is not lost.
    if prev <= MAX_CODE_POINT:
        r.append((prev, MAX_CODE_POINT))

    return r


def add_ranges(r1, r2):
    """Return the normalized union of two range lists."""
    return normalize_ranges(r1 + r2, True)


def sub_one_range(one_range, rs):
    """Return the pieces of *one_range* that remain after removing *rs*.

    NOTE(review): *rs* is walked once, front to back, so it is presumably
    expected to be sorted and non-overlapping.
    """
    r = []
    (s1, e1) = one_range
    for (s2, e2) in rs:
        if s1 <= s2 <= e1:
            # Subtrahend starts inside the remainder: keep the part before it.
            if s2 > s1:
                r.append((s1, s2 - 1))
            if e2 >= e1:
                return r

            s1 = e2 + 1
        elif s2 < s1 and e2 >= s1:
            # Subtrahend starts before the remainder and reaches into it.
            if e2 < e1:
                s1 = e2 + 1
            else:
                return r

    r.append((s1, e1))
    return r


def sub_ranges(r1, r2):
    """Return *r1* with every range of *r2* removed."""
    r = []
    for one_range in r1:
        r.extend(sub_one_range(one_range, r2))

    return r


def add_ranges_in_dic(dic):
    """Return the normalized union of all range lists stored in *dic*."""
    r = []
    for v in dic.values():
        r.extend(v)

    return normalize_ranges(r, True)


def normalize_ranges_in_dic(dic, sort=False):
    """Replace every value of *dic* by its normalized range list (in place)."""
    # Iterate over a snapshot of the keys because values are reassigned.
    for k in list(dic):
        dic[k] = normalize_ranges(dic[k], sort)

def merge_dic(to_dic, from_dic):
    """Copy *from_dic* into *to_dic*, reporting shared keys on stderr.

    Colliding keys are only reported; the value from *from_dic* wins.
    """
    common = set(to_dic) & set(from_dic)
    if common:
        sys.stderr.write("merge_dic: collision: %s\n" % (sorted(common),))

    to_dic.update(from_dic)


def merge_props(to_props, from_props):
    """Append *from_props* to the list *to_props*, reporting duplicates on stderr."""
    common = set(to_props) & set(from_props)
    if common:
        sys.stderr.write("merge_props: collision: %s\n" % (sorted(common),))

    to_props.extend(from_props)


def add_range_into_dic(dic, name, start, end):
    """Append the range (start, end) to the list stored under *name*."""
    dic.setdefault(name, []).append((start, end))


def list_sub(a, b):
    """Return the elements of *a* that are not in *b* (as a list, unordered)."""
    return list(set(a) - set(b))


def parse_unicode_data_file(f):
    """Parse an open UnicodeData.txt-style file.

    Returns ``(dic, assigned)``: *dic* maps the third field of each line
    (and, for two-letter values, also its first letter) to a normalized
    range list; *assigned* lists every (start, end) range seen.
    Lines whose name field matches "<..., First>" / "<..., Last>" are
    combined into one range.
    """
    dic = { }
    assigned = []
    for line in f:
        s = line.strip()
        if not s or s[0] == '#':
            continue

        a = s.split(';')
        code = int(a[0], 16)
        desc = a[1]
        prop = a[2]
        if UD_FIRST_REG.match(desc) is not None:
            # Range opener: remember the start, wait for the "Last" line.
            start = code
            end   = None
        elif UD_LAST_REG.match(desc) is not None:
            end = code
        else:
            start = end = code

        if end is not None:
            assigned.append((start, end))
            add_range_into_dic(dic, prop, start, end)
            if len(prop) == 2:
                # Two-letter category: also file it under its major letter.
                add_range_into_dic(dic, prop[0:1], start, end)

    normalize_ranges_in_dic(dic)
    return dic, assigned


def parse_properties(path, klass):
    """Parse a "range ; Property" file such as Scripts.txt.

    Returns ``(dic, props)``: *dic* maps property name to a normalized range
    list, *props* lists the property names in the order their
    "# Total code points:" trailer was met.  Each such property is recorded
    in KDIC with the description *klass*.  Comment lines are also used to
    detect the Unicode version while VERSION_INFO is still unset.
    """
    with open(path, 'r') as f:
        dic = { }
        prop = None
        props = []
        for line in f:
            s = line.strip()
            if not s:
                continue

            if s[0] == '#' and VERSION_INFO is None:
                check_version_info(s)

            m = PR_LINE_REG.match(s)
            if m:
                prop = m.group(3)
                start = int(m.group(1), 16)
                # A missing "..YYYY" part means a single code point.
                end = int(m.group(2), 16) if m.group(2) else start
                add_range_into_dic(dic, prop, start, end)
            elif PR_TOTAL_REG.match(s) is not None:
                # Trailer line: the section of the current property ends.
                KDIC[prop] = klass
                props.append(prop)

    normalize_ranges_in_dic(dic)
    return (dic, props)


def parse_property_aliases(path):
    """Return {first field: second field} for "a ; b" lines where a != b."""
    a = { }
    with open(path, 'r') as f:
        for line in f:
            s = line.strip()
            if not s:
                continue

            m = PA_LINE_REG.match(s)
            if not m:
                continue

            if m.group(1) == m.group(2):
                continue

            a[m.group(1)] = m.group(2)

    return a


def parse_property_value_aliases(path):
    """Return an alias -> name mapping from the "sc" and "gc" lines of *path*.

    For "sc" lines the third field is the target and fields two and four
    are aliases; for "gc" lines the second field is the target and fields
    three and four are aliases.
    """
    a = { }
    with open(path, 'r') as f:
        for line in f:
            s = line.strip()
            if not s:
                continue

            m = PVA_LINE_REG.match(s)
            if not m:
                continue

            cat = m.group(1)
            x2  = m.group(2)
            x3  = m.group(3)
            x4  = m.group(4)
            if cat == 'sc':
                if x2 != x3:
                    a[x2] = x3
                if x4 and x4 != x3:
                    a[x4] = x3
            else:
                if x2 != x3:
                    a[x3] = x2
                if x4 and x4 != x2:
                    a[x4] = x2

    return a


def parse_blocks(path):
    """Parse a Blocks.txt-style file.

    Returns ``(dic, blocks)``: *dic* maps 'In_<Block>' names to range lists
    and *blocks* keeps the names in file order.  A final 'In_No_Block' entry
    covers every code point that belongs to no listed block.
    """
    dic = { }
    blocks = []
    with open(path, 'r') as f:
        for line in f:
            s = line.strip()
            if not s:
                continue

            m = BL_LINE_REG.match(s)
            if not m:
                continue

            start = int(m.group(1), 16)
            end   = int(m.group(2), 16)
            block = fix_block_name(m.group(3))
            add_range_into_dic(dic, block, start, end)
            blocks.append(block)

    noblock = fix_block_name('No_Block')
    dic[noblock] = inverse_ranges(add_ranges_in_dic(dic))
    blocks.append(noblock)
    return dic, blocks


def add_primitive_props(assigned):
    """Add Assigned, Any, ASCII, NEWLINE, Cn and LC to DIC and extend C by Cn."""
    DIC['Assigned'] = normalize_ranges(assigned)
    DIC['Any']     = [(0x000000, 0x10ffff)]
    DIC['ASCII']   = [(0x000000, 0x00007f)]
    DIC['NEWLINE'] = [(0x00000a, 0x00000a)]
    DIC['Cn'] = inverse_ranges(DIC['Assigned'])
    DIC['C'].extend(DIC['Cn'])
    DIC['C'] = normalize_ranges(DIC['C'], True)

    d = []
    d.extend(DIC['Ll'])
    d.extend(DIC['Lt'])
    d.extend(DIC['Lu'])
    DIC['LC'] = normalize_ranges(d, True)


def add_posix_props(dic):
    """Derive the POSIX bracket-class properties from entries already in *dic*."""
    alnum = []
    alnum.extend(dic['Alphabetic'])
    alnum.extend(dic['Nd'])  # Nd == Decimal_Number
    alnum = normalize_ranges(alnum, True)

    blank = [(0x0009, 0x0009)]
    blank.extend(dic['Zs'])  # Zs == Space_Separator
    blank = normalize_ranges(blank, True)

    word = []
    word.extend(dic['Alphabetic'])
    word.extend(dic['M'])   # M == Mark
    word.extend(dic['Nd'])
    word.extend(dic['Pc'])  # Pc == Connector_Punctuation
    word = normalize_ranges(word, True)

    graph = sub_ranges(dic['Any'], dic['White_Space'])
    graph = sub_ranges(graph, dic['Cc'])
    graph = sub_ranges(graph, dic['Cs'])  # Cs == Surrogate
    graph = sub_ranges(graph, dic['Cn'])  # Cn == Unassigned
    graph = normalize_ranges(graph, True)

    p = []
    p.extend(graph)
    p.extend(dic['Zs'])
    p = normalize_ranges(p, True)

    dic['Alpha']  = dic['Alphabetic']
    dic['Upper']  = dic['Uppercase']
    dic['Lower']  = dic['Lowercase']
    dic['Punct']  = dic['P']  # P == Punctuation
    dic['Digit']  = dic['Nd']
    dic['XDigit'] = [(0x0030, 0x0039), (0x0041, 0x0046), (0x0061, 0x0066)]
    dic['Alnum']  = alnum
    dic['Space']  = dic['White_Space']
    dic['Blank']  = blank
    dic['Cntrl']  = dic['Cc']
    dic['Word']   = word
    dic['Graph']  = graph
    dic['Print']  = p


def set_max_prop_name(name):
    """Raise the global PROPERTY_NAME_MAX_LEN to len(name) if that is larger."""
    global PROPERTY_NAME_MAX_LEN
    n = len(name)
    if n > PROPERTY_NAME_MAX_LEN:
        PROPERTY_NAME_MAX_LEN = n


LIST_COUNTER = 1


def entry_prop_name(name, index):
    """Track the name length and list non-POSIX names in UNICODE_PROPERTIES."""
    global LIST_COUNTER
    set_max_prop_name(name)
    if OUTPUT_LIST and index >= len(POSIX_LIST):
        UPF.write("%3d: %s\n" % (LIST_COUNTER, name))
        LIST_COUNTER += 1


### main ###
argv = sys.argv
argc = len(argv)

POSIX_ONLY = False
if argc >= 2:
    if argv[1] == '-posix':
        POSIX_ONLY = True

OUTPUT_LIST = not POSIX_ONLY

with open('UnicodeData.txt', 'r') as f:
    dic, assigned = parse_unicode_data_file(f)
    DIC = dic
    add_primitive_props(assigned)

PROPS = DIC.keys()
PROPS = list_sub(PROPS, POSIX_LIST)
PROPS = sorted(PROPS)

dic, props = parse_properties('DerivedCoreProperties.txt', 'Derived Property')
merge_dic(DIC, dic)
merge_props(PROPS, props)

dic, props = parse_properties('Scripts.txt', 'Script')
merge_dic(DIC, dic)
merge_props(PROPS, props)
# 'Unknown' script: everything no script listed in Scripts.txt covers.
DIC['Unknown'] = inverse_ranges(add_ranges_in_dic(dic))

dic, props = parse_properties('PropList.txt', 'Binary Property')
merge_dic(DIC, dic)
merge_props(PROPS, props)
PROPS.append('Unknown')
KDIC['Unknown'] = 'Script'

ALIASES = parse_property_aliases('PropertyAliases.txt')
a = parse_property_value_aliases('PropertyValueAliases.txt')
merge_dic(ALIASES, a)

dic, BLOCKS = parse_blocks('Blocks.txt')
merge_dic(DIC, dic)

add_posix_props(DIC)

s = '''%{
/* Generated by make_unicode_property_data.py. */
'''
print(s)
for prop in POSIX_LIST:
    print_property(prop, DIC[prop], "POSIX [[:%s:]]" % prop)

print('')

if not POSIX_ONLY:
    for prop in PROPS:
        klass = KDIC.get(prop, None)
        if klass is None:
            # Not taken from a property file: classify by name length.
            n = len(prop)
            if n == 1:
                klass = 'Major Category'
            elif n == 2:
                klass = 'General Category'
            else:
                klass = '-'

        print_property(prop, DIC[prop], klass)

    for block in BLOCKS:
        print_property(block, DIC[block], 'Block')


print('')
print("static const OnigCodePoint*\nconst CodeRanges[] = {")

for prop in POSIX_LIST:
    print("  CR_%s," % prop)

if not POSIX_ONLY:
    for prop in PROPS:
        print("  CR_%s," % prop)

    for prop in BLOCKS:
        print("  CR_%s," % prop)

# The member declaration ends in ';' (the original emitted 'char* name:',
# which is not a valid C declaration).
s = '''};
%}
struct PropertyNameCtype {
  char* name;
  int ctype;
};
%%
'''
sys.stdout.write(s)

if OUTPUT_LIST:
    UPF = open("UNICODE_PROPERTIES", "w")
    if VERSION_INFO is not None:
        UPF.write("Unicode Properties (from Unicode Version: %s)\n" % VERSION_INFO)
        UPF.write("\n")

# 'index' is the position of the property in CodeRanges[]; it must run in
# the same order in which the CR_* entries were printed above.
index = -1
for prop in POSIX_LIST:
    index += 1
    entry_prop_name(prop, index)
    prop = normalize_prop_name(prop)
    print_prop_and_index(prop, index)

if not POSIX_ONLY:
    for prop in PROPS:
        index += 1
        entry_prop_name(prop, index)
        prop = normalize_prop_name(prop)
        print_prop_and_index(prop, index)

    NALIASES = sorted((normalize_prop_name(k), k, v) for k, v in ALIASES.items())
    for (nk, k, v) in NALIASES:
        nv = normalize_prop_name(v)
        if PropIndex.get(nk, None) is not None:
            sys.stderr.write("ALIASES: already exists: %s => %s\n" % (k, v))
            continue
        # Use a separate variable: an alias shares the index of its target
        # and must not disturb the running 'index' needed for the blocks
        # below and for CODE_RANGES_NUM.
        alias_index = PropIndex.get(nv, None)
        if alias_index is None:
            # The alias target is not an emitted property: skip it silently.
            continue

        entry_prop_name(k, alias_index)
        print_prop_and_index(nk, alias_index)

    for name in BLOCKS:
        index += 1
        entry_prop_name(name, index)
        name = normalize_prop_name(name)
        print_prop_and_index(name, index)

print('%%')
print('')
if VERSION_INFO is not None:
    print("#define PROPERTY_VERSION  %s" % re.sub(r'[\.-]', '_', VERSION_INFO))
    print('')

print("#define PROPERTY_NAME_MAX_SIZE  %d" % (PROPERTY_NAME_MAX_LEN + 10))
print("#define CODE_RANGES_NUM         %d" % (index + 1))

if OUTPUT_LIST:
    UPF.close()

sys.exit(0)
