# /usr/share/doc/python3.8/examples/unicode/makeunicodedata.py

#
# (re)generate unicode property and type databases
#
# This script converts Unicode database files to Modules/unicodedata_db.h,
# Modules/unicodename_db.h, and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01    benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import os
import sys
import zipfile

from textwrap import dedent
from functools import partial

SCRIPT = sys.argv[0]
VERSION = "3.3"

# The Unicode Database
# --------------------
# When changing UCD version please update
#   * Doc/library/stdtypes.rst,
#   * Doc/library/unicodedata.rst, and
#   * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "12.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in the BMP and in planes 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
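# (the CJK Unified Ideographs block and Extensions A-F, as of Unicode 12.1)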
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FEF'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
    ('2CEB0', '2EBE0'),
]


def maketables(trace=0):
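    """Regenerate the unicodedata tables from the UCD source files.

    Reads the current Unicode database, merges in the old versions
    listed in old_versions, and writes the name, data and type tables.
    """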

    print("--- Reading", UNICODE_DATA % "", "...")

    unicode = UnicodeData(UNIDATA_VERSION)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)


# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):
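    """Write Modules/unicodedata_db.h: the deduplicated per-character
    property records, the decomposition data and the NFC composition
    tables, plus the index tables used to look them up."""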

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
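                # the first word packs the prefix index in the low byte and
                # the decomposition length in the high byte; the code points
                # of the decomposition follow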
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
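                # only canonical (unprefixed) two-character decompositions
                # take part in composition, and only if the character is not
                # listed in CompositionExclusions.txt and the first character
                # of its decomposition has combining class 0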
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

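    # the composition table is a total_first x total_last matrix, flattened
    # row by row, mapping (first, last) index pairs to the composed character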
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
        fprint("/* a list of unique database records */")
        fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* Reindexing of NFC first characters. */")
        fprint("#define TOTAL_FIRST",total_first)
        fprint("#define TOTAL_LAST",total_last)
        fprint("struct reindex{int start;short count,index;};")
        fprint("static struct reindex nfc_first[] = {")
        for start,end in comp_first_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_first[start]))
        fprint("    {0,0,0}")
        fprint("};\n")
        fprint("static struct reindex nfc_last[] = {")
        for start,end in comp_last_ranges:
            fprint("  { %d, %d, %d}," % (start,end-start,comp_last[start]))
        fprint("  {0,0,0}")
        fprint("};\n")

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        fprint("/* string literals */")
        fprint("const char *_PyUnicode_CategoryNames[] = {")
        for name in CATEGORY_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_BidirectionalNames[] = {")
        for name in BIDIRECTIONAL_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
        for name in EASTASIANWIDTH_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("static const char *decomp_prefix[] = {")
        for name in decomp_prefix:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        # split record index table
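        # splitbins() splits a flat lookup table t into two smaller tables
        # t1, t2 and a shift such that
        #   t[i] == t2[(t1[i >> shift] << shift) + (i & ((1 << shift) - 1))],
        # keeping the generated C tables compact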
        index1, index2, shift = splitbins(index, trace)

        fprint("/* index tables for the database records */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        fprint("/* decomposition data */")
        Array("decomp_data", decomp_data).dump(fp, trace)

        fprint("/* index tables for the decomposition data */")
        fprint("#define DECOMP_SHIFT", shift)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        index, index2, shift = splitbins(comp_data, trace)
        fprint("/* NFC pairs */")
        fprint("#define COMP_SHIFT", shift)
        Array("comp_index", index).dump(fp, trace)
        Array("comp_data", index2).dump(fp, trace)

        # Generate delta tables for old versions
        for version, table, normalization in unicode.changed: