Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Debian packages RPM packages NuGet packages

Repository URL to install this package:

Details    
enable / fonttools / text / _parse_scripts.py
Size: Mime:
# (C) Copyright 2005-2021 Enthought, Inc., Austin, TX
# All rights reserved.
#
# This software is provided without warranty under the terms of the BSD
# license included in LICENSE.txt and may be redistributed only under
# the conditions described in the aforementioned license. The license
# is also available online at http://www.enthought.com/licenses/BSD.txt
#
# Thanks for using Enthought open source!
import os
import re

# Template for the generated data module. The ``{scripts}`` and ``{entries}``
# placeholders are filled in with ``str.format`` by ``_parse_scripts_txt``;
# everything else is written to the output file verbatim.
_DATA_FILE = """\
# ++++++++++++++++++++++++++++++++++++++++++++
# Do Not Edit This File. It was autogenerated.
# ++++++++++++++++++++++++++++++++++++++++++++
# You can regenerate this file by running `kiva.fonttools.text._parse_scripts`

SCRIPTS = [
{scripts}
]

ENTRIES = [
{entries}
]
"""
# URL of the Scripts.txt file which is downloaded and parsed when this module
# is run as a script.
_DATA_URL = "http://www.unicode.org/Public/UNIDATA/Scripts.txt"


def _parse_lines(lines):
    """ Return a list of parsed entries for the lines in Scripts.txt
    """
    # Regex for parsing lines
    _entry_regex = re.compile(
        r"([0-9A-F]+)"  # First hex number
        r"(?:[.]{2}([0-9A-F]+))?"  # Second hex number [Maybe]
        r"\W+"  # Anything not a word
        r"(\w+)"  # A word
        r"(?: # )"  # The exact string: " # "
        r"(\w{2})"  # A two letter category code
        # The rest of the line is ignored
    )
    # Example
    # 0000..001F    ; Common # Cc  [32] <control-0000>..<control-001F>
    # 0020          ; Common # Zs       SPACE

    entries = []
    for line in lines:
        line = line.strip()
        if not line or line.startswith("#"):
            continue

        match = _entry_regex.match(line)
        if match is not None:
            start, end, lang, category = match.groups()
            if end is None:
                end = start
            entries.append((int(start, 16), int(end, 16), lang, category))

    # Sort by starting codepoint
    entries.sort()
    return entries


def _parse_scripts_txt(lines):
    """ Generate ``_data.py`` from the lines of the Unicode Scripts.txt file

    The expected input is the content of
    http://www.unicode.org/Public/UNIDATA/Scripts.txt, which maps ranges of
    Unicode codepoints to the scripts they belong to. The result is written
    to ``_data.py`` in the directory containing this module.
    """
    records = _parse_lines(lines)

    here = os.path.dirname(__file__)
    target = os.path.abspath(os.path.join(here, "_data.py"))

    # One row per distinct script name, in sorted order
    script_names = sorted({record[2] for record in records})
    scripts_block = ",\n".join(f"    {name!r}" for name in script_names)

    # One row per parsed entry, in the order returned by _parse_lines
    entry_rows = [
        f"    ({hex(first)}, {hex(last)}, {script!r}, {category!r})"
        for first, last, script, category in records
    ]
    entries_block = ",\n".join(entry_rows)

    with open(target, "w") as fp:
        fp.write(
            _DATA_FILE.format(entries=entries_block, scripts=scripts_block)
        )


if __name__ == '__main__':
    # Download Scripts.txt from ``_DATA_URL`` and regenerate the data module.
    import sys
    from urllib.request import urlopen

    # The context manager closes the connection once the body has been read.
    with urlopen(_DATA_URL) as response:
        if response.code != 200:
            # Passing a message to sys.exit prints it to stderr and exits
            # with status 1, so a failed download is not reported as success.
            sys.exit(f"Failed with HTTP code {response.code}")
        payload = response.read()

    lines = payload.decode("utf-8").split("\n")
    _parse_scripts_txt(lines)