# (C) Copyright 2005-2021 Enthought, Inc., Austin, TX
# All rights reserved.
#
# This software is provided without warranty under the terms of the BSD
# license included in LICENSE.txt and may be redistributed only under
# the conditions described in the aforementioned license. The license
# is also available online at http://www.enthought.com/licenses/BSD.txt
#
# Thanks for using Enthought open source!
import os
import re
_DATA_FILE = """\
# ++++++++++++++++++++++++++++++++++++++++++++
# Do Not Edit This File. It was autogenerated.
# ++++++++++++++++++++++++++++++++++++++++++++
# You can regenerate this file by running `kiva.fonttools.text._parse_scripts`
SCRIPTS = [
{scripts}
]
ENTRIES = [
{entries}
]
"""

_DATA_URL = "http://www.unicode.org/Public/UNIDATA/Scripts.txt"


def _parse_lines(lines):
    """ Return a list of parsed entries for the lines in Scripts.txt
    """
    # Regex for parsing lines
    _entry_regex = re.compile(
        r"([0-9A-F]+)"             # First hex number
        r"(?:[.]{2}([0-9A-F]+))?"  # Second hex number [Maybe]
        r"\W+"                     # Anything not a word
        r"(\w+)"                   # A word
        r"(?: # )"                 # The exact string: " # "
        r"(\w{2})"                 # A two letter category code
        # The rest of the line is ignored
    )
    # Example
    # 0000..001F ; Common # Cc [32] <control-0000>..<control-001F>
    # 0020 ; Common # Zs SPACE
    entries = []
    for line in lines:
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        match = _entry_regex.match(line)
        if match is not None:
            start, end, lang, category = match.groups()
            if end is None:
                end = start
            entries.append((int(start, 16), int(end, 16), lang, category))
    # Sort by starting codepoint
    entries.sort()
    return entries
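
# Illustrative example (not from the upstream source): the two sample lines
# quoted in the comment above parse into (start, end, script, category)
# tuples, returned sorted by starting codepoint:
#
#     >>> _parse_lines([
#     ...     "0020 ; Common # Zs SPACE",
#     ...     "0000..001F ; Common # Cc [32] <control-0000>..<control-001F>",
#     ... ])
#     [(0, 31, 'Common', 'Cc'), (32, 32, 'Common', 'Zs')]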


def _parse_scripts_txt(lines):
    """ Parse the content of http://www.unicode.org/Public/UNIDATA/Scripts.txt

    This file provides an up-to-date mapping of Unicode codepoint ranges to
    the scripts they cover.
    """
    entries = _parse_lines(lines)
    output = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "_data.py")
    )
    scripts = ",\n".join(
        f"    {repr(ln)}" for ln in sorted({e[2] for e in entries})
    )
    entries = ",\n".join(
        f"    ({hex(e[0])}, {hex(e[1])}, {repr(e[2])}, {repr(e[3])})"
        for e in entries
    )
    with open(output, "w") as fp:
        fp.write(_DATA_FILE.format(entries=entries, scripts=scripts))
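

# Illustrative only: the table written above is presumably consumed elsewhere
# in kiva.fonttools, but because ENTRIES is sorted by starting codepoint, a
# bisect over the range starts is enough to look up the script covering a
# single character. The helper below is a sketch of that idea; it assumes the
# generated ``_data.py`` sits next to this file (which is where
# _parse_scripts_txt writes it) and is importable as a sibling module.
def _example_script_for(char):
    """ Return the script name covering ``char``, or None if no range matches.
    """
    import bisect

    from ._data import ENTRIES  # written by _parse_scripts_txt above

    # Last entry whose starting codepoint is <= ord(char)
    starts = [start for start, _, _, _ in ENTRIES]
    idx = bisect.bisect_right(starts, ord(char)) - 1
    if idx < 0:
        return None
    start, end, script, _ = ENTRIES[idx]
    return script if start <= ord(char) <= end else None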


if __name__ == '__main__':
    import sys
    from urllib.request import urlopen

    response = urlopen(_DATA_URL)
    if response.code != 200:
        print("Failed with HTTP code", response.code)
        sys.exit(1)

    lines = response.read().decode("utf-8").split("\n")
    _parse_scripts_txt(lines)
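
# Running this file directly downloads Scripts.txt from unicode.org and
# rewrites _data.py next to this module; it takes no command line arguments.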