Gemfury

duality-group / Babel python

Repository URL to install this package:
Details
Babel / scripts / import_cldr.py
#!/usr/bin/env python
#
# Copyright (C) 2007-2011 Edgewall Software, 2013-2022 the Babel team
# All rights reserved.
#
# This software is licensed as described in the file LICENSE, which
# you should have received as part of this distribution. The terms
# are also available at http://babel.edgewall.org/wiki/License.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://babel.edgewall.org/log/.

import collections
from optparse import OptionParser
import os
import pickle
import re
import sys
import logging

try:
    from xml.etree import cElementTree as ElementTree
except ImportError:
    from xml.etree import ElementTree

# Make sure we're using Babel source, and not some previously installed version
CHECKOUT_ROOT = os.path.abspath(os.path.join(
    os.path.dirname(__file__),
    '..'
))
BABEL_PACKAGE_ROOT = os.path.join(CHECKOUT_ROOT, "babel")
sys.path.insert(0, CHECKOUT_ROOT)

from babel import dates, numbers
from babel.dates import split_interval_pattern
from babel.localedata import Alias
from babel.plural import PluralRule

parse = ElementTree.parse
weekdays = {'mon': 0, 'tue': 1, 'wed': 2, 'thu': 3, 'fri': 4, 'sat': 5,
            'sun': 6}


def _text(elem):
    buf = [elem.text or '']
    for child in elem:
        buf.append(_text(child))
    buf.append(elem.tail or '')
    return u''.join(filter(None, buf)).strip()


NAME_RE = re.compile(r"^\w+$")
TYPE_ATTR_RE = re.compile(r"^\w+\[@type='(.*?)'\]$")

NAME_MAP = {
    'dateFormats': 'date_formats',
    'dateTimeFormats': 'datetime_formats',
    'eraAbbr': 'abbreviated',
    'eraNames': 'wide',
    'eraNarrow': 'narrow',
    'timeFormats': 'time_formats'
}

log = logging.getLogger("import_cldr")


def need_conversion(dst_filename, data_dict, source_filename):
    with open(source_filename, 'rb') as f:
        blob = f.read(4096)
        version_match = re.search(b'version number="\\$Revision: (\\d+)', blob)
        if not version_match:  # CLDR 36.0 was shipped without proper revision numbers
            return True
        version = int(version_match.group(1))

    data_dict['_version'] = version
    if not os.path.isfile(dst_filename):
        return True

    with open(dst_filename, 'rb') as f:
        data = pickle.load(f)
        return data.get('_version') != version


def _translate_alias(ctxt, path):
    parts = path.split('/')
    keys = ctxt[:]
    for part in parts:
        if part == '..':
            keys.pop()
        else:
            match = TYPE_ATTR_RE.match(part)
            if match:
                keys.append(match.group(1))
            else:
                assert NAME_RE.match(part)
                keys.append(NAME_MAP.get(part, part))
    return keys


def _parse_currency_date(s):
    if not s:
        return None
    parts = s.split('-', 2)
    return tuple(map(int, parts + [1] * (3 - len(parts))))


def _currency_sort_key(tup):
    code, start, end, tender = tup
    return int(not tender), start or (1, 1, 1)


def _extract_plural_rules(file_path):
    rule_dict = {}
    prsup = parse(file_path)
    for elem in prsup.findall('.//plurals/pluralRules'):
        rules = []
        for rule in elem.findall('pluralRule'):
            rules.append((rule.attrib['count'], str(rule.text)))
        pr = PluralRule(rules)
        for locale in elem.attrib['locales'].split():
            rule_dict[locale] = pr
    return rule_dict


def _time_to_seconds_past_midnight(time_expr):
    """
    Parse a time expression to seconds after midnight.
    :param time_expr: Time expression string (H:M or H:M:S)
    :rtype: int
    """
    if time_expr is None:
        return None
    if time_expr.count(":") == 1:
        time_expr += ":00"
    hour, minute, second = [int(p, 10) for p in time_expr.split(":")]
    return hour * 60 * 60 + minute * 60 + second


def _compact_dict(dict):
    """
    "Compact" the given dict by removing items whose value is None or False.
    """
    out_dict = {}
    for key, value in dict.items():
        if value is not None and value is not False:
            out_dict[key] = value
    return out_dict


def debug_repr(obj):
    if isinstance(obj, PluralRule):
        return obj.abstract
    return repr(obj)


def write_datafile(path, data, dump_json=False):
    with open(path, 'wb') as outfile:
        pickle.dump(data, outfile, 2)
    if dump_json:
        import json
        with open(path + '.json', 'w') as outfile:
            json.dump(data, outfile, indent=4, default=debug_repr)


def main():
    parser = OptionParser(usage='%prog path/to/cldr')
    parser.add_option(
        '-f', '--force', dest='force', action='store_true', default=False,
        help='force import even if destination file seems up to date'
    )
    parser.add_option(
        '-j', '--json', dest='dump_json', action='store_true', default=False,
        help='also export debugging JSON dumps of locale data'
    )
    parser.add_option(
        '-q', '--quiet', dest='quiet', action='store_true', default=bool(os.environ.get('BABEL_CLDR_QUIET')),
        help='quiesce info/warning messages',
    )

    options, args = parser.parse_args()
    if len(args) != 1:
        parser.error('incorrect number of arguments')

    logging.basicConfig(
        level=(logging.ERROR if options.quiet else logging.INFO),
    )

    return process_data(
        srcdir=args[0],
        destdir=BABEL_PACKAGE_ROOT,
        force=bool(options.force),
        dump_json=bool(options.dump_json)
    )


def process_data(srcdir, destdir, force=False, dump_json=False):
    sup_filename = os.path.join(srcdir, 'supplemental', 'supplementalData.xml')
    sup = parse(sup_filename)

    # Import global data from the supplemental files
    global_path = os.path.join(destdir, 'global.dat')
    global_data = {}
    if force or need_conversion(global_path, global_data, sup_filename):
        global_data.update(parse_global(srcdir, sup))
        write_datafile(global_path, global_data, dump_json=dump_json)
    _process_local_datas(sup, srcdir, destdir, force=force, dump_json=dump_json)


def parse_global(srcdir, sup):
    global_data = {}
    sup_dir = os.path.join(srcdir, 'supplemental')
    territory_zones = global_data.setdefault('territory_zones', {})
    zone_aliases = global_data.setdefault('zone_aliases', {})
    zone_territories = global_data.setdefault('zone_territories', {})
    win_mapping = global_data.setdefault('windows_zone_mapping', {})
    language_aliases = global_data.setdefault('language_aliases', {})
    territory_aliases = global_data.setdefault('territory_aliases', {})
    script_aliases = global_data.setdefault('script_aliases', {})
    variant_aliases = global_data.setdefault('variant_aliases', {})
    likely_subtags = global_data.setdefault('likely_subtags', {})
    territory_currencies = global_data.setdefault('territory_currencies', {})
    parent_exceptions = global_data.setdefault('parent_exceptions', {})
    all_currencies = collections.defaultdict(set)
    currency_fractions = global_data.setdefault('currency_fractions', {})
    territory_languages = global_data.setdefault('territory_languages', {})
    bcp47_timezone = parse(os.path.join(srcdir, 'bcp47', 'timezone.xml'))
    sup_windows_zones = parse(os.path.join(sup_dir, 'windowsZones.xml'))
    sup_metadata = parse(os.path.join(sup_dir, 'supplementalMetadata.xml'))
    sup_likely = parse(os.path.join(sup_dir, 'likelySubtags.xml'))
    # create auxiliary zone->territory map from the windows zones (we don't set
    # the 'zones_territories' map directly here, because there are some zones
    # aliases listed and we defer the decision of which ones to choose to the
    # 'bcp47' data
    _zone_territory_map = {}
    for map_zone in sup_windows_zones.findall('.//windowsZones/mapTimezones/mapZone'):
        if map_zone.attrib.get('territory') == '001':
            win_mapping[map_zone.attrib['other']] = map_zone.attrib['type'].split()[0]
        for tzid in str(map_zone.attrib['type']).split():
            _zone_territory_map[tzid] = str(map_zone.attrib['territory'])
    for key_elem in bcp47_timezone.findall('.//keyword/key'):
        if key_elem.attrib['name'] == 'tz':
            for elem in key_elem.findall('type'):
                if 'deprecated' not in elem.attrib:
                    aliases = str(elem.attrib['alias']).split()
                    tzid = aliases.pop(0)
                    territory = _zone_territory_map.get(tzid, '001')
                    territory_zones.setdefault(territory, []).append(tzid)
                    zone_territories[tzid] = territory
                    for alias in aliases:
                        zone_aliases[alias] = tzid
            break

    # Import Metazone mapping
    meta_zones = global_data.setdefault('meta_zones', {})
    tzsup = parse(os.path.join(srcdir, 'supplemental', 'metaZones.xml'))
    for elem in tzsup.findall('.//timezone'):
        for child in elem.findall('usesMetazone'):
            if 'to' not in child.attrib:  # FIXME: support old mappings
                meta_zones[elem.attrib['type']] = child.attrib['mzone']

    # Language aliases
    for alias in sup_metadata.findall('.//alias/languageAlias'):
        # We don't have a use for those at the moment.  They don't
        # pass our parser anyways.
        if '_' in alias.attrib['type']:
            continue
        language_aliases[alias.attrib['type']] = alias.attrib['replacement']

    # Territory aliases
    for alias in sup_metadata.findall('.//alias/territoryAlias'):
        territory_aliases[alias.attrib['type']] = alias.attrib['replacement'].split()

    # Script aliases
    for alias in sup_metadata.findall('.//alias/scriptAlias'):
        script_aliases[alias.attrib['type']] = alias.attrib['replacement']

    # Variant aliases
    for alias in sup_metadata.findall('.//alias/variantAlias'):
        repl = alias.attrib.get('replacement')
        if repl:
            variant_aliases[alias.attrib['type']] = repl

    # Likely subtags
    for likely_subtag in sup_likely.findall('.//likelySubtags/likelySubtag'):
        likely_subtags[likely_subtag.attrib['from']] = likely_subtag.attrib['to']

    # Currencies in territories
    for region in sup.findall('.//currencyData/region'):
        region_code = region.attrib['iso3166']
        region_currencies = []
        for currency in region.findall('./currency'):
            cur_code = currency.attrib['iso4217']
            cur_start = _parse_currency_date(currency.attrib.get('from'))
            cur_end = _parse_currency_date(currency.attrib.get('to'))
            cur_tender = currency.attrib.get('tender', 'true') == 'true'
            # Tie region to currency.
            region_currencies.append((cur_code, cur_start, cur_end, cur_tender))
            # Keep a reverse index of currencies to territorie.
            all_currencies[cur_code].add(region_code)
        region_currencies.sort(key=_currency_sort_key)
        territory_currencies[region_code] = region_currencies
    global_data['all_currencies'] = {
        currency: tuple(sorted(regions)) for currency, regions in all_currencies.items()}

    # Explicit parent locales
    for paternity in sup.findall('.//parentLocales/parentLocale'):
        parent = paternity.attrib['parent']
        for child in paternity.attrib['locales'].split():
            parent_exceptions[child] = parent

    # Currency decimal and rounding digits
    for fraction in sup.findall('.//currencyData/fractions/info'):
        cur_code = fraction.attrib['iso4217']
        cur_digits = int(fraction.attrib['digits'])
        cur_rounding = int(fraction.attrib['rounding'])
        cur_cdigits = int(fraction.attrib.get('cashDigits', cur_digits))
        cur_crounding = int(fraction.attrib.get('cashRounding', cur_rounding))
        currency_fractions[cur_code] = (cur_digits, cur_rounding, cur_cdigits, cur_crounding)

    # Languages in territories
    for territory in sup.findall('.//territoryInfo/territory'):
        languages = {}
        for language in territory.findall('./languagePopulation'):
            languages[language.attrib['type']] = {
                'population_percent': float(language.attrib['populationPercent']),
                'official_status': language.attrib.get('officialStatus'),
            }
        territory_languages[territory.attrib['type']] = languages
    return global_data


def _process_local_datas(sup, srcdir, destdir, force=False, dump_json=False):
    day_period_rules = parse_day_period_rules(parse(os.path.join(srcdir, 'supplemental', 'dayPeriods.xml')))
    # build a territory containment mapping for inheritance
    regions = {}
    for elem in sup.findall('.//territoryContainment/group'):
        regions[elem.attrib['type']] = elem.attrib['contains'].split()

    # Resolve territory containment
    territory_containment = {}
    region_items = sorted(regions.items())
    for group, territory_list in region_items:
        for territory in territory_list:
            containers = territory_containment.setdefault(territory, set())
            if group in territory_containment:
                containers |= territory_containment[group]
            containers.add(group)

    # prepare the per-locale plural rules definitions
    plural_rules = _extract_plural_rules(os.path.join(srcdir, 'supplemental', 'plurals.xml'))
    ordinal_rules = _extract_plural_rules(os.path.join(srcdir, 'supplemental', 'ordinals.xml'))

    filenames = os.listdir(os.path.join(srcdir, 'main'))
    filenames.remove('root.xml')
    filenames.sort(key=len)
    filenames.insert(0, 'root.xml')

    for filename in filenames:
        stem, ext = os.path.splitext(filename)
        if ext != '.xml':
            continue

        full_filename = os.path.join(srcdir, 'main', filename)
        data_filename = os.path.join(destdir, 'locale-data', stem + '.dat')

        data = {}
        if not (force or need_conversion(data_filename, data, full_filename)):
            continue

        tree = parse(full_filename)

        language = None
        elem = tree.find('.//identity/language')
        if elem is not None:
            language = elem.attrib['type']

        territory = None
        elem = tree.find('.//identity/territory')
        if elem is not None:
            territory = elem.attrib['type']
        else:
            territory = '001'  # world
        regions = territory_containment.get(territory, [])

        log.info(
            'Processing %s (Language = %s; Territory = %s)',
            filename, language, territory,
        )

        locale_id = '_'.join(filter(None, [
            language,
            territory != '001' and territory or None
        ]))

        data['locale_id'] = locale_id
        data['unsupported_number_systems'] = set()

        if locale_id in plural_rules:
            data['plural_form'] = plural_rules[locale_id]
        if locale_id in ordinal_rules:
            data['ordinal_form'] = ordinal_rules[locale_id]
        if locale_id in day_period_rules:
            data["day_period_rules"] = day_period_rules[locale_id]

        parse_locale_display_names(data, tree)
        parse_list_patterns(data, tree)
        parse_dates(data, tree, sup, regions, territory)

        for calendar in tree.findall('.//calendars/calendar'):
            if calendar.attrib['type'] != 'gregorian':
                # TODO: support other calendar types
                continue

            parse_calendar_months(data, calendar)
            parse_calendar_days(data, calendar)
            parse_calendar_quarters(data, calendar)
            parse_calendar_eras(data, calendar)
            parse_calendar_periods(data, calendar)
            parse_calendar_date_formats(data, calendar)
            parse_calendar_time_formats(data, calendar)
            parse_calendar_datetime_skeletons(data, calendar)
            parse_interval_formats(data, calendar)

        parse_number_symbols(data, tree)
        parse_decimal_formats(data, tree)
        parse_scientific_formats(data, tree)
        parse_percent_formats(data, tree)

        parse_currency_formats(data, tree)
        parse_currency_unit_patterns(data, tree)
        parse_currency_names(data, tree)
        parse_unit_patterns(data, tree)
        parse_date_fields(data, tree)
        parse_character_order(data, tree)
        parse_measurement_systems(data, tree)

        unsupported_number_systems_string = ', '.join(sorted(data.pop('unsupported_number_systems')))
        if unsupported_number_systems_string:
            log.warning('%s: unsupported number systems were ignored: %s' % (
                locale_id,
                unsupported_number_systems_string,
            ))

        write_datafile(data_filename, data, dump_json=dump_json)


def _should_skip_number_elem(data, elem):
    """
    Figure out whether the numbering-containing element `elem` is in a currently
    non-supported (i.e. currently non-Latin) numbering system.

    :param data: The root data element, for stashing the warning.
    :param elem: Element with `numberSystem` key
    :return: Boolean
    """
    number_system = elem.get('numberSystem', 'latn')

    if number_system != 'latn':
        data['unsupported_number_systems'].add(number_system)
        return True

    return False


def _should_skip_elem(elem, type=None, dest=None):
    """
    Check whether the given element should be skipped.

    Elements are skipped if they are drafts or alternates of data that already exists in `dest`.

    :param elem: XML element
    :param type: Type string. May be elided if the dest dict is elided.
    :param dest: Destination dict. May be elided to skip the dict check.
    :return: skip boolean
    """
    if 'draft' in elem.attrib or 'alt' in elem.attrib:
        if dest is None or type in dest:
            return True


def _import_type_text(dest, elem, type=None):
    """
    Conditionally import the element's inner text(s) into the `dest` dict.

    The condition being, namely, that the element isn't a draft/alternate version
    of a pre-existing element.

    :param dest: Destination dict
    :param elem: XML element.
    :param type: Override type. (By default, the `type` attr of the element.)
    :return:
    """
    if type is None:
        type = elem.attrib['type']
    if _should_skip_elem(elem, type, dest):
        return
    dest[type] = _text(elem)


def parse_locale_display_names(data, tree):
    territories = data.setdefault('territories', {})
    for elem in tree.findall('.//territories/territory'):
        _import_type_text(territories, elem)
    languages = data.setdefault('languages', {})
    for elem in tree.findall('.//languages/language'):
        _import_type_text(languages, elem)
    variants = data.setdefault('variants', {})
    for elem in tree.findall('.//variants/variant'):
        _import_type_text(variants, elem)
    scripts = data.setdefault('scripts', {})
    for elem in tree.findall('.//scripts/script'):
        _import_type_text(scripts, elem)


def parse_list_patterns(data, tree):
    list_patterns = data.setdefault('list_patterns', {})
    for listType in tree.findall('.//listPatterns/listPattern'):
        by_type = list_patterns.setdefault(listType.attrib.get('type', 'standard'), {})
        for listPattern in listType.findall('listPatternPart'):
            by_type[listPattern.attrib['type']] = _text(listPattern)


def parse_dates(data, tree, sup, regions, territory):
    week_data = data.setdefault('week_data', {})
    supelem = sup.find('.//weekData')
    for elem in supelem.findall('minDays'):
        if _should_skip_elem(elem):
            continue
        territories = elem.attrib['territories'].split()
        if territory in territories or any([r in territories for r in regions]):
            week_data['min_days'] = int(elem.attrib['count'])
    for elem in supelem.findall('firstDay'):
        if _should_skip_elem(elem):
            continue
        territories = elem.attrib['territories'].split()
        if territory in territories or any([r in territories for r in regions]):
            week_data['first_day'] = weekdays[elem.attrib['day']]
    for elem in supelem.findall('weekendStart'):
        if _should_skip_elem(elem):
            continue
        territories = elem.attrib['territories'].split()
        if territory in territories or any([r in territories for r in regions]):
            week_data['weekend_start'] = weekdays[elem.attrib['day']]
    for elem in supelem.findall('weekendEnd'):
        if _should_skip_elem(elem):
            continue
        territories = elem.attrib['territories'].split()
        if territory in territories or any([r in territories for r in regions]):
            week_data['weekend_end'] = weekdays[elem.attrib['day']]
    zone_formats = data.setdefault('zone_formats', {})
    for elem in tree.findall('.//timeZoneNames/gmtFormat'):
        if not _should_skip_elem(elem):
            zone_formats['gmt'] = str(elem.text).replace('{0}', '%s')
            break
    for elem in tree.findall('.//timeZoneNames/regionFormat'):
        if not _should_skip_elem(elem):
            zone_formats['region'] = str(elem.text).replace('{0}', '%s')
            break
    for elem in tree.findall('.//timeZoneNames/fallbackFormat'):
        if not _should_skip_elem(elem):
            zone_formats['fallback'] = (
                str(elem.text).replace('{0}', '%(0)s').replace('{1}', '%(1)s')
            )
            break
    for elem in tree.findall('.//timeZoneNames/fallbackRegionFormat'):
        if not _should_skip_elem(elem):
            zone_formats['fallback_region'] = (
                str(elem.text).replace('{0}', '%(0)s').replace('{1}', '%(1)s')
            )
            break
    time_zones = data.setdefault('time_zones', {})
    for elem in tree.findall('.//timeZoneNames/zone'):
        info = {}
        city = elem.findtext('exemplarCity')
        if city:
            info['city'] = str(city)
        for child in elem.findall('long/*'):
            info.setdefault('long', {})[child.tag] = str(child.text)
        for child in elem.findall('short/*'):
            info.setdefault('short', {})[child.tag] = str(child.text)
        time_zones[elem.attrib['type']] = info
    meta_zones = data.setdefault('meta_zones', {})
    for elem in tree.findall('.//timeZoneNames/metazone'):
        info = {}
        city = elem.findtext('exemplarCity')
        if city:
            info['city'] = str(city)
        for child in elem.findall('long/*'):
            info.setdefault('long', {})[child.tag] = str(child.text)
        for child in elem.findall('short/*'):
            info.setdefault('short', {})[child.tag] = str(child.text)
        meta_zones[elem.attrib['type']] = info


def parse_calendar_months(data, calendar):
    months = data.setdefault('months', {})
    for ctxt in calendar.findall('months/monthContext'):
        ctxt_type = ctxt.attrib['type']
        ctxts = months.setdefault(ctxt_type, {})
        for width in ctxt.findall('monthWidth'):
            width_type = width.attrib['type']
            widths = ctxts.setdefault(width_type, {})
            for elem in width:
                if elem.tag == 'month':
                    _import_type_text(widths, elem, int(elem.attrib['type']))
                elif elem.tag == 'alias':
                    ctxts[width_type] = Alias(
                        _translate_alias(['months', ctxt_type, width_type],
                                         elem.attrib['path'])
                    )


def parse_calendar_days(data, calendar):
    days = data.setdefault('days', {})
    for ctxt in calendar.findall('days/dayContext'):
        ctxt_type = ctxt.attrib['type']
        ctxts = days.setdefault(ctxt_type, {})
        for width in ctxt.findall('dayWidth'):
            width_type = width.attrib['type']
            widths = ctxts.setdefault(width_type, {})
            for elem in width:
                if elem.tag == 'day':
                    _import_type_text(widths, elem, weekdays[elem.attrib['type']])
                elif elem.tag == 'alias':
                    ctxts[width_type] = Alias(
                        _translate_alias(['days', ctxt_type, width_type],
                                         elem.attrib['path'])
                    )


def parse_calendar_quarters(data, calendar):
    quarters = data.setdefault('quarters', {})
    for ctxt in calendar.findall('quarters/quarterContext'):
        ctxt_type = ctxt.attrib['type']
        ctxts = quarters.setdefault(ctxt.attrib['type'], {})
        for width in ctxt.findall('quarterWidth'):
            width_type = width.attrib['type']
            widths = ctxts.setdefault(width_type, {})
            for elem in width:
                if elem.tag == 'quarter':
                    _import_type_text(widths, elem, int(elem.attrib['type']))
                elif elem.tag == 'alias':
                    ctxts[width_type] = Alias(
                        _translate_alias(['quarters', ctxt_type,
                                          width_type],
                                         elem.attrib['path']))


def parse_calendar_eras(data, calendar):
    eras = data.setdefault('eras', {})
    for width in calendar.findall('eras/*'):
        width_type = NAME_MAP[width.tag]
        widths = eras.setdefault(width_type, {})
        for elem in width:
            if elem.tag == 'era':
                _import_type_text(widths, elem, type=int(elem.attrib.get('type')))
            elif elem.tag == 'alias':
                eras[width_type] = Alias(
                    _translate_alias(['eras', width_type],
                                     elem.attrib['path'])
                )


def parse_calendar_periods(data, calendar):
    # Day periods (AM/PM/others)
    periods = data.setdefault('day_periods', {})
    for day_period_ctx in calendar.findall('dayPeriods/dayPeriodContext'):
        ctx_type = day_period_ctx.attrib["type"]
        for day_period_width in day_period_ctx.findall('dayPeriodWidth'):
            width_type = day_period_width.attrib["type"]
            dest_dict = periods.setdefault(ctx_type, {}).setdefault(width_type, {})
            for day_period in day_period_width.findall('dayPeriod'):
                period_type = day_period.attrib['type']
                if 'alt' not in day_period.attrib:
                    dest_dict[period_type] = str(day_period.text)


def parse_calendar_date_formats(data, calendar):
    date_formats = data.setdefault('date_formats', {})
    for format in calendar.findall('dateFormats'):
        for elem in format:
            if elem.tag == 'dateFormatLength':
                type = elem.attrib.get('type')
                if _should_skip_elem(elem, type, date_formats):
                    continue
                try:
                    date_formats[type] = dates.parse_pattern(
                        str(elem.findtext('dateFormat/pattern'))
                    )
                except ValueError as e:
                    log.error(e)
            elif elem.tag == 'alias':
                date_formats = Alias(_translate_alias(
                    ['date_formats'], elem.attrib['path'])
                )


def parse_calendar_time_formats(data, calendar):
    time_formats = data.setdefault('time_formats', {})
    for format in calendar.findall('timeFormats'):
        for elem in format:
            if elem.tag == 'timeFormatLength':
                type = elem.attrib.get('type')
                if _should_skip_elem(elem, type, time_formats):
                    continue
                try:
                    time_formats[type] = dates.parse_pattern(
                        str(elem.findtext('timeFormat/pattern'))
                    )
                except ValueError as e:
                    log.error(e)
            elif elem.tag == 'alias':
                time_formats = Alias(_translate_alias(
                    ['time_formats'], elem.attrib['path'])
                )


def parse_calendar_datetime_skeletons(data, calendar):
    datetime_formats = data.setdefault('datetime_formats', {})
    datetime_skeletons = data.setdefault('datetime_skeletons', {})
    for format in calendar.findall('dateTimeFormats'):
        for elem in format:
            if elem.tag == 'dateTimeFormatLength':
                type = elem.attrib.get('type')
                if _should_skip_elem(elem, type, datetime_formats):
                    continue
                try:
                    datetime_formats[type] = str(elem.findtext('dateTimeFormat/pattern'))
                except ValueError as e:
                    log.error(e)
            elif elem.tag == 'alias':
                datetime_formats = Alias(_translate_alias(
                    ['datetime_formats'], elem.attrib['path'])
                )
            elif elem.tag == 'availableFormats':
                for datetime_skeleton in elem.findall('dateFormatItem'):
                    datetime_skeletons[datetime_skeleton.attrib['id']] = (
                        dates.parse_pattern(str(datetime_skeleton.text))
                    )


def parse_number_symbols(data, tree):
    number_symbols = data.setdefault('number_symbols', {})
    for symbol_elem in tree.findall('.//numbers/symbols'):
        if _should_skip_number_elem(data, symbol_elem):  # TODO: Support other number systems
            continue

        for elem in symbol_elem.findall('./*'):
            if _should_skip_elem(elem):
                continue
            number_symbols[elem.tag] = str(elem.text)


def parse_decimal_formats(data, tree):
    decimal_formats = data.setdefault('decimal_formats', {})
    for df_elem in tree.findall('.//decimalFormats'):
        if _should_skip_number_elem(data, df_elem):  # TODO: Support other number systems
            continue
        for elem in df_elem.findall('./decimalFormatLength'):
            length_type = elem.attrib.get('type')
            if _should_skip_elem(elem, length_type, decimal_formats):
                continue
            if elem.findall('./alias'):
                # TODO map the alias to its target
                continue
            for pattern_el in elem.findall('./decimalFormat/pattern'):
                pattern_type = pattern_el.attrib.get('type')
                pattern = numbers.parse_pattern(str(pattern_el.text))
                if pattern_type:
                    # This is a compact decimal format, see:
                    # https://www.unicode.org/reports/tr35/tr35-45/tr35-numbers.html#Compact_Number_Formats

                    # These are mapped into a `compact_decimal_formats` dictionary
                    # with the format {length: {count: {multiplier: pattern}}}.

                    # TODO: Add support for formatting them.
                    compact_decimal_formats = data.setdefault('compact_decimal_formats', {})
                    length_map = compact_decimal_formats.setdefault(length_type, {})
                    length_count_map = length_map.setdefault(pattern_el.attrib['count'], {})
                    length_count_map[pattern_type] = pattern
                else:
                    # Regular decimal format.
                    decimal_formats[length_type] = pattern


def parse_scientific_formats(data, tree):
    scientific_formats = data.setdefault('scientific_formats', {})
    for sf_elem in tree.findall('.//scientificFormats'):
        if _should_skip_number_elem(data, sf_elem):  # TODO: Support other number systems
            continue
        for elem in sf_elem.findall('./scientificFormatLength'):
            type = elem.attrib.get('type')
            if _should_skip_elem(elem, type, scientific_formats):
                continue
            pattern = str(elem.findtext('scientificFormat/pattern'))
            scientific_formats[type] = numbers.parse_pattern(pattern)


def parse_percent_formats(data, tree):
    percent_formats = data.setdefault('percent_formats', {})

    for pf_elem in tree.findall('.//percentFormats'):
        if _should_skip_number_elem(data, pf_elem):  # TODO: Support other number systems
            continue
        for elem in pf_elem.findall('.//percentFormatLength'):
            type = elem.attrib.get('type')
            if _should_skip_elem(elem, type, percent_formats):
                continue
            pattern = str(elem.findtext('percentFormat/pattern'))
            percent_formats[type] = numbers.parse_pattern(pattern)


def parse_currency_names(data, tree):
    currency_names = data.setdefault('currency_names', {})
    currency_names_plural = data.setdefault('currency_names_plural', {})
    currency_symbols = data.setdefault('currency_symbols', {})
    for elem in tree.findall('.//currencies/currency'):
        code = elem.attrib['type']
        for name in elem.findall('displayName'):
            if ('draft' in name.attrib) and code in currency_names:
                continue
            if 'count' in name.attrib:
                currency_names_plural.setdefault(code, {})[
                    name.attrib['count']] = str(name.text)
            else:
                currency_names[code] = str(name.text)
        for symbol in elem.findall('symbol'):
            if 'draft' in symbol.attrib or 'choice' in symbol.attrib:  # Skip drafts and choice-patterns
                continue
            if symbol.attrib.get('alt'):  # Skip alternate forms
                continue
            currency_symbols[code] = str(symbol.text)


def parse_unit_patterns(data, tree):
    unit_patterns = data.setdefault('unit_patterns', {})
    compound_patterns = data.setdefault('compound_unit_patterns', {})
    unit_display_names = data.setdefault('unit_display_names', {})

    for elem in tree.findall('.//units/unitLength'):
        unit_length_type = elem.attrib['type']
        for unit in elem.findall('unit'):
            unit_type = unit.attrib['type']
            unit_and_length_patterns = unit_patterns.setdefault(unit_type, {}).setdefault(unit_length_type, {})
            for pattern in unit.findall('unitPattern'):
                if pattern.attrib.get('case', 'nominative') != 'nominative':
                    # Skip non-nominative cases.
                    continue
                unit_and_length_patterns[pattern.attrib['count']] = _text(pattern)

            per_unit_pat = unit.find('perUnitPattern')
            if per_unit_pat is not None:
                unit_and_length_patterns['per'] = _text(per_unit_pat)

            display_name = unit.find('displayName')
            if display_name is not None:
                unit_display_names.setdefault(unit_type, {})[unit_length_type] = _text(display_name)

        for unit in elem.findall('compoundUnit'):
            unit_type = unit.attrib['type']
            compound_unit_info = {}
            compound_variations = {}
            for child in unit:
                if child.attrib.get('case', 'nominative') != 'nominative':
                    # Skip non-nominative cases.
                    continue
                if child.tag == "unitPrefixPattern":
                    compound_unit_info['prefix'] = _text(child)
                elif child.tag == "compoundUnitPattern":
                    compound_variations[None] = _text(child)
                elif child.tag == "compoundUnitPattern1":
                    compound_variations[child.attrib.get('count')] = _text(child)
            if compound_variations:
                compound_variation_values = set(compound_variations.values())
                if len(compound_variation_values) == 1:
                    # shortcut: if all compound variations are the same, only store one
                    compound_unit_info['compound'] = next(iter(compound_variation_values))
                else:
                    compound_unit_info['compound_variations'] = compound_variations
            compound_patterns.setdefault(unit_type, {})[unit_length_type] = compound_unit_info


def parse_date_fields(data, tree):
    date_fields = data.setdefault('date_fields', {})
    for elem in tree.findall('.//dates/fields/field'):
        field_type = elem.attrib['type']
        date_fields.setdefault(field_type, {})
        for rel_time in elem.findall('relativeTime'):
            rel_time_type = rel_time.attrib['type']
            for pattern in rel_time.findall('relativeTimePattern'):
                type_dict = date_fields[field_type].setdefault(rel_time_type, {})
                type_dict[pattern.attrib['count']] = str(pattern.text)


def parse_interval_formats(data, tree):
    # https://www.unicode.org/reports/tr35/tr35-dates.html#intervalFormats
    interval_formats = data.setdefault("interval_formats", {})
    for elem in tree.findall("dateTimeFormats/intervalFormats/*"):
        if 'draft' in elem.attrib:
            continue
        if elem.tag == "intervalFormatFallback":
            interval_formats[None] = elem.text
        elif elem.tag == "intervalFormatItem":
            skel_data = interval_formats.setdefault(elem.attrib["id"], {})
            for item_sub in elem:
                if item_sub.tag == "greatestDifference":
                    skel_data[item_sub.attrib["id"]] = split_interval_pattern(item_sub.text)
                else:
                    raise NotImplementedError("Not implemented: %s(%r)" % (item_sub.tag, item_sub.attrib))


def parse_currency_formats(data, tree):
    currency_formats = data.setdefault('currency_formats', {})
    for currency_format in tree.findall('.//currencyFormats'):
        if _should_skip_number_elem(data, currency_format):  # TODO: Support other number systems
            continue

        for length_elem in currency_format.findall('./currencyFormatLength'):
            curr_length_type = length_elem.attrib.get('type')
            for elem in length_elem.findall('currencyFormat'):
                type = elem.attrib.get('type')
                if curr_length_type:
                    # Handle `<currencyFormatLength type="short">`, etc.
                    # TODO(3.x): use nested dicts instead of colon-separated madness
                    type = '%s:%s' % (type, curr_length_type)
                if _should_skip_elem(elem, type, currency_formats):
                    continue
                for child in elem.iter():
                    if child.tag == 'alias':
                        currency_formats[type] = Alias(
                            _translate_alias(['currency_formats', elem.attrib['type']],
                                             child.attrib['path'])
                        )
                    elif child.tag == 'pattern':
                        pattern = str(child.text)
                        currency_formats[type] = numbers.parse_pattern(pattern)


def parse_currency_unit_patterns(data, tree):
    currency_unit_patterns = data.setdefault('currency_unit_patterns', {})
    for currency_formats_elem in tree.findall('.//currencyFormats'):
        if _should_skip_number_elem(data, currency_formats_elem):  # TODO: Support other number systems
            continue
        for unit_pattern_elem in currency_formats_elem.findall('./unitPattern'):
            count = unit_pattern_elem.attrib['count']
            pattern = str(unit_pattern_elem.text)
            currency_unit_patterns[count] = pattern


def parse_day_period_rules(tree):
    """
    Parse dayPeriodRule data into a dict.

    :param tree: ElementTree
    """
    day_periods = {}
    for ruleset in tree.findall(".//dayPeriodRuleSet"):
        ruleset_type = ruleset.attrib.get("type")  # None|"selection"
        for rules in ruleset.findall("dayPeriodRules"):
            locales = rules.attrib["locales"].split()
            for rule in rules.findall("dayPeriodRule"):
                type = rule.attrib["type"]
                if type in ("am", "pm"):  # These fixed periods are handled separately by `get_period_id`
                    continue
                rule = _compact_dict({
                    key: _time_to_seconds_past_midnight(rule.attrib.get(key))
                    for key in ("after", "at", "before", "from", "to")
                })
                for locale in locales:
                    dest_list = day_periods.setdefault(locale, {}).setdefault(ruleset_type, {}).setdefault(type, [])
                    dest_list.append(rule)
    return day_periods


def parse_character_order(data, tree):
    for elem in tree.findall('.//layout/orientation/characterOrder'):
        data['character_order'] = elem.text


def parse_measurement_systems(data, tree):
    measurement_systems = data.setdefault('measurement_systems', {})
    for measurement_system in tree.findall('.//measurementSystemNames/measurementSystemName'):
        type = measurement_system.attrib['type']
        if not _should_skip_elem(measurement_system, type=type, dest=measurement_systems):
            _import_type_text(measurement_systems, measurement_system, type=type)



if __name__ == '__main__':
    main()
duality-group / Babel python

Products

About

Resources

Contact Gemfury