diff options
author | Adriaan de Groot <groot@kde.org> | 2019-05-10 11:46:21 +0200 |
---|---|---|
committer | Adriaan de Groot <groot@kde.org> | 2019-05-10 13:06:55 +0200 |
commit | bf5ce44bd1b5ecab1d7b9bb77431a53ff2475d5b (patch) | |
tree | e5bce7610716ca48a89eac678d81f2e6a539712d | |
parent | 2a5d99be5007d6579329408a57370afd490b315a (diff) |
[libcalamares] Tidy up the CLDR script, add license, make more pythonic
-rw-r--r-- | src/libcalamares/locale/cldr-extractor.py | 254 |
1 files changed, 196 insertions, 58 deletions
diff --git a/src/libcalamares/locale/cldr-extractor.py b/src/libcalamares/locale/cldr-extractor.py index 6c4061fcc..06fb69e3b 100644 --- a/src/libcalamares/locale/cldr-extractor.py +++ b/src/libcalamares/locale/cldr-extractor.py @@ -1,30 +1,67 @@ #! /usr/bin/env python3 # +# === This file is part of Calamares - <https://github.com/calamares> === +# # Python3 script to scrape some data out of ICU CLDR supplemental data. # -# To use this script, you must have downloaded the CLDR data, e.g. -# http://unicode.org/Public/cldr/35.1/, and extracted the zip file. -# Run the script from **inside** the common/ durectory that is created -# (or fix the hard-coded path). +### BEGIN LICENSES +# +# Copyright 2019 Adriaan de Groot <groot@kde.org> # -# The script tries to print C++ code that compiles; if there are encoding -# problems, it will print some kind of representation of the problematic -# lines. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: # -# To avoid having to cross-reference multiple XML files, the script -# cheats: it reads the comments as well to get names. So it looks for -# pairs of lines like this: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. # -# <likelySubtag from="und_BQ" to="pap_Latn_BQ"/> -# <!--{ ?; ?; Caribbean Netherlands } => { Papiamento; Latin; Caribbean Netherlands }--> +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. # -# It extracts the 2-character country code "BQ" from the sub-tag, and -# parses the comment to get a language and country name (instead of looking up -# "pap" and "BQ" in other tables). This may be considered a hack. +### END LICENSES + +### BEGIN USAGE # -# A large collection of exceptions can be found in the two *_mapper tables, -# which massage the CLDR names to Qt enum values. +""" +Python3 script to scrape some data out of ICU CLDR supplemental data. + +To use this script, you must have downloaded the CLDR data, e.g. +http://unicode.org/Public/cldr/35.1/, and extracted the zip file. +Run the script from **inside** the common/ directory that is created +(or fix the hard-coded path). + +The script tries to print C++ code that compiles; if there are encoding +problems, it will print some kind of representation of the problematic +lines. + +To avoid having to cross-reference multiple XML files, the script +cheats: it reads the comments as well to get names. So it looks for +pairs of lines like this: + + <likelySubtag from="und_BQ" to="pap_Latn_BQ"/> + <!--{ ?; ?; Caribbean Netherlands } => { Papiamento; Latin; Caribbean Netherlands }--> +It extracts the 2-character country code "BQ" from the sub-tag, and +parses the comment to get a language and country name (instead of looking up +"pap" and "BQ" in other tables). This may be considered a hack. 
+ +A large collection of exceptions can be found in the two *_mapper tables, +which massage the CLDR names to Qt enum values. +""" +# +### END USAGE import sys @@ -64,9 +101,64 @@ country_mapper = { "Eurozone" : "AnyCountry", # Not likely for GeoIP "Caribbean Netherlands" : "Bonaire", # Bonaire, Saba, St.Eustatius } + +class CountryData: + def __init__(self, country_code, language_name, country_name): + """ + Takes a 2-letter country code, and enum names from + QLocale::Language and QLocale::Country. An empty + @p country_code is acceptable, for the terminating + entry in the data array (and yields a 0,0 code). + """ + if country_code: + assert len(country_code) == 2 + self.country_code = country_code + self.language_enum = language_name + self.country_enum = country_name + else: + self.country_code = "" + self.language_enum = "AnyLanguage" + self.country_enum = "AnyCountry" -def extricate(l1, l2): - if '"und_' not in l1: + def __str__(self): + if self.country_code: + char0 = "'{!s}'".format(self.country_code[0]) + char1 = "'{!s}'".format(self.country_code[1]) + else: + char0 = "0" + char1 = "0" + + return "{!s} QLocale::Language::{!s}, QLocale::Country::{!s}, {!s}, {!s} {!s},".format( + "{", + self.language_enum, + self.country_enum, + char0, + char1, + "}") + + # Must match type name below + cpp_classname = "CountryData" + + # Must match the output format of __str__ above + cpp_declaration = """ +struct CountryData +{ + QLocale::Language l; + QLocale::Country c; + char cc1; + char cc2; +}; +""" + + +def extricate_subtags(l1, l2): + """ + Given two lines @p l1 and @p l2 which are the <likelySubtag> element-line + and the comment-line underneath it, return a CountryData for them, + or None if the two lines are not relevant (e.g. not the right subtag from, + or 3-letter country codes). 
+ """ + if 'from="und_' not in l1: return if '{ ?; ?;' not in l2: return @@ -94,47 +186,93 @@ def extricate(l1, l2): l2_country = country_mapper.get(l2_country, l2_country) l2_country = l2_country.replace(" ", "").replace("-", "").replace(".","").replace("&","And") - # There shouldn't be any UTF-8 left in there. - try: - print("{!s} QLocale::Language::{!s}, QLocale::Country::{!s}, '{!s}', '{!s}' {!s},".format( - "{", - l2_language, - l2_country, - l1_code[0], - l1_code[1], - "}")) - except UnicodeEncodeError: - print(list(map(lambda x : '?' if x > 128 else chr(x), map(lambda x:ord(x), l2_country)))) - raise - -print("""// Generated from CLDR data -#include <QLocale> -struct CountryData -{ - QLocale::Language l; - QLocale::Country c; - char cc1; - char cc2; -}; + return CountryData(l1_code, l2_language, l2_country) + + +def read_subtags_file(): + """ + Returns a list of CountryData objects from the likelySubtags file. + """ + data = [] + + with open("supplemental/likelySubtags.xml", "rt", encoding="UTF-8") as f: + l1 = "a line" + while l1: + l1 = f.readline() + if '<likelySubtag from="und_' not in l1: + continue + l2 = f.readline() + + if l1: + assert "likelySubtag" in l1, l1; + assert "<!--" in l2, l2; + + data.append(extricate_subtags(l1, l2)) -static const CountryData countryMap[] = { -""") + data.append(CountryData("", None, None)) + return [c for c in data if c is not None] -with open("supplemental/likelySubtags.xml", "rt", encoding="UTF-8") as f: - l1 = "a line" - while l1: - l1 = f.readline() - if '<likelySubtag from="und_' not in l1: - continue - l2 = f.readline() - if l1: - assert "likelySubtag" in l1, l1; - assert "<!--" in l2, l2; +cpp_header_comment = """/* GENERATED FILE DO NOT EDIT +* +* === This file is part of Calamares - <https://github.com/calamares> === +* +* This file is derived from CLDR data from Unicode, Inc. Applicable terms: +* +* A. Unicode Copyright +* 1. Copyright © 1991-2019 Unicode, Inc. All rights reserved. +* B. 
Definitions +* Unicode Data Files ("DATA FILES") include all data files under the directories: +* https://www.unicode.org/Public/ +* C. Terms of Use +* 2. Any person is hereby authorized, without fee, to view, use, reproduce, +* and distribute all documents and files, subject to the Terms and +* Conditions herein. +*/ + +// BEGIN Generated from CLDR data + +""" + +cpp_footer_comment = """ +// END Generated from CLDR data +""" + + +def make_identifier(classname): + """ + Given a class name (e.g. CountryData) return an identifier + for the data-table for that class. + """ + identifier = [ classname[0].lower() ] + for c in classname[1:]: + if c.isupper(): + identifier.extend(["_", c.lower()]) + else: + identifier.append(c) - extricate(l1, l2) + identifier.append("_table") + return "".join(identifier) -print("""{ QLocale::Language::AnyLanguage, QLocale::Country::AnyCountry, 0, 0 } // Terminator -}; -// END Generated from CLDR data -""") + +def export_class(cls, data): + """ + Given a @p cls and a list of @p data objects from that class, + write a C++ file for that data (named after the class, e.g. CountryData_p.cpp). + """ + with open("{!s}_p.cpp".format(cls.cpp_classname), "wt", encoding="UTF-8") as f: + f.write(cpp_header_comment) + f.write(cls.cpp_declaration) + f.write("\nstatic const {!s} {!s}[] = {!s}\n".format( + cls.cpp_classname, + make_identifier(cls.cpp_classname), + "{")) + for d in data: + f.write(str(d)) + f.write("\n") + f.write("};\n\n"); + f.write(cpp_footer_comment) + + +if __name__ == "__main__": + export_class(CountryData, read_subtags_file()) |