File size: 8,534 Bytes
d916065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
# Natural Language Toolkit: BCP-47 language tags
#
# Copyright (C) 2022-2023 NLTK Project
# Author: Eric Kafe <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

import re
from warnings import warn
from xml.etree import ElementTree as et

from nltk.corpus.reader import CorpusReader


class BCP47CorpusReader(CorpusReader):
    """

    Parse BCP-47 composite language tags

    Supports all the main subtags, and the 'u-sd' extension:

    >>> from nltk.corpus import bcp47
    >>> bcp47.name('oc-gascon-u-sd-fr64')
    'Occitan (post 1500): Gascon: Pyrénées-Atlantiques'

    Can load a conversion table to Wikidata Q-codes:

    >>> bcp47.load_wiki_q()
    >>> bcp47.wiki_q['en-GI-spanglis']
    'Q79388'

    """

    def __init__(self, root, fileids):
        """
        Load the BCP-47 data files through the corpus reader.

        The IANA language subtag registry is parsed into ``self.db`` and
        the CLDR English subdivision names into ``self.subdiv``; the
        casing and format tables are installed last.
        """
        super().__init__(root, fileids)
        # data_dict() records language descriptions in this mapping, so it
        # has to exist before the registry is parsed.
        self.langcode = {}
        with self.open("iana/language-subtag-registry.txt") as registry:
            records = registry.read().split("%%\n")
            self.db = self.data_dict(records)
        with self.open("cldr/common-subdivisions-en.xml") as cldr:
            tree = et.parse(cldr)
            entries = tree.iterfind("localeDisplayNames/subdivisions/subdivision")
            self.subdiv = self.subdiv_dict(entries)
        self.morphology()

    def load_wiki_q(self):
        """Load conversion table to Wikidata Q-codes (only if needed)"""
        with self.open("cldr/tools-cldr-rdf-external-entityToCode.tsv") as fp:
            self.wiki_q = self.wiki_dict(fp.read().strip().split("\n")[1:])

    def wiki_dict(self, lines):
        """
        Build a BCP-47 tag to Q-code dictionary from tab-separated rows.

        The first column of each row is an entity reference, of which only
        the part after the last "/" is kept; the second column is the tag
        used as the dictionary key.
        """
        table = {}
        for row in lines:
            columns = row.strip().split("\t")
            entity, code = columns[0], columns[1]
            table[code] = entity.split("/")[-1]
        return table

    def subdiv_dict(self, subdivs):
        """
        Map each CLDR subdivision code to its display name.

        Every element contributes its ``type`` attribute as the key and its
        text content as the value.
        """
        names = {}
        for element in subdivs:
            names[element.attrib["type"]] = element.text
        return names

    def morphology(self):
        self.casing = {
            "language": str.lower,
            "extlang": str.lower,
            "script": str.title,
            "region": str.upper,
            "variant": str.lower,
        }
        dig = "[0-9]"
        low = "[a-z]"
        up = "[A-Z]"
        alnum = "[a-zA-Z0-9]"
        self.format = {
            "language": re.compile(f"{low*3}?"),
            "extlang": re.compile(f"{low*3}"),
            "script": re.compile(f"{up}{low*3}"),
            "region": re.compile(f"({up*2})|({dig*3})"),
            "variant": re.compile(f"{alnum*4}{(alnum+'?')*4}"),
            "singleton": re.compile(f"{low}"),
        }

    def data_dict(self, records):
        """Convert the BCP-47 language subtag registry to a dictionary"""
        self.version = records[0].replace("File-Date:", "").strip()
        dic = {}
        dic["deprecated"] = {}
        for label in [
            "language",
            "extlang",
            "script",
            "region",
            "variant",
            "redundant",
            "grandfathered",
        ]:
            dic["deprecated"][label] = {}
        for record in records[1:]:
            fields = [field.split(": ") for field in record.strip().split("\n")]
            typ = fields[0][1]
            tag = fields[1][1]
            if typ not in dic:
                dic[typ] = {}
            subfields = {}
            for field in fields[2:]:
                if len(field) == 2:
                    [key, val] = field
                    if key not in subfields:
                        subfields[key] = [val]
                    else:  # multiple value
                        subfields[key].append(val)
                else:  # multiline field
                    subfields[key][-1] += " " + field[0].strip()
                if (
                    "Deprecated" not in record
                    and typ == "language"
                    and key == "Description"
                ):
                    self.langcode[subfields[key][-1]] = tag
            for key in subfields:
                if len(subfields[key]) == 1:  # single value
                    subfields[key] = subfields[key][0]
            if "Deprecated" in record:
                dic["deprecated"][typ][tag] = subfields
            else:
                dic[typ][tag] = subfields
        return dic

    def val2str(self, val):
        """
        Return only first value

        Registry fields that occur several times in a record are stored as
        lists; for those, the first entry is returned. Any other value is
        returned unchanged.
        """
        if isinstance(val, list):
            return val[0]
        return val

    def lang2str(self, lg_record):
        """
        Join the names held in a parsed tag into one ": "-separated string.

        The language name always comes first (a record without a
        "language" entry raises ``KeyError``); the optional entries follow
        in a fixed order, whatever their order in ``lg_record``.
        """
        optional = ("extlang", "script", "region", "variant", "extension")
        parts = [f"{lg_record['language']}"]
        parts.extend(f"{lg_record[label]}" for label in optional if label in lg_record)
        return ": ".join(parts)

    def parse_tag(self, tag):
        """Convert a BCP-47 tag to a dictionary of labelled subtags"""
        subtags = tag.split("-")
        lang = {}
        labels = ["language", "extlang", "script", "region", "variant", "variant"]
        while subtags and labels:
            subtag = subtags.pop(0)
            found = False
            while labels:
                label = labels.pop(0)
                subtag = self.casing[label](subtag)
                if self.format[label].fullmatch(subtag):
                    if subtag in self.db[label]:
                        found = True
                        valstr = self.val2str(self.db[label][subtag]["Description"])
                        if label == "variant" and label in lang:
                            lang[label] += ": " + valstr
                        else:
                            lang[label] = valstr
                        break
                    elif subtag in self.db["deprecated"][label]:
                        found = True
                        note = f"The {subtag!r} {label} code is deprecated"
                        if "Preferred-Value" in self.db["deprecated"][label][subtag]:
                            prefer = self.db["deprecated"][label][subtag][
                                "Preferred-Value"
                            ]
                            note += f"', prefer '{self.val2str(prefer)}'"
                        lang[label] = self.val2str(
                            self.db["deprecated"][label][subtag]["Description"]
                        )
                        warn(note)
                        break
            if not found:
                if subtag == "u" and subtags[0] == "sd":  # CLDR regional subdivisions
                    sd = subtags[1]
                    if sd in self.subdiv:
                        ext = self.subdiv[sd]
                    else:
                        ext = f"<Unknown subdivision: {ext}>"
                else:  # other extension subtags are not supported yet
                    ext = f"{subtag}{''.join(['-'+ext for ext in subtags])}".lower()
                    if not self.format["singleton"].fullmatch(subtag):
                        ext = f"<Invalid extension: {ext}>"
                        warn(ext)
                lang["extension"] = ext
                subtags = []
        return lang

    def name(self, tag):
        """

        Convert a BCP-47 tag to a colon-separated string of subtag names



        >>> from nltk.corpus import bcp47

        >>> bcp47.name('ca-Latn-ES-valencia')

        'Catalan: Latin: Spain: Valencian'



        """
        for label in ["redundant", "grandfathered"]:
            val = None
            if tag in self.db[label]:
                val = f"{self.db[label][tag]['Description']}"
                note = f"The {tag!r} code is {label}"
            elif tag in self.db["deprecated"][label]:
                val = f"{self.db['deprecated'][label][tag]['Description']}"
                note = f"The {tag!r} code is {label} and deprecated"
                if "Preferred-Value" in self.db["deprecated"][label][tag]:
                    prefer = self.db["deprecated"][label][tag]["Preferred-Value"]
                    note += f", prefer {self.val2str(prefer)!r}"
            if val:
                warn(note)
                return val
        try:
            return self.lang2str(self.parse_tag(tag))
        except:
            warn(f"Tag {tag!r} was not recognized")
            return None