File size: 8,859 Bytes
68a8c29
6570b48
25a3c87
65485b5
bdc0541
65485b5
1e9b3ba
 
d7bf3e7
1e9b3ba
741cd0d
 
 
 
 
 
 
1e9b3ba
2f590b1
741cd0d
 
16fc4ca
92a84ae
 
 
9989672
25a3c87
5d305df
 
 
 
25a3c87
5d305df
 
92a84ae
dedac74
7a00e93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bdc0541
 
 
 
 
 
 
 
8329262
55f8482
741cd0d
 
 
 
 
10bed1d
741cd0d
 
 
 
 
 
 
 
 
bdc0541
741cd0d
 
 
bdc0541
741cd0d
 
 
 
 
 
 
537dd73
92a84ae
741cd0d
 
 
 
 
 
 
 
 
 
 
ca54e5d
741cd0d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa7755f
741cd0d
 
 
 
 
 
 
07e3fb8
741cd0d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import streamlit as st
import langcodes
from requests_html import HTMLSession
import urllib
import requests

# FEATURE: get wikipedia codes, e.g. from https://en.wikipedia.org/wiki/List_of_Wikipedias or https://meta.wikimedia.org/wiki/List_of_Wikipedias, some of which are nonstandard. Then output f"{code}.wikipedia.org"
# TODO: fix 'knh', it has an empty ISO section. Turns out some languages only have 639-3
# TODO: add in some nice things from https://docs.streamlit.io/library/cheatsheet like error codes and status messages. 
# TODO: add in vachan search even if lang not found
# TODO: results from glottolog even if none from others
# Manual regression inputs: edge cases that have previously tripped up the
# lookup chain (not consumed by the app itself; paste into the UI to test).
things_to_test = [ 
  "knh", # deprecated code on ISO
  "khn", # only has 639-3 on ISO
  "xxx", # no such code on ISO or glottolog
  "Chinese", # Vachan struggles. 
]


def get_bcp47_from_langcode(langtext):
  """Placeholder: intended to map a language code/name string to a BCP-47 tag.

  Currently unimplemented and unused — the body is a bare ``pass`` and the
  function always returns None. Kept as a stub for future work (see the
  module-level TODO/FEATURE comments).
  """
  pass

def pull_obsolete_codes(iso_code):
  """Scrape iso639-3.sil.org for older ISO-639 codes related to *iso_code*.

  Args:
    iso_code: an ISO 639-3 ("alpha3") code, e.g. "deu".

  Returns:
    dict mapping a code-family label ("639-1", "639-2/B", "639-2/T",
    "639-3") to the code string found on the page; empty dict when the
    page has no matching section.
  """
  obsolete_codes = {}
  # Labels as they appear on the SIL code page, e.g. "639-1: de".
  code_families = ("639-1", "639-2/B", "639-2/T", "639-3")
  session = HTMLSession()
  try:
    r = session.get(f"https://iso639-3.sil.org/code/{iso_code}")
    # CSS selector reference: https://www.w3schools.com/cssref/css_selectors.asp
    for found_element in r.html.find(".views-field-nothing", clean=True):
      for line in found_element.text.splitlines():
        for family in code_families:
          if family in line and ":" in line:
            # The code itself is the last whitespace-separated token.
            obsolete_codes[family] = line.split()[-1]
  finally:
    # Original leaked the session; close it so the underlying
    # requests.Session connections are released.
    session.close()
  return obsolete_codes


def try_retrieving_glottolog_id(langtext):
  """Search Glottolog for *langtext* and return its languoid ID if resolved.

  Glottolog redirects to .../resource/languoid/id/<id> when a search has a
  unique hit; we detect that by inspecting the final URL after redirects.

  Args:
    langtext: a language name or code to search for.

  Returns:
    The languoid ID string, or "" when the search did not resolve to a
    single languoid page.
  """
  languoid_id = ""
  session = HTMLSession()
  try:
    langtext_quoted = urllib.parse.quote(langtext)
    query_url = f"https://glottolog.org/glottolog?search={langtext_quoted}"
    glottolog_r = session.get(query_url)
    returned_url = glottolog_r.html.url
    if "languoid" in returned_url:
      # Final URL looks like .../resource/languoid/id/<languoid_id>
      languoid_id = returned_url.split("/")[-1]
  finally:
    # Original leaked the session; close it to release connections.
    session.close()
  return languoid_id

def try_searching_vachan_engine(langtext):
  """Query the Vachan Engine language API for *langtext*.

  Returns:
    The decoded JSON list of candidate matches, or [] on any non-200
    HTTP response.
  """
  encoded_query = urllib.parse.quote(langtext)
  query_url = f"https://api.vachanengine.org/v2/languages?search_word={encoded_query}"
  response = requests.get(query_url)
  return response.json() if response.status_code == 200 else []


def main():
  """Streamlit entry point: resolve a language code/name to a BCP-47 tag.

  Resolution order:
    1. Parse the input directly as a tag (langcodes.Language.get).
    2. Fall back to natural-language search (langcodes.find).
    3. On success, display the standardized tag plus older/related codes
       and links to external resources (r12a, Glottolog, SIL, Vachan).
  """
  st.write("# Language code/tag search")
  st.write("Fed up with language tag confusion? Here's your one-stop shop!")
  st.write("Try typing your language below! Accepts either codes like `eng`/`en`, or full names like `English` (or even some in other languages, try '法语' or 'français'), and we will use the [langcodes](https://github.com/rspeer/langcodes) library to figure out the correct modern BCP-47 code according to [official W3 Guidelines](https://www.w3.org/International/questions/qa-choosing-language-tags)")
  st.write(f"**Feedback:** Provide feedback at https://twitter.com/cleong110, or via slack: https://masakhane-nlp.slack.com/archives/D01DU3MHP7A")
  
  # https://huggingface.co/blog/streamlit-spaces
  # https://github.com/psf/requests-html
  # https://docs.streamlit.io/library/api-reference/write-magic/st.write
  example_valid_lookups = ["zh-CN", "Chinese", "zh-Latn-pinyin", "en-Latn-US", "en", "English", "fr-CA", "French (Canada)", "français", "法语"]
  langtext = st.text_input("Language Code/Tag Lookup using langcodes", "Swahili", help=f"Try language codes or language names! Examples: {example_valid_lookups}").strip()
  
  # Easter eggs: short-circuit before any real lookup.
  if langtext.lower() == "matlab":
    st.error("Matlab is not a real language! ¯\\_(ツ)_/¯")
    return
    
  if langtext.lower() == "python":
    # BUGFIX: the original Markdown link was missing its closing bracket.
    st.success("[Python is the best language!](https://www.python.org/)")
    return
  
  # TODO: st.code() for these "lookup in progress" outputs. 
  st.write("* Checking whether the tag is valid. That is, the language, script, territory, and variants (if present) are all tags that have meanings assigned by IANA.")
  
  if langcodes.tag_is_valid(langtext):
    st.write(f"* ...True! '{langtext}' parses meaningfully as a language tag according to IANA.")
  else:
    st.write(f"* ...False! '{langtext}' doesn't parse meaningfully as a language tag according to IANA, some of its subcomponents may be invalid or it might be a natural language description.")
      
  # Step 1: try to parse the input directly as a tag.
  try:
    lang = langcodes.Language.get(langtext)
    if "unknown" in lang.display_name().lower():
      # Parsed syntactically but maps to an unknown language: treat as a miss.
      st.write(f"* Attempting to lookup the code directly gives us '{lang.display_name()}', attempting to search for it as a natural language string.")
      lang = None
  except langcodes.LanguageTagError:
    st.write(f"* Could not lookup code directly, attempting to search for it as a natural language string.")
    lang = None
    
  # Step 2: fall back to natural-language search.
  if lang is None:
    try:
      lang = langcodes.find(langtext)
      st.write(f"* Natural language search found the following BCP-47 tag: {lang}")
    except LookupError:
      st.write("## Result: failure!")
      st.write(f"Unable to look up language code. But all hope is not lost...")
      st.write(f"* You can also try https://r12a.github.io/app-subtags/")    
      st.write(f"* Or possibly https://glottolog.org/glottolog?search={urllib.parse.quote(langtext)}")
      lang = None
  
  # Step 3: report results and related resources.
  if lang is not None: 
    b_variant = lang.to_alpha3(variant='B')
    t_variant = lang.to_alpha3(variant='T')
    broader_tags = lang.broader_tags()
    results_from_vachan = try_searching_vachan_engine(langtext)
    standardized_tag = langcodes.standardize_tag(lang)
    languoid_id = try_retrieving_glottolog_id(langtext)
    
    st.write(f"## Results: probably use '{standardized_tag}'")
    # TODO: make a results dictionary so it's easy to copy-paste?
    st.write(f"Best-match BCP-47 tag for '{langtext}', according to the langcodes library: {lang}")  
    st.write(f"Breakdown of tag components:")  
    st.write(lang.describe())
    st.write(f"Display name for {lang}: {lang.display_name()}")
    st.write(f"Autonym for {lang}: {lang.autonym()}")
    st.write(f"**Correct, standardized, BCP-47 tag for {langtext}, according to the langcodes library:** `{standardized_tag}`")
    
    st.write("## Further Information:")
  
    st.write(f"Broader tags for this language, if any:")
    st.write(broader_tags)
    
    st.write(f"### Language Subtag Search Tool")
    st.write(f"Try also: https://r12a.github.io/app-subtags/?lookup={lang}, which will likely have links to Ethnologue, Wikipedia, and Character usage. You can also try searching for '{langtext}' there!")
    
    st.write(f"### Glottolog")
    if languoid_id:
      st.write(f"**Glottolog Languoid ID:** Searching for '{langtext}' on Glottolog returns the following 'languoid ID': [{languoid_id}](https://glottolog.org/resource/languoid/id/{languoid_id})")
    st.write(f"https://glottolog.org/glottolog?search={t_variant} may be also of interest, with links to various resources including WALS, Wikidata, Odin, and OLAC. ")
    if t_variant != b_variant:
      st.write(f"If that doesn't work, try https://glottolog.org/glottolog?search={b_variant}, or put in a [custom search query](https://glottolog.org/glottolog)") 
    st.write(f"https://glottolog.org/glottolog?search={urllib.parse.quote(langtext)} may pull up something as well.")
  
    st.write("### Older / Related Codes")
  
    st.write(f"ISO 639-3 'alpha3' code, 'terminology' or 'T' variant (deprecated): {t_variant}")
    st.write(f"ISO 639-3 'alpha3' code, 'bibliographic' or 'B' variant (deprecated): {b_variant}")
    
    # ethnologue prefers T for german (deu), and T for French
    st.write(f"If it exists, the ISO 639 Code Tables entry for the T variant would be at https://iso639-3.sil.org/code/{t_variant}. That is also the code variant that typically has a working link to Ethnologue.")
    if t_variant != b_variant:
      st.write(f"If it exists, the ISO 639 Code Tables entry for the B variant would be at https://iso639-3.sil.org/code/{b_variant}")  
  
    st.write("#### Codes scraped from iso639-3.sil.org")
    t_obsolete_codes = pull_obsolete_codes(t_variant)
    if t_obsolete_codes:
      st.write(f"Obsolete codes from previous ISO-639 iterations, pulled from https://iso639-3.sil.org/code/{t_variant}:")
      st.write(t_obsolete_codes)
    else:
      # Only fetch the B-variant page when it's actually a different code;
      # the original fetched it unconditionally (a duplicate request when
      # the T and B variants coincide).
      b_obsolete_codes = pull_obsolete_codes(b_variant) if b_variant != t_variant else {}
      if b_obsolete_codes:
        st.write(f"Obsolete codes from previous ISO-639 iterations, pulled from https://iso639-3.sil.org/code/{b_variant}:")
        st.write(b_obsolete_codes)
      
    if results_from_vachan:
      st.write("### Other potential matches, from [Vachan Engine](https://github.com/Bridgeconn/vachan-api/tree/version-2) (experimental)")
      st.write(results_from_vachan)

      
# Run the Streamlit app when executed as a script.
if __name__ == "__main__":
  main()