Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| from uuid import uuid4 | |
| import langcodes | |
| import itertools | |
# English names of the languages used as the default "language names" example
# (the LaBSE language list), kept in their original order.
example_languages_from_labse = [
    "Afrikaans", "Albanian", "Amharic", "Arabic", "Armenian", "Assamese",
    "Azerbaijani", "Basque", "Belarusian", "Bengali", "Bosnian", "Bulgarian",
    "Burmese", "Catalan", "Cebuano", "Chinese", "Corsican", "Croatian",
    "Czech", "Danish", "Dutch", "English", "Esperanto", "Estonian",
    "Finnish", "French", "Western Frisian", "Galician", "Georgian", "German",
    "Greek", "Gujarati", "Haitian", "Hausa", "Hawaiian", "Hebrew",
    "Hindi", "Hmong", "Hungarian", "Icelandic", "Igbo", "Indonesian",
    "Irish", "Italian", "Japanese", "Javanese", "Kannada", "Kazakh",
    "Khmer", "Kinyarwanda", "Korean", "Kurdish", "Kyrgyz", "Lao",
    "Latin", "Latvian", "Lithuanian", "Luxembourgish", "Macedonian", "Malagasy",
    "Malay", "Malayalam", "Maltese", "Māori", "Marathi", "Mongolian",
    "Nepali", "Norwegian", "Chichewa", "Oriya", "Persian", "Polish",
    "Portuguese", "Panjabi", "Romanian", "Russian", "Samoan", "Scottish Gaelic",
    "Serbian", "Southern Sotho", "Shona", "Sinhala", "Slovak", "Slovenian",
    "Somali", "Spanish", "Sundanese", "Swahili", "Swedish", "Tagalog",
    "Tajik", "Tamil", "Tatar", "Telugu", "Thai", "Tibetan",
    "Turkish", "Turkmen", "Uyghur", "Ukrainian", "Urdu", "Uzbek",
    "Vietnamese", "Welsh", "Wolof", "Xhosa", "Yiddish", "Yoruba",
    "Zulu",
]
| # example_language_tag_string_from_labse = """af | |
| # sq | |
| # am | |
| # ar | |
| # hy | |
| # as | |
| # az | |
| # eu | |
| # be | |
| # bn | |
| # bs | |
| # bg | |
| # my | |
| # ca | |
| # ceb | |
| # zh | |
| # co | |
| # hr | |
| # cs | |
| # da | |
| # nl | |
| # en | |
| # eo | |
| # et | |
| # fi | |
| # fr | |
| # fy | |
| # gl | |
| # ka | |
| # de | |
| # el | |
| # gu | |
| # ht | |
| # ha | |
| # haw | |
| # he | |
| # hi | |
| # hmn | |
| # hu | |
| # is | |
| # ig | |
| # id | |
| # ga | |
| # it | |
| # ja | |
| # jv | |
| # kn | |
| # kk | |
| # km | |
| # rw | |
| # ko | |
| # ku | |
| # ky | |
| # lo | |
| # la | |
| # lv | |
| # lt | |
| # lb | |
| # mk | |
| # mg | |
| # ms | |
| # ml | |
| # mt | |
| # mi | |
| # mr | |
| # mn | |
| # ne | |
| # no | |
| # ny | |
| # or | |
| # fa | |
| # pl | |
| # pt | |
| # pa | |
| # ro | |
| # ru | |
| # sm | |
| # gd | |
| # sr | |
| # st | |
| # sn | |
| # si | |
| # sk | |
| # sl | |
| # so | |
| # es | |
| # su | |
| # sw | |
| # sv | |
| # tl | |
| # tg | |
| # ta | |
| # tt | |
| # te | |
| # th | |
| # bo | |
| # tr | |
| # tk | |
| # ug | |
| # uk | |
| # ur | |
| # uz | |
| # vi | |
| # cy | |
| # wo | |
| # xh | |
| # yi | |
| # yo | |
| # zu""" | |
# Language tags in the bullet-list form used as the default "language tags"
# example. Each non-empty line looks like "- xx"; only the final
# whitespace-separated token (the tag itself) is kept.
labse_huggingface_tags = [
    bullet_line.split()[-1]
    for bullet_line in """- af
- sq
- am
- ar
- hy
- as
- az
- eu
- be
- bn
- bs
- bg
- my
- ca
- ceb
- zh
- co
- hr
- cs
- da
- nl
- en
- eo
- et
- fi
- fr
- fy
- gl
- ka
- de
- el
- gu
- ht
- ha
- haw
- he
- hi
- hmn
- hu
- is
- ig
- id
- ga
- it
- ja
- jv
- kn
- kk
- km
- rw
- ko
- ku
- ky
- lo
- la
- lv
- lt
- lb
- mk
- mg
- ms
- ml
- mt
- mi
- mr
- mn
- ne
- no
- ny
- or
- fa
- pl
- pt
- pa
- ro
- ru
- sm
- gd
- sr
- st
- sn
- si
- sk
- sl
- so
- es
- su
- sw
- sv
- tl
- tg
- ta
- tt
- te
- th
- bo
- tr
- tk
- ug
- uk
- ur
- uz
- vi
- cy
- wo
- xh
- yi
- yo
- zu""".splitlines()
    if bullet_line
]
def match_based_on_tag_distance(model_languages, data_languages, model_name, data_name="eBible", dedupe=False, threshold=9):
    """Pair up model and data languages whose langcodes tag distance is small.

    Every (model language, data language) combination is scored with
    ``langcodes.tag_distance``; pairs scoring at or below ``threshold`` are
    kept as matches. Counts are printed to stdout along the way.

    Args:
        model_languages: Languages supported by the model. Each item is passed
            unchanged to ``langcodes.tag_distance`` and must be hashable.
        data_languages: Languages present in the data; same requirements.
        model_name: Accepted for caller compatibility; not used in this body.
        data_name: Accepted for caller compatibility; not used in this body.
        dedupe: When true, repeated entries are dropped from both lists before
            matching (first occurrence kept, original order preserved).
        threshold: Largest tag distance that still counts as a match.

    Returns:
        A 5-tuple ``(matches, model_unmatched, data_unmatched,
        model_languages, data_languages)``. ``matches`` is a list of
        ``(model_lang, data_lang, tag_distance)`` tuples, the two
        ``*_unmatched`` lists hold the languages that appear in no match, and
        the last two items are the (possibly deduplicated) input lists.
    """
    print(f"Model language count: {len(model_languages)}")
    print(f"Data language count: {len(data_languages)}")

    if dedupe:
        print("Filtering for duplicates...")
        # dict.fromkeys drops repeats like set() does, but keeps a stable,
        # reproducible order instead of an arbitrary one.
        model_languages = list(dict.fromkeys(model_languages))
        data_languages = list(dict.fromkeys(data_languages))
        print(f"Model languages remaining: {len(model_languages)}")
        print(f"Data language remaining: {len(data_languages)}")

    combination_count = len(model_languages) * len(data_languages)
    print(f"checking {len(model_languages)} model languages against {len(data_languages)} data languages, giving {combination_count} combinations")

    # Match based on tag distances. The product is consumed lazily, and no
    # progress-bar wrapper is used: the original called ``tqdm`` without
    # importing it, which raised NameError on every call.
    tag_distance_matches = []
    for model_lang, data_lang in itertools.product(model_languages, data_languages):
        tag_distance = langcodes.tag_distance(model_lang, data_lang)
        if tag_distance <= threshold:
            tag_distance_matches.append((model_lang, data_lang, tag_distance))

    # Build the matched sets once so each membership test is O(1).
    matched_model_languages = {match[0] for match in tag_distance_matches}
    matched_data_languages = {match[1] for match in tag_distance_matches}
    model_unmatched = [lang for lang in model_languages if lang not in matched_model_languages]
    data_unmatched = [lang for lang in data_languages if lang not in matched_data_languages]

    print(f"Found {len(tag_distance_matches)} matches, {len(model_unmatched)} model languages not matched")
    return tag_distance_matches, model_unmatched, data_unmatched, model_languages, data_languages
def parse_language_list(key_prefix=None):
    """Render the widgets for entering a language list and parse the entries.

    Shows a format selector and a text area pre-filled with the matching
    example list, then converts each comma-separated entry with langcodes:
    ``langcodes.find`` for language names, ``langcodes.get`` for tags/codes.
    Blank entries are skipped; entries langcodes rejects are left out of the
    result and reported on stdout.

    Args:
        key_prefix: Optional prefix for the Streamlit widget keys. When
            omitted, a prefix is derived from the number of earlier calls made
            without one, so repeated calls in one script run get distinct
            keys.

    Returns:
        The list of objects langcodes returned for the entries that parsed.
    """
    if key_prefix is None:
        # A fresh uuid4() key on every rerun gives the widget a new identity
        # each time, which resets whatever the user selected or typed. A
        # per-call counter gives keys that are unique within a run and
        # repeat across reruns.
        # NOTE(review): relies on the script (and therefore this function
        # object) being re-created on each Streamlit rerun - confirm if this
        # function is ever moved into an imported module.
        call_index = getattr(parse_language_list, "_auto_key_calls", 0)
        parse_language_list._auto_key_calls = call_index + 1
        key_prefix = f"parse_language_list_{call_index}"

    language_list_options = ["Language names", "Language Tags/Codes"]
    language_list_type = st.selectbox(
        "What format is your language list?",
        language_list_options,
        key=f"{key_prefix}_format",
    )

    language_list = []
    not_parsed = []

    if language_list_type == language_list_options[0]:
        # Plain join: nesting double quotes inside a double-quoted f-string
        # is a SyntaxError before Python 3.12.
        languages_input = st.text_area(
            "Language names, comma-separated",
            ",".join(example_languages_from_labse),
            key=f"{key_prefix}_names",
        )
        for entry in languages_input.split(","):
            name = entry.strip()
            if not name:
                # Trailing commas / empty input produce blank entries.
                continue
            try:
                language_list.append(langcodes.find(name))
            except LookupError:
                not_parsed.append(name)
    elif language_list_type == language_list_options[1]:
        languages_input = st.text_area(
            "Language tags, comma-separated",
            ",".join(labse_huggingface_tags),
            key=f"{key_prefix}_tags",
        )
        for entry in languages_input.split(","):
            tag = entry.strip()
            if not tag:
                continue
            try:
                language_list.append(langcodes.get(tag))
            except langcodes.tag_parser.LanguageTagError as e:
                print(e)
                not_parsed.append(tag)

    if not_parsed:
        # Kept out of the page on purpose; logged so failures are not silent.
        print(f"Langcodes could not parse {not_parsed}")

    st.write(f"Langcodes list: {language_list}")
    return language_list
# Render two independent language-list inputs, one after the other, and keep
# the parsed result of each.
first_lang_list, second_lang_list = parse_language_list(), parse_language_list()