Spaces:
Build error
Build error
# Directory appended to sys.path so that the indicnlp package can be imported
# from it, and directory handed to indicnlp as its resources path.
INDIC_NLP_LIB_HOME = "indic_nlp_library"
INDIC_NLP_RESOURCES = "indic_nlp_resources"
import sys

# The path entry must be in place before the first indicnlp import; otherwise
# that import cannot be resolved from INDIC_NLP_LIB_HOME.
sys.path.append(r"{}".format(INDIC_NLP_LIB_HOME))
from indicnlp import transliterate
from indicnlp import common

# The resources path is set before loader.load() is called.
common.set_resources_path(INDIC_NLP_RESOURCES)
from indicnlp import loader

loader.load()
| from sacremoses import MosesPunctNormalizer | |
| from sacremoses import MosesTokenizer | |
| from sacremoses import MosesDetokenizer | |
| from collections import defaultdict | |
| import indicnlp | |
| from indicnlp.tokenize import indic_tokenize | |
| from indicnlp.tokenize import indic_detokenize | |
| from indicnlp.normalize import indic_normalize | |
| from indicnlp.transliterate import unicode_transliterate | |
def postprocess(
    infname, outfname, input_size, lang, common_lang="hi", transliterate=False
):
    """
    Parse fairseq interactive output, optionally convert the script back to the
    native Indic script (for Indic languages), detokenize, and write one
    sentence per line.

    infname: fairseq log file; only lines starting with 'H-' are used. Each is
        read as tab-separated fields: 'H-<sentence id>', score, hypothesis.
    outfname: output file of translations, written in sentence-id order.
        Sentences with no 'H-' line, or with an empty hypothesis, are written
        as empty lines.
    input_size: expected number of output sentences
    lang: language; "en" is detokenized with the Moses detokenizer, any other
        value with indic_detokenize.trivial_detokenize
    common_lang: language code passed as the source of the transliteration
        when transliterate is true
    transliterate: when true (and lang is not "en"), each hypothesis is
        transliterated from common_lang to lang before detokenization

    Raises IndexError if a sentence id is >= input_size, and ValueError if a
    sentence id is not an integer.
    """
    # Pre-fill one empty hypothesis per expected sentence so that ids missing
    # from the log still produce a line in the output.
    hypotheses = [""] * input_size

    with open(infname, "r", encoding="utf-8") as infile:
        for line in infile:
            if not line.startswith("H-"):
                continue
            fields = line.strip().split("\t")
            sid = int(fields[0].split("-")[1])
            # strip() also removes the trailing tab of a line whose hypothesis
            # is empty, leaving fewer than three fields; treat that as an
            # empty translation instead of failing on fields[2]. The score
            # field is not needed for the output and is ignored.
            hypotheses[sid] = fields[2] if len(fields) > 2 else ""

    if lang == "en":
        en_detok = MosesDetokenizer(lang="en")
        with open(outfname, "w", encoding="utf-8") as outfile:
            for sent in hypotheses:
                outfile.write(en_detok.detokenize(sent.split(" ")) + "\n")
    else:
        xliterator = unicode_transliterate.UnicodeIndicTransliterator()
        with open(outfname, "w", encoding="utf-8") as outfile:
            for sent in hypotheses:
                if transliterate:
                    sent = xliterator.transliterate(sent, common_lang, lang)
                outfile.write(indic_detokenize.trivial_detokenize(sent, lang) + "\n")
if __name__ == "__main__":
    # Usage: <script> infname outfname input_size lang [transliterate]
    #
    # Check the argument count before indexing sys.argv so that a short
    # command line reports "Invalid arguments" instead of an IndexError.
    if len(sys.argv) not in (5, 6):
        print(f"Invalid arguments: {sys.argv}")
        # Non-zero status so calling scripts can detect the failure.
        sys.exit(1)

    infname = sys.argv[1]
    outfname = sys.argv[2]
    input_size = int(sys.argv[3])
    lang = sys.argv[4]
    # The optional fifth argument enables transliteration only when it is
    # "true" (case-insensitive); any other value, or its absence, disables it.
    transliterate = len(sys.argv) == 6 and sys.argv[5].lower() == "true"

    postprocess(
        infname, outfname, input_size, lang, common_lang="hi", transliterate=transliterate
    )