Spaces:

LangTech-MT
/

document-translator

Sleeping

App Files Community

mjuvilla committed on Apr 25

Commit

127870b

1 Parent(s): ad4ed41

forgot to remove placeholder text

Browse files

Files changed (1) hide show

src/translate_any_doc.py +5 -10

src/translate_any_doc.py CHANGED Viewed

@@ -13,6 +13,7 @@ from src.aligner import Aligner
 import nltk
 import glob
 from nltk.tokenize import sent_tokenize, word_tokenize
 nltk.download('punkt')
 nltk.download('punkt_tab')
@@ -285,17 +286,11 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
     paragraphs_with_runs = [get_runs_from_paragraph(line.strip(), idx) for idx, line in
                             enumerate(open(plain_text_file).readlines())]
-    # translation = translate(open(original_moses_file).read(), ip, port)
     # translate using plaintext file
-    # translated_paragraphs = []
-    # for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
-    #     paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
-    #     translated_paragraphs.append(translate(paragraph_text, ip, port))
-    translated_paragraphs = ["Catalan",
-                             "Catalan (official name in Catalonia, the Balearic Islands, Andorra, the city of Alghero and traditional in Northern Catalonia) or Valencian (official name in the Valencian Community and traditional in Carxe) is a Romance language spoken in Catalonia, the Valencian Community (except for some regions and towns in the interior), the Balearic Islands (where it is also called Mallorcan, Menorcan, Ibizan or Formentera depending on the island), Andorra, the Franja de Ponent (in Aragon), the city of Alghero (on the island of Sardinia), Northern Catalonia, Carxe (a small territory of Murcia inhabited by Valencian settlers), and in communities around the world (among which Argentina stands out, with 200,000 speakers).",
-                             "It has ten million speakers, of whom almost half are native speakers; Its linguistic domain, with an area of 68,730 km² and 13,992,625 inhabitants (2013-2015), includes 1,687 municipal districts. In 2023, it was spoken as a mother tongue by more than four million people (29% of the population of the linguistic territory), of whom 2,924,610 in Catalonia, 1,190,672 in the Valencian Community and 327,384 in the Balearic Islands. Like the other Romance languages, Catalan comes from Vulgar Latin spoken by the Romans who settled in Hispania during ancient times."]
     # time to align the translation with the original
     print("Generating alignments...")

 import nltk
 import glob
 from nltk.tokenize import sent_tokenize, word_tokenize
+import tqdm
 nltk.download('punkt')
 nltk.download('punkt_tab')
     paragraphs_with_runs = [get_runs_from_paragraph(line.strip(), idx) for idx, line in
                             enumerate(open(plain_text_file).readlines())]
     # translate using plaintext file
+    translated_paragraphs = []
+    for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
+        paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
+        translated_paragraphs.append(translate(paragraph_text, ip, port))
     # time to align the translation with the original
     print("Generating alignments...")