Spaces:
Sleeping
Sleeping
forgot to remove placeholder text
Browse files
- src/translate_any_doc.py +5 -10
src/translate_any_doc.py
CHANGED
@@ -13,6 +13,7 @@ from src.aligner import Aligner
|
|
13 |
import nltk
|
14 |
import glob
|
15 |
from nltk.tokenize import sent_tokenize, word_tokenize
|
|
|
16 |
|
17 |
nltk.download('punkt')
|
18 |
nltk.download('punkt_tab')
|
@@ -285,17 +286,11 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
|
|
285 |
paragraphs_with_runs = [get_runs_from_paragraph(line.strip(), idx) for idx, line in
|
286 |
enumerate(open(plain_text_file).readlines())]
|
287 |
|
288 |
-
# translation = translate(open(original_moses_file).read(), ip, port)
|
289 |
-
|
290 |
# translate using plaintext file
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
translated_paragraphs = ["Catalan",
|
297 |
-
"Catalan (official name in Catalonia, the Balearic Islands, Andorra, the city of Alghero and traditional in Northern Catalonia) or Valencian (official name in the Valencian Community and traditional in Carxe) is a Romance language spoken in Catalonia, the Valencian Community (except for some regions and towns in the interior), the Balearic Islands (where it is also called Mallorcan, Menorcan, Ibizan or Formentera depending on the island), Andorra, the Franja de Ponent (in Aragon), the city of Alghero (on the island of Sardinia), Northern Catalonia, Carxe (a small territory of Murcia inhabited by Valencian settlers), and in communities around the world (among which Argentina stands out, with 200,000 speakers).",
|
298 |
-
"It has ten million speakers, of whom almost half are native speakers; Its linguistic domain, with an area of 68,730 km² and 13,992,625 inhabitants (2013-2015), includes 1,687 municipal districts. In 2023, it was spoken as a mother tongue by more than four million people (29% of the population of the linguistic territory), of whom 2,924,610 in Catalonia, 1,190,672 in the Valencian Community and 327,384 in the Balearic Islands. Like the other Romance languages, Catalan comes from Vulgar Latin spoken by the Romans who settled in Hispania during ancient times."]
|
299 |
|
300 |
# time to align the translation with the original
|
301 |
print("Generating alignments...")
|
|
|
13 |
import nltk
|
14 |
import glob
|
15 |
from nltk.tokenize import sent_tokenize, word_tokenize
|
16 |
+
import tqdm
|
17 |
|
18 |
nltk.download('punkt')
|
19 |
nltk.download('punkt_tab')
|
|
|
286 |
paragraphs_with_runs = [get_runs_from_paragraph(line.strip(), idx) for idx, line in
|
287 |
enumerate(open(plain_text_file).readlines())]
|
288 |
|
|
|
|
|
289 |
# translate using plaintext file
|
290 |
+
translated_paragraphs = []
|
291 |
+
for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
|
292 |
+
paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
|
293 |
+
translated_paragraphs.append(translate(paragraph_text, ip, port))
|
|
|
|
|
|
|
|
|
294 |
|
295 |
# time to align the translation with the original
|
296 |
print("Generating alignments...")
|