mjuvilla committed on
Commit
127870b
·
1 Parent(s): ad4ed41

forgot to remove placeholder text

Browse files
Files changed (1) hide show
  1. src/translate_any_doc.py +5 -10
src/translate_any_doc.py CHANGED
@@ -13,6 +13,7 @@ from src.aligner import Aligner
13
  import nltk
14
  import glob
15
  from nltk.tokenize import sent_tokenize, word_tokenize
 
16
 
17
  nltk.download('punkt')
18
  nltk.download('punkt_tab')
@@ -285,17 +286,11 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
285
  paragraphs_with_runs = [get_runs_from_paragraph(line.strip(), idx) for idx, line in
286
  enumerate(open(plain_text_file).readlines())]
287
 
288
- # translation = translate(open(original_moses_file).read(), ip, port)
289
-
290
  # translate using plaintext file
291
- # translated_paragraphs = []
292
- # for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
293
- # paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
294
- # translated_paragraphs.append(translate(paragraph_text, ip, port))
295
-
296
- translated_paragraphs = ["Catalan",
297
- "Catalan (official name in Catalonia, the Balearic Islands, Andorra, the city of Alghero and traditional in Northern Catalonia) or Valencian (official name in the Valencian Community and traditional in Carxe) is a Romance language spoken in Catalonia, the Valencian Community (except for some regions and towns in the interior), the Balearic Islands (where it is also called Mallorcan, Menorcan, Ibizan or Formentera depending on the island), Andorra, the Franja de Ponent (in Aragon), the city of Alghero (on the island of Sardinia), Northern Catalonia, Carxe (a small territory of Murcia inhabited by Valencian settlers), and in communities around the world (among which Argentina stands out, with 200,000 speakers).",
298
- "It has ten million speakers, of whom almost half are native speakers; Its linguistic domain, with an area of 68,730 km² and 13,992,625 inhabitants (2013-2015), includes 1,687 municipal districts. In 2023, it was spoken as a mother tongue by more than four million people (29% of the population of the linguistic territory), of whom 2,924,610 in Catalonia, 1,190,672 in the Valencian Community and 327,384 in the Balearic Islands. Like the other Romance languages, Catalan comes from Vulgar Latin spoken by the Romans who settled in Hispania during ancient times."]
299
 
300
  # time to align the translation with the original
301
  print("Generating alignments...")
 
13
  import nltk
14
  import glob
15
  from nltk.tokenize import sent_tokenize, word_tokenize
16
+ import tqdm
17
 
18
  nltk.download('punkt')
19
  nltk.download('punkt_tab')
 
286
  paragraphs_with_runs = [get_runs_from_paragraph(line.strip(), idx) for idx, line in
287
  enumerate(open(plain_text_file).readlines())]
288
 
 
 
289
  # translate using plaintext file
290
+ translated_paragraphs = []
291
+ for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
292
+ paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
293
+ translated_paragraphs.append(translate(paragraph_text, ip, port))
 
 
 
 
294
 
295
  # time to align the translation with the original
296
  print("Generating alignments...")