mjuvilla committed
Commit 36f2ac1 · 1 Parent(s): 0348f21

added execution time computation for alignments and cleanup old imports

Files changed (1)
  1. translate_docx.py +2 -5
translate_docx.py CHANGED
@@ -3,8 +3,6 @@ import json
 import requests
 import tqdm
 import os
-import string
-from collections import defaultdict
 
 from docx import Document
 from docx.text.hyperlink import Hyperlink
@@ -16,7 +14,6 @@ nltk.download('punkt')
 nltk.download('punkt_tab')
 
 from nltk.tokenize import sent_tokenize, word_tokenize
-from nltk.tokenize.treebank import TreebankWordDetokenizer
 
 from subprocess import Popen, PIPE
 
@@ -318,16 +315,16 @@ def translate_document(input_file,
     processed_original_paragraphs_with_runs = [preprocess_runs(runs) for runs in paragraphs_with_runs]
 
     print("Generating alignments...")
+    start_time = time.time()
     translated_sentences_with_style = generate_alignments(processed_original_paragraphs_with_runs,
                                                            translated_paragraphs, aligner,
                                                            temp_folder, detokenizer)
-    print("Finished alignments")
+    print(f"Finished alignments in {time.time() - start_time} seconds")
 
     # flatten the sentences into a list of tokens
     translated_tokens_with_style = [item for sublist in translated_sentences_with_style for item in sublist]
     # group the tokens by style/run
     translated_runs_with_style = group_by_style(translated_tokens_with_style, detokenizer)
-    print("Grouped by style")
 
     # group the runs by original paragraph
     translated_paragraphs_with_style = dict()
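
The change times the alignment step with the simple start/stop pattern around a single call. The sketch below shows that pattern in isolation; it is not code from translate_docx.py. slow_alignment_step is a hypothetical stand-in for generate_alignments, and the real module is assumed to already have "import time" available, since this diff does not add that import.

import time

def slow_alignment_step():
    # hypothetical stand-in for generate_alignments(); sleep simulates work
    time.sleep(0.5)

print("Generating alignments...")
start_time = time.time()              # timestamp before the call
slow_alignment_step()
elapsed = time.time() - start_time    # elapsed wall-clock seconds
print(f"Finished alignments in {elapsed:.2f} seconds")

For interval measurements, time.perf_counter() is often preferred over time.time() because it is monotonic and unaffected by system clock adjustments; time.time(), as used in the commit, is adequate for coarse progress logging like this.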