mjuvilla committed on
Commit
6e54822
·
1 Parent(s): 127870b

integrated any-doc into the gradio app, separated the translation side to make it easier to implement other translation models

Browse files
gradio_app.py CHANGED
@@ -1,8 +1,6 @@
1
  import gradio as gr
2
- from pathlib import Path
3
- import requests
4
- import json
5
- from translate_docx import translate_document, translate, Aligner
6
  from nltk.tokenize.treebank import TreebankWordDetokenizer
7
 
8
 
@@ -16,7 +14,7 @@ detokenizer = TreebankWordDetokenizer()
16
 
17
 
18
  def upload_file(filepath):
19
- translated_file_name = translate_document(filepath, aligner, detokenizer, ip)
20
  return [gr.UploadButton(visible=False), gr.DownloadButton(label=f"Download {translated_file_name}", value=translated_file_name, visible=True)]
21
 
22
  def download_file():
 
1
  import gradio as gr
2
+ from src.translate_any_doc import translate_document, translate
3
+ from src.aligner import Aligner
 
 
4
  from nltk.tokenize.treebank import TreebankWordDetokenizer
5
 
6
 
 
14
 
15
 
16
  def upload_file(filepath):
17
+ translated_file_name = translate_document(filepath, source_lang, target_lang, aligner, detokenizer, ip)
18
  return [gr.UploadButton(visible=False), gr.DownloadButton(label=f"Download {translated_file_name}", value=translated_file_name, visible=True)]
19
 
20
  def download_file():
src/mtuoc_aina_translator.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+ from nltk.tokenize import sent_tokenize
4
+
5
class MTUOCAinaTranslator:
    """Translator backed by an HTTP translation server.

    Each call to ``translate`` POSTs the text to
    ``http://<ip>:<port>/translate`` and returns the ``tgt`` field of the
    JSON reply.
    """

    def __init__(self, ip: str, port, timeout=None):
        """Store the connection settings; no request is made here.

        Args:
            ip: Host name or IP address of the server.
            port: Port of the server, given as a string or an int.
            timeout: Value forwarded to ``requests.post`` as ``timeout``.
                The default ``None`` sets no timeout, which matches the
                previous behaviour.
        """
        self.ip = ip
        self.port = port
        self.timeout = timeout

    def _endpoint(self) -> str:
        """Return the URL of the server's ``/translate`` endpoint."""
        # An f-string formats an int port as well as a str one, whereas
        # plain ``+`` concatenation raised TypeError for ints.
        return f"http://{self.ip}:{self.port}/translate"

    def translate(self, text):
        """Send *text* to the server and return the reply's ``tgt`` value.

        The whole text is sent in a single request.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            KeyError: If the JSON reply has no ``tgt`` field.
        """
        payload = {
            'id': '1',
            'src': text,
        }
        response = requests.post(self._endpoint(), json=payload, timeout=self.timeout)
        # Surface the HTTP status itself instead of a confusing JSON
        # decoding error or KeyError when the server reports a failure.
        response.raise_for_status()
        return response.json()['tgt']
src/translate_any_doc.py CHANGED
@@ -1,8 +1,5 @@
1
  import shutil
2
  import time
3
- import json
4
-
5
- import requests
6
  import os
7
  from itertools import groupby
8
  from subprocess import Popen, PIPE
@@ -18,21 +15,6 @@ import tqdm
18
  nltk.download('punkt')
19
  nltk.download('punkt_tab')
20
 
21
- ip = "192.168.20.216"
22
- port = "8000"
23
-
24
-
25
- def translate(text, ip, port):
26
- myobj = {
27
- 'id': '1',
28
- 'src': text,
29
- }
30
- port = str(int(port))
31
- url = 'http://' + ip + ':' + port + '/translate'
32
- x = requests.post(url, json=myobj)
33
- json_response = json.loads(x.text)
34
- return json_response['tgt']
35
-
36
 
37
  def doc_to_plain_text(input_file: str, source_lang: str, target_lang: str, tikal_folder: str,
38
  original_xliff_file_path: str) -> str:
@@ -268,11 +250,10 @@ def runs_to_plain_text(paragraphs_with_style: dict[str, list[dict[str, str, str]
268
 
269
 
270
  def translate_document(input_file: str, source_lang: str, target_lang: str,
 
271
  aligner: Aligner,
272
  detokenizer,
273
- ip: str = "192.168.20.216",
274
  temp_folder: str = "tmp",
275
- port: str = "8000",
276
  tikal_folder: str = "okapi-apps_gtk2-linux-x86_64_1.47.0") -> str:
277
  input_filename = input_file.split("/")[-1]
278
  # copy the original file to the temporal folder to avoid common issues with tikal
@@ -290,7 +271,7 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
290
  translated_paragraphs = []
291
  for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
292
  paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
293
- translated_paragraphs.append(translate(paragraph_text, ip, port))
294
 
295
  # time to align the translation with the original
296
  print("Generating alignments...")
 
1
  import shutil
2
  import time
 
 
 
3
  import os
4
  from itertools import groupby
5
  from subprocess import Popen, PIPE
 
15
  nltk.download('punkt')
16
  nltk.download('punkt_tab')
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  def doc_to_plain_text(input_file: str, source_lang: str, target_lang: str, tikal_folder: str,
20
  original_xliff_file_path: str) -> str:
 
250
 
251
 
252
  def translate_document(input_file: str, source_lang: str, target_lang: str,
253
+ translator,
254
  aligner: Aligner,
255
  detokenizer,
 
256
  temp_folder: str = "tmp",
 
257
  tikal_folder: str = "okapi-apps_gtk2-linux-x86_64_1.47.0") -> str:
258
  input_filename = input_file.split("/")[-1]
259
  # copy the original file to the temporal folder to avoid common issues with tikal
 
271
  translated_paragraphs = []
272
  for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
273
  paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
274
+ translated_paragraphs.append(translator.translate(paragraph_text))
275
 
276
  # time to align the translation with the original
277
  print("Generating alignments...")