integrated any-doc into the gradio app, separated the translation side to make it easier to implement other translation models
Files changed:
- gradio_app.py (+3, -5)
- src/mtuoc_aina_translator.py (+20, -0)
- src/translate_any_doc.py (+2, -21)
gradio_app.py
CHANGED

```diff
@@ -1,8 +1,6 @@
 import gradio as gr
-from
-import
-import json
-from translate_docx import translate_document, translate, Aligner
+from src.translate_any_doc import translate_document, translate
+from src.aligner import Aligner
 from nltk.tokenize.treebank import TreebankWordDetokenizer


@@ -16,7 +14,7 @@ detokenizer = TreebankWordDetokenizer()


 def upload_file(filepath):
-    translated_file_name = translate_document(filepath, aligner, detokenizer, ip)
+    translated_file_name = translate_document(filepath, source_lang, target_lang, aligner, detokenizer, ip)
     return [gr.UploadButton(visible=False), gr.DownloadButton(label=f"Download {translated_file_name}", value=translated_file_name, visible=True)]

 def download_file():
```
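Note that, as committed, `upload_file` still passes `aligner`, `detokenizer` and `ip` positionally, while the new `translate_document` in src/translate_any_doc.py expects a `translator` object in that position. A minimal wiring sketch (not the repository's code; `source_lang`, `target_lang`, `aligner` and `detokenizer` are assumed to be defined at module level in gradio_app.py, and the previously hard-coded IP and port are reused) could look like this:

```python
# Sketch only (not part of this commit): construct the translator once and pass
# it through to translate_document, matching the new signature. source_lang,
# target_lang, aligner and detokenizer are assumed to already exist in gradio_app.py.
import gradio as gr

from src.mtuoc_aina_translator import MTUOCAinaTranslator
from src.translate_any_doc import translate_document

translator = MTUOCAinaTranslator(ip="192.168.20.216", port="8000")

def upload_file(filepath):
    # the translator object replaces the old ip/port arguments
    translated_file_name = translate_document(
        filepath, source_lang, target_lang, translator, aligner, detokenizer
    )
    return [
        gr.UploadButton(visible=False),
        gr.DownloadButton(label=f"Download {translated_file_name}",
                          value=translated_file_name, visible=True),
    ]
```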
src/mtuoc_aina_translator.py
ADDED

```diff
@@ -0,0 +1,20 @@
+import requests
+import json
+from nltk.tokenize import sent_tokenize
+
+class MTUOCAinaTranslator:
+    def __init__(self, ip: str, port: str):
+        self.ip = ip
+        self.port = port
+
+    def translate(self, text):
+        stuff = sent_tokenize(text)
+
+        myobj = {
+            'id': '1',
+            'src': text,
+        }
+        url = 'http://' + self.ip + ':' + self.port + '/translate'
+        x = requests.post(url, json=myobj)
+        json_response = json.loads(x.text)
+        return json_response['tgt']
```
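Since the document pipeline only needs an object exposing a `translate(text) -> str` method, other translation backends can be added without touching the document-handling code. A hypothetical example (class name, endpoint and payload shape below are illustrative, not part of the repository):

```python
# Hypothetical alternative backend, not in the commit: any class with a
# translate(text) -> str method can be passed to translate_document.
import requests

class GenericRESTTranslator:
    def __init__(self, url: str):
        self.url = url

    def translate(self, text: str) -> str:
        # payload and response fields are assumptions; adapt to the actual service
        response = requests.post(self.url, json={"src": text})
        response.raise_for_status()
        return response.json()["tgt"]
```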
src/translate_any_doc.py
CHANGED

```diff
@@ -1,8 +1,5 @@
 import shutil
 import time
-import json
-
-import requests
 import os
 from itertools import groupby
 from subprocess import Popen, PIPE
@@ -18,21 +15,6 @@ import tqdm
 nltk.download('punkt')
 nltk.download('punkt_tab')

-ip = "192.168.20.216"
-port = "8000"
-
-
-def translate(text, ip, port):
-    myobj = {
-        'id': '1',
-        'src': text,
-    }
-    port = str(int(port))
-    url = 'http://' + ip + ':' + port + '/translate'
-    x = requests.post(url, json=myobj)
-    json_response = json.loads(x.text)
-    return json_response['tgt']
-

 def doc_to_plain_text(input_file: str, source_lang: str, target_lang: str, tikal_folder: str,
                       original_xliff_file_path: str) -> str:
@@ -268,11 +250,10 @@ def runs_to_plain_text(paragraphs_with_style: dict[str, list[dict[str, str, str]]


 def translate_document(input_file: str, source_lang: str, target_lang: str,
+                       translator,
                        aligner: Aligner,
                        detokenizer,
-                       ip: str = "192.168.20.216",
                        temp_folder: str = "tmp",
-                       port: str = "8000",
                        tikal_folder: str = "okapi-apps_gtk2-linux-x86_64_1.47.0") -> str:
     input_filename = input_file.split("/")[-1]
     # copy the original file to the temporal folder to avoid common issues with tikal
@@ -290,7 +271,7 @@ def translate_document(input_file: str, source_lang: str, target_lang: str,
     translated_paragraphs = []
     for paragraph in tqdm.tqdm(paragraphs_with_runs, desc="Translating paragraphs..."):
         paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
-        translated_paragraphs.append(translate(paragraph_text, ip, port))
+        translated_paragraphs.append(translator.translate(paragraph_text))

     # time to align the translation with the original
     print("Generating alignments...")
```
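Because the paragraph loop now only calls `translator.translate(paragraph_text)`, cross-cutting behaviour can be layered onto any backend without changing `translate_document` itself. For instance, a caching wrapper (illustrative only, not part of the commit) could avoid re-translating repeated paragraphs such as headers and footers:

```python
# Illustrative wrapper, not in the commit: reuse translations of identical
# paragraphs by caching around any object that implements translate(text).
class CachingTranslator:
    def __init__(self, translator):
        self.translator = translator
        self._cache = {}

    def translate(self, text: str) -> str:
        if text not in self._cache:
            self._cache[text] = self.translator.translate(text)
        return self._cache[text]

# usage sketch:
# translate_document(input_file, source_lang, target_lang,
#                    CachingTranslator(base_translator), aligner, detokenizer)
```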