Commit · 978cbf1
First commit. For now the translation has not been integrated, but reading a docx and writing its translation while keeping the formatting and style should work.
- main.py +318 -0
- readme.md +18 -0
- requirements.txt +2 -0
main.py
ADDED
@@ -0,0 +1,318 @@
import os

from docx import Document
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

from subprocess import Popen, PIPE

from itertools import groupby
import fileinput


# Class to align original and translated sentences
# based on https://github.com/mtuoc/MTUOC-server/blob/main/GetWordAlignments_fast_align.py
class Aligner():
    def __init__(self, config_folder, source_lang, target_lang, temp_folder):
        forward_params_path = os.path.join(config_folder, f"{source_lang}-{target_lang}.params")
        reverse_params_path = os.path.join(config_folder, f"{target_lang}-{source_lang}.params")

        fwd_T, fwd_m = self.__read_err(os.path.join(config_folder, f"{source_lang}-{target_lang}.err"))
        rev_T, rev_m = self.__read_err(os.path.join(config_folder, f"{target_lang}-{source_lang}.err"))

        self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
        self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")

        self.forward_command = lambda \
            x: f'./fast_align -i {x} -d -T {fwd_T} -m {fwd_m} -f {forward_params_path} > {self.forward_alignment_file_path}'
        self.reverse_command = lambda \
            x: f'./fast_align -i {x} -d -T {rev_T} -m {rev_m} -f {reverse_params_path} -r > {self.reverse_alignment_file_path}'

        self.symmetric_command = f'./atools -i {self.forward_alignment_file_path} -j {self.reverse_alignment_file_path} -c grow-diag-final-and'

    def __simplify_alignment_file(self, file):
        with fileinput.FileInput(file, inplace=True, backup='.bak') as f:
            for line in f:
                print(line.split('|||')[2].strip())

    def __read_err(self, err):
        (T, m) = ('', '')
        for line in open(err):
            # expected target length = source length * N
            if 'expected target length' in line:
                m = line.split()[-1]
            # final tension: N
            elif 'final tension' in line:
                T = line.split()[-1]
        return T, m

    def align(self, file):
        # generate forward alignment
        process = Popen(self.forward_command(file), shell=True)
        process.wait()
        # generate reverse alignment
        process = Popen(self.reverse_command(file), shell=True)
        process.wait()

        # for some reason the output file contains more information than needed, remove it
        self.__simplify_alignment_file(self.forward_alignment_file_path)
        self.__simplify_alignment_file(self.reverse_alignment_file_path)

        # generate symmetrical alignment
        process = Popen(self.symmetric_command, shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        process.wait()

        # get final alignments and format them
        alignments_str = process.communicate()[0].decode('utf-8')
        alignments = []
        for line in alignments_str.splitlines():
            alignments.append([(int(i), int(j)) for i, j in [pair.split("-") for pair in line.strip("\n").split(" ")]])

        return alignments


# Function to extract paragraphs with their runs
def extract_paragraphs_with_runs(doc):
    paragraphs_with_runs = []
    for para in doc.paragraphs:
        runs = []
        for run in para.runs:
            runs.append({
                'text': run.text,
                'bold': run.bold,
                'italic': run.italic,
                'underline': run.underline,
                'font_name': run.font.name,
                'font_size': run.font.size,
                'font_color': run.font.color.rgb
            })
        paragraphs_with_runs.append(runs)
    return paragraphs_with_runs

def tokenize_paragraph_with_runs2(runs_in_paragraph):
    text_paragraph = " ".join(run["text"] for run in runs_in_paragraph)
    sentences = sent_tokenize(text_paragraph)
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

    tokenized_sentences_with_style = []
    for tokenized_sentence in tokenized_sentences:
        tokenized_sentence_with_style = []
        token_idx = 0
        for run in runs_in_paragraph:
            text_in_run = run["text"].strip()

            if text_in_run == tokenized_sentence[token_idx]:
                new_run = run.copy()
                new_run["text"] = text_in_run
                tokenized_sentence_with_style.append(new_run)
                token_idx += 1
                if token_idx >= len(tokenized_sentence):
                    break
            elif len(text_in_run) > len(tokenized_sentence[token_idx]):
                if text_in_run.startswith(tokenized_sentence[token_idx]):
                    for token in word_tokenize(text_in_run):
                        if token == tokenized_sentence[token_idx]:
                            new_run = run.copy()
                            new_run["text"] = token
                            tokenized_sentence_with_style.append(new_run)
                            token_idx += 1
                else:
                    raise ValueError("could not align run text with the tokenized sentence")
        tokenized_sentences_with_style.append(tokenized_sentence_with_style)
    return tokenized_sentences_with_style

def tokenize_paragraph_with_runs(runs_in_paragraph, detokenizer):
    text_paragraph = detokenizer.detokenize([run["text"] for run in runs_in_paragraph])
    sentences = sent_tokenize(text_paragraph)
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

    tokens_with_style = []
    for run in runs_in_paragraph:
        tokens = word_tokenize(run["text"])
        for token in tokens:
            tokens_with_style.append(run.copy())
            tokens_with_style[-1]["text"] = token

    token_index = 0
    tokenized_sentences_with_style = []
    for sentence in tokenized_sentences:
        sentence_with_style = []
        for word in sentence:
            if word == tokens_with_style[token_index]["text"]:
                sentence_with_style.append(tokens_with_style[token_index])
                token_index += 1
            else:
                if word.startswith(tokens_with_style[token_index]["text"]):
                    # this token might be split into several runs
                    word_left = word

                    while word_left:
                        sentence_with_style.append(tokens_with_style[token_index])
                        word_left = word_left.removeprefix(tokens_with_style[token_index]["text"])
                        token_index += 1
                else:
                    raise ValueError("could not align run tokens with the tokenized sentence")
        tokenized_sentences_with_style.append(sentence_with_style)
    return tokenized_sentences_with_style

def generate_alignments(original_runs_in_paragraph, translated_paragraph, aligner, temp_folder, detokenizer):
    # clean temp folder
    for f in os.listdir(temp_folder):
        os.remove(os.path.join(temp_folder, f))

    temp_file_path = os.path.join(temp_folder, "tokenized_sentences.txt")

    # tokenize the original text by sentence and words while keeping the style
    original_tokenized_sentences_with_style = tokenize_paragraph_with_runs(original_runs_in_paragraph, detokenizer)
    # tokenize the translated text by sentence and word
    translated_tokenized_sentences = [word_tokenize(sentence) for sentence in sent_tokenize(translated_paragraph)]

    # write the file that fastalign will use
    with open(temp_file_path, "w") as out_file:
        for original, translated in zip(original_tokenized_sentences_with_style, translated_tokenized_sentences):
            out_file.write(f"{' '.join(item['text'] for item in original)} ||| {' '.join(translated)}\n")

    alignments = aligner.align(temp_file_path)

    # using the alignments generated by fastalign, we need to copy the style of the original token to the translated one
    translated_sentences_with_style = []
    for sentence_idx, sentence_alignments in enumerate(alignments):

        # reverse the order of the alignments and build a dict with it
        sentence_alignments = {target: source for source, target in sentence_alignments}

        translated_sentence_with_style = []
        for token_idx, translated_token in enumerate(translated_tokenized_sentences[sentence_idx]):
            # fastalign has found a token aligned with the translated one
            if token_idx in sentence_alignments.keys():
                # get the aligned token
                original_idx = sentence_alignments[token_idx]
                new_entry = original_tokenized_sentences_with_style[sentence_idx][original_idx].copy()
                new_entry["text"] = translated_token
                translated_sentence_with_style.append(new_entry)
            else:
                # WARNING this is a test
                # since fastalign doesn't know from which word to reference this token, copy the style of the
                # previous word (fall back to the first original token's style if this is the first word)
                if translated_sentence_with_style:
                    new_entry = translated_sentence_with_style[-1].copy()
                else:
                    new_entry = original_tokenized_sentences_with_style[sentence_idx][0].copy()
                new_entry["text"] = translated_token
                translated_sentence_with_style.append(new_entry)

        translated_sentences_with_style.append(translated_sentence_with_style)

    return translated_sentences_with_style

# TODO
def translate_paragraph(paragraph_text):
    translated_paragraph = ""
    return translated_paragraph

# group contiguous tokens that share the same style (bold, italic, underline, font name, size and color)
def group_by_style(values, detokenizer):
    groups = []
    for key, group in groupby(values, key=lambda x: (
            x['bold'], x['italic'], x['underline'], x['font_name'], x['font_size'], x['font_color'])):
        text = detokenizer.detokenize([item['text'] for item in group])

        groups.append({"text": text,
                       "bold": key[0],
                       "italic": key[1],
                       "underline": key[2],
                       "font_name": key[3],
                       "font_size": key[4],
                       "font_color": key[5]})
    return groups

def preprocess_runs(runs_in_paragraph):
    new_runs = []

    for run in runs_in_paragraph:
        if not new_runs:
            new_runs.append(run)
        else:
            # if the previous run has the same format as the current run, we merge the two runs together
            if (new_runs[-1]["bold"] == run["bold"] and new_runs[-1]["font_color"] == run["font_color"]
                    and new_runs[-1]["font_name"] == run["font_name"]
                    and new_runs[-1]["font_size"] == run["font_size"] and new_runs[-1]["italic"] == run["italic"]
                    and new_runs[-1]["underline"] == run["underline"]):
                new_runs[-1]["text"] += run["text"]
            else:
                new_runs.append(run)

        # we want to split runs that contain more than one sentence to avoid problems later when aligning styles
        sentences = sent_tokenize(new_runs[-1]["text"])
        if len(sentences) > 1:
            new_runs[-1]["text"] = sentences[0]
            for sentence in sentences[1:]:
                new_run = new_runs[-1].copy()
                new_run["text"] = sentence
                new_runs.append(new_run)

    return new_runs

if __name__ == "__main__":
    input_file = 'data/test2.docx'
    output_file = 'data/translated_output.docx'
    source_lang = 'ca'
    target_lang = 'en'
    config_folder = "fast_align_config"
    temp_folder = "tmp"

    aligner = Aligner(config_folder, source_lang, target_lang, temp_folder)

    os.makedirs(temp_folder, exist_ok=True)

    # load original file, extract the paragraphs with their runs (which include style and formatting)
    doc = Document(input_file)
    paragraphs_with_runs = extract_paragraphs_with_runs(doc)

    detokenizer = TreebankWordDetokenizer()

    # translate each paragraph
    translated_paragraphs = []
    for paragraph in paragraphs_with_runs:
        paragraph_text = detokenizer.detokenize([run["text"] for run in paragraph])
        translated_paragraphs.append(translate_paragraph(paragraph_text))

    out_doc = Document()

    for original_runs_in_paragraph, translated_paragraph in zip(paragraphs_with_runs, translated_paragraphs):
        # sometimes we get empty paragraphs for some reason, I think it's just docx shenanigans
        if not original_runs_in_paragraph:
            continue

        original_runs_in_paragraph = preprocess_runs(original_runs_in_paragraph)

        paragraph_with_style = generate_alignments(original_runs_in_paragraph, translated_paragraph, aligner,
                                                   temp_folder, detokenizer)

        para = out_doc.add_paragraph()

        # flatten the paragraph, we don't need it split into sentences anymore
        paragraph_with_style = [item for sublist in paragraph_with_style for item in sublist]

        # merge tokens into runs and detokenize
        paragraph_with_runs = group_by_style(paragraph_with_style, detokenizer)

        for item in paragraph_with_runs:
            run = para.add_run(item["text"] + " ")
            # Preserve original run formatting
            run.bold = item['bold']
            run.italic = item['italic']
            run.underline = item['underline']
            run.font.name = item['font_name']
            run.font.size = item['font_size']
            run.font.color.rgb = item['font_color']

    out_doc.save(output_file)
readme.md
ADDED
@@ -0,0 +1,18 @@
# document_translator

Project to translate files (for now .docx) using BSC's models while keeping the formatting and style of the original file.

## Requirements

### python 3.12

### fast_align

Clone https://github.com/clab/fast_align, run the compilation commands indicated in that project's readme, and place the resulting fast_align and atools binaries (.exe if using Windows) in this project's root.

### fast_align fine-tuning files

I took the 4 files (ca-en.params, ca-en.err, en-ca.params and en-ca.err) from https://huggingface.co/projecte-aina/aina-translator-ca-en/tree/main. Maybe we could automate the download of these files (see the sketch below). For now, place these files in config_folder (defined in main.py).
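A minimal sketch of how that download could be automated with the huggingface_hub library (an assumption: it is not listed in requirements.txt; the repo id, file names and fast_align_config folder come from this readme and main.py):

```python
# Sketch only: requires `pip install huggingface_hub`, which is not in requirements.txt.
from huggingface_hub import hf_hub_download

repo_id = "projecte-aina/aina-translator-ca-en"
for filename in ["ca-en.params", "ca-en.err", "en-ca.params", "en-ca.err"]:
    # download each fine-tuning file into the folder main.py uses as config_folder
    hf_hub_download(repo_id=repo_id, filename=filename, local_dir="fast_align_config")
```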

### python requirements

pip install -r requirements.txt
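main.py also relies on NLTK's punkt tokenizer models; it calls nltk.download at import time, but the data can be fetched ahead of time if preferred (same calls as in main.py):

```python
# Optional: pre-fetch the NLTK tokenizer data that main.py downloads on first run.
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')
```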
requirements.txt
ADDED
@@ -0,0 +1,2 @@
nltk~=3.9.1
python-docx~=1.1.2