Spaces:
Sleeping
Sleeping
Fixed some formatting errors; line spacing is still not fixed
Browse files
main.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import os
|
|
|
2 |
|
3 |
from docx import Document
|
4 |
import nltk
|
@@ -223,6 +224,9 @@ def group_by_style(values, detokenizer):
|
|
223 |
x['bold'], x['italic'], x['underline'], x['font_name'], x['font_size'], x['font_color'])):
|
224 |
text = detokenizer.detokenize([item['text'] for item in group])
|
225 |
|
|
|
|
|
|
|
226 |
groups.append({"text": text,
|
227 |
"bold": key[0],
|
228 |
"italic": key[1],
|
@@ -287,7 +291,9 @@ if __name__ == "__main__":
|
|
287 |
|
288 |
out_doc = Document()
|
289 |
|
290 |
-
for original_runs_in_paragraph, translated_paragraph in zip(paragraphs_with_runs,
|
|
|
|
|
291 |
# sometimes we get empty paragraphs for some reason, I think it's just docx shenanigans
|
292 |
if not original_runs_in_paragraph:
|
293 |
continue
|
@@ -297,7 +303,7 @@ if __name__ == "__main__":
|
|
297 |
paragraph_with_style = generate_alignments(original_runs_in_paragraph, translated_paragraph, aligner,
|
298 |
temp_folder, detokenizer)
|
299 |
|
300 |
-
para = out_doc.add_paragraph()
|
301 |
|
302 |
# flatten the paragraph, we don't need it to split into sentences anymore
|
303 |
paragraph_with_style = [item for sublist in paragraph_with_style for item in sublist]
|
@@ -306,7 +312,7 @@ if __name__ == "__main__":
|
|
306 |
paragraph_with_runs = group_by_style(paragraph_with_style, detokenizer)
|
307 |
|
308 |
for item in paragraph_with_runs:
|
309 |
-
run = para.add_run(item["text"]
|
310 |
# Preserve original run formatting
|
311 |
run.bold = item['bold']
|
312 |
run.italic = item['italic']
|
|
|
1 |
import os
|
2 |
+
import string
|
3 |
|
4 |
from docx import Document
|
5 |
import nltk
|
|
|
224 |
x['bold'], x['italic'], x['underline'], x['font_name'], x['font_size'], x['font_color'])):
|
225 |
text = detokenizer.detokenize([item['text'] for item in group])
|
226 |
|
227 |
+
if groups and not text.startswith((",", ";", ":", ".", ")")):
|
228 |
+
text = " " + text
|
229 |
+
|
230 |
groups.append({"text": text,
|
231 |
"bold": key[0],
|
232 |
"italic": key[1],
|
|
|
291 |
|
292 |
out_doc = Document()
|
293 |
|
294 |
+
for original_runs_in_paragraph, translated_paragraph, original_paragraph in zip(paragraphs_with_runs,
|
295 |
+
translated_paragraphs,
|
296 |
+
doc.paragraphs):
|
297 |
# sometimes we get empty paragraphs for some reason, I think it's just docx shenanigans
|
298 |
if not original_runs_in_paragraph:
|
299 |
continue
|
|
|
303 |
paragraph_with_style = generate_alignments(original_runs_in_paragraph, translated_paragraph, aligner,
|
304 |
temp_folder, detokenizer)
|
305 |
|
306 |
+
para = out_doc.add_paragraph(style=original_paragraph.style)
|
307 |
|
308 |
# flatten the paragraph, we don't need it to split into sentences anymore
|
309 |
paragraph_with_style = [item for sublist in paragraph_with_style for item in sublist]
|
|
|
312 |
paragraph_with_runs = group_by_style(paragraph_with_style, detokenizer)
|
313 |
|
314 |
for item in paragraph_with_runs:
|
315 |
+
run = para.add_run(item["text"])
|
316 |
# Preserve original run formatting
|
317 |
run.bold = item['bold']
|
318 |
run.italic = item['italic']
|