mjuvilla committed on
Commit
b568903
·
1 Parent(s): 978cbf1

fixed some formatting errors, still haven't fixed line spacing

Browse files
Files changed (1) hide show
  1. main.py +9 -3
main.py CHANGED
@@ -1,4 +1,5 @@
1
  import os
 
2
 
3
  from docx import Document
4
  import nltk
@@ -223,6 +224,9 @@ def group_by_style(values, detokenizer):
223
  x['bold'], x['italic'], x['underline'], x['font_name'], x['font_size'], x['font_color'])):
224
  text = detokenizer.detokenize([item['text'] for item in group])
225
 
 
 
 
226
  groups.append({"text": text,
227
  "bold": key[0],
228
  "italic": key[1],
@@ -287,7 +291,9 @@ if __name__ == "__main__":
287
 
288
  out_doc = Document()
289
 
290
- for original_runs_in_paragraph, translated_paragraph in zip(paragraphs_with_runs, translated_paragraphs):
 
 
291
  # sometimes we get empty paragraphs for some reason, I think it's just docx shenanigans
292
  if not original_runs_in_paragraph:
293
  continue
@@ -297,7 +303,7 @@ if __name__ == "__main__":
297
  paragraph_with_style = generate_alignments(original_runs_in_paragraph, translated_paragraph, aligner,
298
  temp_folder, detokenizer)
299
 
300
- para = out_doc.add_paragraph()
301
 
302
  # flatten the paragraph, we don't need it to split into sentences anymore
303
  paragraph_with_style = [item for sublist in paragraph_with_style for item in sublist]
@@ -306,7 +312,7 @@ if __name__ == "__main__":
306
  paragraph_with_runs = group_by_style(paragraph_with_style, detokenizer)
307
 
308
  for item in paragraph_with_runs:
309
- run = para.add_run(item["text"] + " ")
310
  # Preserve original run formatting
311
  run.bold = item['bold']
312
  run.italic = item['italic']
 
1
  import os
2
+ import string
3
 
4
  from docx import Document
5
  import nltk
 
224
  x['bold'], x['italic'], x['underline'], x['font_name'], x['font_size'], x['font_color'])):
225
  text = detokenizer.detokenize([item['text'] for item in group])
226
 
227
+ if groups and not text.startswith((",", ";", ":", ".", ")")):
228
+ text = " " + text
229
+
230
  groups.append({"text": text,
231
  "bold": key[0],
232
  "italic": key[1],
 
291
 
292
  out_doc = Document()
293
 
294
+ for original_runs_in_paragraph, translated_paragraph, original_paragraph in zip(paragraphs_with_runs,
295
+ translated_paragraphs,
296
+ doc.paragraphs):
297
  # sometimes we get empty paragraphs for some reason, I think it's just docx shenanigans
298
  if not original_runs_in_paragraph:
299
  continue
 
303
  paragraph_with_style = generate_alignments(original_runs_in_paragraph, translated_paragraph, aligner,
304
  temp_folder, detokenizer)
305
 
306
+ para = out_doc.add_paragraph(style=original_paragraph.style)
307
 
308
  # flatten the paragraph, we don't need it to split into sentences anymore
309
  paragraph_with_style = [item for sublist in paragraph_with_style for item in sublist]
 
312
  paragraph_with_runs = group_by_style(paragraph_with_style, detokenizer)
313
 
314
  for item in paragraph_with_runs:
315
+ run = para.add_run(item["text"])
316
  # Preserve original run formatting
317
  run.bold = item['bold']
318
  run.italic = item['italic']