|
import threading
|
|
from docx import Document
|
|
from docx.shared import Pt
|
|
from docx.shared import Inches
|
|
from docx.oxml.ns import qn
|
|
from . import to_translate
|
|
from . import common
|
|
import os
|
|
import sys
|
|
import time
|
|
import datetime
|
|
import zipfile
|
|
import xml.etree.ElementTree as ET
|
|
from . import rediscon
|
|
|
|
def start(trans):
    """Translate the .docx file described by *trans* and save the result.

    The document is opened, its translatable items are collected into one
    shared list, each item is handed to ``to_translate.get`` on a worker
    thread, and the translations are written back according to
    ``trans['type']``. The result is saved to ``trans['target_file']`` and the
    saved archive is then patched for comments, tracked insertions and
    content controls.

    Args:
        trans: Job mapping. Keys read here: ``threads``, ``file_path``,
            ``id``, ``type``, ``lang``, ``target_file`` and ``run_complete``.

    Returns:
        ``False`` when the document cannot be opened or the shared abort
        event gets set; ``True`` once the translated file has been written.
    """
    # A missing, empty, zero or negative setting falls back to 10 workers.
    # Zero must be rejected too: with a limit of 0 the dispatch loop below
    # could never start a thread and would never terminate.
    threads = trans['threads']
    if threads is None or threads == "" or int(threads) <= 0:
        max_threads = 10
    else:
        max_threads = int(threads)

    start_time = datetime.datetime.now()

    try:
        document = Document(trans['file_path'])
    except Exception:
        to_translate.error(trans['id'], "无法访问该文档")
        return False

    texts = []
    trans_type = trans['type']
    target_lang = trans['lang']

    # "*_inherit" modes collect text run by run, "*_new" modes paragraph by
    # paragraph. An unrecognised mode collects nothing from the body.
    run_level_modes = (
        "trans_text_only_inherit",
        "trans_text_both_inherit",
        "trans_all_only_inherit",
        "trans_all_both_inherit",
    )
    paragraph_level_modes = (
        "trans_text_only_new",
        "trans_text_both_new",
        "trans_all_only_new",
        "trans_all_both_new",
    )
    if trans_type in run_level_modes:
        read_rune_text(document, texts)
    elif trans_type in paragraph_level_modes:
        read_paragraph_text(document, texts)

    # Comments, tracked insertions and content controls are read straight
    # from the archive and queued after the body items.
    read_comments_from_docx(trans['file_path'], texts)
    read_insstd_from_docx(trans['file_path'], texts)

    max_run = min(max_threads, len(texts))
    event = threading.Event()
    before_active_count = threading.active_count()
    run_index = 0
    while run_index < len(texts):
        if event.is_set():
            # A worker signalled failure; stop dispatching.
            return False
        if threading.active_count() < max_run + before_active_count:
            thread = threading.Thread(
                target=to_translate.get,
                args=(trans, event, texts, run_index),
            )
            thread.start()
            print(f"开始执行线程{run_index}")
            run_index += 1
        else:
            # All slots are busy: yield instead of spinning at full CPU.
            time.sleep(0.01)

    # Wait until every item is flagged complete, or a worker aborts.
    while True:
        if event.is_set():
            return False
        if all(text['complete'] for text in texts):
            break
        time.sleep(1)

    # The writers receive the counter by value, so anything they add to it is
    # lost. Total it here, before the writers pop the body items off the list.
    text_count = sum(
        item.get('count', 0) for item in texts if item.get('type') == "text"
    )

    # "trans_text_*" modes also strip images (onlyText=True).
    only_text = trans_type in (
        "trans_text_only_inherit",
        "trans_text_only_new",
        "trans_text_both_new",
        "trans_text_both_inherit",
    )
    if trans_type in ("trans_text_only_inherit", "trans_all_only_inherit"):
        write_only_new(document, texts, 0, only_text)
    elif trans_type in ("trans_text_only_new", "trans_all_only_new"):
        write_paragraph_text(document, texts, 0, only_text)
    elif trans_type in ("trans_text_both_new", "trans_all_both_new"):
        write_both_new(document, texts, 0, only_text)
    elif trans_type in ("trans_text_both_inherit", "trans_all_both_inherit"):
        write_rune_both(document, texts, 0, only_text, target_lang)

    docx_path = trans['target_file']
    document.save(docx_path)

    # These two edit the saved archive directly using the remaining items.
    modify_comment_in_docx(docx_path, texts)
    modify_inssdt_in_docx(docx_path, texts)

    end_time = datetime.datetime.now()
    spend_time = common.display_spend(start_time, end_time)
    if trans['run_complete']:
        to_translate.complete(trans, text_count, spend_time)
    return True
|
|
|
|
|
|
def read_paragraph_text(document, texts):
    """Queue the text of every body paragraph, then of every table cell.

    Args:
        document: Object exposing ``paragraphs`` and ``tables``.
        texts: List that receives the queued items (mutated in place).
    """
    for para in document.paragraphs:
        append_text(para.text, texts)

    # Tables come after the body paragraphs, row by row and cell by cell.
    cells = (
        cell
        for table in document.tables
        for row in table.rows
        for cell in row.cells
    )
    for cell in cells:
        read_cell_text(cell, texts)
|
|
|
|
def write_paragraph_text(document, texts, text_count, onlyText):
    """Replace paragraph text with the queued translations.

    Each paragraph is handed to ``replace_paragraph_text`` in replace mode
    (``appendTo=False``). Table cells are processed by calling this function
    again on the cell, so whatever ``paragraphs``/``tables`` a cell exposes is
    handled the same way.

    Args:
        document: Object exposing ``paragraphs`` and ``tables``.
        texts: Queue of translated items, consumed from the front.
        text_count: Counter value forwarded to the paragraph writer.
        onlyText: Forwarded flag; when true, images are cleared.
    """
    for para in document.paragraphs:
        replace_paragraph_text(para, texts, text_count, onlyText, False)

    cells = (
        cell
        for table in document.tables
        for row in table.rows
        for cell in row.cells
    )
    for cell in cells:
        write_paragraph_text(cell, texts, text_count, onlyText)
|
|
|
|
def write_both_new(document, texts, text_count, onlyText):
    """Append the queued translations after the original paragraph text.

    Each paragraph is handed to ``replace_paragraph_text`` in append mode
    (``appendTo=True``). Table cells are processed by calling this function
    again on the cell.

    Args:
        document: Object exposing ``paragraphs`` and ``tables``.
        texts: Queue of translated items, consumed from the front.
        text_count: Counter value forwarded to the paragraph writer.
        onlyText: Forwarded flag; when true, images are cleared.
    """
    for para in document.paragraphs:
        replace_paragraph_text(para, texts, text_count, onlyText, True)

    cells = (
        cell
        for table in document.tables
        for row in table.rows
        for cell in row.cells
    )
    for cell in cells:
        write_both_new(cell, texts, text_count, onlyText)
|
|
|
|
def read_cell_text(cell, texts):
    """Queue the text of each paragraph in *cell* via ``append_text``."""
    for para in cell.paragraphs:
        append_text(para.text, texts)
|
|
|
|
def write_cell_text(cell, texts):
    """Write queued translations into the paragraphs of a table cell.

    For every paragraph whose text passes ``check_text`` (and while items
    remain), the next item is popped from the front of *texts*. Its text goes
    into the paragraph's first run and all other runs are cleared.

    Args:
        cell: Object exposing ``paragraphs``.
        texts: Queue of translated items, consumed from the front.
    """
    for para in cell.paragraphs:
        if not check_text(para.text) or len(texts) == 0:
            continue
        translated = texts.pop(0)
        runs = para.runs
        if not runs:
            # The item is still consumed even when there is no run to hold it.
            continue
        runs[0].text = translated.get('text', "")
        for leftover in runs[1:]:
            leftover.clear()
|
|
|
|
def read_rune_text(document, texts):
    """Queue translatable text run by run for the body, per paragraph for tables.

    For each body paragraph the direct runs are queued first, followed by the
    runs of each of its hyperlinks. Table cells are read one paragraph at a
    time through ``read_cell_text``.

    Args:
        document: Object exposing ``paragraphs`` and ``tables``.
        texts: List that receives the queued items (mutated in place).
    """
    for paragraph in document.paragraphs:
        read_run(paragraph.runs, texts)
        for hyperlink in paragraph.hyperlinks:
            read_run(hyperlink.runs, texts)

    for table in document.tables:
        for row in table.rows:
            for cell in row.cells:
                read_cell_text(cell, texts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def write_only_new(document, texts, text_count, onlyText):
    """Overwrite run text in place with the queued translations.

    Body paragraphs are written run by run (direct runs first, then the runs
    of each hyperlink). Table cells are written through ``write_cell_text``.

    Args:
        document: Object exposing ``paragraphs`` and ``tables``.
        texts: Queue of translated items, consumed from the front.
        text_count: Starting counter. It is only updated locally; nothing is
            returned, so the caller does not see the new value.
        onlyText: When true, image runs of each body paragraph are cleared.
    """
    for para in document.paragraphs:
        text_count += write_run(para.runs, texts)

        links = para.hyperlinks
        if len(links) > 0:
            for link in links:
                text_count += write_run(link.runs, texts)

        if onlyText:
            clear_image(para)

    cells = (
        cell
        for table in document.tables
        for row in table.rows
        for cell in row.cells
    )
    for cell in cells:
        write_cell_text(cell, texts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def write_rune_both(document, texts, text_count, onlyText, target_lang):
    """Append translations after the original text, keeping run formatting.

    Body paragraphs get a line break after their last run, then one new run
    per translated source run (``add_paragraph_run``). The same is done for
    the runs of each hyperlink. Table cell paragraphs are handled with
    ``replace_paragraph_text`` in append mode.

    Args:
        document: Object exposing ``paragraphs`` and ``tables``.
        texts: Queue of translated items, consumed from the front.
        text_count: Counter value forwarded to the helpers.
        onlyText: When true, image runs are cleared.
        target_lang: Target language name, forwarded for font selection.
    """
    for paragraph in document.paragraphs:
        runs = paragraph.runs
        if len(runs) > 0:
            runs[-1].add_break()
            add_paragraph_run(paragraph, runs, texts, text_count, target_lang)

        for hyperlink in paragraph.hyperlinks:
            link_runs = hyperlink.runs
            # A hyperlink can have no runs; indexing [-1] would raise then.
            if len(link_runs) > 0:
                link_runs[-1].add_break()
            add_paragraph_run(paragraph, link_runs, texts, text_count, target_lang)

        if onlyText:
            clear_image(paragraph)

    for table in document.tables:
        for row in table.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    # The reader (read_cell_text) queues exactly one item per
                    # cell paragraph. Hyperlinks inside the paragraph must not
                    # pop items of their own, or every later translation
                    # lands on the wrong paragraph. replace_paragraph_text
                    # already falls back to the last hyperlink run when the
                    # paragraph has no direct runs.
                    replace_paragraph_text(paragraph, texts, text_count, onlyText, True)
|
|
|
|
def read_run(runs, texts):
    """Queue the text of each run in *runs* via ``append_text``."""
    # Iterating an empty sequence does nothing, so no length guard is needed.
    for run in runs:
        append_text(run.text, texts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def append_text(text, texts):
    """Queue *text* as a pending ``"text"`` item when ``check_text`` accepts it."""
    if not check_text(text):
        return
    entry = {"text": text, "type": "text", "complete": False}
    texts.append(entry)
|
|
|
|
def append_comment(text, comment_id, texts):
    """Queue *text* as a pending ``"comment"`` item tagged with *comment_id*.

    Nothing is queued when ``check_text`` rejects the text.
    """
    if not check_text(text):
        return
    entry = {
        "text": text,
        "type": "comment",
        "comment_id": comment_id,
        "complete": False,
    }
    texts.append(entry)
|
|
|
|
def check_text(text):
    """Return True when *text* is worth translating.

    ``None`` and empty strings are rejected outright; anything else is
    accepted unless ``common.is_all_punc`` reports it as punctuation only.
    """
    if text is None or len(text) == 0:
        return False
    return not common.is_all_punc(text)
|
|
|
|
def write_run(runs, texts):
    """Overwrite run text with queued translations and return the count total.

    For each run whose text passes ``check_text`` (while items remain), the
    next item is popped from the front of *texts* and its text replaces the
    run's text.

    Args:
        runs: Sequence of run objects with a writable ``text`` attribute.
        texts: Queue of translated items, consumed from the front.

    Returns:
        Sum of the ``count`` values of the items consumed (0 if none).
    """
    consumed = 0
    if len(runs) == 0:
        return consumed
    for run in runs:
        if not check_text(run.text) or len(texts) == 0:
            continue
        translated = texts.pop(0)
        consumed += translated.get('count', 0)
        run.text = translated.get('text', "")
    return consumed
|
|
|
|
|
|
def read_cell(cell, texts):
    """Queue the whole text of *cell* as a single item via ``append_text``."""
    cell_text = cell.text
    append_text(cell_text, texts)
|
|
|
|
|
|
def write_cell(cell, texts):
    """Replace the whole text of *cell* with the next queued translation.

    Args:
        cell: Object with a readable and writable ``text`` attribute.
        texts: Queue of translated items, consumed from the front.

    Returns:
        The ``count`` of the consumed item, or 0 when nothing was written.
    """
    if not check_text(cell.text) or len(texts) == 0:
        return 0
    translated = texts.pop(0)
    written = 0
    written += translated.get('count', 0)
    cell.text = translated.get('text', "")
    return written
|
|
|
|
def add_paragraph_run(paragraph, runs, texts, text_count, target_lang):
    """Append one translated run to *paragraph* for each translatable source run.

    Args:
        paragraph: Paragraph that receives the new runs.
        runs: Source runs whose text and formatting are mirrored.
        texts: Queue of translated items, consumed from the front.
        text_count: Starting counter. It is only updated locally; nothing is
            returned, so the caller does not see the new value.
        target_lang: Target language name, forwarded to ``set_run_style``.
    """
    for source_run in runs:
        if not check_text(source_run.text) or len(texts) == 0:
            continue
        translated = texts.pop(0)
        text_count += translated.get('count', 0)
        appended = paragraph.add_run(translated.get('text', ""), source_run.style)
        set_run_style(appended, source_run, target_lang)
    set_paragraph_linespace(paragraph)
|
|
|
|
def set_run_style(new_run, copy_run, target_lang):
    """Copy character formatting from *copy_run* to *new_run* and set the font.

    Italic, strike, bold, size, colour, underline and style are copied. The
    font name, including the east-Asian font slot, is then set to '微软雅黑'
    when *target_lang* is '中文' or '日语', and to 'Times New Roman' otherwise.
    """
    for attr in ("italic", "strike", "bold", "size"):
        setattr(new_run.font, attr, getattr(copy_run.font, attr))
    new_run.font.color.rgb = copy_run.font.color.rgb
    new_run.underline = copy_run.underline
    new_run.style = copy_run.style

    wants_cjk_font = target_lang == '中文' or target_lang == '日语'
    font_name = '微软雅黑' if wants_cjk_font else 'Times New Roman'
    new_run.font.name = font_name
    # font.name is assigned first, then the east-Asian slot on the same rFonts.
    fonts_element = new_run._element.rPr.rFonts
    fonts_element.set(qn('w:eastAsia'), font_name)
|
|
|
|
def set_paragraph_linespace(paragraph):
    """Re-assign the paragraph's own spacing settings to itself.

    The four spacing values are read first, then each one that is set is
    written back in a fixed order. Objects without a ``paragraph_format``
    attribute are ignored.
    """
    if not hasattr(paragraph, "paragraph_format"):
        return

    names = ("space_before", "space_after", "line_spacing", "line_spacing_rule")
    # Snapshot every value before writing any of them back.
    snapshot = [(name, getattr(paragraph.paragraph_format, name)) for name in names]
    for name, value in snapshot:
        if value != None:
            setattr(paragraph.paragraph_format, name, value)
|
|
|
|
def check_image(run):
    """Return True when the run's XML contains a ``w:drawing`` descendant."""
    drawing = run.element.find('.//w:drawing', namespaces=run.element.nsmap)
    return drawing is not None
|
|
|
|
|
|
def clear_image(paragraph):
    """Clear every run of *paragraph* that ``check_image`` flags as an image."""
    image_runs = (run for run in paragraph.runs if check_image(run))
    for run in image_runs:
        run.clear()
|
|
|
|
def replace_paragraph_text(paragraph, texts, text_count, onlyText, appendTo):
    """Write the next queued translation into *paragraph*.

    Nothing is consumed unless the paragraph text passes ``check_text`` and
    items remain in *texts*.

    * Append mode (``appendTo`` true): a line break and the translation are
      added to the last direct run, or, when there are no direct runs, to the
      last run of the last hyperlink.
    * Replace mode: the first non-image run receives the translation and the
      remaining non-image runs are cleared. Direct runs are used when present,
      otherwise the runs of all hyperlinks.

    Args:
        paragraph: Paragraph-like object (``text``, ``runs``; ``hyperlinks`` is
            only consulted when there are no direct runs).
        texts: Queue of translated items, consumed from the front.
        text_count: Starting counter. It is only updated locally; nothing is
            returned, so the caller does not see the new value.
        onlyText: When true, image runs are cleared afterwards.
        appendTo: Selects append mode over replace mode.
    """
    if check_text(paragraph.text) and len(texts) > 0:
        item = texts.pop(0)
        translated = item.get('text', "")

        if appendTo:
            tail = None
            if len(paragraph.runs) > 0:
                tail = paragraph.runs[-1]
            elif len(paragraph.hyperlinks) > 0:
                tail = paragraph.hyperlinks[-1].runs[-1]
            if tail is not None:
                tail.add_break()
                tail.add_text(translated)
        else:
            if len(paragraph.runs) > 0:
                candidates = paragraph.runs
            elif len(paragraph.hyperlinks) > 0:
                candidates = [run for link in paragraph.hyperlinks for run in link.runs]
            else:
                candidates = []

            placed = False
            for run in candidates:
                if check_image(run):
                    continue
                if placed:
                    run.clear()
                else:
                    run.text = translated
                    placed = True

        text_count += item.get('count', 0)
        set_paragraph_linespace(paragraph)

    # NOTE(review): image stripping is applied whether or not text was
    # replaced — confirm this matches the intended nesting.
    if onlyText:
        clear_image(paragraph)
|
|
|
|
def read_comments_from_docx(docx_path, texts):
    """Queue the text of every comment stored in ``word/comments.xml``.

    The text of a comment is the concatenation of all ``w:t`` elements found
    under its paragraphs and runs. Each non-empty comment is queued through
    ``append_comment`` together with its ``w:id``.

    Args:
        docx_path: Path of the .docx archive to read.
        texts: List that receives the queued items (mutated in place).
    """
    w_uri = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    namespace = {'ns0': w_uri}

    with zipfile.ZipFile(docx_path, 'r') as docx:
        if 'word/comments.xml' not in docx.namelist():
            return
        with docx.open('word/comments.xml') as comments_file:
            root = ET.parse(comments_file).getroot()

    for comment in root.findall('ns0:comment', namespace):
        comment_id = comment.get('{%s}id' % w_uri)
        # An empty <w:t/> has text None, which str.join would reject.
        text = ''.join(
            t.text or ""
            for p in comment.findall('.//ns0:p', namespace)
            for r in p.findall('.//ns0:r', namespace)
            for t in r.findall('.//ns0:t', namespace)
        )
        # Empty text would be rejected by the queueing helper anyway.
        if text:
            append_comment(text, comment_id, texts)
|
|
|
|
def modify_comment_in_docx(docx_path, texts):
    """Write translated comment text back into ``word/comments.xml``.

    The archive is copied entry by entry into a temporary file next to
    *docx_path*, which then replaces the original. For each comment whose
    current text passes ``check_text`` and whose ``w:id`` matches an item
    carrying a ``comment_id``, the translation is written to the comment's
    first ``w:t`` and the remaining ``w:t`` elements are emptied. The reader
    joins all of them into one string, so copying the translation into each
    one would repeat it.

    Args:
        docx_path: Path of the .docx archive to update in place.
        texts: Items as produced by the readers; only those with a
            ``comment_id`` key are used. If several share an id, the last wins.
    """
    w_uri = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    namespace = {'ns0': w_uri}
    translations = {
        entry['comment_id']: entry.get('text', "")
        for entry in texts
        if 'comment_id' in entry
    }

    temp_docx_path = os.path.join(
        os.path.dirname(docx_path), 'temp_' + os.path.basename(docx_path)
    )
    try:
        with zipfile.ZipFile(docx_path, 'r') as docx, \
                zipfile.ZipFile(temp_docx_path, 'w') as new_docx:
            for item in docx.infolist():
                data = docx.read(item)
                if item.filename == 'word/comments.xml':
                    root = ET.fromstring(data)
                    changed = False
                    for comment in root.findall('ns0:comment', namespace):
                        # An empty <w:t/> has text None; treat it as "".
                        text = ''.join(
                            t.text or ""
                            for p in comment.findall('.//ns0:p', namespace)
                            for r in p.findall('.//ns0:r', namespace)
                            for t in r.findall('.//ns0:t', namespace)
                        )
                        if not check_text(text):
                            continue
                        comment_id = comment.get('{%s}id' % w_uri)
                        if comment_id not in translations:
                            continue
                        t_nodes = comment.findall('.//ns0:t', namespace)
                        for position, t_node in enumerate(t_nodes):
                            t_node.text = translations[comment_id] if position == 0 else ""
                            changed = True
                    if changed:
                        # Only re-serialize when something was edited, so an
                        # untouched part keeps its original bytes.
                        # NOTE(review): ElementTree does not keep the original
                        # namespace prefixes when serializing.
                        data = ET.tostring(root, encoding='utf-8', xml_declaration=True)
                # Keep each entry's original compression method.
                new_docx.writestr(item.filename, data, compress_type=item.compress_type)
        os.replace(temp_docx_path, docx_path)
    finally:
        # After a successful replace the temp file is gone; this only cleans
        # up when an error interrupted the rewrite.
        if os.path.exists(temp_docx_path):
            os.remove(temp_docx_path)
|
|
|
|
|
|
def append_ins(text, ins_id, texts):
    """Queue *text* as a pending ``"ins"`` item tagged with *ins_id*.

    Nothing is queued when ``check_text`` rejects the text.
    """
    if not check_text(text):
        return
    entry = {
        "text": text,
        "type": "ins",
        "ins_id": ins_id,
        "complete": False,
    }
    texts.append(entry)
|
|
|
|
|
|
def read_insstd_from_docx(docx_path, texts):
    """Queue text from tracked insertions and content controls.

    ``word/document.xml`` is parsed directly. For every ``w:body``:

    1. each ``w:t`` inside ``w:p/w:ins/w:r`` is queued with ``append_ins``,
       tagged with the insertion's ``w:id``;
    2. for each paragraph inside ``w:sdt/w:sdtContent``, each ``w:t`` of its
       direct runs and then of its ``w:ins`` runs is queued with
       ``append_sdt``, tagged with the paragraph's ``w14:paraId``.

    Args:
        docx_path: Path of the .docx archive to read.
        texts: List that receives the queued items (mutated in place).
    """
    w = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    w14 = '{http://schemas.microsoft.com/office/word/2010/wordml}'

    def run_texts(parent):
        """Yield the text of each w:t under the direct w:r children of *parent*."""
        for run in parent.findall(w + 'r'):
            for t_node in run.findall(w + 't'):
                yield t_node.text

    with zipfile.ZipFile(docx_path, 'r') as archive:
        if 'word/document.xml' not in archive.namelist():
            return
        with archive.open('word/document.xml') as stream:
            root = ET.parse(stream).getroot()
            for body in root.findall(w + 'body'):
                # Tracked insertions in body-level paragraphs come first.
                for para in body.findall(w + 'p'):
                    for ins in para.findall(w + 'ins'):
                        ins_id = ins.get(w + 'id')
                        for value in run_texts(ins):
                            append_ins(value, ins_id, texts)

                # Then the paragraphs wrapped in content controls.
                for sdt in body.findall(w + 'sdt'):
                    for content in sdt.findall(w + 'sdtContent'):
                        for para in content.findall(w + 'p'):
                            sdt_id = para.get(w14 + 'paraId')
                            for value in run_texts(para):
                                append_sdt(value, sdt_id, texts)
                            for ins in para.findall(w + 'ins'):
                                for value in run_texts(ins):
                                    append_sdt(value, sdt_id, texts)
|
|
|
|
|
|
|
|
def append_sdt(text, sdt_id, texts):
    """Queue *text* as a pending ``"sdt"`` item tagged with *sdt_id*.

    Nothing is queued when ``check_text`` rejects the text.
    """
    if not check_text(text):
        return
    entry = {
        "text": text,
        "type": "sdt",
        "sdt_id": sdt_id,
        "complete": False,
    }
    texts.append(entry)
|
|
|
|
|
|
|
|
def modify_inssdt_in_docx(docx_path, texts):
    """Write translations for tracked insertions and content controls.

    The archive is copied entry by entry into a temporary file next to
    *docx_path*, which then replaces the original. Inside
    ``word/document.xml`` the same elements the reader visited are walked
    again: ``w:t`` nodes in content-control paragraphs (direct runs, then
    ``w:ins`` runs) and ``w:t`` nodes in ``w:ins`` of body-level paragraphs.

    The reader queues one item per ``w:t``, several of which can share an id.
    Translations are therefore handed out per id in queue order, so each
    ``w:t`` receives its own translation rather than the last one queued for
    that id.

    Args:
        docx_path: Path of the .docx archive to update in place.
        texts: Items as produced by the readers; those with an ``ins_id`` or
            ``sdt_id`` key are used.
    """
    from collections import deque

    w = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    w14 = '{http://schemas.microsoft.com/office/word/2010/wordml}'

    # Per-id FIFO queues, filled in the order the items were read.
    ins_queues = {}
    sdt_queues = {}
    for entry in texts:
        if 'ins_id' in entry:
            ins_queues.setdefault(entry['ins_id'], deque()).append(entry.get('text', ""))
        if 'sdt_id' in entry:
            sdt_queues.setdefault(entry['sdt_id'], deque()).append(entry.get('text', ""))

    def apply_next(t_node, queue):
        """Put the next queued translation into *t_node*; report if it changed."""
        if not check_text(t_node.text) or not queue:
            return False
        t_node.text = queue.popleft()
        return True

    def translate_runs(parent, queue):
        """Translate each w:t under the direct w:r children of *parent*."""
        touched = False
        for run in parent.findall(w + 'r'):
            for t_node in run.findall(w + 't'):
                if apply_next(t_node, queue):
                    touched = True
        return touched

    temp_docx_path = os.path.join(
        os.path.dirname(docx_path), 'temp_std_' + os.path.basename(docx_path)
    )
    try:
        with zipfile.ZipFile(docx_path, 'r') as docx, \
                zipfile.ZipFile(temp_docx_path, 'w') as new_docx:
            for item in docx.infolist():
                data = docx.read(item)
                if item.filename == 'word/document.xml':
                    root = ET.fromstring(data)
                    changed = False
                    for body in root.findall(w + 'body'):
                        for sdt in body.findall(w + 'sdt'):
                            for content in sdt.findall(w + 'sdtContent'):
                                for para in content.findall(w + 'p'):
                                    queue = sdt_queues.get(para.get(w14 + 'paraId'))
                                    if translate_runs(para, queue):
                                        changed = True
                                    for ins in para.findall(w + 'ins'):
                                        if translate_runs(ins, queue):
                                            changed = True
                        for para in body.findall(w + 'p'):
                            for ins in para.findall(w + 'ins'):
                                queue = ins_queues.get(ins.get(w + 'id'))
                                if translate_runs(ins, queue):
                                    changed = True
                    if changed:
                        # Only re-serialize when something was edited, so an
                        # untouched document part keeps its original bytes.
                        # NOTE(review): ElementTree does not keep the original
                        # namespace prefixes when serializing.
                        data = ET.tostring(root, encoding='utf-8', xml_declaration=True)
                # Keep each entry's original compression method.
                new_docx.writestr(item.filename, data, compress_type=item.compress_type)
        os.replace(temp_docx_path, docx_path)
    finally:
        # After a successful replace the temp file is gone; this only cleans
        # up when an error interrupted the rewrite.
        if os.path.exists(temp_docx_path):
            os.remove(temp_docx_path)
|
|
|