|
from . import common
|
|
import datetime
|
|
import fitz
|
|
import os
|
|
import re
|
|
import shutil
|
|
import subprocess
|
|
import threading
|
|
import time
|
|
from . import to_translate
|
|
|
|
def _resolve_max_threads(threads):
    """Return the worker-thread limit for the ``threads`` setting.

    ``None``, an empty string, or a non-positive number select the default
    of 10.  Any other value is converted with ``int()`` (which raises
    ``ValueError`` for non-numeric input, as before).
    """
    if threads is None or threads == "":
        return 10
    limit = int(threads)
    # A limit of 0 would never allow the dispatch loop to start a worker,
    # so it is treated like a negative value and replaced by the default.
    return limit if limit > 0 else 10


def _translate_document(trans, src_pdf, max_threads, start_time):
    """Collect texts from ``src_pdf``, translate them in threads, write them back.

    Returns ``False`` when the shared event is set by a worker, ``True`` once
    ``to_translate.complete`` has been called.
    """
    trans_type = trans['type']
    # Every known translation type maps to the writer it uses and the boolean
    # flag that is passed to that writer as the last argument.
    writers = {
        "trans_text_only_inherit": (write_block_text, True),
        "trans_text_only_new": (write_block_text, True),
        "trans_text_both_new": (write_block_both, True),
        "trans_text_both_inherit": (write_block_both, True),
        "trans_all_only_new": (write_block_text, False),
        "trans_all_only_inherit": (write_block_text, False),
        "trans_all_both_new": (write_block_both, False),
        "trans_all_both_inherit": (write_block_both, False),
    }
    writer = writers.get(trans_type)

    texts = []
    if writer is not None:
        # All known types read the block text the same way; unknown types
        # only get the page images queued below.
        read_block_text(src_pdf, texts)

    trans['html_path'] = trans['storage_path'] + '/uploads/' + trans['uuid']
    read_page_images(src_pdf, texts)

    max_run = min(max_threads, len(texts))
    event = threading.Event()
    baseline_threads = threading.active_count()
    run_index = 0
    while run_index < len(texts):
        if threading.active_count() < max_run + baseline_threads:
            if event.is_set():
                return False
            worker = threading.Thread(
                target=to_translate.get,
                args=(trans, event, texts, run_index),
            )
            worker.start()
            run_index += 1
        else:
            # All slots are busy: yield briefly instead of spinning at 100% CPU.
            time.sleep(0.01)

    # Wait until every queued entry has been marked complete by its worker.
    while True:
        if event.is_set():
            return False
        if all(item['complete'] for item in texts):
            break
        time.sleep(1)

    text_count = 0
    if writer is not None:
        write_fn, flag = writer
        # NOTE(review): the write_block_text / write_block_both definitions
        # visible in this file take (pages, newpdf, texts); this 4-argument
        # call is kept from the original -- confirm which signature is in effect.
        # NOTE(review): text_count is an int, so the callee cannot update it;
        # to_translate.complete() below therefore always receives 0.
        write_fn(src_pdf, texts, text_count, flag)

    end_time = datetime.datetime.now()
    spend_time = common.display_spend(start_time, end_time)
    to_translate.complete(trans, text_count, spend_time)
    return True


def start(trans):
    """Run the translation job described by ``trans`` on a PDF file.

    Reads ``trans['threads']``, ``trans['file_path']``, ``trans['id']``,
    ``trans['type']``, ``trans['uuid']`` and ``trans['storage_path']`` and
    stores the derived upload path in ``trans['html_path']``.

    Returns:
        ``False`` if the document cannot be opened (after reporting the error
        through ``to_translate.error``) or if a worker sets the shared event;
        ``True`` after ``to_translate.complete`` has been called.
    """
    max_threads = _resolve_max_threads(trans['threads'])
    start_time = datetime.datetime.now()

    try:
        src_pdf = fitz.open(trans['file_path'])
    except Exception:
        to_translate.error(trans['id'], "无法访问该文档")
        return False

    try:
        return _translate_document(trans, src_pdf, max_threads, start_time)
    finally:
        # Release the document on every exit path.
        src_pdf.close()
|
|
|
|
|
|
def read_page_images(pages, texts):
    """Queue every inline base64 image found in each page's XHTML output.

    For each page the XHTML rendering is searched for ``data:image/...``
    URIs; each match is passed to ``append_text`` with the type ``'image'``.
    """
    data_uri = re.compile(r"(data:image/\w+;base64,[^\"]+)")
    for page in pages:
        for uri in data_uri.findall(page.get_text("xhtml")):
            append_text(uri, 'image', texts)
|
|
|
|
|
|
def read_block_text(pages, texts):
    """Group the text blocks of every page and queue each group via ``append_text``.

    A block starts a new group when its index-5 field is 0 (the block number
    in PyMuPDF's "blocks" tuple layout) or when its left or right edge differs
    by more than 12 from the previous block's; otherwise its text is appended
    to the current group.  Newlines are stripped from block text.  The group
    still pending after the last page is queued at the end.
    """
    pending = ""
    for page in pages:
        # Edge positions of the previous block; reset for every page.
        prev_left = 0
        prev_right = 0
        for block in page.get_text("blocks"):
            left, right = block[0], block[2]
            starts_group = (
                block[5] == 0
                or abs(right - prev_right) > 12
                or abs(left - prev_left) > 12
            )
            if starts_group:
                append_text(pending, "text", texts)
                pending = block[4].replace("\n", "")
            else:
                pending = pending + block[4].replace("\n", "")
            prev_right = right
            prev_left = left
    append_text(pending, "text", texts)
|
|
|
|
|
|
def _flush_translated_group(anchor, pending, texts):
    """Draw the next queued translation at ``anchor`` if ``pending`` was queued.

    ``anchor`` is ``(output_page, x0, y0)`` of the most recent source block,
    or ``None`` before any block was seen.  Nothing is popped unless
    ``pending`` passes ``check_text`` -- the same filter ``append_text``
    applies when the groups are collected -- and ``texts`` is non-empty.
    """
    if anchor is None or not texts or not check_text(pending):
        return
    out_page, x0, y0 = anchor
    translated = texts.pop(0).get("text", "")
    out_page.insert_text((x0, y0), translated, fontsize=12, fontname="Helvetica", overlay=False)


def write_block_text(pages, newpdf, texts):
    """Create one page in ``newpdf`` per source page and draw the translations.

    The source blocks are grouped with the same rule as ``read_block_text``
    (new group when the block's index-5 field is 0 or an edge moves by more
    than 12).  Each time a group ends, the next entry is popped from the
    front of ``texts`` and its ``"text"`` is drawn at the position of the
    group's last block.

    Fixes over the previous version:
    * the empty-queue guard only applied to the last ``or`` clause (``and``
      binds tighter than ``or``), so ``texts.pop(0)`` could raise IndexError;
    * an entry was popped even for groups that were never queued (e.g. the
      empty group before the first block), shifting every translation onto
      the previous group's position;
    * the final flush reused the previous ``trans_text`` (or hit an unbound
      name) instead of consuming the next entry.
    """
    pending = ""
    anchor = None  # (output page, x0, y0) of the most recent block
    for page in pages:
        last_x0 = 0
        last_x1 = 0
        new_page = newpdf.new_page(width=page.rect.width, height=page.rect.height)
        for block in page.get_text("blocks"):
            x0, y0, x1 = block[0], block[1], block[2]
            if block[5] == 0 or abs(x1 - last_x1) > 12 or abs(x0 - last_x0) > 12:
                # The previous group is complete: place its translation where
                # that group ended (possibly on the previous output page).
                _flush_translated_group(anchor, pending, texts)
                pending = block[4].replace("\n", "")
            else:
                pending = pending + block[4].replace("\n", "")
            last_x0 = x0
            last_x1 = x1
            anchor = (new_page, x0, y0)
    # The last group has no following block to trigger its flush.
    _flush_translated_group(anchor, pending, texts)
|
|
|
|
|
|
def _flush_bilingual_group(anchor, pending, texts):
    """Draw the next translation and its source text ``pending`` at ``anchor``.

    ``anchor`` is ``(translated_page, original_page, x0, y0)`` of the most
    recent source block, or ``None`` before any block was seen.  Nothing is
    popped unless ``pending`` passes ``check_text`` -- the same filter
    ``append_text`` applies when groups are collected -- and ``texts`` is
    non-empty.
    """
    if anchor is None or not texts or not check_text(pending):
        return
    translated_page, original_page, x0, y0 = anchor
    translated = texts.pop(0).get("text", "")
    translated_page.insert_text((x0, y0), translated, fontsize=12, fontname="Helvetica", overlay=False)
    original_page.insert_text((x0, y0), pending, fontsize=12, fontname="Helvetica", overlay=False)


def write_block_both(pages, newpdf, texts):
    """Create a translated page and an original-text page per source page.

    For every source page two pages are added to ``newpdf``: first the page
    that receives translations, then the page that receives the source text.
    Blocks are grouped with the same rule as ``read_block_text``; when a
    group ends, the next entry is popped from the front of ``texts`` and drawn
    on the translated page, and the group's own source text is drawn at the
    same position on the original page.

    Fixes over the previous version:
    * the empty-queue guard only applied to the last ``or`` clause, so
      ``texts.pop(0)`` could raise IndexError;
    * entries were popped for groups that were never queued, shifting every
      translation onto the previous group's position;
    * the original page received the *next* block's text at the *previous*
      block's position;
    * the final flush reused a stale or unbound ``trans_text``.
    """
    pending = ""
    anchor = None  # (translated page, original page, x0, y0) of the latest block
    for page in pages:
        last_x0 = 0
        last_x1 = 0
        new_page = newpdf.new_page(width=page.rect.width, height=page.rect.height)
        old_page = newpdf.new_page(width=page.rect.width, height=page.rect.height)
        for block in page.get_text("blocks"):
            x0, y0, x1 = block[0], block[1], block[2]
            if block[5] == 0 or abs(x1 - last_x1) > 12 or abs(x0 - last_x0) > 12:
                # The previous group is complete: emit both versions of it.
                _flush_bilingual_group(anchor, pending, texts)
                pending = block[4].replace("\n", "")
            else:
                pending = pending + block[4].replace("\n", "")
            last_x0 = x0
            last_x1 = x1
            anchor = (new_page, old_page, x0, y0)
    # The last group has no following block to trigger its flush.
    _flush_bilingual_group(anchor, pending, texts)
|
|
|
|
|
|
def write_page_text(pages, newpdf, texts):
    """Add one page to ``newpdf`` per source page and draw one queued text on it.

    A page of the same size is always created.  If the source page's plain
    text passes ``check_text`` and ``texts`` is not empty, the first entry is
    popped and its ``"text"`` value is inserted at (0, 0) with font size 12.
    """
    for page in pages:
        source_text = page.get_text("text")
        out_page = newpdf.new_page(width=page.rect.width, height=page.rect.height)
        if not check_text(source_text) or len(texts) == 0:
            continue
        entry = texts.pop(0)
        out_page.insert_text((0, 0), entry.get("text", ""), fontsize=12, overlay=False)
|
|
|
|
|
|
def read_row(pages, texts):
    """Queue the text of every page, starting a new entry at each block 0.

    Block texts are concatenated until a block whose index-5 field is 0 is
    met (the block number in PyMuPDF's "blocks" tuple layout); at that point
    the text gathered so far is passed to ``append_text`` and accumulation
    restarts with that block.

    The text still pending after the last block is queued as well;
    previously it was silently dropped, losing the final group.
    """
    pending = ""
    for page in pages:
        for block in page.get_text("blocks"):
            if block[5] == 0:
                append_text(pending, 'text', texts)
                pending = block[4]
            else:
                pending = pending + block[4]
    # Flush the last group, mirroring read_block_text.
    append_text(pending, 'text', texts)
|
|
|
|
|
|
def write_row(newpdf, texts, page_width, page_height):
    """Create one page in ``newpdf`` and draw every entry of ``texts`` on it.

    Each entry's ``'text'`` is printed to stdout and then inserted at the
    point given by the first two values of its ``'block'`` field, using font
    size 16.  Returns ``None``.
    """
    out_page = newpdf.new_page(width=page_width, height=page_height)
    for entry in texts:
        content = entry['text']
        print(content)
        origin = entry['block']
        out_page.insert_text((origin[0], origin[1]), content, fontsize=16)
    return
|
|
|
|
|
|
def append_text(text, content_type, texts):
    """Append a pending translation entry for ``text`` to ``texts``.

    Nothing is appended when ``check_text(text)`` is false.  Otherwise the
    new entry records the text, its ``content_type`` under ``"type"`` and
    ``"complete": False``.
    """
    if not check_text(text):
        return
    entry = {"text": text, "type": content_type, "complete": False}
    texts.append(entry)
|
|
|
|
|
|
def check_text(text):
    """Return True if ``text`` is non-empty and not made up solely of punctuation.

    ``None`` and empty values yield ``False`` without consulting
    ``common.is_all_punc``.
    """
    # Identity comparison is the idiomatic (PEP 8) way to test for None.
    return text is not None and len(text) > 0 and not common.is_all_punc(text)
|
|
|
|
|
|
def draw_text_avoid_overlap(page, text, x, y, font_size):
    """Draw ``text`` at the given position, avoiding overlap with existing text.

    The area probed is ``len(text) * font_size`` wide and ``font_size`` high.
    While ``page.get_textbox`` reports any text inside it, ``y`` is moved down
    by ``font_size + 1``; the text is then inserted at the first free spot.
    """
    estimated_width = len(text) * font_size
    line_step = font_size + 1
    while page.get_textbox((x, y, x + estimated_width, y + font_size)):
        y += line_step
    page.insert_text((x, y), text, fontsize=font_size)
|
|
|
|
|
|
def draw_table(page, table_data, x, y, width, cell_height):
    """Draw ``table_data`` as a grid of text cells with border lines on ``page``.

    Args:
        page: object providing ``insert_text`` and ``draw_line``.
        table_data: sequence of rows; the column count is taken from the
            first row.
        x, y: top-left corner of the table.
        width: total table width, split evenly between the columns.
        cell_height: height of every row.

    Each cell gets its text and a top line.  Cells in the first row also get
    a left line, cells in the last column a right line and cells in the last
    row a bottom line.

    Fixes over the previous version: every row now starts at the caller's
    ``x`` (it used to restart at 0 after the first row), and an empty table
    is a no-op instead of raising IndexError.
    """
    if not table_data or not table_data[0]:
        return

    rows = len(table_data)
    cols = len(table_data[0])
    col_width = width / cols

    for i, row in enumerate(table_data):
        top = y + i * cell_height
        bottom = top + cell_height
        for j in range(cols):
            left = x + j * col_width
            right = left + col_width

            page.insert_text((left, top), row[j])
            page.draw_line((left, top), (right, top), width=0.5)

            # NOTE(review): left edges are only drawn for the first row, as in
            # the original; confirm whether every row should get them.
            if i == 0:
                page.draw_line((left, top), (left, bottom), width=0.5)
            if j == cols - 1:
                page.draw_line((right, top), (right, bottom), width=0.5)
            if i == rows - 1:
                page.draw_line((left, bottom), (right, bottom), width=0.5)
|
|
|
|
|
|
def wrap_text(text, width):
    """Split ``text`` on single spaces and regroup the words into lines.

    Every word is stored followed by one space.  Before a word is placed,
    the current line is closed if it already holds ``width`` space-separated
    pieces.  The word is then appended when the resulting length does not
    exceed ``width * len(word)``; otherwise the current line is closed and
    the word starts a new one.  Closed lines may be empty strings.

    NOTE(review): the length limit depends on the length of the word being
    added, which looks unintended -- behaviour is kept as is.
    """
    wrapped = []
    current = ""
    for word in text.split(' '):
        # count(' ') + 1 equals the number of pieces split(' ') would return.
        if current.count(' ') + 1 >= width:
            wrapped.append(current)
            current = ""
        piece = word + ' '
        if len(current) + len(piece) > width * len(word):
            wrapped.append(current)
            current = piece
        else:
            current += piece
    if current:
        wrapped.append(current)
    return wrapped
|
|
|
|
|
|
def is_paragraph(block):
    """Return True if the line heights in ``block`` differ by more than 1.5x.

    ``block`` is a sequence of objects with a ``height`` attribute.  Blocks
    with fewer than two lines always yield ``False``.

    The ratio test is written as a multiplication so that a line of height 0
    no longer raises ZeroDivisionError.
    """
    if len(block) < 2:
        return False

    heights = [line.height for line in block]
    return max(heights) > 1.5 * min(heights)
|
|
|
|
|
|
def is_next_line_continuation(page, current_line, next_line_index):
    """Return True if the two numeric arguments differ by less than 0.1.

    ``page`` is accepted but not used.
    """
    gap = next_line_index - current_line
    return abs(gap) < 0.1
|
|
|
|
|
|
def print_texts(texts):
    """Print the ``"text"`` value of every entry in ``texts``, one per line.

    Entries without a ``"text"`` key print ``None``.
    """
    for content in (entry.get("text") for entry in texts):
        print(content)
|
|
|
|
|
|
def is_scan_pdf(pages):
    """Return True if the first page has no text but embeds at least one image.

    The page counts as scanned when ``page.get_text()`` is an empty string
    and its XHTML rendering contains a base64 ``data:image`` URI.

    Changes over the previous version: an empty document now returns
    ``False`` instead of an implicit ``None``, and the XHTML rendering is
    only produced when the page has no text.

    NOTE(review): as before, only the first page decides the result (the old
    loop returned during its first iteration); confirm whether every page
    should be inspected.
    """
    first_page = next(iter(pages), None)
    if first_page is None:
        return False
    if first_page.get_text() != "":
        return False
    html = first_page.get_text("xhtml")
    return re.search(r"(data:image/\w+;base64,[^\"]+)", html) is not None
|
|
|
|
|
|
def read_pdf_html(pages, texts, trans):
    """Run the external ``pdftohtml`` tool for every page of the document.

    For each page the expected output file
    ``"<trans['html_path']>-<page number>.html"`` is removed if it already
    exists, ``pdftohtml -c -l <page number>`` is run on ``trans['file_path']``
    with ``trans['html_path']`` as output base, and the output file is then
    required to exist.  ``texts`` is accepted but not used.

    Raises:
        Exception: if ``pdftohtml`` is not on the PATH or the expected HTML
            file was not produced.

    Fixes over the previous version: the page number was read from an
    undefined name (``page_index``), the located executable was stored under
    a different name than the one checked and used, and the page number was
    passed to ``subprocess.run`` as an int instead of a string.
    """
    # Page numbers start at 1 here -- NOTE(review): this assumes pdftohtml
    # names its per-page output "<base>-<n>.html" with n counted from 1.
    for page_index, _page in enumerate(pages, start=1):
        target_html = "{}-{}.html".format(trans['html_path'], page_index)
        if os.path.exists(target_html):
            # Remove a stale file so the existence check below is meaningful.
            os.remove(target_html)
        pdftohtml_path = shutil.which("pdftohtml")
        if pdftohtml_path is None:
            raise Exception("未安装pdftohtml")
        # NOTE(review): "-l" alone converts pages 1..page_index on every
        # iteration; kept as in the original command line.
        subprocess.run([pdftohtml_path, "-c", "-l", str(page_index), trans['file_path'], trans['html_path']])
        if not os.path.exists(target_html):
            raise Exception("无法生成html")
|
|
|
|
|