Tbb1111 committed on
Commit
7b6f181
·
verified ·
1 Parent(s): 3600037

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -4
app.py CHANGED
@@ -17,10 +17,22 @@ def translate_pdf(pdf_file):
17
  for page in doc:
18
  text += page.get_text()
19
 
20
- # 使用 T5 模型进行翻译
21
- inputs = tokenizer.encode("translate English to Chinese: " + text, return_tensors="pt", max_length=512, truncation=True)
22
- outputs = model.generate(inputs, max_length=1024, num_beams=4, early_stopping=True)
23
- translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  # 创建翻译后的 PDF
26
  translated_pdf = FPDF()
 
17
  for page in doc:
18
  text += page.get_text()
19
 
20
+ # 为了避免输入超长,按段落拆分翻译
21
+ paragraphs = text.split("\n")
22
+ translated_paragraphs = []
23
+
24
+ # 分批翻译每一段
25
+ for paragraph in paragraphs:
26
+ if len(paragraph.strip()) == 0:
27
+ continue
28
+ # 使用 T5 模型进行翻译
29
+ inputs = tokenizer.encode("translate English to Chinese: " + paragraph, return_tensors="pt", max_length=512, truncation=True)
30
+ outputs = model.generate(inputs, max_length=1024, num_beams=4, early_stopping=True)
31
+ translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
32
+ translated_paragraphs.append(translated_text)
33
+
34
+ # 将所有翻译后的段落合并
35
+ translated_text = "\n".join(translated_paragraphs)
36
 
37
  # 创建翻译后的 PDF
38
  translated_pdf = FPDF()