Spaces:

bodhak
/

pdf-tools-suite

Sleeping

pdf-tools-suite / pdfsum.py

Upload 8 files

5d01bda verified 3 days ago

1.39 kB

	from transformers import pipeline

	# Use the Pegasus Chinese summarization checkpoint hosted on the Hugging Face
	# Hub; the same identifier serves as both the model and the tokenizer.
	_MODEL_ID = "IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese"

	# Choose the device at import time instead of hard-coding GPU 0, so the module
	# also loads on CPU-only hosts (0 = first CUDA GPU, -1 = CPU).
	try:
	    import torch

	    _DEVICE = 0 if torch.cuda.is_available() else -1
	except ImportError:
	    # PyTorch is not installed, so no CUDA device can be probed: use the CPU.
	    _DEVICE = -1

	summarizer = pipeline(
	    "summarization",
	    model=_MODEL_ID,
	    tokenizer=_MODEL_ID,
	    device=_DEVICE,
	)

	def _split_into_chunks(text, size=400, min_tail=30):
	    """Split *text* into consecutive slices of at most *size* characters.

	    A trailing slice shorter than *min_tail* characters is appended to the
	    slice before it rather than returned on its own, so callers never get a
	    fragment that is too short to summarize meaningfully.
	    """
	    chunks = [text[start:start + size] for start in range(0, len(text), size)]
	    if len(chunks) > 1 and len(chunks[-1]) < min_tail:
	        tail = chunks.pop()
	        chunks[-1] += tail
	    return chunks


	def 摘要(pdf_純文字):
	    """Summarize plain text extracted from a Chinese PDF.

	    Intended for both Traditional and Simplified Chinese text. The text is
	    split on newlines; each non-trivial line is cut into slices of roughly
	    400 characters and every slice is passed to the module-level
	    ``summarizer``.

	    Args:
	        pdf_純文字: Plain text of the PDF. May be ``None`` or empty.

	    Returns:
	        The per-slice summaries joined with newlines. A warning message is
	        returned instead when the input is empty or shorter than 20
	        characters after stripping, or when no line is long enough to
	        summarize. If summarizing a slice raises, an ``(錯誤：...)``
	        placeholder carrying the error text takes that slice's place.
	    """
	    if not pdf_純文字 or len(pdf_純文字.strip()) < 20:
	        return "⚠️ PDF 內容為空或無法解析（可能是掃描檔或圖片）"

	    summaries = []
	    for raw_line in pdf_純文字.split("\n"):
	        paragraph = raw_line.strip()
	        # Skip blank lines and lines too short to be worth summarizing.
	        if len(paragraph) < 30:
	            continue
	        # Slice long paragraphs into ~400-character pieces; a short leftover
	        # tail is merged into the previous piece (see _split_into_chunks).
	        for chunk in _split_into_chunks(paragraph, size=400, min_tail=30):
	            try:
	                # NOTE(review): the original author noted that Pegasus caps
	                # max_length at 128; 64 stays well below that. Confirm
	                # against the model card.
	                out = summarizer(
	                    chunk, max_length=64, min_length=10, do_sample=False
	                )
	                if out:
	                    summaries.append(out[0]['summary_text'])
	            except Exception as e:
	                # Best effort: record the failure inline and keep going so
	                # one bad slice does not discard the rest of the document.
	                summaries.append(f"(錯誤：{e})")
	    return "\n".join(summaries) if summaries else "⚠️ 沒有找到可摘要的內容！"