Spaces:

bodhak
/

pdf-tools-suite

Sleeping

File size: 1,385 Bytes

92c0981
 
5d01bda

import torch
from transformers import pipeline

# Pegasus Chinese summarization model hosted on the Hugging Face Hub.
# The same id is used for both the model weights and the tokenizer.
_MODEL_ID = "IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese"

# Built once at import time and reused by every summarization call.
# When running on a Hugging Face Space this can be used as-is.
summarizer = pipeline(
    "summarization",
    model=_MODEL_ID,
    tokenizer=_MODEL_ID,
    # Use the first CUDA GPU when one is available, otherwise run on CPU (-1).
    # A hard-coded device=0 makes the import fail on CPU-only hosts.
    device=0 if torch.cuda.is_available() else -1,
)

def 摘要(pdf_純文字, *, summarize_fn=None):
    """Summarize plain text extracted from a Chinese PDF.

    Per the original note, this is intended for both Traditional and
    Simplified Chinese text.

    The text is split on newlines into paragraphs, and paragraphs shorter
    than 30 characters are skipped.  Each remaining paragraph is cut into
    pieces of 400 characters, every piece is summarized on its own, and the
    per-piece summaries are joined with newlines.

    A trailing piece shorter than 30 characters is folded into the piece
    before it (so a piece can reach 429 characters).  Without this, a
    leftover of a few characters would be summarized by itself even though
    whole paragraphs of that size are skipped.

    Args:
        pdf_純文字: Plain text extracted from the PDF.  ``None``, an empty
            string, or fewer than 20 non-whitespace-trimmed characters is
            treated as unparseable.
        summarize_fn: Optional callable used instead of the module-level
            ``summarizer``.  It is called as
            ``summarize_fn(text, max_length=64, min_length=10,
            do_sample=False)`` and must return a sequence whose first item
            is a mapping with a ``"summary_text"`` key.

    Returns:
        The newline-joined summaries.  A piece whose summarization raises
        contributes an inline error marker instead of a summary.  If the
        input is empty/too short, or nothing could be summarized, a warning
        message string is returned instead.
    """
    if not pdf_純文字 or len(pdf_純文字.strip()) < 20:
        return "⚠️ PDF 內容為空或無法解析（可能是掃描檔或圖片）"

    min_chars = 30     # shorter paragraphs / tail pieces are not summarized alone
    chunk_size = 400   # characters per piece handed to the model

    summaries = []
    for raw_line in pdf_純文字.split('\n'):
        paragraph = raw_line.strip()
        if len(paragraph) < min_chars:
            continue

        pieces = [
            paragraph[start:start + chunk_size]
            for start in range(0, len(paragraph), chunk_size)
        ]
        # Fold a too-short leftover into the previous piece so it is neither
        # dropped nor summarized on its own.
        if len(pieces) > 1 and len(pieces[-1]) < min_chars:
            tail = pieces.pop()
            pieces[-1] += tail

        for piece in pieces:
            # Best effort: one failing piece must not abort the whole
            # document, so the error is reported inline and processing
            # continues with the next piece.
            try:
                # The default summarizer is looked up only here, so the
                # guard paths above never need the model to be loaded.
                run = summarizer if summarize_fn is None else summarize_fn
                # NOTE(review): the original comment states that Pegasus'
                # max_length tops out at 128 — unverified; 64 stays below it.
                out = run(piece, max_length=64, min_length=10, do_sample=False)
                if out:
                    summaries.append(out[0]['summary_text'])
            except Exception as e:
                summaries.append(f"(錯誤：{e})")

    return "\n".join(summaries) if summaries else "⚠️ 沒有找到可摘要的內容！"