Spaces:

3a05chatgpt
/

pdf-summarizer-app

Sleeping

App Files Files Community

3a05chatgpt committed 3 days ago

Commit

cd3e0b8

verified ·

1 Parent(s): 3f09f55

Upload 8 files

Browse files

Files changed (6) hide show

README.md +14 -39
app.py +53 -46
gitattributes +1 -6
pdfsum.py +13 -16
requirements.txt +3 -4
textsumm.py +14 -23

README.md CHANGED Viewed

@@ -1,44 +1,19 @@
----
-title: PDF 工具箱 (中文)
-emoji: 📑
-colorFrom: blue         # 只能用 red, yellow, green, blue, indigo, purple, pink, gray
-colorTo: purple
-sdk: streamlit
-sdk_version: 1.34.0
-app_file: app.py
-pinned: false
-license: mit
----
-# 📑 PDF 工具箱（中文）
-這是一套多功能 PDF 與文本處理工具，內建多種 AI 智能應用，適用於學習、教學與日常工作！
 ## 主要功能
-- **文字摘要**：輸入中英文長文，自動生成摘要
-- **PDF 摘要**：上傳 PDF，提取內容並自動生成摘要
-- **PDF 密碼移除**：上傳受密碼保護的 PDF，解除密碼（需輸入正確密碼）
-- **論文搜尋（arXiv）**：輸入主題關鍵字，快速查找與摘要學術論文
-- **PDF 合併**：多個 PDF 一鍵合併
-- **PDF 拆頁**：將 PDF 分割成多份
-- **PDF 轉純文字**：提取 PDF 文字內容
-## 使用說明
-1. **輸入 OpenAI API 金鑰**（如有使用 GPT 文字/PDF 摘要功能）
-2. **選擇 GPT 模型**（支援 gpt-4, gpt-4.0, gpt-4.1, gpt-4.5）
-3. **根據需求選擇功能頁籤並操作**
-## 注意事項
-- PDF 上傳建議單檔不超過 10MB
-- 文字摘要與 PDF 摘要模型預設為 `facebook/bart-large-cnn`（僅英文摘要），如需中文請改為 Pegasus、ChineseBART 等支援中文的 summarization 模型
-- 你的 API 金鑰僅儲存在本地瀏覽器，不會外傳
-- 本專案基於 MIT License
-## 技術棧
-- Streamlit
-- PyPDF2、transformers、torch 等
 ---
-> 製作者：阿亮老師
-> 非經允許、不得私自修改應用！

+# 📄 PDF 工具箱
 ## 主要功能
+- **文字摘要**：支援中文長文摘要（Pegasus模型）
+- **PDF 摘要**：PDF自動擷取中文摘要
+- **論文搜尋**：arXiv關鍵字查詢
+- 支援 Hugging Face Spaces，部署即用
+## 安裝需求
+pip install -r requirements.txt
+## 使用方式
+1. 打開 [Hugging Face Spaces](https://huggingface.co/spaces)
+2. 上傳本專案全部檔案
+3. 可選擇使用 GPU 加速
 ---
+**中文用戶專屬！**
+（如需進階功能，請洽站長）

app.py CHANGED Viewed

@@ -1,59 +1,66 @@
 import streamlit as st
 from textsumm import 摘要
-from pdfsum import pdf摘要
-from papersearch import 論文搜尋
-st.set_page_config(page_title="PDF 工具箱 (中文)", page_icon=":books:", layout="wide")
-st.sidebar.title("📑 PDF 工具箱")
-功能 = st.sidebar.radio(
-    "請選擇功能",
-    ["文字摘要", "PDF 摘要", "論文搜尋（arXiv）"],
-    index=0
-)
-st.sidebar.markdown("---")
-st.sidebar.markdown("本應用支援中文摘要（Pegasus 中文模型）")
-if 功能 == "文字摘要":
     st.header("📝 文字摘要")
-    text = st.text_area("請輸入要摘要的文字")
     if st.button("生成摘要"):
-        with st.spinner("AI 生成中..."):
-            summary = 摘要(text)
-        st.subheader("摘要結果")
-        st.success(summary)
-elif 功能 == "PDF 摘要":
     st.header("📄 PDF 摘要")
-    pdf_file = st.file_uploader("請上傳 PDF 檔案", type=["pdf"])
-    if st.button("產生 PDF 摘要"):
-        if pdf_file is not None:
-            with st.spinner("AI 解析中..."):
-                summary = pdf摘要(pdf_file)
-            st.subheader("PDF 摘要結果")
-            st.success(summary)
-        else:
-            st.warning("請先上傳 PDF 檔案")
-elif 功能 == "論文搜尋（arXiv）":
     st.header("🔎 論文搜尋（arXiv）")
-    關鍵字 = st.text_input("輸入主題或關鍵字")
-    max_results = st.slider("結果數量", 1, 30, 10)
-    col1, col2 = st.columns(2)
-    with col1:
-        start_year = st.number_input("起始年份", min_value=1991, max_value=2025, value=2011)
-    with col2:
-        end_year = st.number_input("結束年份", min_value=1991, max_value=2025, value=2025)
     if st.button("搜尋論文"):
-        with st.spinner("搜尋中..."):
-            papers = 論文搜尋(關鍵字, max_results, start_year, end_year)
-        if not papers:
-            st.info("在所選年份範圍內沒有找到相關論文。")
         else:
-            for idx, p in enumerate(papers, 1):
-                with st.expander(f"📄 {idx}. {p['標題']}"):
-                    st.write(f"**作者：** {p['作者']}")
-                    st.write(f"**發表日期：** {p['發表日期']}")
-                    st.write(f"**摘要：** {p['摘要']}")
-                    st.write(f"[arXiv 連結]({p['arXiv 連結']})")

+# app.py
 import streamlit as st
 from textsumm import 摘要
+from pdfsum import 提取_pdf摘要
+import requests
+st.set_page_config(page_title="PDF 工具箱", page_icon="📄", layout="wide")  # page title, icon and wide layout
+# Sidebar: app title, optional OpenAI API key, GPT model choice and tool choice.
+st.sidebar.title("📄 PDF 工具箱")
+st.sidebar.write("請輸入 OpenAI API 金鑰（非必填）")
+api_key = st.sidebar.text_input("sk-...", type="password")  # NOTE(review): api_key is not referenced by the rest of app.py as shown in this diff
+# GPT model selection. NOTE(review): `model` is not referenced by the rest of app.py as shown in this diff; confirm it is still needed.
+model = st.sidebar.radio("選擇 GPT 模型", options=["gpt-4", "gpt-4.0", "gpt-4.1", "gpt-4.5"], index=0)
+# Tool selection; the chosen label is compared in the if/elif chain that follows.
+tool = st.sidebar.radio("選擇功能", options=["文字摘要", "PDF 摘要", "論文搜尋"])
+st.title("PDF 工具箱")  # main-page heading
+if tool == "文字摘要":
     st.header("📝 文字摘要")
+    user_text = st.text_area("請輸入要摘要的中文內容")
     if st.button("生成摘要"):
+        with st.spinner("摘要生成中..."):
+            if user_text.strip():
+                summary = 摘要(user_text.strip())
+                st.success("摘要結果：")
+                st.write(summary)
+            else:
+                st.warning("請輸入內容！")
+elif tool == "PDF 摘要":
     st.header("📄 PDF 摘要")
+    uploaded_file = st.file_uploader("上傳你的 PDF 文件", type=["pdf"])
+    if uploaded_file is not None and st.button("產生 PDF 摘要"):
+        with st.spinner("摘要生成中..."):
+            summary = 提取_pdf摘要(uploaded_file, 摘要)
+            st.success("摘要結果：")
+            st.write(summary)
+elif tool == "論文搜尋":
     st.header("🔎 論文搜尋（arXiv）")
+    keyword = st.text_input("輸入主題或關鍵字", "量子")
+    max_results = st.slider("結果數量", 1, 20, 5)
+    start_year = st.number_input("起始年份", min_value=1990, max_value=2025, value=2019)
+    end_year = st.number_input("結束年份", min_value=1990, max_value=2025, value=2025)
     if st.button("搜尋論文"):
+        st.info("搜尋中...")
+        url = f"http://export.arxiv.org/api/query?search_query=all:{keyword}&start=0&max_results={max_results}"
+        resp = requests.get(url)
+        if resp.ok:
+            import xml.etree.ElementTree as ET
+            root = ET.fromstring(resp.content)
+            found = False
+            for entry in root.findall("{http://www.w3.org/2005/Atom}entry"):
+                published = entry.find("{http://www.w3.org/2005/Atom}published").text[:4]
+                if start_year <= int(published) <= end_year:
+                    found = True
+                    title = entry.find("{http://www.w3.org/2005/Atom}title").text.strip()
+                    link = entry.find("{http://www.w3.org/2005/Atom}id").text.strip()
+                    st.markdown(f"**[{title}]({link})**（{published}）")
+            if not found:
+                st.warning("在所選年份範圍內沒有找到相關論文。")
         else:
+            st.error("arXiv 查詢失敗")

gitattributes CHANGED Viewed

@@ -1,6 +1 @@
-# Git LFS 屬性設定檔（可用於大檔案控制）
-*.pdf filter=lfs diff=lfs merge=lfs -text
-*.jpg filter=lfs diff=lfs merge=lfs -text
-*.png filter=lfs diff=lfs merge=lfs -text
-# 中文註解：上面設定會讓 PDF/圖片走 Git LFS（大檔案友善處理）


1	+ * text=auto

pdfsum.py CHANGED Viewed

@@ -1,18 +1,15 @@
-import PyPDF2
-from textsumm import 摘要
-def pdf抽取文字(pdf_file):
-    # pdf_file 來自 st.file_uploader，為 BytesIO 物件
-    pdf_reader = PyPDF2.PdfReader(pdf_file)
-    all_text = ""
-    for page in pdf_reader.pages:
-        page_text = page.extract_text() or ""
-        all_text += page_text.strip() + "\n"
-    return all_text
-def pdf摘要(pdf_file):
-    內容 = pdf抽取文字(pdf_file)
-    if not 內容.strip():
-        return "⚠️ PDF 無可讀文字或為掃描檔，請上傳可解析之 PDF"
-    # 可依需求切分多頁逐一摘要
-    return 摘要(內容)

+# pdfsum.py
+from PyPDF2 import PdfReader
+def 提取_pdf文本(pdf_file):
+    reader = PdfReader(pdf_file)
+    texts = []
+    for page in reader.pages:
+        texts.append(page.extract_text() or "")
+    return "\n".join(texts)
+def 提取_pdf摘要(pdf_file, summarizer_func):
+    text = 提取_pdf文本(pdf_file)
+    if len(text.strip()) < 30:
+        return "❌ 無法提取足夠文字內容"
+    return summarizer_func(text)

requirements.txt CHANGED Viewed

@@ -1,6 +1,5 @@
-streamlit
-transformers
 torch
 PyPDF2
-sentencepiece
-arxiv

+transformers==4.41.1
+streamlit==1.35.0
 torch
 PyPDF2
+requests

textsumm.py CHANGED Viewed

@@ -1,28 +1,19 @@
 from transformers import pipeline
-# 指定使用 Hugging Face 上支援中文的摘要模型
-# 例如："IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese" 是免費的中文摘要模型
-# device=0（如果有 GPU）可加速；沒有 GPU 可以移除 device 參數
-summarizer = pipeline(
-    "summarization",
-    model="IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese",
-    tokenizer="IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese",
-    device=0   # 若在 CPU 請刪除這一行
-)
-def 摘要(文本, 最大長度=128, 最小長度=30):
     """
-    用於生成中文長文摘要
-    :param 文本: 輸入的待摘要中文文本
-    :param 最大長度: 摘要最大字數
-    :param 最小長度: 摘要最小字數
-    :return: 返回摘要字串
     """
-    # transformers 需要將文本丟進 summarizer
-    results = summarizer(文本, max_length=最大長度, min_length=最小長度, do_sample=False)
-    return results[0]['summary_text']
-# 範例測試（可刪）
-if __name__ == "__main__":
-    測試文 = "人工智慧（Artificial Intelligence，簡稱 AI）是計算機科學的一個分支，..."
-    print("摘要結果：", 摘要(測試文))

+# textsumm.py
 from transformers import pipeline
+# 使用 pegasus 中文摘要模型
+summarizer = pipeline("summarization", model="IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese")
+def 摘要(text):
     """
+    傳入中文長文本，回傳中文摘要
     """
+    # Pegasus 最佳 max_length < 256，如需長摘要可微調
+    try:
+        result = summarizer(text, max_length=128, min_length=30, do_sample=False)
+        if isinstance(result, list) and len(result) > 0:
+            return result[0]['summary_text']
+        else:
+            return "❌ 無法產生摘要"
+    except Exception as e:
+        return f"❌ 摘要失敗: {str(e)}"