Spaces:
Sleeping
Sleeping
Upload 8 files
Browse files- README.md +14 -39
- app.py +53 -46
- gitattributes +1 -6
- pdfsum.py +13 -16
- requirements.txt +3 -4
- textsumm.py +14 -23
README.md
CHANGED
@@ -1,44 +1,19 @@
|
|
1 |
-
|
2 |
-
title: PDF 工具箱 (中文)
|
3 |
-
emoji: 📑
|
4 |
-
colorFrom: blue # 只能用 red, yellow, green, blue, indigo, purple, pink, gray
|
5 |
-
colorTo: purple
|
6 |
-
sdk: streamlit
|
7 |
-
sdk_version: 1.34.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
license: mit
|
11 |
-
---
|
12 |
-
|
13 |
-
# 📑 PDF 工具箱(中文)
|
14 |
-
|
15 |
-
這是一套多功能 PDF 與文本處理工具,內建多種 AI 智能應用,適用於學習、教學與日常工作!
|
16 |
|
17 |
## 主要功能
|
18 |
-
-
|
19 |
-
- **PDF
|
20 |
-
-
|
21 |
-
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
3. **根據需求選擇功能頁籤並操作**
|
30 |
-
|
31 |
-
## 注意事項
|
32 |
-
- PDF 上傳建議單檔不超過 10MB
|
33 |
-
- 文字摘要與 PDF 摘要模型預設為 `facebook/bart-large-cnn`(僅英文摘要),如需中文請改為 Pegasus、ChineseBART 等支援中文的 summarization 模型
|
34 |
-
- 你的 API 金鑰僅儲存在本地瀏覽器,不會外傳
|
35 |
-
- 本專案基於 MIT License
|
36 |
-
|
37 |
-
## 技術棧
|
38 |
-
- Streamlit
|
39 |
-
- PyPDF2、transformers、torch 等
|
40 |
|
41 |
---
|
42 |
|
43 |
-
|
44 |
-
|
|
|
1 |
+
# 📄 PDF 工具箱
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
## 主要功能
|
4 |
+
- **文字摘要**:支援中文長文摘要(Pegasus模型)
|
5 |
+
- **PDF 摘要**:PDF自動擷取中文摘要
|
6 |
+
- **論文搜尋**:arXiv關鍵字查詢
|
7 |
+
- 支援 Hugging Face Spaces,部署即用
|
8 |
+
|
9 |
+
## 安裝需求
|
10 |
+
```bash
pip install -r requirements.txt
```
|
11 |
+
## 使用方式
|
12 |
+
1. 打開 [Hugging Face Spaces](https://huggingface.co/spaces)
|
13 |
+
2. 上傳本專案全部檔案
|
14 |
+
3. 可選擇使用 GPU 加速
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
---
|
17 |
|
18 |
+
**中文用戶專屬!**
|
19 |
+
(如需進階功能,請洽站長)
|
app.py
CHANGED
@@ -1,59 +1,66 @@
|
|
|
|
1 |
import streamlit as st
|
2 |
from textsumm import 摘要
|
3 |
-
from pdfsum import
|
4 |
-
|
5 |
|
6 |
-
st.set_page_config(page_title="PDF 工具箱
|
7 |
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
index=0
|
13 |
-
)
|
14 |
|
15 |
-
|
16 |
-
st.sidebar.
|
17 |
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
19 |
st.header("📝 文字摘要")
|
20 |
-
|
21 |
if st.button("生成摘要"):
|
22 |
-
with st.spinner("
|
23 |
-
|
24 |
-
|
25 |
-
|
|
|
|
|
|
|
26 |
|
27 |
-
elif
|
28 |
st.header("📄 PDF 摘要")
|
29 |
-
|
30 |
-
if st.button("產生 PDF 摘要"):
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
st.
|
35 |
-
|
36 |
-
else:
|
37 |
-
st.warning("請先上傳 PDF 檔案")
|
38 |
-
|
39 |
-
elif 功能 == "論文搜尋(arXiv)":
|
40 |
st.header("🔎 論文搜尋(arXiv)")
|
41 |
-
|
42 |
-
max_results = st.slider("結果數量", 1,
|
43 |
-
|
44 |
-
|
45 |
-
start_year = st.number_input("起始年份", min_value=1991, max_value=2025, value=2011)
|
46 |
-
with col2:
|
47 |
-
end_year = st.number_input("結束年份", min_value=1991, max_value=2025, value=2025)
|
48 |
if st.button("搜尋論文"):
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
else:
|
54 |
-
|
55 |
-
with st.expander(f"📄 {idx}. {p['標題']}"):
|
56 |
-
st.write(f"**作者:** {p['作者']}")
|
57 |
-
st.write(f"**發表日期:** {p['發表日期']}")
|
58 |
-
st.write(f"**摘要:** {p['摘要']}")
|
59 |
-
st.write(f"[arXiv 連結]({p['arXiv 連結']})")
|
|
|
1 |
+
# app.py
|
2 |
import streamlit as st
|
3 |
from textsumm import 摘要
|
4 |
+
from pdfsum import 提取_pdf摘要
|
5 |
+
import requests
|
6 |
|
7 |
+
# Streamlit entry script for the PDF toolbox.
# set_page_config is the first Streamlit call in the script; keep it ahead of
# every other st.* call.
st.set_page_config(page_title="PDF 工具箱", page_icon="📄", layout="wide")

# Sidebar: optional OpenAI credentials and model choice.
# NOTE(review): api_key and model are collected here but nothing in this
# script reads them yet -- confirm whether a GPT-backed feature is planned.
st.sidebar.title("📄 PDF 工具箱")
st.sidebar.write("請輸入 OpenAI API 金鑰(非必填)")
api_key = st.sidebar.text_input("sk-...", type="password")

# GPT model selection
model = st.sidebar.radio("選擇 GPT 模型", options=["gpt-4", "gpt-4.0", "gpt-4.1", "gpt-4.5"], index=0)

# Tool selection: decides which of the three pages below is rendered.
tool = st.sidebar.radio("選擇功能", options=["文字摘要", "PDF 摘要", "論文搜尋"])

st.title("PDF 工具箱")

if tool == "文字摘要":
    st.header("📝 文字摘要")
    user_text = st.text_area("請輸入要摘要的中文內容")
    if st.button("生成摘要"):
        with st.spinner("摘要生成中..."):
            if user_text.strip():
                summary = 摘要(user_text.strip())
                st.success("摘要結果:")
                st.write(summary)
            else:
                st.warning("請輸入內容!")

elif tool == "PDF 摘要":
    st.header("📄 PDF 摘要")
    uploaded_file = st.file_uploader("上傳你的 PDF 文件", type=["pdf"])
    if uploaded_file is not None and st.button("產生 PDF 摘要"):
        with st.spinner("摘要生成中..."):
            summary = 提取_pdf摘要(uploaded_file, 摘要)
            st.success("摘要結果:")
            st.write(summary)

elif tool == "論文搜尋":
    st.header("🔎 論文搜尋(arXiv)")
    keyword = st.text_input("輸入主題或關鍵字", "量子")
    max_results = st.slider("結果數量", 1, 20, 5)
    start_year = st.number_input("起始年份", min_value=1990, max_value=2025, value=2019)
    end_year = st.number_input("結束年份", min_value=1990, max_value=2025, value=2025)
    if st.button("搜尋論文"):
        import xml.etree.ElementTree as ET

        atom_ns = "{http://www.w3.org/2005/Atom}"
        query = keyword.strip()
        if not query:
            st.warning("請輸入關鍵字!")
        else:
            root = None
            # The spinner disappears once the request finishes, unlike a
            # persistent st.info banner.
            with st.spinner("搜尋中..."):
                try:
                    # Passing the query via params lets requests URL-encode
                    # non-ASCII keywords, spaces and '&' correctly.
                    resp = requests.get(
                        "http://export.arxiv.org/api/query",
                        params={
                            "search_query": f"all:{query}",
                            "start": 0,
                            "max_results": max_results,
                        },
                        timeout=30,
                    )
                    if resp.ok:
                        root = ET.fromstring(resp.content)
                except (requests.RequestException, ET.ParseError):
                    # Network failure or a malformed feed: reported below as
                    # a failed query instead of crashing the page.
                    root = None

            if root is None:
                st.error("arXiv 查詢失敗")
            else:
                found = False
                for entry in root.findall(f"{atom_ns}entry"):
                    # The year filter runs client-side on the fetched page of
                    # results, so fewer than max_results items may be shown.
                    published = (entry.findtext(f"{atom_ns}published") or "")[:4]
                    if not published.isdigit():
                        continue
                    if start_year <= int(published) <= end_year:
                        found = True
                        # arXiv titles can contain line breaks, which would
                        # break the markdown link; collapse the whitespace.
                        title = " ".join((entry.findtext(f"{atom_ns}title") or "").split())
                        link = (entry.findtext(f"{atom_ns}id") or "").strip()
                        st.markdown(f"**[{title}]({link})**({published})")
                if not found:
                    st.warning("在所選年份範圍內沒有找到相關論文。")
|
|
|
|
|
|
|
|
|
|
gitattributes
CHANGED
@@ -1,6 +1 @@
|
|
1 |
-
|
2 |
-
*.pdf filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.jpg filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.png filter=lfs diff=lfs merge=lfs -text
|
5 |
-
|
6 |
-
# 中文註解:上面設定會讓 PDF/圖片走 Git LFS(大檔案友善處理)
|
|
|
1 |
+
* text=auto
|
|
|
|
|
|
|
|
|
|
pdfsum.py
CHANGED
@@ -1,18 +1,15 @@
|
|
1 |
-
|
2 |
-
from
|
3 |
|
4 |
-
def
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
all_text += page_text.strip() + "\n"
|
11 |
-
return all_text
|
12 |
|
13 |
-
def
|
14 |
-
|
15 |
-
if
|
16 |
-
return "
|
17 |
-
|
18 |
-
return 摘要(內容)
|
|
|
1 |
+
# pdfsum.py
|
2 |
+
from PyPDF2 import PdfReader
|
3 |
|
4 |
+
def 提取_pdf文本(pdf_file):
    """Return the text of every page of a PDF, joined with newlines.

    :param pdf_file: value handed straight to ``PdfReader``.
    :return: one string with the pages' extracted text separated by
        ``"\\n"``; a page whose extraction yields nothing contributes an
        empty string.
    """
    pdf_pages = PdfReader(pdf_file).pages
    # extract_text() may give back a falsy value; fall back to "" so the
    # join never sees a non-string.
    return "\n".join(pdf_page.extract_text() or "" for pdf_page in pdf_pages)
|
|
|
|
|
10 |
|
11 |
+
def 提取_pdf摘要(pdf_file, summarizer_func):
    """Extract a PDF's text and summarize it with the supplied callable.

    :param pdf_file: PDF input forwarded to ``提取_pdf文本``.
    :param summarizer_func: callable invoked with the full extracted text;
        its return value is passed back unchanged.
    :return: the summarizer's result, or a fixed error message when fewer
        than 30 characters remain after stripping the extracted text.
    """
    extracted = 提取_pdf文本(pdf_file)
    has_enough_text = len(extracted.strip()) >= 30
    # The summarizer receives the unstripped text; only the length check
    # uses the stripped form.
    return summarizer_func(extracted) if has_enough_text else "❌ 無法提取足夠文字內容"
|
|
requirements.txt
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
-
|
2 |
-
|
3 |
torch
|
4 |
PyPDF2
|
5 |
-
|
6 |
-
arxiv
|
|
|
1 |
+
transformers==4.41.1
|
2 |
+
streamlit==1.35.0
|
3 |
torch
|
4 |
PyPDF2
|
5 |
+
requests
|
|
textsumm.py
CHANGED
@@ -1,28 +1,19 @@
|
|
|
|
1 |
from transformers import pipeline
|
2 |
|
3 |
-
#
|
4 |
-
|
5 |
-
# device=0(如果有 GPU)可加速;沒有 GPU 可以移除 device 參數
|
6 |
-
summarizer = pipeline(
|
7 |
-
"summarization",
|
8 |
-
model="IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese",
|
9 |
-
tokenizer="IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese",
|
10 |
-
device=0 # 若在 CPU 請刪除這一行
|
11 |
-
)
|
12 |
|
13 |
-
def 摘要(
|
14 |
"""
|
15 |
-
|
16 |
-
:param 文本: 輸入的待摘要中文文本
|
17 |
-
:param 最大長度: 摘要最大字數
|
18 |
-
:param 最小長度: 摘要最小字數
|
19 |
-
:return: 返回摘要字串
|
20 |
"""
|
21 |
-
#
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
|
|
|
1 |
+
# textsumm.py
|
2 |
from transformers import pipeline
|
3 |
|
4 |
+
# 使用 pegasus 中文摘要模型
|
5 |
+
# Built once at module import so every summarization call reuses the same
# pipeline object instead of constructing a new one per request.
summarizer = pipeline("summarization", model="IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
+
def 摘要(text):
    """Summarize a Chinese text and return the summary as a string.

    :param text: the text to summarize.
    :return: the generated summary. Failures are never raised: empty or
        whitespace-only input and an empty model result return
        "❌ 無法產生摘要", and any exception from the pipeline is returned as
        a "❌ 摘要失敗: ..." message, so callers can display the result as-is.
    """
    # Nothing to summarize: skip the model call entirely.
    if not text or not text.strip():
        return "❌ 無法產生摘要"
    # Pegasus works best with max_length < 256; tune for longer summaries.
    try:
        # truncation=True clips inputs longer than the model's maximum
        # input length (e.g. a whole PDF) instead of failing on them.
        result = summarizer(
            text,
            max_length=128,
            min_length=30,
            do_sample=False,
            truncation=True,
        )
        if isinstance(result, list) and len(result) > 0:
            return result[0]['summary_text']
        else:
            return "❌ 無法產生摘要"
    except Exception as e:
        # Deliberately broad: this is the UI boundary and callers expect a
        # displayable string rather than an exception.
        return f"❌ 摘要失敗: {str(e)}"
|