Spaces:
Sleeping
Sleeping
Upload 8 files
Browse files- README.md +29 -22
- app.py +45 -118
- papersearch.py +18 -29
- pdfsum.py +16 -30
- requirements.txt +2 -7
- textsumm.py +22 -31
README.md
CHANGED
@@ -1,37 +1,44 @@
|
|
1 |
---
|
2 |
-
title: PDF
|
3 |
-
emoji:
|
4 |
-
colorFrom: blue
|
5 |
-
colorTo:
|
6 |
sdk: streamlit
|
7 |
-
sdk_version: 1.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
11 |
---
|
12 |
|
13 |
-
#
|
14 |
|
15 |
-
|
16 |
|
17 |
-
|
18 |
-
-
|
19 |
-
- **PDF
|
20 |
-
- **
|
21 |
-
-
|
22 |
-
-
|
|
|
|
|
23 |
|
24 |
## 使用說明
|
|
|
|
|
|
|
25 |
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
30 |
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
本專案歡迎改進建議或功能增補,請於 Hugging Face 或 GitHub 提出 issue。
|
36 |
|
37 |
---
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: PDF 工具箱 (中文)
|
3 |
+
emoji: 📑
|
4 |
+
colorFrom: blue # 只能用 red, yellow, green, blue, indigo, purple, pink, gray
|
5 |
+
colorTo: purple
|
6 |
sdk: streamlit
|
7 |
+
sdk_version: 1.34.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
11 |
---
|
12 |
|
13 |
+
# 📑 PDF 工具箱(中文)
|
14 |
|
15 |
+
這是一套多功能 PDF 與文本處理工具,內建多種 AI 智能應用,適用於學習、教學與日常工作!
|
16 |
|
17 |
+
## 主要功能
|
18 |
+
- **文字摘要**:輸入中英文長文,自動生成摘要
|
19 |
+
- **PDF 摘要**:上傳 PDF,提取內容並自動生成摘要
|
20 |
+
- **PDF 密碼移除**:上傳受密碼保護的 PDF,解除密碼(需輸入正確密碼)
|
21 |
+
- **論文搜尋(arXiv)**:輸入主題關鍵字,快速查找與摘要學術論文
|
22 |
+
- **PDF 合併**:多個 PDF 一鍵合併
|
23 |
+
- **PDF 拆頁**:將 PDF 分割成多份
|
24 |
+
- **PDF 轉純文字**:提取 PDF 文字內容
|
25 |
|
26 |
## 使用說明
|
27 |
+
1. **輸入 OpenAI API 金鑰**(如有使用 GPT 文字/PDF 摘要功能)
|
28 |
+
2. **選擇 GPT 模型**(支援 gpt-4, gpt-4.0, gpt-4.1, gpt-4.5)
|
29 |
+
3. **根據需求選擇功能頁籤並操作**
|
30 |
|
31 |
+
## 注意事項
|
32 |
+
- PDF 上傳建議單檔不超過 10MB
|
33 |
+
- 文字摘要與 PDF 摘要模型預設為 `IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese`(Pegasus 中文摘要模型);如需摘要英文內容,可改用 `facebook/bart-large-cnn` 等英文 summarization 模型
|
34 |
+
- 你的 API 金鑰僅儲存在本地瀏覽器,不會外傳
|
35 |
+
- 本專案基於 MIT License
|
36 |
|
37 |
+
## 技術棧
|
38 |
+
- Streamlit
|
39 |
+
- PyPDF2、transformers、torch、sentencepiece、arxiv 等
|
|
|
|
|
40 |
|
41 |
---
|
42 |
+
|
43 |
+
> 製作者:阿亮老師
|
44 |
+
> 非經允許、不得私自修改應用!
|
app.py
CHANGED
@@ -1,132 +1,59 @@
|
|
1 |
import streamlit as st
|
2 |
-
import
|
3 |
-
from
|
4 |
-
from
|
5 |
-
from pdfpass import 移除_pdf密碼
|
6 |
-
from papersearch import 抓取論文, 篩選論文依年份
|
7 |
-
from io import BytesIO
|
8 |
-
from datetime import datetime
|
9 |
-
from pypdf import PdfReader, PdfWriter
|
10 |
|
11 |
-
|
12 |
-
st.set_page_config(page_title="PDF 工具箱", page_icon="📄", layout="wide")
|
13 |
|
14 |
-
|
15 |
-
st.sidebar.
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
if api_key:
|
20 |
-
openai.api_key = api_key
|
21 |
-
else:
|
22 |
-
st.sidebar.warning("請輸入你的 OpenAI API Key(sk- 或 sk-proj- 開頭)")
|
23 |
-
|
24 |
-
# ---- 分頁功能 ----
|
25 |
-
page = st.sidebar.radio(
|
26 |
-
"選擇功能",
|
27 |
-
[
|
28 |
-
"文字摘要",
|
29 |
-
"PDF 摘要",
|
30 |
-
"PDF 密碼移除",
|
31 |
-
"論文搜尋",
|
32 |
-
"PDF 合併",
|
33 |
-
"PDF 拆頁",
|
34 |
-
"PDF 轉純文字"
|
35 |
-
]
|
36 |
)
|
37 |
|
38 |
-
|
39 |
-
|
40 |
-
st.title("📝 文字摘要")
|
41 |
-
user_input = st.text_area("請輸入要摘要的文字")
|
42 |
-
if st.button("生成摘要"):
|
43 |
-
if not api_key:
|
44 |
-
st.error("請先輸入 OpenAI API 金鑰!")
|
45 |
-
else:
|
46 |
-
結果 = 文字摘要(user_input)
|
47 |
-
st.subheader("摘要結果")
|
48 |
-
st.write(結果[0]["summary_text"])
|
49 |
|
50 |
-
|
51 |
-
|
52 |
-
st.
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
段落們 = 分段(pdf_text)
|
57 |
-
全部摘要 = " ".join(摘要(段落們))
|
58 |
st.subheader("摘要結果")
|
59 |
-
st.
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
st.
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
st.success(
|
70 |
-
st.download_button("下載已解鎖的 PDF", data=output, file_name="unlocked_pdf.pdf", mime="application/pdf")
|
71 |
else:
|
72 |
-
st.
|
73 |
|
74 |
-
|
75 |
-
|
76 |
-
st.
|
77 |
-
|
78 |
-
max_results = st.slider("結果數量", 1, 50, 10)
|
79 |
col1, col2 = st.columns(2)
|
80 |
with col1:
|
81 |
-
start_year = st.number_input("起始年份", min_value=
|
82 |
with col2:
|
83 |
-
end_year = st.number_input("結束年份", min_value=
|
84 |
if st.button("搜尋論文"):
|
85 |
-
|
86 |
-
|
87 |
-
if
|
88 |
-
|
89 |
-
st.write(f"### {idx}. {論文['標題']}")
|
90 |
-
st.write(f"**作者**: {', '.join(論文['作者'])}")
|
91 |
-
st.write(f"**發表時間**: {論文['發表時間']}")
|
92 |
-
st.write(f"[閱讀全文]({論文['連結']})")
|
93 |
-
st.write("---")
|
94 |
else:
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
if uploaded_files and st.button("合併 PDF"):
|
102 |
-
pdf_writer = PdfWriter()
|
103 |
-
for file in uploaded_files:
|
104 |
-
pdf_reader = PdfReader(file)
|
105 |
-
for page in pdf_reader.pages:
|
106 |
-
pdf_writer.add_page(page)
|
107 |
-
output = BytesIO()
|
108 |
-
pdf_writer.write(output)
|
109 |
-
output.seek(0)
|
110 |
-
st.download_button("下載合併後的 PDF", data=output, file_name="merged.pdf", mime="application/pdf")
|
111 |
-
|
112 |
-
# PDF 拆頁
|
113 |
-
elif page == "PDF 拆頁":
|
114 |
-
st.title("✂️ PDF 拆頁")
|
115 |
-
uploaded_file = st.file_uploader("上傳一個 PDF", type=["pdf"])
|
116 |
-
if uploaded_file:
|
117 |
-
pdf_reader = PdfReader(uploaded_file)
|
118 |
-
for i, page in enumerate(pdf_reader.pages):
|
119 |
-
pdf_writer = PdfWriter()
|
120 |
-
pdf_writer.add_page(page)
|
121 |
-
output = BytesIO()
|
122 |
-
pdf_writer.write(output)
|
123 |
-
output.seek(0)
|
124 |
-
st.download_button(f"下載第 {i+1} 頁", data=output, file_name=f"page_{i+1}.pdf", mime="application/pdf")
|
125 |
-
|
126 |
-
# PDF 轉純文字
|
127 |
-
elif page == "PDF 轉純文字":
|
128 |
-
st.title("📜 PDF 轉純文字")
|
129 |
-
uploaded_file = st.file_uploader("上傳 PDF", type=["pdf"])
|
130 |
-
if uploaded_file:
|
131 |
-
pdf_text = 提取_pdf文字(uploaded_file)
|
132 |
-
st.text_area("擷取內容", pdf_text, height=300)
|
|
|
1 |
import streamlit as st
|
2 |
+
from textsumm import 摘要
|
3 |
+
from pdfsum import pdf摘要
|
4 |
+
from papersearch import 論文搜尋
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
+
from datetime import date

# Page-wide settings, issued before any other Streamlit command in this script.
st.set_page_config(page_title="PDF 工具箱 (中文)", page_icon=":books:", layout="wide")

# Sidebar: app title plus the feature selector that drives the branches below.
st.sidebar.title("📑 PDF 工具箱")
功能 = st.sidebar.radio(
    "請選擇功能",
    ["文字摘要", "PDF 摘要", "論文搜尋(arXiv)"],
    index=0
)

st.sidebar.markdown("---")
st.sidebar.markdown("本應用支援中文摘要(Pegasus 中文模型)")

if 功能 == "文字摘要":
    # Free-text summarization: send the text area content to 摘要().
    st.header("📝 文字摘要")
    text = st.text_area("請輸入要摘要的文字")
    if st.button("生成摘要"):
        with st.spinner("AI 生成中..."):
            summary = 摘要(text)
        st.subheader("摘要結果")
        st.success(summary)

elif 功能 == "PDF 摘要":
    # PDF summarization: the uploaded file object is handed to pdf摘要().
    st.header("📄 PDF 摘要")
    pdf_file = st.file_uploader("請上傳 PDF 檔案", type=["pdf"])
    if st.button("產生 PDF 摘要"):
        if pdf_file is not None:
            with st.spinner("AI 解析中..."):
                summary = pdf摘要(pdf_file)
            st.subheader("PDF 摘要結果")
            st.success(summary)
        else:
            st.warning("請先上傳 PDF 檔案")

elif 功能 == "論文搜尋(arXiv)":
    # arXiv search: keyword, result count and a publication-year window.
    st.header("🔎 論文搜尋(arXiv)")
    關鍵字 = st.text_input("輸入主題或關鍵字")
    max_results = st.slider("結果數量", 1, 30, 10)
    # Use the current year as the upper bound so the pickers do not go stale
    # (the bound used to be the literal 2025).
    current_year = date.today().year
    col1, col2 = st.columns(2)
    with col1:
        start_year = st.number_input("起始年份", min_value=1991, max_value=current_year, value=2011)
    with col2:
        end_year = st.number_input("結束年份", min_value=1991, max_value=current_year, value=current_year)
    if st.button("搜尋論文"):
        if not 關鍵字.strip():
            # Do not query arXiv with a blank keyword.
            st.warning("請先輸入主題或關鍵字")
        elif start_year > end_year:
            # An inverted range can never match; tell the user instead of
            # reporting "no papers found".
            st.warning("起始年份不可晚於結束年份")
        else:
            with st.spinner("搜尋中..."):
                papers = 論文搜尋(關鍵字, max_results, start_year, end_year)
            if not papers:
                st.info("在所選年份範圍內沒有找到相關論文。")
            else:
                # One collapsible entry per paper, numbered from 1.
                for idx, p in enumerate(papers, 1):
                    with st.expander(f"📄 {idx}. {p['標題']}"):
                        st.write(f"**作者:** {p['作者']}")
                        st.write(f"**發表日期:** {p['發表日期']}")
                        st.write(f"**摘要:** {p['摘要']}")
                        st.write(f"[arXiv 連結]({p['arXiv 連結']})")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
papersearch.py
CHANGED
@@ -1,31 +1,20 @@
|
|
1 |
-
import
|
2 |
-
import xml.etree.ElementTree as ET
|
3 |
-
from datetime import datetime
|
4 |
|
5 |
-
def
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
"
|
17 |
-
"
|
18 |
-
"
|
|
|
|
|
19 |
})
|
20 |
-
return
|
21 |
-
|
22 |
-
def 篩選論文依年份(論文清單, 起始, 結束):
    """Return the papers whose publication year lies in the inclusive range.

    Args:
        論文清單: Iterable of paper dicts; each must have a "發表時間" entry
            whose first four characters are the year.
        起始: First year to keep (inclusive).
        結束: Last year to keep (inclusive).

    Returns:
        list: The matching paper dicts, in their original order.
    """
    return [
        paper
        for paper in 論文清單
        if 起始 <= int(paper["發表時間"][:4]) <= 結束
    ]
|
|
|
1 |
+
import arxiv
|
|
|
|
|
2 |
|
3 |
+
def 論文搜尋(關鍵字, max_results=10, start_year=2000, end_year=2025):
    """Search arXiv and return papers published within a year range.

    The query asks arXiv for at most ``max_results`` entries sorted by
    submission date; the year filter is applied afterwards, so fewer than
    ``max_results`` papers may be returned.

    Args:
        關鍵字: Query string passed to ``arxiv.Search``.
        max_results: Maximum number of entries requested from arXiv.
        start_year: First publication year to keep (inclusive).
        end_year: Last publication year to keep (inclusive).

    Returns:
        list[dict]: One dict per paper with the keys "標題", "作者"
        (comma-separated names), "發表日期" (first 10 characters of the
        published timestamp), "摘要" and "arXiv 連結". Empty when the keyword
        is blank or nothing matches.
    """
    # A blank keyword cannot produce a meaningful query; skip the network call.
    if not 關鍵字 or not 關鍵字.strip():
        return []

    search = arxiv.Search(
        query=關鍵字,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate
    )
    # Search.results() is deprecated in arxiv 2.x; Client.results() is the
    # supported way to execute a search.
    client = arxiv.Client()
    papers = []
    for result in client.results(search):
        if not (start_year <= result.published.year <= end_year):
            continue
        papers.append({
            "標題": result.title,
            "作者": ", ".join(a.name for a in result.authors),
            "發表日期": str(result.published)[:10],
            "摘要": result.summary,
            "arXiv 連結": result.entry_id
        })
    return papers
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pdfsum.py
CHANGED
@@ -1,32 +1,18 @@
|
|
1 |
-
|
2 |
-
from
|
3 |
|
4 |
-
|
5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
-
def
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
內容
|
13 |
-
for 頁面 in reader.pages:
|
14 |
-
內容 += 頁面.extract_text()
|
15 |
-
return 內容
|
16 |
-
|
17 |
-
def 分段(內容, 每段字數=2000):
    """Cut a long text into consecutive fixed-size pieces.

    Args:
        內容: The text to split.
        每段字數: Number of characters per piece; the last piece may be shorter.

    Returns:
        list[str]: The pieces in order; empty when 內容 is empty.
    """
    pieces = []
    for begin in range(0, len(內容), 每段字數):
        pieces.append(內容[begin:begin + 每段字數])
    return pieces
|
22 |
-
|
23 |
-
def 摘要(段落們):
    """Summarize each paragraph with the module-level ``summarizer``.

    Args:
        段落們: Iterable of text chunks.

    Returns:
        list[str]: The "summary_text" of the first result for each chunk, in
        input order. Joining them into one summary is left to the caller.
    """
    return [
        summarizer(chunk, max_length=130, min_length=30, do_sample=False)[0]["summary_text"]
        for chunk in 段落們
    ]
|
|
|
1 |
+
import PyPDF2
|
2 |
+
from textsumm import 摘要
|
3 |
|
4 |
+
def pdf抽取文字(pdf_file):
    """Extract the text of every page of a PDF into a single string.

    Args:
        pdf_file: Source handed to ``PyPDF2.PdfReader``; per the original
            author's note it comes from ``st.file_uploader`` and is a
            BytesIO-like object.

    Returns:
        str: Each page's text with surrounding whitespace stripped, every page
        followed by a newline. Pages without extractable text contribute only
        the newline.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # extract_text() can come back falsy (e.g. None); treat that as "".
    stripped_pages = [(page.extract_text() or "").strip() for page in reader.pages]
    return "".join(page_text + "\n" for page_text in stripped_pages)
|
12 |
|
13 |
+
def pdf摘要(pdf_file):
    """Summarize the text content of a PDF.

    Args:
        pdf_file: PDF source forwarded unchanged to ``pdf抽取文字``.

    Returns:
        str: The result of ``摘要`` on the extracted text, or a warning message
        when no text could be extracted (blank or whitespace-only output).
    """
    extracted = pdf抽取文字(pdf_file)
    if extracted.strip():
        # The whole text is summarized in one call; splitting it per page and
        # summarizing each part is a possible extension.
        return 摘要(extracted)
    return "⚠️ PDF 無可讀文字或為掃描檔,請上傳可解析之 PDF"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
@@ -1,11 +1,6 @@
|
|
1 |
streamlit
|
2 |
-
openai
|
3 |
-
pypdf
|
4 |
transformers
|
5 |
torch
|
6 |
-
sentencepiece
|
7 |
-
protobuf
|
8 |
-
pikepdf
|
9 |
-
requests
|
10 |
-
tqdm
|
11 |
PyPDF2
|
|
|
|
|
|
1 |
streamlit
|
|
|
|
|
2 |
transformers
|
3 |
torch
|
|
|
|
|
|
|
|
|
|
|
4 |
PyPDF2
|
5 |
+
sentencepiece
|
6 |
+
arxiv
|
textsumm.py
CHANGED
@@ -1,31 +1,22 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
def
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
)
|
24 |
-
return 結果[0]['summary_text']
|
25 |
-
except Exception as e:
|
26 |
-
return f"❌ 摘要生成失敗:{str(e)}"
|
27 |
-
|
28 |
-
# 若你要測試,可以取消下面註解
|
29 |
-
# if __name__ == "__main__":
|
30 |
-
# 測試文本 = "人工智慧(AI)是研究如何讓電腦模擬人類智能行為的學科,包括學習、推理、規劃、自然語言處理、知覺等。AI 技術已廣泛應用於語音辨識、影像分析、自駕車等領域。"
|
31 |
-
# print(文字摘要(測試文本))
|
|
|
1 |
+
from transformers import pipeline
|
2 |
+
|
3 |
+
# Summarization pipeline built once at import time, using a Chinese-capable
# Pegasus model from the Hugging Face Hub.
# NOTE(review): confirm this checkpoint's tokenizer loads correctly through
# the generic pipeline() call — verify against the model card.
summarizer = pipeline(
    task="summarization",
    model="IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese",
)
|
8 |
+
|
9 |
+
def 摘要(text, max_length=128, min_length=20, max_input_chars=1500):
    """Summarize ``text`` with the module-level ``summarizer`` pipeline.

    Args:
        text: The text to summarize. ``None``, empty or whitespace-only input
            yields a warning message instead of a model call.
        max_length: Forwarded to the pipeline as ``max_length``.
        min_length: Forwarded to the pipeline as ``min_length``.
        max_input_chars: Only the first ``max_input_chars`` characters of
            ``text`` are sent to the model; ``None`` disables the cut.

    Returns:
        str: The summary text, the warning message for blank input, or the
        string form of the pipeline result when its shape is not recognized.
    """
    if not text or not text.strip():
        return "⚠️ 請輸入要摘要的內容"

    # Crude character-based cut to keep the input within what the model can
    # take; slicing is a no-op when the text is already short enough.
    if max_input_chars is not None:
        text = text[:max_input_chars]

    result = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)

    # Cope with different return shapes. Check for a non-empty list of dicts
    # before indexing so an empty or unexpected result cannot raise.
    if isinstance(result, list) and result:
        first = result[0]
        if isinstance(first, dict) and "summary_text" in first:
            return first["summary_text"]
    if isinstance(result, str):
        return result
    return str(result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|