Upload 8 files
- README.md +37 -12
- app.py +95 -72
- gitattributes +6 -35
- papersearch.py +28 -151
- pdfpass.py +8 -28
- pdfsum.py +30 -123
- requirements.txt +0 -0
- textsumm.py +7 -24
README.md
CHANGED
@@ -1,12 +1,37 @@
----
-title:
-emoji:
-colorFrom:
-colorTo:
-sdk: streamlit
-sdk_version: 1.
-app_file: app.py
-pinned: false
+---
+title: PDF工具箱(多功能PDF助手)
+emoji: 📄
+colorFrom: blue
+colorTo: green
+sdk: streamlit
+sdk_version: 1.35.0
+app_file: app.py
+pinned: false
+license: mit
+---
+
+# 📄 PDF Toolbox (all-in-one)
+
+This is a multi-purpose PDF processing platform with a fully Chinese-localized interface, supporting:
+
+- **Text summarization**: automatically generate key-point summaries with the OpenAI GPT-4/4.1/4.5 models
+- **PDF summarization**: summarize long PDF documents
+- **PDF password removal**: strip the password from encrypted PDFs
+- **arXiv paper search**: search and filter papers from a Chinese-language interface
+- **PDF merging**, **splitting**, **text extraction**, and more
+- A fully Chinese interface and instructions, suited to education, research, and administrative work
+
+## Usage
+
+1. Enter your OpenAI API key in the sidebar (it starts with sk- or sk-proj-)
+2. Choose the GPT model you need (gpt-4, gpt-4.1, gpt-4.5)
+3. Pick a feature tab on the left and upload files as required
+4. Every step comes with prompts in Chinese
+
+> 💡 **Note**: your API key is only used for the current session and is never stored on the server.
+
+## Contact and contributions
+
+Suggestions for improvements and new features are welcome; please open an issue on Hugging Face or GitHub.
+
+---
app.py
CHANGED
@@ -1,81 +1,104 @@
-if st.button("Summarize"):
-    summary = summarizer(user_input, max_length=130, min_length=30, do_sample=False)
-    st.subheader("Summary")
-    st.write(summary[0]["summary_text"])
 import streamlit as st
+import openai
+from textsumm import 文字摘要
+from pdfsum import 提取_pdf文字, 分段, 摘要
+from pdfpass import 移除_pdf密碼
+from papersearch import 抓取論文, 篩選論文依年份
 from io import BytesIO
 from datetime import datetime
 from pypdf import PdfReader, PdfWriter

+# ---- Must run before any other st.* call ----
+st.set_page_config(page_title="PDF 工具箱", page_icon="📄", layout="wide")

+# ---- Sidebar (API key and model selection) ----
+st.sidebar.title("📄 PDF 工具箱")
+api_key = st.sidebar.text_input("請輸入 OpenAI API 金鑰", type="password", placeholder="sk-...")
+selected_model = st.sidebar.radio("選擇 GPT 模型", ["gpt-4", "gpt-4.0", "gpt-4.1", "gpt-4.5"], index=0)

+if api_key:
+    openai.api_key = api_key
+else:
+    st.sidebar.warning("請輸入你的 OpenAI API Key(sk- 或 sk-proj- 開頭)")

+# ---- Feature pages ----
+page = st.sidebar.radio(
+    "選擇功能",
+    [
+        "文字摘要",
+        "PDF 摘要",
+        "PDF 密碼移除",
+        "論文搜尋",
+        "PDF 合併",
+        "PDF 拆頁",
+        "PDF 轉純文字"
+    ]
+)

+# Text summarization
+if page == "文字摘要":
+    st.title("📝 文字摘要")
+    user_input = st.text_area("請輸入要摘要的文字")
+    if st.button("生成摘要"):
+        if not api_key:
+            st.error("請先輸入 OpenAI API 金鑰!")
+        else:
+            結果 = 文字摘要(user_input)
+            st.subheader("摘要結果")
+            st.write(結果[0]["summary_text"])
+
+# PDF summarization
+elif page == "PDF 摘要":
+    st.title("📜 PDF 摘要")
+    uploaded_file = st.file_uploader("上傳你的 PDF 檔案", type=["pdf"])
+    if uploaded_file is not None and st.button("產生 PDF 摘要"):
+        pdf_text = 提取_pdf文字(uploaded_file)
+        段落們 = 分段(pdf_text)
+        全部摘要 = " ".join(摘要(段落們))
+        st.subheader("摘要結果")
+        st.write(全部摘要)
+
+# PDF password removal
+elif page == "PDF 密碼移除":
+    st.title("🔑 PDF 密碼移除")
+    uploaded_file = st.file_uploader("選擇需要解鎖的 PDF 檔案", type=["pdf"])
+    password = st.text_input("請輸入 PDF 密碼", type="password")
+    if uploaded_file and password and st.button("移除密碼"):
+        output = 移除_pdf密碼(uploaded_file, password)
         if isinstance(output, BytesIO):
+            st.success("密碼移除成功!")
+            st.download_button("下載已解鎖的 PDF", data=output, file_name="unlocked_pdf.pdf", mime="application/pdf")
         else:
+            st.error(f"錯誤:{output}")

+# arXiv paper search
+elif page == "論文搜尋":
+    st.title("🔍 論文搜尋(arXiv)")
+    query = st.text_input("輸入主題或關鍵字", placeholder="例如:人工智慧、量子計算")
+    max_results = st.slider("結果數量", 1, 50, 10)
     col1, col2 = st.columns(2)
     with col1:
+        start_year = st.number_input("起始年份", min_value=1900, max_value=datetime.now().year, value=2000)
     with col2:
+        end_year = st.number_input("結束年份", min_value=1900, max_value=datetime.now().year, value=datetime.now().year)
+    if st.button("搜尋論文"):
+        papers = 抓取論文(query, max_results)
+        篩選後 = 篩選論文依年份(papers, start_year, end_year)
+        if 篩選後:
+            for idx, 論文 in enumerate(篩選後, start=1):
+                st.write(f"### {idx}. {論文['標題']}")
+                st.write(f"**作者**: {', '.join(論文['作者'])}")
+                st.write(f"**發表時間**: {論文['發表時間']}")
+                st.write(f"[閱讀全文]({論文['連結']})")
                 st.write("---")
         else:
+            st.warning("在所選年份範圍內沒有找到相關論文。")

+# PDF merge
+elif page == "PDF 合併":
+    st.title("📎 多檔 PDF 合併")
+    uploaded_files = st.file_uploader("上傳多個 PDF 檔案", type=["pdf"], accept_multiple_files=True)
+    if uploaded_files and st.button("合併 PDF"):
         pdf_writer = PdfWriter()
         for file in uploaded_files:
             pdf_reader = PdfReader(file)
@@ -84,12 +107,12 @@ elif page == "PDF Merger":
         output = BytesIO()
         pdf_writer.write(output)
         output.seek(0)
+        st.download_button("下載合併後的 PDF", data=output, file_name="merged.pdf", mime="application/pdf")

+# PDF split
+elif page == "PDF 拆頁":
+    st.title("✂️ PDF 拆頁")
+    uploaded_file = st.file_uploader("上傳一個 PDF", type=["pdf"])
     if uploaded_file:
         pdf_reader = PdfReader(uploaded_file)
         for i, page in enumerate(pdf_reader.pages):
@@ -98,12 +121,12 @@ elif page == "PDF Splitter":
             output = BytesIO()
             pdf_writer.write(output)
             output.seek(0)
+            st.download_button(f"下載第 {i+1} 頁", data=output, file_name=f"page_{i+1}.pdf", mime="application/pdf")

+# PDF to plain text
+elif page == "PDF 轉純文字":
+    st.title("📜 PDF 轉純文字")
+    uploaded_file = st.file_uploader("上傳 PDF", type=["pdf"])
     if uploaded_file:
+        pdf_text = 提取_pdf文字(uploaded_file)
+        st.text_area("擷取內容", pdf_text, height=300)
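The per-page copy in the merge branch falls inside an elided hunk; below is a minimal sketch (not part of the commit) of the same merge flow with pypdf, assuming pages are appended with `PdfWriter.add_page` the way pdfpass.py does.

```python
# Hypothetical standalone sketch of the "PDF 合併" flow, outside Streamlit.
# Assumes the elided hunk appends pages with PdfWriter.add_page, as pdfpass.py does.
from io import BytesIO
from pypdf import PdfReader, PdfWriter

def merge_pdfs(paths):
    writer = PdfWriter()
    for path in paths:
        reader = PdfReader(path)
        for page in reader.pages:
            writer.add_page(page)   # copy every page into the combined document
    output = BytesIO()
    writer.write(output)            # same BytesIO + write + seek pattern as app.py
    output.seek(0)
    return output

# merged = merge_pdfs(["a.pdf", "b.pdf"])   # file names are illustrative
# open("merged.pdf", "wb").write(merged.getvalue())
```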
gitattributes
CHANGED
@@ -1,35 +1,6 @@
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Git LFS attributes file (used to control large files)
+*.pdf filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+
+# Note: the rules above route PDFs and images through Git LFS (friendlier handling of large files)
papersearch.py
CHANGED
@@ -1,154 +1,31 @@
-import streamlit as st
-
-def filter_papers_by_year(papers, start_year, end_year):
-    """Filter papers by the publication year range."""
-    filtered_papers = []
-    for paper in papers:
-        try:
-            published_year = int(paper['published'][:4])  # Extract year from the published date
-            if start_year <= published_year <= end_year:
-                filtered_papers.append(paper)
-        except ValueError:
-            continue  # Skip if the year is not valid
-    return filtered_papers
-
-# Streamlit app UI
-st.title("arXiv Research Paper Search")
-st.subheader("Find academic papers on your topic of interest")
-
-# Input fields
-query = st.text_input("Enter a topic or keywords", placeholder="e.g., machine learning, quantum computing")
-max_results = st.slider("Number of results", min_value=1, max_value=50, value=10)
-
-# Year filter
-col1, col2 = st.columns(2)
-with col1:
-    start_year = st.number_input("Start Year", min_value=1900, max_value=datetime.now().year, value=2000, step=1)
-with col2:
-    end_year = st.number_input("End Year", min_value=1900, max_value=datetime.now().year, value=datetime.now().year, step=1)
-
-if st.button("Search"):
-    if query.strip():
-        st.info(f"Searching for papers on: **{query}**")
-        papers = fetch_papers(query, max_results)
-
-        # Filter papers by year
-        papers_filtered = filter_papers_by_year(papers, start_year, end_year)
-
-        if papers_filtered:
-            st.success(f"Found {len(papers_filtered)} papers between {start_year} and {end_year}!")
-            for idx, paper in enumerate(papers_filtered, start=1):
-                st.write(f"### {idx}. {paper['title']}")
-                st.write(f"**Authors**: {', '.join(paper['authors'])}")
-                st.write(f"**Published**: {paper['published']}")
-                st.write(f"[Read More]({paper['link']})")
-                st.write("---")
-        else:
-            st.warning(f"No papers found between {start_year} and {end_year}. Try a different query or adjust the year range.")
-    else:
-        st.error("Please enter a topic or keywords to search.")
 import requests
+import xml.etree.ElementTree as ET
 from datetime import datetime

+def 抓取論文(關鍵字, 最大數量=10):
+    """
+    Search arXiv for papers matching the keyword (most recently updated first).
+    """
+    url = f"https://export.arxiv.org/api/query?search_query=all:{關鍵字}&start=0&max_results={最大數量}&sortBy=lastUpdatedDate"
+    res = requests.get(url)
+    root = ET.fromstring(res.content)
+    論文清單 = []
+    for entry in root.findall('{http://www.w3.org/2005/Atom}entry'):
+        論文清單.append({
+            "標題": entry.find('{http://www.w3.org/2005/Atom}title').text.strip(),
+            "作者": [author.find('{http://www.w3.org/2005/Atom}name').text for author in entry.findall('{http://www.w3.org/2005/Atom}author')],
+            "發表時間": entry.find('{http://www.w3.org/2005/Atom}published').text[:10],
+            "連結": entry.find('{http://www.w3.org/2005/Atom}id').text
+        })
+    return 論文清單
+
+def 篩選論文依年份(論文清單, 起始, 結束):
+    """
+    Filter papers by publication year (inclusive range).
+    """
+    篩選 = []
+    for 論文 in 論文清單:
+        年份 = int(論文["發表時間"][:4])
+        if 起始 <= 年份 <= 結束:
+            篩選.append(論文)
+    return 篩選
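For reference, a minimal sketch (not part of the commit) of driving the two helpers above outside Streamlit; the query and year range are illustrative, and the dictionary keys (標題, 作者, 發表時間, 連結) are the ones 抓取論文 builds.

```python
# Hypothetical standalone usage of the arXiv helpers above.
from papersearch import 抓取論文, 篩選論文依年份

papers = 抓取論文("transformers", 最大數量=5)      # illustrative query
recent = 篩選論文依年份(papers, 2022, 2025)          # illustrative year range
for 論文 in recent:
    # Keys come from 抓取論文: 標題, 作者, 發表時間, 連結
    print(論文["發表時間"], 論文["標題"], 論文["連結"])
```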
pdfpass.py
CHANGED
@@ -1,40 +1,20 @@
-from PyPDF2 import PdfReader, PdfWriter
-
-st.title("PDF Password Remover")
-st.write("Upload a password-protected PDF and remove its password.")
-
-# File upload
-uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
-password = st.text_input("Enter the PDF password", type="password")
-
-if uploaded_file and password:
-    if st.button("Remove Password"):
-        output = remove_pdf_password(uploaded_file, password)
-        if isinstance(output, BytesIO):
-            st.success("Password removed successfully!")
-            st.download_button(
-                label="Download PDF without Password",
-                data=output,
-                file_name="unlocked_pdf.pdf",
-                mime="application/pdf",
-            )
-        else:
-            st.error(f"Error: {output}")
+from pypdf import PdfReader, PdfWriter
 from io import BytesIO

+def 移除_pdf密碼(pdf檔案, 密碼):
+    """
+    Unlock a password-protected PDF; return the unlocked file (BytesIO) or an error message.
+    """
     try:
+        reader = PdfReader(pdf檔案)
         if reader.is_encrypted:
+            reader.decrypt(密碼)
         writer = PdfWriter()
         for page in reader.pages:
             writer.add_page(page)
         output = BytesIO()
         writer.write(output)
         output.seek(0)
         return output
     except Exception as e:
+        return f"解鎖失敗:{e}"
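A minimal sketch (not part of the commit) of calling 移除_pdf密碼 outside Streamlit; the file names and password are illustrative, and the success/error branching follows the `isinstance(output, BytesIO)` check used in app.py.

```python
# Hypothetical command-line usage of the password-removal helper above.
from io import BytesIO
from pdfpass import 移除_pdf密碼

with open("locked.pdf", "rb") as f:          # illustrative input file
    result = 移除_pdf密碼(f, "my-password")   # illustrative password

if isinstance(result, BytesIO):              # success: an unlocked in-memory PDF
    with open("unlocked.pdf", "wb") as out:
        out.write(result.getvalue())
else:                                        # failure: the error string returned by the helper
    print(result)
```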
pdfsum.py
CHANGED
@@ -1,125 +1,32 @@
-import streamlit as st
 from transformers import pipeline
-import pdfplumber
-
-# Initialize the summarizer
-summarizer = pipeline("summarization", model="t5-small")
-
-def extract_text_from_pdf(pdf_file):
-    """Extract text from an uploaded PDF file using pdfplumber."""
-    try:
-        text = ""
-        with pdfplumber.open(pdf_file) as pdf:
-            for page in pdf.pages:
-                text += page.extract_text() + "\n"
-        if not text.strip():
-            raise ValueError("No extractable text found in the PDF.")
-        return text
-    except Exception as e:
-        raise ValueError(f"Error extracting text from PDF: {e}")
-
-def split_text_into_chunks(text, max_chunk_size=1024):
-    """Split the text into smaller chunks for summarization."""
-    chunks = []
-    while len(text) > max_chunk_size:
-        split_point = text.rfind(". ", 0, max_chunk_size) + 1  # Find the last full sentence
-        if split_point == 0:  # No sentence boundary found, split arbitrarily
-            split_point = max_chunk_size
-        chunks.append(text[:split_point])
-        text = text[split_point:]
-    if text:
-        chunks.append(text)
-    return chunks
-
-def summarize_text(chunks):
-    """Summarize each chunk of text with dynamic max_length."""
-    summaries = []
-    for chunk in chunks:
-        input_length = len(chunk.split())  # Approximate token count
-        max_length = max(48, int(input_length * 0.8))  # Set max_length to 80% of input length
-        summary = summarizer(chunk, max_length=max_length, min_length=10, do_sample=False)
-        summaries.append(summary[0]["summary_text"])
-    return summaries
-
-# Streamlit Dashboard
-st.title("PDF Summarizer")
-st.write("Upload a PDF file to get a summarized version of its content.")
-
-uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
+from PyPDF2 import PdfReader
 from transformers import pipeline

+# You can swap in the Chinese BART, T5, or other summarization model you prefer here
+summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+
+def 提取_pdf文字(pdf檔案):
+    """
+    Read a PDF and concatenate the text of every page into one plain-text string.
+    """
+    reader = PdfReader(pdf檔案)
+    內容 = ""
+    for 頁面 in reader.pages:
+        內容 += 頁面.extract_text()
+    return 內容
+
+def 分段(內容, 每段字數=2000):
+    """
+    Split long text into chunks (easier for the model to process).
+    """
+    return [內容[i:i+每段字數] for i in range(0, len(內容), 每段字數)]
+
+def 摘要(段落們):
+    """
+    Produce a Chinese summary for each chunk, then merge them back into one overall digest.
+    """
+    結果 = []
+    for 段 in 段落們:
+        結果.append(
+            summarizer(段, max_length=130, min_length=30, do_sample=False)[0]["summary_text"]
+        )
+    return 結果
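A minimal sketch (not part of the commit) of how the three helpers above chain together outside Streamlit, mirroring the "PDF 摘要" page in app.py; the input file name is illustrative.

```python
# Hypothetical end-to-end run of 提取_pdf文字 -> 分段 -> 摘要.
from pdfsum import 提取_pdf文字, 分段, 摘要

with open("report.pdf", "rb") as f:      # illustrative input file
    text = 提取_pdf文字(f)

chunks = 分段(text, 每段字數=2000)        # same default chunk size as the module
summary = " ".join(摘要(chunks))          # app.py joins the per-chunk summaries the same way
print(summary)
```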
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
textsumm.py
CHANGED
@@ -1,28 +1,11 @@
-ARTICLE ="""
-There is widespread international concern that Russia's war will provoke a global food crisis similar to, or
-worse than, that faced in 2007 and 2008. The war comes at a time when the global food system was already
-struggling to feed its growing population in a sustainable way, under the pressure caused by climate change
-and the Covid-19 pandemic. Russia and Ukraine are key agricultural players, together exporting nearly 12
-% of food calories traded globally. They are major providers of basic agro-commodities, including wheat,
-maize and sunflower oil, and Russia is the world's top exporter of fertilisers. The global supply chain will
-get impacted until Russia and Ukraine retreat and will end the war.
-The war's impact on global food supply centred on three factors. First is a significant reduction in exports
-and production of essential commodities from both countries, caused by the war and not the economic
-sanctions imposed on Russia, which, intentionally, did not target the agricultural sector. Overall, the
-European Commission estimates that 'up to 25 million tonnes of wheat would need to be substituted to
-meet worldwide food needs in the current and the next season. Second factor is a global spike in prices of
-food supplies and inputs needed for agri-food production, which were already at record levels before the
-war. The war has further pushed the prices up. Third factor is the international response to the above,
-which could either amplify the effects of the crisis (mainly by uncoordinated export bans) or mitigate them
-(applying lessons learnt from the 2007-2008 food crisis). A number of countries, other than Russia and
-Ukraine, have already imposed or announced their intention to impose some control over exports of
-essential agricultural commodities, including Egypt, Argentina, Indonesia, Serbia, Turkey and, in the EU,
-Hungary. We should keep this in our mind that the long duration of war will make the global situation
-irrecoverable.
-
-"""
-print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))
 from transformers import pipeline

+# Build the Chinese summarization pipeline
 summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

+def 文字摘要(輸入文本, max_length=130, min_length=30, do_sample=False):
+    """
+    Automatically condense the input plain text into Traditional Chinese key points.
+    """
+    result = summarizer(輸入文本, max_length=max_length, min_length=min_length, do_sample=do_sample)
+    return result
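A quick usage sketch (not part of the commit) of 文字摘要; the sample text is made up, and the pipeline returns a list of {"summary_text": ...} dicts, which is how app.py reads the result.

```python
# Hypothetical quick check of the text-summarization helper above.
from textsumm import 文字摘要

text = "Streamlit lets you build data apps in pure Python. " * 10   # illustrative input
result = 文字摘要(text, max_length=60, min_length=20)
print(result[0]["summary_text"])   # the pipeline returns a list of dicts
```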