Upload 8 files #1
by 3a05chatgpt - opened
- README.md +37 -12
- app.py +21 -102
- gitattributes +6 -0
- papersearch.py +28 -151
- pdfpass.py +8 -28
- pdfsum.py +32 -123
- requirements.txt +0 -0
- textsumm.py +26 -23
README.md
CHANGED
@@ -1,12 +1,37 @@
----
-title:
-emoji:
-colorFrom:
-colorTo:
-sdk: streamlit
-sdk_version: 1.
-app_file: app.py
-pinned: false
-
-
-
+---
+title: PDF工具箱(多功能PDF助手)
+emoji: 📄
+colorFrom: blue
+colorTo: green
+sdk: streamlit
+sdk_version: 1.35.0
+app_file: app.py
+pinned: false
+license: mit
+---
+
+# 📄 PDF Toolbox (all-in-one)
+
+A multi-purpose PDF processing platform with a fully Chinese-language interface. It supports:
+
+- **Text summarization**: automatically generate key-point summaries with OpenAI GPT-4/4.1/4.5 models
+- **PDF summarization**: summarize long PDF documents
+- **PDF password removal**: strip the password from encrypted PDFs
+- **arXiv paper search**: search and filter papers through a Chinese-language interface
+- **PDF merging**, **splitting**, **text extraction**, and more
+- Chinese interface and instructions throughout, suited to education, research, and administrative use
+
+## Usage
+
+1. Enter your OpenAI API key (starting with sk- or sk-proj-) in the sidebar
+2. Choose the GPT model you need (gpt-4, gpt-4.1, gpt-4.5)
+3. Pick a feature tab on the left and upload files as required
+4. Every step comes with Chinese-language prompts
+
+> 💡 **Note**: your API key is only used for the current session and is never stored on the server.
+
+## Contact and contributions
+
+Suggestions and feature additions are welcome; please open an issue on Hugging Face or GitHub.
+
+---
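The README describes summarization driven by the sidebar API key and model choice. As a rough illustration only (the uploaded files do not yet contain an OpenAI call, and the `openai` package and function below are my own assumptions, not part of this PR), the key and model could be used like this:

```python
from openai import OpenAI  # assumed dependency; not shown in this PR's files


def gpt_summarize(text: str, api_key: str, model: str = "gpt-4") -> str:
    """Summarize text with an OpenAI chat model; key and model would come from the sidebar."""
    client = OpenAI(api_key=api_key)
    resp = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "Summarize the user's text into a few key points."},
            {"role": "user", "content": text},
        ],
    )
    return resp.choices[0].message.content
```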
app.py
CHANGED
@@ -1,109 +1,28 @@
 import streamlit as st
-from
-
-from pdfpass import remove_pdf_password
-from papersearch import fetch_papers, filter_papers_by_year
-from io import BytesIO
-from datetime import datetime
-from pypdf import PdfReader, PdfWriter
 
-st.set_page_config(page_title="PDF Tools Suite", page_icon="📄", layout="wide")
 
-st.sidebar.
 
-if page == "Text Summarizer":
-    st.title("📝 Text Summarizer")
-    user_input = st.text_area("Enter text to summarize")
-    if st.button("Summarize"):
-        summary = summarizer(user_input, max_length=130, min_length=30, do_sample=False)
-        st.subheader("Summary")
-        st.write(summary[0]["summary_text"])
 
-    st.title("📜 PDF Summarizer")
-    uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
     if uploaded_file is not None:
-        summaries = summarize_text(chunks)
-        full_summary = " ".join(summaries)
-        st.subheader("Summary")
-        st.write(full_summary)
-
-# Tool: PDF Password Remover
-elif page == "PDF Password Remover":
-    st.title("🔑 Remove PDF Password")
-    uploaded_file = st.file_uploader("Choose a password-protected PDF", type=["pdf"])
-    password = st.text_input("Enter the PDF password", type="password")
-    if uploaded_file and password and st.button("Remove Password"):
-        output = remove_pdf_password(uploaded_file, password)
-        if isinstance(output, BytesIO):
-            st.success("Password removed successfully!")
-            st.download_button("Download PDF", data=output, file_name="unlocked_pdf.pdf", mime="application/pdf")
-        else:
-            st.error(f"Error: {output}")
-
-# Tool: Research Paper Search
-elif page == "Research Paper Search":
-    st.title("🔍 Research Paper Search (arXiv)")
-    query = st.text_input("Enter topic or keywords", placeholder="e.g., machine learning")
-    max_results = st.slider("Number of results", 1, 50, 10)
-    col1, col2 = st.columns(2)
-    with col1:
-        start_year = st.number_input("Start Year", min_value=1900, max_value=datetime.now().year, value=2000)
-    with col2:
-        end_year = st.number_input("End Year", min_value=1900, max_value=datetime.now().year, value=datetime.now().year)
-    if st.button("Search"):
-        papers = fetch_papers(query, max_results)
-        papers_filtered = filter_papers_by_year(papers, start_year, end_year)
-        if papers_filtered:
-            for idx, paper in enumerate(papers_filtered, start=1):
-                st.write(f"### {idx}. {paper['title']}")
-                st.write(f"**Authors**: {', '.join(paper['authors'])}")
-                st.write(f"**Published**: {paper['published']}")
-                st.write(f"[Read More]({paper['link']})")
-                st.write("---")
-        else:
-            st.warning("No papers found in the selected range.")
-
-# Tool: PDF Merger
-elif page == "PDF Merger":
-    st.title("📎 Merge Multiple PDFs")
-    uploaded_files = st.file_uploader("Upload multiple PDF files", type=["pdf"], accept_multiple_files=True)
-    if uploaded_files and st.button("Merge PDFs"):
-        pdf_writer = PdfWriter()
-        for file in uploaded_files:
-            pdf_reader = PdfReader(file)
-            for page in pdf_reader.pages:
-                pdf_writer.add_page(page)
-        output = BytesIO()
-        pdf_writer.write(output)
-        output.seek(0)
-        st.download_button("Download Merged PDF", data=output, file_name="merged.pdf", mime="application/pdf")
-
-# Tool: PDF Splitter
-elif page == "PDF Splitter":
-    st.title("✂️ Split PDF into Pages")
-    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
-    if uploaded_file:
        pdf_reader = PdfReader(uploaded_file)
-        st.
-
-elif page == "PDF to Text Converter":
-    st.title("📜 Extract Text from PDF")
-    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
-    if uploaded_file:
-        pdf_text = extract_text_from_pdf(uploaded_file)
-        st.text_area("Extracted Text", pdf_text, height=300)
 import streamlit as st
+from pdfsum import 摘要
+# Other features can also be imported here if they have their own .py files
 
+st.set_page_config(page_title="PDF 工具箱", layout="wide")
 
+st.sidebar.title("PDF 工具箱")
+api_key = st.sidebar.text_input("請輸入 OpenAI API 金鑰", type="password")
+gpt_model = st.sidebar.radio("選擇 GPT 模型", ["gpt-4", "gpt-4.0", "gpt-4.1", "gpt-4.5"])
+功能 = st.sidebar.radio("選擇功能", ["文字摘要", "PDF 摘要", "PDF 密碼移除", "論文搜尋", "PDF 合併", "PDF 拆頁", "PDF 轉純文字"])
 
+st.title("PDF 摘要")
 
+if 功能 == "PDF 摘要":
+    uploaded_file = st.file_uploader("上傳你的PDF檔案", type=["pdf"])
     if uploaded_file is not None:
+        # Read the PDF file contents
+        from PyPDF2 import PdfReader
        pdf_reader = PdfReader(uploaded_file)
+        內容 = ""
+        for page in pdf_reader.pages:
+            內容 += page.extract_text() or ""
+        if st.button("產生 PDF 摘要"):
+            st.info("正在產生摘要,請稍候...")
+            result = 摘要(內容)
+            st.success(result)
+else:
+    st.info("請選擇功能")
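The rewritten app.py only wires up the 「PDF 摘要」 branch; the other sidebar options are listed but not yet handled. A minimal sketch (my assumption, not part of the diff) of how the 「文字摘要」 option could dispatch to `文字摘要` from textsumm.py:

```python
import streamlit as st
from textsumm import 文字摘要  # provided by this PR's textsumm.py

# Subset of app.py's sidebar options, for illustration only.
功能 = st.sidebar.radio("選擇功能", ["文字摘要", "PDF 摘要"])

if 功能 == "文字摘要":
    text = st.text_area("輸入要摘要的文字")
    if st.button("產生摘要") and text.strip():
        st.write(文字摘要(text, 最長長度=120, 最短長度=40))
```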
gitattributes
ADDED
@@ -0,0 +1,6 @@
+# Git LFS attributes file (can be used to control large files)
+*.pdf filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+
+# Note: the rules above route PDF and image files through Git LFS (friendlier handling of large files)
papersearch.py
CHANGED
@@ -1,154 +1,31 @@
-# import streamlit as st
-# import requests
-# import xmltodict
-
-# # arXiv API base URL
-# ARXIV_API_BASE = "http://export.arxiv.org/api/query"
-
-# def fetch_papers(query, max_results=10):
-#     """Fetch papers from the arXiv API."""
-#     try:
-#         # Build the API query URL
-#         api_url = f"{ARXIV_API_BASE}?search_query=all:{query}&start=0&max_results={max_results}"
-
-#         # Make the API request
-#         response = requests.get(api_url, headers={'Accept': 'application/xml'})
-#         response.raise_for_status()
-
-#         # Parse the XML response
-#         data = xmltodict.parse(response.text)
-#         entries = data.get('feed', {}).get('entry', [])
-
-#         if not isinstance(entries, list):  # Handle single result
-#             entries = [entries]
-
-#         # Extract relevant fields
-#         papers = []
-#         for entry in entries:
-#             papers.append({
-#                 'title': entry.get('title'),
-#                 'summary': entry.get('summary'),
-#                 'published': entry.get('published'),
-#                 'authors': [author['name'] for author in entry.get('author', [])] if isinstance(entry.get('author'), list) else [entry.get('author', {}).get('name')],
-#                 'link': entry.get('id')
-#             })
-
-#         return papers
-#     except Exception as e:
-#         st.error(f"Error fetching papers: {e}")
-#         return []
-
-# # Streamlit app UI
-# st.title("arXiv Research Paper Search")
-# st.subheader("Find academic papers on your topic of interest")
-
-# # Input fields
-# query = st.text_input("Enter a topic or keywords", placeholder="e.g., machine learning, quantum computing")
-# max_results = st.slider("Number of results", min_value=1, max_value=50, value=10)
-
-# if st.button("Search"):
-#     if query.strip():
-#         st.info(f"Searching for papers on: **{query}**")
-#         papers = fetch_papers(query, max_results)
-
-#         if papers:
-#             st.success(f"Found {len(papers)} papers!")
-#             for idx, paper in enumerate(papers, start=1):
-#                 st.write(f"### {idx}. {paper['title']}")
-#                 st.write(f"**Authors**: {', '.join(paper['authors'])}")
-#                 st.write(f"**Published**: {paper['published']}")
-#                 st.write(f"[Read More]({paper['link']})")
-#                 st.write("---")
-#         else:
-#             st.warning("No papers found. Try a different query.")
-#     else:
-#         st.error("Please enter a topic or keywords to search.")
-
-import streamlit as st
 import requests
-import
 from datetime import datetime
 
-                'authors': [author['name'] for author in entry.get('author', [])] if isinstance(entry.get('author'), list) else [entry.get('author', {}).get('name')],
-                'link': entry.get('id')
-            })
-
-        return papers
-    except Exception as e:
-        st.error(f"Error fetching papers: {e}")
-        return []
-
-def filter_papers_by_year(papers, start_year, end_year):
-    """Filter papers by the publication year range."""
-    filtered_papers = []
-    for paper in papers:
-        try:
-            published_year = int(paper['published'][:4])  # Extract year from the published date
-            if start_year <= published_year <= end_year:
-                filtered_papers.append(paper)
-        except ValueError:
-            continue  # Skip if the year is not valid
-    return filtered_papers
-
-# Streamlit app UI
-st.title("arXiv Research Paper Search")
-st.subheader("Find academic papers on your topic of interest")
-
-# Input fields
-query = st.text_input("Enter a topic or keywords", placeholder="e.g., machine learning, quantum computing")
-max_results = st.slider("Number of results", min_value=1, max_value=50, value=10)
-
-# Year filter
-col1, col2 = st.columns(2)
-with col1:
-    start_year = st.number_input("Start Year", min_value=1900, max_value=datetime.now().year, value=2000, step=1)
-with col2:
-    end_year = st.number_input("End Year", min_value=1900, max_value=datetime.now().year, value=datetime.now().year, step=1)
-
-if st.button("Search"):
-    if query.strip():
-        st.info(f"Searching for papers on: **{query}**")
-        papers = fetch_papers(query, max_results)
-
-        # Filter papers by year
-        papers_filtered = filter_papers_by_year(papers, start_year, end_year)
-
-        if papers_filtered:
-            st.success(f"Found {len(papers_filtered)} papers between {start_year} and {end_year}!")
-            for idx, paper in enumerate(papers_filtered, start=1):
-                st.write(f"### {idx}. {paper['title']}")
-                st.write(f"**Authors**: {', '.join(paper['authors'])}")
-                st.write(f"**Published**: {paper['published']}")
-                st.write(f"[Read More]({paper['link']})")
-                st.write("---")
-        else:
-            st.warning(f"No papers found between {start_year} and {end_year}. Try a different query or adjust the year range.")
-    else:
-        st.error("Please enter a topic or keywords to search.")
 import requests
+import xml.etree.ElementTree as ET
 from datetime import datetime
 
+def 抓取論文(關鍵字, 最大數量=10):
+    """
+    Search arXiv for papers matching the keyword (sorted by most recently updated).
+    """
+    url = f"https://export.arxiv.org/api/query?search_query=all:{關鍵字}&start=0&max_results={最大數量}&sortBy=lastUpdatedDate"
+    res = requests.get(url)
+    root = ET.fromstring(res.content)
+    論文清單 = []
+    for entry in root.findall('{http://www.w3.org/2005/Atom}entry'):
+        論文清單.append({
+            "標題": entry.find('{http://www.w3.org/2005/Atom}title').text.strip(),
+            "作者": [author.find('{http://www.w3.org/2005/Atom}name').text for author in entry.findall('{http://www.w3.org/2005/Atom}author')],
+            "發表時間": entry.find('{http://www.w3.org/2005/Atom}published').text[:10],
+            "連結": entry.find('{http://www.w3.org/2005/Atom}id').text
+        })
+    return 論文清單
+
+def 篩選論文依年份(論文清單, 起始, 結束):
+    """
+    Filter papers by publication year (inclusive range).
+    """
+    篩選 = []
+    for 論文 in 論文清單:
+        年份 = int(論文["發表時間"][:4])
+        if 起始 <= 年份 <= 結束:
+            篩選.append(論文)
+    return 篩選
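papersearch.py now exposes `抓取論文` and `篩選論文依年份`, but the new app.py does not call them yet. A sketch of how the 「論文搜尋」 option might use them (the widget labels and year range below are illustrative assumptions, not part of the upload):

```python
import streamlit as st
from papersearch import 抓取論文, 篩選論文依年份

關鍵字 = st.text_input("輸入主題或關鍵字", placeholder="例如:machine learning")
起始年, 結束年 = st.slider("發表年份區間", 1991, 2025, (2015, 2025))
if st.button("搜尋") and 關鍵字.strip():
    結果 = 篩選論文依年份(抓取論文(關鍵字, 最大數量=20), 起始年, 結束年)
    if not 結果:
        st.warning("找不到符合條件的論文")
    for 論文 in 結果:
        st.write(f"**{論文['標題']}**({論文['發表時間']})")
        st.write(", ".join(論文["作者"]))
        st.write(論文["連結"])
```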
pdfpass.py
CHANGED
@@ -1,40 +1,20 @@
-import
-from PyPDF2 import PdfReader, PdfWriter
 from io import BytesIO
 
-def
    try:
-        reader = PdfReader(
        if reader.is_encrypted:
-            reader.decrypt(
        writer = PdfWriter()
        for page in reader.pages:
            writer.add_page(page)
-
        output = BytesIO()
        writer.write(output)
        output.seek(0)
        return output
    except Exception as e:
-        return
-
-st.title("PDF Password Remover")
-st.write("Upload a password-protected PDF and remove its password.")
-
-# File upload
-uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
-password = st.text_input("Enter the PDF password", type="password")
-
-if uploaded_file and password:
-    if st.button("Remove Password"):
-        output = remove_pdf_password(uploaded_file, password)
-        if isinstance(output, BytesIO):
-            st.success("Password removed successfully!")
-            st.download_button(
-                label="Download PDF without Password",
-                data=output,
-                file_name="unlocked_pdf.pdf",
-                mime="application/pdf",
-            )
-        else:
-            st.error(f"Error: {output}")
+from pypdf import PdfReader, PdfWriter
 from io import BytesIO
 
+def 移除_pdf密碼(pdf檔案, 密碼):
+    """
+    Unlock a password-protected PDF; returns the unlocked file (BytesIO) or an error message.
+    """
    try:
+        reader = PdfReader(pdf檔案)
        if reader.is_encrypted:
+            reader.decrypt(密碼)
        writer = PdfWriter()
        for page in reader.pages:
            writer.add_page(page)
        output = BytesIO()
        writer.write(output)
        output.seek(0)
        return output
    except Exception as e:
+        return f"解鎖失敗:{e}"
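`移除_pdf密碼` returns a BytesIO on success and an error string on failure, mirroring the old remove_pdf_password contract. A sketch of a Streamlit front end for it (labels and the output filename are my own placeholders):

```python
import streamlit as st
from io import BytesIO
from pdfpass import 移除_pdf密碼

uploaded = st.file_uploader("上傳加密的 PDF", type=["pdf"])
password = st.text_input("輸入 PDF 密碼", type="password")
if uploaded and password and st.button("移除密碼"):
    result = 移除_pdf密碼(uploaded, password)
    if isinstance(result, BytesIO):  # success: an unlocked PDF stream
        st.download_button("下載解鎖後的 PDF", data=result,
                           file_name="unlocked.pdf", mime="application/pdf")
    else:  # failure: the function returned an error message string
        st.error(result)
```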
pdfsum.py
CHANGED
@@ -1,125 +1,34 @@
-# import streamlit as st
-# from transformers import pipeline
-# from PyPDF2 import PdfReader
-
-# # Initialize the summarizer
-# summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
-
-# def extract_text_from_pdf(pdf_file):
-#     """Extract text from an uploaded PDF file."""
-#     try:
-#         reader = PdfReader(pdf_file)
-#         text = ""
-#         for page in reader.pages:
-#             page_text = page.extract_text()
-#             if page_text:  # Skip pages with no text
-#                 text += page_text + "\n"
-#         return text
-#     except Exception as e:
-#         raise ValueError(f"Error extracting text from PDF: {e}")
-
-# def split_text_into_chunks(text, max_chunk_size=1024):
-#     """Split the text into smaller chunks for summarization."""
-#     chunks = []
-#     while len(text) > max_chunk_size:
-#         split_point = text.rfind(". ", 0, max_chunk_size) + 1  # Split at the last sentence boundary
-#         if split_point == 0:  # No sentence boundary found, split arbitrarily
-#             split_point = max_chunk_size
-#         chunks.append
-
-# # Streamlit Dashboard
-# st.title("PDF Summarizer")
-# st.write("Upload a PDF file to get a summarized version of its content.")
-
-# uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
-
-# if uploaded_file is not None:
-#     # Extract text from the PDF
-#     st.write("Processing your PDF...")
-#     try:
-#         pdf_text = extract_text_from_pdf(uploaded_file)
-#         st.write("PDF content extracted successfully.")
-
-#         # Display extracted text (optional)
-#         with st.expander("View Extracted Text"):
-#             st.text_area("Extracted Text", pdf_text, height=300)
-
-#         # Summarize the extracted text
-#         if st.button("Summarize"):
-#             st.write("Generating summary...")
-#             summary = summarizer(pdf_text, max_length=130, min_length=30, do_sample=False)
-#             st.subheader("Summary")
-#             st.write(summary[0]["summary_text"])
-#     except Exception as e:
-#         st.error(f"An error occurred while processing the PDF: {str(e)}")
-
-import streamlit as st
 from transformers import pipeline
-import pdfplumber
-
-# Initialize the summarizer
-summarizer = pipeline("summarization", model="t5-small")
-
-def extract_text_from_pdf(pdf_file):
-    """Extract text from an uploaded PDF file using pdfplumber."""
-    try:
-        text = ""
-        with pdfplumber.open(pdf_file) as pdf:
-            for page in pdf.pages:
-                text += page.extract_text() + "\n"
-        if not text.strip():
-            raise ValueError("No extractable text found in the PDF.")
-        return text
-    except Exception as e:
-        raise ValueError(f"Error extracting text from PDF: {e}")
-
-def split_text_into_chunks(text, max_chunk_size=1024):
-    """Split the text into smaller chunks for summarization."""
-    chunks = []
-    while len(text) > max_chunk_size:
-        split_point = text.rfind(". ", 0, max_chunk_size) + 1  # Find the last full sentence
-        if split_point == 0:  # No sentence boundary found, split arbitrarily
-            split_point = max_chunk_size
-        chunks.append(text[:split_point])
-        text = text[split_point:]
-    if text:
-        chunks.append(text)
-    return chunks
-
-def summarize_text(chunks):
-    """Summarize each chunk of text with dynamic max_length."""
-    summaries = []
-    for chunk in chunks:
-        input_length = len(chunk.split())  # Approximate token count
-        max_length = max(48, int(input_length * 0.8))  # Set max_length to 80% of input length
-        summary = summarizer(chunk, max_length=max_length, min_length=10, do_sample=False)
-        summaries.append(summary[0]["summary_text"])
-    return summaries
-
-# Streamlit Dashboard
-st.title("PDF Summarizer")
-st.write("Upload a PDF file to get a summarized version of its content.")
-
-uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
 
 from transformers import pipeline
 
+# Use the Pegasus Chinese summarization model from the Hugging Face Hub
+# If you are running this on a Hugging Face Space, the lines below can be used directly
+summarizer = pipeline(
+    "summarization",
+    model="IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese",
+    tokenizer="IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese",
+    device=0  # if a GPU is available; otherwise set device=-1
+)
+
+def 摘要(pdf_純文字):
+    """
+    Summarize Chinese PDF text; works for both Traditional and Simplified Chinese.
+    """
+    if not pdf_純文字 or len(pdf_純文字.strip()) < 20:
+        return "⚠️ PDF 內容為空或無法解析(可能是掃描檔或圖片)"
+
+    段落列表 = [p.strip() for p in pdf_純文字.split('\n') if p.strip()]
+    摘要結果 = []
+    for 段 in 段落列表:
+        # Pegasus max_length is capped at 128
+        if len(段) < 30:
+            continue
+        # Slice into 400-character chunks
+        for i in range(0, len(段), 400):
+            子段 = 段[i:i+400]
+            try:
+                out = summarizer(子段, max_length=64, min_length=10, do_sample=False)
+                if out and len(out) > 0:
+                    摘要結果.append(out[0]['summary_text'])
+            except Exception as e:
+                摘要結果.append(f"(錯誤:{e})")
+    return "\n".join(摘要結果) if 摘要結果 else "⚠️ 沒有找到可摘要的內容!"
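The pipeline above hard-codes `device=0`, which fails on CPU-only hardware. A common guard (my suggestion, not part of this upload) is to pick the device at runtime, as the inline comment hints:

```python
import torch
from transformers import pipeline

# Use the GPU when one is available, otherwise fall back to CPU (device=-1).
device = 0 if torch.cuda.is_available() else -1
summarizer = pipeline(
    "summarization",
    model="IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese",
    tokenizer="IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese",
    device=device,
)
```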
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
textsumm.py
CHANGED
@@ -1,28 +1,31 @@
 from transformers import pipeline
 
 summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
-ARTICLE ="""
-There is widespread international concern that Russia's war will provoke a global food crisis similar to, or
-worse than, that faced in 2007 and 2008. The war comes at a time when the global food system was already
-struggling to feed its growing population in a sustainable way, under the pressure caused by climate change
-and the Covid-19 pandemic. Russia and Ukraine are key agricultural players, together exporting nearly 12
-% of food calories traded globally. They are major providers of basic agro-commodities, including wheat,
-maize and sunflower oil, and Russia is the world's top exporter of fertilisers. The global supply chain will
-get impacted until Russia and Ukraine retreat and will end the war.
-The war's impact on global food supply centred on three factors. First is a significant reduction in exports
-and production of essential commodities from both countries, caused by the war and not the economic
-sanctions imposed on Russia, which, intentionally, did not target the agricultural sector. Overall, the
-European Commission estimates that 'up to 25 million tonnes of wheat would need to be substituted to
-meet worldwide food needs in the current and the next season. Second factor is a global spike in prices of
-food supplies and inputs needed for agri-food production, which were already at record levels before the
-war. The war has further pushed the prices up. Third factor is the international response to the above,
-which could either amplify the effects of the crisis (mainly by uncoordinated export bans) or mitigate them
-(applying lessons learnt from the 2007-2008 food crisis). A number of countries, other than Russia and
-Ukraine, have already imposed or announced their intention to impose some control over exports of
-essential agricultural commodities, including Egypt, Argentina, Indonesia, Serbia, Turkey and, in the EU,
-Hungary. We should keep this in our mind that the long duration of war will make the global situation
-irrecoverable.
 
-
-
+# textsumm.py
+# Chinese-localized summarization module; install transformers and torch to use it
+
 from transformers import pipeline
 
+# Initialize the summarization pipeline
 summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
 
+def 文字摘要(輸入文本, 最長長度=120, 最短長度=40):
+    """
+    Input: a passage of text
+    Output: its summary (with Chinese-language messages)
+    """
+    if len(輸入文本.strip()) == 0:
+        return "❗️ 請輸入需要摘要的內容。"
+
+    try:
+        結果 = summarizer(
+            輸入文本,
+            max_length=最長長度,
+            min_length=最短長度,
+            do_sample=False
+        )
+        return 結果[0]['summary_text']
+    except Exception as e:
+        return f"❌ 摘要生成失敗:{str(e)}"
 
+# To test locally, uncomment the lines below
+# if __name__ == "__main__":
+#     測試文本 = "人工智慧(AI)是研究如何讓電腦模擬人類智能行為的學科,包括學習、推理、規劃、自然語言處理、知覺等。AI 技術已廣泛應用於語音辨識、影像分析、自駕車等領域。"
+#     print(文字摘要(測試文本))