Spaces:
Sleeping
Sleeping
Upload 5 files
Browse files- README.md +35 -14
- app.py +27 -60
- gitattributes +39 -1
- main.py +41 -0
- requirements.txt +2 -5
README.md
CHANGED
@@ -1,24 +1,45 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
-
sdk: streamlit
|
7 |
-
sdk_version: 1.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
|
|
10 |
---
|
11 |
|
12 |
-
#
|
|
|
|
|
|
|
13 |
|
14 |
## 主要功能
|
15 |
|
16 |
-
-
|
17 |
-
-
|
18 |
-
-
|
19 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
-
|
|
|
|
|
22 |
|
23 |
-
|
24 |
-
pip install -r requirements.txt
|
|
|
1 |
---
|
2 |
+
title: Pdf 2 Summary
|
3 |
+
emoji: 🌍
|
4 |
+
colorFrom: red
|
5 |
+
colorTo: purple
|
6 |
+
sdk: streamlit
|
7 |
+
sdk_version: 1.31.1
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
+
license: mit
|
11 |
---
|
12 |
|
13 |
+
# 進階 NLP PDF 摘要生成工具 🚀
|
14 |
+
|
15 |
+
本工具可將冗長的 PDF 文件,透過最先進的自然語言處理(NLP)技術,快速提煉成條列式的重點摘要。
|
16 |
+
以下為本應用的功能介紹與操作說明。
|
17 |
|
18 |
## 主要功能
|
19 |
|
20 |
+
- **PDF 文字擷取**:利用 PyPDF2 有效讀取 PDF 文件內容,確保摘要過程不遺漏任何重點。
|
21 |
+
- **高級文字前處理**:利用 spaCy 套件進行斷詞、詞形還原、停用字過濾等,使語料更精練、適合進一步分析。
|
22 |
+
- **關鍵詞擷取**:從段落中自動擷取主題關鍵詞及專有名詞,精確找出文本核心內容。
|
23 |
+
- **句子重要度評分**:根據關鍵詞出現及語意關聯性,動態計算句子分數,確保摘要涵蓋文章主旨。
|
24 |
+
- **摘要比例調整**:可依需求調整摘要長度,百分比自訂,生成不同精簡度的重點摘要。
|
25 |
+
|
26 |
+
## 使用說明
|
27 |
+
|
28 |
+
1. **安裝需求**:請確保已安裝 Python 及所需套件(PyPDF2、spaCy、Streamlit)。
|
29 |
+
2. **啟動應用**:終端機執行 `streamlit run app.py`,依照網址開啟網頁介面。
|
30 |
+
3. **上傳 PDF**:於網頁上傳欲摘要之 PDF 檔案。
|
31 |
+
4. **調整摘要比例**:可使用滑桿設定想要的摘要比例。
|
32 |
+
5. **取得摘要結果**:系統自動處理,於下方呈現條列式重點摘要。
|
33 |
+
|
34 |
+
## 貢獻與授權
|
35 |
+
|
36 |
+
歡迎提出改進建議或 Bug 回報。
|
37 |
+
本專案採用 MIT 授權。
|
38 |
+
|
39 |
+
## 感謝
|
40 |
|
41 |
+
- spaCy 團隊的 NLP 函式庫
|
42 |
+
- PyPDF2 團隊的 PDF 處理能力
|
43 |
+
- Streamlit 團隊的開發框架
|
44 |
|
45 |
+
本工具為阿亮老師製作,期待能提升您的工作與學習效率!
|
|
app.py
CHANGED
@@ -1,66 +1,33 @@
|
|
1 |
-
# app.py
|
2 |
import streamlit as st
|
3 |
-
from
|
4 |
-
|
5 |
-
import requests
|
6 |
|
7 |
-
|
|
|
8 |
|
9 |
-
#
|
10 |
-
st.
|
11 |
-
st.sidebar.write("請輸入 OpenAI API 金鑰(非必填)")
|
12 |
-
api_key = st.sidebar.text_input("sk-...", type="password")
|
13 |
|
14 |
-
#
|
15 |
-
|
16 |
|
17 |
-
#
|
18 |
-
|
19 |
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
if uploaded_file is not None and st.button("產生 PDF 摘要"):
|
38 |
-
with st.spinner("摘要生成中..."):
|
39 |
-
summary = 提取_pdf摘要(uploaded_file, 摘要)
|
40 |
-
st.success("摘要結果:")
|
41 |
-
st.write(summary)
|
42 |
-
elif tool == "論文搜尋":
|
43 |
-
st.header("🔎 論文搜尋(arXiv)")
|
44 |
-
keyword = st.text_input("輸入主題或關鍵字", "量子")
|
45 |
-
max_results = st.slider("結果數量", 1, 20, 5)
|
46 |
-
start_year = st.number_input("起始年份", min_value=1990, max_value=2025, value=2019)
|
47 |
-
end_year = st.number_input("結束年份", min_value=1990, max_value=2025, value=2025)
|
48 |
-
if st.button("搜尋論文"):
|
49 |
-
st.info("搜尋中...")
|
50 |
-
url = f"http://export.arxiv.org/api/query?search_query=all:{keyword}&start=0&max_results={max_results}"
|
51 |
-
resp = requests.get(url)
|
52 |
-
if resp.ok:
|
53 |
-
import xml.etree.ElementTree as ET
|
54 |
-
root = ET.fromstring(resp.content)
|
55 |
-
found = False
|
56 |
-
for entry in root.findall("{http://www.w3.org/2005/Atom}entry"):
|
57 |
-
published = entry.find("{http://www.w3.org/2005/Atom}published").text[:4]
|
58 |
-
if start_year <= int(published) <= end_year:
|
59 |
-
found = True
|
60 |
-
title = entry.find("{http://www.w3.org/2005/Atom}title").text.strip()
|
61 |
-
link = entry.find("{http://www.w3.org/2005/Atom}id").text.strip()
|
62 |
-
st.markdown(f"**[{title}]({link})**({published})")
|
63 |
-
if not found:
|
64 |
-
st.warning("在所選年份範圍內沒有找到相關論文。")
|
65 |
-
else:
|
66 |
-
st.error("arXiv 查詢失敗")
|
|
|
|
|
1 |
import streamlit as st
from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
import io

# Page title for the summarizer UI.
st.title("PDF 條列重點摘要工具 🗟 🔏")

# Input widgets: PDF upload, summary-length slider, and the trigger button.
uploaded_file = st.file_uploader("請上傳您的 PDF 文件", type="pdf")
summary_scale = st.slider("請選擇摘要比例(%)", min_value=1, max_value=100, value=20)
submit_button = st.button("產生摘要")

# Run the pipeline only when the button is pressed AND a file is present.
if submit_button and uploaded_file is not None:
    with st.spinner('正在處理,請稍候...'):
        # Extract raw text from the uploaded PDF bytes.
        text = read_pdf(io.BytesIO(uploaded_file.getvalue()))
        # Candidate key phrases (noun chunks + named entities).
        key_phrases = extract_key_phrases(text)
        # Per-sentence relevance scores based on key-phrase hits.
        sentence_scores = score_sentences(text, key_phrases)
        # Number of bullet points: at least 1, scaled by the chosen percentage.
        num_points = max(1, len(sentence_scores) * summary_scale // 100)
        # Build and display the bullet-list summary.
        summary = summarize_text(sentence_scores, num_points=num_points)
        st.subheader("摘要結果:")
        st.markdown(summary)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gitattributes
CHANGED
@@ -1 +1,39 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
en_core_web_sm/en_core_web_sm-3.7.1/ner/model filter=lfs diff=lfs merge=lfs -text
|
37 |
+
en_core_web_sm/en_core_web_sm-3.7.1/tok2vec/model filter=lfs diff=lfs merge=lfs -text
|
38 |
+
en_core_web_sm-3.7.1/ner/model filter=lfs diff=lfs merge=lfs -text
|
39 |
+
en_core_web_sm-3.7.1/tok2vec/model filter=lfs diff=lfs merge=lfs -text
|
main.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import PyPDF2
|
2 |
+
import spacy
|
3 |
+
from collections import Counter
|
4 |
+
import heapq
|
5 |
+
import io
|
6 |
+
|
7 |
+
# 載入 spaCy 模型
|
8 |
+
nlp = spacy.load("./en_core_web_sm-3.7.1")
|
9 |
+
|
10 |
+
def read_pdf(file_stream):
    """Read and concatenate the text of every page in a PDF.

    Args:
        file_stream: A binary file-like object positioned at the start of
            PDF data (e.g. ``io.BytesIO`` of an uploaded file).

    Returns:
        str: All extracted page text joined by spaces, stripped of
        leading/trailing whitespace. Pages with no extractable text
        contribute an empty string.
    """
    reader = PyPDF2.PdfReader(file_stream)
    # PyPDF2's extract_text() may return None for pages without an
    # extractable text layer (e.g. scanned images); the original
    # `text += page.extract_text() + ' '` raised TypeError there.
    page_texts = (page.extract_text() or '' for page in reader.pages)
    # Single join instead of repeated string += (avoids quadratic copies).
    return ' '.join(page_texts).strip()
|
17 |
+
|
18 |
+
def extract_key_phrases(text):
    """Collect candidate key phrases from *text*.

    Returns a list containing the text of every noun chunk followed by
    the text of every named entity, as produced by the module-level
    spaCy pipeline ``nlp`` (duplicates are kept).
    """
    doc = nlp(text)
    phrases = []
    for chunk in doc.noun_chunks:
        phrases.append(chunk.text)
    for entity in doc.ents:
        phrases.append(entity.text)
    return phrases
|
23 |
+
|
24 |
+
def score_sentences(text, key_phrases):
    """Score sentences by how many key phrases each one contains.

    Args:
        text: The full document text; re-parsed with the module-level
            spaCy pipeline ``nlp`` to segment sentences.
        key_phrases: Iterable of phrase strings to look for.

    Returns:
        dict: Maps each spaCy sentence span to the count of key phrases
        that occur in it as a substring (each phrase counted at most
        once per sentence). Sentences with no hits are absent.
    """
    doc = nlp(text)
    scores = {}
    for sentence in doc.sents:
        sentence_text = sentence.text
        for phrase in key_phrases:
            # Plain substring match, one increment per matching phrase.
            if phrase in sentence_text:
                scores[sentence] = scores.get(sentence, 0) + 1
    return scores
|
36 |
+
|
37 |
+
def summarize_text(sentence_scores, num_points=5):
    """Build a bullet-list summary from scored sentences.

    Args:
        sentence_scores: Mapping of sentence objects (anything with a
            ``.text`` attribute) to a numeric relevance score.
        num_points: Maximum number of bullet points to emit.

    Returns:
        str: Markdown bullet lines ("- <sentence>") joined by newlines,
        highest-scoring sentences first.
    """
    # nlargest keeps only the top-scoring sentences, in descending score order.
    top_sentences = heapq.nlargest(num_points, sentence_scores, key=sentence_scores.get)
    bullet_lines = [f"- {sentence.text}" for sentence in top_sentences]
    return '\n'.join(bullet_lines)
|
requirements.txt
CHANGED
@@ -1,5 +1,2 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
torch
|
4 |
-
PyPDF2
|
5 |
-
requests
|
|
|
1 |
+
PyPDF2
|
2 |
+
spacy==3.7.4
|
|
|
|
|
|