Spaces:
Sleeping
Sleeping
PDF Summarizer version 1
Browse files- README.md +47 -14
- app.py +33 -0
- gitattributes.txt +39 -0
- main.py +46 -0
- requirements.txt +2 -0
README.md
CHANGED
@@ -1,14 +1,47 @@
|
|
1 |
-
---
|
2 |
-
title:
|
3 |
-
emoji: 🌍
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
-
sdk:
|
7 |
-
sdk_version:
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
license: mit
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Pdf 2 Summary
|
3 |
+
emoji: 🌍
|
4 |
+
colorFrom: red
|
5 |
+
colorTo: purple
|
6 |
+
sdk: streamlit
|
7 |
+
sdk_version: 1.31.1
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: mit
|
11 |
+
---
|
12 |
+
|
13 |
+
# 進階 NLP PDF 條列式重點摘要工具 🚀
|
14 |
+
|
15 |
+
本工具可以將冗長的 PDF 文件,透過先進的自然語言處理(NLP)技術,快速轉換為條列式、易讀的重點摘要。以下說明本應用的功能與操作方式,協助教師與學生輕鬆掌握大量資訊。
|
16 |
+
|
17 |
+
## 功能特色
|
18 |
+
|
19 |
+
- **PDF 文字擷取**:利用 PyPDF2 高效讀取 PDF 內容,確保摘要過程不遺漏任何資料。
|
20 |
+
- **高級文字前處理**:結合 spaCy 斷詞、詞形還原、去除停用字,讓資料更精煉、更容易分析。
|
21 |
+
- **關鍵詞擷取**:透過 NLP 技術找出文章最重要的關鍵詞與專有名詞,抓住主題核心。
|
22 |
+
- **句子重要度評分**:自訂演算法根據關鍵詞分布與語意相關性,挑選出最能代表全文重點的句子。
|
23 |
+
- **動態摘要調整**:可依需求調整摘要比例,彈性生成長短不一的重點摘要。
|
24 |
+
|
25 |
+
## 操作說明
|
26 |
+
|
27 |
+
1. **安裝套件**:請先安裝 Python 與必要函式庫(PyPDF2、spaCy、Streamlit)。
|
28 |
+
2. **啟動應用程式**:在終端機執行 `streamlit run app.py`,依畫面指示開啟網頁介面。
|
29 |
+
3. **上傳 PDF 檔案**:於網頁上傳欲摘要的 PDF 文件。
|
30 |
+
4. **調整摘要比例**:用滑桿設定想要的摘要長度比例(%)。
|
31 |
+
5. **查看條列摘要**:系統自動處理,於下方顯示條列式重點摘要。
|
32 |
+
|
33 |
+
## 參與貢獻
|
34 |
+
|
35 |
+
歡迎提交 Issue 與 Pull Request,協助改進本工具的功能與文件。
|
36 |
+
|
37 |
+
## 授權
|
38 |
+
|
39 |
+
本專案採用 MIT 開源授權,詳細條款請參閱 LICENSE 檔案。
|
40 |
+
|
41 |
+
## 致謝
|
42 |
+
|
43 |
+
- 感謝 NLP 函式庫
|
44 |
+
- 感謝 PyPDF2 函式庫
|
45 |
+
- 感謝 Streamlit 函式庫
|
46 |
+
|
47 |
+
感謝您使用本工具,期待能幫助您在學習與工作上更有效率!
|
app.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
import io

# Page title.
st.title("PDF 條列式重點摘要工具 🗟 🔏")

# PDF upload widget; only .pdf files are accepted.
uploaded_file = st.file_uploader("請上傳您的 PDF 文件", type="pdf")

# Summary-length slider: percentage of scored sentences to keep (1-100%).
summary_scale = st.slider("請選擇摘要比例(%)", min_value=1, max_value=100, value=20)

# Trigger button for summary generation.
submit_button = st.button("產生摘要")

if submit_button and uploaded_file is not None:
    with st.spinner('正在處理中,請稍候...'):
        # Wrap the uploaded bytes in a fresh seekable stream for PyPDF2.
        text = read_pdf(io.BytesIO(uploaded_file.getvalue()))
        # Key phrases (noun chunks + named entities) drive sentence scoring.
        key_phrases = extract_key_phrases(text)
        sentence_scores = score_sentences(text, key_phrases)
        # Number of bullet points: the requested percentage of scored
        # sentences, but always at least one.
        total_sentences = len(sentence_scores)
        num_points = max(1, total_sentences * summary_scale // 100)
        summary = summarize_text(sentence_scores, num_points=num_points)
        st.subheader("摘要結果:")
        st.markdown(summary)
elif submit_button:
    # Button pressed without a file: tell the user instead of doing nothing.
    st.warning("請先上傳 PDF 文件再產生摘要。")
|
gitattributes.txt
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
en_core_web_sm/en_core_web_sm-3.7.1/ner/model filter=lfs diff=lfs merge=lfs -text
|
37 |
+
en_core_web_sm/en_core_web_sm-3.7.1/tok2vec/model filter=lfs diff=lfs merge=lfs -text
|
38 |
+
en_core_web_sm-3.7.1/ner/model filter=lfs diff=lfs merge=lfs -text
|
39 |
+
en_core_web_sm-3.7.1/tok2vec/model filter=lfs diff=lfs merge=lfs -text
|
main.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import PyPDF2
import spacy
import subprocess
import sys
from collections import Counter
import heapq
import io

# Load the spaCy English model, downloading it on first run so the Space
# does not crash when the model is missing from the environment.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Use the current interpreter (a bare "python" may not be on PATH or may
    # point at a different environment), and fail loudly with check=True so a
    # broken download does not surface as a confusing second OSError below.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")
|
14 |
+
|
15 |
+
def read_pdf(file_stream):
    """Extract the plain text of every page of a PDF.

    Args:
        file_stream: A binary, seekable file-like object containing the PDF.

    Returns:
        All page texts concatenated, separated by single spaces and stripped.
    """
    reader = PyPDF2.PdfReader(file_stream)
    # extract_text() may return None for pages with no text layer (e.g.
    # scanned images); substitute '' to avoid a TypeError on concatenation.
    return ' '.join(page.extract_text() or '' for page in reader.pages).strip()
|
22 |
+
|
23 |
+
def extract_key_phrases(text):
    """Collect candidate key phrases: noun chunks followed by named entities.

    Args:
        text: Raw document text to analyse.

    Returns:
        A list of phrase strings (noun chunks first, then entity mentions;
        duplicates are kept).
    """
    doc = nlp(text)
    phrases = [chunk.text for chunk in doc.noun_chunks]
    phrases.extend(ent.text for ent in doc.ents)
    return phrases
|
28 |
+
|
29 |
+
def score_sentences(text, key_phrases):
    """Score each sentence by how many key phrases it contains.

    Args:
        text: Raw document text (re-parsed here for sentence boundaries).
        key_phrases: Phrase strings to search for; duplicate phrases count
            once each, so a repeated phrase contributes multiple points.

    Returns:
        Dict mapping spaCy sentence spans to their integer scores; sentences
        matching no phrase are omitted.
    """
    scores = {}
    for sentence in nlp(text).sents:
        hits = sum(1 for phrase in key_phrases if phrase in sentence.text)
        if hits:
            scores[sentence] = hits
    return scores
|
41 |
+
|
42 |
+
def summarize_text(sentence_scores, num_points=5):
    """Format the highest-scoring sentences as a Markdown bullet list.

    Args:
        sentence_scores: Mapping of sentence objects (anything with a
            ``.text`` attribute) to numeric importance scores.
        num_points: Maximum number of bullets to emit.

    Returns:
        A newline-joined Markdown bullet list, one line per chosen sentence.
    """
    # sorted(..., reverse=True)[:n] is the documented equivalent of
    # heapq.nlargest(n, ...), including tie-breaking order.
    ranked = sorted(sentence_scores, key=sentence_scores.get, reverse=True)
    bullets = [f"- {sentence.text}" for sentence in ranked[:num_points]]
    return '\n'.join(bullets)
|
requirements.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
PyPDF2
|
2 |
+
spacy==3.7.4
|