Spaces:
Sleeping
Sleeping
Upload 5 files
Browse files- .gitattributes +39 -36
- README.md +25 -23
- app.py +2 -2
- main.py +10 -5
- requirements.txt +1 -1
.gitattributes
CHANGED
@@ -1,36 +1,39 @@
|
|
1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
-
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
en_core_web_sm/en_core_web_sm-3.7.1/ner/model filter=lfs diff=lfs merge=lfs -text
|
37 |
+
en_core_web_sm/en_core_web_sm-3.7.1/tok2vec/model filter=lfs diff=lfs merge=lfs -text
|
38 |
+
en_core_web_sm-3.7.1/ner/model filter=lfs diff=lfs merge=lfs -text
|
39 |
+
en_core_web_sm-3.7.1/tok2vec/model filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
@@ -10,36 +10,38 @@ pinned: false
|
|
10 |
license: mit
|
11 |
---
|
12 |
|
13 |
-
# 進階 NLP PDF
|
14 |
|
15 |
-
|
16 |
-
以下為本應用的功能介紹與操作說明。
|
17 |
|
18 |
-
##
|
19 |
|
20 |
-
- **PDF 文字擷取**:利用 PyPDF2
|
21 |
-
-
|
22 |
-
-
|
23 |
-
-
|
24 |
-
-
|
25 |
|
26 |
-
##
|
27 |
|
28 |
-
1.
|
29 |
-
2.
|
30 |
-
3. **上傳 PDF
|
31 |
-
4.
|
32 |
-
5.
|
33 |
|
34 |
-
##
|
35 |
|
36 |
-
|
37 |
-
本專案採用 MIT 授權。
|
38 |
|
39 |
-
##
|
40 |
|
41 |
-
|
42 |
-
- PyPDF2 團隊的 PDF 處理能力
|
43 |
-
- Streamlit 團隊的開發框架
|
44 |
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
license: mit
|
11 |
---
|
12 |
|
13 |
+
# 進階 NLP PDF 條列式重點摘要工具 🚀
|
14 |
|
15 |
+
本工具可以將冗長的 PDF 文件,透過先進的自然語言處理(NLP)技術,快速轉換為條列式、易讀的重點摘要。以下說明本應用的功能與操作方式,協助教師與學生輕鬆掌握大量資訊。
|
|
|
16 |
|
17 |
+
## 功能特色
|
18 |
|
19 |
+
- **PDF 文字擷取**:利用 PyPDF2 高效讀取 PDF 內容,確保摘要過程不遺漏任何資料。
|
20 |
+
- **高級文字前處理**:結合 spaCy 斷詞、詞形還原、去除停用字,讓資料更精煉、更容易分析。
|
21 |
+
- **關鍵詞擷取**:透過 NLP 技術找出文章最重要的關鍵詞與專有名詞,抓住主題核心。
|
22 |
+
- **句子重要度評分**:自訂演算法根據關鍵詞分布與語意相關性,挑選出最能代表全文重點的句子。
|
23 |
+
- **動態摘要調整**:可依需求調整摘要比例,彈性生成長短不一的重點摘要。
|
24 |
|
25 |
+
## 操作說明
|
26 |
|
27 |
+
1. **安裝套件**:請先安裝 Python 與必要函式庫(PyPDF2、spaCy、Streamlit)。
|
28 |
+
2. **啟動應用程式**:在終端機執行 `streamlit run app.py`,依畫面指示開啟網頁介面。
|
29 |
+
3. **上傳 PDF 檔案**:於網頁上傳欲摘要的 PDF 文件。
|
30 |
+
4. **調整摘要比例**:用滑桿設定想要的摘要長度比例(%)。
|
31 |
+
5. **查看條列摘要**:系統自動處理,於下方顯示條列式重點摘要。
|
32 |
|
33 |
+
## 參與貢獻
|
34 |
|
35 |
+
歡迎透過 Issue 或 Pull Request 提出建議與修正,一同改進本工具。
|
|
|
36 |
|
37 |
+
## 授權
|
38 |
|
39 |
+
本專案採用 MIT 開源授權條款。
|
|
|
|
|
40 |
|
41 |
+
## 致謝
|
42 |
+
|
43 |
+
- 感謝 spaCy 自然語言處理(NLP)函式庫
|
44 |
+
- 感謝 PyPDF2 函式庫
|
45 |
+
- 感謝 Streamlit 函式庫
|
46 |
+
|
47 |
+
感謝您使用本工具,期待能幫助您在學習與工作上更有效率!
|
app.py
CHANGED
@@ -3,7 +3,7 @@ from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
|
|
3 |
import io
|
4 |
|
5 |
# 設定 Streamlit 應用標題
|
6 |
-
st.title("PDF
|
7 |
|
8 |
# 上傳 PDF 檔案元件
|
9 |
uploaded_file = st.file_uploader("請上傳您的 PDF 文件", type="pdf")
|
@@ -16,7 +16,7 @@ submit_button = st.button("產生摘要")
|
|
16 |
|
17 |
# 若按下按鈕且有上傳檔案
|
18 |
if submit_button and uploaded_file is not None:
|
19 |
-
with st.spinner('
|
20 |
# 讀取 PDF 內容
|
21 |
text = read_pdf(io.BytesIO(uploaded_file.getvalue()))
|
22 |
# 擷取關鍵詞
|
|
|
3 |
import io
|
4 |
|
5 |
# 設定 Streamlit 應用標題
|
6 |
+
st.title("PDF 條列式重點摘要工具 🗟 🔏")
|
7 |
|
8 |
# 上傳 PDF 檔案元件
|
9 |
uploaded_file = st.file_uploader("請上傳您的 PDF 文件", type="pdf")
|
|
|
16 |
|
17 |
# 若按下按鈕且有上傳檔案
|
18 |
if submit_button and uploaded_file is not None:
|
19 |
+
with st.spinner('正在處理中,請稍候...'):
|
20 |
# 讀取 PDF 內容
|
21 |
text = read_pdf(io.BytesIO(uploaded_file.getvalue()))
|
22 |
# 擷取關鍵詞
|
main.py
CHANGED
@@ -1,11 +1,16 @@
|
|
1 |
import PyPDF2
|
2 |
import spacy
|
|
|
3 |
from collections import Counter
|
4 |
import heapq
|
5 |
import io
|
6 |
|
7 |
-
#
|
8 |
-
|
|
|
|
|
|
|
|
|
9 |
|
10 |
def read_pdf(file_stream):
|
11 |
"""讀取 PDF 文字內容"""
|
@@ -16,13 +21,13 @@ def read_pdf(file_stream):
|
|
16 |
return text.strip()
|
17 |
|
18 |
def extract_key_phrases(text):
|
19 |
-
"""
|
20 |
doc = nlp(text)
|
21 |
key_phrases = [chunk.text for chunk in doc.noun_chunks] + [ent.text for ent in doc.ents]
|
22 |
return key_phrases
|
23 |
|
24 |
def score_sentences(text, key_phrases):
|
25 |
-
"""
|
26 |
sentence_scores = {}
|
27 |
doc = nlp(text)
|
28 |
for sent in doc.sents:
|
@@ -35,7 +40,7 @@ def score_sentences(text, key_phrases):
|
|
35 |
return sentence_scores
|
36 |
|
37 |
def summarize_text(sentence_scores, num_points=5):
|
38 |
-
"""
|
39 |
summary_sentences = heapq.nlargest(num_points, sentence_scores, key=sentence_scores.get)
|
40 |
summary = '\n'.join([f"- {sent.text}" for sent in summary_sentences])
|
41 |
return summary
|
|
|
1 |
import PyPDF2
|
2 |
import spacy
|
3 |
+
import subprocess
|
4 |
from collections import Counter
|
5 |
import heapq
|
6 |
import io
|
7 |
|
8 |
+
# Load the spaCy English model (en_core_web_sm). If loading fails with
# OSError (the model is not installed), download it once and retry so the
# Space does not crash on a missing model.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import importlib
    import sys

    # Use the interpreter that is running this app rather than whatever
    # "python" happens to be on PATH, so the model is installed into the
    # environment that will import it. check=True reports a failed download
    # here instead of as a second, less informative OSError from the retry.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    # The model package was installed after interpreter start-up; refresh
    # the import system's finder caches so the retry can locate it.
    importlib.invalidate_caches()
    nlp = spacy.load("en_core_web_sm")
|
14 |
|
15 |
def read_pdf(file_stream):
|
16 |
"""讀取 PDF 文字內容"""
|
|
|
21 |
return text.strip()
|
22 |
|
23 |
def extract_key_phrases(text):
    """Collect noun-chunk and named-entity strings from *text*.

    Runs the module-level ``nlp`` pipeline over ``text`` and returns a list
    holding the text of every noun chunk followed by the text of every
    named entity. Duplicates are kept.
    """
    parsed = nlp(text)
    phrases = []
    phrases.extend(chunk.text for chunk in parsed.noun_chunks)
    phrases.extend(entity.text for entity in parsed.ents)
    return phrases
|
28 |
|
29 |
def score_sentences(text, key_phrases):
|
30 |
+
"""根據關鍵詞分數給每個句子計分"""
|
31 |
sentence_scores = {}
|
32 |
doc = nlp(text)
|
33 |
for sent in doc.sents:
|
|
|
40 |
return sentence_scores
|
41 |
|
42 |
def summarize_text(sentence_scores, num_points=5):
    """Return the highest-scoring sentences as a bulleted string.

    ``sentence_scores`` maps sentence objects (each exposing ``.text``) to
    their scores. The ``num_points`` best-scoring sentences are taken in
    descending score order, and each is rendered as ``"- <text>"`` on its
    own line.
    """
    top_sentences = heapq.nlargest(
        num_points,
        sentence_scores,
        key=lambda sentence: sentence_scores[sentence],
    )
    bullets = (f"- {sentence.text}" for sentence in top_sentences)
    return "\n".join(bullets)
|
requirements.txt
CHANGED
@@ -1,2 +1,2 @@
|
|
1 |
-
PyPDF2
|
2 |
spacy==3.7.4
|
|
|
1 |
+
PyPDF2
|
2 |
spacy==3.7.4
|