studyonly committed on
Commit
789bf02
·
verified ·
1 Parent(s): dd6bc7a

PDF Summarizer version 1

Browse files
Files changed (5) hide show
  1. README.md +47 -14
  2. app.py +33 -0
  3. gitattributes.txt +39 -0
  4. main.py +46 -0
  5. requirements.txt +2 -0
README.md CHANGED
@@ -1,14 +1,47 @@
1
- ---
2
- title: PDF Summarizer Studyonly
3
- emoji: 🌍
4
- colorFrom: pink
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 5.36.2
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- short_description: PDF Summarizer APP created by Alex
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Pdf 2 Summary
3
+ emoji: 🌍
4
+ colorFrom: red
5
+ colorTo: purple
6
+ sdk: streamlit
7
+ sdk_version: 1.31.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ # 進階 NLP PDF 條列式重點摘要工具 🚀
14
+
15
+ 本工具可以將冗長的 PDF 文件,透過先進的自然語言處理(NLP)技術,快速轉換為條列式、易讀的重點摘要。以下說明本應用的功能與操作方式,協助教師與學生輕鬆掌握大量資訊。
16
+
17
+ ## 功能特色
18
+
19
+ - **PDF 文字擷取**:利用 PyPDF2 高效讀取 PDF 內容,確保摘要過程不遺漏任何資料。
20
+ - **高級文字前處理**:結合 spaCy 斷詞、詞形還原、去除停用字,讓資料更精煉、更容易分析。
21
+ - **關鍵詞擷取**:透過 NLP 技術找出文章最重要的關鍵詞與專有名詞,抓住主題核心。
22
+ - **句子重要度評分**:自訂演算法根據關鍵詞分布與語意相關性,挑選出最能代表全文重點的句子。
23
+ - **動態摘要調整**:可依需求調整摘要比例,彈性生成長短不一的重點摘要。
24
+
25
+ ## 操作說明
26
+
27
+ 1. **安裝套件**:請先安裝 Python 與必要函式庫(PyPDF2、spaCy、Streamlit)。
28
+ 2. **啟動應用程式**:在終端機執行 `streamlit run app.py`,依畫面指示開啟網頁介面。
29
+ 3. **上傳 PDF 檔案**:於網頁上傳欲摘要的 PDF 文件。
30
+ 4. **調整摘要比例**:用滑桿設定想要的摘要長度比例(%)。
31
+ 5. **查看條列摘要**:系統自動處理,於下方顯示條列式重點摘要。
32
+
33
+ ## 參與貢獻
34
+
35
+ 歡迎透過 Issue 或 Pull Request 提出建議與改進。
36
+
37
+ ## 授權
38
+
39
+ 本專案採用 MIT 開源授權條款。
40
+
41
+ ## 致謝
42
+
43
+ - 感謝 spaCy 函式庫
44
+ - 感謝 PyPDF2 函式庫
45
+ - 感謝 Streamlit 函式庫
46
+
47
+ 感謝您使用本工具,期待能幫助您在學習與工作上更有效率!
app.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
3
+ import io
4
+
5
# Streamlit front end: upload a PDF, choose a summary ratio, and render a
# bulleted summary built by the helpers imported from main.py.

# Page title.
st.title("PDF 條列式重點摘要工具 🗟 🔏")

# PDF upload widget (only .pdf files are accepted).
uploaded_file = st.file_uploader("請上傳您的 PDF 文件", type="pdf")

# Summary size, as a percentage of the number of scored sentences.
summary_scale = st.slider("請選擇摘要比例(%)", min_value=1, max_value=100, value=20)

# Button that triggers summary generation.
submit_button = st.button("產生摘要")

if submit_button:
    if uploaded_file is None:
        # Tell the user why nothing happens instead of silently ignoring the click.
        st.warning("請先上傳 PDF 文件。")
    else:
        with st.spinner('正在處理中,請稍候...'):
            # Read the uploaded bytes through an in-memory stream.
            text = read_pdf(io.BytesIO(uploaded_file.getvalue()))
            # Collect key phrases, then score each sentence against them.
            key_phrases = extract_key_phrases(text)
            sentence_scores = score_sentences(text, key_phrases)

            if not sentence_scores:
                # No sentence matched any key phrase (for example, no text
                # could be extracted), so there is nothing to summarize.
                st.warning("無法從此 PDF 擷取可摘要的文字內容。")
            else:
                # Number of bullet points: the chosen percentage of the scored
                # sentences, but always at least one.
                total_sentences = len(sentence_scores)
                num_points = max(1, total_sentences * summary_scale // 100)
                summary = summarize_text(sentence_scores, num_points=num_points)
                st.subheader("摘要結果:")
                st.markdown(summary)
gitattributes.txt ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ en_core_web_sm/en_core_web_sm-3.7.1/ner/model filter=lfs diff=lfs merge=lfs -text
37
+ en_core_web_sm/en_core_web_sm-3.7.1/tok2vec/model filter=lfs diff=lfs merge=lfs -text
38
+ en_core_web_sm-3.7.1/ner/model filter=lfs diff=lfs merge=lfs -text
39
+ en_core_web_sm-3.7.1/tok2vec/model filter=lfs diff=lfs merge=lfs -text
main.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ import spacy
3
+ import subprocess
4
+ from collections import Counter
5
+ import heapq
6
+ import io
7
+
8
# Load the spaCy English model (en_core_web_sm). If it is not installed yet
# (spacy.load raises OSError), download it once and retry so the Space does
# not fail on a missing model.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import sys

    # Run the download with the interpreter that is executing this module
    # (a bare "python" on PATH may be a different environment), and raise
    # immediately if the download command itself fails.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")
14
+
15
def read_pdf(file_stream):
    """Extract the text of every page of a PDF.

    Args:
        file_stream: The PDF source handed to ``PyPDF2.PdfReader``; app.py
            passes an ``io.BytesIO`` wrapping the uploaded bytes.

    Returns:
        The text of all pages joined by single spaces, with leading and
        trailing whitespace stripped.
    """
    reader = PyPDF2.PdfReader(file_stream)
    # Substitute an empty string when a page yields no text (e.g. None), so
    # the join below cannot fail on a non-string value.
    page_texts = [page.extract_text() or '' for page in reader.pages]
    return ' '.join(page_texts).strip()
22
+
23
def extract_key_phrases(text):
    """Collect the noun chunks and named entities found in ``text``.

    Args:
        text: The raw text to analyse with the module-level spaCy pipeline.

    Returns:
        A list of strings: the text of every noun chunk followed by the text
        of every named entity. Duplicates are kept.
    """
    parsed = nlp(text)
    phrases = [noun_chunk.text for noun_chunk in parsed.noun_chunks]
    phrases.extend(entity.text for entity in parsed.ents)
    return phrases
28
+
29
def score_sentences(text, key_phrases):
    """Score each sentence by how many key phrases it contains.

    Args:
        text: The raw text; it is split into sentences by the module-level
            spaCy pipeline.
        key_phrases: An iterable of phrase strings. Each entry counts
            separately, so a phrase listed twice contributes twice.

    Returns:
        A dict mapping each spaCy sentence span to the number of key-phrase
        entries that occur in it as substrings. Sentences without any match
        are omitted.
    """
    # Drop blank phrases: an empty or whitespace-only string would match
    # practically every sentence and only add uniform noise to the scores.
    phrases = [phrase for phrase in key_phrases if phrase.strip()]

    sentence_scores = {}
    for sent in nlp(text).sents:
        # Read the sentence text once instead of once per phrase.
        sentence_text = sent.text
        hits = sum(1 for phrase in phrases if phrase in sentence_text)
        if hits:
            sentence_scores[sent] = hits
    return sentence_scores
41
+
42
def summarize_text(sentence_scores, num_points=5):
    """Build a Markdown bullet list from the highest-scoring sentences.

    Args:
        sentence_scores: A dict mapping sentence objects (anything with a
            ``text`` attribute) to their numeric score.
        num_points: The maximum number of sentences to include.

    Returns:
        A string with one ``- sentence`` line per selected sentence, ordered
        from highest to lowest score; an empty string when there are no
        sentences.
    """
    top_sentences = heapq.nlargest(num_points, sentence_scores, key=sentence_scores.get)
    bullets = []
    for sent in top_sentences:
        # Extracted sentences may contain line breaks or runs of spaces;
        # collapse them so each bullet stays on a single Markdown line.
        flattened = ' '.join(sent.text.split())
        bullets.append(f"- {flattened}")
    return '\n'.join(bullets)
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ PyPDF2
2
+ spacy==3.7.4