3a05chatgpt committed on
Commit 4d9a0e6 · verified · 1 Parent(s): 9581815

Upload 5 files

Files changed (5)
  1. README.md +35 -14
  2. app.py +27 -60
  3. gitattributes +39 -1
  4. main.py +41 -0
  5. requirements.txt +2 -5
README.md CHANGED
@@ -1,24 +1,45 @@
  ---
- title: PDF Toolbox
- emoji: 📄
- colorFrom: blue
- colorTo: green
- sdk: streamlit # change this to gradio if you use Gradio
- sdk_version: 1.23.0 # Gradio can drop this line
  app_file: app.py
  pinned: false
  ---

- # 📄 PDF Toolbox

  ## Main features

- - **Text summarization**: long-form Chinese summarization (Pegasus model)
- - **PDF summarization**: automatically extracts a Chinese summary from a PDF
- - **Paper search**: arXiv keyword queries
- - Runs on Hugging Face Spaces with one-click deployment

- ## Installation

- ```bash
- pip install -r requirements.txt
  ---
+ title: Pdf 2 Summary
+ emoji: 🌍
+ colorFrom: red
+ colorTo: purple
+ sdk: streamlit
+ sdk_version: 1.31.1
  app_file: app.py
  pinned: false
+ license: mit
  ---

+ # Advanced NLP PDF Summarizer 🚀
+
+ This tool condenses long PDF documents into bullet-point summaries using state-of-the-art natural language processing (NLP) techniques.
+ Its features and usage are described below.

  ## Main features

+ - **PDF text extraction**: reads PDF content with PyPDF2 so that no key material is lost before summarization.
+ - **Advanced text preprocessing**: uses spaCy for tokenization, lemmatization, and stop-word filtering, yielding cleaner input for further analysis.
+ - **Key-phrase extraction**: automatically pulls topic keywords and named entities from the text to pinpoint its core content.
+ - **Sentence importance scoring**: scores sentences dynamically by keyword occurrence and semantic relevance so the summary covers the document's main points.
+ - **Adjustable summary ratio**: the summary length is set as a percentage, producing summaries at different levels of compression.
+
+ ## Usage
+
+ 1. **Install the requirements**: make sure Python and the required packages (PyPDF2, spaCy, Streamlit) are installed.
+ 2. **Launch the app**: run `streamlit run app.py` in a terminal and open the URL it prints.
+ 3. **Upload a PDF**: upload the PDF you want summarized on the web page.
+ 4. **Adjust the summary ratio**: use the slider to set the desired summary percentage.
+ 5. **Get the summary**: the app processes the file and shows a bullet-point summary below.
+
+ ## Contributing and license
+
+ Suggestions and bug reports are welcome.
+ This project is released under the MIT license.
+
+ ## Acknowledgements

+ - The spaCy team, for the NLP library
+ - The PyPDF2 team, for PDF processing
+ - The Streamlit team, for the app framework

+ This tool was made by 阿亮老師 (Teacher A-Liang); may it boost your efficiency at work and in study!
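The keyword-count scoring that the feature list describes is implemented in main.py further down; as a minimal standalone sketch of the same idea (toy input text, and the stock `en_core_web_sm` package assumed installed rather than the repo's vendored copy):

```python
import heapq
import spacy

nlp = spacy.load("en_core_web_sm")  # assumes: python -m spacy download en_core_web_sm

text = "Solar power is growing fast. Panels keep getting cheaper. The weather was nice."
doc = nlp(text)

# candidate key phrases: noun chunks plus named entities
phrases = [chunk.text for chunk in doc.noun_chunks] + [ent.text for ent in doc.ents]

# score each sentence by how many candidate phrases it contains
scores = {sent: sum(p in sent.text for p in phrases) for sent in doc.sents}

# keep the single top sentence (roughly a 33% ratio of 3 sentences) as the bullet summary
for sent in heapq.nlargest(1, scores, key=scores.get):
    print(f"- {sent.text}")
```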
 
app.py CHANGED
@@ -1,66 +1,33 @@
- # app.py
  import streamlit as st
- from textsumm import 摘要
- from pdfsum import 提取_pdf摘要
- import requests
-
- st.set_page_config(page_title="PDF Toolbox", page_icon="📄", layout="wide")
-
- # Sidebar
- st.sidebar.title("📄 PDF Toolbox")
- st.sidebar.write("Enter an OpenAI API key (optional)")
- api_key = st.sidebar.text_input("sk-...", type="password")
-
- # GPT model selection
- model = st.sidebar.radio("Choose a GPT model", options=["gpt-4", "gpt-4.0", "gpt-4.1", "gpt-4.5"], index=0)
-
- # Tool selection
- tool = st.sidebar.radio("Choose a tool", options=["Text summary", "PDF summary", "Paper search"])
-
- st.title("PDF Toolbox")
-
- if tool == "Text summary":
-     st.header("📝 Text summary")
-     user_text = st.text_area("Enter the Chinese text to summarize")
-     if st.button("Generate summary"):
-         with st.spinner("Generating summary..."):
-             if user_text.strip():
-                 summary = 摘要(user_text.strip())
-                 st.success("Summary:")
-                 st.write(summary)
-             else:
-                 st.warning("Please enter some text!")
-
- elif tool == "PDF summary":
-     st.header("📄 PDF summary")
-     uploaded_file = st.file_uploader("Upload your PDF file", type=["pdf"])
-     if uploaded_file is not None and st.button("Generate PDF summary"):
-         with st.spinner("Generating summary..."):
-             summary = 提取_pdf摘要(uploaded_file, 摘要)
-             st.success("Summary:")
-             st.write(summary)
- elif tool == "Paper search":
-     st.header("🔎 Paper search (arXiv)")
-     keyword = st.text_input("Enter a topic or keyword", "quantum")
-     max_results = st.slider("Number of results", 1, 20, 5)
-     start_year = st.number_input("Start year", min_value=1990, max_value=2025, value=2019)
-     end_year = st.number_input("End year", min_value=1990, max_value=2025, value=2025)
-     if st.button("Search papers"):
-         st.info("Searching...")
-         url = f"http://export.arxiv.org/api/query?search_query=all:{keyword}&start=0&max_results={max_results}"
-         resp = requests.get(url)
-         if resp.ok:
-             import xml.etree.ElementTree as ET
-             root = ET.fromstring(resp.content)
-             found = False
-             for entry in root.findall("{http://www.w3.org/2005/Atom}entry"):
-                 published = entry.find("{http://www.w3.org/2005/Atom}published").text[:4]
-                 if start_year <= int(published) <= end_year:
-                     found = True
-                     title = entry.find("{http://www.w3.org/2005/Atom}title").text.strip()
-                     link = entry.find("{http://www.w3.org/2005/Atom}id").text.strip()
-                     st.markdown(f"**[{title}]({link})**({published})")
-             if not found:
-                 st.warning("No matching papers were found in the selected year range.")
-         else:
-             st.error("arXiv query failed")
 
  import streamlit as st
+ from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
+ import io
+
+ # Set the Streamlit app title
+ st.title("PDF Bullet-Point Summarizer 🗟 🔏")
+
+ # PDF upload widget
+ uploaded_file = st.file_uploader("Please upload your PDF file", type="pdf")
+
+ # Summary-ratio slider
+ summary_scale = st.slider("Choose a summary ratio (%)", min_value=1, max_value=100, value=20)
+
+ # Generate-summary button
+ submit_button = st.button("Generate summary")
+
+ # Run only when the button is pressed and a file has been uploaded
+ if submit_button and uploaded_file is not None:
+     with st.spinner('Processing, please wait...'):
+         # Read the PDF content
+         text = read_pdf(io.BytesIO(uploaded_file.getvalue()))
+         # Extract key phrases
+         key_phrases = extract_key_phrases(text)
+         # Score the sentences
+         sentence_scores = score_sentences(text, key_phrases)
+         # Work out how many bullet points to show
+         total_sentences = len(sentence_scores)
+         num_points = max(1, total_sentences * summary_scale // 100)
+         # Build the bullet-point summary
+         summary = summarize_text(sentence_scores, num_points=num_points)
+         # Display the result
+         st.subheader("Summary:")
+         st.markdown(summary)
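The bullet-count line above is plain integer arithmetic with a floor of one; a quick sanity check of the formula (sample values hypothetical):

```python
total_sentences = 40
summary_scale = 20  # slider set to 20 %
num_points = max(1, total_sentences * summary_scale // 100)
print(num_points)   # 8 bullets; a 3-sentence PDF at 1 % would still yield 1
```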
 
gitattributes CHANGED
@@ -1 +1,39 @@
- * text=auto
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ en_core_web_sm/en_core_web_sm-3.7.1/ner/model filter=lfs diff=lfs merge=lfs -text
+ en_core_web_sm/en_core_web_sm-3.7.1/tok2vec/model filter=lfs diff=lfs merge=lfs -text
+ en_core_web_sm-3.7.1/ner/model filter=lfs diff=lfs merge=lfs -text
+ en_core_web_sm-3.7.1/tok2vec/model filter=lfs diff=lfs merge=lfs -text
main.py ADDED
@@ -0,0 +1,41 @@
+ import PyPDF2
+ import spacy
+ from collections import Counter
+ import heapq
+ import io
+
+ # Load the spaCy model vendored in this repo
+ nlp = spacy.load("./en_core_web_sm-3.7.1")
+
+ def read_pdf(file_stream):
+     """Read the text content of a PDF."""
+     text = ''
+     reader = PyPDF2.PdfReader(file_stream)
+     for page in reader.pages:
+         # extract_text() can return None on image-only pages
+         text += (page.extract_text() or '') + ' '
+     return text.strip()
+
+ def extract_key_phrases(text):
+     """Extract key phrases and named entities."""
+     doc = nlp(text)
+     key_phrases = [chunk.text for chunk in doc.noun_chunks] + [ent.text for ent in doc.ents]
+     return key_phrases
+
+ def score_sentences(text, key_phrases):
+     """Score each sentence by how many key phrases it contains."""
+     sentence_scores = {}
+     doc = nlp(text)
+     for sent in doc.sents:
+         for phrase in key_phrases:
+             if phrase in sent.text:
+                 sentence_scores[sent] = sentence_scores.get(sent, 0) + 1
+     return sentence_scores
+
+ def summarize_text(sentence_scores, num_points=5):
+     """Pick the top-scoring sentences and format them as bullet points."""
+     summary_sentences = heapq.nlargest(num_points, sentence_scores, key=sentence_scores.get)
+     summary = '\n'.join(f"- {sent.text}" for sent in summary_sentences)
+     return summary
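A minimal way to exercise these functions outside Streamlit, run from the repo root so the vendored model path resolves (the input filename here is hypothetical):

```python
from main import read_pdf, extract_key_phrases, score_sentences, summarize_text

with open("example.pdf", "rb") as f:  # hypothetical local PDF
    text = read_pdf(f)

key_phrases = extract_key_phrases(text)
sentence_scores = score_sentences(text, key_phrases)
print(summarize_text(sentence_scores, num_points=5))
```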
requirements.txt CHANGED
@@ -1,5 +1,2 @@
- transformers==4.41.1
- streamlit==1.35.0
- torch
- PyPDF2
- requests
+ PyPDF2
+ spacy==3.7.4
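Note: `streamlit` is no longer pinned here, presumably because the Space's `sdk: streamlit` front matter has the platform provide it; for a local run you would install it yourself (e.g. `pip install streamlit`). Likewise no `python -m spacy download en_core_web_sm` step is listed, since `main.py` loads the model from the repo-local `en_core_web_sm-3.7.1` directory tracked via the LFS rules in `gitattributes` above.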