Files changed (8)
  1. README.md +37 -12
  2. app.py +21 -102
  3. gitattributes +6 -0
  4. papersearch.py +28 -151
  5. pdfpass.py +8 -28
  6. pdfsum.py +32 -123
  7. requirements.txt +0 -0
  8. textsumm.py +26 -23
README.md CHANGED
@@ -1,12 +1,37 @@
- ---
- title: Pdf Tools Suite
- emoji: 📚
- colorFrom: gray
- colorTo: purple
- sdk: streamlit
- sdk_version: 1.42.1
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+ ---
+ title: PDF工具箱(多功能PDF助手)
+ emoji: 📄
+ colorFrom: blue
+ colorTo: green
+ sdk: streamlit
+ sdk_version: 1.35.0
+ app_file: app.py
+ pinned: false
+ license: mit
+ ---
+
+ # 📄 PDF Toolbox (all-in-one, full-featured)
+
+ A multi-function PDF processing platform with a Chinese-language interface. It supports:
+
+ - **Text summarization**: automatically generate key-point summaries with OpenAI GPT-4/4.1/4.5 models
+ - **PDF summarization**: summarize long PDF documents
+ - **PDF password removal**: remove passwords from encrypted PDFs
+ - **arXiv paper search**: search and filter papers through a Chinese-language interface
+ - **PDF merging**, **page splitting**, **text extraction**, and more
+ - Chinese-language interface and instructions throughout, suited to education, research, and administrative use
+
+ ## Usage
+
+ 1. Enter your OpenAI API Key (beginning with sk- or sk-proj-) in the sidebar
+ 2. Select the GPT model you need (gpt-4, gpt-4.1, gpt-4.5)
+ 3. Pick a feature tab on the left and upload documents as needed
+ 4. Every step includes Chinese-language prompts
+
+ > 💡 **Note**: the API Key is used only for the current session and is never stored on the server.
+
+ ## Contact and contributions
+
+ Suggestions for improvements or additional features are welcome; please open an issue on Hugging Face or GitHub.
+
+ ---

app.py CHANGED
@@ -1,109 +1,28 @@
  import streamlit as st
- from textsumm import summarizer
- from pdfsum import extract_text_from_pdf, summarize_text, split_text_into_chunks
- from pdfpass import remove_pdf_password
- from papersearch import fetch_papers, filter_papers_by_year
- from io import BytesIO
- from datetime import datetime
- from pypdf import PdfReader, PdfWriter

- # Streamlit App Config
- st.set_page_config(page_title="PDF Tools Suite", page_icon="📄", layout="wide")

- # Sidebar Navigation
- st.sidebar.title("📄 PDF Tools Suite")
- page = st.sidebar.radio("Select a tool", ["Text Summarizer", "PDF Summarizer", "PDF Password Remover", "Research Paper Search", "PDF Merger", "PDF Splitter", "PDF to Text Converter"])

- # Tool: Text Summarizer
- if page == "Text Summarizer":
-     st.title("📝 Text Summarizer")
-     user_input = st.text_area("Enter text to summarize")
-     if st.button("Summarize"):
-         summary = summarizer(user_input, max_length=130, min_length=30, do_sample=False)
-         st.subheader("Summary")
-         st.write(summary[0]["summary_text"])

- # Tool: PDF Summarizer
- elif page == "PDF Summarizer":
-     st.title("📜 PDF Summarizer")
-     uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
      if uploaded_file is not None:
-         pdf_text = extract_text_from_pdf(uploaded_file)
-         chunks = split_text_into_chunks(pdf_text)
-         summaries = summarize_text(chunks)
-         full_summary = " ".join(summaries)
-         st.subheader("Summary")
-         st.write(full_summary)

- # Tool: PDF Password Remover
- elif page == "PDF Password Remover":
-     st.title("🔑 Remove PDF Password")
-     uploaded_file = st.file_uploader("Choose a password-protected PDF", type=["pdf"])
-     password = st.text_input("Enter the PDF password", type="password")
-     if uploaded_file and password and st.button("Remove Password"):
-         output = remove_pdf_password(uploaded_file, password)
-         if isinstance(output, BytesIO):
-             st.success("Password removed successfully!")
-             st.download_button("Download PDF", data=output, file_name="unlocked_pdf.pdf", mime="application/pdf")
-         else:
-             st.error(f"Error: {output}")

- # Tool: Research Paper Search
- elif page == "Research Paper Search":
-     st.title("🔍 Research Paper Search (arXiv)")
-     query = st.text_input("Enter topic or keywords", placeholder="e.g., machine learning")
-     max_results = st.slider("Number of results", 1, 50, 10)
-     col1, col2 = st.columns(2)
-     with col1:
-         start_year = st.number_input("Start Year", min_value=1900, max_value=datetime.now().year, value=2000)
-     with col2:
-         end_year = st.number_input("End Year", min_value=1900, max_value=datetime.now().year, value=datetime.now().year)
-     if st.button("Search"):
-         papers = fetch_papers(query, max_results)
-         papers_filtered = filter_papers_by_year(papers, start_year, end_year)
-         if papers_filtered:
-             for idx, paper in enumerate(papers_filtered, start=1):
-                 st.write(f"### {idx}. {paper['title']}")
-                 st.write(f"**Authors**: {', '.join(paper['authors'])}")
-                 st.write(f"**Published**: {paper['published']}")
-                 st.write(f"[Read More]({paper['link']})")
-                 st.write("---")
-         else:
-             st.warning("No papers found in the selected range.")

- # Tool: PDF Merger
- elif page == "PDF Merger":
-     st.title("📎 Merge Multiple PDFs")
-     uploaded_files = st.file_uploader("Upload multiple PDF files", type=["pdf"], accept_multiple_files=True)
-     if uploaded_files and st.button("Merge PDFs"):
-         pdf_writer = PdfWriter()
-         for file in uploaded_files:
-             pdf_reader = PdfReader(file)
-             for page in pdf_reader.pages:
-                 pdf_writer.add_page(page)
-         output = BytesIO()
-         pdf_writer.write(output)
-         output.seek(0)
-         st.download_button("Download Merged PDF", data=output, file_name="merged.pdf", mime="application/pdf")

- # Tool: PDF Splitter
- elif page == "PDF Splitter":
-     st.title("✂️ Split PDF into Pages")
-     uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
-     if uploaded_file:
          pdf_reader = PdfReader(uploaded_file)
-         for i, page in enumerate(pdf_reader.pages):
-             pdf_writer = PdfWriter()
-             pdf_writer.add_page(page)
-             output = BytesIO()
-             pdf_writer.write(output)
-             output.seek(0)
-             st.download_button(f"Download Page {i+1}", data=output, file_name=f"page_{i+1}.pdf", mime="application/pdf")

- # Tool: PDF to Text Converter
- elif page == "PDF to Text Converter":
-     st.title("📜 Extract Text from PDF")
-     uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
-     if uploaded_file:
-         pdf_text = extract_text_from_pdf(uploaded_file)
-         st.text_area("Extracted Text", pdf_text, height=300)
 
  import streamlit as st
+ from pdfsum import 摘要
+ # Other features can likewise be imported from their own .py files

+ st.set_page_config(page_title="PDF 工具箱", layout="wide")

+ st.sidebar.title("PDF 工具箱")
+ api_key = st.sidebar.text_input("請輸入 OpenAI API 金鑰", type="password")
+ gpt_model = st.sidebar.radio("選擇 GPT 模型", ["gpt-4", "gpt-4.0", "gpt-4.1", "gpt-4.5"])
+ 功能 = st.sidebar.radio("選擇功能", ["文字摘要", "PDF 摘要", "PDF 密碼移除", "論文搜尋", "PDF 合併", "PDF 拆頁", "PDF 轉純文字"])

+ st.title("PDF 摘要")

+ if 功能 == "PDF 摘要":
+     uploaded_file = st.file_uploader("上傳你的PDF檔案", type=["pdf"])
      if uploaded_file is not None:
+         # Read the contents of the PDF file
+         from PyPDF2 import PdfReader
          pdf_reader = PdfReader(uploaded_file)
+         內容 = ""
+         for page in pdf_reader.pages:
+             內容 += page.extract_text() or ""
+         if st.button("產生 PDF 摘要"):
+             st.info("正在產生摘要,請稍候...")
+             result = 摘要(內容)
+             st.success(result)
+ else:
+     st.info("請選擇功能")

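For reference, the extract-then-summarize flow that the new app.py wires into Streamlit can also be exercised without the UI. This is a minimal sketch, not part of the changed files; the file name `example.pdf` is a placeholder, and it assumes PyPDF2 and the new `pdfsum.摘要` helper are importable.

```python
# Minimal sketch (not part of this PR): run the PDF-summary flow headlessly.
# Assumes PyPDF2 is installed and pdfsum.摘要 is on the path; "example.pdf" is a placeholder.
from PyPDF2 import PdfReader
from pdfsum import 摘要

def summarize_pdf(path: str) -> str:
    reader = PdfReader(path)
    # Concatenate page text, skipping pages with no extractable text
    text = "".join(page.extract_text() or "" for page in reader.pages)
    return 摘要(text)

if __name__ == "__main__":
    print(summarize_pdf("example.pdf"))
```
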
gitattributes ADDED
@@ -0,0 +1,6 @@
+ # Git LFS attributes file (used for large-file handling)
+ *.pdf filter=lfs diff=lfs merge=lfs -text
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+
+ # Note: the rules above route PDF and image files through Git LFS (friendlier handling of large files)
papersearch.py CHANGED
@@ -1,154 +1,31 @@
- # import streamlit as st
- # import requests
- # import xmltodict
-
- # # arXiv API base URL
- # ARXIV_API_BASE = "http://export.arxiv.org/api/query"
-
- # def fetch_papers(query, max_results=10):
- #     """Fetch papers from the arXiv API."""
- #     try:
- #         # Build the API query URL
- #         api_url = f"{ARXIV_API_BASE}?search_query=all:{query}&start=0&max_results={max_results}"
-
- #         # Make the API request
- #         response = requests.get(api_url, headers={'Accept': 'application/xml'})
- #         response.raise_for_status()
-
- #         # Parse the XML response
- #         data = xmltodict.parse(response.text)
- #         entries = data.get('feed', {}).get('entry', [])
-
- #         if not isinstance(entries, list):  # Handle single result
- #             entries = [entries]
-
- #         # Extract relevant fields
- #         papers = []
- #         for entry in entries:
- #             papers.append({
- #                 'title': entry.get('title'),
- #                 'summary': entry.get('summary'),
- #                 'published': entry.get('published'),
- #                 'authors': [author['name'] for author in entry.get('author', [])] if isinstance(entry.get('author'), list) else [entry.get('author', {}).get('name')],
- #                 'link': entry.get('id')
- #             })
-
- #         return papers
- #     except Exception as e:
- #         st.error(f"Error fetching papers: {e}")
- #         return []
-
- # # Streamlit app UI
- # st.title("arXiv Research Paper Search")
- # st.subheader("Find academic papers on your topic of interest")
-
- # # Input fields
- # query = st.text_input("Enter a topic or keywords", placeholder="e.g., machine learning, quantum computing")
- # max_results = st.slider("Number of results", min_value=1, max_value=50, value=10)
-
- # if st.button("Search"):
- #     if query.strip():
- #         st.info(f"Searching for papers on: **{query}**")
- #         papers = fetch_papers(query, max_results)
-
- #         if papers:
- #             st.success(f"Found {len(papers)} papers!")
- #             for idx, paper in enumerate(papers, start=1):
- #                 st.write(f"### {idx}. {paper['title']}")
- #                 st.write(f"**Authors**: {', '.join(paper['authors'])}")
- #                 st.write(f"**Published**: {paper['published']}")
- #                 st.write(f"[Read More]({paper['link']})")
- #                 st.write("---")
- #         else:
- #             st.warning("No papers found. Try a different query.")
- #     else:
- #         st.error("Please enter a topic or keywords to search.")
-
- import streamlit as st
  import requests
- import xmltodict
  from datetime import datetime

- # arXiv API base URL
- ARXIV_API_BASE = "http://export.arxiv.org/api/query"
-
- def fetch_papers(query, max_results=10):
-     """Fetch papers from the arXiv API."""
-     try:
-         # Build the API query URL
-         api_url = f"{ARXIV_API_BASE}?search_query=all:{query}&start=0&max_results={max_results}"
-
-         # Make the API request
-         response = requests.get(api_url, headers={'Accept': 'application/xml'})
-         response.raise_for_status()
-
-         # Parse the XML response
-         data = xmltodict.parse(response.text)
-         entries = data.get('feed', {}).get('entry', [])
-
-         if not isinstance(entries, list):  # Handle single result
-             entries = [entries]
-
-         # Extract relevant fields
-         papers = []
-         for entry in entries:
-             papers.append({
-                 'title': entry.get('title'),
-                 'summary': entry.get('summary'),
-                 'published': entry.get('published'),
-                 'authors': [author['name'] for author in entry.get('author', [])] if isinstance(entry.get('author'), list) else [entry.get('author', {}).get('name')],
-                 'link': entry.get('id')
-             })
-
-         return papers
-     except Exception as e:
-         st.error(f"Error fetching papers: {e}")
-         return []
-
- def filter_papers_by_year(papers, start_year, end_year):
-     """Filter papers by the publication year range."""
-     filtered_papers = []
-     for paper in papers:
-         try:
-             published_year = int(paper['published'][:4])  # Extract year from the published date
-             if start_year <= published_year <= end_year:
-                 filtered_papers.append(paper)
-         except ValueError:
-             continue  # Skip if the year is not valid
-     return filtered_papers
-
- # Streamlit app UI
- st.title("arXiv Research Paper Search")
- st.subheader("Find academic papers on your topic of interest")
-
- # Input fields
- query = st.text_input("Enter a topic or keywords", placeholder="e.g., machine learning, quantum computing")
- max_results = st.slider("Number of results", min_value=1, max_value=50, value=10)
-
- # Year filter
- col1, col2 = st.columns(2)
- with col1:
-     start_year = st.number_input("Start Year", min_value=1900, max_value=datetime.now().year, value=2000, step=1)
- with col2:
-     end_year = st.number_input("End Year", min_value=1900, max_value=datetime.now().year, value=datetime.now().year, step=1)
-
- if st.button("Search"):
-     if query.strip():
-         st.info(f"Searching for papers on: **{query}**")
-         papers = fetch_papers(query, max_results)
-
-         # Filter papers by year
-         papers_filtered = filter_papers_by_year(papers, start_year, end_year)
-
-         if papers_filtered:
-             st.success(f"Found {len(papers_filtered)} papers between {start_year} and {end_year}!")
-             for idx, paper in enumerate(papers_filtered, start=1):
-                 st.write(f"### {idx}. {paper['title']}")
-                 st.write(f"**Authors**: {', '.join(paper['authors'])}")
-                 st.write(f"**Published**: {paper['published']}")
-                 st.write(f"[Read More]({paper['link']})")
-                 st.write("---")
-         else:
-             st.warning(f"No papers found between {start_year} and {end_year}. Try a different query or adjust the year range.")
-     else:
-         st.error("Please enter a topic or keywords to search.")

  import requests
+ import xml.etree.ElementTree as ET
  from datetime import datetime

+ def 抓取論文(關鍵字, 最大數量=10):
+     """
+     Search arXiv for papers matching a keyword (most recently updated first).
+     """
+     url = f"https://export.arxiv.org/api/query?search_query=all:{關鍵字}&start=0&max_results={最大數量}&sortBy=lastUpdatedDate"
+     res = requests.get(url)
+     root = ET.fromstring(res.content)
+     論文清單 = []
+     for entry in root.findall('{http://www.w3.org/2005/Atom}entry'):
+         論文清單.append({
+             "標題": entry.find('{http://www.w3.org/2005/Atom}title').text.strip(),
+             "作者": [author.find('{http://www.w3.org/2005/Atom}name').text for author in entry.findall('{http://www.w3.org/2005/Atom}author')],
+             "發表時間": entry.find('{http://www.w3.org/2005/Atom}published').text[:10],
+             "連結": entry.find('{http://www.w3.org/2005/Atom}id').text
+         })
+     return 論文清單
+
+ def 篩選論文依年份(論文清單, 起始, 結束):
+     """
+     Filter papers to an inclusive range of publication years.
+     """
+     篩選 = []
+     for 論文 in 論文清單:
+         年份 = int(論文["發表時間"][:4])
+         if 起始 <= 年份 <= 結束:
+             篩選.append(論文)
+     return 篩選

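A minimal usage sketch (not part of the changed files) showing how the two new helpers compose; the query string and year range here are examples only.

```python
# Minimal sketch (not part of this PR): fetch recent arXiv results and keep 2020-2024 papers.
from papersearch import 抓取論文, 篩選論文依年份

論文 = 抓取論文("machine learning", 最大數量=10)
近期 = 篩選論文依年份(論文, 2020, 2024)
for p in 近期:
    print(p["發表時間"], p["標題"], p["連結"])
```
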
pdfpass.py CHANGED
@@ -1,40 +1,20 @@
- import streamlit as st
- from PyPDF2 import PdfReader, PdfWriter
  from io import BytesIO

- def remove_pdf_password(file, password):
      try:
-         reader = PdfReader(file)
          if reader.is_encrypted:
-             reader.decrypt(password)
          writer = PdfWriter()
          for page in reader.pages:
              writer.add_page(page)

          output = BytesIO()
          writer.write(output)
          output.seek(0)
          return output
      except Exception as e:
-         return str(e)
-
- st.title("PDF Password Remover")
- st.write("Upload a password-protected PDF and remove its password.")
-
- # File upload
- uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
- password = st.text_input("Enter the PDF password", type="password")
-
- if uploaded_file and password:
-     if st.button("Remove Password"):
-         output = remove_pdf_password(uploaded_file, password)
-         if isinstance(output, BytesIO):
-             st.success("Password removed successfully!")
-             st.download_button(
-                 label="Download PDF without Password",
-                 data=output,
-                 file_name="unlocked_pdf.pdf",
-                 mime="application/pdf",
-             )
-         else:
-             st.error(f"Error: {output}")
 
+ from pypdf import PdfReader, PdfWriter
  from io import BytesIO

+ def 移除_pdf密碼(pdf檔案, 密碼):
+     """
+     Unlock a password-protected PDF; returns the unlocked file as BytesIO, or an error message string.
+     """
      try:
+         reader = PdfReader(pdf檔案)
          if reader.is_encrypted:
+             reader.decrypt(密碼)
          writer = PdfWriter()
          for page in reader.pages:
              writer.add_page(page)

          output = BytesIO()
          writer.write(output)
          output.seek(0)
          return output
      except Exception as e:
+         return f"解鎖失敗:{e}"

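A minimal usage sketch (not part of the changed files) for the new helper: the file name and password below are placeholders, and the sketch relies on the documented return contract (BytesIO on success, an error string on failure).

```python
# Minimal sketch (not part of this PR): unlock a protected PDF and save it to disk.
# "locked.pdf" and "secret" are placeholders.
from io import BytesIO
from pdfpass import 移除_pdf密碼

result = 移除_pdf密碼("locked.pdf", "secret")
if isinstance(result, BytesIO):
    with open("unlocked.pdf", "wb") as f:
        f.write(result.getvalue())
else:
    print(result)  # error message, e.g. "解鎖失敗:..."
```
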
pdfsum.py CHANGED
@@ -1,125 +1,34 @@
- # import streamlit as st
- # from transformers import pipeline
- # from PyPDF2 import PdfReader
-
- # # Initialize the summarizer
- # summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
-
- # def extract_text_from_pdf(pdf_file):
- #     """Extract text from an uploaded PDF file."""
- #     try:
- #         reader = PdfReader(pdf_file)
- #         text = ""
- #         for page in reader.pages:
- #             page_text = page.extract_text()
- #             if page_text:  # Skip pages with no text
- #                 text += page_text + "\n"
- #         return text
- #     except Exception as e:
- #         raise ValueError(f"Error extracting text from PDF: {e}")
-
- # def split_text_into_chunks(text, max_chunk_size=1024):
- #     """Split the text into smaller chunks for summarization."""
- #     chunks = []
- #     while len(text) > max_chunk_size:
- #         split_point = text.rfind(". ", 0, max_chunk_size) + 1  # Split at the last sentence boundary
- #         if split_point == 0:  # No sentence boundary found, split arbitrarily
- #             split_point = max_chunk_size
- #         chunks.append
-
- # # Streamlit Dashboard
- # st.title("PDF Summarizer")
- # st.write("Upload a PDF file to get a summarized version of its content.")
-
- # uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
-
- # if uploaded_file is not None:
- #     # Extract text from the PDF
- #     st.write("Processing your PDF...")
- #     try:
- #         pdf_text = extract_text_from_pdf(uploaded_file)
- #         st.write("PDF content extracted successfully.")
-
- #         # Display extracted text (optional)
- #         with st.expander("View Extracted Text"):
- #             st.text_area("Extracted Text", pdf_text, height=300)
-
- #         # Summarize the extracted text
- #         if st.button("Summarize"):
- #             st.write("Generating summary...")
- #             summary = summarizer(pdf_text, max_length=130, min_length=30, do_sample=False)
- #             st.subheader("Summary")
- #             st.write(summary[0]["summary_text"])
- #     except Exception as e:
- #         st.error(f"An error occurred while processing the PDF: {str(e)}")
-
- import streamlit as st
  from transformers import pipeline
- import pdfplumber
-
- # Initialize the summarizer
- summarizer = pipeline("summarization", model="t5-small")
-
- def extract_text_from_pdf(pdf_file):
-     """Extract text from an uploaded PDF file using pdfplumber."""
-     try:
-         text = ""
-         with pdfplumber.open(pdf_file) as pdf:
-             for page in pdf.pages:
-                 text += page.extract_text() + "\n"
-         if not text.strip():
-             raise ValueError("No extractable text found in the PDF.")
-         return text
-     except Exception as e:
-         raise ValueError(f"Error extracting text from PDF: {e}")
-
- def split_text_into_chunks(text, max_chunk_size=1024):
-     """Split the text into smaller chunks for summarization."""
-     chunks = []
-     while len(text) > max_chunk_size:
-         split_point = text.rfind(". ", 0, max_chunk_size) + 1  # Find the last full sentence
-         if split_point == 0:  # No sentence boundary found, split arbitrarily
-             split_point = max_chunk_size
-         chunks.append(text[:split_point])
-         text = text[split_point:]
-     if text:
-         chunks.append(text)
-     return chunks
-
- def summarize_text(chunks):
-     """Summarize each chunk of text with dynamic max_length."""
-     summaries = []
-     for chunk in chunks:
-         input_length = len(chunk.split())  # Approximate token count
-         max_length = max(48, int(input_length * 0.8))  # Set max_length to 80% of input length
-         summary = summarizer(chunk, max_length=max_length, min_length=10, do_sample=False)
-         summaries.append(summary[0]["summary_text"])
-     return summaries
-
- # Streamlit Dashboard
- st.title("PDF Summarizer")
- st.write("Upload a PDF file to get a summarized version of its content.")
-
- uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])

- if uploaded_file is not None:
-     try:
-         # Extract text from the PDF
-         st.write("Processing your PDF...")
-         pdf_text = extract_text_from_pdf(uploaded_file)
-         st.write("PDF content extracted successfully.")
-
-         # Display extracted text (optional)
-         with st.expander("View Extracted Text"):
-             st.text_area("Extracted Text", pdf_text, height=300)
-
-         # Summarize the extracted text
-         if st.button("Summarize"):
-             st.write("Generating summary...")
-             chunks = split_text_into_chunks(pdf_text)
-             summaries = summarize_text(chunks)
-             full_summary = " ".join(summaries)
-             st.subheader("Summary")
-             st.write(full_summary)
-     except Exception as e:
-         st.error(f"An error occurred while processing the PDF: {str(e)}")

  from transformers import pipeline

+ # Use the Chinese Pegasus summarization model from Hugging Face
+ # If you are running in a Hugging Face Space, the lines below can be used directly
+ summarizer = pipeline(
+     "summarization",
+     model="IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese",
+     tokenizer="IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese",
+     device=0  # if a GPU is available; otherwise set device=-1
+ )
+
+ def 摘要(pdf_純文字):
+     """
+     Summarize Chinese PDF text (works for both Traditional and Simplified Chinese).
+     """
+     if not pdf_純文字 or len(pdf_純文字.strip()) < 20:
+         return "⚠️ PDF 內容為空或無法解析(可能是掃描檔或圖片)"
+
+     段落列表 = [p.strip() for p in pdf_純文字.split('\n') if p.strip()]
+     摘要結果 = []
+     for 段 in 段落列表:
+         # Pegasus max_length is capped at 128
+         if len(段) < 30:
+             continue
+         # Slice the paragraph into 400-character chunks
+         for i in range(0, len(段), 400):
+             子段 = 段[i:i+400]
+             try:
+                 out = summarizer(子段, max_length=64, min_length=10, do_sample=False)
+                 if out and len(out) > 0:
+                     摘要結果.append(out[0]['summary_text'])
+             except Exception as e:
+                 摘要結果.append(f"(錯誤:{e})")
+     return "\n".join(摘要結果) if 摘要結果 else "⚠️ 沒有找到可摘要的內容!"
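The in-code comment notes that `device=0` assumes a GPU. A minimal sketch (not part of the changed files) of picking the device at runtime instead, assuming torch is installed alongside transformers:

```python
# Minimal sketch (not part of this PR): choose the pipeline device automatically,
# since device=0 fails on CPU-only hardware.
import torch
from transformers import pipeline

device = 0 if torch.cuda.is_available() else -1  # GPU index 0, or CPU
summarizer = pipeline(
    "summarization",
    model="IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese",
    tokenizer="IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese",
    device=device,
)
```
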
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
 
textsumm.py CHANGED
@@ -1,28 +1,31 @@
  from transformers import pipeline

  summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
- ARTICLE ="""
- There is widespread international concern that Russia's war will provoke a global food crisis similar to, or
- worse than, that faced in 2007 and 2008. The war comes at a time when the global food system was already
- struggling to feed its growing population in a sustainable way, under the pressure caused by climate change
- and the Covid-19 pandemic. Russia and Ukraine are key agricultural players, together exporting nearly 12
- % of food calories traded globally. They are major providers of basic agro-commodities, including wheat,
- maize and sunflower oil, and Russia is the world's top exporter of fertilisers. The global supply chain will
- get impacted until Russia and Ukraine retreat and will end the war.
- The war's impact on global food supply centred on three factors. First is a significant reduction in exports
- and production of essential commodities from both countries, caused by the war and not the economic
- sanctions imposed on Russia, which, intentionally, did not target the agricultural sector. Overall, the
- European Commission estimates that 'up to 25 million tonnes of wheat would need to be substituted to
- meet worldwide food needs in the current and the next season. Second factor is a global spike in prices of
- food supplies and inputs needed for agri-food production, which were already at record levels before the
- war. The war has further pushed the prices up. Third factor is the international response to the above,
- which could either amplify the effects of the crisis (mainly by uncoordinated export bans) or mitigate them
- (applying lessons learnt from the 2007-2008 food crisis). A number of countries, other than Russia and
- Ukraine, have already imposed or announced their intention to impose some control over exports of
- essential agricultural commodities, including Egypt, Argentina, Indonesia, Serbia, Turkey and, in the EU,
- Hungary. We should keep this in our mind that the long duration of war will make the global situation
- irrecoverable.
-
- """
- print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))

+ # textsumm.py
+ # Chinese-localized summarization module; just install transformers and torch to use it
+
  from transformers import pipeline

+ # Initialize the summarization pipeline
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

+ def 文字摘要(輸入文本, 最長長度=120, 最短長度=40):
+     """
+     Input: a block of text
+     Output: its summary
+     """
+     if len(輸入文本.strip()) == 0:
+         return "❗️ 請輸入需要摘要的內容。"
+
+     try:
+         結果 = summarizer(
+             輸入文本,
+             max_length=最長長度,
+             min_length=最短長度,
+             do_sample=False
+         )
+         return 結果[0]['summary_text']
+     except Exception as e:
+         return f"❌ 摘要生成失敗:{str(e)}"

+ # To try it out, uncomment the lines below
+ # if __name__ == "__main__":
+ #     測試文本 = "人工智慧(AI)是研究如何讓電腦模擬人類智能行為的學科,包括學習、推理、規劃、自然語言處理、知覺等。AI 技術已廣泛應用於語音辨識、影像分析、自駕車等領域。"
+ #     print(文字摘要(測試文本))
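A short usage sketch (not part of the changed files) for the new helper. The sample paragraph and length bounds are illustrative only; facebook/bart-large-cnn is an English-language model, so English input is used here.

```python
# Minimal sketch (not part of this PR): call 文字摘要 with custom length bounds.
from textsumm import 文字摘要

text = (
    "Streamlit is an open-source Python framework for building data apps. "
    "It turns plain scripts into shareable web apps with widgets for file "
    "uploads and text input, which is how this PDF toolbox exposes its tools."
)
print(文字摘要(text, 最長長度=60, 最短長度=20))
```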