3a05chatgpt committed
Commit f1e5728 · verified · 1 parent: 93c008b

Upload 8 files

Files changed (8)
  1. README.md +37 -12
  2. app.py +95 -72
  3. gitattributes +6 -35
  4. papersearch.py +28 -151
  5. pdfpass.py +8 -28
  6. pdfsum.py +30 -123
  7. requirements.txt +0 -0
  8. textsumm.py +7 -24
README.md CHANGED
@@ -1,12 +1,37 @@
- ---
- title: Pdf Tools Suite
- emoji: 📚
- colorFrom: gray
- colorTo: purple
- sdk: streamlit
- sdk_version: 1.42.1
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: PDF工具箱(多功能PDF助手)
+ emoji: 📄
+ colorFrom: blue
+ colorTo: green
+ sdk: streamlit
+ sdk_version: 1.35.0
+ app_file: app.py
+ pinned: false
+ license: mit
+ ---
+
+ # 📄 PDF Toolbox (all-in-one)
+
+ A multi-purpose PDF processing platform with a fully Chinese-localized interface, supporting:
+
+ - **Text summarization**: automatically generates key-point summaries with OpenAI GPT-4/4.1/4.5 models
+ - **PDF summarization**: handles long PDF documents
+ - **PDF password removal**: strips passwords from encrypted PDFs
+ - **arXiv paper search**: search and filter papers from a Chinese-language interface
+ - **PDF merging**, **page splitting**, **text extraction**, and more
+ - Chinese interface and help text throughout, suited to education, research, and administrative use
+
+ ## Usage
+
+ 1. Enter your OpenAI API key (starting with sk- or sk-proj-) in the sidebar
+ 2. Choose a GPT model (gpt-4, gpt-4.1, gpt-4.5)
+ 3. Pick a tool from the sidebar and upload files as needed
+ 4. Every step comes with Chinese prompts
+
+ > 💡 **Note**: your API key is used only for the current session and is never stored on the server.
+
+ ## Contact and contributing
+
+ Improvement suggestions and new features are welcome; please open an issue on Hugging Face or GitHub.
+
+ ---
app.py CHANGED
@@ -1,81 +1,104 @@
  import streamlit as st
- from textsumm import summarizer
- from pdfsum import extract_text_from_pdf, summarize_text, split_text_into_chunks
- from pdfpass import remove_pdf_password
- from papersearch import fetch_papers, filter_papers_by_year
  from io import BytesIO
  from datetime import datetime
  from pypdf import PdfReader, PdfWriter

- # Streamlit App Config
- st.set_page_config(page_title="PDF Tools Suite", page_icon="📄", layout="wide")
-
- # Sidebar Navigation
- st.sidebar.title("📄 PDF Tools Suite")
- page = st.sidebar.radio("Select a tool", ["Text Summarizer", "PDF Summarizer", "PDF Password Remover", "Research Paper Search", "PDF Merger", "PDF Splitter", "PDF to Text Converter"])
-
- # Tool: Text Summarizer
- if page == "Text Summarizer":
-     st.title("📝 Text Summarizer")
-     user_input = st.text_area("Enter text to summarize")
-     if st.button("Summarize"):
-         summary = summarizer(user_input, max_length=130, min_length=30, do_sample=False)
-         st.subheader("Summary")
-         st.write(summary[0]["summary_text"])
-
- # Tool: PDF Summarizer
- elif page == "PDF Summarizer":
-     st.title("📜 PDF Summarizer")
-     uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
-     if uploaded_file is not None:
-         pdf_text = extract_text_from_pdf(uploaded_file)
-         chunks = split_text_into_chunks(pdf_text)
-         summaries = summarize_text(chunks)
-         full_summary = " ".join(summaries)
-         st.subheader("Summary")
-         st.write(full_summary)
-
- # Tool: PDF Password Remover
- elif page == "PDF Password Remover":
-     st.title("🔑 Remove PDF Password")
-     uploaded_file = st.file_uploader("Choose a password-protected PDF", type=["pdf"])
-     password = st.text_input("Enter the PDF password", type="password")
-     if uploaded_file and password and st.button("Remove Password"):
-         output = remove_pdf_password(uploaded_file, password)
          if isinstance(output, BytesIO):
-             st.success("Password removed successfully!")
-             st.download_button("Download PDF", data=output, file_name="unlocked_pdf.pdf", mime="application/pdf")
          else:
-             st.error(f"Error: {output}")

- # Tool: Research Paper Search
- elif page == "Research Paper Search":
-     st.title("🔍 Research Paper Search (arXiv)")
-     query = st.text_input("Enter topic or keywords", placeholder="e.g., machine learning")
-     max_results = st.slider("Number of results", 1, 50, 10)
      col1, col2 = st.columns(2)
      with col1:
-         start_year = st.number_input("Start Year", min_value=1900, max_value=datetime.now().year, value=2000)
      with col2:
-         end_year = st.number_input("End Year", min_value=1900, max_value=datetime.now().year, value=datetime.now().year)
-     if st.button("Search"):
-         papers = fetch_papers(query, max_results)
-         papers_filtered = filter_papers_by_year(papers, start_year, end_year)
-         if papers_filtered:
-             for idx, paper in enumerate(papers_filtered, start=1):
-                 st.write(f"### {idx}. {paper['title']}")
-                 st.write(f"**Authors**: {', '.join(paper['authors'])}")
-                 st.write(f"**Published**: {paper['published']}")
-                 st.write(f"[Read More]({paper['link']})")
                  st.write("---")
          else:
-             st.warning("No papers found in the selected range.")

- # Tool: PDF Merger
- elif page == "PDF Merger":
-     st.title("📎 Merge Multiple PDFs")
-     uploaded_files = st.file_uploader("Upload multiple PDF files", type=["pdf"], accept_multiple_files=True)
-     if uploaded_files and st.button("Merge PDFs"):
          pdf_writer = PdfWriter()
          for file in uploaded_files:
              pdf_reader = PdfReader(file)
@@ -84,12 +107,12 @@ elif page == "PDF Merger":
          output = BytesIO()
          pdf_writer.write(output)
          output.seek(0)
-         st.download_button("Download Merged PDF", data=output, file_name="merged.pdf", mime="application/pdf")

- # Tool: PDF Splitter
- elif page == "PDF Splitter":
-     st.title("✂️ Split PDF into Pages")
-     uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
      if uploaded_file:
          pdf_reader = PdfReader(uploaded_file)
          for i, page in enumerate(pdf_reader.pages):
@@ -98,12 +121,12 @@ elif page == "PDF Splitter":
              output = BytesIO()
              pdf_writer.write(output)
              output.seek(0)
-             st.download_button(f"Download Page {i+1}", data=output, file_name=f"page_{i+1}.pdf", mime="application/pdf")

- # Tool: PDF to Text Converter
- elif page == "PDF to Text Converter":
-     st.title("📜 Extract Text from PDF")
-     uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
      if uploaded_file:
-         pdf_text = extract_text_from_pdf(uploaded_file)
-         st.text_area("Extracted Text", pdf_text, height=300)
  import streamlit as st
+ import openai
+ from textsumm import 文字摘要
+ from pdfsum import 提取_pdf文字, 分段, 摘要
+ from pdfpass import 移除_pdf密碼
+ from papersearch import 抓取論文, 篩選論文依年份
  from io import BytesIO
  from datetime import datetime
  from pypdf import PdfReader, PdfWriter

+ # ---- Must run before any other st.* call ----
+ st.set_page_config(page_title="PDF 工具箱", page_icon="📄", layout="wide")

+ # ---- Sidebar (API key and model selection) ----
+ st.sidebar.title("📄 PDF 工具箱")
+ api_key = st.sidebar.text_input("請輸入 OpenAI API 金鑰", type="password", placeholder="sk-...")
+ selected_model = st.sidebar.radio("選擇 GPT 模型", ["gpt-4", "gpt-4.1", "gpt-4.5"], index=0)

+ if api_key:
+     openai.api_key = api_key
+ else:
+     st.sidebar.warning("請輸入你的 OpenAI API Key(sk- 或 sk-proj- 開頭)")

+ # ---- Tool pages ----
+ page = st.sidebar.radio(
+     "選擇功能",
+     [
+         "文字摘要",
+         "PDF 摘要",
+         "PDF 密碼移除",
+         "論文搜尋",
+         "PDF 合併",
+         "PDF 拆頁",
+         "PDF 轉純文字"
+     ]
+ )

+ # Text summarization
+ if page == "文字摘要":
+     st.title("📝 文字摘要")
+     user_input = st.text_area("請輸入要摘要的文字")
+     if st.button("生成摘要"):
+         if not api_key:
+             st.error("請先輸入 OpenAI API 金鑰!")
+         else:
+             結果 = 文字摘要(user_input)
+             st.subheader("摘要結果")
+             st.write(結果[0]["summary_text"])
+
+ # PDF summarization
+ elif page == "PDF 摘要":
+     st.title("📜 PDF 摘要")
+     uploaded_file = st.file_uploader("上傳你的 PDF 檔案", type=["pdf"])
+     if uploaded_file is not None and st.button("產生 PDF 摘要"):
+         pdf_text = 提取_pdf文字(uploaded_file)
+         段落們 = 分段(pdf_text)
+         全部摘要 = " ".join(摘要(段落們))
+         st.subheader("摘要結果")
+         st.write(全部摘要)
+
+ # PDF password removal
+ elif page == "PDF 密碼移除":
+     st.title("🔑 PDF 密碼移除")
+     uploaded_file = st.file_uploader("選擇需要解鎖的 PDF 檔案", type=["pdf"])
+     password = st.text_input("請輸入 PDF 密碼", type="password")
+     if uploaded_file and password and st.button("移除密碼"):
+         output = 移除_pdf密碼(uploaded_file, password)
          if isinstance(output, BytesIO):
+             st.success("密碼移除成功!")
+             st.download_button("下載已解鎖的 PDF", data=output, file_name="unlocked_pdf.pdf", mime="application/pdf")
          else:
+             st.error(f"錯誤:{output}")

+ # arXiv paper search
+ elif page == "論文搜尋":
+     st.title("🔍 論文搜尋(arXiv)")
+     query = st.text_input("輸入主題或關鍵字", placeholder="例如:人工智慧、量子計算")
+     max_results = st.slider("結果數量", 1, 50, 10)
      col1, col2 = st.columns(2)
      with col1:
+         start_year = st.number_input("起始年份", min_value=1900, max_value=datetime.now().year, value=2000)
      with col2:
+         end_year = st.number_input("結束年份", min_value=1900, max_value=datetime.now().year, value=datetime.now().year)
+     if st.button("搜尋論文"):
+         papers = 抓取論文(query, max_results)
+         篩選後 = 篩選論文依年份(papers, start_year, end_year)
+         if 篩選後:
+             for idx, 論文 in enumerate(篩選後, start=1):
+                 st.write(f"### {idx}. {論文['標題']}")
+                 st.write(f"**作者**: {', '.join(論文['作者'])}")
+                 st.write(f"**發表時間**: {論文['發表時間']}")
+                 st.write(f"[閱讀全文]({論文['連結']})")
                  st.write("---")
          else:
+             st.warning("在所選年份範圍內沒有找到相關論文。")

+ # PDF merging
+ elif page == "PDF 合併":
+     st.title("📎 多檔 PDF 合併")
+     uploaded_files = st.file_uploader("上傳多個 PDF 檔案", type=["pdf"], accept_multiple_files=True)
+     if uploaded_files and st.button("合併 PDF"):
          pdf_writer = PdfWriter()
          for file in uploaded_files:
              pdf_reader = PdfReader(file)
          output = BytesIO()
          pdf_writer.write(output)
          output.seek(0)
+         st.download_button("下載合併後的 PDF", data=output, file_name="merged.pdf", mime="application/pdf")

+ # PDF splitting
+ elif page == "PDF 拆頁":
+     st.title("✂️ PDF 拆頁")
+     uploaded_file = st.file_uploader("上傳一個 PDF", type=["pdf"])
      if uploaded_file:
          pdf_reader = PdfReader(uploaded_file)
          for i, page in enumerate(pdf_reader.pages):
              output = BytesIO()
              pdf_writer.write(output)
              output.seek(0)
+             st.download_button(f"下載第 {i+1} 頁", data=output, file_name=f"page_{i+1}.pdf", mime="application/pdf")

+ # PDF to plain text
+ elif page == "PDF 轉純文字":
+     st.title("📜 PDF 轉純文字")
+     uploaded_file = st.file_uploader("上傳 PDF", type=["pdf"])
      if uploaded_file:
+         pdf_text = 提取_pdf文字(uploaded_file)
+         st.text_area("擷取內容", pdf_text, height=300)
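Note that the sidebar above only stores the key via `openai.api_key = api_key`; the summarization tools themselves run on a local transformers pipeline. A minimal sketch of how the selected key and model could back a GPT summary, assuming the pre-1.0 `openai` client that the module-level `openai.api_key` assignment implies (`gpt_summary` is a hypothetical helper, not part of this commit):

```python
import openai

def gpt_summary(text: str, model: str = "gpt-4") -> str:
    # Hypothetical helper: relies on openai.api_key having been set in the sidebar.
    # Uses the pre-1.0 openai.ChatCompletion API to request a key-point summary.
    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": f"Summarize the key points of:\n\n{text}"}],
    )
    return response["choices"][0]["message"]["content"]
```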
gitattributes CHANGED
@@ -1,35 +1,6 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ # Git LFS attribute settings (for handling large files)
+ *.pdf filter=lfs diff=lfs merge=lfs -text
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+
+ # Note: the rules above route PDF and image files through Git LFS (friendlier for large files)
papersearch.py CHANGED
@@ -1,154 +1,31 @@
- # import streamlit as st
- # import requests
- # import xmltodict
-
- # # arXiv API base URL
- # ARXIV_API_BASE = "http://export.arxiv.org/api/query"
-
- # def fetch_papers(query, max_results=10):
- #     """Fetch papers from the arXiv API."""
- #     try:
- #         # Build the API query URL
- #         api_url = f"{ARXIV_API_BASE}?search_query=all:{query}&start=0&max_results={max_results}"
-
- #         # Make the API request
- #         response = requests.get(api_url, headers={'Accept': 'application/xml'})
- #         response.raise_for_status()
-
- #         # Parse the XML response
- #         data = xmltodict.parse(response.text)
- #         entries = data.get('feed', {}).get('entry', [])
-
- #         if not isinstance(entries, list): # Handle single result
- #             entries = [entries]
-
- #         # Extract relevant fields
- #         papers = []
- #         for entry in entries:
- #             papers.append({
- #                 'title': entry.get('title'),
- #                 'summary': entry.get('summary'),
- #                 'published': entry.get('published'),
- #                 'authors': [author['name'] for author in entry.get('author', [])] if isinstance(entry.get('author'), list) else [entry.get('author', {}).get('name')],
- #                 'link': entry.get('id')
- #             })
-
- #         return papers
- #     except Exception as e:
- #         st.error(f"Error fetching papers: {e}")
- #         return []
-
- # # Streamlit app UI
- # st.title("arXiv Research Paper Search")
- # st.subheader("Find academic papers on your topic of interest")
-
- # # Input fields
- # query = st.text_input("Enter a topic or keywords", placeholder="e.g., machine learning, quantum computing")
- # max_results = st.slider("Number of results", min_value=1, max_value=50, value=10)
-
- # if st.button("Search"):
- #     if query.strip():
- #         st.info(f"Searching for papers on: **{query}**")
- #         papers = fetch_papers(query, max_results)
-
- #         if papers:
- #             st.success(f"Found {len(papers)} papers!")
- #             for idx, paper in enumerate(papers, start=1):
- #                 st.write(f"### {idx}. {paper['title']}")
- #                 st.write(f"**Authors**: {', '.join(paper['authors'])}")
- #                 st.write(f"**Published**: {paper['published']}")
- #                 st.write(f"[Read More]({paper['link']})")
- #                 st.write("---")
- #         else:
- #             st.warning("No papers found. Try a different query.")
- #     else:
- #         st.error("Please enter a topic or keywords to search.")
-
- import streamlit as st
  import requests
- import xmltodict
  from datetime import datetime

- # arXiv API base URL
- ARXIV_API_BASE = "http://export.arxiv.org/api/query"
-
- def fetch_papers(query, max_results=10):
-     """Fetch papers from the arXiv API."""
-     try:
-         # Build the API query URL
-         api_url = f"{ARXIV_API_BASE}?search_query=all:{query}&start=0&max_results={max_results}"
-
-         # Make the API request
-         response = requests.get(api_url, headers={'Accept': 'application/xml'})
-         response.raise_for_status()
-
-         # Parse the XML response
-         data = xmltodict.parse(response.text)
-         entries = data.get('feed', {}).get('entry', [])
-
-         if not isinstance(entries, list): # Handle single result
-             entries = [entries]
-
-         # Extract relevant fields
-         papers = []
-         for entry in entries:
-             papers.append({
-                 'title': entry.get('title'),
-                 'summary': entry.get('summary'),
-                 'published': entry.get('published'),
-                 'authors': [author['name'] for author in entry.get('author', [])] if isinstance(entry.get('author'), list) else [entry.get('author', {}).get('name')],
-                 'link': entry.get('id')
-             })
-
-         return papers
-     except Exception as e:
-         st.error(f"Error fetching papers: {e}")
-         return []
-
- def filter_papers_by_year(papers, start_year, end_year):
-     """Filter papers by the publication year range."""
-     filtered_papers = []
-     for paper in papers:
-         try:
-             published_year = int(paper['published'][:4]) # Extract year from the published date
-             if start_year <= published_year <= end_year:
-                 filtered_papers.append(paper)
-         except ValueError:
-             continue # Skip if the year is not valid
-     return filtered_papers
-
- # Streamlit app UI
- st.title("arXiv Research Paper Search")
- st.subheader("Find academic papers on your topic of interest")
-
- # Input fields
- query = st.text_input("Enter a topic or keywords", placeholder="e.g., machine learning, quantum computing")
- max_results = st.slider("Number of results", min_value=1, max_value=50, value=10)
-
- # Year filter
- col1, col2 = st.columns(2)
- with col1:
-     start_year = st.number_input("Start Year", min_value=1900, max_value=datetime.now().year, value=2000, step=1)
- with col2:
-     end_year = st.number_input("End Year", min_value=1900, max_value=datetime.now().year, value=datetime.now().year, step=1)
-
- if st.button("Search"):
-     if query.strip():
-         st.info(f"Searching for papers on: **{query}**")
-         papers = fetch_papers(query, max_results)
-
-         # Filter papers by year
-         papers_filtered = filter_papers_by_year(papers, start_year, end_year)
-
-         if papers_filtered:
-             st.success(f"Found {len(papers_filtered)} papers between {start_year} and {end_year}!")
-             for idx, paper in enumerate(papers_filtered, start=1):
-                 st.write(f"### {idx}. {paper['title']}")
-                 st.write(f"**Authors**: {', '.join(paper['authors'])}")
-                 st.write(f"**Published**: {paper['published']}")
-                 st.write(f"[Read More]({paper['link']})")
-                 st.write("---")
-         else:
-             st.warning(f"No papers found between {start_year} and {end_year}. Try a different query or adjust the year range.")
-     else:
-         st.error("Please enter a topic or keywords to search.")
  import requests
+ import xml.etree.ElementTree as ET
  from datetime import datetime

+ def 抓取論文(關鍵字, 最大數量=10):
+     """
+     Search arXiv for papers matching a keyword (most recently updated first)
+     """
+     url = f"https://export.arxiv.org/api/query?search_query=all:{關鍵字}&start=0&max_results={最大數量}&sortBy=lastUpdatedDate"
+     res = requests.get(url)
+     root = ET.fromstring(res.content)
+     論文清單 = []
+     for entry in root.findall('{http://www.w3.org/2005/Atom}entry'):
+         論文清單.append({
+             "標題": entry.find('{http://www.w3.org/2005/Atom}title').text.strip(),
+             "作者": [author.find('{http://www.w3.org/2005/Atom}name').text for author in entry.findall('{http://www.w3.org/2005/Atom}author')],
+             "發表時間": entry.find('{http://www.w3.org/2005/Atom}published').text[:10],
+             "連結": entry.find('{http://www.w3.org/2005/Atom}id').text
+         })
+     return 論文清單
+
+ def 篩選論文依年份(論文清單, 起始, 結束):
+     """
+     Filter papers to a publication-year range
+     """
+     篩選 = []
+     for 論文 in 論文清單:
+         年份 = int(論文["發表時間"][:4])
+         if 起始 <= 年份 <= 結束:
+             篩選.append(論文)
+     return 篩選
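A quick usage sketch for the two helpers above (query and year values are illustrative; requires network access to export.arxiv.org):

```python
from datetime import datetime
from papersearch import 抓取論文, 篩選論文依年份

# Fetch up to 5 recent papers on a sample topic, then keep those published since 2020
papers = 抓取論文("quantum computing", 最大數量=5)
近期 = 篩選論文依年份(papers, 2020, datetime.now().year)
for 論文 in 近期:
    print(論文["標題"], "-", 論文["連結"])
```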
pdfpass.py CHANGED
@@ -1,40 +1,20 @@
- import streamlit as st
- from PyPDF2 import PdfReader, PdfWriter
  from io import BytesIO

- def remove_pdf_password(file, password):
      try:
-         reader = PdfReader(file)
          if reader.is_encrypted:
-             reader.decrypt(password)
          writer = PdfWriter()
          for page in reader.pages:
              writer.add_page(page)
-
          output = BytesIO()
          writer.write(output)
          output.seek(0)
          return output
      except Exception as e:
-         return str(e)
-
- st.title("PDF Password Remover")
- st.write("Upload a password-protected PDF and remove its password.")
-
- # File upload
- uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
- password = st.text_input("Enter the PDF password", type="password")
-
- if uploaded_file and password:
-     if st.button("Remove Password"):
-         output = remove_pdf_password(uploaded_file, password)
-         if isinstance(output, BytesIO):
-             st.success("Password removed successfully!")
-             st.download_button(
-                 label="Download PDF without Password",
-                 data=output,
-                 file_name="unlocked_pdf.pdf",
-                 mime="application/pdf",
-             )
-         else:
-             st.error(f"Error: {output}")
+ from pypdf import PdfReader, PdfWriter
  from io import BytesIO

+ def 移除_pdf密碼(pdf檔案, 密碼):
+     """
+     Unlock a password-protected PDF; returns the unlocked file (BytesIO) or an error message
+     """
      try:
+         reader = PdfReader(pdf檔案)
          if reader.is_encrypted:
+             reader.decrypt(密碼)
          writer = PdfWriter()
          for page in reader.pages:
              writer.add_page(page)
          output = BytesIO()
          writer.write(output)
          output.seek(0)
          return output
      except Exception as e:
+         return f"解鎖失敗:{e}"
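A usage sketch for 移除_pdf密碼 (file names and password are placeholders); the function returns a BytesIO on success and an error string otherwise:

```python
from io import BytesIO
from pdfpass import 移除_pdf密碼

with open("locked.pdf", "rb") as f:          # placeholder path
    result = 移除_pdf密碼(f, "my-password")  # placeholder password
if isinstance(result, BytesIO):
    with open("unlocked.pdf", "wb") as out:
        out.write(result.getvalue())
else:
    print(result)  # error message, e.g. 解鎖失敗:...
```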
pdfsum.py CHANGED
@@ -1,125 +1,32 @@
- # import streamlit as st
- # from transformers import pipeline
- # from PyPDF2 import PdfReader
-
- # # Initialize the summarizer
- # summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
-
- # def extract_text_from_pdf(pdf_file):
- #     """Extract text from an uploaded PDF file."""
- #     try:
- #         reader = PdfReader(pdf_file)
- #         text = ""
- #         for page in reader.pages:
- #             page_text = page.extract_text()
- #             if page_text: # Skip pages with no text
- #                 text += page_text + "\n"
- #         return text
- #     except Exception as e:
- #         raise ValueError(f"Error extracting text from PDF: {e}")
-
- # def split_text_into_chunks(text, max_chunk_size=1024):
- #     """Split the text into smaller chunks for summarization."""
- #     chunks = []
- #     while len(text) > max_chunk_size:
- #         split_point = text.rfind(". ", 0, max_chunk_size) + 1 # Split at the last sentence boundary
- #         if split_point == 0: # No sentence boundary found, split arbitrarily
- #             split_point = max_chunk_size
- #         chunks.append
-
- # # Streamlit Dashboard
- # st.title("PDF Summarizer")
- # st.write("Upload a PDF file to get a summarized version of its content.")
-
- # uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
-
- # if uploaded_file is not None:
- #     # Extract text from the PDF
- #     st.write("Processing your PDF...")
- #     try:
- #         pdf_text = extract_text_from_pdf(uploaded_file)
- #         st.write("PDF content extracted successfully.")
-
- #         # Display extracted text (optional)
- #         with st.expander("View Extracted Text"):
- #             st.text_area("Extracted Text", pdf_text, height=300)
-
- #         # Summarize the extracted text
- #         if st.button("Summarize"):
- #             st.write("Generating summary...")
- #             summary = summarizer(pdf_text, max_length=130, min_length=30, do_sample=False)
- #             st.subheader("Summary")
- #             st.write(summary[0]["summary_text"])
- #     except Exception as e:
- #         st.error(f"An error occurred while processing the PDF: {str(e)}")
-
- import streamlit as st
  from transformers import pipeline
- import pdfplumber
-
- # Initialize the summarizer
- summarizer = pipeline("summarization", model="t5-small")
-
- def extract_text_from_pdf(pdf_file):
-     """Extract text from an uploaded PDF file using pdfplumber."""
-     try:
-         text = ""
-         with pdfplumber.open(pdf_file) as pdf:
-             for page in pdf.pages:
-                 text += page.extract_text() + "\n"
-         if not text.strip():
-             raise ValueError("No extractable text found in the PDF.")
-         return text
-     except Exception as e:
-         raise ValueError(f"Error extracting text from PDF: {e}")
-
- def split_text_into_chunks(text, max_chunk_size=1024):
-     """Split the text into smaller chunks for summarization."""
-     chunks = []
-     while len(text) > max_chunk_size:
-         split_point = text.rfind(". ", 0, max_chunk_size) + 1 # Find the last full sentence
-         if split_point == 0: # No sentence boundary found, split arbitrarily
-             split_point = max_chunk_size
-         chunks.append(text[:split_point])
-         text = text[split_point:]
-     if text:
-         chunks.append(text)
-     return chunks
-
- def summarize_text(chunks):
-     """Summarize each chunk of text with dynamic max_length."""
-     summaries = []
-     for chunk in chunks:
-         input_length = len(chunk.split()) # Approximate token count
-         max_length = max(48, int(input_length * 0.8)) # Set max_length to 80% of input length
-         summary = summarizer(chunk, max_length=max_length, min_length=10, do_sample=False)
-         summaries.append(summary[0]["summary_text"])
-     return summaries
-
- # Streamlit Dashboard
- st.title("PDF Summarizer")
- st.write("Upload a PDF file to get a summarized version of its content.")
-
- uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])

- if uploaded_file is not None:
-     try:
-         # Extract text from the PDF
-         st.write("Processing your PDF...")
-         pdf_text = extract_text_from_pdf(uploaded_file)
-         st.write("PDF content extracted successfully.")
-
-         # Display extracted text (optional)
-         with st.expander("View Extracted Text"):
-             st.text_area("Extracted Text", pdf_text, height=300)
-
-         # Summarize the extracted text
-         if st.button("Summarize"):
-             st.write("Generating summary...")
-             chunks = split_text_into_chunks(pdf_text)
-             summaries = summarize_text(chunks)
-             full_summary = " ".join(summaries)
-             st.subheader("Summary")
-             st.write(full_summary)
-     except Exception as e:
-         st.error(f"An error occurred while processing the PDF: {str(e)}")
+ from PyPDF2 import PdfReader
  from transformers import pipeline

+ # You can also swap in a Chinese BART or T5 summarization model here
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+
+ def 提取_pdf文字(pdf檔案):
+     """
+     Read a PDF file and merge the text of all pages into one plain-text string
+     """
+     reader = PdfReader(pdf檔案)
+     內容 = ""
+     for 頁面 in reader.pages:
+         內容 += 頁面.extract_text() or ""  # extract_text() can return None for empty pages
+     return 內容
+
+ def 分段(內容, 每段字數=2000):
+     """
+     Split long text into chunks (easier for the model to process)
+     """
+     return [內容[i:i+每段字數] for i in range(0, len(內容), 每段字數)]
+
+ def 摘要(段落們):
+     """
+     Summarize each chunk; the caller joins the results into one overall summary
+     """
+     結果 = []
+     for 段 in 段落們:
+         結果.append(
+             summarizer(段, max_length=130, min_length=30, do_sample=False)[0]["summary_text"]
+         )
+     return 結果
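How the three helpers above are meant to chain together (sketch with a placeholder file name; the first call to pipeline() downloads the model):

```python
from pdfsum import 提取_pdf文字, 分段, 摘要

with open("paper.pdf", "rb") as f:  # placeholder path
    全文 = 提取_pdf文字(f)
段落們 = 分段(全文, 每段字數=2000)
print(" ".join(摘要(段落們)))  # join the per-chunk summaries into one
```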
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
 
textsumm.py CHANGED
@@ -1,28 +1,11 @@
  from transformers import pipeline

  summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
- ARTICLE ="""
- There is widespread international concern that Russia's war will provoke a global food crisis similar to, or
- worse than, that faced in 2007 and 2008. The war comes at a time when the global food system was already
- struggling to feed its growing population in a sustainable way, under the pressure caused by climate change
- and the Covid-19 pandemic. Russia and Ukraine are key agricultural players, together exporting nearly 12
- % of food calories traded globally. They are major providers of basic agro-commodities, including wheat,
- maize and sunflower oil, and Russia is the world's top exporter of fertilisers. The global supply chain will
- get impacted until Russia and Ukraine retreat and will end the war.
- The war's impact on global food supply centred on three factors. First is a significant reduction in exports
- and production of essential commodities from both countries, caused by the war and not the economic
- sanctions imposed on Russia, which, intentionally, did not target the agricultural sector. Overall, the
- European Commission estimates that 'up to 25 million tonnes of wheat would need to be substituted to
- meet worldwide food needs in the current and the next season. Second factor is a global spike in prices of
- food supplies and inputs needed for agri-food production, which were already at record levels before the
- war. The war has further pushed the prices up. Third factor is the international response to the above,
- which could either amplify the effects of the crisis (mainly by uncoordinated export bans) or mitigate them
- (applying lessons learnt from the 2007-2008 food crisis). A number of countries, other than Russia and
- Ukraine, have already imposed or announced their intention to impose some control over exports of
- essential agricultural commodities, including Egypt, Argentina, Indonesia, Serbia, Turkey and, in the EU,
- Hungary. We should keep this in our mind that the long duration of war will make the global situation
- irrecoverable.
-
- """
- print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))
  from transformers import pipeline

+ # Build the summarization pipeline (facebook/bart-large-cnn summarizes in English; swap in a Chinese model if needed)
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

+ def 文字摘要(輸入文本, max_length=130, min_length=30, do_sample=False):
+     """
+     Automatically summarize the input plain text into its key points
+     """
+     result = summarizer(輸入文本, max_length=max_length, min_length=min_length, do_sample=do_sample)
+     return result
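A usage sketch for 文字摘要 (the sample text is illustrative; bart-large-cnn expects and produces English):

```python
from textsumm import 文字摘要

樣本 = (
    "Streamlit lets developers turn plain Python scripts into interactive web apps. "
    "It re-runs the script on every interaction and caches expensive computations, "
    "so simple tools like this PDF suite can be built without writing any front-end code."
)
print(文字摘要(樣本, max_length=60, min_length=20)[0]["summary_text"])
```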