3a05chatgpt committed
Commit 93c008b · verified · 1 Parent(s): 3e5b93f

Upload 8 files

Files changed (8)
  1. README.md +5 -1
  2. app.py +109 -168
  3. gitattributes +35 -0
  4. papersearch.py +154 -0
  5. pdfpass.py +40 -0
  6. pdfsum.py +125 -0
  7. requirements.txt +0 -0
  8. textsumm.py +28 -0
README.md CHANGED
@@ -1,3 +1,4 @@
+ ---
  title: Pdf Tools Suite
  emoji: 📚
  colorFrom: gray
@@ -5,4 +6,7 @@ colorTo: purple
  sdk: streamlit
  sdk_version: 1.42.1
  app_file: app.py
- pinned: false
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
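
With this frontmatter in place, Hugging Face Spaces launches app.py with the Streamlit SDK. For reference, a local run would presumably look like the following (assuming the dependencies from requirements.txt are installed; the exact pins are not visible in this diff):

pip install -r requirements.txt
streamlit run app.py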
app.py CHANGED
@@ -1,168 +1,109 @@
- import openai
- import gradio as gr
- import fitz  # PyMuPDF
- from openai import OpenAI
- import traceback
-
- # Global variables
- api_key = ""
- selected_model = "gpt-4"
- summary_text = ""
- client = None
- pdf_text = ""
-
- def set_api_key(user_api_key):
-     """Set the OpenAI API Key and initialize the client."""
-     global api_key, client
-     try:
-         api_key = user_api_key.strip()
-         if not api_key:
-             return "❌ API Key cannot be empty"
-
-         # Support both old and new key formats
-         if not (api_key.startswith('sk-') or api_key.startswith('sk-proj-')):
-             return "❌ Invalid API Key format; it must start with 'sk-' or 'sk-proj-'"
-
-         client = OpenAI(api_key=api_key)
-
-         # Test whether the API Key is valid
-         test_response = client.chat.completions.create(
-             model="gpt-4",
-             messages=[{"role": "user", "content": "Hello"}],
-             max_tokens=5
-         )
-         return "✅ API Key set and verified successfully!"
-     except Exception as e:
-         if "incorrect_api_key" in str(e).lower():
-             return "❌ API Key is invalid; please check that it is correct"
-         elif "quota" in str(e).lower():
-             return "⚠️ API Key is valid, but the quota is insufficient"
-         else:
-             return f"❌ Failed to set API Key: {str(e)}"
-
- def set_model(model_name):
-     global selected_model
-     selected_model = model_name
-     return f"✅ Model selected: {model_name}"
-
- def extract_pdf_text(file_path):
-     try:
-         doc = fitz.open(file_path)
-         text = ""
-         for page_num, page in enumerate(doc):
-             page_text = page.get_text()
-             if page_text.strip():
-                 text += f"\n--- {page_num + 1} ---\n{page_text}"
-         doc.close()
-         return text
-     except Exception as e:
-         return f"❌ PDF parsing error: {str(e)}"
-
- def generate_summary(pdf_file):
-     global summary_text, pdf_text
-     if not client:
-         return "❌ Please set your OpenAI API Key first"
-     if not pdf_file:
-         return "❌ Please upload a PDF file first"
-     try:
-         pdf_text = extract_pdf_text(pdf_file.name)
-         if not pdf_text.strip():
-             return "⚠️ Could not extract text from the PDF; it may be image-only or blank."
-         pdf_text_truncated = pdf_text[:8000]
-         response = client.chat.completions.create(
-             model=selected_model,
-             messages=[
-                 {"role": "system", "content": "Organize the following PDF content into a bulleted summary; answer in Traditional Chinese:"},
-                 {"role": "user", "content": pdf_text_truncated}
-             ],
-             temperature=0.3
-         )
-         summary_text = response.choices[0].message.content
-         return summary_text
-     except Exception as e:
-         print(traceback.format_exc())
-         return f"❌ Summary generation failed: {str(e)}"
-
- def ask_question(user_question):
-     if not client:
-         return "❌ Please set your OpenAI API Key first"
-     if not summary_text and not pdf_text:
-         return "❌ Please generate the PDF summary first"
-     if not user_question.strip():
-         return "❌ Please enter a question"
-     try:
-         context = f"PDF summary:\n{summary_text}\n\nOriginal content (partial):\n{pdf_text[:2000]}"
-         response = client.chat.completions.create(
-             model=selected_model,
-             messages=[
-                 {"role": "system", "content": f"Answer the question based on the following PDF content; answer in Traditional Chinese:\n{context}"},
-                 {"role": "user", "content": user_question}
-             ],
-             temperature=0.2
-         )
-         return response.choices[0].message.content
-     except Exception as e:
-         print(traceback.format_exc())
-         return f"❌ Q&A generation failed: {str(e)}"
-
- def clear_all():
-     global summary_text, pdf_text
-     summary_text = ""
-     pdf_text = ""
-     return "", "", ""
-
- with gr.Blocks(
-     title="PDF Summary Assistant",
-     css="""
-     .gradio-container {
-         max-width: none !important;
-         width: 100% !important;
-         background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
-         min-height: 100vh;
-     }
-     .main-content {
-         max-width: 1600px !important;
-         margin: 20px auto !important;
-         padding: 30px !important;
-         background: rgba(255, 255, 255, 0.95) !important;
-         border-radius: 20px !important;
-     }
-     """
- ) as demo:
-     with gr.Column():
-         gr.Markdown("## 📄 PDF Summary & Q&A Assistant")
-
-         with gr.Tab("🔧 Settings"):
-             api_key_input = gr.Textbox(label="🔑 Enter OpenAI API Key", type="password")
-             api_key_status = gr.Textbox(label="API Status", interactive=False, value="Waiting for API Key...")
-             api_key_btn = gr.Button("Confirm API Key")
-             api_key_btn.click(set_api_key, inputs=api_key_input, outputs=api_key_status)
-
-             model_choice = gr.Radio(["gpt-4", "gpt-4.1", "gpt-4.5"], label="Select AI Model", value="gpt-4")
-             model_status = gr.Textbox(label="Model Status", interactive=False, value="✅ Selected: gpt-4")
-             model_choice.change(set_model, inputs=model_choice, outputs=model_status)
-
-         with gr.Tab("📄 Summary"):
-             pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
-             summary_btn = gr.Button("Generate Summary")
-             summary_output = gr.Textbox(label="PDF Summary", lines=12)
-             summary_btn.click(generate_summary, inputs=pdf_upload, outputs=summary_output)
-
-         with gr.Tab("❓ Q&A"):
-             question_input = gr.Textbox(label="Enter your question", lines=2)
-             question_btn = gr.Button("Submit Question")
-             answer_output = gr.Textbox(label="AI Answer", lines=8)
-             question_btn.click(ask_question, inputs=question_input, outputs=answer_output)
-             question_input.submit(ask_question, inputs=question_input, outputs=answer_output)
-
-         clear_btn = gr.Button("🗑️ Clear All Data")
-         clear_btn.click(clear_all, outputs=[summary_output, question_input, answer_output])
-
- if __name__ == "__main__":
-     demo.launch(
-         show_error=True,
-         share=True,
-         server_name="0.0.0.0",
-         server_port=7860
-     )
-
+ import streamlit as st
+ from textsumm import summarizer
+ from pdfsum import extract_text_from_pdf, summarize_text, split_text_into_chunks
+ from pdfpass import remove_pdf_password
+ from papersearch import fetch_papers, filter_papers_by_year
+ from io import BytesIO
+ from datetime import datetime
+ from pypdf import PdfReader, PdfWriter
+
+ # Streamlit App Config
+ st.set_page_config(page_title="PDF Tools Suite", page_icon="📄", layout="wide")
+
+ # Sidebar Navigation
+ st.sidebar.title("📄 PDF Tools Suite")
+ page = st.sidebar.radio("Select a tool", ["Text Summarizer", "PDF Summarizer", "PDF Password Remover", "Research Paper Search", "PDF Merger", "PDF Splitter", "PDF to Text Converter"])
+
+ # Tool: Text Summarizer
+ if page == "Text Summarizer":
+     st.title("📝 Text Summarizer")
+     user_input = st.text_area("Enter text to summarize")
+     if st.button("Summarize"):
+         summary = summarizer(user_input, max_length=130, min_length=30, do_sample=False)
+         st.subheader("Summary")
+         st.write(summary[0]["summary_text"])
+
+ # Tool: PDF Summarizer
+ elif page == "PDF Summarizer":
+     st.title("📜 PDF Summarizer")
+     uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
+     if uploaded_file is not None:
+         pdf_text = extract_text_from_pdf(uploaded_file)
+         chunks = split_text_into_chunks(pdf_text)
+         summaries = summarize_text(chunks)
+         full_summary = " ".join(summaries)
+         st.subheader("Summary")
+         st.write(full_summary)
+
+ # Tool: PDF Password Remover
+ elif page == "PDF Password Remover":
+     st.title("🔑 Remove PDF Password")
+     uploaded_file = st.file_uploader("Choose a password-protected PDF", type=["pdf"])
+     password = st.text_input("Enter the PDF password", type="password")
+     if uploaded_file and password and st.button("Remove Password"):
+         output = remove_pdf_password(uploaded_file, password)
+         if isinstance(output, BytesIO):
+             st.success("Password removed successfully!")
+             st.download_button("Download PDF", data=output, file_name="unlocked_pdf.pdf", mime="application/pdf")
+         else:
+             st.error(f"Error: {output}")
+
+ # Tool: Research Paper Search
+ elif page == "Research Paper Search":
+     st.title("🔍 Research Paper Search (arXiv)")
+     query = st.text_input("Enter topic or keywords", placeholder="e.g., machine learning")
+     max_results = st.slider("Number of results", 1, 50, 10)
+     col1, col2 = st.columns(2)
+     with col1:
+         start_year = st.number_input("Start Year", min_value=1900, max_value=datetime.now().year, value=2000)
+     with col2:
+         end_year = st.number_input("End Year", min_value=1900, max_value=datetime.now().year, value=datetime.now().year)
+     if st.button("Search"):
+         papers = fetch_papers(query, max_results)
+         papers_filtered = filter_papers_by_year(papers, start_year, end_year)
+         if papers_filtered:
+             for idx, paper in enumerate(papers_filtered, start=1):
+                 st.write(f"### {idx}. {paper['title']}")
+                 st.write(f"**Authors**: {', '.join(paper['authors'])}")
+                 st.write(f"**Published**: {paper['published']}")
+                 st.write(f"[Read More]({paper['link']})")
+                 st.write("---")
+         else:
+             st.warning("No papers found in the selected range.")
+
+ # Tool: PDF Merger
+ elif page == "PDF Merger":
+     st.title("📎 Merge Multiple PDFs")
+     uploaded_files = st.file_uploader("Upload multiple PDF files", type=["pdf"], accept_multiple_files=True)
+     if uploaded_files and st.button("Merge PDFs"):
+         pdf_writer = PdfWriter()
+         for file in uploaded_files:
+             pdf_reader = PdfReader(file)
+             for pdf_page in pdf_reader.pages:  # named pdf_page to avoid shadowing the sidebar `page` selection
+                 pdf_writer.add_page(pdf_page)
+         output = BytesIO()
+         pdf_writer.write(output)
+         output.seek(0)
+         st.download_button("Download Merged PDF", data=output, file_name="merged.pdf", mime="application/pdf")
+
+ # Tool: PDF Splitter
+ elif page == "PDF Splitter":
+     st.title("✂️ Split PDF into Pages")
+     uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
+     if uploaded_file:
+         pdf_reader = PdfReader(uploaded_file)
+         for i, pdf_page in enumerate(pdf_reader.pages):
+             pdf_writer = PdfWriter()
+             pdf_writer.add_page(pdf_page)
+             output = BytesIO()
+             pdf_writer.write(output)
+             output.seek(0)
+             st.download_button(f"Download Page {i+1}", data=output, file_name=f"page_{i+1}.pdf", mime="application/pdf")
+
+ # Tool: PDF to Text Converter
+ elif page == "PDF to Text Converter":
+     st.title("📜 Extract Text from PDF")
+     uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
+     if uploaded_file:
+         pdf_text = extract_text_from_pdf(uploaded_file)
+         st.text_area("Extracted Text", pdf_text, height=300)
gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
papersearch.py ADDED
@@ -0,0 +1,154 @@
+ # import streamlit as st
+ # import requests
+ # import xmltodict
+
+ # # arXiv API base URL
+ # ARXIV_API_BASE = "http://export.arxiv.org/api/query"
+
+ # def fetch_papers(query, max_results=10):
+ #     """Fetch papers from the arXiv API."""
+ #     try:
+ #         # Build the API query URL
+ #         api_url = f"{ARXIV_API_BASE}?search_query=all:{query}&start=0&max_results={max_results}"
+
+ #         # Make the API request
+ #         response = requests.get(api_url, headers={'Accept': 'application/xml'})
+ #         response.raise_for_status()
+
+ #         # Parse the XML response
+ #         data = xmltodict.parse(response.text)
+ #         entries = data.get('feed', {}).get('entry', [])
+
+ #         if not isinstance(entries, list):  # Handle single result
+ #             entries = [entries]
+
+ #         # Extract relevant fields
+ #         papers = []
+ #         for entry in entries:
+ #             papers.append({
+ #                 'title': entry.get('title'),
+ #                 'summary': entry.get('summary'),
+ #                 'published': entry.get('published'),
+ #                 'authors': [author['name'] for author in entry.get('author', [])] if isinstance(entry.get('author'), list) else [entry.get('author', {}).get('name')],
+ #                 'link': entry.get('id')
+ #             })
+
+ #         return papers
+ #     except Exception as e:
+ #         st.error(f"Error fetching papers: {e}")
+ #         return []
+
+ # # Streamlit app UI
+ # st.title("arXiv Research Paper Search")
+ # st.subheader("Find academic papers on your topic of interest")
+
+ # # Input fields
+ # query = st.text_input("Enter a topic or keywords", placeholder="e.g., machine learning, quantum computing")
+ # max_results = st.slider("Number of results", min_value=1, max_value=50, value=10)
+
+ # if st.button("Search"):
+ #     if query.strip():
+ #         st.info(f"Searching for papers on: **{query}**")
+ #         papers = fetch_papers(query, max_results)
+
+ #         if papers:
+ #             st.success(f"Found {len(papers)} papers!")
+ #             for idx, paper in enumerate(papers, start=1):
+ #                 st.write(f"### {idx}. {paper['title']}")
+ #                 st.write(f"**Authors**: {', '.join(paper['authors'])}")
+ #                 st.write(f"**Published**: {paper['published']}")
+ #                 st.write(f"[Read More]({paper['link']})")
+ #                 st.write("---")
+ #         else:
+ #             st.warning("No papers found. Try a different query.")
+ #     else:
+ #         st.error("Please enter a topic or keywords to search.")
+
+ import streamlit as st
+ import requests
+ import xmltodict
+ from datetime import datetime
+
+ # arXiv API base URL
+ ARXIV_API_BASE = "http://export.arxiv.org/api/query"
+
+ def fetch_papers(query, max_results=10):
+     """Fetch papers from the arXiv API."""
+     try:
+         # Build the API query URL
+         api_url = f"{ARXIV_API_BASE}?search_query=all:{query}&start=0&max_results={max_results}"
+
+         # Make the API request
+         response = requests.get(api_url, headers={'Accept': 'application/xml'})
+         response.raise_for_status()
+
+         # Parse the XML response
+         data = xmltodict.parse(response.text)
+         entries = data.get('feed', {}).get('entry', [])
+
+         if not isinstance(entries, list):  # Handle single result
+             entries = [entries]
+
+         # Extract relevant fields
+         papers = []
+         for entry in entries:
+             papers.append({
+                 'title': entry.get('title'),
+                 'summary': entry.get('summary'),
+                 'published': entry.get('published'),
+                 'authors': [author['name'] for author in entry.get('author', [])] if isinstance(entry.get('author'), list) else [entry.get('author', {}).get('name')],
+                 'link': entry.get('id')
+             })
+
+         return papers
+     except Exception as e:
+         st.error(f"Error fetching papers: {e}")
+         return []
+
+ def filter_papers_by_year(papers, start_year, end_year):
+     """Filter papers by the publication year range."""
+     filtered_papers = []
+     for paper in papers:
+         try:
+             published_year = int(paper['published'][:4])  # Extract year from the published date
+             if start_year <= published_year <= end_year:
+                 filtered_papers.append(paper)
+         except (TypeError, ValueError):
+             continue  # Skip if the published date is missing or malformed
+     return filtered_papers
+
+ # Standalone Streamlit UI; the guard keeps it from rendering when app.py imports this module
+ if __name__ == "__main__":
+     st.title("arXiv Research Paper Search")
+     st.subheader("Find academic papers on your topic of interest")
+
+     # Input fields
+     query = st.text_input("Enter a topic or keywords", placeholder="e.g., machine learning, quantum computing")
+     max_results = st.slider("Number of results", min_value=1, max_value=50, value=10)
+
+     # Year filter
+     col1, col2 = st.columns(2)
+     with col1:
+         start_year = st.number_input("Start Year", min_value=1900, max_value=datetime.now().year, value=2000, step=1)
+     with col2:
+         end_year = st.number_input("End Year", min_value=1900, max_value=datetime.now().year, value=datetime.now().year, step=1)
+
+     if st.button("Search"):
+         if query.strip():
+             st.info(f"Searching for papers on: **{query}**")
+             papers = fetch_papers(query, max_results)
+
+             # Filter papers by year
+             papers_filtered = filter_papers_by_year(papers, start_year, end_year)
+
+             if papers_filtered:
+                 st.success(f"Found {len(papers_filtered)} papers between {start_year} and {end_year}!")
+                 for idx, paper in enumerate(papers_filtered, start=1):
+                     st.write(f"### {idx}. {paper['title']}")
+                     st.write(f"**Authors**: {', '.join(paper['authors'])}")
+                     st.write(f"**Published**: {paper['published']}")
+                     st.write(f"[Read More]({paper['link']})")
+                     st.write("---")
+             else:
+                 st.warning(f"No papers found between {start_year} and {end_year}. Try a different query or adjust the year range.")
+         else:
+             st.error("Please enter a topic or keywords to search.")
pdfpass.py ADDED
@@ -0,0 +1,40 @@
+ import streamlit as st
+ from PyPDF2 import PdfReader, PdfWriter
+ from io import BytesIO
+
+ def remove_pdf_password(file, password):
+     try:
+         reader = PdfReader(file)
+         if reader.is_encrypted:
+             reader.decrypt(password)
+         writer = PdfWriter()
+         for page in reader.pages:
+             writer.add_page(page)
+
+         output = BytesIO()
+         writer.write(output)
+         output.seek(0)
+         return output
+     except Exception as e:
+         return str(e)
+
+ # Standalone Streamlit UI; the guard keeps it from rendering when app.py imports this module
+ if __name__ == "__main__":
+     st.title("PDF Password Remover")
+     st.write("Upload a password-protected PDF and remove its password.")
+
+     # File upload
+     uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
+     password = st.text_input("Enter the PDF password", type="password")
+
+     if uploaded_file and password:
+         if st.button("Remove Password"):
+             output = remove_pdf_password(uploaded_file, password)
+             if isinstance(output, BytesIO):
+                 st.success("Password removed successfully!")
+                 st.download_button(
+                     label="Download PDF without Password",
+                     data=output,
+                     file_name="unlocked_pdf.pdf",
+                     mime="application/pdf",
+                 )
+             else:
+                 st.error(f"Error: {output}")
pdfsum.py ADDED
@@ -0,0 +1,125 @@
+ # import streamlit as st
+ # from transformers import pipeline
+ # from PyPDF2 import PdfReader
+
+ # # Initialize the summarizer
+ # summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+
+ # def extract_text_from_pdf(pdf_file):
+ #     """Extract text from an uploaded PDF file."""
+ #     try:
+ #         reader = PdfReader(pdf_file)
+ #         text = ""
+ #         for page in reader.pages:
+ #             page_text = page.extract_text()
+ #             if page_text:  # Skip pages with no text
+ #                 text += page_text + "\n"
+ #         return text
+ #     except Exception as e:
+ #         raise ValueError(f"Error extracting text from PDF: {e}")
+
+ # def split_text_into_chunks(text, max_chunk_size=1024):
+ #     """Split the text into smaller chunks for summarization."""
+ #     chunks = []
+ #     while len(text) > max_chunk_size:
+ #         split_point = text.rfind(". ", 0, max_chunk_size) + 1  # Split at the last sentence boundary
+ #         if split_point == 0:  # No sentence boundary found, split arbitrarily
+ #             split_point = max_chunk_size
+ #         chunks.append
+
+ # # Streamlit Dashboard
+ # st.title("PDF Summarizer")
+ # st.write("Upload a PDF file to get a summarized version of its content.")
+
+ # uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
+
+ # if uploaded_file is not None:
+ #     # Extract text from the PDF
+ #     st.write("Processing your PDF...")
+ #     try:
+ #         pdf_text = extract_text_from_pdf(uploaded_file)
+ #         st.write("PDF content extracted successfully.")
+
+ #         # Display extracted text (optional)
+ #         with st.expander("View Extracted Text"):
+ #             st.text_area("Extracted Text", pdf_text, height=300)
+
+ #         # Summarize the extracted text
+ #         if st.button("Summarize"):
+ #             st.write("Generating summary...")
+ #             summary = summarizer(pdf_text, max_length=130, min_length=30, do_sample=False)
+ #             st.subheader("Summary")
+ #             st.write(summary[0]["summary_text"])
+ #     except Exception as e:
+ #         st.error(f"An error occurred while processing the PDF: {str(e)}")
+
+ import streamlit as st
+ from transformers import pipeline
+ import pdfplumber
+
+ # Initialize the summarizer
+ summarizer = pipeline("summarization", model="t5-small")
+
+ def extract_text_from_pdf(pdf_file):
+     """Extract text from an uploaded PDF file using pdfplumber."""
+     try:
+         text = ""
+         with pdfplumber.open(pdf_file) as pdf:
+             for page in pdf.pages:
+                 text += (page.extract_text() or "") + "\n"  # extract_text() returns None for image-only pages
+         if not text.strip():
+             raise ValueError("No extractable text found in the PDF.")
+         return text
+     except Exception as e:
+         raise ValueError(f"Error extracting text from PDF: {e}")
+
+ def split_text_into_chunks(text, max_chunk_size=1024):
+     """Split the text into smaller chunks for summarization."""
+     chunks = []
+     while len(text) > max_chunk_size:
+         split_point = text.rfind(". ", 0, max_chunk_size) + 1  # Find the last full sentence
+         if split_point == 0:  # No sentence boundary found, split arbitrarily
+             split_point = max_chunk_size
+         chunks.append(text[:split_point])
+         text = text[split_point:]
+     if text:
+         chunks.append(text)
+     return chunks
+
+ def summarize_text(chunks):
+     """Summarize each chunk of text with dynamic max_length."""
+     summaries = []
+     for chunk in chunks:
+         input_length = len(chunk.split())  # Approximate token count
+         max_length = max(48, int(input_length * 0.8))  # 80% of the input length, with a floor of 48
+         summary = summarizer(chunk, max_length=max_length, min_length=10, do_sample=False)
+         summaries.append(summary[0]["summary_text"])
+     return summaries
+
+ # Streamlit Dashboard; the guard lets app.py import the helpers without rendering this page
+ if __name__ == "__main__":
+     st.title("PDF Summarizer")
+     st.write("Upload a PDF file to get a summarized version of its content.")
+
+     uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
+
+     if uploaded_file is not None:
+         try:
+             # Extract text from the PDF
+             st.write("Processing your PDF...")
+             pdf_text = extract_text_from_pdf(uploaded_file)
+             st.write("PDF content extracted successfully.")
+
+             # Display extracted text (optional)
+             with st.expander("View Extracted Text"):
+                 st.text_area("Extracted Text", pdf_text, height=300)
+
+             # Summarize the extracted text
+             if st.button("Summarize"):
+                 st.write("Generating summary...")
+                 chunks = split_text_into_chunks(pdf_text)
+                 summaries = summarize_text(chunks)
+                 full_summary = " ".join(summaries)
+                 st.subheader("Summary")
+                 st.write(full_summary)
+         except Exception as e:
+             st.error(f"An error occurred while processing the PDF: {str(e)}")
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
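
The diff viewer renders requirements.txt as binary, so the actual pins are not recoverable here. Judging from the imports across the uploaded files, a plausible (unverified) dependency list would be:

streamlit
transformers
torch
pypdf
PyPDF2
pdfplumber
requests
xmltodict

Note that torch is an assumption: the transformers summarization pipelines need a backend, and PyTorch is the usual choice.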
 
textsumm.py ADDED
@@ -0,0 +1,28 @@
+ from transformers import pipeline
+
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+
+ ARTICLE = """
+ There is widespread international concern that Russia's war will provoke a global food crisis similar to, or
+ worse than, the one faced in 2007 and 2008. The war comes at a time when the global food system was already
+ struggling to feed a growing population sustainably, under pressure from climate change
+ and the Covid-19 pandemic. Russia and Ukraine are key agricultural players, together exporting nearly 12%
+ of food calories traded globally. They are major providers of basic agro-commodities, including wheat,
+ maize and sunflower oil, and Russia is the world's top exporter of fertilisers. The global supply chain will
+ remain disrupted until the war ends.
+ The war's impact on global food supply centres on three factors. The first is a significant reduction in exports
+ and production of essential commodities from both countries, caused by the war and not by the economic
+ sanctions imposed on Russia, which intentionally did not target the agricultural sector. Overall, the
+ European Commission estimates that 'up to 25 million tonnes of wheat would need to be substituted to
+ meet worldwide food needs in the current and the next season'. The second factor is a global spike in prices of
+ food supplies and inputs needed for agri-food production, which were already at record levels before the
+ war and have been pushed up further by it. The third factor is the international response to the above,
+ which could either amplify the effects of the crisis (mainly through uncoordinated export bans) or mitigate them
+ (by applying lessons learnt from the 2007-2008 food crisis). A number of countries other than Russia and
+ Ukraine have already imposed, or announced their intention to impose, some control over exports of
+ essential agricultural commodities, including Egypt, Argentina, Indonesia, Serbia, Turkey and, in the EU,
+ Hungary. We should keep in mind that a prolonged war would make the global situation irrecoverable.
+ """
+
+ # Run the demo only when this file is executed directly; app.py imports `summarizer` from this module
+ if __name__ == "__main__":
+     print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))