Upload 8 files

- README.md +5 -1
- app.py +109 -168
- gitattributes +35 -0
- papersearch.py +154 -0
- pdfpass.py +40 -0
- pdfsum.py +125 -0
- requirements.txt +0 -0
- textsumm.py +28 -0
README.md
CHANGED
@@ -1,3 +1,4 @@
+---
 title: Pdf Tools Suite
 emoji: 📚
 colorFrom: gray
@@ -5,4 +6,7 @@ colorTo: purple
 sdk: streamlit
 sdk_version: 1.42.1
 app_file: app.py
-pinned: false
+pinned: false
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
CHANGED
@@ -1,168 +1,109 @@
[deleted lines 1-109 not shown]
-    summary_text = ""
-    pdf_text = ""
-    return "", "", ""
-
-with gr.Blocks(
-    title="PDF 摘要助手",
-    css="""
-    .gradio-container {
-        max-width: none !important;
-        width: 100% !important;
-        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
-        min-height: 100vh;
-    }
-    .main-content {
-        max-width: 1600px !important;
-        margin: 20px auto !important;
-        padding: 30px !important;
-        background: rgba(255, 255, 255, 0.95) !important;
-        border-radius: 20px !important;
-    }
-    """
-) as demo:
-    with gr.Column():
-        gr.Markdown("## 📄 PDF 摘要 & 問答助手")
-
-        with gr.Tab("🔧 設定"):
-            api_key_input = gr.Textbox(label="🔑 輸入 OpenAI API Key", type="password")
-            api_key_status = gr.Textbox(label="API 狀態", interactive=False, value="等待設定 API Key...")
-            api_key_btn = gr.Button("確認 API Key")
-            api_key_btn.click(set_api_key, inputs=api_key_input, outputs=api_key_status)
-
-            model_choice = gr.Radio(["gpt-4", "gpt-4.1", "gpt-4.5"], label="選擇 AI 模型", value="gpt-4")
-            model_status = gr.Textbox(label="模型狀態", interactive=False, value="✅ 已選擇:gpt-4")
-            model_choice.change(set_model, inputs=model_choice, outputs=model_status)
-
-        with gr.Tab("📄 摘要"):
-            pdf_upload = gr.File(label="上傳 PDF", file_types=[".pdf"])
-            summary_btn = gr.Button("生成摘要")
-            summary_output = gr.Textbox(label="PDF 摘要", lines=12)
-            summary_btn.click(generate_summary, inputs=pdf_upload, outputs=summary_output)
-
-        with gr.Tab("❓ 問答"):
-            question_input = gr.Textbox(label="請輸入問題", lines=2)
-            question_btn = gr.Button("送出問題")
-            answer_output = gr.Textbox(label="AI 回答", lines=8)
-            question_btn.click(ask_question, inputs=question_input, outputs=answer_output)
-            question_input.submit(ask_question, inputs=question_input, outputs=answer_output)
-
-        clear_btn = gr.Button("🗑️ 清除所有資料")
-        clear_btn.click(clear_all, outputs=[summary_output, question_input, answer_output])
-
-if __name__ == "__main__":
-    demo.launch(
-        show_error=True,
-        share=True,
-        server_name="0.0.0.0",
-        server_port=7860
-    )
+import streamlit as st
+from textsumm import summarizer
+from pdfsum import extract_text_from_pdf, summarize_text, split_text_into_chunks
+from pdfpass import remove_pdf_password
+from papersearch import fetch_papers, filter_papers_by_year
+from io import BytesIO
+from datetime import datetime
+from pypdf import PdfReader, PdfWriter
+
+# Streamlit App Config
+st.set_page_config(page_title="PDF Tools Suite", page_icon="📄", layout="wide")
+
+# Sidebar Navigation
+st.sidebar.title("📄 PDF Tools Suite")
+page = st.sidebar.radio("Select a tool", ["Text Summarizer", "PDF Summarizer", "PDF Password Remover", "Research Paper Search", "PDF Merger", "PDF Splitter", "PDF to Text Converter"])
+
+# Tool: Text Summarizer
+if page == "Text Summarizer":
+    st.title("📝 Text Summarizer")
+    user_input = st.text_area("Enter text to summarize")
+    if st.button("Summarize"):
+        summary = summarizer(user_input, max_length=130, min_length=30, do_sample=False)
+        st.subheader("Summary")
+        st.write(summary[0]["summary_text"])
+
+# Tool: PDF Summarizer
+elif page == "PDF Summarizer":
+    st.title("📜 PDF Summarizer")
+    uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
+    if uploaded_file is not None:
+        pdf_text = extract_text_from_pdf(uploaded_file)
+        chunks = split_text_into_chunks(pdf_text)
+        summaries = summarize_text(chunks)
+        full_summary = " ".join(summaries)
+        st.subheader("Summary")
+        st.write(full_summary)
+
+# Tool: PDF Password Remover
+elif page == "PDF Password Remover":
+    st.title("🔑 Remove PDF Password")
+    uploaded_file = st.file_uploader("Choose a password-protected PDF", type=["pdf"])
+    password = st.text_input("Enter the PDF password", type="password")
+    if uploaded_file and password and st.button("Remove Password"):
+        output = remove_pdf_password(uploaded_file, password)
+        if isinstance(output, BytesIO):
+            st.success("Password removed successfully!")
+            st.download_button("Download PDF", data=output, file_name="unlocked_pdf.pdf", mime="application/pdf")
+        else:
+            st.error(f"Error: {output}")
+
+# Tool: Research Paper Search
+elif page == "Research Paper Search":
+    st.title("🔍 Research Paper Search (arXiv)")
+    query = st.text_input("Enter topic or keywords", placeholder="e.g., machine learning")
+    max_results = st.slider("Number of results", 1, 50, 10)
+    col1, col2 = st.columns(2)
+    with col1:
+        start_year = st.number_input("Start Year", min_value=1900, max_value=datetime.now().year, value=2000)
+    with col2:
+        end_year = st.number_input("End Year", min_value=1900, max_value=datetime.now().year, value=datetime.now().year)
+    if st.button("Search"):
+        papers = fetch_papers(query, max_results)
+        papers_filtered = filter_papers_by_year(papers, start_year, end_year)
+        if papers_filtered:
+            for idx, paper in enumerate(papers_filtered, start=1):
+                st.write(f"### {idx}. {paper['title']}")
+                st.write(f"**Authors**: {', '.join(paper['authors'])}")
+                st.write(f"**Published**: {paper['published']}")
+                st.write(f"[Read More]({paper['link']})")
+                st.write("---")
+        else:
+            st.warning("No papers found in the selected range.")
+
+# Tool: PDF Merger
+elif page == "PDF Merger":
+    st.title("📎 Merge Multiple PDFs")
+    uploaded_files = st.file_uploader("Upload multiple PDF files", type=["pdf"], accept_multiple_files=True)
+    if uploaded_files and st.button("Merge PDFs"):
+        pdf_writer = PdfWriter()
+        for file in uploaded_files:
+            pdf_reader = PdfReader(file)
+            for page in pdf_reader.pages:
+                pdf_writer.add_page(page)
+        output = BytesIO()
+        pdf_writer.write(output)
+        output.seek(0)
+        st.download_button("Download Merged PDF", data=output, file_name="merged.pdf", mime="application/pdf")
+
+# Tool: PDF Splitter
+elif page == "PDF Splitter":
+    st.title("✂️ Split PDF into Pages")
+    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
+    if uploaded_file:
+        pdf_reader = PdfReader(uploaded_file)
+        for i, page in enumerate(pdf_reader.pages):
+            pdf_writer = PdfWriter()
+            pdf_writer.add_page(page)
+            output = BytesIO()
+            pdf_writer.write(output)
+            output.seek(0)
+            st.download_button(f"Download Page {i+1}", data=output, file_name=f"page_{i+1}.pdf", mime="application/pdf")
+
+# Tool: PDF to Text Converter
+elif page == "PDF to Text Converter":
+    st.title("📜 Extract Text from PDF")
+    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
+    if uploaded_file:
+        pdf_text = extract_text_from_pdf(uploaded_file)
+        st.text_area("Extracted Text", pdf_text, height=300)
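The merger and splitter branches above do their pypdf work inline in the Streamlit page. As a minimal sketch (not part of the commit), the same merge logic can be pulled into a plain function so it can be exercised outside Streamlit; the file names are placeholders:

```python
from io import BytesIO
from pypdf import PdfReader, PdfWriter

def merge_pdfs(paths):
    """Concatenate the pages of several PDFs into one in-memory PDF."""
    writer = PdfWriter()
    for path in paths:
        reader = PdfReader(path)
        for pdf_page in reader.pages:
            writer.add_page(pdf_page)
    buffer = BytesIO()
    writer.write(buffer)
    buffer.seek(0)
    return buffer

# Hypothetical usage with local files:
# merged = merge_pdfs(["a.pdf", "b.pdf"])
# with open("merged.pdf", "wb") as f:
#     f.write(merged.getvalue())
```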
gitattributes
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
papersearch.py
ADDED
@@ -0,0 +1,154 @@
+# import streamlit as st
+# import requests
+# import xmltodict
+
+# # arXiv API base URL
+# ARXIV_API_BASE = "http://export.arxiv.org/api/query"
+
+# def fetch_papers(query, max_results=10):
+#     """Fetch papers from the arXiv API."""
+#     try:
+#         # Build the API query URL
+#         api_url = f"{ARXIV_API_BASE}?search_query=all:{query}&start=0&max_results={max_results}"
+
+#         # Make the API request
+#         response = requests.get(api_url, headers={'Accept': 'application/xml'})
+#         response.raise_for_status()
+
+#         # Parse the XML response
+#         data = xmltodict.parse(response.text)
+#         entries = data.get('feed', {}).get('entry', [])
+
+#         if not isinstance(entries, list):  # Handle single result
+#             entries = [entries]
+
+#         # Extract relevant fields
+#         papers = []
+#         for entry in entries:
+#             papers.append({
+#                 'title': entry.get('title'),
+#                 'summary': entry.get('summary'),
+#                 'published': entry.get('published'),
+#                 'authors': [author['name'] for author in entry.get('author', [])] if isinstance(entry.get('author'), list) else [entry.get('author', {}).get('name')],
+#                 'link': entry.get('id')
+#             })
+
+#         return papers
+#     except Exception as e:
+#         st.error(f"Error fetching papers: {e}")
+#         return []
+
+# # Streamlit app UI
+# st.title("arXiv Research Paper Search")
+# st.subheader("Find academic papers on your topic of interest")
+
+# # Input fields
+# query = st.text_input("Enter a topic or keywords", placeholder="e.g., machine learning, quantum computing")
+# max_results = st.slider("Number of results", min_value=1, max_value=50, value=10)
+
+# if st.button("Search"):
+#     if query.strip():
+#         st.info(f"Searching for papers on: **{query}**")
+#         papers = fetch_papers(query, max_results)
+
+#         if papers:
+#             st.success(f"Found {len(papers)} papers!")
+#             for idx, paper in enumerate(papers, start=1):
+#                 st.write(f"### {idx}. {paper['title']}")
+#                 st.write(f"**Authors**: {', '.join(paper['authors'])}")
+#                 st.write(f"**Published**: {paper['published']}")
+#                 st.write(f"[Read More]({paper['link']})")
+#                 st.write("---")
+#         else:
+#             st.warning("No papers found. Try a different query.")
+#     else:
+#         st.error("Please enter a topic or keywords to search.")
+
+import streamlit as st
+import requests
+import xmltodict
+from datetime import datetime
+
+# arXiv API base URL
+ARXIV_API_BASE = "http://export.arxiv.org/api/query"
+
+def fetch_papers(query, max_results=10):
+    """Fetch papers from the arXiv API."""
+    try:
+        # Build the API query URL
+        api_url = f"{ARXIV_API_BASE}?search_query=all:{query}&start=0&max_results={max_results}"
+
+        # Make the API request
+        response = requests.get(api_url, headers={'Accept': 'application/xml'})
+        response.raise_for_status()
+
+        # Parse the XML response
+        data = xmltodict.parse(response.text)
+        entries = data.get('feed', {}).get('entry', [])
+
+        if not isinstance(entries, list):  # Handle single result
+            entries = [entries]
+
+        # Extract relevant fields
+        papers = []
+        for entry in entries:
+            papers.append({
+                'title': entry.get('title'),
+                'summary': entry.get('summary'),
+                'published': entry.get('published'),
+                'authors': [author['name'] for author in entry.get('author', [])] if isinstance(entry.get('author'), list) else [entry.get('author', {}).get('name')],
+                'link': entry.get('id')
+            })
+
+        return papers
+    except Exception as e:
+        st.error(f"Error fetching papers: {e}")
+        return []
+
+def filter_papers_by_year(papers, start_year, end_year):
+    """Filter papers by the publication year range."""
+    filtered_papers = []
+    for paper in papers:
+        try:
+            published_year = int(paper['published'][:4])  # Extract year from the published date
+            if start_year <= published_year <= end_year:
+                filtered_papers.append(paper)
+        except ValueError:
+            continue  # Skip if the year is not valid
+    return filtered_papers
+
+# Streamlit app UI
+st.title("arXiv Research Paper Search")
+st.subheader("Find academic papers on your topic of interest")
+
+# Input fields
+query = st.text_input("Enter a topic or keywords", placeholder="e.g., machine learning, quantum computing")
+max_results = st.slider("Number of results", min_value=1, max_value=50, value=10)
+
+# Year filter
+col1, col2 = st.columns(2)
+with col1:
+    start_year = st.number_input("Start Year", min_value=1900, max_value=datetime.now().year, value=2000, step=1)
+with col2:
+    end_year = st.number_input("End Year", min_value=1900, max_value=datetime.now().year, value=datetime.now().year, step=1)
+
+if st.button("Search"):
+    if query.strip():
+        st.info(f"Searching for papers on: **{query}**")
+        papers = fetch_papers(query, max_results)
+
+        # Filter papers by year
+        papers_filtered = filter_papers_by_year(papers, start_year, end_year)
+
+        if papers_filtered:
+            st.success(f"Found {len(papers_filtered)} papers between {start_year} and {end_year}!")
+            for idx, paper in enumerate(papers_filtered, start=1):
+                st.write(f"### {idx}. {paper['title']}")
+                st.write(f"**Authors**: {', '.join(paper['authors'])}")
+                st.write(f"**Published**: {paper['published']}")
+                st.write(f"[Read More]({paper['link']})")
+                st.write("---")
+        else:
+            st.warning(f"No papers found between {start_year} and {end_year}. Try a different query or adjust the year range.")
+    else:
+        st.error("Please enter a topic or keywords to search.")
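papersearch.py exposes fetch_papers and filter_papers_by_year as plain functions, which is how app.py consumes them (its module-level Streamlit calls typically just log "bare mode" warnings when imported outside `streamlit run`). A minimal standalone check might look like the sketch below; the query and year range are arbitrary examples, and network access to export.arxiv.org is assumed:

```python
from papersearch import fetch_papers, filter_papers_by_year

papers = fetch_papers("quantum computing", max_results=5)
recent = filter_papers_by_year(papers, 2020, 2024)
for paper in recent:
    # 'published' is an ISO timestamp string from the arXiv Atom feed
    print(paper["published"][:10], "-", paper["title"])
```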
pdfpass.py
ADDED
@@ -0,0 +1,40 @@
+import streamlit as st
+from PyPDF2 import PdfReader, PdfWriter
+from io import BytesIO
+
+def remove_pdf_password(file, password):
+    try:
+        reader = PdfReader(file)
+        if reader.is_encrypted:
+            reader.decrypt(password)
+        writer = PdfWriter()
+        for page in reader.pages:
+            writer.add_page(page)
+
+        output = BytesIO()
+        writer.write(output)
+        output.seek(0)
+        return output
+    except Exception as e:
+        return str(e)
+
+st.title("PDF Password Remover")
+st.write("Upload a password-protected PDF and remove its password.")
+
+# File upload
+uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
+password = st.text_input("Enter the PDF password", type="password")
+
+if uploaded_file and password:
+    if st.button("Remove Password"):
+        output = remove_pdf_password(uploaded_file, password)
+        if isinstance(output, BytesIO):
+            st.success("Password removed successfully!")
+            st.download_button(
+                label="Download PDF without Password",
+                data=output,
+                file_name="unlocked_pdf.pdf",
+                mime="application/pdf",
+            )
+        else:
+            st.error(f"Error: {output}")
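remove_pdf_password returns a BytesIO on success and an error string on failure, which is why both app.py and the standalone page in pdfpass.py check the return type before offering a download. A minimal sketch of the same helper used on a local file; the file name and password are placeholders, and importing pdfpass also executes its own small Streamlit page at module level:

```python
from io import BytesIO
from pdfpass import remove_pdf_password

with open("locked.pdf", "rb") as f:            # placeholder input file
    result = remove_pdf_password(f, "secret")  # placeholder password

if isinstance(result, BytesIO):
    with open("unlocked.pdf", "wb") as out:
        out.write(result.getvalue())
    print("Wrote unlocked.pdf")
else:
    print("Could not remove password:", result)
```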
pdfsum.py
ADDED
@@ -0,0 +1,125 @@
+# import streamlit as st
+# from transformers import pipeline
+# from PyPDF2 import PdfReader
+
+# # Initialize the summarizer
+# summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+
+# def extract_text_from_pdf(pdf_file):
+#     """Extract text from an uploaded PDF file."""
+#     try:
+#         reader = PdfReader(pdf_file)
+#         text = ""
+#         for page in reader.pages:
+#             page_text = page.extract_text()
+#             if page_text:  # Skip pages with no text
+#                 text += page_text + "\n"
+#         return text
+#     except Exception as e:
+#         raise ValueError(f"Error extracting text from PDF: {e}")
+
+# def split_text_into_chunks(text, max_chunk_size=1024):
+#     """Split the text into smaller chunks for summarization."""
+#     chunks = []
+#     while len(text) > max_chunk_size:
+#         split_point = text.rfind(". ", 0, max_chunk_size) + 1  # Split at the last sentence boundary
+#         if split_point == 0:  # No sentence boundary found, split arbitrarily
+#             split_point = max_chunk_size
+#         chunks.append
+
+# # Streamlit Dashboard
+# st.title("PDF Summarizer")
+# st.write("Upload a PDF file to get a summarized version of its content.")
+
+# uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
+
+# if uploaded_file is not None:
+#     # Extract text from the PDF
+#     st.write("Processing your PDF...")
+#     try:
+#         pdf_text = extract_text_from_pdf(uploaded_file)
+#         st.write("PDF content extracted successfully.")
+
+#         # Display extracted text (optional)
+#         with st.expander("View Extracted Text"):
+#             st.text_area("Extracted Text", pdf_text, height=300)
+
+#         # Summarize the extracted text
+#         if st.button("Summarize"):
+#             st.write("Generating summary...")
+#             summary = summarizer(pdf_text, max_length=130, min_length=30, do_sample=False)
+#             st.subheader("Summary")
+#             st.write(summary[0]["summary_text"])
+#     except Exception as e:
+#         st.error(f"An error occurred while processing the PDF: {str(e)}")
+
+import streamlit as st
+from transformers import pipeline
+import pdfplumber
+
+# Initialize the summarizer
+summarizer = pipeline("summarization", model="t5-small")
+
+def extract_text_from_pdf(pdf_file):
+    """Extract text from an uploaded PDF file using pdfplumber."""
+    try:
+        text = ""
+        with pdfplumber.open(pdf_file) as pdf:
+            for page in pdf.pages:
+                text += page.extract_text() + "\n"
+        if not text.strip():
+            raise ValueError("No extractable text found in the PDF.")
+        return text
+    except Exception as e:
+        raise ValueError(f"Error extracting text from PDF: {e}")
+
+def split_text_into_chunks(text, max_chunk_size=1024):
+    """Split the text into smaller chunks for summarization."""
+    chunks = []
+    while len(text) > max_chunk_size:
+        split_point = text.rfind(". ", 0, max_chunk_size) + 1  # Find the last full sentence
+        if split_point == 0:  # No sentence boundary found, split arbitrarily
+            split_point = max_chunk_size
+        chunks.append(text[:split_point])
+        text = text[split_point:]
+    if text:
+        chunks.append(text)
+    return chunks
+
+def summarize_text(chunks):
+    """Summarize each chunk of text with dynamic max_length."""
+    summaries = []
+    for chunk in chunks:
+        input_length = len(chunk.split())  # Approximate token count
+        max_length = max(48, int(input_length * 0.8))  # Set max_length to 80% of input length
+        summary = summarizer(chunk, max_length=max_length, min_length=10, do_sample=False)
+        summaries.append(summary[0]["summary_text"])
+    return summaries
+
+# Streamlit Dashboard
+st.title("PDF Summarizer")
+st.write("Upload a PDF file to get a summarized version of its content.")
+
+uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
+
+if uploaded_file is not None:
+    try:
+        # Extract text from the PDF
+        st.write("Processing your PDF...")
+        pdf_text = extract_text_from_pdf(uploaded_file)
+        st.write("PDF content extracted successfully.")
+
+        # Display extracted text (optional)
+        with st.expander("View Extracted Text"):
+            st.text_area("Extracted Text", pdf_text, height=300)
+
+        # Summarize the extracted text
+        if st.button("Summarize"):
+            st.write("Generating summary...")
+            chunks = split_text_into_chunks(pdf_text)
+            summaries = summarize_text(chunks)
+            full_summary = " ".join(summaries)
+            st.subheader("Summary")
+            st.write(full_summary)
+    except Exception as e:
+        st.error(f"An error occurred while processing the PDF: {str(e)}")
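split_text_into_chunks cuts at the last ". " sentence boundary before max_chunk_size and falls back to a hard cut when none is found. Below is a minimal sketch of that behaviour on plain text; the function body is copied from pdfsum.py above so the example runs without importing the module (and therefore without loading the t5-small pipeline):

```python
def split_text_into_chunks(text, max_chunk_size=1024):
    """Copy of the helper from pdfsum.py, for a standalone check."""
    chunks = []
    while len(text) > max_chunk_size:
        split_point = text.rfind(". ", 0, max_chunk_size) + 1  # last full sentence
        if split_point == 0:  # no sentence boundary found, split arbitrarily
            split_point = max_chunk_size
        chunks.append(text[:split_point])
        text = text[split_point:]
    if text:
        chunks.append(text)
    return chunks

sample = "One sentence. " * 40 + "The last one."
print([len(chunk) for chunk in split_text_into_chunks(sample, max_chunk_size=200)])
# Every chunk except possibly the last ends at a sentence boundary and is <= 200 chars.
```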
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
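The diff viewer treats the new requirements.txt as binary, so its contents are not shown. Judging only from the imports across the uploaded modules, a Space like this would need at least the following packages; this is an inference, not the actual file:

```
streamlit
transformers
torch
pypdf
PyPDF2
pdfplumber
xmltodict
requests
```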
textsumm.py
ADDED
@@ -0,0 +1,28 @@
+from transformers import pipeline
+
+summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+ARTICLE = """
+There is widespread international concern that Russia's war will provoke a global food crisis similar to, or
+worse than, that faced in 2007 and 2008. The war comes at a time when the global food system was already
+struggling to feed its growing population in a sustainable way, under the pressure caused by climate change
+and the Covid-19 pandemic. Russia and Ukraine are key agricultural players, together exporting nearly 12
+% of food calories traded globally. They are major providers of basic agro-commodities, including wheat,
+maize and sunflower oil, and Russia is the world's top exporter of fertilisers. The global supply chain will
+get impacted until Russia and Ukraine retreat and will end the war.
+The war's impact on global food supply centred on three factors. First is a significant reduction in exports
+and production of essential commodities from both countries, caused by the war and not the economic
+sanctions imposed on Russia, which, intentionally, did not target the agricultural sector. Overall, the
+European Commission estimates that 'up to 25 million tonnes of wheat would need to be substituted to
+meet worldwide food needs in the current and the next season. Second factor is a global spike in prices of
+food supplies and inputs needed for agri-food production, which were already at record levels before the
+war. The war has further pushed the prices up. Third factor is the international response to the above,
+which could either amplify the effects of the crisis (mainly by uncoordinated export bans) or mitigate them
+(applying lessons learnt from the 2007-2008 food crisis). A number of countries, other than Russia and
+Ukraine, have already imposed or announced their intention to impose some control over exports of
+essential agricultural commodities, including Egypt, Argentina, Indonesia, Serbia, Turkey and, in the EU,
+Hungary. We should keep this in our mind that the long duration of war will make the global situation
+irrecoverable.
+
+"""
+print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))
+
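app.py imports `summarizer` from textsumm, and importing the module as written also runs the demo summarization of ARTICLE at the bottom, which loads facebook/bart-large-cnn and summarizes the sample text before the app renders. One way to keep the pipeline importable while making the demo opt-in is sketched below; this is a suggestion under that assumption, not part of the commit:

```python
from transformers import pipeline

# Loaded once at import time so app.py can still do `from textsumm import summarizer`.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

if __name__ == "__main__":
    # Demo only when run directly: `python textsumm.py`
    ARTICLE = ("Russia and Ukraine are key agricultural players, together exporting "
               "nearly 12% of food calories traded globally.")  # shortened sample
    print(summarizer(ARTICLE, max_length=30, min_length=10, do_sample=False))
```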