Upload 6 files
- app.py +109 -0
- papersearch.py +154 -0
- pdfpass.py +40 -0
- pdfsum.py +125 -0
- requirements.txt +0 -0
- textsumm.py +28 -0
app.py
ADDED
@@ -0,0 +1,109 @@
import streamlit as st
from textsumm import summarizer
from pdfsum import extract_text_from_pdf, summarize_text, split_text_into_chunks
from pdfpass import remove_pdf_password
from papersearch import fetch_papers, filter_papers_by_year
from io import BytesIO
from datetime import datetime
from pypdf import PdfReader, PdfWriter

# Streamlit App Config
st.set_page_config(page_title="PDF Tools Suite", page_icon="📄", layout="wide")

# Sidebar Navigation
st.sidebar.title("📄 PDF Tools Suite")
page = st.sidebar.radio("Select a tool", ["Text Summarizer", "PDF Summarizer", "PDF Password Remover", "Research Paper Search", "PDF Merger", "PDF Splitter", "PDF to Text Converter"])

# Tool: Text Summarizer
if page == "Text Summarizer":
    st.title("📝 Text Summarizer")
    user_input = st.text_area("Enter text to summarize")
    if st.button("Summarize"):
        if user_input.strip():
            summary = summarizer(user_input, max_length=130, min_length=30, do_sample=False)
            st.subheader("Summary")
            st.write(summary[0]["summary_text"])
        else:
            st.warning("Please enter some text to summarize.")

# Tool: PDF Summarizer
elif page == "PDF Summarizer":
    st.title("📜 PDF Summarizer")
    uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
    if uploaded_file is not None:
        pdf_text = extract_text_from_pdf(uploaded_file)
        chunks = split_text_into_chunks(pdf_text)
        summaries = summarize_text(chunks)
        full_summary = " ".join(summaries)
        st.subheader("Summary")
        st.write(full_summary)

# Tool: PDF Password Remover
elif page == "PDF Password Remover":
    st.title("🔑 Remove PDF Password")
    uploaded_file = st.file_uploader("Choose a password-protected PDF", type=["pdf"])
    password = st.text_input("Enter the PDF password", type="password")
    if uploaded_file and password and st.button("Remove Password"):
        output = remove_pdf_password(uploaded_file, password)
        if isinstance(output, BytesIO):
            st.success("Password removed successfully!")
            st.download_button("Download PDF", data=output, file_name="unlocked_pdf.pdf", mime="application/pdf")
        else:
            st.error(f"Error: {output}")

# Tool: Research Paper Search
elif page == "Research Paper Search":
    st.title("🔍 Research Paper Search (arXiv)")
    query = st.text_input("Enter topic or keywords", placeholder="e.g., machine learning")
    max_results = st.slider("Number of results", 1, 50, 10)
    col1, col2 = st.columns(2)
    with col1:
        start_year = st.number_input("Start Year", min_value=1900, max_value=datetime.now().year, value=2000)
    with col2:
        end_year = st.number_input("End Year", min_value=1900, max_value=datetime.now().year, value=datetime.now().year)
    if st.button("Search"):
        papers = fetch_papers(query, max_results)
        papers_filtered = filter_papers_by_year(papers, start_year, end_year)
        if papers_filtered:
            for idx, paper in enumerate(papers_filtered, start=1):
                st.write(f"### {idx}. {paper['title']}")
                st.write(f"**Authors**: {', '.join(paper['authors'])}")
                st.write(f"**Published**: {paper['published']}")
                st.write(f"[Read More]({paper['link']})")
                st.write("---")
        else:
            st.warning("No papers found in the selected range.")

# Tool: PDF Merger
elif page == "PDF Merger":
    st.title("📎 Merge Multiple PDFs")
    uploaded_files = st.file_uploader("Upload multiple PDF files", type=["pdf"], accept_multiple_files=True)
    if uploaded_files and st.button("Merge PDFs"):
        pdf_writer = PdfWriter()
        for file in uploaded_files:
            pdf_reader = PdfReader(file)
            for pdf_page in pdf_reader.pages:  # avoid shadowing the sidebar `page` selection
                pdf_writer.add_page(pdf_page)
        output = BytesIO()
        pdf_writer.write(output)
        output.seek(0)
        st.download_button("Download Merged PDF", data=output, file_name="merged.pdf", mime="application/pdf")

# Tool: PDF Splitter
elif page == "PDF Splitter":
    st.title("✂️ Split PDF into Pages")
    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
    if uploaded_file:
        pdf_reader = PdfReader(uploaded_file)
        for i, pdf_page in enumerate(pdf_reader.pages):
            pdf_writer = PdfWriter()
            pdf_writer.add_page(pdf_page)
            output = BytesIO()
            pdf_writer.write(output)
            output.seek(0)
            st.download_button(f"Download Page {i+1}", data=output, file_name=f"page_{i+1}.pdf", mime="application/pdf")

# Tool: PDF to Text Converter
elif page == "PDF to Text Converter":
    st.title("📜 Extract Text from PDF")
    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
    if uploaded_file:
        pdf_text = extract_text_from_pdf(uploaded_file)
        st.text_area("Extracted Text", pdf_text, height=300)
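For local testing, the combined app is launched with Streamlit's CLI in the usual way (assuming the dependencies listed under requirements.txt below are installed); each tool module can also be run on its own:

streamlit run app.py
# or a single tool in isolation, for example:
streamlit run papersearch.py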
papersearch.py
ADDED
@@ -0,0 +1,154 @@
# import streamlit as st
# import requests
# import xmltodict

# # arXiv API base URL
# ARXIV_API_BASE = "http://export.arxiv.org/api/query"

# def fetch_papers(query, max_results=10):
#     """Fetch papers from the arXiv API."""
#     try:
#         # Build the API query URL
#         api_url = f"{ARXIV_API_BASE}?search_query=all:{query}&start=0&max_results={max_results}"

#         # Make the API request
#         response = requests.get(api_url, headers={'Accept': 'application/xml'})
#         response.raise_for_status()

#         # Parse the XML response
#         data = xmltodict.parse(response.text)
#         entries = data.get('feed', {}).get('entry', [])

#         if not isinstance(entries, list):  # Handle single result
#             entries = [entries]

#         # Extract relevant fields
#         papers = []
#         for entry in entries:
#             papers.append({
#                 'title': entry.get('title'),
#                 'summary': entry.get('summary'),
#                 'published': entry.get('published'),
#                 'authors': [author['name'] for author in entry.get('author', [])] if isinstance(entry.get('author'), list) else [entry.get('author', {}).get('name')],
#                 'link': entry.get('id')
#             })

#         return papers
#     except Exception as e:
#         st.error(f"Error fetching papers: {e}")
#         return []

# # Streamlit app UI
# st.title("arXiv Research Paper Search")
# st.subheader("Find academic papers on your topic of interest")

# # Input fields
# query = st.text_input("Enter a topic or keywords", placeholder="e.g., machine learning, quantum computing")
# max_results = st.slider("Number of results", min_value=1, max_value=50, value=10)

# if st.button("Search"):
#     if query.strip():
#         st.info(f"Searching for papers on: **{query}**")
#         papers = fetch_papers(query, max_results)

#         if papers:
#             st.success(f"Found {len(papers)} papers!")
#             for idx, paper in enumerate(papers, start=1):
#                 st.write(f"### {idx}. {paper['title']}")
#                 st.write(f"**Authors**: {', '.join(paper['authors'])}")
#                 st.write(f"**Published**: {paper['published']}")
#                 st.write(f"[Read More]({paper['link']})")
#                 st.write("---")
#         else:
#             st.warning("No papers found. Try a different query.")
#     else:
#         st.error("Please enter a topic or keywords to search.")

import streamlit as st
import requests
import xmltodict
from datetime import datetime

# arXiv API base URL
ARXIV_API_BASE = "http://export.arxiv.org/api/query"

def fetch_papers(query, max_results=10):
    """Fetch papers from the arXiv API."""
    try:
        # Build the API query URL
        api_url = f"{ARXIV_API_BASE}?search_query=all:{query}&start=0&max_results={max_results}"

        # Make the API request
        response = requests.get(api_url, headers={'Accept': 'application/xml'})
        response.raise_for_status()

        # Parse the XML response
        data = xmltodict.parse(response.text)
        entries = data.get('feed', {}).get('entry', [])

        if not isinstance(entries, list):  # Handle single result
            entries = [entries]

        # Extract relevant fields
        papers = []
        for entry in entries:
            papers.append({
                'title': entry.get('title'),
                'summary': entry.get('summary'),
                'published': entry.get('published'),
                'authors': [author['name'] for author in entry.get('author', [])] if isinstance(entry.get('author'), list) else [entry.get('author', {}).get('name')],
                'link': entry.get('id')
            })

        return papers
    except Exception as e:
        st.error(f"Error fetching papers: {e}")
        return []

def filter_papers_by_year(papers, start_year, end_year):
    """Filter papers by the publication year range."""
    filtered_papers = []
    for paper in papers:
        try:
            published_year = int(paper['published'][:4])  # Extract year from the published date
            if start_year <= published_year <= end_year:
                filtered_papers.append(paper)
        except (TypeError, ValueError):
            continue  # Skip if the published date is missing or malformed
    return filtered_papers

# Standalone Streamlit UI (runs only when this file is executed directly, not when imported by app.py)
if __name__ == "__main__":
    st.title("arXiv Research Paper Search")
    st.subheader("Find academic papers on your topic of interest")

    # Input fields
    query = st.text_input("Enter a topic or keywords", placeholder="e.g., machine learning, quantum computing")
    max_results = st.slider("Number of results", min_value=1, max_value=50, value=10)

    # Year filter
    col1, col2 = st.columns(2)
    with col1:
        start_year = st.number_input("Start Year", min_value=1900, max_value=datetime.now().year, value=2000, step=1)
    with col2:
        end_year = st.number_input("End Year", min_value=1900, max_value=datetime.now().year, value=datetime.now().year, step=1)

    if st.button("Search"):
        if query.strip():
            st.info(f"Searching for papers on: **{query}**")
            papers = fetch_papers(query, max_results)

            # Filter papers by year
            papers_filtered = filter_papers_by_year(papers, start_year, end_year)

            if papers_filtered:
                st.success(f"Found {len(papers_filtered)} papers between {start_year} and {end_year}!")
                for idx, paper in enumerate(papers_filtered, start=1):
                    st.write(f"### {idx}. {paper['title']}")
                    st.write(f"**Authors**: {', '.join(paper['authors'])}")
                    st.write(f"**Published**: {paper['published']}")
                    st.write(f"[Read More]({paper['link']})")
                    st.write("---")
            else:
                st.warning(f"No papers found between {start_year} and {end_year}. Try a different query or adjust the year range.")
        else:
            st.error("Please enter a topic or keywords to search.")
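As a rough illustration of how the two helpers compose outside the Streamlit UI (a hypothetical snippet, not part of the commit; the query string and year range are made up):

# Hypothetical usage of the papersearch helpers from a plain Python session
from papersearch import fetch_papers, filter_papers_by_year

papers = fetch_papers("quantum computing", max_results=20)   # query arXiv
recent = filter_papers_by_year(papers, 2020, 2024)           # keep papers published 2020-2024
for paper in recent:
    print(paper["published"], paper["title"])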
pdfpass.py
ADDED
@@ -0,0 +1,40 @@
import streamlit as st
from PyPDF2 import PdfReader, PdfWriter
from io import BytesIO

def remove_pdf_password(file, password):
    try:
        reader = PdfReader(file)
        if reader.is_encrypted:
            reader.decrypt(password)
        writer = PdfWriter()
        for page in reader.pages:
            writer.add_page(page)

        output = BytesIO()
        writer.write(output)
        output.seek(0)
        return output
    except Exception as e:
        return str(e)

# Standalone Streamlit UI (runs only when this file is executed directly, not when imported by app.py)
if __name__ == "__main__":
    st.title("PDF Password Remover")
    st.write("Upload a password-protected PDF and remove its password.")

    # File upload
    uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
    password = st.text_input("Enter the PDF password", type="password")

    if uploaded_file and password:
        if st.button("Remove Password"):
            output = remove_pdf_password(uploaded_file, password)
            if isinstance(output, BytesIO):
                st.success("Password removed successfully!")
                st.download_button(
                    label="Download PDF without Password",
                    data=output,
                    file_name="unlocked_pdf.pdf",
                    mime="application/pdf",
                )
            else:
                st.error(f"Error: {output}")
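A minimal sketch of using remove_pdf_password outside the UI (hypothetical file names and password; the function returns a BytesIO on success and an error string on failure):

from io import BytesIO
from pdfpass import remove_pdf_password

with open("locked.pdf", "rb") as f:              # hypothetical input file
    result = remove_pdf_password(f, "secret")    # hypothetical password
if isinstance(result, BytesIO):
    with open("unlocked.pdf", "wb") as out:
        out.write(result.getbuffer())            # write the decrypted copy to disk
else:
    print("Failed:", result)                     # the error message returned by the helper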
pdfsum.py
ADDED
@@ -0,0 +1,125 @@
# import streamlit as st
# from transformers import pipeline
# from PyPDF2 import PdfReader

# # Initialize the summarizer
# summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# def extract_text_from_pdf(pdf_file):
#     """Extract text from an uploaded PDF file."""
#     try:
#         reader = PdfReader(pdf_file)
#         text = ""
#         for page in reader.pages:
#             page_text = page.extract_text()
#             if page_text:  # Skip pages with no text
#                 text += page_text + "\n"
#         return text
#     except Exception as e:
#         raise ValueError(f"Error extracting text from PDF: {e}")

# def split_text_into_chunks(text, max_chunk_size=1024):
#     """Split the text into smaller chunks for summarization."""
#     chunks = []
#     while len(text) > max_chunk_size:
#         split_point = text.rfind(". ", 0, max_chunk_size) + 1  # Split at the last sentence boundary
#         if split_point == 0:  # No sentence boundary found, split arbitrarily
#             split_point = max_chunk_size
#         chunks.append

# # Streamlit Dashboard
# st.title("PDF Summarizer")
# st.write("Upload a PDF file to get a summarized version of its content.")

# uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])

# if uploaded_file is not None:
#     # Extract text from the PDF
#     st.write("Processing your PDF...")
#     try:
#         pdf_text = extract_text_from_pdf(uploaded_file)
#         st.write("PDF content extracted successfully.")

#         # Display extracted text (optional)
#         with st.expander("View Extracted Text"):
#             st.text_area("Extracted Text", pdf_text, height=300)

#         # Summarize the extracted text
#         if st.button("Summarize"):
#             st.write("Generating summary...")
#             summary = summarizer(pdf_text, max_length=130, min_length=30, do_sample=False)
#             st.subheader("Summary")
#             st.write(summary[0]["summary_text"])
#     except Exception as e:
#         st.error(f"An error occurred while processing the PDF: {str(e)}")

import streamlit as st
from transformers import pipeline
import pdfplumber

# Initialize the summarizer
summarizer = pipeline("summarization", model="t5-small")

def extract_text_from_pdf(pdf_file):
    """Extract text from an uploaded PDF file using pdfplumber."""
    try:
        text = ""
        with pdfplumber.open(pdf_file) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:  # pdfplumber returns None for pages without extractable text
                    text += page_text + "\n"
        if not text.strip():
            raise ValueError("No extractable text found in the PDF.")
        return text
    except Exception as e:
        raise ValueError(f"Error extracting text from PDF: {e}")

def split_text_into_chunks(text, max_chunk_size=1024):
    """Split the text into smaller chunks for summarization."""
    chunks = []
    while len(text) > max_chunk_size:
        split_point = text.rfind(". ", 0, max_chunk_size) + 1  # Find the last full sentence
        if split_point == 0:  # No sentence boundary found, split arbitrarily
            split_point = max_chunk_size
        chunks.append(text[:split_point])
        text = text[split_point:]
    if text:
        chunks.append(text)
    return chunks

def summarize_text(chunks):
    """Summarize each chunk of text with dynamic max_length."""
    summaries = []
    for chunk in chunks:
        input_length = len(chunk.split())  # Approximate token count
        max_length = max(48, int(input_length * 0.8))  # Set max_length to 80% of input length
        summary = summarizer(chunk, max_length=max_length, min_length=10, do_sample=False)
        summaries.append(summary[0]["summary_text"])
    return summaries

# Standalone Streamlit UI (runs only when this file is executed directly, not when imported by app.py)
if __name__ == "__main__":
    st.title("PDF Summarizer")
    st.write("Upload a PDF file to get a summarized version of its content.")

    uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])

    if uploaded_file is not None:
        try:
            # Extract text from the PDF
            st.write("Processing your PDF...")
            pdf_text = extract_text_from_pdf(uploaded_file)
            st.write("PDF content extracted successfully.")

            # Display extracted text (optional)
            with st.expander("View Extracted Text"):
                st.text_area("Extracted Text", pdf_text, height=300)

            # Summarize the extracted text
            if st.button("Summarize"):
                st.write("Generating summary...")
                chunks = split_text_into_chunks(pdf_text)
                summaries = summarize_text(chunks)
                full_summary = " ".join(summaries)
                st.subheader("Summary")
                st.write(full_summary)
        except Exception as e:
            st.error(f"An error occurred while processing the PDF: {str(e)}")
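Note that split_text_into_chunks works in characters (max_chunk_size=1024 characters) while the t5-small pipeline's max_length is measured in tokens, so the word-count heuristic in summarize_text is only an approximation. A quick, illustrative sanity check of the chunker on plain text (importing pdfsum also loads the t5-small pipeline, so the first run downloads the model):

from pdfsum import split_text_into_chunks

sample = ("This is sentence one. " * 40) + ("This is sentence two. " * 40)
chunks = split_text_into_chunks(sample, max_chunk_size=300)
print(len(chunks), [len(c) for c in chunks])  # several chunks, each at most ~300 characters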
requirements.txt
ADDED
Binary file (352 Bytes).
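The contents of requirements.txt are not shown in this diff (it is rendered as a binary file). Based purely on the imports across the six modules, a plausible equivalent would include something like the following; this is an assumption, not the committed file:

# Hypothetical reconstruction of requirements.txt, inferred from the imports above
streamlit
transformers
torch        # backend for the transformers pipelines
pypdf        # used by app.py
PyPDF2       # used by pdfpass.py
pdfplumber
xmltodict
requests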
textsumm.py
ADDED
@@ -0,0 +1,28 @@
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

ARTICLE = """
There is widespread international concern that Russia's war will provoke a global food crisis similar to, or
worse than, that faced in 2007 and 2008. The war comes at a time when the global food system was already
struggling to feed its growing population in a sustainable way, under the pressure caused by climate change
and the Covid-19 pandemic. Russia and Ukraine are key agricultural players, together exporting nearly 12%
of food calories traded globally. They are major providers of basic agro-commodities, including wheat,
maize and sunflower oil, and Russia is the world's top exporter of fertilisers. The global supply chain will
remain disrupted until Russia and Ukraine retreat and the war ends.
The war's impact on global food supply centres on three factors. The first is a significant reduction in exports
and production of essential commodities from both countries, caused by the war and not by the economic
sanctions imposed on Russia, which intentionally did not target the agricultural sector. Overall, the
European Commission estimates that 'up to 25 million tonnes of wheat would need to be substituted to
meet worldwide food needs in the current and the next season'. The second factor is a global spike in prices of
food supplies and inputs needed for agri-food production, which were already at record levels before the
war. The war has pushed prices up further. The third factor is the international response to the above,
which could either amplify the effects of the crisis (mainly through uncoordinated export bans) or mitigate them
(by applying lessons learnt from the 2007-2008 food crisis). A number of countries other than Russia and
Ukraine have already imposed, or announced their intention to impose, some control over exports of
essential agricultural commodities, including Egypt, Argentina, Indonesia, Serbia, Turkey and, in the EU,
Hungary. We should keep in mind that a long war will make the global situation irrecoverable.
"""

# Quick standalone check (runs only when this file is executed directly, not when imported by app.py)
if __name__ == "__main__":
    print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))
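app.py indexes the return value as summary[0]["summary_text"]; the summarization pipeline returns a list with one dict per input, so a quick interactive check might look like this (illustrative output only, exact wording depends on the model):

from textsumm import summarizer

result = summarizer("Streamlit lets you build data apps in pure Python. " * 10, max_length=40, min_length=10, do_sample=False)
print(result)  # [{'summary_text': '...model-generated summary...'}]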