Upload 6 files
- app.py +109 -0
- papersearch.py +154 -0
- pdfpass.py +40 -0
- pdfsum.py +125 -0
- requirements.txt +0 -0
- textsumm.py +28 -0
app.py
ADDED
@@ -0,0 +1,109 @@
import streamlit as st
from textsumm import summarizer
from pdfsum import extract_text_from_pdf, summarize_text, split_text_into_chunks
from pdfpass import remove_pdf_password
from papersearch import fetch_papers, filter_papers_by_year
from io import BytesIO
from datetime import datetime
from pypdf import PdfReader, PdfWriter

# Streamlit App Config
st.set_page_config(page_title="PDF Tools Suite", page_icon="📄", layout="wide")

# Sidebar Navigation
st.sidebar.title("📄 PDF Tools Suite")
page = st.sidebar.radio("Select a tool", ["Text Summarizer", "PDF Summarizer", "PDF Password Remover", "Research Paper Search", "PDF Merger", "PDF Splitter", "PDF to Text Converter"])

# Tool: Text Summarizer
if page == "Text Summarizer":
    st.title("📝 Text Summarizer")
    user_input = st.text_area("Enter text to summarize")
    if st.button("Summarize"):
        if user_input.strip():
            summary = summarizer(user_input, max_length=130, min_length=30, do_sample=False)
            st.subheader("Summary")
            st.write(summary[0]["summary_text"])
        else:
            st.warning("Please enter some text to summarize.")

# Tool: PDF Summarizer
elif page == "PDF Summarizer":
    st.title("📜 PDF Summarizer")
    uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
    if uploaded_file is not None:
        pdf_text = extract_text_from_pdf(uploaded_file)
        chunks = split_text_into_chunks(pdf_text)
        summaries = summarize_text(chunks)
        full_summary = " ".join(summaries)
        st.subheader("Summary")
        st.write(full_summary)

# Tool: PDF Password Remover
elif page == "PDF Password Remover":
    st.title("🔑 Remove PDF Password")
    uploaded_file = st.file_uploader("Choose a password-protected PDF", type=["pdf"])
    password = st.text_input("Enter the PDF password", type="password")
    if uploaded_file and password and st.button("Remove Password"):
        output = remove_pdf_password(uploaded_file, password)
        if isinstance(output, BytesIO):
            st.success("Password removed successfully!")
            st.download_button("Download PDF", data=output, file_name="unlocked_pdf.pdf", mime="application/pdf")
        else:
            st.error(f"Error: {output}")

# Tool: Research Paper Search
elif page == "Research Paper Search":
    st.title("🔍 Research Paper Search (arXiv)")
    query = st.text_input("Enter topic or keywords", placeholder="e.g., machine learning")
    max_results = st.slider("Number of results", 1, 50, 10)
    col1, col2 = st.columns(2)
    with col1:
        start_year = st.number_input("Start Year", min_value=1900, max_value=datetime.now().year, value=2000)
    with col2:
        end_year = st.number_input("End Year", min_value=1900, max_value=datetime.now().year, value=datetime.now().year)
    if st.button("Search"):
        papers = fetch_papers(query, max_results)
        papers_filtered = filter_papers_by_year(papers, start_year, end_year)
        if papers_filtered:
            for idx, paper in enumerate(papers_filtered, start=1):
                st.write(f"### {idx}. {paper['title']}")
                st.write(f"**Authors**: {', '.join(paper['authors'])}")
                st.write(f"**Published**: {paper['published']}")
                st.write(f"[Read More]({paper['link']})")
                st.write("---")
        else:
            st.warning("No papers found in the selected range.")

# Tool: PDF Merger
elif page == "PDF Merger":
    st.title("📎 Merge Multiple PDFs")
    uploaded_files = st.file_uploader("Upload multiple PDF files", type=["pdf"], accept_multiple_files=True)
    if uploaded_files and st.button("Merge PDFs"):
        pdf_writer = PdfWriter()
        for file in uploaded_files:
            pdf_reader = PdfReader(file)
            for pdf_page in pdf_reader.pages:  # avoid shadowing the sidebar `page` selection
                pdf_writer.add_page(pdf_page)
        output = BytesIO()
        pdf_writer.write(output)
        output.seek(0)
        st.download_button("Download Merged PDF", data=output, file_name="merged.pdf", mime="application/pdf")

# Tool: PDF Splitter
elif page == "PDF Splitter":
    st.title("✂️ Split PDF into Pages")
    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
    if uploaded_file:
        pdf_reader = PdfReader(uploaded_file)
        for i, pdf_page in enumerate(pdf_reader.pages):
            pdf_writer = PdfWriter()
            pdf_writer.add_page(pdf_page)
            output = BytesIO()
            pdf_writer.write(output)
            output.seek(0)
            st.download_button(f"Download Page {i+1}", data=output, file_name=f"page_{i+1}.pdf", mime="application/pdf")

# Tool: PDF to Text Converter
elif page == "PDF to Text Converter":
    st.title("📜 Extract Text from PDF")
    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
    if uploaded_file:
        pdf_text = extract_text_from_pdf(uploaded_file)
        st.text_area("Extracted Text", pdf_text, height=300)
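For local testing, the combined app is launched with Streamlit's CLI in the usual way (assuming the dependencies listed under requirements.txt below are installed); each tool module can also be run on its own:

streamlit run app.py
# or a single tool in isolation, for example:
streamlit run papersearch.py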
papersearch.py
ADDED
@@ -0,0 +1,154 @@
# import streamlit as st
# import requests
# import xmltodict

# # arXiv API base URL
# ARXIV_API_BASE = "http://export.arxiv.org/api/query"

# def fetch_papers(query, max_results=10):
#     """Fetch papers from the arXiv API."""
#     try:
#         # Build the API query URL
#         api_url = f"{ARXIV_API_BASE}?search_query=all:{query}&start=0&max_results={max_results}"

#         # Make the API request
#         response = requests.get(api_url, headers={'Accept': 'application/xml'})
#         response.raise_for_status()

#         # Parse the XML response
#         data = xmltodict.parse(response.text)
#         entries = data.get('feed', {}).get('entry', [])

#         if not isinstance(entries, list):  # Handle single result
#             entries = [entries]

#         # Extract relevant fields
#         papers = []
#         for entry in entries:
#             papers.append({
#                 'title': entry.get('title'),
#                 'summary': entry.get('summary'),
#                 'published': entry.get('published'),
#                 'authors': [author['name'] for author in entry.get('author', [])] if isinstance(entry.get('author'), list) else [entry.get('author', {}).get('name')],
#                 'link': entry.get('id')
#             })

#         return papers
#     except Exception as e:
#         st.error(f"Error fetching papers: {e}")
#         return []

# # Streamlit app UI
# st.title("arXiv Research Paper Search")
# st.subheader("Find academic papers on your topic of interest")

# # Input fields
# query = st.text_input("Enter a topic or keywords", placeholder="e.g., machine learning, quantum computing")
# max_results = st.slider("Number of results", min_value=1, max_value=50, value=10)

# if st.button("Search"):
#     if query.strip():
#         st.info(f"Searching for papers on: **{query}**")
#         papers = fetch_papers(query, max_results)

#         if papers:
#             st.success(f"Found {len(papers)} papers!")
#             for idx, paper in enumerate(papers, start=1):
#                 st.write(f"### {idx}. {paper['title']}")
#                 st.write(f"**Authors**: {', '.join(paper['authors'])}")
#                 st.write(f"**Published**: {paper['published']}")
#                 st.write(f"[Read More]({paper['link']})")
#                 st.write("---")
#         else:
#             st.warning("No papers found. Try a different query.")
#     else:
#         st.error("Please enter a topic or keywords to search.")

import streamlit as st
import requests
import xmltodict
from datetime import datetime

# arXiv API base URL
ARXIV_API_BASE = "http://export.arxiv.org/api/query"

def fetch_papers(query, max_results=10):
    """Fetch papers from the arXiv API."""
    try:
        # Build the API query URL
        api_url = f"{ARXIV_API_BASE}?search_query=all:{query}&start=0&max_results={max_results}"

        # Make the API request
        response = requests.get(api_url, headers={'Accept': 'application/xml'})
        response.raise_for_status()

        # Parse the XML response
        data = xmltodict.parse(response.text)
        entries = data.get('feed', {}).get('entry', [])

        if not isinstance(entries, list):  # Handle single result
            entries = [entries]

        # Extract relevant fields
        papers = []
        for entry in entries:
            papers.append({
                'title': entry.get('title'),
                'summary': entry.get('summary'),
                'published': entry.get('published'),
                'authors': [author['name'] for author in entry.get('author', [])] if isinstance(entry.get('author'), list) else [entry.get('author', {}).get('name')],
                'link': entry.get('id')
            })

        return papers
    except Exception as e:
        st.error(f"Error fetching papers: {e}")
        return []

def filter_papers_by_year(papers, start_year, end_year):
    """Filter papers by the publication year range."""
    filtered_papers = []
    for paper in papers:
        try:
            published_year = int(paper['published'][:4])  # Extract year from the published date
            if start_year <= published_year <= end_year:
                filtered_papers.append(paper)
        except (TypeError, ValueError):
            continue  # Skip if the published date is missing or malformed
    return filtered_papers

# Standalone Streamlit UI (runs only when this file is executed directly, not when imported by app.py)
if __name__ == "__main__":
    st.title("arXiv Research Paper Search")
    st.subheader("Find academic papers on your topic of interest")

    # Input fields
    query = st.text_input("Enter a topic or keywords", placeholder="e.g., machine learning, quantum computing")
    max_results = st.slider("Number of results", min_value=1, max_value=50, value=10)

    # Year filter
    col1, col2 = st.columns(2)
    with col1:
        start_year = st.number_input("Start Year", min_value=1900, max_value=datetime.now().year, value=2000, step=1)
    with col2:
        end_year = st.number_input("End Year", min_value=1900, max_value=datetime.now().year, value=datetime.now().year, step=1)

    if st.button("Search"):
        if query.strip():
            st.info(f"Searching for papers on: **{query}**")
            papers = fetch_papers(query, max_results)

            # Filter papers by year
            papers_filtered = filter_papers_by_year(papers, start_year, end_year)

            if papers_filtered:
                st.success(f"Found {len(papers_filtered)} papers between {start_year} and {end_year}!")
                for idx, paper in enumerate(papers_filtered, start=1):
                    st.write(f"### {idx}. {paper['title']}")
                    st.write(f"**Authors**: {', '.join(paper['authors'])}")
                    st.write(f"**Published**: {paper['published']}")
                    st.write(f"[Read More]({paper['link']})")
                    st.write("---")
            else:
                st.warning(f"No papers found between {start_year} and {end_year}. Try a different query or adjust the year range.")
        else:
            st.error("Please enter a topic or keywords to search.")
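As a rough illustration of how the two helpers compose outside the Streamlit UI (a hypothetical snippet, not part of the commit; the query string and year range are made up):

# Hypothetical usage of the papersearch helpers from a plain Python session
from papersearch import fetch_papers, filter_papers_by_year

papers = fetch_papers("quantum computing", max_results=20)   # query arXiv
recent = filter_papers_by_year(papers, 2020, 2024)           # keep papers published 2020-2024
for paper in recent:
    print(paper["published"], paper["title"])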
pdfpass.py
ADDED
@@ -0,0 +1,40 @@
import streamlit as st
from PyPDF2 import PdfReader, PdfWriter
from io import BytesIO

def remove_pdf_password(file, password):
    try:
        reader = PdfReader(file)
        if reader.is_encrypted:
            reader.decrypt(password)
        writer = PdfWriter()
        for page in reader.pages:
            writer.add_page(page)

        output = BytesIO()
        writer.write(output)
        output.seek(0)
        return output
    except Exception as e:
        return str(e)

# Standalone Streamlit UI (runs only when this file is executed directly, not when imported by app.py)
if __name__ == "__main__":
    st.title("PDF Password Remover")
    st.write("Upload a password-protected PDF and remove its password.")

    # File upload
    uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
    password = st.text_input("Enter the PDF password", type="password")

    if uploaded_file and password:
        if st.button("Remove Password"):
            output = remove_pdf_password(uploaded_file, password)
            if isinstance(output, BytesIO):
                st.success("Password removed successfully!")
                st.download_button(
                    label="Download PDF without Password",
                    data=output,
                    file_name="unlocked_pdf.pdf",
                    mime="application/pdf",
                )
            else:
                st.error(f"Error: {output}")
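A minimal sketch of using remove_pdf_password outside the UI (hypothetical file names and password; the function returns a BytesIO on success and an error string on failure):

from io import BytesIO
from pdfpass import remove_pdf_password

with open("locked.pdf", "rb") as f:              # hypothetical input file
    result = remove_pdf_password(f, "secret")    # hypothetical password
if isinstance(result, BytesIO):
    with open("unlocked.pdf", "wb") as out:
        out.write(result.getbuffer())            # write the decrypted copy to disk
else:
    print("Failed:", result)                     # the error message returned by the helper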
pdfsum.py
ADDED
@@ -0,0 +1,125 @@
# import streamlit as st
# from transformers import pipeline
# from PyPDF2 import PdfReader

# # Initialize the summarizer
# summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# def extract_text_from_pdf(pdf_file):
#     """Extract text from an uploaded PDF file."""
#     try:
#         reader = PdfReader(pdf_file)
#         text = ""
#         for page in reader.pages:
#             page_text = page.extract_text()
#             if page_text:  # Skip pages with no text
#                 text += page_text + "\n"
#         return text
#     except Exception as e:
#         raise ValueError(f"Error extracting text from PDF: {e}")

# def split_text_into_chunks(text, max_chunk_size=1024):
#     """Split the text into smaller chunks for summarization."""
#     chunks = []
#     while len(text) > max_chunk_size:
#         split_point = text.rfind(". ", 0, max_chunk_size) + 1  # Split at the last sentence boundary
#         if split_point == 0:  # No sentence boundary found, split arbitrarily
#             split_point = max_chunk_size
#         chunks.append

# # Streamlit Dashboard
# st.title("PDF Summarizer")
# st.write("Upload a PDF file to get a summarized version of its content.")

# uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])

# if uploaded_file is not None:
#     # Extract text from the PDF
#     st.write("Processing your PDF...")
#     try:
#         pdf_text = extract_text_from_pdf(uploaded_file)
#         st.write("PDF content extracted successfully.")

#         # Display extracted text (optional)
#         with st.expander("View Extracted Text"):
#             st.text_area("Extracted Text", pdf_text, height=300)

#         # Summarize the extracted text
#         if st.button("Summarize"):
#             st.write("Generating summary...")
#             summary = summarizer(pdf_text, max_length=130, min_length=30, do_sample=False)
#             st.subheader("Summary")
#             st.write(summary[0]["summary_text"])
#     except Exception as e:
#         st.error(f"An error occurred while processing the PDF: {str(e)}")

import streamlit as st
from transformers import pipeline
import pdfplumber

# Initialize the summarizer
summarizer = pipeline("summarization", model="t5-small")

def extract_text_from_pdf(pdf_file):
    """Extract text from an uploaded PDF file using pdfplumber."""
    try:
        text = ""
        with pdfplumber.open(pdf_file) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:  # pdfplumber returns None for pages without extractable text
                    text += page_text + "\n"
        if not text.strip():
            raise ValueError("No extractable text found in the PDF.")
        return text
    except Exception as e:
        raise ValueError(f"Error extracting text from PDF: {e}")

def split_text_into_chunks(text, max_chunk_size=1024):
    """Split the text into smaller chunks for summarization."""
    chunks = []
    while len(text) > max_chunk_size:
        split_point = text.rfind(". ", 0, max_chunk_size) + 1  # Find the last full sentence
        if split_point == 0:  # No sentence boundary found, split arbitrarily
            split_point = max_chunk_size
        chunks.append(text[:split_point])
        text = text[split_point:]
    if text:
        chunks.append(text)
    return chunks

def summarize_text(chunks):
    """Summarize each chunk of text with dynamic max_length."""
    summaries = []
    for chunk in chunks:
        input_length = len(chunk.split())  # Approximate token count
        max_length = max(48, int(input_length * 0.8))  # Set max_length to 80% of input length
        summary = summarizer(chunk, max_length=max_length, min_length=10, do_sample=False)
        summaries.append(summary[0]["summary_text"])
    return summaries

# Standalone Streamlit UI (runs only when this file is executed directly, not when imported by app.py)
if __name__ == "__main__":
    st.title("PDF Summarizer")
    st.write("Upload a PDF file to get a summarized version of its content.")

    uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])

    if uploaded_file is not None:
        try:
            # Extract text from the PDF
            st.write("Processing your PDF...")
            pdf_text = extract_text_from_pdf(uploaded_file)
            st.write("PDF content extracted successfully.")

            # Display extracted text (optional)
            with st.expander("View Extracted Text"):
                st.text_area("Extracted Text", pdf_text, height=300)

            # Summarize the extracted text
            if st.button("Summarize"):
                st.write("Generating summary...")
                chunks = split_text_into_chunks(pdf_text)
                summaries = summarize_text(chunks)
                full_summary = " ".join(summaries)
                st.subheader("Summary")
                st.write(full_summary)
        except Exception as e:
            st.error(f"An error occurred while processing the PDF: {str(e)}")
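Note that split_text_into_chunks works in characters (max_chunk_size=1024 characters) while the t5-small pipeline's max_length is measured in tokens, so the word-count heuristic in summarize_text is only an approximation. A quick, illustrative sanity check of the chunker on plain text (importing pdfsum also loads the t5-small pipeline, so the first run downloads the model):

from pdfsum import split_text_into_chunks

sample = ("This is sentence one. " * 40) + ("This is sentence two. " * 40)
chunks = split_text_into_chunks(sample, max_chunk_size=300)
print(len(chunks), [len(c) for c in chunks])  # several chunks, each at most ~300 characters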
requirements.txt
ADDED
Binary file (352 Bytes).
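The contents of requirements.txt are not shown in this diff (it is rendered as a binary file). Based purely on the imports across the six modules, a plausible equivalent would include something like the following; this is an assumption, not the committed file:

# Hypothetical reconstruction of requirements.txt, inferred from the imports above
streamlit
transformers
torch        # backend for the transformers pipelines
pypdf        # used by app.py
PyPDF2       # used by pdfpass.py
pdfplumber
xmltodict
requests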
textsumm.py
ADDED
@@ -0,0 +1,28 @@
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

ARTICLE = """
There is widespread international concern that Russia's war will provoke a global food crisis similar to, or
worse than, that faced in 2007 and 2008. The war comes at a time when the global food system was already
struggling to feed its growing population in a sustainable way, under the pressure caused by climate change
and the Covid-19 pandemic. Russia and Ukraine are key agricultural players, together exporting nearly 12%
of food calories traded globally. They are major providers of basic agro-commodities, including wheat,
maize and sunflower oil, and Russia is the world's top exporter of fertilisers. The global supply chain will
remain disrupted until Russia and Ukraine retreat and the war ends.
The war's impact on global food supply centres on three factors. The first is a significant reduction in exports
and production of essential commodities from both countries, caused by the war and not by the economic
sanctions imposed on Russia, which intentionally did not target the agricultural sector. Overall, the
European Commission estimates that 'up to 25 million tonnes of wheat would need to be substituted to
meet worldwide food needs in the current and the next season'. The second factor is a global spike in prices of
food supplies and inputs needed for agri-food production, which were already at record levels before the
war. The war has pushed prices up further. The third factor is the international response to the above,
which could either amplify the effects of the crisis (mainly through uncoordinated export bans) or mitigate them
(by applying lessons learnt from the 2007-2008 food crisis). A number of countries other than Russia and
Ukraine have already imposed, or announced their intention to impose, some control over exports of
essential agricultural commodities, including Egypt, Argentina, Indonesia, Serbia, Turkey and, in the EU,
Hungary. We should keep in mind that a long war will make the global situation irrecoverable.
"""

# Quick standalone check (runs only when this file is executed directly, not when imported by app.py)
if __name__ == "__main__":
    print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))
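app.py indexes the return value as summary[0]["summary_text"]; the summarization pipeline returns a list with one dict per input, so a quick interactive check might look like this (illustrative output only, exact wording depends on the model):

from textsumm import summarizer

result = summarizer("Streamlit lets you build data apps in pure Python. " * 10, max_length=40, min_length=10, do_sample=False)
print(result)  # [{'summary_text': '...model-generated summary...'}]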