bodhak committed (verified)
Commit 92c0981 · 1 Parent(s): efeec11

Upload 6 files

Files changed (6)
  1. app.py +109 -0
  2. papersearch.py +154 -0
  3. pdfpass.py +40 -0
  4. pdfsum.py +125 -0
  5. requirements.txt +0 -0
  6. textsumm.py +28 -0
app.py ADDED
@@ -0,0 +1,109 @@
+ import streamlit as st
+ from textsumm import summarizer
+ from pdfsum import extract_text_from_pdf, summarize_text, split_text_into_chunks
+ from pdfpass import remove_pdf_password
+ from papersearch import fetch_papers, filter_papers_by_year
+ from io import BytesIO
+ from datetime import datetime
+ from pypdf import PdfReader, PdfWriter
+
+ # Streamlit App Config
+ st.set_page_config(page_title="PDF Tools Suite", page_icon="📄", layout="wide")
+
+ # Sidebar Navigation
+ st.sidebar.title("📄 PDF Tools Suite")
+ page = st.sidebar.radio("Select a tool", ["Text Summarizer", "PDF Summarizer", "PDF Password Remover", "Research Paper Search", "PDF Merger", "PDF Splitter", "PDF to Text Converter"])
+
+ # Tool: Text Summarizer
+ if page == "Text Summarizer":
+     st.title("📝 Text Summarizer")
+     user_input = st.text_area("Enter text to summarize")
+     if st.button("Summarize"):
+         summary = summarizer(user_input, max_length=130, min_length=30, do_sample=False)
+         st.subheader("Summary")
+         st.write(summary[0]["summary_text"])
+
+ # Tool: PDF Summarizer
+ elif page == "PDF Summarizer":
+     st.title("📜 PDF Summarizer")
+     uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
+     if uploaded_file is not None:
+         pdf_text = extract_text_from_pdf(uploaded_file)
+         chunks = split_text_into_chunks(pdf_text)
+         summaries = summarize_text(chunks)
+         full_summary = " ".join(summaries)
+         st.subheader("Summary")
+         st.write(full_summary)
+
+ # Tool: PDF Password Remover
+ elif page == "PDF Password Remover":
+     st.title("🔑 Remove PDF Password")
+     uploaded_file = st.file_uploader("Choose a password-protected PDF", type=["pdf"])
+     password = st.text_input("Enter the PDF password", type="password")
+     if uploaded_file and password and st.button("Remove Password"):
+         output = remove_pdf_password(uploaded_file, password)
+         if isinstance(output, BytesIO):
+             st.success("Password removed successfully!")
+             st.download_button("Download PDF", data=output, file_name="unlocked_pdf.pdf", mime="application/pdf")
+         else:
+             st.error(f"Error: {output}")
+
+ # Tool: Research Paper Search
+ elif page == "Research Paper Search":
+     st.title("🔍 Research Paper Search (arXiv)")
+     query = st.text_input("Enter topic or keywords", placeholder="e.g., machine learning")
+     max_results = st.slider("Number of results", 1, 50, 10)
+     col1, col2 = st.columns(2)
+     with col1:
+         start_year = st.number_input("Start Year", min_value=1900, max_value=datetime.now().year, value=2000)
+     with col2:
+         end_year = st.number_input("End Year", min_value=1900, max_value=datetime.now().year, value=datetime.now().year)
+     if st.button("Search"):
+         papers = fetch_papers(query, max_results)
+         papers_filtered = filter_papers_by_year(papers, start_year, end_year)
+         if papers_filtered:
+             for idx, paper in enumerate(papers_filtered, start=1):
+                 st.write(f"### {idx}. {paper['title']}")
+                 st.write(f"**Authors**: {', '.join(paper['authors'])}")
+                 st.write(f"**Published**: {paper['published']}")
+                 st.write(f"[Read More]({paper['link']})")
+                 st.write("---")
+         else:
+             st.warning("No papers found in the selected range.")
+
+ # Tool: PDF Merger
+ elif page == "PDF Merger":
+     st.title("📎 Merge Multiple PDFs")
+     uploaded_files = st.file_uploader("Upload multiple PDF files", type=["pdf"], accept_multiple_files=True)
+     if uploaded_files and st.button("Merge PDFs"):
+         pdf_writer = PdfWriter()
+         for file in uploaded_files:
+             pdf_reader = PdfReader(file)
+             for page in pdf_reader.pages:
+                 pdf_writer.add_page(page)
+         output = BytesIO()
+         pdf_writer.write(output)
+         output.seek(0)
+         st.download_button("Download Merged PDF", data=output, file_name="merged.pdf", mime="application/pdf")
+
+ # Tool: PDF Splitter
+ elif page == "PDF Splitter":
+     st.title("✂️ Split PDF into Pages")
+     uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
+     if uploaded_file:
+         pdf_reader = PdfReader(uploaded_file)
+         for i, page in enumerate(pdf_reader.pages):
+             pdf_writer = PdfWriter()
+             pdf_writer.add_page(page)
+             output = BytesIO()
+             pdf_writer.write(output)
+             output.seek(0)
+             st.download_button(f"Download Page {i+1}", data=output, file_name=f"page_{i+1}.pdf", mime="application/pdf")
+
+ # Tool: PDF to Text Converter
+ elif page == "PDF to Text Converter":
+     st.title("📜 Extract Text from PDF")
+     uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
+     if uploaded_file:
+         pdf_text = extract_text_from_pdf(uploaded_file)
+         st.text_area("Extracted Text", pdf_text, height=300)
papersearch.py ADDED
@@ -0,0 +1,154 @@
+ # import streamlit as st
+ # import requests
+ # import xmltodict
+
+ # # arXiv API base URL
+ # ARXIV_API_BASE = "http://export.arxiv.org/api/query"
+
+ # def fetch_papers(query, max_results=10):
+ #     """Fetch papers from the arXiv API."""
+ #     try:
+ #         # Build the API query URL
+ #         api_url = f"{ARXIV_API_BASE}?search_query=all:{query}&start=0&max_results={max_results}"
+
+ #         # Make the API request
+ #         response = requests.get(api_url, headers={'Accept': 'application/xml'})
+ #         response.raise_for_status()
+
+ #         # Parse the XML response
+ #         data = xmltodict.parse(response.text)
+ #         entries = data.get('feed', {}).get('entry', [])
+
+ #         if not isinstance(entries, list):  # Handle single result
+ #             entries = [entries]
+
+ #         # Extract relevant fields
+ #         papers = []
+ #         for entry in entries:
+ #             papers.append({
+ #                 'title': entry.get('title'),
+ #                 'summary': entry.get('summary'),
+ #                 'published': entry.get('published'),
+ #                 'authors': [author['name'] for author in entry.get('author', [])] if isinstance(entry.get('author'), list) else [entry.get('author', {}).get('name')],
+ #                 'link': entry.get('id')
+ #             })
+
+ #         return papers
+ #     except Exception as e:
+ #         st.error(f"Error fetching papers: {e}")
+ #         return []
+
+ # # Streamlit app UI
+ # st.title("arXiv Research Paper Search")
+ # st.subheader("Find academic papers on your topic of interest")
+
+ # # Input fields
+ # query = st.text_input("Enter a topic or keywords", placeholder="e.g., machine learning, quantum computing")
+ # max_results = st.slider("Number of results", min_value=1, max_value=50, value=10)
+
+ # if st.button("Search"):
+ #     if query.strip():
+ #         st.info(f"Searching for papers on: **{query}**")
+ #         papers = fetch_papers(query, max_results)
+
+ #         if papers:
+ #             st.success(f"Found {len(papers)} papers!")
+ #             for idx, paper in enumerate(papers, start=1):
+ #                 st.write(f"### {idx}. {paper['title']}")
+ #                 st.write(f"**Authors**: {', '.join(paper['authors'])}")
+ #                 st.write(f"**Published**: {paper['published']}")
+ #                 st.write(f"[Read More]({paper['link']})")
+ #                 st.write("---")
+ #         else:
+ #             st.warning("No papers found. Try a different query.")
+ #     else:
+ #         st.error("Please enter a topic or keywords to search.")
+
+ import streamlit as st
+ import requests
+ import xmltodict
+ from datetime import datetime
+
+ # arXiv API base URL
+ ARXIV_API_BASE = "http://export.arxiv.org/api/query"
+
+ def fetch_papers(query, max_results=10):
+     """Fetch papers from the arXiv API."""
+     try:
+         # Build the API query URL
+         api_url = f"{ARXIV_API_BASE}?search_query=all:{query}&start=0&max_results={max_results}"
+
+         # Make the API request
+         response = requests.get(api_url, headers={'Accept': 'application/xml'})
+         response.raise_for_status()
+
+         # Parse the XML response
+         data = xmltodict.parse(response.text)
+         entries = data.get('feed', {}).get('entry', [])
+
+         if not isinstance(entries, list):  # Handle single result
+             entries = [entries]
+
+         # Extract relevant fields
+         papers = []
+         for entry in entries:
+             papers.append({
+                 'title': entry.get('title'),
+                 'summary': entry.get('summary'),
+                 'published': entry.get('published'),
+                 'authors': [author['name'] for author in entry.get('author', [])] if isinstance(entry.get('author'), list) else [entry.get('author', {}).get('name')],
+                 'link': entry.get('id')
+             })
+
+         return papers
+     except Exception as e:
+         st.error(f"Error fetching papers: {e}")
+         return []
+
+ def filter_papers_by_year(papers, start_year, end_year):
+     """Filter papers by the publication year range."""
+     filtered_papers = []
+     for paper in papers:
+         try:
+             published_year = int(paper['published'][:4])  # Extract year from the published date
+             if start_year <= published_year <= end_year:
+                 filtered_papers.append(paper)
+         except (TypeError, ValueError):
+             continue  # Skip if the published date is missing or not a valid year
+     return filtered_papers
+
+ # Streamlit app UI: runs only when this module is executed directly (streamlit run papersearch.py),
+ # not when app.py imports its helper functions.
+ if __name__ == "__main__":
+     st.title("arXiv Research Paper Search")
+     st.subheader("Find academic papers on your topic of interest")
+
+     # Input fields
+     query = st.text_input("Enter a topic or keywords", placeholder="e.g., machine learning, quantum computing")
+     max_results = st.slider("Number of results", min_value=1, max_value=50, value=10)
+
+     # Year filter
+     col1, col2 = st.columns(2)
+     with col1:
+         start_year = st.number_input("Start Year", min_value=1900, max_value=datetime.now().year, value=2000, step=1)
+     with col2:
+         end_year = st.number_input("End Year", min_value=1900, max_value=datetime.now().year, value=datetime.now().year, step=1)
+
+     if st.button("Search"):
+         if query.strip():
+             st.info(f"Searching for papers on: **{query}**")
+             papers = fetch_papers(query, max_results)
+
+             # Filter papers by year
+             papers_filtered = filter_papers_by_year(papers, start_year, end_year)
+
+             if papers_filtered:
+                 st.success(f"Found {len(papers_filtered)} papers between {start_year} and {end_year}!")
+                 for idx, paper in enumerate(papers_filtered, start=1):
+                     st.write(f"### {idx}. {paper['title']}")
+                     st.write(f"**Authors**: {', '.join(paper['authors'])}")
+                     st.write(f"**Published**: {paper['published']}")
+                     st.write(f"[Read More]({paper['link']})")
+                     st.write("---")
+             else:
+                 st.warning(f"No papers found between {start_year} and {end_year}. Try a different query or adjust the year range.")
+         else:
+             st.error("Please enter a topic or keywords to search.")
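Usage note: the two helpers above can also be driven outside Streamlit. A minimal sketch, assuming papersearch.py is on the import path and the arXiv API is reachable; the query string and year range below are illustrative:

    from papersearch import fetch_papers, filter_papers_by_year

    papers = fetch_papers("quantum computing", max_results=5)   # queries the arXiv API
    recent = filter_papers_by_year(papers, 2020, 2024)          # keep papers published 2020-2024
    for paper in recent:
        print(paper["published"][:4], paper["title"])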
pdfpass.py ADDED
@@ -0,0 +1,40 @@
+ import streamlit as st
+ from PyPDF2 import PdfReader, PdfWriter
+ from io import BytesIO
+
+ def remove_pdf_password(file, password):
+     try:
+         reader = PdfReader(file)
+         if reader.is_encrypted:
+             reader.decrypt(password)
+         writer = PdfWriter()
+         for page in reader.pages:
+             writer.add_page(page)
+
+         output = BytesIO()
+         writer.write(output)
+         output.seek(0)
+         return output
+     except Exception as e:
+         return str(e)
+
+ # Standalone UI: runs only when this module is executed directly (streamlit run pdfpass.py),
+ # not when app.py imports remove_pdf_password.
+ if __name__ == "__main__":
+     st.title("PDF Password Remover")
+     st.write("Upload a password-protected PDF and remove its password.")
+
+     # File upload
+     uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
+     password = st.text_input("Enter the PDF password", type="password")
+
+     if uploaded_file and password:
+         if st.button("Remove Password"):
+             output = remove_pdf_password(uploaded_file, password)
+             if isinstance(output, BytesIO):
+                 st.success("Password removed successfully!")
+                 st.download_button(
+                     label="Download PDF without Password",
+                     data=output,
+                     file_name="unlocked_pdf.pdf",
+                     mime="application/pdf",
+                 )
+             else:
+                 st.error(f"Error: {output}")
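Usage note: remove_pdf_password can be exercised directly on a local file as a quick check. A minimal sketch; the file names and password are placeholders:

    from io import BytesIO
    from pdfpass import remove_pdf_password

    with open("locked.pdf", "rb") as f:               # placeholder input file
        result = remove_pdf_password(f, "secret")     # placeholder password
    if isinstance(result, BytesIO):                   # success: a BytesIO buffer is returned
        with open("unlocked.pdf", "wb") as out:
            out.write(result.getvalue())
    else:                                             # failure: the error message string is returned
        print("Could not unlock:", result)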
pdfsum.py ADDED
@@ -0,0 +1,125 @@
+ # import streamlit as st
+ # from transformers import pipeline
+ # from PyPDF2 import PdfReader
+
+ # # Initialize the summarizer
+ # summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+
+ # def extract_text_from_pdf(pdf_file):
+ #     """Extract text from an uploaded PDF file."""
+ #     try:
+ #         reader = PdfReader(pdf_file)
+ #         text = ""
+ #         for page in reader.pages:
+ #             page_text = page.extract_text()
+ #             if page_text:  # Skip pages with no text
+ #                 text += page_text + "\n"
+ #         return text
+ #     except Exception as e:
+ #         raise ValueError(f"Error extracting text from PDF: {e}")
+
+ # def split_text_into_chunks(text, max_chunk_size=1024):
+ #     """Split the text into smaller chunks for summarization."""
+ #     chunks = []
+ #     while len(text) > max_chunk_size:
+ #         split_point = text.rfind(". ", 0, max_chunk_size) + 1  # Split at the last sentence boundary
+ #         if split_point == 0:  # No sentence boundary found, split arbitrarily
+ #             split_point = max_chunk_size
+ #         chunks.append
+
+ # # Streamlit Dashboard
+ # st.title("PDF Summarizer")
+ # st.write("Upload a PDF file to get a summarized version of its content.")
+
+ # uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
+
+ # if uploaded_file is not None:
+ #     # Extract text from the PDF
+ #     st.write("Processing your PDF...")
+ #     try:
+ #         pdf_text = extract_text_from_pdf(uploaded_file)
+ #         st.write("PDF content extracted successfully.")
+
+ #         # Display extracted text (optional)
+ #         with st.expander("View Extracted Text"):
+ #             st.text_area("Extracted Text", pdf_text, height=300)
+
+ #         # Summarize the extracted text
+ #         if st.button("Summarize"):
+ #             st.write("Generating summary...")
+ #             summary = summarizer(pdf_text, max_length=130, min_length=30, do_sample=False)
+ #             st.subheader("Summary")
+ #             st.write(summary[0]["summary_text"])
+ #     except Exception as e:
+ #         st.error(f"An error occurred while processing the PDF: {str(e)}")
+
+ import streamlit as st
+ from transformers import pipeline
+ import pdfplumber
+
+ # Initialize the summarizer
+ summarizer = pipeline("summarization", model="t5-small")
+
+ def extract_text_from_pdf(pdf_file):
+     """Extract text from an uploaded PDF file using pdfplumber."""
+     try:
+         text = ""
+         with pdfplumber.open(pdf_file) as pdf:
+             for page in pdf.pages:
+                 page_text = page.extract_text()
+                 if page_text:  # Skip pages with no extractable text
+                     text += page_text + "\n"
+         if not text.strip():
+             raise ValueError("No extractable text found in the PDF.")
+         return text
+     except Exception as e:
+         raise ValueError(f"Error extracting text from PDF: {e}")
+
+ def split_text_into_chunks(text, max_chunk_size=1024):
+     """Split the text into smaller chunks for summarization."""
+     chunks = []
+     while len(text) > max_chunk_size:
+         split_point = text.rfind(". ", 0, max_chunk_size) + 1  # Find the last full sentence
+         if split_point == 0:  # No sentence boundary found, split arbitrarily
+             split_point = max_chunk_size
+         chunks.append(text[:split_point])
+         text = text[split_point:]
+     if text:
+         chunks.append(text)
+     return chunks
+
+ def summarize_text(chunks):
+     """Summarize each chunk of text with dynamic max_length."""
+     summaries = []
+     for chunk in chunks:
+         input_length = len(chunk.split())  # Approximate token count
+         max_length = max(48, int(input_length * 0.8))  # Cap the summary at roughly 80% of the input length
+         summary = summarizer(chunk, max_length=max_length, min_length=10, do_sample=False)
+         summaries.append(summary[0]["summary_text"])
+     return summaries
+
+ # Streamlit Dashboard: runs only when this module is executed directly (streamlit run pdfsum.py),
+ # not when app.py imports its helper functions.
+ if __name__ == "__main__":
+     st.title("PDF Summarizer")
+     st.write("Upload a PDF file to get a summarized version of its content.")
+
+     uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
+
+     if uploaded_file is not None:
+         try:
+             # Extract text from the PDF
+             st.write("Processing your PDF...")
+             pdf_text = extract_text_from_pdf(uploaded_file)
+             st.write("PDF content extracted successfully.")
+
+             # Display extracted text (optional)
+             with st.expander("View Extracted Text"):
+                 st.text_area("Extracted Text", pdf_text, height=300)
+
+             # Summarize the extracted text
+             if st.button("Summarize"):
+                 st.write("Generating summary...")
+                 chunks = split_text_into_chunks(pdf_text)
+                 summaries = summarize_text(chunks)
+                 full_summary = " ".join(summaries)
+                 st.subheader("Summary")
+                 st.write(full_summary)
+         except Exception as e:
+             st.error(f"An error occurred while processing the PDF: {str(e)}")
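Usage note: the extraction, chunking, and summarization helpers compose without the dashboard. A minimal sketch, assuming a local paper.pdf as a placeholder input; the first call downloads the t5-small model:

    from pdfsum import extract_text_from_pdf, split_text_into_chunks, summarize_text

    text = extract_text_from_pdf("paper.pdf")     # pdfplumber accepts a path or a file-like object
    chunks = split_text_into_chunks(text)         # ~1024-character chunks, split at sentence boundaries
    print(" ".join(summarize_text(chunks)))       # one partial summary per chunk, joined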
requirements.txt ADDED
Binary file (352 Bytes).
 
textsumm.py ADDED
@@ -0,0 +1,28 @@
+ from transformers import pipeline
+
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+
+ ARTICLE = """
+ There is widespread international concern that Russia's war will provoke a global food crisis similar to, or
+ worse than, that faced in 2007 and 2008. The war comes at a time when the global food system was already
+ struggling to feed its growing population in a sustainable way, under the pressure caused by climate change
+ and the Covid-19 pandemic. Russia and Ukraine are key agricultural players, together exporting nearly 12
+ % of food calories traded globally. They are major providers of basic agro-commodities, including wheat,
+ maize and sunflower oil, and Russia is the world's top exporter of fertilisers. The global supply chain will
+ remain disrupted until Russia and Ukraine retreat and end the war.
+ The war's impact on global food supply centres on three factors. The first is a significant reduction in exports
+ and production of essential commodities from both countries, caused by the war and not the economic
+ sanctions imposed on Russia, which, intentionally, did not target the agricultural sector. Overall, the
+ European Commission estimates that 'up to 25 million tonnes of wheat would need to be substituted to
+ meet worldwide food needs in the current and the next season'. The second factor is a global spike in prices of
+ food supplies and inputs needed for agri-food production, which were already at record levels before the
+ war. The war has pushed prices up further. The third factor is the international response to the above,
+ which could either amplify the effects of the crisis (mainly by uncoordinated export bans) or mitigate them
+ (applying lessons learnt from the 2007-2008 food crisis). A number of countries other than Russia and
+ Ukraine have already imposed, or announced their intention to impose, some control over exports of
+ essential agricultural commodities, including Egypt, Argentina, Indonesia, Serbia, Turkey and, in the EU,
+ Hungary. We should keep in mind that a long duration of the war would make the global situation
+ irrecoverable.
+
+ """
+
+ # Demo: runs only when this module is executed directly, not when app.py imports summarizer.
+ if __name__ == "__main__":
+     print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))