import streamlit as st
from transformers import pipeline
import pdfplumber

# Initialize the summarizer
summarizer = pipeline("summarization", model="t5-small")
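
# Note: Streamlit re-runs this whole script on every interaction, so the
# pipeline above is rebuilt each time. A cached loader would avoid that;
# a minimal sketch, assuming Streamlit >= 1.18 (where st.cache_resource
# is available):
#
#   @st.cache_resource
#   def load_summarizer():
#       return pipeline("summarization", model="t5-small")
#
#   summarizer = load_summarizer()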
def extract_text_from_pdf(pdf_file):
    """Extract text from an uploaded PDF file using pdfplumber."""
    try:
        text = ""
        with pdfplumber.open(pdf_file) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()  # May be None for image-only pages
                if page_text:
                    text += page_text + "\n"
        if not text.strip():
            raise ValueError("No extractable text found in the PDF.")
        return text
    except Exception as e:
        raise ValueError(f"Error extracting text from PDF: {e}")
def split_text_into_chunks(text, max_chunk_size=1024):
    """Split the text into smaller chunks for summarization."""
    chunks = []
    while len(text) > max_chunk_size:
        split_point = text.rfind(". ", 0, max_chunk_size) + 1  # Find the last full sentence
        if split_point == 0:  # No sentence boundary found, split arbitrarily
            split_point = max_chunk_size
        chunks.append(text[:split_point])
        text = text[split_point:]
    if text:
        chunks.append(text)
    return chunks
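
# Chunking sketch with illustrative values (not used by the app): a small
# max_chunk_size forces a split at the last sentence boundary.
#
#   split_text_into_chunks("First sentence. Second one here.", max_chunk_size=20)
#   # -> ["First sentence.", " Second one here."]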
def summarize_text(chunks):
    """Summarize each chunk of text with dynamic max_length."""
    summaries = []
    for chunk in chunks:
        input_length = len(chunk.split())  # Approximate token count via word count
        max_length = max(48, int(input_length * 0.8))  # ~80% of input length, floor of 48
        summary = summarizer(chunk, max_length=max_length, min_length=10, do_sample=False)
        summaries.append(summary[0]["summary_text"])
    return summaries
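
# The word count above only approximates the model's token count. An exact
# count, if ever needed, could use the model's own tokenizer; a sketch,
# assuming the same "t5-small" checkpoint (T5 models accept roughly 512
# input tokens, which the 1024-character chunks stay safely under):
#
#   from transformers import AutoTokenizer
#   tokenizer = AutoTokenizer.from_pretrained("t5-small")
#   n_tokens = len(tokenizer(chunk).input_ids)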
# Streamlit Dashboard
st.title("PDF Summarizer")
st.write("Upload a PDF file to get a summarized version of its content.")

uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])

if uploaded_file is not None:
    try:
        # Extract text from the PDF
        st.write("Processing your PDF...")
        pdf_text = extract_text_from_pdf(uploaded_file)
        st.write("PDF content extracted successfully.")

        # Display extracted text (optional)
        with st.expander("View Extracted Text"):
            st.text_area("Extracted Text", pdf_text, height=300)

        # Summarize the extracted text
        if st.button("Summarize"):
            st.write("Generating summary...")
            chunks = split_text_into_chunks(pdf_text)
            summaries = summarize_text(chunks)
            full_summary = " ".join(summaries)
            st.subheader("Summary")
            st.write(full_summary)
    except Exception as e:
        st.error(f"An error occurred while processing the PDF: {str(e)}")