Spaces:
Sleeping
Sleeping
File size: 4,906 Bytes
93c008b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
# import streamlit as st
# from transformers import pipeline
# from PyPDF2 import PdfReader
# # Initialize the summarizer
# summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# def extract_text_from_pdf(pdf_file):
# """Extract text from an uploaded PDF file."""
# try:
# reader = PdfReader(pdf_file)
# text = ""
# for page in reader.pages:
# page_text = page.extract_text()
# if page_text: # Skip pages with no text
# text += page_text + "\n"
# return text
# except Exception as e:
# raise ValueError(f"Error extracting text from PDF: {e}")
# def split_text_into_chunks(text, max_chunk_size=1024):
# """Split the text into smaller chunks for summarization."""
# chunks = []
# while len(text) > max_chunk_size:
# split_point = text.rfind(". ", 0, max_chunk_size) + 1 # Split at the last sentence boundary
# if split_point == 0: # No sentence boundary found, split arbitrarily
# split_point = max_chunk_size
# chunks.append
# # Streamlit Dashboard
# st.title("PDF Summarizer")
# st.write("Upload a PDF file to get a summarized version of its content.")
# uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
# if uploaded_file is not None:
# # Extract text from the PDF
# st.write("Processing your PDF...")
# try:
# pdf_text = extract_text_from_pdf(uploaded_file)
# st.write("PDF content extracted successfully.")
# # Display extracted text (optional)
# with st.expander("View Extracted Text"):
# st.text_area("Extracted Text", pdf_text, height=300)
# # Summarize the extracted text
# if st.button("Summarize"):
# st.write("Generating summary...")
# summary = summarizer(pdf_text, max_length=130, min_length=30, do_sample=False)
# st.subheader("Summary")
# st.write(summary[0]["summary_text"])
# except Exception as e:
# st.error(f"An error occurred while processing the PDF: {str(e)}")
import streamlit as st
from transformers import pipeline
import pdfplumber
# Initialize the summarizer.
# Streamlit re-executes this script from top to bottom on every widget
# interaction, so the pipeline is built inside a cached loader: the model is
# constructed once per server process and reused across reruns and sessions.
@st.cache_resource
def _load_summarizer():
    """Build the summarization pipeline backed by the ``t5-small`` model."""
    return pipeline("summarization", model="t5-small")


summarizer = _load_summarizer()
def extract_text_from_pdf(pdf_file):
    """Extract text from an uploaded PDF file using pdfplumber.

    Args:
        pdf_file: The PDF to read; passed unchanged to ``pdfplumber.open``.

    Returns:
        The text of all pages, with a newline appended after each page.

    Raises:
        ValueError: If the PDF cannot be opened or parsed, or if no page
            yields any non-whitespace text.
    """
    try:
        with pdfplumber.open(pdf_file) as pdf:
            # A page without a text layer (e.g. a scanned image) may give
            # None instead of a string; treat it as empty so one such page
            # does not abort the whole extraction with a TypeError.
            text = "".join(
                (page.extract_text() or "") + "\n" for page in pdf.pages
            )
        if not text.strip():
            # Raised inside the try on purpose: it is re-wrapped below so the
            # message callers see keeps the same prefix as every other failure.
            raise ValueError("No extractable text found in the PDF.")
        return text
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise ValueError(f"Error extracting text from PDF: {e}") from e
def split_text_into_chunks(text, max_chunk_size=1024):
    """Split the text into smaller chunks for summarization.

    Each chunk is cut just after the last ``". "`` sentence boundary that fits
    within ``max_chunk_size`` characters; when no boundary is found the text
    is cut at exactly ``max_chunk_size`` characters. No characters are dropped,
    so joining the chunks reproduces the input.

    Args:
        text: The string to split.
        max_chunk_size: Maximum chunk length in characters. Must be >= 1.

    Returns:
        A list of non-empty strings, each at most ``max_chunk_size`` long.
        An empty input yields an empty list.

    Raises:
        ValueError: If ``max_chunk_size`` is less than 1.
    """
    # A non-positive size can never consume the text, so the loop below would
    # keep appending empty chunks forever; reject it up front.
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be a positive integer.")

    chunks = []
    while len(text) > max_chunk_size:
        # +1 keeps the period with the chunk; it also turns rfind's -1
        # ("not found") into 0, which is the sentinel tested below.
        split_point = text.rfind(". ", 0, max_chunk_size) + 1
        if split_point == 0:  # No sentence boundary found, split arbitrarily
            split_point = max_chunk_size
        chunks.append(text[:split_point])
        text = text[split_point:]
    if text:
        chunks.append(text)
    return chunks
def summarize_text(chunks):
    """Summarize every chunk, sizing the output limit from the chunk length.

    Args:
        chunks: Iterable of text chunks to summarize.

    Returns:
        A list with one summary string per chunk, in input order.
    """

    def _condense(piece):
        # Whitespace-separated word count stands in for the token count.
        word_count = len(piece.split())
        # Allow up to 80% of the input length, but never fewer than 48.
        limit = max(48, int(word_count * 0.8))
        result = summarizer(piece, max_length=limit, min_length=10, do_sample=False)
        return result[0]["summary_text"]

    return [_condense(piece) for piece in chunks]
# Streamlit Dashboard: page header, PDF upload, extracted-text preview and an
# on-demand summary of the uploaded document.
st.title("PDF Summarizer")
st.write("Upload a PDF file to get a summarized version of its content.")

uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])

if uploaded_file is not None:
    try:
        # Step 1: pull the raw text out of the upload.
        st.write("Processing your PDF...")
        pdf_text = extract_text_from_pdf(uploaded_file)
        st.write("PDF content extracted successfully.")

        # Step 2: let the user inspect the extracted text (collapsed section).
        with st.expander("View Extracted Text"):
            st.text_area("Extracted Text", pdf_text, height=300)

        # Step 3: summarize only when the button is pressed.
        summarize_clicked = st.button("Summarize")
        if summarize_clicked:
            st.write("Generating summary...")
            full_summary = " ".join(
                summarize_text(split_text_into_chunks(pdf_text))
            )
            st.subheader("Summary")
            st.write(full_summary)
    except Exception as e:
        # Top-level UI boundary: show any failure to the user instead of
        # letting the app crash.
        st.error(f"An error occurred while processing the PDF: {str(e)}")
|