File size: 4,906 Bytes
93c008b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# import streamlit as st
# from transformers import pipeline
# from PyPDF2 import PdfReader

# # Initialize the summarizer
# summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# def extract_text_from_pdf(pdf_file):
#     """Extract text from an uploaded PDF file."""
#     try:
#         reader = PdfReader(pdf_file)
#         text = ""
#         for page in reader.pages:
#             page_text = page.extract_text()
#             if page_text:  # Skip pages with no text
#                 text += page_text + "\n"
#         return text
#     except Exception as e:
#         raise ValueError(f"Error extracting text from PDF: {e}")

# def split_text_into_chunks(text, max_chunk_size=1024):
#     """Split the text into smaller chunks for summarization."""
#     chunks = []
#     while len(text) > max_chunk_size:
#         split_point = text.rfind(". ", 0, max_chunk_size) + 1  # Split at the last sentence boundary
#         if split_point == 0:  # No sentence boundary found, split arbitrarily
#             split_point = max_chunk_size
#         chunks.append

# # Streamlit Dashboard
# st.title("PDF Summarizer")
# st.write("Upload a PDF file to get a summarized version of its content.")

# uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])

# if uploaded_file is not None:
#     # Extract text from the PDF
#     st.write("Processing your PDF...")
#     try:
#         pdf_text = extract_text_from_pdf(uploaded_file)
#         st.write("PDF content extracted successfully.")
        
#         # Display extracted text (optional)
#         with st.expander("View Extracted Text"):
#             st.text_area("Extracted Text", pdf_text, height=300)
        
#         # Summarize the extracted text
#         if st.button("Summarize"):
#             st.write("Generating summary...")
#             summary = summarizer(pdf_text, max_length=130, min_length=30, do_sample=False)
#             st.subheader("Summary")
#             st.write(summary[0]["summary_text"])
#     except Exception as e:
#         st.error(f"An error occurred while processing the PDF: {str(e)}")

import streamlit as st
from transformers import pipeline
import pdfplumber

# Initialize the summarizer.
# Streamlit re-executes this entire script on every user interaction, so a
# bare module-level pipeline(...) call would reload the t5-small model from
# disk on each rerun. cache_resource keeps a single pipeline instance alive
# across reruns and sessions.
@st.cache_resource
def _load_summarizer():
    """Load and cache the t5-small summarization pipeline."""
    return pipeline("summarization", model="t5-small")

summarizer = _load_summarizer()

def extract_text_from_pdf(pdf_file):
    """Extract text from an uploaded PDF file using pdfplumber.

    Args:
        pdf_file: A file-like object (e.g. a Streamlit UploadedFile) holding the PDF.

    Returns:
        The concatenated text of all pages, with a newline after each page.

    Raises:
        ValueError: If the PDF cannot be read or contains no extractable text.
    """
    try:
        text = ""
        with pdfplumber.open(pdf_file) as pdf:
            for page in pdf.pages:
                # extract_text() returns None for pages with no text layer
                # (e.g. scanned images); the old `text += extract_text() + "\n"`
                # raised TypeError on such pages. Skip them instead.
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        if not text.strip():
            raise ValueError("No extractable text found in the PDF.")
        return text
    except ValueError:
        # Re-raise our own "no text" error as-is instead of double-wrapping
        # it into "Error extracting text from PDF: No extractable text...".
        raise
    except Exception as e:
        raise ValueError(f"Error extracting text from PDF: {e}") from e

def split_text_into_chunks(text, max_chunk_size=1024):
    """Break *text* into pieces of at most *max_chunk_size* characters.

    Each cut prefers the last sentence boundary (". ") within the limit; the
    chunk keeps the period and the following space starts the next chunk.
    When no boundary exists, the text is cut hard at *max_chunk_size*. The
    concatenation of the returned chunks always equals the input text.
    """
    pieces = []
    remaining = text
    while len(remaining) > max_chunk_size:
        # rfind returns -1 when no ". " is present, so +1 yields 0 — a
        # falsy value meaning "no sentence boundary found".
        cut = remaining.rfind(". ", 0, max_chunk_size) + 1
        if not cut:
            cut = max_chunk_size  # hard cut: no sentence boundary in range
        pieces.append(remaining[:cut])
        remaining = remaining[cut:]
    if remaining:
        pieces.append(remaining)
    return pieces

def summarize_text(chunks):
    """Summarize every chunk, sizing max_length per chunk.

    The whitespace word count approximates the token count; each summary is
    allowed up to 80% of that, but never less than a floor of 48 tokens.
    Returns one summary string per input chunk, in order.
    """

    def _summarize_one(piece):
        # 80% of the (approximate) input length, floored at 48 tokens.
        limit = max(48, int(len(piece.split()) * 0.8))
        result = summarizer(piece, max_length=limit, min_length=10, do_sample=False)
        return result[0]["summary_text"]

    return [_summarize_one(piece) for piece in chunks]

# Streamlit Dashboard: upload a PDF, preview its text, summarize on demand.
st.title("PDF Summarizer")
st.write("Upload a PDF file to get a summarized version of its content.")

uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])

if uploaded_file is not None:
    try:
        # Pull the raw text out of the uploaded PDF.
        st.write("Processing your PDF...")
        document_text = extract_text_from_pdf(uploaded_file)
        st.write("PDF content extracted successfully.")

        # Let the user inspect the extracted text before summarizing.
        with st.expander("View Extracted Text"):
            st.text_area("Extracted Text", document_text, height=300)

        # Chunk the document, summarize each piece, and stitch the results.
        if st.button("Summarize"):
            st.write("Generating summary...")
            text_chunks = split_text_into_chunks(document_text)
            chunk_summaries = summarize_text(text_chunks)
            st.subheader("Summary")
            st.write(" ".join(chunk_summaries))
    except Exception as e:
        st.error(f"An error occurred while processing the PDF: {str(e)}")