Spaces:
Sleeping
Sleeping
import streamlit as st | |
import PyPDF2 | |
from transformers import pipeline | |
from gtts import gTTS | |
from PIL import Image | |
# Function to read the PDF and extract text
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_file: Either something ``PyPDF2.PdfReader`` can open (a path or a
            binary file-like object) or an already-constructed reader, i.e.
            any object exposing a ``pages`` sequence whose items provide
            ``extract_text()``.

    Returns:
        The text of all pages joined in page order with no separator. A page
        whose ``extract_text()`` yields ``None`` contributes an empty string.
    """
    # The app passes in a reader it has already built. PdfReader expects a
    # path or stream, not another reader, so reuse a reader-like object
    # instead of wrapping it a second time.
    if hasattr(pdf_file, "pages"):
        pdf_reader = pdf_file
    else:
        pdf_reader = PyPDF2.PdfReader(pdf_file)

    # join() avoids repeated string concatenation; `or ""` keeps pages
    # without extractable text from breaking the join.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
# Function to answer questions based on document image
def answer_question_with_docvqa(pdf_file, question):
    """Ask ``question`` of every page of a document, one page image at a time.

    Args:
        pdf_file: Object exposing a ``pages`` sequence; each page must provide
            a ``to_image()`` method that renders it.
            NOTE(review): PyPDF2 page objects do not appear to provide
            ``to_image()`` (it looks like a pdfplumber-style API) - confirm
            what the caller is expected to pass here.
        question: The question to ask about each page.

    Returns:
        A list with one pipeline result per page, in page order.
    """
    # NOTE(review): the pipeline (and its model) is built on every call;
    # consider caching it if this is called repeatedly.
    docvqa_pipeline = pipeline(
        "document-question-answering",
        model="google/pix2struct-docvqa-large",
    )

    answers = []
    for page in pdf_file.pages:
        rendered = page.to_image()  # Convert PDF page to image if possible
        # Some renderers wrap the PIL image in an `.original` attribute
        # (presumably pdfplumber's PageImage - verify); unwrap it if present.
        page_image = getattr(rendered, "original", rendered)
        # Image.open() only accepts a path or file object, so call it only
        # when the rendered result is not already a PIL image.
        if not isinstance(page_image, Image.Image):
            page_image = Image.open(page_image)
        answers.append(docvqa_pipeline(image=page_image, question=question))
    return answers
# Function to generate discussion points
def generate_discussion_points(text):
    """Summarise ``text`` into discussion points.

    Uses the default ``'summarization'`` pipeline with deterministic decoding
    (``do_sample=False``) and a summary length bounded by ``min_length=300``
    and ``max_length=600``.

    Args:
        text: The document text to summarise.

    Returns:
        The ``'summary_text'`` of the first pipeline result.
    """
    summarizer = pipeline('summarization')
    # truncation=True clips input that exceeds the model's maximum input
    # length instead of failing on long documents; only the leading part of
    # a very long text is then summarised.
    summary = summarizer(
        text,
        max_length=600,
        min_length=300,
        do_sample=False,
        truncation=True,
    )
    return summary[0]['summary_text']
# Function to convert text to speech
def text_to_speech(text, lang='en', output_path="discussion_points.mp3"):
    """Synthesise ``text`` with gTTS and save the audio to ``output_path``.

    Args:
        text: The text to speak.
        lang: Language code passed to gTTS. Defaults to ``'en'``.
        output_path: Destination of the audio file. Defaults to
            ``"discussion_points.mp3"``, the file the app reads back.

    Returns:
        None. The audio is written to ``output_path`` as a side effect.
    """
    tts = gTTS(text=text, lang=lang)
    tts.save(output_path)
# Streamlit app: upload a PDF, show its text, answer questions about it, and
# produce spoken discussion points.
st.title("PDF Analysis and Discussion Generator")
uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

if uploaded_file is not None:
    # Load PDF for processing (used by the question-answering step)
    pdf_reader = PyPDF2.PdfReader(uploaded_file)

    # Extract and display text content. The extractor opens the PDF itself,
    # so hand it the uploaded file rather than the reader built above; rewind
    # first because that reader has already moved the stream position.
    uploaded_file.seek(0)
    text = extract_text_from_pdf(uploaded_file)
    st.subheader("Extracted Text")
    st.write(text)

    # Question answering functionality
    st.subheader("Ask Questions About the Document")
    user_question = st.text_input("Enter your question:")
    if user_question:
        try:
            answers = answer_question_with_docvqa(pdf_reader, user_question)
        except Exception as exc:
            # UI boundary: report the failure in the page, but keep rendering
            # the discussion points below instead of aborting the whole run.
            st.exception(exc)
        else:
            st.write("Answer:", answers)

    if text.strip():
        # Generate and display discussion points
        discussion_points = generate_discussion_points(text)
        st.subheader("Generated Discussion Points")
        st.write(discussion_points)

        # Convert discussion points to audio
        text_to_speech(discussion_points)
        # Read inside `with` so the file handle is always closed.
        with open("discussion_points.mp3", "rb") as audio_file:
            audio_bytes = audio_file.read()
        st.audio(audio_bytes, format='audio/mp3')
    else:
        # Nothing to summarise or speak (e.g. no extractable text layer).
        st.warning(
            "No extractable text was found in this PDF, so no discussion "
            "points were generated."
        )