# NotebookCwithqa / app.py
# Arslan17121's picture
# Create app.py
# 4751360 verified
# raw
# history blame
# 2.35 kB
import streamlit as st
import PyPDF2
from transformers import pipeline
from gtts import gTTS
from PIL import Image
# Function to read the PDF and extract text
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_file: Either an already-constructed reader (any object exposing
            a ``pages`` sequence, such as ``PyPDF2.PdfReader``) or anything
            ``PyPDF2.PdfReader`` itself accepts (a path or a binary
            file-like object).

    Returns:
        The text of all pages joined in page order. A page whose
        ``extract_text()`` yields nothing contributes an empty string.
    """
    # A reader cannot be wrapped in a second PdfReader (it is neither a path
    # nor a stream), so reuse any object that already exposes its pages.
    if hasattr(pdf_file, "pages"):
        pdf_reader = pdf_file
    else:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
    # `or ""` keeps the join safe if a page reports no text as None.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
# Function to answer questions based on document image
def answer_question_with_docvqa(pdf_file, question):
    """Ask a question against the images embedded in a PDF's pages.

    Args:
        pdf_file: A PDF reader exposing a ``pages`` sequence (for example
            ``PyPDF2.PdfReader``). Each page's ``images`` attribute, when
            present, supplies the embedded images as objects with a
            ``data`` bytes payload.
        question: The natural-language question to ask about each image.

    Returns:
        A list holding one document-question-answering pipeline result per
        readable embedded image, in document order. The list is empty when
        the PDF has no pages or no readable embedded images (for example a
        text-only PDF).
    """
    import io  # local import: only this function needs an in-memory stream

    # PyPDF2 pages cannot be rasterised (there is no ``to_image`` method), so
    # the images already embedded in each page are used instead; scanned
    # documents typically store every page as one such image.
    # NOTE(review): confirm the installed PyPDF2 release exposes
    # ``PageObject.images``; pages without it are treated as image-free.
    image_payloads = [
        embedded.data
        for page in pdf_file.pages
        for embedded in getattr(page, "images", ())
    ]
    if not image_payloads:
        # Nothing to query: skip the expensive model download/load entirely.
        return []

    docvqa_pipeline = pipeline(
        "document-question-answering",
        model="google/pix2struct-docvqa-large"
    )

    answers = []
    for payload in image_payloads:
        try:
            # Image.open needs a file-like object; wrap the raw bytes.
            page_image = Image.open(io.BytesIO(payload)).convert("RGB")
        except OSError:
            # Embedded stream is not in a format PIL can decode; skip it
            # rather than abort the answers for the remaining images.
            continue
        answers.append(docvqa_pipeline(image=page_image, question=question))
    return answers
# Function to generate discussion points
def generate_discussion_points(text):
    """Summarise ``text`` into discussion points.

    Args:
        text: The document text to summarise.

    Returns:
        The summary string produced by the default ``summarization``
        pipeline, or ``""`` when ``text`` is empty, ``None`` or whitespace
        only (nothing to summarise, so the model is never loaded).
    """
    if not text or not text.strip():
        return ""
    summarizer = pipeline('summarization')
    # truncation=True clips inputs longer than the model's maximum input
    # length instead of failing; only the leading part of a very long
    # document is summarised.
    summary = summarizer(
        text,
        max_length=600,
        min_length=300,
        do_sample=False,
        truncation=True,
    )
    return summary[0]['summary_text']
# Function to convert text to speech
def text_to_speech(text, output_path="discussion_points.mp3", lang='en'):
    """Synthesise ``text`` to an MP3 file with gTTS.

    Args:
        text: The text to speak.
        output_path: Where to write the MP3. Defaults to
            ``"discussion_points.mp3"`` in the current working directory.
        lang: Language code handed to gTTS. Defaults to ``'en'``.

    Returns:
        ``output_path``, so callers can open the file that was written.
    """
    tts = gTTS(text=text, lang=lang)
    tts.save(output_path)
    return output_path
# Streamlit app
st.title("PDF Analysis and Discussion Generator")

uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])

if uploaded_file is not None:
    # Reader kept for the question-answering step below.
    pdf_reader = PyPDF2.PdfReader(uploaded_file)

    # Extract and display text content.
    # extract_text_from_pdf takes PdfReader-style input (a file object), so
    # hand it the upload itself, rewound, rather than the reader built above.
    uploaded_file.seek(0)
    text = extract_text_from_pdf(uploaded_file)
    st.subheader("Extracted Text")
    st.write(text)

    # Question answering functionality
    st.subheader("Ask Questions About the Document")
    user_question = st.text_input("Enter your question:")
    if user_question:
        answers = answer_question_with_docvqa(pdf_reader, user_question)
        st.write("Answer:", answers)

    if text.strip():
        # Generate and display discussion points
        discussion_points = generate_discussion_points(text)
        st.subheader("Generated Discussion Points")
        st.write(discussion_points)

        # Convert discussion points to audio
        text_to_speech(discussion_points)
        # Context manager so the file handle is released on every rerun.
        with open("discussion_points.mp3", "rb") as audio_file:
            audio_bytes = audio_file.read()
        st.audio(audio_bytes, format='audio/mp3')
    else:
        # Image-only (scanned) PDFs yield no text; summarising or speaking
        # an empty string would fail, so tell the user instead.
        st.warning(
            "No text could be extracted from this PDF, so discussion points "
            "and audio were not generated."
        )