# DocQA / app.py
# notabaka's picture
# test3
# af64ea8
import streamlit as st
import os
from PyPDF2 import PdfReader
import openpyxl
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
# SECURITY: this line used to assign a literal Google API key to
# os.environ["GOOGLE_API_KEY"]. Credentials must never be committed to source
# control, so the literal has been removed. The exposed key should be revoked,
# and any replacement supplied through the process environment (or the hosting
# platform's secret store) before the app starts; a value provided that way is
# no longer overwritten here.
# NOTE(review): nothing in the visible part of this file reads GOOGLE_API_KEY;
# confirm no other module depends on it being set at import time.
def get_pdf_text(pdf_docs):
    """Return the extracted text of every page of every PDF as one string.

    Args:
        pdf_docs: Iterable of PDF sources; each item is handed to
            ``PdfReader`` unchanged.

    Returns:
        The ``extract_text()`` result of each page, in document order and
        then page order, joined with a newline. An empty iterable yields
        ``""``.
    """
    # A newline between pages keeps the last word of one page from fusing
    # with the first word of the next, which would corrupt the Q&A context.
    # Joining once also avoids rebuilding the string for every page.
    return "\n".join(
        page.extract_text()
        for pdf in pdf_docs
        for page in PdfReader(pdf).pages
    )
def get_excel_text(excel_docs):
    """Return the cell contents of every worksheet as one space-separated string.

    Args:
        excel_docs: Iterable of workbook sources; each item is passed to
            ``openpyxl.load_workbook`` as ``filename``.

    Returns:
        The ``str()`` of every non-empty cell value, visited workbook by
        workbook, sheet by sheet, row by row, joined with single spaces and
        stripped of leading/trailing whitespace. An empty iterable yields
        ``""``.
    """
    cell_texts = []
    for excel_doc in excel_docs:
        workbook = openpyxl.load_workbook(filename=excel_doc)
        for sheet in workbook:
            for row in sheet:
                # Empty cells carry ``None``; stringifying them would inject
                # the literal word "None" into the text used for answering.
                cell_texts.extend(
                    str(cell.value) for cell in row if cell.value is not None
                )
    return " ".join(cell_texts).strip()
def get_user_input(user_question, qa_pipeline):
    """Answer *user_question* from the processed document text and display it.

    The context is read from ``st.session_state.raw_text``. When that entry is
    missing or contains only whitespace, a warning is shown and the pipeline
    is not called.

    Args:
        user_question: The question text entered by the user.
        qa_pipeline: Callable invoked as
            ``qa_pipeline(question=..., context=...)`` whose result exposes
            an ``"answer"`` entry.
    """
    context = st.session_state.raw_text if "raw_text" in st.session_state else ""
    with st.container():
        # Processing zero files stores an empty string; the question-answering
        # pipeline is expected to reject an empty context, so stop here with a
        # readable message instead of surfacing a traceback.
        if not context or not context.strip():
            st.warning(
                "No document text is available. Upload and process a document first."
            )
            return
        response = qa_pipeline(question=user_question, context=context)
        st.write("Answer:", response["answer"])
def _get_qa_pipeline():
    """Return the question-answering pipeline, building it once per session."""
    # Streamlit re-executes the script on every interaction, so a plain local
    # variable would force the tokenizer and model to reload for each
    # question. Session state survives reruns and keeps the loaded pipeline.
    if "qa_pipeline" not in st.session_state:
        model_name = "HanNayeoniee/LHK_DPO_v1"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForQuestionAnswering.from_pretrained(model_name)
        st.session_state.qa_pipeline = pipeline(
            "question-answering", model=model, tokenizer=tokenizer
        )
    return st.session_state.qa_pipeline


def main():
    """Render the DocChat page: sidebar uploads plus the question box.

    Processing PDF or Excel uploads stores their extracted text in
    ``st.session_state.raw_text`` (each processing run replaces the previous
    value). When a question is entered and document text is available, the
    question is answered through ``get_user_input``.
    """
    st.set_page_config("DocChat")
    st.header("DocChat - Chat with multiple documents")
    st.write("---")

    with st.container():
        with st.sidebar:
            st.title("Settings")
            st.subheader("Upload Documents")

            st.markdown("**PDF files:**")
            pdf_docs = st.file_uploader("Upload PDF Files", accept_multiple_files=True)
            if st.button("Process PDF file"):
                with st.spinner("Processing PDFs..."):
                    st.session_state.raw_text = get_pdf_text(pdf_docs)
                st.success("PDF processed successfully!")

            st.markdown("**Excel files:**")
            excel_docs = st.file_uploader("Upload Excel Files", accept_multiple_files=True)
            if st.button("Process Excel file"):
                with st.spinner("Processing Excel files..."):
                    st.session_state.raw_text = get_excel_text(excel_docs)
                st.success("Excel file processed successfully!")

    with st.container():
        st.subheader("Document Q&A")
        st.write('Ask a question : ')
        user_question = st.text_input("Ask a Question from the document")
        if not user_question:
            return

        # Without processed text there is nothing to answer from; say so
        # instead of silently doing nothing or calling the model on "".
        has_text = (
            "raw_text" in st.session_state
            and bool(st.session_state.raw_text.strip())
        )
        if not has_text:
            st.info("Upload and process a document in the sidebar before asking a question.")
            return

        get_user_input(user_question, _get_qa_pipeline())
# Run the app only when this file is executed as the entry script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()