Spaces:

notabaka
/

DocQA

Sleeping

App Files Community

DocQA / app.py

notabaka

test3

af64ea8 over 1 year ago

raw

history blame contribute delete

2.77 kB

	import streamlit as st
	import os
	from PyPDF2 import PdfReader
	import openpyxl
	from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

	# SECURITY: a Google API key used to be hard-coded on this line and was
	# committed to source control. Treat that key as compromised: revoke/rotate
	# it in the Google Cloud console.
	#
	# Nothing in this file reads GOOGLE_API_KEY, so it is no longer injected
	# here. If a dependency does need it, provide it through the deployment
	# environment (e.g. a secret / environment variable) instead of source code.

	def get_pdf_text(pdf_docs):
	    """Return the extracted text of every page of every PDF in *pdf_docs*.

	    Args:
	        pdf_docs: Iterable of PDF sources that ``PdfReader`` can open
	            (``main`` passes the objects returned by ``st.file_uploader``).
	            ``None`` or an empty iterable is treated as "no documents".

	    Returns:
	        The page texts concatenated in document order, then page order,
	        with no separator between pages. An empty string when there is
	        nothing to read.
	    """
	    return "".join(
	        # ``or ""`` keeps the join from raising if a page yields no text
	        # (a falsy/None result, e.g. for an image-only page).
	        page.extract_text() or ""
	        for pdf in (pdf_docs or ())
	        for page in PdfReader(pdf).pages
	    )

	def get_excel_text(excel_docs):
	    """Return the cell contents of every workbook in *excel_docs* as one string.

	    Every sheet of every workbook is walked row by row, cell by cell, and
	    each non-empty cell value is converted with ``str``.

	    Args:
	        excel_docs: Iterable of workbook sources accepted by
	            ``openpyxl.load_workbook`` (``main`` passes the objects returned
	            by ``st.file_uploader``). ``None`` or an empty iterable is
	            treated as "no documents".

	    Returns:
	        The cell values separated by single spaces, with leading and
	        trailing whitespace stripped. An empty string when there is nothing
	        to read.
	    """
	    values = []
	    for excel_doc in excel_docs or ():
	        workbook = openpyxl.load_workbook(filename=excel_doc)
	        for sheet in workbook:
	            for row in sheet:
	                # Empty cells have value None; skipping them keeps the
	                # literal text "None" out of the question-answering context.
	                values.extend(
	                    str(cell.value) for cell in row if cell.value is not None
	                )
	    return " ".join(values).strip()

	def get_user_input(user_question, qa_pipeline):
	    """Answer *user_question* from the processed document text and show the result.

	    The context is the text stored under ``st.session_state["raw_text"]``.

	    Args:
	        user_question: The question typed by the user.
	        qa_pipeline: Callable invoked as
	            ``qa_pipeline(question=..., context=...)`` whose result exposes
	            the answer under the ``"answer"`` key.
	    """
	    context = st.session_state.get("raw_text", "")
	    with st.container():
	        # A question-answering pipeline cannot work with an empty context
	        # (e.g. "Process" was clicked without files, or the files had no
	        # extractable text), so tell the user instead of failing.
	        if not context or not context.strip():
	            st.warning("No document text available. Upload and process a document first.")
	            return
	        response = qa_pipeline(question=user_question, context=context)
	        st.write("Answer:", response["answer"])

	def _get_qa_pipeline():
	    """Return the question-answering pipeline, building it once per session."""
	    # Streamlit re-executes the script on every interaction, so a plain
	    # local variable would force a model reload for each question. Keeping
	    # the pipeline in session state makes later questions reuse it.
	    if "qa_pipeline" not in st.session_state:
	        # NOTE(review): confirm this checkpoint ships an extractive
	        # question-answering head; otherwise answers will not be meaningful.
	        model_name = "HanNayeoniee/LHK_DPO_v1"
	        tokenizer = AutoTokenizer.from_pretrained(model_name)
	        model = AutoModelForQuestionAnswering.from_pretrained(model_name)
	        st.session_state.qa_pipeline = pipeline(
	            "question-answering", model=model, tokenizer=tokenizer
	        )
	    return st.session_state.qa_pipeline


	def main():
	    """Render the DocChat page: document upload in the sidebar, Q&A in the body.

	    Processing PDF or Excel uploads stores the extracted text in
	    ``st.session_state.raw_text`` (the most recently processed upload
	    replaces the previous text). A question typed by the user is answered
	    against that text.
	    """
	    st.set_page_config("DocChat")
	    st.header("DocChat - Chat with multiple documents")
	    st.write("---")

	    with st.container():
	        with st.sidebar:
	            st.title("Settings")
	            st.subheader("Upload Documents")

	            st.markdown("PDF files:")
	            pdf_docs = st.file_uploader("Upload PDF Files", accept_multiple_files=True)
	            if st.button("Process PDF file"):
	                if not pdf_docs:
	                    # Do not report success when nothing was uploaded.
	                    st.warning("Please upload at least one PDF file first.")
	                else:
	                    with st.spinner("Processing PDFs..."):
	                        st.session_state.raw_text = get_pdf_text(pdf_docs)
	                    st.success("PDF processed successfully!")

	            st.markdown("Excel files:")
	            excel_docs = st.file_uploader("Upload Excel Files", accept_multiple_files=True)
	            if st.button("Process Excel file"):
	                if not excel_docs:
	                    st.warning("Please upload at least one Excel file first.")
	                else:
	                    with st.spinner("Processing Excel files..."):
	                        st.session_state.raw_text = get_excel_text(excel_docs)
	                    st.success("Excel file processed successfully!")

	    with st.container():
	        st.subheader("Document Q&A")
	        st.write('Ask a question : ')
	        user_question = st.text_input("Ask a Question from the document")
	        if user_question:
	            if "raw_text" in st.session_state:
	                get_user_input(user_question, _get_qa_pipeline())
	            else:
	                # Without processed text there is no context to answer from;
	                # say so rather than silently showing nothing.
	                st.info("Upload and process a document before asking a question.")

	# Run the app only when this file is executed as a script
	# (for example via `streamlit run app.py`), not when it is imported.
	if __name__ == "__main__":
	    main()