import os

import openpyxl
import streamlit as st
from PyPDF2 import PdfReader
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

# Set your own key here or via the environment if a Google API is needed;
# the Hugging Face question-answering pipeline below does not use it.
os.environ["GOOGLE_API_KEY"] = "YOUR_GOOGLE_API_KEY"


def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page in the uploaded PDF files."""
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() can return None for pages without a text layer.
            text += page.extract_text() or ""
    return text


def get_excel_text(excel_docs):
    """Concatenate the cell values of every sheet in the uploaded Excel files."""
    text = ""
    for excel_doc in excel_docs:
        workbook = openpyxl.load_workbook(filename=excel_doc)
        for sheet in workbook:
            for row in sheet:
                for cell in row:
                    if cell.value is not None:
                        text += str(cell.value) + " "
    return text.strip()


@st.cache_resource
def load_qa_pipeline():
    """Load the question-answering model once and reuse it across Streamlit reruns."""
    # NOTE: this checkpoint is a DPO-tuned chat model; loading it with
    # AutoModelForQuestionAnswering attaches a freshly initialized QA head, so a
    # checkpoint fine-tuned for extractive QA may give noticeably better answers.
    model_name = "HanNayeoniee/LHK_DPO_v1"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    return pipeline("question-answering", model=model, tokenizer=tokenizer)


def get_user_input(user_question, qa_pipeline):
    """Run extractive question answering over the stored document text."""
    with st.container():
        response = qa_pipeline(question=user_question, context=st.session_state.raw_text)
        st.write("Answer:", response["answer"])


def main():
    st.set_page_config(page_title="DocChat")
    st.header("DocChat - Chat with multiple documents")
    st.write("---")

    with st.sidebar:
        st.title("Settings")
        st.subheader("Upload Documents")

        st.markdown("**PDF files:**")
        pdf_docs = st.file_uploader("Upload PDF Files", accept_multiple_files=True, type=["pdf"])
        if st.button("Process PDF files"):
            if pdf_docs:
                with st.spinner("Processing PDFs..."):
                    st.session_state.raw_text = get_pdf_text(pdf_docs)
                st.success("PDFs processed successfully!")
            else:
                st.warning("Please upload at least one PDF file first.")

        st.markdown("**Excel files:**")
        excel_docs = st.file_uploader("Upload Excel Files", accept_multiple_files=True, type=["xlsx", "xlsm"])
        if st.button("Process Excel files"):
            if excel_docs:
                with st.spinner("Processing Excel files..."):
                    st.session_state.raw_text = get_excel_text(excel_docs)
                st.success("Excel files processed successfully!")
            else:
                st.warning("Please upload at least one Excel file first.")

    with st.container():
        st.subheader("Document Q&A")
        user_question = st.text_input("Ask a question about the uploaded documents")
        if user_question:
            if "raw_text" in st.session_state:
                get_user_input(user_question, load_qa_pipeline())
            else:
                st.info("Please upload and process a document before asking a question.")


if __name__ == "__main__":
    main()
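
# Usage sketch (assumptions: this file is saved as app.py and streamlit, PyPDF2,
# openpyxl, transformers, and a backend such as torch are installed):
#
#   pip install streamlit PyPDF2 openpyxl transformers torch
#   streamlit run app.py
#
# The first question triggers a one-time model download from the Hugging Face Hub;
# st.cache_resource then keeps the loaded pipeline in memory across reruns.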