Spaces:

notabaka
/

DocQA

Sleeping

File size: 2,767 Bytes

b21d556
 
 
 
2dbdb3a
b21d556
 
 
 
 
 
 
 
 
 
 
 
2dbdb3a
 
 
 
 
 
 
 
b21d556
2dbdb3a
b21d556
2dbdb3a
 
b21d556
 
2dbdb3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b21d556
2dbdb3a
 
 
 
 
 
 
b21d556
2dbdb3a
 
 
 
 
 
af64ea8
2dbdb3a
 
 
 
 
b21d556
 
2dbdb3a

import streamlit as st
import os
from PyPDF2 import PdfReader
import openpyxl
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

# SECURITY(review): an API key is hard-coded in source control here. Treat it
# as compromised: revoke/rotate it, supply the key through the deployment
# environment (or a secrets store), and then delete this literal.
# setdefault() keeps the previous fallback value but no longer overwrites a
# GOOGLE_API_KEY that the environment already provides.
os.environ.setdefault('GOOGLE_API_KEY', 'AIzaSyD8uzXToT4I2ABs7qo_XiuKh8-L2nuWCEM')

def get_pdf_text(pdf_docs):
    """Return the text of every page of every PDF, concatenated in order.

    Args:
        pdf_docs: Iterable of PDF sources, each passed to ``PdfReader``.
            ``None`` is treated the same as an empty iterable.

    Returns:
        One string with the page texts joined back to back (no separator is
        inserted between pages or documents); ``""`` when there is no input.
    """
    page_texts = []
    for pdf in pdf_docs or ():
        for page in PdfReader(pdf).pages:
            # Defensive: if extraction yields a falsy result (e.g. None) for a
            # page without extractable text, count it as empty rather than
            # failing on string concatenation.
            page_texts.append(page.extract_text() or "")
    # Join once instead of growing a string with ``+=`` inside the loops.
    return "".join(page_texts)

def get_excel_text(excel_docs):
    """Return the cell values of every sheet of every workbook as one string.

    Cells are visited workbook by workbook, sheet by sheet and row by row.
    Each value is converted with ``str`` and the pieces are separated by a
    single space. Empty cells (value ``None``) are skipped so the literal
    word "None" does not end up in the text used as question-answering
    context.

    Args:
        excel_docs: Iterable of workbook sources, each passed to
            ``openpyxl.load_workbook``. ``None`` is treated as empty.

    Returns:
        The space-separated cell text with leading/trailing whitespace
        stripped; ``""`` when there is nothing to read.
    """
    cell_texts = []
    for excel_doc in excel_docs or ():
        workbook = openpyxl.load_workbook(filename=excel_doc)
        for sheet in workbook:
            for row in sheet:
                cell_texts.extend(
                    str(cell.value) for cell in row if cell.value is not None
                )
    # Join once instead of growing a string with ``+=`` inside the loops.
    return " ".join(cell_texts).strip()

def get_user_input(user_question, qa_pipeline):
    """Answer ``user_question`` from the processed document text and show it.

    The context is read from ``st.session_state.raw_text``. When no text has
    been stored yet, or the stored text is blank, a warning is shown instead
    of calling the pipeline.

    Args:
        user_question: Question text entered by the user.
        qa_pipeline: Callable invoked as
            ``qa_pipeline(question=..., context=...)``; its result is indexed
            with ``"answer"``.

    Returns:
        None. All output is written to the Streamlit page.
    """
    with st.container():
        context = st.session_state.raw_text if "raw_text" in st.session_state else ""
        if not context.strip():
            # Without this guard a missing ``raw_text`` raises AttributeError,
            # and a blank context is expected to be rejected by the
            # question-answering pipeline.
            st.warning("No document text is available. Upload and process a document first.")
            return
        response = qa_pipeline(question=user_question, context=context)
        st.write("Answer:", response["answer"])

def _load_qa_pipeline():
    """Build the question-answering pipeline used to answer user questions.

    Returns:
        The object returned by ``pipeline("question-answering", ...)`` for the
        ``HanNayeoniee/LHK_DPO_v1`` checkpoint.
    """
    # NOTE(review): confirm this checkpoint actually ships a
    # question-answering head; nothing in this file establishes that.
    model_name = "HanNayeoniee/LHK_DPO_v1"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    return pipeline("question-answering", model=model, tokenizer=tokenizer)


def _ingest_documents(docs, extract_text, spinner_message, success_message):
    """Extract text from uploaded files and store it in the session state.

    Args:
        docs: Uploaded files as returned by ``st.file_uploader``.
        extract_text: Function turning ``docs`` into a single string.
        spinner_message: Text shown while extraction is running.
        success_message: Text shown when some text was extracted.
    """
    if not docs:
        # Previously an empty upload was "processed successfully" and left an
        # empty context behind.
        st.warning("Please upload at least one file before processing.")
        return
    with st.spinner(spinner_message):
        raw_text = extract_text(docs)
        st.session_state.raw_text = raw_text
        if raw_text.strip():
            st.success(success_message)
        else:
            st.warning("No text could be extracted from the uploaded files.")


def main():
    """Render the DocChat page: document upload sidebar and the Q&A section.

    Text extracted from the most recently processed upload (PDF or Excel) is
    kept in ``st.session_state.raw_text``. The question-answering pipeline is
    built on first use and kept in ``st.session_state.qa_pipeline`` so it is
    not rebuilt each time the script is re-executed for the same session.
    """
    st.set_page_config("DocChat")
    st.header("DocChat - Chat with multiple documents")
    st.write("---")

    with st.container():
        with st.sidebar:
            st.title("Settings")
            st.subheader("Upload Documents")
            st.markdown("**PDF files:**")
            pdf_docs = st.file_uploader("Upload PDF Files", accept_multiple_files=True)
            if st.button("Process PDF file"):
                _ingest_documents(
                    pdf_docs,
                    get_pdf_text,
                    "Processing PDFs...",
                    "PDF processed successfully!",
                )

            st.markdown("**Excel files:**")
            excel_docs = st.file_uploader("Upload Excel Files", accept_multiple_files=True)
            if st.button("Process Excel file"):
                _ingest_documents(
                    excel_docs,
                    get_excel_text,
                    "Processing Excel files...",
                    "Excel file processed successfully!",
                )

    with st.container():
        st.subheader("Document Q&A")
        st.write('Ask a question : ')
        user_question = st.text_input("Ask a Question from the document")
        if not user_question:
            return

        has_text = "raw_text" in st.session_state and st.session_state.raw_text.strip()
        if not has_text:
            # Previously a question asked before processing any document was
            # silently ignored.
            st.warning("Please upload and process a document before asking a question.")
            return

        if "qa_pipeline" not in st.session_state:
            # A local variable is lost when the script reruns, which forced a
            # full model load for every question; the session state persists.
            with st.spinner("Loading question-answering model..."):
                st.session_state.qa_pipeline = _load_qa_pipeline()
        get_user_input(user_question, st.session_state.qa_pipeline)

# Start the app only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()