File size: 2,767 Bytes
b21d556
 
 
 
2dbdb3a
b21d556
 
 
 
 
 
 
 
 
 
 
 
2dbdb3a
 
 
 
 
 
 
 
b21d556
2dbdb3a
b21d556
2dbdb3a
 
b21d556
 
2dbdb3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b21d556
2dbdb3a
 
 
 
 
 
 
b21d556
2dbdb3a
 
 
 
 
 
af64ea8
2dbdb3a
 
 
 
 
b21d556
 
2dbdb3a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import streamlit as st
import os
from PyPDF2 import PdfReader
import openpyxl
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

# SECURITY: an API key is committed in source control here. Anyone who can read
# this file can use it, so the key should be revoked and supplied through the
# deployment environment (or a secrets store) instead.
# `setdefault` keeps the historical fallback value but no longer overwrites a
# key that the environment already provides.
os.environ.setdefault('GOOGLE_API_KEY', 'AIzaSyD8uzXToT4I2ABs7qo_XiuKh8-L2nuWCEM')

def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every given PDF.

    Args:
        pdf_docs: Iterable of PDF sources, each passed to ``PdfReader``.

    Returns:
        The page texts joined in document order, then page order, with no
        separator between them; an empty string when ``pdf_docs`` is empty.
    """
    return "".join(
        page.extract_text()
        for document in pdf_docs
        for page in PdfReader(document).pages
    )

def get_excel_text(excel_docs):
    """Flatten the cell values of the given Excel workbooks into one string.

    Every sheet of every workbook is walked row by row. Each populated cell is
    converted with ``str`` and the pieces are separated by single spaces.

    Args:
        excel_docs: Iterable of workbook sources, each passed to
            ``openpyxl.load_workbook`` as ``filename``.

    Returns:
        The space-separated cell values with surrounding whitespace stripped;
        an empty string when there are no workbooks or no populated cells.
    """
    values = []
    for excel_doc in excel_docs:
        workbook = openpyxl.load_workbook(filename=excel_doc)
        for sheet in workbook:
            for row in sheet:
                # Empty cells carry the value None; stringifying them would
                # litter the text with literal "None" tokens.
                values.extend(
                    str(cell.value) for cell in row if cell.value is not None
                )
    return " ".join(values).strip()

def get_user_input(user_question, qa_pipeline):
    """Answer a question against the processed document text and display it.

    The context is read from ``st.session_state.raw_text``. When no usable
    text is stored there, a warning is shown instead of calling the pipeline.

    Args:
        user_question: The question entered by the user.
        qa_pipeline: Callable invoked as
            ``qa_pipeline(question=..., context=...)`` whose result is indexed
            with ``"answer"``.
    """
    with st.container():
        context = st.session_state.raw_text if "raw_text" in st.session_state else ""
        if not context or not context.strip():
            # The question-answering pipeline rejects an empty context with an
            # exception, which would surface as a crash in the UI.
            st.warning("No text could be extracted from the processed documents.")
            return
        response = qa_pipeline(question=user_question, context=context)
        st.write("Answer:", response["answer"])

def _load_qa_pipeline():
    """Build the question-answering pipeline from the configured checkpoint."""
    # NOTE(review): confirm this checkpoint was fine-tuned for extractive
    # question answering; if it was not, the QA head is likely to be
    # initialised without trained weights — TODO verify.
    model_name = "HanNayeoniee/LHK_DPO_v1"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    return pipeline("question-answering", model=model, tokenizer=tokenizer)


def main():
    """Render the DocChat page: document upload sidebar plus a Q&A box.

    Processed document text is stored in ``st.session_state.raw_text``; each
    "Process" button replaces whatever text was stored before. The QA pipeline
    is created on the first question and kept in
    ``st.session_state.qa_pipeline`` so that later reruns of the script reuse
    it instead of loading the model again.
    """
    st.set_page_config("DocChat")
    st.header("DocChat - Chat with multiple documents")
    st.write("---")

    with st.container():
        with st.sidebar:
            st.title("Settings")
            st.subheader("Upload Documents")
            st.markdown("**PDF files:**")
            pdf_docs = st.file_uploader("Upload PDF Files", accept_multiple_files=True)
            if st.button("Process PDF file"):
                if not pdf_docs:
                    # Nothing uploaded: avoid storing empty text and
                    # reporting a misleading success.
                    st.warning("Please upload at least one PDF file first.")
                else:
                    with st.spinner("Processing PDFs..."):
                        st.session_state.raw_text = get_pdf_text(pdf_docs)
                        st.success("PDF processed successfully!")

            st.markdown("**Excel files:**")
            excel_docs = st.file_uploader("Upload Excel Files", accept_multiple_files=True)
            if st.button("Process Excel file"):
                if not excel_docs:
                    st.warning("Please upload at least one Excel file first.")
                else:
                    with st.spinner("Processing Excel files..."):
                        st.session_state.raw_text = get_excel_text(excel_docs)
                        st.success("Excel file processed successfully!")

    with st.container():
        st.subheader("Document Q&A")
        st.write('Ask a question : ')
        user_question = st.text_input("Ask a Question from the document")
        if user_question:
            if "raw_text" not in st.session_state:
                # Previously this case did nothing, leaving the user without
                # any feedback.
                st.warning("Please upload and process a document before asking a question.")
            else:
                if "qa_pipeline" not in st.session_state:
                    # A plain local variable would be reset on every rerun of
                    # the script, forcing a full model reload per question.
                    with st.spinner("Loading question-answering model..."):
                        st.session_state.qa_pipeline = _load_qa_pipeline()
                get_user_input(user_question, st.session_state.qa_pipeline)

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()