|
import streamlit as st |
|
import os |
|
from PyPDF2 import PdfReader |
|
import openpyxl |
|
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline |
|
|
|
# SECURITY: a literal Google API key used to be hard-coded on this line and
# written into os.environ at import time. Credentials must never be committed
# to source control: supply GOOGLE_API_KEY through the deployment environment
# (or Streamlit secrets) and rotate the key that was exposed here.
# Nothing in the visible code reads GOOGLE_API_KEY, so the module no longer
# touches the variable; a value provided by the environment is left untouched.
|
|
|
def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every PDF.

    Args:
        pdf_docs: Iterable of PDF sources, each passed to ``PdfReader``.
            ``None`` or an empty iterable yields an empty string.

    Returns:
        The page texts joined in document order, then page order, with no
        separator inserted between pages or documents.
    """
    page_texts = []
    for pdf in pdf_docs or ():
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # A page may yield no text (e.g. image-only pages); treat a falsy
            # result as "" so a single such page cannot raise TypeError and
            # abort the whole extraction.
            page_texts.append(page.extract_text() or "")
    # One join instead of repeated ``+=`` avoids quadratic string copying.
    return "".join(page_texts)
|
|
|
def get_excel_text(excel_docs):
    """Flatten the non-empty cells of every worksheet into one string.

    Args:
        excel_docs: Iterable of workbook sources, each passed to
            ``openpyxl.load_workbook``. ``None`` or an empty iterable yields
            an empty string.

    Returns:
        The ``str()`` of each non-empty cell value, separated by single
        spaces, in workbook -> sheet -> row -> cell order, with leading and
        trailing whitespace stripped.
    """
    cell_texts = []
    for excel_doc in excel_docs or ():
        workbook = openpyxl.load_workbook(filename=excel_doc)
        for sheet in workbook:
            for row in sheet:
                for cell in row:
                    # Empty cells carry the value None; str(None) would inject
                    # the literal word "None" into the text used as QA context.
                    if cell.value is not None:
                        cell_texts.append(str(cell.value))
    return " ".join(cell_texts).strip()
|
|
|
def get_user_input(user_question, qa_pipeline):
    """Answer *user_question* from the processed document text and render it.

    The context is read from ``st.session_state`` under the ``raw_text`` key.
    When that text is missing or blank, a warning is shown and the pipeline
    is not called, because there is nothing to extract an answer from.

    Args:
        user_question: Question text entered by the user.
        qa_pipeline: Callable invoked as
            ``qa_pipeline(question=..., context=...)`` whose result is
            indexed with ``"answer"``.
    """
    context = st.session_state.get("raw_text") or ""
    with st.container():
        if not context.strip():
            # Happens when processing produced no text (no files uploaded,
            # or documents without extractable text).
            st.warning(
                "No document text is available. "
                "Please upload and process a document first."
            )
            return
        response = qa_pipeline(question=user_question, context=context)
        st.write("Answer:", response["answer"])
|
|
|
def _load_qa_pipeline():
    """Build the question-answering pipeline for the configured checkpoint."""
    # NOTE(review): confirm this checkpoint actually provides a
    # question-answering head; the name is kept exactly as configured.
    model_name = "HanNayeoniee/LHK_DPO_v1"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    return pipeline("question-answering", model=model, tokenizer=tokenizer)


def _store_document_text(docs, extract_text, spinner_message, success_message):
    """Extract text from *docs* into ``st.session_state.raw_text`` and report it."""
    if not docs:
        # Without this guard an empty string was stored and reported as a
        # successful processing run.
        st.warning("Please upload at least one file before processing.")
        return
    with st.spinner(spinner_message):
        st.session_state.raw_text = extract_text(docs)
        st.success(success_message)


def main():
    """Render the DocChat page: an upload sidebar and a question box.

    The sidebar offers separate PDF and Excel uploaders; pressing a "Process"
    button stores the extracted text in ``st.session_state.raw_text``
    (each processing run replaces the previously stored text). The main area
    takes a question and answers it from that text via ``get_user_input``.
    """
    st.set_page_config("DocChat")
    st.header("DocChat - Chat with multiple documents")
    st.write("---")

    with st.container():
        with st.sidebar:
            st.title("Settings")
            st.subheader("Upload Documents")

            st.markdown("**PDF files:**")
            pdf_docs = st.file_uploader("Upload PDF Files", accept_multiple_files=True)
            if st.button("Process PDF file"):
                _store_document_text(
                    pdf_docs,
                    get_pdf_text,
                    "Processing PDFs...",
                    "PDF processed successfully!",
                )

            st.markdown("**Excel files:**")
            excel_docs = st.file_uploader("Upload Excel Files", accept_multiple_files=True)
            if st.button("Process Excel file"):
                _store_document_text(
                    excel_docs,
                    get_excel_text,
                    "Processing Excel files...",
                    "Excel file processed successfully!",
                )

    with st.container():
        st.subheader("Document Q&A")
        st.write('Ask a question : ')
        user_question = st.text_input("Ask a Question from the document")
        if not user_question:
            return

        if "raw_text" not in st.session_state:
            # Previously a question asked before processing did nothing at all.
            st.info("Upload and process a document before asking a question.")
            return

        # Streamlit re-executes this script on every interaction, so a local
        # variable would force the tokenizer and model to be reloaded for each
        # question. Keep the pipeline in session state and build it only once.
        if "qa_pipeline" not in st.session_state:
            with st.spinner("Loading question-answering model..."):
                st.session_state.qa_pipeline = _load_qa_pipeline()

        get_user_input(user_question, st.session_state.qa_pipeline)
|
|
|
# Script entry point: build the page only when this file is executed directly
# (for example via ``streamlit run``), not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|