notabaka committed on
Commit
2dbdb3a
·
1 Parent(s): 3464d30
Files changed (2) hide show
  1. app.py +49 -83
  2. requirements.txt +3 -1
app.py CHANGED
@@ -2,12 +2,7 @@ import streamlit as st
2
  import os
3
  from PyPDF2 import PdfReader
4
  import openpyxl
5
- from langchain.text_splitter import RecursiveCharacterTextSplitter
6
- from langchain.embeddings import GooglePalmEmbeddings
7
- from langchain.llms import HuggingFaceTransformers # Updated import
8
- from langchain.vectorstores import FAISS
9
- from langchain.chains import ConversationalRetrievalChain
10
- from langchain.memory import ConversationBufferMemory
11
 
12
  os.environ['GOOGLE_API_KEY'] = 'AIzaSyD8uzXToT4I2ABs7qo_XiuKh8-L2nuWCEM'
13
 
@@ -20,88 +15,59 @@ def get_pdf_text(pdf_docs):
20
  return text
21
 
22
  def get_excel_text(excel_docs):
23
- text = ""
24
- for excel_doc in excel_docs:
25
- workbook = openpyxl.load_workbook(filename=excel_doc)
26
- for sheet in workbook:
27
- for row in sheet:
28
- for cell in row:
29
- text += str(cell.value) + " "
30
- return text.strip()
31
-
32
-
33
- def get_text_chunks(text):
34
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
35
- chunks = text_splitter.split_text(text)
36
- return chunks
37
-
38
- def get_vector_store(text_chunks):
39
- embeddings = GooglePalmEmbeddings()
40
- vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
41
- return vector_store
42
-
43
- def get_conversational_chain(vector_store):
44
- llm = HuggingFaceTransformers(model_name="HanNayeoniee/LHK_DPO_v1")
45
- memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
46
- conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=vector_store.as_retriever(), memory=memory)
47
- return conversation_chain
48
 
49
- def get_user_input(user_question):
50
  with st.container():
51
- response = st.session_state.conversation({'question': user_question})
52
- st.session_state.chatHistory = response['chat_history']
53
- file_contents = ""
54
- left , right = st.columns((2,1))
55
- with left:
56
- for i, message in enumerate(st.session_state.chatHistory):
57
- if i % 2 == 0:
58
- st.write("User: ", message.content)
59
- else:
60
- st.write("Bot: ", message.content)
61
- st.success("Done !")
62
- with right:
63
- for message in st.session_state.chatHistory:
64
- file_contents += f"{message.content}\n"
65
- file_name = "Chat_History.txt"
66
 
67
  def main():
68
- st.set_page_config("DocChat")
69
- st.header("DocChat - Chat with multiple documents")
70
- st.write("---")
71
- with st.container():
72
- with st.sidebar:
73
- st.title("Settings")
74
- st.subheader("Upload Documents")
75
- st.markdown("**PDF files:**")
76
- pdf_docs = st.file_uploader("Upload PDF Files", accept_multiple_files=True)
77
- if st.button("Process PDF file"):
78
- with st.spinner("Processing PDFs..."):
79
- raw_text = get_pdf_text(pdf_docs)
80
- text_chunks = get_text_chunks(raw_text)
81
- vector_store = get_vector_store(text_chunks)
82
- st.session_state.conversation = get_conversational_chain(vector_store)
83
- st.success("PDF processed successfully!")
 
84
 
85
- st.markdown("**Excel files:**")
86
- excel_docs = st.file_uploader("Upload Excel Files", accept_multiple_files=True)
87
- if st.button("Process Excel file"):
88
- with st.spinner("Processing Excel files..."):
89
- raw_text = get_excel_text(excel_docs)
90
- text_chunks = get_text_chunks(raw_text)
91
- vector_store = get_vector_store(text_chunks)
92
- st.session_state.conversation = get_conversational_chain(vector_store)
93
- st.success("Excel file processed successfully!")
94
 
95
- with st.container():
96
- st.subheader("Document Q&A")
97
- st.write('Ask a question : ')
98
- user_question = st.text_input("Ask a Question from the document")
99
- if "conversation" not in st.session_state:
100
- st.session_state.conversation = None
101
- if "chatHistory" not in st.session_state:
102
- st.session_state.chatHistory = None
103
- if user_question:
104
- get_user_input(user_question)
 
 
105
 
106
  if __name__ == "__main__":
107
- main()
 
2
  import os
3
  from PyPDF2 import PdfReader
4
  import openpyxl
5
+ from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
 
 
 
 
 
6
 
7
  # SECURITY(review): this API key is hard-coded in a committed file. It should be
  # rotated and supplied through the deployment environment / a secrets store.
  # NOTE(review): no code visible in this diff reads GOOGLE_API_KEY once the
  # langchain imports are removed -- confirm, then delete this line.
  os.environ['GOOGLE_API_KEY'] = 'AIzaSyD8uzXToT4I2ABs7qo_XiuKh8-L2nuWCEM'
8
 
 
15
  return text
16
 
17
  def get_excel_text(excel_docs):
18
+ text = ""
19
+ for excel_doc in excel_docs:
20
+ workbook = openpyxl.load_workbook(filename=excel_doc)
21
+ for sheet in workbook:
22
+ for row in sheet:
23
+ for cell in row:
24
+ text += str(cell.value) + " "
25
+ return text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
def get_user_input(user_question, qa_pipeline):
    """Answer ``user_question`` from the processed document text and display it.

    Args:
        user_question: Question typed by the user.
        qa_pipeline: Callable invoked as
            ``qa_pipeline(question=..., context=...)``; its result is indexed
            with ``"answer"``.

    The context is read from ``st.session_state.raw_text``. When that text is
    missing or blank a warning is shown and the pipeline is not called.
    """
    with st.container():
        context = st.session_state.raw_text if "raw_text" in st.session_state else ""
        if not context or not context.strip():
            # Processing with no files (or files with no extractable text) stores
            # an empty string; extractive QA has nothing to answer from in that
            # case, so tell the user instead of calling the model.
            st.warning("No text could be extracted from the processed documents.")
            return
        response = qa_pipeline(question=user_question, context=context)
        st.write("Answer:", response["answer"])
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  def main():
33
+ st.set_page_config("DocChat")
34
+ st.header("DocChat - Chat with multiple documents")
35
+ st.write("---")
36
+
37
+ qa_pipeline = None
38
+
39
+ with st.container():
40
+ with st.sidebar:
41
+ st.title("Settings")
42
+ st.subheader("Upload Documents")
43
+ st.markdown("**PDF files:**")
44
+ pdf_docs = st.file_uploader("Upload PDF Files", accept_multiple_files=True)
45
+ if st.button("Process PDF file"):
46
+ with st.spinner("Processing PDFs..."):
47
+ raw_text = get_pdf_text(pdf_docs)
48
+ st.session_state.raw_text = raw_text
49
+ st.success("PDF processed successfully!")
50
 
51
+ st.markdown("**Excel files:**")
52
+ excel_docs = st.file_uploader("Upload Excel Files", accept_multiple_files=True)
53
+ if st.button("Process Excel file"):
54
+ with st.spinner("Processing Excel files..."):
55
+ raw_text = get_excel_text(excel_docs)
56
+ st.session_state.raw_text = raw_text
57
+ st.success("Excel file processed successfully!")
 
 
58
 
59
+ with st.container():
60
+ st.subheader("Document Q&A")
61
+ st.write('Ask a question : ')
62
+ user_question = st.text_input("Ask a Question from the document")
63
+ if user_question:
64
+ if not qa_pipeline and "raw_text" in st.session_state:
65
+ model_name = "notabaka/DocQA"
66
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
67
+ model = AutoModelForQuestionAnswering.from_pretrained(model_name)
68
+ qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
69
+ if qa_pipeline:
70
+ get_user_input(user_question, qa_pipeline)
71
 
72
  if __name__ == "__main__":
73
+ main()
requirements.txt CHANGED
@@ -3,4 +3,6 @@ langchain
3
  PyPDF2
4
  faiss-cpu
5
  streamlit
6
- openpyxl
 
 
 
3
  PyPDF2
4
  faiss-cpu
5
  streamlit
6
+ openpyxl
7
+ transformers
8
+ torch