awacke1 committed on
Commit
9939323
·
1 Parent(s): 95513f4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -21
app.py CHANGED
@@ -89,11 +89,7 @@ def get_table_download_link(file_path):
89
  data = file.read()
90
  except:
91
  st.write('')
92
- return file_path
93
- #import codecs
94
- #with codecs.open(file_path, "r", "utf-8") as file:
95
- # data = file.read()
96
-
97
  b64 = base64.b64encode(data.encode()).decode()
98
  file_name = os.path.basename(file_path)
99
  ext = os.path.splitext(file_name)[1] # get the file extension
@@ -148,14 +144,10 @@ def chat_with_model(prompt, document_section, model_choice='gpt-3.5-turbo'):
148
  conversation.append({'role': 'user', 'content': prompt})
149
  if len(document_section)>0:
150
  conversation.append({'role': 'assistant', 'content': document_section})
151
-
152
- # iterate through the stream of events
153
  start_time = time.time()
154
-
155
-
156
  report = []
157
  res_box = st.empty()
158
-
159
  collected_chunks = []
160
  collected_messages = []
161
 
@@ -182,7 +174,6 @@ def chat_with_model(prompt, document_section, model_choice='gpt-3.5-turbo'):
182
  st.write('.')
183
 
184
  full_reply_content = ''.join([m.get('content', '') for m in collected_messages])
185
- #st.write(f"Full conversation received: {full_reply_content}")
186
  st.write("Elapsed time:")
187
  st.write(time.time() - start_time)
188
  return full_reply_content
@@ -195,7 +186,7 @@ def chat_with_file_contents(prompt, file_content, model_choice='gpt-3.5-turbo'):
195
  response = openai.ChatCompletion.create(model=model_choice, messages=conversation)
196
  return response['choices'][0]['message']['content']
197
 
198
- def extract_text_from_pdfs(pdf_docs):
199
  text = ""
200
  for pdf in pdf_docs:
201
  pdf_reader = PdfReader(pdf)
@@ -203,16 +194,16 @@ def extract_text_from_pdfs(pdf_docs):
203
  text += page.extract_text()
204
  return text
205
 
206
- def split_text_into_chunks(text):
207
  text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len)
208
  return text_splitter.split_text(text)
209
 
210
- def create_vector_store_from_text_chunks(text_chunks):
211
  key = os.getenv('OPENAI_API_KEY')
212
  embeddings = OpenAIEmbeddings(openai_api_key=key)
213
  return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
214
 
215
- def create_conversation_chain(vectorstore):
216
  llm = ChatOpenAI()
217
  memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
218
  return ConversationalRetrievalChain.from_llm(llm=llm, retriever=vectorstore.as_retriever(), memory=memory)
@@ -338,8 +329,6 @@ def main():
338
  if __name__ == "__main__":
339
  main()
340
 
341
-
342
-
343
  load_dotenv()
344
  st.write(css, unsafe_allow_html=True)
345
 
@@ -352,12 +341,12 @@ with st.sidebar:
352
  st.subheader("Your documents")
353
  docs = st.file_uploader("Upload your documents", accept_multiple_files=True)
354
  with st.spinner("Processing"):
355
- raw = extract_text_from_pdfs(docs)
356
  if len(raw) > 0:
357
  length = str(len(raw))
358
- text_chunks = split_text_into_chunks(raw)
359
- vectorstore = create_vector_store_from_text_chunks(text_chunks)
360
- st.session_state.conversation = create_conversation_chain(vectorstore)
361
  st.markdown('# Extracted Text of Length:' + length + ' and Created Search Index')
362
  filename = generate_filename(raw, 'txt')
363
  create_file(filename, raw, '')
 
89
  data = file.read()
90
  except:
91
  st.write('')
92
+ return file_path
 
 
 
 
93
  b64 = base64.b64encode(data.encode()).decode()
94
  file_name = os.path.basename(file_path)
95
  ext = os.path.splitext(file_name)[1] # get the file extension
 
144
  conversation.append({'role': 'user', 'content': prompt})
145
  if len(document_section)>0:
146
  conversation.append({'role': 'assistant', 'content': document_section})
147
+
 
148
  start_time = time.time()
 
 
149
  report = []
150
  res_box = st.empty()
 
151
  collected_chunks = []
152
  collected_messages = []
153
 
 
174
  st.write('.')
175
 
176
  full_reply_content = ''.join([m.get('content', '') for m in collected_messages])
 
177
  st.write("Elapsed time:")
178
  st.write(time.time() - start_time)
179
  return full_reply_content
 
186
  response = openai.ChatCompletion.create(model=model_choice, messages=conversation)
187
  return response['choices'][0]['message']['content']
188
 
189
+ def pdf2txt(pdf_docs):
190
  text = ""
191
  for pdf in pdf_docs:
192
  pdf_reader = PdfReader(pdf)
 
194
  text += page.extract_text()
195
  return text
196
 
197
+ def txt2chunks(text):
198
  text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len)
199
  return text_splitter.split_text(text)
200
 
201
+ def vector_store(text_chunks):
202
  key = os.getenv('OPENAI_API_KEY')
203
  embeddings = OpenAIEmbeddings(openai_api_key=key)
204
  return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
205
 
206
+ def get_chain(vectorstore):
207
  llm = ChatOpenAI()
208
  memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
209
  return ConversationalRetrievalChain.from_llm(llm=llm, retriever=vectorstore.as_retriever(), memory=memory)
 
329
  if __name__ == "__main__":
330
  main()
331
 
 
 
332
  load_dotenv()
333
  st.write(css, unsafe_allow_html=True)
334
 
 
341
  st.subheader("Your documents")
342
  docs = st.file_uploader("Upload your documents", accept_multiple_files=True)
343
  with st.spinner("Processing"):
344
+ raw = pdf2txt(docs)
345
  if len(raw) > 0:
346
  length = str(len(raw))
347
+ text_chunks = txt2chunks(raw)
348
+ vectorstore = vector_store(text_chunks)
349
+ st.session_state.conversation = get_chain(vectorstore)
350
  st.markdown('# Extracted Text of Length:' + length + ' and Created Search Index')
351
  filename = generate_filename(raw, 'txt')
352
  create_file(filename, raw, '')