awacke1 committed on
Commit
0627aa1
·
1 Parent(s): b65e145

Create app.py

Files changed (1)
  1. app.py +412 -0
app.py ADDED
@@ -0,0 +1,412 @@
+ import streamlit as st
+ import openai
+ import os
+ import base64
+ import glob
+ import json
+ import mistune
+ import pytz
+ import math
+ import requests
+ import time
+ import re
+ import textract
+
+ from io import BytesIO
+ from datetime import datetime
+ from openai import ChatCompletion
+ from xml.etree import ElementTree as ET
+ from bs4 import BeautifulSoup
+ from collections import deque
+ from audio_recorder_streamlit import audio_recorder
+
+ from dotenv import load_dotenv
+ from PyPDF2 import PdfReader
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.embeddings import OpenAIEmbeddings
+ from langchain.vectorstores import FAISS
+ from langchain.chat_models import ChatOpenAI
+ from langchain.memory import ConversationBufferMemory
+ from langchain.chains import ConversationalRetrievalChain
+ from templates import css, bot_template, user_template
+
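+ # --- Helpers -------------------------------------------------------------
+ # generate_filename builds a filesystem-safe name from the prompt plus a
+ # US/Central timestamp, so saved prompt/response files sort chronologically.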
+ def generate_filename(prompt, file_type):
+     central = pytz.timezone('US/Central')
+     safe_date_time = datetime.now(central).strftime("%m%d_%I%M")  # MMDD_HHMM timestamp
+     safe_prompt = "".join(x for x in prompt if x.isalnum())[:90]  # keep alphanumerics only, cap at 90 chars
+     return f"{safe_date_time}_{safe_prompt}.{file_type}"  # return a safe file name
+
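+ # Speech-to-text: posts the recording to OpenAI's audio transcription
+ # endpoint, then feeds the transcript straight into the chat model and
+ # saves both to disk.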
+ def transcribe_audio(openai_key, file_path, model):
+     OPENAI_API_URL = "https://api.openai.com/v1/audio/transcriptions"
+     headers = {
+         "Authorization": f"Bearer {openai_key}",
+     }
+     with open(file_path, 'rb') as f:
+         data = {'file': f}
+         response = requests.post(OPENAI_API_URL, headers=headers, files=data, data={'model': model})
+     if response.status_code == 200:
+         st.write(response.json())
+         transcript = response.json().get('text')
+         chatResponse = chat_with_model(transcript, '')
+         filename = generate_filename(transcript, 'txt')
+         create_file(filename, transcript, chatResponse)
+         return transcript
+     else:
+         st.write(response.json())
+         st.error("Error in API call.")
+         return None
+
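+ # Browser-side recorder: writes the captured bytes to a .wav and plays it back.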
+ def save_and_play_audio(audio_recorder):
+     audio_bytes = audio_recorder()
+     if audio_bytes:
+         filename = generate_filename("Recording", "wav")
+         with open(filename, 'wb') as f:
+             f.write(audio_bytes)
+         st.audio(audio_bytes, format="audio/wav")
+         return filename
+     return None
+
+ def create_file(filename, prompt, response):
+     if filename.endswith(".txt"):
+         with open(filename, 'w') as file:
+             file.write(f"{prompt}\n{response}")
+     elif filename.endswith(".htm"):
+         with open(filename, 'w') as file:
+             file.write(f"{prompt} {response}")
+     elif filename.endswith(".md"):
+         with open(filename, 'w') as file:
+             file.write(f"{prompt}\n\n{response}")
+
+ def truncate_document(document, length):
+     return document[:length]
+
+ def divide_document(document, max_length):
+     return [document[i:i+max_length] for i in range(0, len(document), max_length)]
+
+ def get_table_download_link(file_path):
+     try:
+         with open(file_path, 'r') as file:
+             data = file.read()
+     except Exception:
+         return file_path
+     b64 = base64.b64encode(data.encode()).decode()
+     file_name = os.path.basename(file_path)
+     ext = os.path.splitext(file_name)[1]  # get the file extension
+     mime_types = {
+         '.txt': 'text/plain',
+         '.py': 'text/plain',
+         '.xlsx': 'text/plain',
+         '.csv': 'text/plain',
+         '.htm': 'text/html',
+         '.md': 'text/markdown',
+     }
+     mime_type = mime_types.get(ext, 'application/octet-stream')  # default to general binary
+     href = f'<a href="data:{mime_type};base64,{b64}" target="_blank" download="{file_name}">{file_name}</a>'
+     return href
+
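+ # XML compression: drops comment nodes before the document is used as
+ # context. ElementTree elements carry no parent pointer, so a
+ # child-to-parent map is built first to make removal possible.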
+ def CompressXML(xml_text):
+     root = ET.fromstring(xml_text)
+     parent_map = {child: parent for parent in root.iter() for child in parent}
+     for elem in list(root.iter()):
+         if isinstance(elem.tag, str) and 'Comment' in elem.tag:
+             parent_map[elem].remove(elem)
+     return ET.tostring(root, encoding='unicode', method="xml")
+
+ def read_file_content(file, max_length):
+     if file.type == "application/json":
+         content = json.load(file)
+         return str(content)
+     elif file.type == "text/html" or file.type == "text/htm":
+         content = BeautifulSoup(file, "html.parser")
+         return content.text
+     elif file.type == "application/xml" or file.type == "text/xml":
+         tree = ET.parse(file)
+         root = tree.getroot()
+         xml = CompressXML(ET.tostring(root, encoding='unicode'))
+         return xml
+     elif file.type == "text/markdown" or file.type == "text/md":
+         md = mistune.create_markdown()
+         content = md(file.read().decode())
+         return content
+     elif file.type == "text/plain":
+         return file.getvalue().decode()
+     else:
+         return ""
+
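+ # Streaming chat: uses the pre-1.0 openai SDK's ChatCompletion interface and
+ # renders partial tokens into a placeholder as they arrive.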
+ def chat_with_model(prompt, document_section, model_choice='gpt-3.5-turbo'):
+     conversation = [{'role': 'system', 'content': 'You are a helpful assistant.'}]
+     conversation.append({'role': 'user', 'content': prompt})
+     if len(document_section) > 0:
+         conversation.append({'role': 'assistant', 'content': document_section})
+
+     start_time = time.time()
+     report = []
+     res_box = st.empty()
+     collected_chunks = []
+     collected_messages = []
+
+     for chunk in openai.ChatCompletion.create(
+             model=model_choice,
+             messages=conversation,
+             temperature=0.5,
+             stream=True
+     ):
+         collected_chunks.append(chunk)  # save the event response
+         chunk_message = chunk['choices'][0]['delta']  # extract the message
+         collected_messages.append(chunk_message)  # save the message
+
+         content = chunk['choices'][0].get('delta', {}).get('content')
+         if content:  # the final chunk carries no content
+             report.append(content)
+             result = "".join(report).strip()
+             res_box.markdown(f'*{result}*')
+
+     full_reply_content = ''.join([m.get('content', '') for m in collected_messages])
+     st.write("Elapsed time:")
+     st.write(time.time() - start_time)
+     return full_reply_content
+
+ def chat_with_file_contents(prompt, file_content, model_choice='gpt-3.5-turbo'):
+     conversation = [{'role': 'system', 'content': 'You are a helpful assistant.'}]
+     conversation.append({'role': 'user', 'content': prompt})
+     if len(file_content) > 0:
+         conversation.append({'role': 'assistant', 'content': file_content})
+     response = openai.ChatCompletion.create(model=model_choice, messages=conversation)
+     return response['choices'][0]['message']['content']
+
+ def extract_mime_type(file):
+     # Strings: pull the MIME type out of the object's repr
+     if isinstance(file, str):
+         match = re.search(r"type='(.*?)'", file)
+         if match:
+             return match.group(1)
+         raise ValueError(f"Unable to extract MIME type from {file}")
+     # Otherwise assume a Streamlit UploadedFile, which exposes .type directly
+     elif hasattr(file, 'type'):
+         return file.type
+     else:
+         raise TypeError("Input should be a string or a Streamlit UploadedFile object")
+
+ def extract_file_extension(file):
+     # get the file name directly from the UploadedFile object
+     file_name = file.name
+     match = re.search(r".*?\.(.*?)$", file_name)
+     if match:
+         return match.group(1)
+     raise ValueError(f"Unable to extract file extension from {file_name}")
+
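+ # Text extraction: decodes text-like uploads directly; PDFs go through
+ # PyPDF2's 3.x-style PdfReader API.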
+ def pdf2txt(docs):
+     text = ""
+     for file in docs:
+         file_extension = extract_file_extension(file)
+         st.write(f"File type extension: {file_extension}")
+
+         # read the file according to its extension
+         try:
+             if file_extension.lower() in ['py', 'txt', 'html', 'htm', 'xml', 'json']:
+                 text += file.getvalue().decode('utf-8')
+             elif file_extension.lower() == 'pdf':
+                 pdf = PdfReader(BytesIO(file.getvalue()))
+                 for page in pdf.pages:
+                     text += page.extract_text() or ''
+         except Exception as e:
+             st.write(f"Error processing file {file.name}: {e}")
+
+     return text
+
+ def pdf2txt_old(pdf_docs):
+     st.write(pdf_docs)
+     for file in pdf_docs:
+         mime_type = extract_mime_type(file)
+         st.write(f"MIME type of file: {mime_type}")
+
+     text = ""
+     for pdf in pdf_docs:
+         pdf_reader = PdfReader(pdf)
+         for page in pdf_reader.pages:
+             text += page.extract_text()
+     return text
+
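+ # Retrieval pipeline: split text into overlapping chunks, embed them into a
+ # FAISS index, and wrap the index in a conversational retrieval chain with
+ # buffer memory so follow-up questions keep their context.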
+ def txt2chunks(text):
+     text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len)
+     return text_splitter.split_text(text)
+
+ def vector_store(text_chunks):
+     key = os.getenv('OPENAI_API_KEY')
+     embeddings = OpenAIEmbeddings(openai_api_key=key)
+     return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
+
+ def get_chain(vectorstore):
+     llm = ChatOpenAI()
+     memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
+     return ConversationalRetrievalChain.from_llm(llm=llm, retriever=vectorstore.as_retriever(), memory=memory)
+
+ def process_user_input(user_question):
+     response = st.session_state.conversation({'question': user_question})
+     st.session_state.chat_history = response['chat_history']
+     for i, message in enumerate(st.session_state.chat_history):
+         template = user_template if i % 2 == 0 else bot_template
+         st.write(template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
+         # Save file output from PDF query results
+         filename = generate_filename(user_question, 'txt')
+         create_file(filename, user_question, message.content)
+
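+ # Main app: sidebar controls for output type and model, audio capture with
+ # Whisper transcription, section-by-section chat over an uploaded document,
+ # and a sidebar file browser with view/open/search/delete actions.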
+ def main():
+     # Sidebar and global
+     openai.api_key = os.getenv('OPENAI_API_KEY')
+     st.set_page_config(page_title="GPT Streamlit Document Reasoner", layout="wide")
+
+     # File type for output, model choice
+     menu = ["htm", "txt", "xlsx", "csv", "md", "py"]
+     choice = st.sidebar.selectbox("Output File Type:", menu)
+     model_choice = st.sidebar.radio("Select Model:", ('gpt-3.5-turbo', 'gpt-3.5-turbo-0301'))
+
+     # Audio, transcribe, GPT:
+     filename = save_and_play_audio(audio_recorder)
+     if filename is not None:
+         transcription = transcribe_audio(openai.api_key, filename, "whisper-1")
+         st.sidebar.markdown(get_table_download_link(filename), unsafe_allow_html=True)
+         filename = None  # transcription is finished; reuse the saved transcript next time
+
+     # prompt interfaces
+     user_prompt = st.text_area("Enter prompts, instructions & questions:", '', height=100)
+
+     # file section interface for prompts against large documents as context
+     collength, colupload = st.columns([2, 3])  # adjust the ratio as needed
+     with collength:
+         max_length = st.slider("File section length for large files", min_value=1000, max_value=128000, value=12000, step=1000)
+     with colupload:
+         uploaded_file = st.file_uploader("Add a file for context:", type=["xml", "json", "xlsx", "csv", "html", "htm", "md", "txt"])
+
+     # Document section chat
+     document_sections = deque()
+     document_responses = {}
+     if uploaded_file is not None:
+         file_content = read_file_content(uploaded_file, max_length)
+         document_sections.extend(divide_document(file_content, max_length))
+     if len(document_sections) > 0:
+         if st.button("👁️ View Upload"):
+             st.markdown("**Sections of the uploaded file:**")
+             for i, section in enumerate(list(document_sections)):
+                 st.markdown(f"**Section {i+1}**\n{section}")
+         st.markdown("**Chat with the model:**")
+         for i, section in enumerate(list(document_sections)):
+             if i in document_responses:
+                 st.markdown(f"**Section {i+1}**\n{document_responses[i]}")
+             else:
+                 if st.button(f"Chat about Section {i+1}"):
+                     st.write('Reasoning with your inputs...')
+                     response = chat_with_model(user_prompt, section, model_choice)
+                     st.write('Response:')
+                     st.write(response)
+                     document_responses[i] = response
+                     filename = generate_filename(f"{user_prompt}_section_{i+1}", choice)
+                     create_file(filename, user_prompt, response)
+                     st.sidebar.markdown(get_table_download_link(filename), unsafe_allow_html=True)
+
+     if st.button('💬 Chat'):
+         st.write('Reasoning with your inputs...')
+         response = chat_with_model(user_prompt, ''.join(document_sections), model_choice)
+         st.write('Response:')
+         st.write(response)
+
+         filename = generate_filename(user_prompt, choice)
+         create_file(filename, user_prompt, response)
+         st.sidebar.markdown(get_table_download_link(filename), unsafe_allow_html=True)
+
+     all_files = glob.glob("*.*")
+     all_files = [file for file in all_files if len(os.path.splitext(file)[0]) >= 20]  # exclude files with short names
+     all_files.sort(key=lambda x: (os.path.splitext(x)[1], x), reverse=True)  # sort by file type, then name, descending
+
+     # sidebar of files
+     file_contents = ''
+     next_action = ''
+     for file in all_files:
+         col1, col2, col3, col4, col5 = st.sidebar.columns([1, 6, 1, 1, 1])  # adjust the ratio as needed
+         with col1:
+             if st.button("🌐", key="md_" + file):  # md emoji button
+                 with open(file, 'r') as f:
+                     file_contents = f.read()
+                 next_action = 'md'
+         with col2:
+             st.markdown(get_table_download_link(file), unsafe_allow_html=True)
+         with col3:
+             if st.button("📂", key="open_" + file):  # open emoji button
+                 with open(file, 'r') as f:
+                     file_contents = f.read()
+                 next_action = 'open'
+         with col4:
+             if st.button("🔍", key="read_" + file):  # search emoji button
+                 with open(file, 'r') as f:
+                     file_contents = f.read()
+                 next_action = 'search'
+         with col5:
+             if st.button("🗑", key="delete_" + file):
+                 os.remove(file)
+                 st.experimental_rerun()
+
+     if len(file_contents) > 0:
+         if next_action == 'open':
+             file_content_area = st.text_area("File Contents:", file_contents, height=500)
+         if next_action == 'md':
+             st.markdown(file_contents)
+         if next_action == 'search':
+             file_content_area = st.text_area("File Contents:", file_contents, height=500)
+             st.write('Reasoning with your inputs...')
+             response = chat_with_model(user_prompt, file_contents, model_choice)
+             filename = generate_filename(file_contents, choice)
+             create_file(filename, file_contents, response)
+
+             st.experimental_rerun()
+
+ if __name__ == "__main__":
+     main()
+
+ load_dotenv()
+ st.write(css, unsafe_allow_html=True)
+
+ st.header("Chat with documents :books:")
+ user_question = st.text_input("Ask a question about your documents:")
+ if user_question:
+     process_user_input(user_question)
+
+ with st.sidebar:
+     st.subheader("Your documents")
+     docs = st.file_uploader("import documents", accept_multiple_files=True)
+     with st.spinner("Processing"):
+         raw = pdf2txt(docs)
+         if len(raw) > 0:
+             length = str(len(raw))
+             text_chunks = txt2chunks(raw)
+             vectorstore = vector_store(text_chunks)
+             st.session_state.conversation = get_chain(vectorstore)
+             st.markdown('# AI Search Index of Length: ' + length + ' Created.')  # add timing
+             filename = generate_filename(raw, 'txt')
+             create_file(filename, raw, '')