YchKhan committed
Commit 0bde7bf · 1 Parent(s): 3a6bb12

Update app.py

Files changed (1)
  1. app.py +31 -14
app.py CHANGED
@@ -22,6 +22,7 @@ from duckduckgo_search import DDGS
 import requests
 import tempfile
 
+
 tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
 
 # create the length function
@@ -149,14 +150,15 @@ def add_files_to_zip(session_id):
             arcname = os.path.relpath(file_path, session_id)
             zipObj.write(file_path, arcname)
 
-
 ## Search files functions ##
 
 def search_docs(topic, max_references):
+    print(f"SEARCH PDF : {topic}")
     doc_list = []
     with DDGS() as ddgs:
         i=0
         for r in ddgs.text('{} filetype:pdf'.format(topic), region='wt-wt', safesearch='On', timelimit='n'):
+            #doc_list.append(str(r))
             if i>=max_references:
                 break
             doc_list.append("TITLE : " + r['title'] + " -- BODY : " + r['body'] + " -- URL : " + r['href'])
@@ -164,7 +166,7 @@ def search_docs(topic, max_references):
     return doc_list
 
 
-def store_files(references):
+def store_files(references, ret_names=False):
     url_list=[]
     temp_files = []
     for ref in references:
@@ -182,11 +184,13 @@ def store_files(references):
         temp_file = tempfile.NamedTemporaryFile(delete=False,prefix=filename, suffix='.pdf')
         temp_file.write(response.content)
         temp_file.close()
-        temp_files.append(temp_file)
+        if ret_names:
+            temp_files.append(temp_file.name)
+        else:
+            temp_files.append(temp_file)
 
     return temp_files
-
-
+
 ## Summary functions ##
 
 ## Load each doc from the vector store
@@ -289,7 +293,7 @@ def embed_files(files,ui_session_id,progress=gr.Progress(),progress_step=0.05):
 
     print("EMBEDDED, before embeddeding: ",session_id,len(db.index_to_docstore_id))
     for file_id,file in enumerate(files):
-        print("ID : ", file_id,"FILE : ", file)
+        print("ID : ", file_id, "FILE : ", file)
         file_type = file.name.split('.')[-1].lower()
         source = file.name.split('/')[-1]
         print(f"current file: {source}")
@@ -330,19 +334,25 @@ def embed_files(files,ui_session_id,progress=gr.Progress(),progress_step=0.05):
     progress(progress_step, desc = 'db zipped')
     return f"{session_id}.zip",ui_session_id
 
-def display_docs(docs):
-    output_str = ''
-    for i, doc in enumerate(docs):
-        source = doc.metadata['source'].split('/')[-1]
-        output_str += f"Ref: {i+1}\n{repr(doc.page_content)}\nSource: {source}\n\n"
-    return output_str
 
 
 def add_to_db(references,ui_session_id):
     files = store_files(references)
     return embed_files(files,ui_session_id)
+
+def export_files(references):
+    files = store_files(references, ret_names=True)
+    #paths = [file.name for file in files]
+    return files
 
 
+def display_docs(docs):
+    output_str = ''
+    for i, doc in enumerate(docs):
+        source = doc.metadata['source'].split('/')[-1]
+        output_str += f"Ref: {i+1}\n{repr(doc.page_content)}\nSource: {source}\n\n"
+    return output_str
+
 def ask_gpt(query, apikey,history,ui_session_id):
     session_id = f"PDFAISS-{ui_session_id}"
     try:
@@ -368,7 +378,10 @@ with gr.Blocks() as demo:
     gr.Markdown("Upload your documents and question them.")
     with gr.Accordion("Open to enter your API key", open=False):
         apikey_input = gr.Textbox(placeholder="Type here your OpenAI API key to use Summarization and Q&A", label="OpenAI API Key",type='password')
-    with gr.Tab("Upload PDF & TXT"):
+
+
+
+    with gr.Tab("Upload PDF & TXT"):
         with gr.Accordion("Get files from the web", open=False):
             with gr.Column():
                 topic_input = gr.Textbox(placeholder="Type your research", label="Research")
@@ -379,6 +392,8 @@ with gr.Blocks() as demo:
                 dd_documents.style(container=True)
             with gr.Row():
                 btn_dl = gr.Button("Add these files to the Database")
+                btn_export = gr.Button("Export selected files ⬇⬇")
+
         tb_session_id = gr.Textbox(label='session id')
         docs_input = gr.File(file_count="multiple", file_types=[".txt", ".pdf",".zip",".docx"])
        db_output = gr.outputs.File(label="Download zipped database")
@@ -406,10 +421,12 @@ with gr.Blocks() as demo:
 
    btn_search.click(search_docs, inputs=[topic_input, max_files], outputs=dd_documents)
    btn_dl.click(add_to_db, inputs=[dd_documents,tb_session_id], outputs=[db_output,tb_session_id])
+    topic_input.submit(export_files, inputs=dd_documents, outputs=docs_input)
+    btn_export.click(export_files, inputs=dd_documents, outputs=docs_input)
    btn_generate_db.click(embed_files, inputs=[docs_input,tb_session_id], outputs=[db_output,tb_session_id])
    btn_reset_db.click(reset_database,inputs=[tb_session_id],outputs=[db_output])
    btn_summary.click(summarize_docs, inputs=[apikey_input,tb_session_id], outputs=summary_output)
    btn_askGPT.click(ask_gpt, inputs=[query_input,apikey_input,history,tb_session_id], outputs=[answer_output,sources,history])
-
+    #
 demo.queue(concurrency_count=10)
 demo.launch(debug=False,share=False)
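
For readers of the diff, here is a minimal sketch of the pattern the new ret_names flag and export_files build on: tempfile.NamedTemporaryFile(delete=False) keeps the downloaded PDF on disk after close(), so returning temp_file.name hands Gradio a plain path that a gr.File component can serve, while the default (returning the file object) preserves the existing add_to_db behaviour. The helper name fetch_pdfs and the splitting on " -- URL : " are assumptions inferred from search_docs' output format, not part of the commit.

import tempfile
import requests

def fetch_pdfs(references, ret_names=False):
    # Sketch only (hypothetical helper mirroring store_files): download each
    # referenced PDF into a persistent temp file and return either the open
    # file objects or just their paths.
    temp_files = []
    for ref in references:
        url = ref.split(" -- URL : ")[-1]   # assumes search_docs' "TITLE ... -- URL : <href>" strings
        response = requests.get(url, timeout=30)
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        tmp.write(response.content)
        tmp.close()                         # delete=False keeps the file on disk after closing
        temp_files.append(tmp.name if ret_names else tmp)
    return temp_files

In the app itself, export_files passes these paths to the docs_input gr.File component via btn_export.click(...) and topic_input.submit(...), which is why store_files is called there with ret_names=True.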