Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -22,6 +22,7 @@ from duckduckgo_search import DDGS
|
|
22 |
import requests
|
23 |
import tempfile
|
24 |
|
|
|
25 |
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
|
26 |
|
27 |
# create the length function
|
@@ -149,14 +150,15 @@ def add_files_to_zip(session_id):
|
|
149 |
arcname = os.path.relpath(file_path, session_id)
|
150 |
zipObj.write(file_path, arcname)
|
151 |
|
152 |
-
|
153 |
## Search files functions ##
|
154 |
|
155 |
def search_docs(topic, max_references):
|
|
|
156 |
doc_list = []
|
157 |
with DDGS() as ddgs:
|
158 |
i=0
|
159 |
for r in ddgs.text('{} filetype:pdf'.format(topic), region='wt-wt', safesearch='On', timelimit='n'):
|
|
|
160 |
if i>=max_references:
|
161 |
break
|
162 |
doc_list.append("TITLE : " + r['title'] + " -- BODY : " + r['body'] + " -- URL : " + r['href'])
|
@@ -164,7 +166,7 @@ def search_docs(topic, max_references):
|
|
164 |
return doc_list
|
165 |
|
166 |
|
167 |
-
def store_files(references):
|
168 |
url_list=[]
|
169 |
temp_files = []
|
170 |
for ref in references:
|
@@ -182,11 +184,13 @@ def store_files(references):
|
|
182 |
temp_file = tempfile.NamedTemporaryFile(delete=False,prefix=filename, suffix='.pdf')
|
183 |
temp_file.write(response.content)
|
184 |
temp_file.close()
|
185 |
-
|
|
|
|
|
|
|
186 |
|
187 |
return temp_files
|
188 |
-
|
189 |
-
|
190 |
## Summary functions ##
|
191 |
|
192 |
## Load each doc from the vector store
|
@@ -289,7 +293,7 @@ def embed_files(files,ui_session_id,progress=gr.Progress(),progress_step=0.05):
|
|
289 |
|
290 |
print("EMBEDDED, before embeddeding: ",session_id,len(db.index_to_docstore_id))
|
291 |
for file_id,file in enumerate(files):
|
292 |
-
print("ID : ", file_id,"FILE : ", file)
|
293 |
file_type = file.name.split('.')[-1].lower()
|
294 |
source = file.name.split('/')[-1]
|
295 |
print(f"current file: {source}")
|
@@ -330,19 +334,25 @@ def embed_files(files,ui_session_id,progress=gr.Progress(),progress_step=0.05):
|
|
330 |
progress(progress_step, desc = 'db zipped')
|
331 |
return f"{session_id}.zip",ui_session_id
|
332 |
|
333 |
-
def display_docs(docs):
    """Build a numbered plain-text listing of documents and their sources.

    Each entry shows "Ref: <n>", the repr() of the document content and the
    base name of its source path, followed by a blank line.

    Args:
        docs: documents exposing `page_content` and a `metadata` dict with
            a 'source' path.

    Returns:
        The concatenated listing as one string ('' for an empty input).
    """
    # ''.join over a generator instead of repeated += — avoids quadratic
    # string rebuilding when many documents are displayed.
    return ''.join(
        f"Ref: {i + 1}\n{doc.page_content!r}\nSource: {doc.metadata['source'].split('/')[-1]}\n\n"
        for i, doc in enumerate(docs)
    )
|
339 |
|
340 |
|
341 |
def add_to_db(references, ui_session_id):
    """Store the selected reference documents and embed them into the
    session's vector database.

    Delegates downloading to store_files() and indexing to embed_files();
    returns whatever embed_files() returns (zip path and session id).
    """
    stored = store_files(references)
    return embed_files(stored, ui_session_id)
|
|
|
|
|
|
|
|
|
|
|
344 |
|
345 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
346 |
def ask_gpt(query, apikey,history,ui_session_id):
|
347 |
session_id = f"PDFAISS-{ui_session_id}"
|
348 |
try:
|
@@ -368,7 +378,10 @@ with gr.Blocks() as demo:
|
|
368 |
gr.Markdown("Upload your documents and question them.")
|
369 |
with gr.Accordion("Open to enter your API key", open=False):
|
370 |
apikey_input = gr.Textbox(placeholder="Type here your OpenAI API key to use Summarization and Q&A", label="OpenAI API Key",type='password')
|
371 |
-
|
|
|
|
|
|
|
372 |
with gr.Accordion("Get files from the web", open=False):
|
373 |
with gr.Column():
|
374 |
topic_input = gr.Textbox(placeholder="Type your research", label="Research")
|
@@ -379,6 +392,8 @@ with gr.Blocks() as demo:
|
|
379 |
dd_documents.style(container=True)
|
380 |
with gr.Row():
|
381 |
btn_dl = gr.Button("Add these files to the Database")
|
|
|
|
|
382 |
tb_session_id = gr.Textbox(label='session id')
|
383 |
docs_input = gr.File(file_count="multiple", file_types=[".txt", ".pdf",".zip",".docx"])
|
384 |
db_output = gr.outputs.File(label="Download zipped database")
|
@@ -406,10 +421,12 @@ with gr.Blocks() as demo:
|
|
406 |
|
407 |
btn_search.click(search_docs, inputs=[topic_input, max_files], outputs=dd_documents)
|
408 |
btn_dl.click(add_to_db, inputs=[dd_documents,tb_session_id], outputs=[db_output,tb_session_id])
|
|
|
|
|
409 |
btn_generate_db.click(embed_files, inputs=[docs_input,tb_session_id], outputs=[db_output,tb_session_id])
|
410 |
btn_reset_db.click(reset_database,inputs=[tb_session_id],outputs=[db_output])
|
411 |
btn_summary.click(summarize_docs, inputs=[apikey_input,tb_session_id], outputs=summary_output)
|
412 |
btn_askGPT.click(ask_gpt, inputs=[query_input,apikey_input,history,tb_session_id], outputs=[answer_output,sources,history])
|
413 |
-
|
414 |
demo.queue(concurrency_count=10)
|
415 |
demo.launch(debug=False,share=False)
|
|
|
22 |
import requests
|
23 |
import tempfile
|
24 |
|
25 |
+
|
26 |
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")
|
27 |
|
28 |
# create the length function
|
|
|
150 |
arcname = os.path.relpath(file_path, session_id)
|
151 |
zipObj.write(file_path, arcname)
|
152 |
|
|
|
153 |
## Search files functions ##
|
154 |
|
155 |
def search_docs(topic, max_references):
|
156 |
+
print(f"SEARCH PDF : {topic}")
|
157 |
doc_list = []
|
158 |
with DDGS() as ddgs:
|
159 |
i=0
|
160 |
for r in ddgs.text('{} filetype:pdf'.format(topic), region='wt-wt', safesearch='On', timelimit='n'):
|
161 |
+
#doc_list.append(str(r))
|
162 |
if i>=max_references:
|
163 |
break
|
164 |
doc_list.append("TITLE : " + r['title'] + " -- BODY : " + r['body'] + " -- URL : " + r['href'])
|
|
|
166 |
return doc_list
|
167 |
|
168 |
|
169 |
+
def store_files(references, ret_names=False):
|
170 |
url_list=[]
|
171 |
temp_files = []
|
172 |
for ref in references:
|
|
|
184 |
temp_file = tempfile.NamedTemporaryFile(delete=False,prefix=filename, suffix='.pdf')
|
185 |
temp_file.write(response.content)
|
186 |
temp_file.close()
|
187 |
+
if ret_names:
|
188 |
+
temp_files.append(temp_file.name)
|
189 |
+
else:
|
190 |
+
temp_files.append(temp_file)
|
191 |
|
192 |
return temp_files
|
193 |
+
|
|
|
194 |
## Summary functions ##
|
195 |
|
196 |
## Load each doc from the vector store
|
|
|
293 |
|
294 |
print("EMBEDDED, before embeddeding: ",session_id,len(db.index_to_docstore_id))
|
295 |
for file_id,file in enumerate(files):
|
296 |
+
print("ID : ", file_id, "FILE : ", file)
|
297 |
file_type = file.name.split('.')[-1].lower()
|
298 |
source = file.name.split('/')[-1]
|
299 |
print(f"current file: {source}")
|
|
|
334 |
progress(progress_step, desc = 'db zipped')
|
335 |
return f"{session_id}.zip",ui_session_id
|
336 |
|
|
|
|
|
|
|
|
|
|
|
|
|
337 |
|
338 |
|
339 |
def add_to_db(references, ui_session_id):
    """Download the referenced documents and add their embeddings to the
    session database.

    Thin pipeline: store_files() fetches the files, embed_files() indexes
    them; the result of embed_files() is returned unchanged.
    """
    return embed_files(store_files(references), ui_session_id)
|
342 |
+
|
343 |
+
def export_files(references):
    """Download the referenced documents and return their local file paths.

    Args:
        references: iterable of reference strings understood by
            store_files() (each containing the document URL).

    Returns:
        List of temporary-file path strings — store_files() is called with
        ret_names=True so it yields names rather than file objects.
    """
    # ret_names=True makes store_files return path strings directly, so no
    # post-processing of file objects is needed (dead commented-out code
    # removed).
    return store_files(references, ret_names=True)
|
347 |
|
348 |
|
349 |
+
def display_docs(docs):
    """Format retrieved documents as a numbered, human-readable reference list.

    Args:
        docs: documents exposing `page_content` and a `metadata` dict with
            a 'source' path.

    Returns:
        One string containing, per document, "Ref: <n>", the repr of its
        content and its source file name, each entry ending with a blank
        line ('' for an empty input).
    """
    parts = []
    for i, doc in enumerate(docs, start=1):
        source = doc.metadata['source'].split('/')[-1]
        parts.append(f"Ref: {i}\n{repr(doc.page_content)}\nSource: {source}\n\n")
    # join once at the end instead of += in the loop (linear, not quadratic)
    return ''.join(parts)
|
355 |
+
|
356 |
def ask_gpt(query, apikey,history,ui_session_id):
|
357 |
session_id = f"PDFAISS-{ui_session_id}"
|
358 |
try:
|
|
|
378 |
gr.Markdown("Upload your documents and question them.")
|
379 |
with gr.Accordion("Open to enter your API key", open=False):
|
380 |
apikey_input = gr.Textbox(placeholder="Type here your OpenAI API key to use Summarization and Q&A", label="OpenAI API Key",type='password')
|
381 |
+
|
382 |
+
|
383 |
+
|
384 |
+
with gr.Tab("Upload PDF & TXT"):
|
385 |
with gr.Accordion("Get files from the web", open=False):
|
386 |
with gr.Column():
|
387 |
topic_input = gr.Textbox(placeholder="Type your research", label="Research")
|
|
|
392 |
dd_documents.style(container=True)
|
393 |
with gr.Row():
|
394 |
btn_dl = gr.Button("Add these files to the Database")
|
395 |
+
btn_export = gr.Button("Export selected files ⬇⬇")
|
396 |
+
|
397 |
tb_session_id = gr.Textbox(label='session id')
|
398 |
docs_input = gr.File(file_count="multiple", file_types=[".txt", ".pdf",".zip",".docx"])
|
399 |
db_output = gr.outputs.File(label="Download zipped database")
|
|
|
421 |
|
422 |
btn_search.click(search_docs, inputs=[topic_input, max_files], outputs=dd_documents)
|
423 |
btn_dl.click(add_to_db, inputs=[dd_documents,tb_session_id], outputs=[db_output,tb_session_id])
|
424 |
+
topic_input.submit(export_files, inputs=dd_documents, outputs=docs_input)
|
425 |
+
btn_export.click(export_files, inputs=dd_documents, outputs=docs_input)
|
426 |
btn_generate_db.click(embed_files, inputs=[docs_input,tb_session_id], outputs=[db_output,tb_session_id])
|
427 |
btn_reset_db.click(reset_database,inputs=[tb_session_id],outputs=[db_output])
|
428 |
btn_summary.click(summarize_docs, inputs=[apikey_input,tb_session_id], outputs=summary_output)
|
429 |
btn_askGPT.click(ask_gpt, inputs=[query_input,apikey_input,history,tb_session_id], outputs=[answer_output,sources,history])
|
430 |
+
#
|
431 |
demo.queue(concurrency_count=10)
|
432 |
demo.launch(debug=False,share=False)
|