Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -210,104 +210,71 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
|
|
| 210 |
print(f"Result {i}:")
|
| 211 |
print(f" Link: {result['link']}")
|
| 212 |
if result['text']:
|
| 213 |
-
print(f" Text: {result['text'][:100]}...") #
|
| 214 |
else:
|
| 215 |
-
print("
|
| 216 |
-
print("End of search results")
|
| 217 |
-
|
| 218 |
-
if not all_results:
|
| 219 |
-
print("No search results found. Returning a default message.")
|
| 220 |
-
return [{"link": None, "text": "No information found in the web search results."}]
|
| 221 |
-
|
| 222 |
return all_results
|
| 223 |
|
| 224 |
-
def
|
| 225 |
global conversation_history
|
| 226 |
|
| 227 |
-
|
| 228 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
|
| 230 |
-
|
| 231 |
-
|
|
|
|
|
|
|
| 232 |
else:
|
| 233 |
-
|
| 234 |
-
embed = get_embeddings()
|
| 235 |
-
|
| 236 |
-
if web_search:
|
| 237 |
-
search_results = google_search(question)
|
| 238 |
-
context_str = "\n".join([result["text"] for result in search_results if result["text"]])
|
| 239 |
-
|
| 240 |
-
# Convert web search results to Document format
|
| 241 |
-
web_docs = [Document(page_content=result["text"], metadata={"source": result["link"]}) for result in search_results if result["text"]]
|
| 242 |
-
|
| 243 |
-
# Check if the FAISS database exists
|
| 244 |
-
if os.path.exists("faiss_database"):
|
| 245 |
-
database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
|
| 246 |
-
database.add_documents(web_docs)
|
| 247 |
-
else:
|
| 248 |
-
database = FAISS.from_documents(web_docs, embed)
|
| 249 |
-
database.save_local("faiss_database")
|
| 250 |
-
|
| 251 |
-
prompt_template = """
|
| 252 |
-
Answer the question based on the following web search results:
|
| 253 |
-
Web Search Results:
|
| 254 |
-
{context}
|
| 255 |
-
Current Question: {question}
|
| 256 |
-
If the web search results don't contain relevant information, state that the information is not available in the search results.
|
| 257 |
-
Provide a concise and direct answer to the question without mentioning the web search or these instructions:
|
| 258 |
-
"""
|
| 259 |
-
prompt_val = ChatPromptTemplate.from_template(prompt_template)
|
| 260 |
-
formatted_prompt = prompt_val.format(context=context_str, question=question)
|
| 261 |
-
else:
|
| 262 |
-
# Check if the FAISS database exists
|
| 263 |
-
if os.path.exists("faiss_database"):
|
| 264 |
-
database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
|
| 265 |
-
else:
|
| 266 |
-
return "No FAISS database found. Please upload documents to create the vector store."
|
| 267 |
|
| 268 |
-
|
|
|
|
|
|
|
|
|
|
| 269 |
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
retriever = database.as_retriever()
|
| 274 |
-
relevant_docs = retriever.get_relevant_documents(question)
|
| 275 |
-
context_str = "\n".join([doc.page_content for doc in relevant_docs])
|
| 276 |
|
| 277 |
-
|
| 278 |
-
|
|
|
|
| 279 |
|
| 280 |
-
|
| 281 |
-
answer = re.split(r'Question:|Current Question:', answer)[-1].strip()
|
| 282 |
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
|
|
|
|
|
|
| 286 |
|
| 287 |
-
|
| 288 |
-
|
|
|
|
|
|
|
| 289 |
|
| 290 |
-
|
| 291 |
-
|
|
|
|
|
|
|
|
|
|
| 292 |
|
| 293 |
return answer
|
| 294 |
|
| 295 |
-
def
|
| 296 |
-
if
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
if use_recursive_splitter:
|
| 304 |
-
data = load_and_split_document_recursive(file)
|
| 305 |
-
else:
|
| 306 |
-
data = load_and_split_document_basic(file)
|
| 307 |
-
create_or_update_database(data, embed)
|
| 308 |
-
total_chunks += len(data)
|
| 309 |
-
|
| 310 |
-
return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files."
|
| 311 |
|
| 312 |
def extract_db_to_excel():
|
| 313 |
embed = get_embeddings()
|
|
@@ -338,47 +305,46 @@ def export_memory_db_to_excel():
|
|
| 338 |
|
| 339 |
return excel_path
|
| 340 |
|
| 341 |
-
# Gradio interface
|
| 342 |
with gr.Blocks() as demo:
|
| 343 |
-
gr.
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
update_button.click(update_vectors, inputs=[file_input, use_recursive_splitter], outputs=update_output)
|
| 352 |
-
|
| 353 |
-
with gr.Row():
|
| 354 |
-
with gr.Column(scale=2):
|
| 355 |
-
chatbot = gr.Chatbot(label="Conversation")
|
| 356 |
-
question_input = gr.Textbox(label="Ask a question about your documents")
|
| 357 |
-
submit_button = gr.Button("Submit")
|
| 358 |
-
with gr.Column(scale=1):
|
| 359 |
-
temperature_slider = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, value=0.5, step=0.1)
|
| 360 |
-
top_p_slider = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.9, step=0.1)
|
| 361 |
-
repetition_penalty_slider = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, value=1.0, step=0.1)
|
| 362 |
-
web_search_checkbox = gr.Checkbox(label="Enable Web Search", value=False)
|
| 363 |
-
|
| 364 |
-
def chat(question, history):
|
| 365 |
-
answer = ask_question(question, temperature_slider.value, top_p_slider.value, repetition_penalty_slider.value, web_search_checkbox.value)
|
| 366 |
-
history.append((question, answer))
|
| 367 |
-
return "", history
|
| 368 |
-
|
| 369 |
-
submit_button.click(chat, inputs=[question_input, chatbot], outputs=[question_input, chatbot])
|
| 370 |
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
clear_button.click(clear_cache,
|
|
|
|
|
|
|
| 382 |
|
| 383 |
-
|
| 384 |
-
demo.launch()
|
|
|
|
| 210 |
print(f"Result {i}:")
|
| 211 |
print(f" Link: {result['link']}")
|
| 212 |
if result['text']:
|
| 213 |
+
print(f" Text: {result['text'][:100]}...") # Display the first 100 characters of the text for brevity
|
| 214 |
else:
|
| 215 |
+
print(" No text extracted")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
return all_results
|
| 217 |
|
| 218 |
+
def process_question(question, documents, history, temperature, top_p, repetition_penalty):
    """Answer ``question`` from the answer cache, the FAISS store, or a web search.

    Steps, in order:
      1. Return a stored answer when a previously asked question scores above
         0.7 with ``get_similarity``.
      2. Retrieve the 3 most similar documents from the on-disk
         ``faiss_database`` store, when it exists.
      3. When nothing was retrieved, run ``google_search``, add the results
         that carry text to the store, and retrieve again.
      4. Fill the module-level ``prompt`` template, generate the answer, then
         record it in ``conversation_history`` and ``memory_database``.

    Args:
        question: The user's question text.
        documents: Not read by this function; kept so existing callers keep
            working.
        history: Iterable of earlier turns; each item is indexed with the
            keys ``"question"`` and ``"answer"``.
        temperature: Forwarded to ``get_model``.
        top_p: Forwarded to ``get_model``.
        repetition_penalty: Forwarded to ``get_model``.

    Returns:
        The stored or newly generated answer.
    """
    global conversation_history

    embeddings = get_embeddings()

    # Check the memory database for similar questions
    for prev_question, prev_answer in memory_database.items():
        if get_similarity(question, prev_question) > 0.7:
            return prev_answer

    # Load the FAISS vector store if it exists
    relevant_docs = []
    if os.path.exists("faiss_database"):
        # NOTE(review): allow_dangerous_deserialization unpickles the index;
        # only safe while "faiss_database" is written by this app alone.
        db = FAISS.load_local("faiss_database", embeddings, allow_dangerous_deserialization=True)
        relevant_docs = db.similarity_search(question, k=3)

    if not relevant_docs:
        # Perform web search and update the vector store
        web_search_results = google_search(question, num_results=5)
        web_docs = [
            Document(page_content=res["text"], metadata={"source": res["link"]})
            for res in web_search_results
            if res["text"]
        ]

        if web_docs:
            # Update the FAISS vector store with new documents
            create_or_update_database(web_docs, embeddings)

            # Reload only after something was written: with no new documents
            # the store is unchanged, and it may not exist on disk at all.
            db = FAISS.load_local("faiss_database", embeddings, allow_dangerous_deserialization=True)
            relevant_docs = db.similarity_search(question, k=3)

    context = "\n\n".join(doc.page_content for doc in relevant_docs)

    # Build the history text before branching so it is bound on both paths;
    # previously it was assigned only when the question was unrelated to the
    # history, leaving the prompt formatting below to fail otherwise.
    history_text = "\n".join(f"Q: {h['question']}\nA: {h['answer']}" for h in history)

    if is_related_to_history(question, history):
        context = "None"
    else:
        context = context or "None"

    # ChatPromptTemplate is built from a template string via from_template();
    # its constructor does not take ``template=``.
    prompt_text = ChatPromptTemplate.from_template(prompt).format(
        history=history_text, context=context, question=question
    )

    model = get_model(temperature, top_p, repetition_penalty)
    answer = generate_chunked_response(model, prompt_text)

    conversation_history = manage_conversation_history(question, answer, history)
    memory_database[question] = answer

    return answer
|
| 269 |
|
| 270 |
+
def process_uploaded_file(file, is_recursive):
    """Split an uploaded file into chunks and add them to the vector database.

    Args:
        file: The uploaded file handed over by the UI, or ``None`` when the
            button was pressed without an upload.
        is_recursive: When truthy, split with
            ``load_and_split_document_recursive``; otherwise with
            ``load_and_split_document_basic``.

    Returns:
        A status message for display in the UI.
    """
    # The upload button can be clicked with nothing selected; report that
    # instead of handing None to the document loader.
    if file is None:
        return "No file uploaded. Please upload a PDF before processing."

    if is_recursive:
        data = load_and_split_document_recursive(file)
    else:
        data = load_and_split_document_basic(file)

    embeddings = get_embeddings()
    create_or_update_database(data, embeddings)
    return "File processed and data added to the vector database."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
|
| 279 |
def extract_db_to_excel():
|
| 280 |
embed = get_embeddings()
|
|
|
|
| 305 |
|
| 306 |
return excel_path
|
| 307 |
|
|
|
|
| 308 |
def _answer_from_ui(question_text, temperature_value, top_p_value, repetition_penalty_value):
    """Adapt the "Ask Questions" tab inputs to ``process_question``'s signature.

    ``process_question`` takes ``(question, documents, history, temperature,
    top_p, repetition_penalty)``. The UI has no component holding documents or
    history, so ``None`` is passed for ``documents`` (which the function does
    not read) and the module-level conversation history is passed as
    ``history``.
    """
    # NOTE(review): assumes ``conversation_history`` is the module-level list
    # that process_question maintains via ``global``; falls back to an empty
    # history if it has not been defined yet - confirm against the file top.
    past_turns = globals().get("conversation_history", [])
    return process_question(
        question_text,
        None,
        past_turns,
        temperature_value,
        top_p_value,
        repetition_penalty_value,
    )


# Gradio interface: one tab per task, event wiring at the bottom.
with gr.Blocks() as demo:
    with gr.Tab("Upload PDF"):
        with gr.Row():
            pdf_file = gr.File(label="Upload PDF")
        with gr.Row():
            recursive_check = gr.Checkbox(label="Use Recursive Text Splitter")
            upload_button = gr.Button("Upload and Process")
        with gr.Row():
            upload_output = gr.Textbox(label="Upload Output")

    with gr.Tab("Ask Questions"):
        with gr.Row():
            question = gr.Textbox(label="Your Question")
        with gr.Row():
            temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, label="Temperature")
            top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, label="Top P")
            # NOTE(review): a repetition penalty of 0.0 is selectable here; the
            # earlier revision of this UI used a 1.0 minimum - confirm the
            # model backend accepts values below 1.0.
            repetition_penalty = gr.Slider(minimum=0.0, maximum=2.0, value=1.0, label="Repetition Penalty")
        with gr.Row():
            ask_button = gr.Button("Ask")
        with gr.Row():
            answer = gr.Textbox(label="Answer")

    with gr.Tab("Clear Cache"):
        with gr.Row():
            clear_button = gr.Button("Clear Cache")
        with gr.Row():
            clear_output = gr.Textbox(label="Clear Output")

    with gr.Tab("Export Data"):
        with gr.Row():
            export_db_button = gr.Button("Export Database to Excel")
            export_db_output = gr.Textbox(label="Export Output")
        with gr.Row():
            export_memory_button = gr.Button("Export Memory DB to Excel")
            export_memory_output = gr.Textbox(label="Export Output")

    upload_button.click(process_uploaded_file, [pdf_file, recursive_check], upload_output)
    # Inputs are passed positionally. The previous wiring sent the uploaded
    # file as ``documents`` and the splitter checkbox (a bool) as ``history``,
    # which process_question then tried to iterate.
    ask_button.click(_answer_from_ui, [question, temperature, top_p, repetition_penalty], answer)
    clear_button.click(clear_cache, [], clear_output)
    export_db_button.click(extract_db_to_excel, [], export_db_output)
    export_memory_button.click(export_memory_db_to_excel, [], export_memory_output)

demo.launch()
|
|
|