Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -43,8 +43,7 @@ if not HF_API_TOKEN:
|
|
43 |
# --- Global Vector Store and Embeddings ---
|
44 |
try:
|
45 |
# Make sure to import HuggingFaceEmbeddings from the new package
|
46 |
-
|
47 |
-
from langchain_huggingface import HuggingFaceEmbeddings # Or keep langchain_community if you haven't migrated yet
|
48 |
|
49 |
embeddings = HuggingFaceEmbeddings(model_name=HF_EMBEDDING_MODEL_ID)
|
50 |
logger.info(f"Initialized HuggingFaceEmbeddings with model: {HF_EMBEDDING_MODEL_ID}")
|
@@ -52,8 +51,10 @@ except Exception as e:
|
|
52 |
logger.error(f"Failed to initialize HuggingFaceEmbeddings: {e}. Please ensure the model_id is correct and dependencies are installed.")
|
53 |
embeddings = None
|
54 |
|
55 |
-
# Initialize DocArrayInMemorySearch
|
56 |
-
|
|
|
|
|
57 |
text_splitter = RecursiveCharacterTextSplitter(
|
58 |
chunk_size=1000,
|
59 |
chunk_overlap=200,
|
@@ -63,27 +64,14 @@ text_splitter = RecursiveCharacterTextSplitter(
|
|
63 |
logger.info("Initialized in-memory DocArrayInMemorySearch vector store and RecursiveCharacterTextSplitter.")
|
64 |
|
65 |
|
66 |
-
# --- Utility Functions ---
|
67 |
-
def extract_youtube_id(url: str) -> str:
|
68 |
-
"""Extract YouTube ID from various URL formats"""
|
69 |
-
patterns = [
|
70 |
-
r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/watch\?v=([^&]+)',
|
71 |
-
r'(?:https?:\/\/)?youtu\.be\/([^?]+)',
|
72 |
-
r'([a-zA-Z0-9_-]{11})'
|
73 |
-
]
|
74 |
-
for pattern in patterns:
|
75 |
-
match = re.search(pattern, url)
|
76 |
-
if match:
|
77 |
-
return match.group(1)
|
78 |
-
return ""
|
79 |
|
80 |
def add_document_to_vector_store(content: str, source: str, metadata: dict = None):
|
81 |
"""
|
82 |
Adds content to the global vector store.
|
83 |
Chunks the content and creates LangChain Documents.
|
84 |
"""
|
85 |
-
if vectorstore is None
|
86 |
-
logger.warning("Vector store
|
87 |
return
|
88 |
|
89 |
try:
|
@@ -95,14 +83,33 @@ def add_document_to_vector_store(content: str, source: str, metadata: dict = Non
|
|
95 |
doc_metadata.update(metadata)
|
96 |
docs.append(Document(page_content=chunk, metadata=doc_metadata))
|
97 |
|
98 |
-
#
|
99 |
-
|
|
|
|
|
|
|
|
|
100 |
logger.info(f"Added {len(docs)} chunks from '{source}' to the vector store.")
|
101 |
except Exception as e:
|
102 |
logger.error(f"Error adding document from '{source}' to vector store: {e}")
|
103 |
|
104 |
|
105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
# --- Enhanced Tools ---
|
107 |
class WikiSearchTool(Tool):
|
108 |
"""Enhanced Wikipedia search with better formatting and error handling"""
|
@@ -556,19 +563,24 @@ You are an advanced, helpful, and highly analytical research assistant. Your goa
|
|
556 |
return agent
|
557 |
|
558 |
def __call__(self, question: str) -> str:
|
|
|
559 |
logger.info(f"Received question: {question[:200]}...")
|
|
|
|
|
|
|
560 |
try:
|
561 |
global vectorstore
|
|
|
|
|
562 |
if embeddings:
|
563 |
-
vectorstore = DocArrayInMemorySearch(
|
564 |
logger.info("DocArrayInMemorySearch re-initialized for new session.")
|
565 |
else:
|
566 |
logger.warning("Embeddings not initialized, cannot re-initialize DocArrayInMemorySearch.")
|
567 |
return "Error: Embedding model not loaded, cannot process request."
|
568 |
|
569 |
# --- Implement a timeout for the agent's run method ---
|
570 |
-
# Max time in seconds for the agent to respond
|
571 |
-
AGENT_TIMEOUT_SECONDS = 120
|
572 |
|
573 |
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
|
574 |
future = executor.submit(self.agent.run, question)
|
@@ -584,6 +596,8 @@ def __call__(self, question: str) -> str:
|
|
584 |
return f"Error processing your request: {str(e)}. Please try again or rephrase your question."
|
585 |
|
586 |
logger.info(f"Response generated successfully for question: {question[:200]}")
|
|
|
|
|
587 |
return response
|
588 |
except Exception as e:
|
589 |
# This outer catch is for issues before agent.run is called or unhandled by the ThreadPoolExecutor
|
|
|
43 |
# --- Global Vector Store and Embeddings ---
|
44 |
try:
|
45 |
# Make sure to import HuggingFaceEmbeddings from the new package
|
46 |
+
from langchain_huggingface import HuggingFaceEmbeddings # This is the correct import now
|
|
|
47 |
|
48 |
embeddings = HuggingFaceEmbeddings(model_name=HF_EMBEDDING_MODEL_ID)
|
49 |
logger.info(f"Initialized HuggingFaceEmbeddings with model: {HF_EMBEDDING_MODEL_ID}")
|
|
|
51 |
logger.error(f"Failed to initialize HuggingFaceEmbeddings: {e}. Please ensure the model_id is correct and dependencies are installed.")
|
52 |
embeddings = None
|
53 |
|
54 |
+
# Initialize DocArrayInMemorySearch WITH the embedding function here
|
55 |
+
# This will likely work with newer versions of DocArrayInMemorySearch
|
56 |
+
# as it needs the embedding function for its internal doc_index.
|
57 |
+
vectorstore = DocArrayInMemorySearch(embedding=embeddings) if embeddings else None # <--- FIXED THIS LINE AGAIN
|
58 |
text_splitter = RecursiveCharacterTextSplitter(
|
59 |
chunk_size=1000,
|
60 |
chunk_overlap=200,
|
|
|
64 |
logger.info("Initialized in-memory DocArrayInMemorySearch vector store and RecursiveCharacterTextSplitter.")
|
65 |
|
66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
|
68 |
def add_document_to_vector_store(content: str, source: str, metadata: dict = None):
|
69 |
"""
|
70 |
Adds content to the global vector store.
|
71 |
Chunks the content and creates LangChain Documents.
|
72 |
"""
|
73 |
+
if vectorstore is None: # Embeddings check is less critical here if vectorstore is already None
|
74 |
+
logger.warning("Vector store not initialized. Cannot add document.")
|
75 |
return
|
76 |
|
77 |
try:
|
|
|
83 |
doc_metadata.update(metadata)
|
84 |
docs.append(Document(page_content=chunk, metadata=doc_metadata))
|
85 |
|
86 |
+
# When `vectorstore` was initialized with `embedding=embeddings`,
|
87 |
+
# `add_documents` often doesn't *also* need `embedding=embeddings`
|
88 |
+
# if the vectorstore already knows its embedding function.
|
89 |
+
# The explicit `embedding=embeddings` argument is therefore omitted from the call below.
|
90 |
+
vectorstore.add_documents(docs) # Changed from vectorstore.add_documents(docs, embedding=embeddings)
|
91 |
+
# as it should now pick up the embedding from initialization.
|
92 |
logger.info(f"Added {len(docs)} chunks from '{source}' to the vector store.")
|
93 |
except Exception as e:
|
94 |
logger.error(f"Error adding document from '{source}' to vector store: {e}")
|
95 |
|
96 |
|
97 |
|
98 |
+
# --- Utility Functions ---
|
99 |
+
def extract_youtube_id(url: str) -> str:
    """Extract the 11-character YouTube video ID from a URL or a bare ID.

    Recognised inputs:
      * ``youtube.com/watch?...v=<id>`` (``v`` may be any query parameter,
        not only the first one)
      * ``youtube.com/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``, ``/v/<id>``
      * ``youtu.be/<id>``
      * a bare 11-character ID (surrounding whitespace is ignored)

    Args:
        url: The URL (or bare video ID) to inspect.

    Returns:
        The video ID, or an empty string when no ID can be recognised.
    """
    if not url:
        return ""

    candidate = url.strip()

    # Exactly 11 ID characters; the lookahead rejects longer tokens so that a
    # truncated prefix of some other identifier is never returned.
    video_id = r'([A-Za-z0-9_-]{11})(?![A-Za-z0-9_-])'

    patterns = [
        # watch URLs: `v=` either directly after `?` or after another `&`-terminated parameter
        r'youtube(?:-nocookie)?\.com/watch\?(?:[^#\s]*&)?v=' + video_id,
        # path-style URLs
        r'youtube(?:-nocookie)?\.com/(?:embed|shorts|live|v)/' + video_id,
        # short links
        r'youtu\.be/' + video_id,
    ]
    for pattern in patterns:
        match = re.search(pattern, candidate)
        if match:
            return match.group(1)

    # Accept a bare ID only when it is the *entire* input. An unanchored
    # 11-character search would pull a bogus "ID" out of arbitrary URLs.
    bare = re.fullmatch(r'[A-Za-z0-9_-]{11}', candidate)
    return bare.group(0) if bare else ""
|
111 |
+
|
112 |
+
|
113 |
# --- Enhanced Tools ---
|
114 |
class WikiSearchTool(Tool):
|
115 |
"""Enhanced Wikipedia search with better formatting and error handling"""
|
|
|
563 |
return agent
|
564 |
|
565 |
def __call__(self, question: str) -> str:
|
566 |
+
# Logging the initial receipt of the question
|
567 |
logger.info(f"Received question: {question[:200]}...")
|
568 |
+
# print statement for immediate console feedback (optional, for debugging/display)
|
569 |
+
print(f"Agent received question (first 50 chars): {question[:50]}...")
|
570 |
+
|
571 |
try:
|
572 |
global vectorstore
|
573 |
+
# Re-initialize vectorstore for a new session, passing the embeddings
|
574 |
+
# This is crucial for newer versions of DocArrayInMemorySearch
|
575 |
if embeddings:
|
576 |
+
vectorstore = DocArrayInMemorySearch(embedding=embeddings)
|
577 |
logger.info("DocArrayInMemorySearch re-initialized for new session.")
|
578 |
else:
|
579 |
logger.warning("Embeddings not initialized, cannot re-initialize DocArrayInMemorySearch.")
|
580 |
return "Error: Embedding model not loaded, cannot process request."
|
581 |
|
582 |
# --- Implement a timeout for the agent's run method ---
|
583 |
+
AGENT_TIMEOUT_SECONDS = 120 # Max time in seconds for the agent to respond
|
|
|
584 |
|
585 |
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
|
586 |
future = executor.submit(self.agent.run, question)
|
|
|
596 |
return f"Error processing your request: {str(e)}. Please try again or rephrase your question."
|
597 |
|
598 |
logger.info(f"Response generated successfully for question: {question[:200]}")
|
599 |
+
# print statement for immediate console feedback of the final answer
|
600 |
+
print(f"Agent returning answer: {response}")
|
601 |
return response
|
602 |
except Exception as e:
|
603 |
# This outer catch is for issues before agent.run is called or unhandled by the ThreadPoolExecutor
|