Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -42,8 +42,7 @@ if not HF_API_TOKEN:
|
|
42 |
|
43 |
# --- Global Vector Store and Embeddings ---
|
44 |
try:
|
45 |
-
|
46 |
-
from langchain_huggingface import HuggingFaceEmbeddings # This is the correct import now
|
47 |
|
48 |
embeddings = HuggingFaceEmbeddings(model_name=HF_EMBEDDING_MODEL_ID)
|
49 |
logger.info(f"Initialized HuggingFaceEmbeddings with model: {HF_EMBEDDING_MODEL_ID}")
|
@@ -51,10 +50,9 @@ except Exception as e:
|
|
51 |
logger.error(f"Failed to initialize HuggingFaceEmbeddings: {e}. Please ensure the model_id is correct and dependencies are installed.")
|
52 |
embeddings = None
|
53 |
|
54 |
-
# Initialize DocArrayInMemorySearch
|
55 |
-
#
|
56 |
-
|
57 |
-
vectorstore = DocArrayInMemorySearch(embedding=embeddings) if embeddings else None # <--- FIXED THIS LINE AGAIN
|
58 |
text_splitter = RecursiveCharacterTextSplitter(
|
59 |
chunk_size=1000,
|
60 |
chunk_overlap=200,
|
@@ -70,8 +68,8 @@ def add_document_to_vector_store(content: str, source: str, metadata: dict = Non
|
|
70 |
Adds content to the global vector store.
|
71 |
Chunks the content and creates LangChain Documents.
|
72 |
"""
|
73 |
-
if vectorstore is None
|
74 |
-
logger.warning("Vector store not initialized. Cannot add document.")
|
75 |
return
|
76 |
|
77 |
try:
|
@@ -83,18 +81,15 @@ def add_document_to_vector_store(content: str, source: str, metadata: dict = Non
|
|
83 |
doc_metadata.update(metadata)
|
84 |
docs.append(Document(page_content=chunk, metadata=doc_metadata))
|
85 |
|
86 |
-
#
|
87 |
-
#
|
88 |
-
# if
|
89 |
-
|
90 |
-
vectorstore.add_documents(docs) # Changed from vectorstore.add_documents(docs, embedding=embeddings)
|
91 |
-
# as it should now pick up the embedding from initialization.
|
92 |
logger.info(f"Added {len(docs)} chunks from '{source}' to the vector store.")
|
93 |
except Exception as e:
|
94 |
logger.error(f"Error adding document from '{source}' to vector store: {e}")
|
95 |
|
96 |
|
97 |
-
|
98 |
# --- Utility Functions ---
|
99 |
def extract_youtube_id(url: str) -> str:
|
100 |
"""Extract YouTube ID from various URL formats"""
|
@@ -315,29 +310,21 @@ class RetrievalTool(Tool):
|
|
315 |
}
|
316 |
output_type = "string"
|
317 |
|
318 |
-
|
319 |
-
|
320 |
-
|
|
|
|
|
321 |
|
322 |
try:
|
323 |
logger.info(f"Retrieving {k} chunks from DocArrayInMemorySearch for query: {query}")
|
324 |
-
|
|
|
325 |
|
326 |
if not retrieved_docs:
|
327 |
return "No relevant information found in the vector store for this query."
|
328 |
|
329 |
-
|
330 |
-
for i, doc in enumerate(retrieved_docs):
|
331 |
-
source = doc.metadata.get('source', 'Unknown Source')
|
332 |
-
title = doc.metadata.get('title', 'N/A')
|
333 |
-
chunk_index = doc.metadata.get('chunk_index', 'N/A')
|
334 |
-
formatted_results.append(
|
335 |
-
f"--- Retrieved Document Chunk {i+1} ---\n"
|
336 |
-
f"Source: {source} (Chunk: {chunk_index})\n"
|
337 |
-
f"Title: {title}\n"
|
338 |
-
f"Content: {doc.page_content}\n"
|
339 |
-
)
|
340 |
-
return "\n\n".join(formatted_results)
|
341 |
except Exception as e:
|
342 |
logger.error(f"Error retrieving from vector store for query '{query}': {e}")
|
343 |
return f"Error retrieving from vector store: {str(e)}"
|
@@ -562,23 +549,24 @@ You are an advanced, helpful, and highly analytical research assistant. Your goa
|
|
562 |
agent.prompt_templates["system_prompt"] = system_prompt
|
563 |
return agent
|
564 |
|
565 |
-
|
566 |
-
|
567 |
logger.info(f"Received question: {question[:200]}...")
|
568 |
-
# print statement for immediate console feedback (optional, for debugging/display)
|
569 |
print(f"Agent received question (first 50 chars): {question[:50]}...")
|
570 |
|
571 |
try:
|
572 |
global vectorstore
|
573 |
-
# Re-initialize vectorstore for a new session
|
574 |
-
# This
|
575 |
if embeddings:
|
576 |
-
vectorstore = DocArrayInMemorySearch(
|
577 |
logger.info("DocArrayInMemorySearch re-initialized for new session.")
|
578 |
else:
|
579 |
logger.warning("Embeddings not initialized, cannot re-initialize DocArrayInMemorySearch.")
|
580 |
return "Error: Embedding model not loaded, cannot process request."
|
581 |
|
|
|
|
|
582 |
# --- Implement a timeout for the agent's run method ---
|
583 |
AGENT_TIMEOUT_SECONDS = 120 # Max time in seconds for the agent to respond
|
584 |
|
|
|
42 |
|
43 |
# --- Global Vector Store and Embeddings ---
|
44 |
try:
|
45 |
+
from langchain_huggingface import HuggingFaceEmbeddings # Correct import for embeddings
|
|
|
46 |
|
47 |
embeddings = HuggingFaceEmbeddings(model_name=HF_EMBEDDING_MODEL_ID)
|
48 |
logger.info(f"Initialized HuggingFaceEmbeddings with model: {HF_EMBEDDING_MODEL_ID}")
|
|
|
50 |
logger.error(f"Failed to initialize HuggingFaceEmbeddings: {e}. Please ensure the model_id is correct and dependencies are installed.")
|
51 |
embeddings = None
|
52 |
|
53 |
+
# Build the in-memory vector store through the `from_params` factory.
# The bare constructor requires both a doc index and an embedding, so calling
# `DocArrayInMemorySearch()` with no arguments raises TypeError at import time.
# `from_params` creates the underlying index and binds the embedding model to
# the store. Stays None when the embedding model failed to load.
vectorstore = DocArrayInMemorySearch.from_params(embeddings) if embeddings else None
|
|
|
56 |
text_splitter = RecursiveCharacterTextSplitter(
|
57 |
chunk_size=1000,
|
58 |
chunk_overlap=200,
|
|
|
68 |
Adds content to the global vector store.
|
69 |
Chunks the content and creates LangChain Documents.
|
70 |
"""
|
71 |
+
if vectorstore is None or embeddings is None: # Explicitly check embeddings
|
72 |
+
logger.warning("Vector store or embeddings not initialized. Cannot add document.")
|
73 |
return
|
74 |
|
75 |
try:
|
|
|
81 |
doc_metadata.update(metadata)
|
82 |
docs.append(Document(page_content=chunk, metadata=doc_metadata))
|
83 |
|
84 |
+
# Pass the embeddings function here when adding documents.
|
85 |
+
# This is often the more reliable way for DocArrayInMemorySearch
|
86 |
+
# if its __init__ doesn't directly take `embedding`.
|
87 |
+
vectorstore.add_documents(docs, embedding=embeddings) # <--- IMPORTANT: Pass embeddings here
|
|
|
|
|
88 |
logger.info(f"Added {len(docs)} chunks from '{source}' to the vector store.")
|
89 |
except Exception as e:
|
90 |
logger.error(f"Error adding document from '{source}' to vector store: {e}")
|
91 |
|
92 |
|
|
|
93 |
# --- Utility Functions ---
|
94 |
def extract_youtube_id(url: str) -> str:
|
95 |
"""Extract YouTube ID from various URL formats"""
|
|
|
310 |
}
|
311 |
output_type = "string"
|
312 |
|
313 |
+
class RetrievalTool(Tool):
|
314 |
+
# ... (rest of class definition) ...
|
315 |
+
def forward(self, query: str, k: int = 3) -> str:
|
316 |
+
if vectorstore is None or embeddings is None: # Added check for embeddings
|
317 |
+
return "Vector store is not initialized or embeddings are missing. No documents available for retrieval."
|
318 |
|
319 |
try:
|
320 |
logger.info(f"Retrieving {k} chunks from DocArrayInMemorySearch for query: {query}")
|
321 |
+
# Explicitly pass the embedding for similarity search if it's required for query embedding
|
322 |
+
retrieved_docs = vectorstore.similarity_search(query, k=k, embedding=embeddings) # <--- IMPORTANT: Pass embeddings here
|
323 |
|
324 |
if not retrieved_docs:
|
325 |
return "No relevant information found in the vector store for this query."
|
326 |
|
327 |
+
# ... (rest of the method) ...
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
328 |
except Exception as e:
|
329 |
logger.error(f"Error retrieving from vector store for query '{query}': {e}")
|
330 |
return f"Error retrieving from vector store: {str(e)}"
|
|
|
549 |
agent.prompt_templates["system_prompt"] = system_prompt
|
550 |
return agent
|
551 |
|
552 |
+
|
553 |
+
def __call__(self, question: str) -> str:
|
554 |
logger.info(f"Received question: {question[:200]}...")
|
|
|
555 |
print(f"Agent received question (first 50 chars): {question[:50]}...")
|
556 |
|
557 |
try:
|
558 |
global vectorstore
|
559 |
+
# Re-initialize vectorstore for a new session without arguments
|
560 |
+
# This relies on the add_documents and similarity_search methods getting the embedding
|
561 |
if embeddings:
|
562 |
+
vectorstore = DocArrayInMemorySearch() # <--- REVERTED TO THIS SIMPLE INIT HERE TOO
|
563 |
logger.info("DocArrayInMemorySearch re-initialized for new session.")
|
564 |
else:
|
565 |
logger.warning("Embeddings not initialized, cannot re-initialize DocArrayInMemorySearch.")
|
566 |
return "Error: Embedding model not loaded, cannot process request."
|
567 |
|
568 |
+
|
569 |
+
|
570 |
# --- Implement a timeout for the agent's run method ---
|
571 |
AGENT_TIMEOUT_SECONDS = 120 # Max time in seconds for the agent to respond
|
572 |
|