Spaces:
Sleeping
Sleeping
update app.py
Browse files
app.py
CHANGED
@@ -10,7 +10,8 @@ from youtube_transcript_api import YouTubeTranscriptApi
|
|
10 |
from smolagents import tool, Tool, CodeAgent, DuckDuckGoSearchTool, HfApiModel, VisitWebpageTool, SpeechToTextTool, FinalAnswerTool
|
11 |
from langchain_community.document_loaders import WikipediaLoader, PyPDFLoader, TextLoader
|
12 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
13 |
-
|
|
|
14 |
from langchain_community.vectorstores import DocArrayInMemorySearch
|
15 |
from langchain_core.documents import Document
|
16 |
from dotenv import load_dotenv
|
@@ -22,6 +23,10 @@ import uuid
|
|
22 |
import concurrent.futures
|
23 |
import time
|
24 |
|
|
|
|
|
|
|
|
|
25 |
# --- Initialize logging ---
|
26 |
LOG_FILE_PATH = "agent_activity.log"
|
27 |
logging.basicConfig(
|
@@ -42,17 +47,14 @@ if not HF_API_TOKEN:
|
|
42 |
|
43 |
# --- Global Vector Store and Embeddings ---
|
44 |
try:
|
45 |
-
from langchain_huggingface import HuggingFaceEmbeddings # Correct import for embeddings
|
46 |
-
|
47 |
embeddings = HuggingFaceEmbeddings(model_name=HF_EMBEDDING_MODEL_ID)
|
48 |
logger.info(f"Initialized HuggingFaceEmbeddings with model: {HF_EMBEDDING_MODEL_ID}")
|
49 |
except Exception as e:
|
50 |
logger.error(f"Failed to initialize HuggingFaceEmbeddings: {e}. Please ensure the model_id is correct and dependencies are installed.")
|
51 |
embeddings = None
|
52 |
|
53 |
-
# Initialize DocArrayInMemorySearch
|
54 |
-
|
55 |
-
vectorstore = DocArrayInMemorySearch() if embeddings else None # <--- REVERTED TO THIS SIMPLE INIT
|
56 |
text_splitter = RecursiveCharacterTextSplitter(
|
57 |
chunk_size=1000,
|
58 |
chunk_overlap=200,
|
@@ -62,14 +64,27 @@ text_splitter = RecursiveCharacterTextSplitter(
|
|
62 |
logger.info("Initialized in-memory DocArrayInMemorySearch vector store and RecursiveCharacterTextSplitter.")
|
63 |
|
64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
|
66 |
def add_document_to_vector_store(content: str, source: str, metadata: dict = None):
|
67 |
"""
|
68 |
Adds content to the global vector store.
|
69 |
Chunks the content and creates LangChain Documents.
|
70 |
"""
|
71 |
-
if vectorstore is None
|
72 |
-
logger.warning("Vector store
|
73 |
return
|
74 |
|
75 |
try:
|
@@ -81,30 +96,13 @@ def add_document_to_vector_store(content: str, source: str, metadata: dict = Non
|
|
81 |
doc_metadata.update(metadata)
|
82 |
docs.append(Document(page_content=chunk, metadata=doc_metadata))
|
83 |
|
84 |
-
#
|
85 |
-
#
|
86 |
-
#
|
87 |
-
vectorstore.add_documents(docs, embedding=embeddings) # <--- IMPORTANT: Pass embeddings here
|
88 |
logger.info(f"Added {len(docs)} chunks from '{source}' to the vector store.")
|
89 |
except Exception as e:
|
90 |
logger.error(f"Error adding document from '{source}' to vector store: {e}")
|
91 |
|
92 |
-
|
93 |
-
# --- Utility Functions ---
|
94 |
-
def extract_youtube_id(url: str) -> str:
|
95 |
-
"""Extract YouTube ID from various URL formats"""
|
96 |
-
patterns = [
|
97 |
-
r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/watch\?v=([^&]+)',
|
98 |
-
r'(?:https?:\/\/)?youtu\.be\/([^?]+)',
|
99 |
-
r'([a-zA-Z0-9_-]{11})'
|
100 |
-
]
|
101 |
-
for pattern in patterns:
|
102 |
-
match = re.search(pattern, url)
|
103 |
-
if match:
|
104 |
-
return match.group(1)
|
105 |
-
return ""
|
106 |
-
|
107 |
-
|
108 |
# --- Enhanced Tools ---
|
109 |
class WikiSearchTool(Tool):
|
110 |
"""Enhanced Wikipedia search with better formatting and error handling"""
|
@@ -310,25 +308,36 @@ class RetrievalTool(Tool):
|
|
310 |
}
|
311 |
output_type = "string"
|
312 |
|
313 |
-
|
314 |
def forward(self, query: str, k: int = 3) -> str:
|
315 |
-
if vectorstore is None or embeddings is None:
|
316 |
return "Vector store is not initialized or embeddings are missing. No documents available for retrieval."
|
317 |
|
318 |
try:
|
319 |
logger.info(f"Retrieving {k} chunks from DocArrayInMemorySearch for query: {query}")
|
320 |
-
#
|
321 |
-
|
|
|
|
|
322 |
|
323 |
if not retrieved_docs:
|
324 |
return "No relevant information found in the vector store for this query."
|
325 |
|
326 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
327 |
except Exception as e:
|
328 |
logger.error(f"Error retrieving from vector store for query '{query}': {e}")
|
329 |
return f"Error retrieving from vector store: {str(e)}"
|
330 |
|
331 |
-
|
332 |
class ChessAnalysisAPITool(Tool):
|
333 |
"""
|
334 |
Analyzes a chess position provided in FEN format using a remote chess engine API (chess-api.com).
|
@@ -395,6 +404,7 @@ class ChessAnalysisAPITool(Tool):
|
|
395 |
logger.error(f"An unexpected error occurred during remote chess analysis for FEN '{fen_string}': {e}")
|
396 |
return f"An unexpected error occurred during chess analysis: {str(e)}"
|
397 |
|
|
|
398 |
# --- Agent Initialization ---
|
399 |
class BasicAgent:
|
400 |
def __init__(self):
|
@@ -428,8 +438,7 @@ class BasicAgent:
|
|
428 |
logger.info("Adding RetrievalTool to the agent's tools.")
|
429 |
base_tools.append(RetrievalTool())
|
430 |
else:
|
431 |
-
logger.warning("RetrievalTool not added because vector store or embeddings are not initialized.")
|
432 |
-
|
433 |
|
434 |
return base_tools
|
435 |
|
|
|
10 |
from smolagents import tool, Tool, CodeAgent, DuckDuckGoSearchTool, HfApiModel, VisitWebpageTool, SpeechToTextTool, FinalAnswerTool
|
11 |
from langchain_community.document_loaders import WikipediaLoader, PyPDFLoader, TextLoader
|
12 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
13 |
+
# Use the new import for HuggingFaceEmbeddings
|
14 |
+
from langchain_huggingface import HuggingFaceEmbeddings # <--- IMPORTANT: Updated import
|
15 |
from langchain_community.vectorstores import DocArrayInMemorySearch
|
16 |
from langchain_core.documents import Document
|
17 |
from dotenv import load_dotenv
|
|
|
23 |
import concurrent.futures
|
24 |
import time
|
25 |
|
26 |
+
# Import DocList from docarray
|
27 |
+
from docarray import DocList # <--- IMPORTANT: Added this import
|
28 |
+
|
29 |
+
|
30 |
# --- Initialize logging ---
|
31 |
LOG_FILE_PATH = "agent_activity.log"
|
32 |
logging.basicConfig(
|
|
|
47 |
|
48 |
# --- Global Vector Store and Embeddings ---
|
49 |
# Load the embedding model once at import time. A failure is logged and leaves
# `embeddings` as None rather than aborting the import.
try:
    embeddings = HuggingFaceEmbeddings(model_name=HF_EMBEDDING_MODEL_ID)
    logger.info(f"Initialized HuggingFaceEmbeddings with model: {HF_EMBEDDING_MODEL_ID}")
except Exception as e:
    logger.error(f"Failed to initialize HuggingFaceEmbeddings: {e}. Please ensure the model_id is correct and dependencies are installed.")
    embeddings = None

# Build the in-memory store through the `from_params` factory, which creates
# the backing document index itself. The bare constructor expects a ready-made
# document index; a `DocList` is only a document container, so it is not a
# valid `doc_index` argument.
vectorstore = None
if embeddings is not None:
    try:
        vectorstore = DocArrayInMemorySearch.from_params(embedding=embeddings)
    except Exception as e:
        # Same policy as the embeddings above: log and continue with no store.
        logger.error(f"Failed to initialize DocArrayInMemorySearch: {e}")
        vectorstore = None
|
|
|
58 |
text_splitter = RecursiveCharacterTextSplitter(
|
59 |
chunk_size=1000,
|
60 |
chunk_overlap=200,
|
|
|
64 |
logger.info("Initialized in-memory DocArrayInMemorySearch vector store and RecursiveCharacterTextSplitter.")
|
65 |
|
66 |
|
67 |
+
# --- Utility Functions ---
|
68 |
+
def extract_youtube_id(url: str) -> str:
    """Extract the 11-character YouTube video ID from a URL or a bare ID.

    Patterns are tried from most to least specific:

    1. ``youtube.com/watch?...v=<id>`` (``v`` may be any query parameter)
    2. ``youtu.be/<id>``
    3. ``youtube.com/embed|shorts|live|v/<id>``
    4. a stand-alone 11-character token anywhere in the text (fallback, so
       that a bare video ID is still accepted)

    Args:
        url: A YouTube URL in one of the forms above, or a bare video ID.

    Returns:
        The video ID, or an empty string when ``url`` is not a string, is
        blank, or contains no recognisable ID.
    """
    if not isinstance(url, str):
        return ""
    text = url.strip()
    if not text:
        return ""

    # An ID is exactly 11 characters from this alphabet. The look-ahead stops
    # the capture from being the prefix of a longer token.
    video_id = r'([A-Za-z0-9_-]{11})(?![A-Za-z0-9_-])'
    patterns = (
        r'youtube(?:-nocookie)?\.com/watch\?(?:[^#\s]*&)?v=' + video_id,
        r'youtu\.be/' + video_id,
        r'youtube(?:-nocookie)?\.com/(?:embed|shorts|live|v)/' + video_id,
        # Fallback: the look-behind means only whole 11-character tokens
        # match, never a slice cut from the middle of a longer word.
        r'(?<![A-Za-z0-9_-])' + video_id,
    )
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(1)
    return ""
|
80 |
|
81 |
def add_document_to_vector_store(content: str, source: str, metadata: dict = None):
|
82 |
"""
|
83 |
Adds content to the global vector store.
|
84 |
Chunks the content and creates LangChain Documents.
|
85 |
"""
|
86 |
+
if vectorstore is None:
|
87 |
+
logger.warning("Vector store not initialized. Cannot add document.")
|
88 |
return
|
89 |
|
90 |
try:
|
|
|
96 |
doc_metadata.update(metadata)
|
97 |
docs.append(Document(page_content=chunk, metadata=doc_metadata))
|
98 |
|
99 |
+
# When vectorstore is initialized with embedding, add_documents might not need it again.
|
100 |
+
# The store already holds the embedding model, so it is not passed again in the call below.
|
101 |
+
vectorstore.add_documents(docs) # No `embedding` argument needed here if initialized in __init__
|
|
|
102 |
logger.info(f"Added {len(docs)} chunks from '{source}' to the vector store.")
|
103 |
except Exception as e:
|
104 |
logger.error(f"Error adding document from '{source}' to vector store: {e}")
|
105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
# --- Enhanced Tools ---
|
107 |
class WikiSearchTool(Tool):
|
108 |
"""Enhanced Wikipedia search with better formatting and error handling"""
|
|
|
308 |
}
|
309 |
output_type = "string"
|
310 |
|
|
|
311 |
def forward(self, query: str, k: int = 3) -> str:
    """Return the stored chunks most similar to ``query`` as formatted text.

    Args:
        query: Text to search the module-level vector store for.
        k: Number of chunks to request from the store.

    Returns:
        One formatted section per retrieved chunk (source, chunk index,
        title and content), separated by blank lines. A plain message is
        returned instead when the store or embeddings are unavailable, when
        nothing is retrieved, or when the lookup raises.
    """
    if embeddings is None or vectorstore is None:
        return "Vector store is not initialized or embeddings are missing. No documents available for retrieval."

    try:
        logger.info(f"Retrieving {k} chunks from DocArrayInMemorySearch for query: {query}")
        # Only the query text is passed to the search call; no embedding
        # object is supplied here.
        hits = vectorstore.similarity_search(query, k=k)
        if not hits:
            return "No relevant information found in the vector store for this query."

        sections = []
        for position, hit in enumerate(hits, start=1):
            meta = hit.metadata
            origin = meta.get('source', 'Unknown Source')
            heading = meta.get('title', 'N/A')
            chunk_no = meta.get('chunk_index', 'N/A')
            sections.append(
                f"--- Retrieved Document Chunk {position} ---\n"
                f"Source: {origin} (Chunk: {chunk_no})\n"
                f"Title: {heading}\n"
                f"Content: {hit.page_content}\n"
            )
        return "\n\n".join(sections)
    except Exception as e:
        # Errors are reported back as text rather than raised to the caller.
        logger.error(f"Error retrieving from vector store for query '{query}': {e}")
        return f"Error retrieving from vector store: {str(e)}"
|
340 |
|
|
|
341 |
class ChessAnalysisAPITool(Tool):
|
342 |
"""
|
343 |
Analyzes a chess position provided in FEN format using a remote chess engine API (chess-api.com).
|
|
|
404 |
logger.error(f"An unexpected error occurred during remote chess analysis for FEN '{fen_string}': {e}")
|
405 |
return f"An unexpected error occurred during chess analysis: {str(e)}"
|
406 |
|
407 |
+
|
408 |
# --- Agent Initialization ---
|
409 |
class BasicAgent:
|
410 |
def __init__(self):
|
|
|
438 |
logger.info("Adding RetrievalTool to the agent's tools.")
|
439 |
base_tools.append(RetrievalTool())
|
440 |
else:
|
441 |
+
logger.warning("RetrievalTool not added because vector store or embeddings are not initialized.")
|
|
|
442 |
|
443 |
return base_tools
|
444 |
|