wt002 committed
Commit df2ad57 · verified · 1 Parent(s): fe93293

update app.py

Files changed (1)
  1. app.py +45 -36
app.py CHANGED
@@ -10,7 +10,8 @@ from youtube_transcript_api import YouTubeTranscriptApi
 from smolagents import tool, Tool, CodeAgent, DuckDuckGoSearchTool, HfApiModel, VisitWebpageTool, SpeechToTextTool, FinalAnswerTool
 from langchain_community.document_loaders import WikipediaLoader, PyPDFLoader, TextLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_huggingface import HuggingFaceEmbeddings
+# Use the new import for HuggingFaceEmbeddings
+from langchain_huggingface import HuggingFaceEmbeddings # <--- IMPORTANT: Updated import
 from langchain_community.vectorstores import DocArrayInMemorySearch
 from langchain_core.documents import Document
 from dotenv import load_dotenv
@@ -22,6 +23,10 @@ import uuid
 import concurrent.futures
 import time
 
+# Import DocList from docarray
+from docarray import DocList # <--- IMPORTANT: Added this import
+
+
 # --- Initialize logging ---
 LOG_FILE_PATH = "agent_activity.log"
 logging.basicConfig(
@@ -42,17 +47,14 @@ if not HF_API_TOKEN:
 
 # --- Global Vector Store and Embeddings ---
 try:
-    from langchain_huggingface import HuggingFaceEmbeddings # Correct import for embeddings
-
     embeddings = HuggingFaceEmbeddings(model_name=HF_EMBEDDING_MODEL_ID)
     logger.info(f"Initialized HuggingFaceEmbeddings with model: {HF_EMBEDDING_MODEL_ID}")
 except Exception as e:
     logger.error(f"Failed to initialize HuggingFaceEmbeddings: {e}. Please ensure the model_id is correct and dependencies are installed.")
     embeddings = None
 
-# Initialize DocArrayInMemorySearch WITHOUT ANY arguments here.
-# We'll rely on passing the embedding to add_documents and similarity_search explicitly.
-vectorstore = DocArrayInMemorySearch() if embeddings else None # <--- REVERTED TO THIS SIMPLE INIT
+# Initialize DocArrayInMemorySearch WITH the required arguments: doc_index and embedding
+vectorstore = DocArrayInMemorySearch(doc_index=DocList(), embedding=embeddings) if embeddings else None # <--- FIXED THIS LINE
 text_splitter = RecursiveCharacterTextSplitter(
     chunk_size=1000,
     chunk_overlap=200,
@@ -62,14 +64,27 @@ text_splitter = RecursiveCharacterTextSplitter(
 logger.info("Initialized in-memory DocArrayInMemorySearch vector store and RecursiveCharacterTextSplitter.")
 
 
+# --- Utility Functions ---
+def extract_youtube_id(url: str) -> str:
+    """Extract YouTube ID from various URL formats"""
+    patterns = [
+        r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/watch\?v=([^&]+)',
+        r'(?:https?:\/\/)?youtu\.be\/([^?]+)',
+        r'([a-zA-Z0-9_-]{11})'
+    ]
+    for pattern in patterns:
+        match = re.search(pattern, url)
+        if match:
+            return match.group(1)
+    return ""
 
 def add_document_to_vector_store(content: str, source: str, metadata: dict = None):
     """
     Adds content to the global vector store.
     Chunks the content and creates LangChain Documents.
     """
-    if vectorstore is None or embeddings is None: # Explicitly check embeddings
-        logger.warning("Vector store or embeddings not initialized. Cannot add document.")
+    if vectorstore is None:
+        logger.warning("Vector store not initialized. Cannot add document.")
         return
 
     try:
@@ -81,30 +96,13 @@ def add_document_to_vector_store(content: str, source: str, metadata: dict = None):
                 doc_metadata.update(metadata)
             docs.append(Document(page_content=chunk, metadata=doc_metadata))
 
-        # Pass the embeddings function here when adding documents.
-        # This is often the more reliable way for DocArrayInMemorySearch
-        # if its __init__ doesn't directly take `embedding`.
-        vectorstore.add_documents(docs, embedding=embeddings) # <--- IMPORTANT: Pass embeddings here
+        # When vectorstore is initialized with embedding, add_documents might not need it again.
+        # But explicitly passing it is safer if there are multiple ways to initialize.
+        vectorstore.add_documents(docs) # No `embedding` argument needed here if initialized in __init__
         logger.info(f"Added {len(docs)} chunks from '{source}' to the vector store.")
     except Exception as e:
         logger.error(f"Error adding document from '{source}' to vector store: {e}")
 
-
-# --- Utility Functions ---
-def extract_youtube_id(url: str) -> str:
-    """Extract YouTube ID from various URL formats"""
-    patterns = [
-        r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/watch\?v=([^&]+)',
-        r'(?:https?:\/\/)?youtu\.be\/([^?]+)',
-        r'([a-zA-Z0-9_-]{11})'
-    ]
-    for pattern in patterns:
-        match = re.search(pattern, url)
-        if match:
-            return match.group(1)
-    return ""
-
-
 # --- Enhanced Tools ---
 class WikiSearchTool(Tool):
     """Enhanced Wikipedia search with better formatting and error handling"""
@@ -310,25 +308,36 @@ class RetrievalTool(Tool):
     }
     output_type = "string"
 
-
    def forward(self, query: str, k: int = 3) -> str:
-        if vectorstore is None or embeddings is None: # Added check for embeddings
+        if vectorstore is None or embeddings is None:
            return "Vector store is not initialized or embeddings are missing. No documents available for retrieval."
 
        try:
            logger.info(f"Retrieving {k} chunks from DocArrayInMemorySearch for query: {query}")
-            # Explicitly pass the embedding for similarity search if it's required for query embedding
-            retrieved_docs = vectorstore.similarity_search(query, k=k, embedding=embeddings) # <--- IMPORTANT: Pass embeddings here
+            # Ensure similarity_search uses the vectorstore's internal embedding if initialized correctly
+            # or if it takes an explicit embedding argument here.
+            # With DocArrayInMemorySearch initialized with `embedding=embeddings`, this call should be fine.
+            retrieved_docs = vectorstore.similarity_search(query, k=k)
 
            if not retrieved_docs:
                return "No relevant information found in the vector store for this query."
 
-            # ... (rest of the method) ...
+            formatted_results = []
+            for i, doc in enumerate(retrieved_docs):
+                source = doc.metadata.get('source', 'Unknown Source')
+                title = doc.metadata.get('title', 'N/A')
+                chunk_index = doc.metadata.get('chunk_index', 'N/A')
+                formatted_results.append(
+                    f"--- Retrieved Document Chunk {i+1} ---\n"
+                    f"Source: {source} (Chunk: {chunk_index})\n"
+                    f"Title: {title}\n"
+                    f"Content: {doc.page_content}\n"
+                )
+            return "\n\n".join(formatted_results)
        except Exception as e:
            logger.error(f"Error retrieving from vector store for query '{query}': {e}")
            return f"Error retrieving from vector store: {str(e)}"
 
-
 class ChessAnalysisAPITool(Tool):
     """
     Analyzes a chess position provided in FEN format using a remote chess engine API (chess-api.com).
@@ -395,6 +404,7 @@ class ChessAnalysisAPITool(Tool):
            logger.error(f"An unexpected error occurred during remote chess analysis for FEN '{fen_string}': {e}")
            return f"An unexpected error occurred during chess analysis: {str(e)}"
 
+
 # --- Agent Initialization ---
 class BasicAgent:
    def __init__(self):
@@ -428,8 +438,7 @@ class BasicAgent:
            logger.info("Adding RetrievalTool to the agent's tools.")
            base_tools.append(RetrievalTool())
        else:
-            logger.warning("RetrievalTool not added because vector store or embeddings are not initialized.")
-
+            logger.warning("RetrievalTool not added because vector store or embeddings are not initialized.")
 
        return base_tools
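
Note on the DocArrayInMemorySearch change above: a minimal sketch (not the committed code) of the same round trip — create the store, add_documents, similarity_search — using the from_params factory that langchain_community provides for building an empty in-memory index around an embedding function, so neither later call needs an embedding argument. The embedding model name below is an assumption for illustration; this diff does not show the value of HF_EMBEDDING_MODEL_ID.

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain_core.documents import Document

# Assumed model for illustration; app.py uses HF_EMBEDDING_MODEL_ID, not shown in this diff.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Empty store bound to the embedding function; from_params builds the docarray
# index internally, so no doc_index has to be constructed by hand.
vectorstore = DocArrayInMemorySearch.from_params(embeddings)

# Documents are embedded when added; queries reuse the same embedding function.
vectorstore.add_documents([
    Document(page_content="Mercury is the closest planet to the Sun.",
             metadata={"source": "demo", "chunk_index": 0}),
])
hits = vectorstore.similarity_search("Which planet is closest to the Sun?", k=1)
print(hits[0].page_content)

If from_params is not available in the installed version, DocArrayInMemorySearch.from_documents(docs, embeddings) is the other factory inherited from the base VectorStore class; either route avoids guessing the constructor's doc_index/embedding arguments.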