wt002 committed
Commit df2ad57 · verified · 1 Parent(s): fe93293

update app.py

Files changed (1)
  1. app.py +45 -36
app.py CHANGED
@@ -10,7 +10,8 @@ from youtube_transcript_api import YouTubeTranscriptApi
 from smolagents import tool, Tool, CodeAgent, DuckDuckGoSearchTool, HfApiModel, VisitWebpageTool, SpeechToTextTool, FinalAnswerTool
 from langchain_community.document_loaders import WikipediaLoader, PyPDFLoader, TextLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_huggingface import HuggingFaceEmbeddings
+# Use the new import for HuggingFaceEmbeddings
+from langchain_huggingface import HuggingFaceEmbeddings # <--- IMPORTANT: Updated import
 from langchain_community.vectorstores import DocArrayInMemorySearch
 from langchain_core.documents import Document
 from dotenv import load_dotenv
@@ -22,6 +23,10 @@ import uuid
 import concurrent.futures
 import time
 
+# Import DocList from docarray
+from docarray import DocList # <--- IMPORTANT: Added this import
+
+
 # --- Initialize logging ---
 LOG_FILE_PATH = "agent_activity.log"
 logging.basicConfig(
@@ -42,17 +47,14 @@ if not HF_API_TOKEN:
 
 # --- Global Vector Store and Embeddings ---
 try:
-    from langchain_huggingface import HuggingFaceEmbeddings # Correct import for embeddings
-
     embeddings = HuggingFaceEmbeddings(model_name=HF_EMBEDDING_MODEL_ID)
     logger.info(f"Initialized HuggingFaceEmbeddings with model: {HF_EMBEDDING_MODEL_ID}")
 except Exception as e:
     logger.error(f"Failed to initialize HuggingFaceEmbeddings: {e}. Please ensure the model_id is correct and dependencies are installed.")
     embeddings = None
 
-# Initialize DocArrayInMemorySearch WITHOUT ANY arguments here.
-# We'll rely on passing the embedding to add_documents and similarity_search explicitly.
-vectorstore = DocArrayInMemorySearch() if embeddings else None # <--- REVERTED TO THIS SIMPLE INIT
+# Initialize DocArrayInMemorySearch WITH the required arguments: doc_index and embedding
+vectorstore = DocArrayInMemorySearch(doc_index=DocList(), embedding=embeddings) if embeddings else None # <--- FIXED THIS LINE
 text_splitter = RecursiveCharacterTextSplitter(
     chunk_size=1000,
     chunk_overlap=200,
@@ -62,14 +64,27 @@ text_splitter = RecursiveCharacterTextSplitter(
 logger.info("Initialized in-memory DocArrayInMemorySearch vector store and RecursiveCharacterTextSplitter.")
 
 
+# --- Utility Functions ---
+def extract_youtube_id(url: str) -> str:
+    """Extract YouTube ID from various URL formats"""
+    patterns = [
+        r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/watch\?v=([^&]+)',
+        r'(?:https?:\/\/)?youtu\.be\/([^?]+)',
+        r'([a-zA-Z0-9_-]{11})'
+    ]
+    for pattern in patterns:
+        match = re.search(pattern, url)
+        if match:
+            return match.group(1)
+    return ""
 
 def add_document_to_vector_store(content: str, source: str, metadata: dict = None):
     """
     Adds content to the global vector store.
     Chunks the content and creates LangChain Documents.
     """
-    if vectorstore is None or embeddings is None: # Explicitly check embeddings
-        logger.warning("Vector store or embeddings not initialized. Cannot add document.")
+    if vectorstore is None:
+        logger.warning("Vector store not initialized. Cannot add document.")
         return
 
     try:
@@ -81,30 +96,13 @@ def add_document_to_vector_store(content: str, source: str, metadata: dict = None):
                 doc_metadata.update(metadata)
             docs.append(Document(page_content=chunk, metadata=doc_metadata))
 
-        # Pass the embeddings function here when adding documents.
-        # This is often the more reliable way for DocArrayInMemorySearch
-        # if its __init__ doesn't directly take `embedding`.
-        vectorstore.add_documents(docs, embedding=embeddings) # <--- IMPORTANT: Pass embeddings here
+        # When vectorstore is initialized with embedding, add_documents might not need it again.
+        # But explicitly passing it is safer if there are multiple ways to initialize.
+        vectorstore.add_documents(docs) # No `embedding` argument needed here if initialized in __init__
         logger.info(f"Added {len(docs)} chunks from '{source}' to the vector store.")
     except Exception as e:
         logger.error(f"Error adding document from '{source}' to vector store: {e}")
 
-
-# --- Utility Functions ---
-def extract_youtube_id(url: str) -> str:
-    """Extract YouTube ID from various URL formats"""
-    patterns = [
-        r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/watch\?v=([^&]+)',
-        r'(?:https?:\/\/)?youtu\.be\/([^?]+)',
-        r'([a-zA-Z0-9_-]{11})'
-    ]
-    for pattern in patterns:
-        match = re.search(pattern, url)
-        if match:
-            return match.group(1)
-    return ""
-
-
 # --- Enhanced Tools ---
 class WikiSearchTool(Tool):
     """Enhanced Wikipedia search with better formatting and error handling"""
@@ -310,25 +308,36 @@ class RetrievalTool(Tool):
     }
     output_type = "string"
 
-
    def forward(self, query: str, k: int = 3) -> str:
-        if vectorstore is None or embeddings is None: # Added check for embeddings
+        if vectorstore is None or embeddings is None:
            return "Vector store is not initialized or embeddings are missing. No documents available for retrieval."
 
        try:
            logger.info(f"Retrieving {k} chunks from DocArrayInMemorySearch for query: {query}")
-            # Explicitly pass the embedding for similarity search if it's required for query embedding
-            retrieved_docs = vectorstore.similarity_search(query, k=k, embedding=embeddings) # <--- IMPORTANT: Pass embeddings here
+            # Ensure similarity_search uses the vectorstore's internal embedding if initialized correctly
+            # or if it takes an explicit embedding argument here.
+            # With DocArrayInMemorySearch initialized with `embedding=embeddings`, this call should be fine.
+            retrieved_docs = vectorstore.similarity_search(query, k=k)
 
            if not retrieved_docs:
                return "No relevant information found in the vector store for this query."
 
-            # ... (rest of the method) ...
+            formatted_results = []
+            for i, doc in enumerate(retrieved_docs):
+                source = doc.metadata.get('source', 'Unknown Source')
+                title = doc.metadata.get('title', 'N/A')
+                chunk_index = doc.metadata.get('chunk_index', 'N/A')
+                formatted_results.append(
+                    f"--- Retrieved Document Chunk {i+1} ---\n"
+                    f"Source: {source} (Chunk: {chunk_index})\n"
+                    f"Title: {title}\n"
+                    f"Content: {doc.page_content}\n"
+                )
+            return "\n\n".join(formatted_results)
        except Exception as e:
            logger.error(f"Error retrieving from vector store for query '{query}': {e}")
            return f"Error retrieving from vector store: {str(e)}"
 
-
 class ChessAnalysisAPITool(Tool):
     """
     Analyzes a chess position provided in FEN format using a remote chess engine API (chess-api.com).
@@ -395,6 +404,7 @@ class ChessAnalysisAPITool(Tool):
            logger.error(f"An unexpected error occurred during remote chess analysis for FEN '{fen_string}': {e}")
            return f"An unexpected error occurred during chess analysis: {str(e)}"
 
+
 # --- Agent Initialization ---
 class BasicAgent:
    def __init__(self):
@@ -428,8 +438,7 @@ class BasicAgent:
            logger.info("Adding RetrievalTool to the agent's tools.")
            base_tools.append(RetrievalTool())
        else:
-            logger.warning("RetrievalTool not added because vector store or embeddings are not initialized.")
-
+            logger.warning("RetrievalTool not added because vector store or embeddings are not initialized.")
 
        return base_tools
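
Note on the DocArrayInMemorySearch change above: a minimal sketch (not the committed code) of the same round trip — create the store, add_documents, similarity_search — using the from_params factory that langchain_community provides for building an empty in-memory index around an embedding function, so neither later call needs an embedding argument. The embedding model name below is an assumption for illustration; this diff does not show the value of HF_EMBEDDING_MODEL_ID.

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain_core.documents import Document

# Assumed model for illustration; app.py uses HF_EMBEDDING_MODEL_ID, not shown in this diff.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Empty store bound to the embedding function; from_params builds the docarray
# index internally, so no doc_index has to be constructed by hand.
vectorstore = DocArrayInMemorySearch.from_params(embeddings)

# Documents are embedded when added; queries reuse the same embedding function.
vectorstore.add_documents([
    Document(page_content="Mercury is the closest planet to the Sun.",
             metadata={"source": "demo", "chunk_index": 0}),
])
hits = vectorstore.similarity_search("Which planet is closest to the Sun?", k=1)
print(hits[0].page_content)

If from_params is not available in the installed version, DocArrayInMemorySearch.from_documents(docs, embeddings) is the other factory inherited from the base VectorStore class; either route avoids guessing the constructor's doc_index/embedding arguments.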