Final_Assignment_Project

Sleeping

App Files Community

wt002 committed on Jun 3

Commit

15e7b1c

verified ·

1 Parent(s): 40b3768

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -37

app.py CHANGED Viewed

@@ -42,8 +42,7 @@ if not HF_API_TOKEN:
 # --- Global Vector Store and Embeddings ---
 try:
-    # Make sure to import HuggingFaceEmbeddings from the new package
-    from langchain_huggingface import HuggingFaceEmbeddings # This is the correct import now
     embeddings = HuggingFaceEmbeddings(model_name=HF_EMBEDDING_MODEL_ID)
     logger.info(f"Initialized HuggingFaceEmbeddings with model: {HF_EMBEDDING_MODEL_ID}")
@@ -51,10 +50,9 @@ except Exception as e:
     logger.error(f"Failed to initialize HuggingFaceEmbeddings: {e}. Please ensure the model_id is correct and dependencies are installed.")
     embeddings = None
-# Initialize DocArrayInMemorySearch WITH the embedding function here
-# This will likely work with newer versions of DocArrayInMemorySearch
-# as it needs the embedding function for its internal doc_index.
-vectorstore = DocArrayInMemorySearch(embedding=embeddings) if embeddings else None # <--- FIXED THIS LINE AGAIN
 text_splitter = RecursiveCharacterTextSplitter(
     chunk_size=1000,
     chunk_overlap=200,
@@ -70,8 +68,8 @@ def add_document_to_vector_store(content: str, source: str, metadata: dict = Non
     Adds content to the global vector store.
     Chunks the content and creates LangChain Documents.
     """
-    if vectorstore is None: # Embeddings check is less critical here if vectorstore is already None
-        logger.warning("Vector store not initialized. Cannot add document.")
         return
     try:
@@ -83,18 +81,15 @@ def add_document_to_vector_store(content: str, source: str, metadata: dict = Non
                 doc_metadata.update(metadata)
             docs.append(Document(page_content=chunk, metadata=doc_metadata))
-        # When `vectorstore` was initialized with `embedding=embeddings`,
-        # `add_documents` often doesn't *also* need `embedding=embeddings`
-        # if the vectorstore already knows its embedding function.
-        # However, passing it explicitly here doesn't hurt and provides clarity.
-        vectorstore.add_documents(docs) # Changed from vectorstore.add_documents(docs, embedding=embeddings)
-                                        # as it should now pick up the embedding from initialization.
         logger.info(f"Added {len(docs)} chunks from '{source}' to the vector store.")
     except Exception as e:
         logger.error(f"Error adding document from '{source}' to vector store: {e}")
 # --- Utility Functions ---
 def extract_youtube_id(url: str) -> str:
     """Extract YouTube ID from various URL formats"""
@@ -315,29 +310,21 @@ class RetrievalTool(Tool):
     }
     output_type = "string"
-    def forward(self, query: str, k: int = 3) -> str:
-        if vectorstore is None:
-            return "Vector store is not initialized. No documents available for retrieval."
         try:
             logger.info(f"Retrieving {k} chunks from DocArrayInMemorySearch for query: {query}")
-            retrieved_docs = vectorstore.similarity_search(query, k=k)
             if not retrieved_docs:
                 return "No relevant information found in the vector store for this query."
-            formatted_results = []
-            for i, doc in enumerate(retrieved_docs):
-                source = doc.metadata.get('source', 'Unknown Source')
-                title = doc.metadata.get('title', 'N/A')
-                chunk_index = doc.metadata.get('chunk_index', 'N/A')
-                formatted_results.append(
-                    f"--- Retrieved Document Chunk {i+1} ---\n"
-                    f"Source: {source} (Chunk: {chunk_index})\n"
-                    f"Title: {title}\n"
-                    f"Content: {doc.page_content}\n"
-                )
-            return "\n\n".join(formatted_results)
         except Exception as e:
             logger.error(f"Error retrieving from vector store for query '{query}': {e}")
             return f"Error retrieving from vector store: {str(e)}"
@@ -562,23 +549,24 @@ You are an advanced, helpful, and highly analytical research assistant. Your goa
         agent.prompt_templates["system_prompt"] = system_prompt
         return agent
-def __call__(self, question: str) -> str:
-        # Logging the initial receipt of the question
         logger.info(f"Received question: {question[:200]}...")
-        # print statement for immediate console feedback (optional, for debugging/display)
         print(f"Agent received question (first 50 chars): {question[:50]}...")
         try:
             global vectorstore
-            # Re-initialize vectorstore for a new session, passing the embeddings
-            # This is crucial for newer versions of DocArrayInMemorySearch
             if embeddings:
-                vectorstore = DocArrayInMemorySearch(embedding=embeddings)
                 logger.info("DocArrayInMemorySearch re-initialized for new session.")
             else:
                 logger.warning("Embeddings not initialized, cannot re-initialize DocArrayInMemorySearch.")
                 return "Error: Embedding model not loaded, cannot process request."
             # --- Implement a timeout for the agent's run method ---
             AGENT_TIMEOUT_SECONDS = 120  # Max time in seconds for the agent to respond

 # --- Global Vector Store and Embeddings ---
 try:
+    from langchain_huggingface import HuggingFaceEmbeddings # Correct import for embeddings
     embeddings = HuggingFaceEmbeddings(model_name=HF_EMBEDDING_MODEL_ID)
     logger.info(f"Initialized HuggingFaceEmbeddings with model: {HF_EMBEDDING_MODEL_ID}")
     logger.error(f"Failed to initialize HuggingFaceEmbeddings: {e}. Please ensure the model_id is correct and dependencies are installed.")
     embeddings = None
+# Initialize DocArrayInMemorySearch WITHOUT ANY arguments here.
+# We'll rely on passing the embedding to add_documents and similarity_search explicitly.
+vectorstore = DocArrayInMemorySearch() if embeddings else None # <--- REVERTED TO THIS SIMPLE INIT
 text_splitter = RecursiveCharacterTextSplitter(
     chunk_size=1000,
     chunk_overlap=200,
     Adds content to the global vector store.
     Chunks the content and creates LangChain Documents.
     """
+    if vectorstore is None or embeddings is None: # Explicitly check embeddings
+        logger.warning("Vector store or embeddings not initialized. Cannot add document.")
         return
     try:
                 doc_metadata.update(metadata)
             docs.append(Document(page_content=chunk, metadata=doc_metadata))
+        # Pass the embeddings function here when adding documents.
+        # This is often the more reliable way for DocArrayInMemorySearch
+        # if its __init__ doesn't directly take `embedding`.
+        vectorstore.add_documents(docs, embedding=embeddings) # <--- IMPORTANT: Pass embeddings here
         logger.info(f"Added {len(docs)} chunks from '{source}' to the vector store.")
     except Exception as e:
         logger.error(f"Error adding document from '{source}' to vector store: {e}")
 # --- Utility Functions ---
 def extract_youtube_id(url: str) -> str:
     """Extract YouTube ID from various URL formats"""
     }
     output_type = "string"
+    class RetrievalTool(Tool):
+    # ... (rest of class definition) ...
+        def forward(self, query: str, k: int = 3) -> str:
+        if vectorstore is None or embeddings is None: # Added check for embeddings
+            return "Vector store is not initialized or embeddings are missing. No documents available for retrieval."
         try:
             logger.info(f"Retrieving {k} chunks from DocArrayInMemorySearch for query: {query}")
+            # Explicitly pass the embedding for similarity search if it's required for query embedding
+            retrieved_docs = vectorstore.similarity_search(query, k=k, embedding=embeddings) # <--- IMPORTANT: Pass embeddings here
             if not retrieved_docs:
                 return "No relevant information found in the vector store for this query."
+            # ... (rest of the method) ...
         except Exception as e:
             logger.error(f"Error retrieving from vector store for query '{query}': {e}")
             return f"Error retrieving from vector store: {str(e)}"
         agent.prompt_templates["system_prompt"] = system_prompt
         return agent
+        def __call__(self, question: str) -> str:
         logger.info(f"Received question: {question[:200]}...")
         print(f"Agent received question (first 50 chars): {question[:50]}...")
         try:
             global vectorstore
+            # Re-initialize vectorstore for a new session without arguments
+            # This relies on the add_documents and similarity_search methods getting the embedding
             if embeddings:
+                vectorstore = DocArrayInMemorySearch() # <--- REVERTED TO THIS SIMPLE INIT HERE TOO
                 logger.info("DocArrayInMemorySearch re-initialized for new session.")
             else:
                 logger.warning("Embeddings not initialized, cannot re-initialize DocArrayInMemorySearch.")
                 return "Error: Embedding model not loaded, cannot process request."
             # --- Implement a timeout for the agent's run method ---
             AGENT_TIMEOUT_SECONDS = 120  # Max time in seconds for the agent to respond