Final_Assignment_Project

Sleeping

App Files Community

wt002 committed on Jun 3

Commit

40b3768

verified ·

1 Parent(s): d5a8fef

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -24

app.py CHANGED Viewed

@@ -43,8 +43,7 @@ if not HF_API_TOKEN:
 # --- Global Vector Store and Embeddings ---
 try:
     # Make sure to import HuggingFaceEmbeddings from the new package
-    # if you followed the previous advice to resolve the deprecation warning
-    from langchain_huggingface import HuggingFaceEmbeddings # Or keep langchain_community if you haven't migrated yet
     embeddings = HuggingFaceEmbeddings(model_name=HF_EMBEDDING_MODEL_ID)
     logger.info(f"Initialized HuggingFaceEmbeddings with model: {HF_EMBEDDING_MODEL_ID}")
@@ -52,8 +51,10 @@ except Exception as e:
     logger.error(f"Failed to initialize HuggingFaceEmbeddings: {e}. Please ensure the model_id is correct and dependencies are installed.")
     embeddings = None
-# Initialize DocArrayInMemorySearch WITHOUT the embedding_function argument here
-vectorstore = DocArrayInMemorySearch() if embeddings else None # <--- FIXED THIS LINE
 text_splitter = RecursiveCharacterTextSplitter(
     chunk_size=1000,
     chunk_overlap=200,
@@ -63,27 +64,14 @@ text_splitter = RecursiveCharacterTextSplitter(
 logger.info("Initialized in-memory DocArrayInMemorySearch vector store and RecursiveCharacterTextSplitter.")
-# --- Utility Functions ---
-def extract_youtube_id(url: str) -> str:
-    """Extract YouTube ID from various URL formats"""
-    patterns = [
-        r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/watch\?v=([^&]+)',
-        r'(?:https?:\/\/)?youtu\.be\/([^?]+)',
-        r'([a-zA-Z0-9_-]{11})'
-    ]
-    for pattern in patterns:
-        match = re.search(pattern, url)
-        if match:
-            return match.group(1)
-    return ""
 def add_document_to_vector_store(content: str, source: str, metadata: dict = None):
     """
     Adds content to the global vector store.
     Chunks the content and creates LangChain Documents.
     """
-    if vectorstore is None or embeddings is None: # Added check for embeddings too
-        logger.warning("Vector store or embeddings not initialized. Cannot add document.")
         return
     try:
@@ -95,14 +83,33 @@ def add_document_to_vector_store(content: str, source: str, metadata: dict = Non
                 doc_metadata.update(metadata)
             docs.append(Document(page_content=chunk, metadata=doc_metadata))
-        # Pass the embeddings function here when adding documents
-        vectorstore.add_documents(docs, embedding=embeddings) # <--- IMPORTANT: Pass embeddings here
         logger.info(f"Added {len(docs)} chunks from '{source}' to the vector store.")
     except Exception as e:
         logger.error(f"Error adding document from '{source}' to vector store: {e}")
 # --- Enhanced Tools ---
 class WikiSearchTool(Tool):
     """Enhanced Wikipedia search with better formatting and error handling"""
@@ -556,19 +563,24 @@ You are an advanced, helpful, and highly analytical research assistant. Your goa
         return agent
 def __call__(self, question: str) -> str:
         logger.info(f"Received question: {question[:200]}...")
         try:
             global vectorstore
             if embeddings:
-                vectorstore = DocArrayInMemorySearch(embedding_function=embeddings)
                 logger.info("DocArrayInMemorySearch re-initialized for new session.")
             else:
                 logger.warning("Embeddings not initialized, cannot re-initialize DocArrayInMemorySearch.")
                 return "Error: Embedding model not loaded, cannot process request."
             # --- Implement a timeout for the agent's run method ---
-            # Max time in seconds for the agent to respond
-            AGENT_TIMEOUT_SECONDS = 120
             with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
                 future = executor.submit(self.agent.run, question)
@@ -584,6 +596,8 @@ def __call__(self, question: str) -> str:
                     return f"Error processing your request: {str(e)}. Please try again or rephrase your question."
             logger.info(f"Response generated successfully for question: {question[:200]}")
             return response
         except Exception as e:
             # This outer catch is for issues before agent.run is called or unhandled by the ThreadPoolExecutor

 # --- Global Vector Store and Embeddings ---
 try:
     # Make sure to import HuggingFaceEmbeddings from the new package
+    from langchain_huggingface import HuggingFaceEmbeddings # This is the correct import now
     embeddings = HuggingFaceEmbeddings(model_name=HF_EMBEDDING_MODEL_ID)
     logger.info(f"Initialized HuggingFaceEmbeddings with model: {HF_EMBEDDING_MODEL_ID}")
     logger.error(f"Failed to initialize HuggingFaceEmbeddings: {e}. Please ensure the model_id is correct and dependencies are installed.")
     embeddings = None
+# Initialize DocArrayInMemorySearch WITH the embedding function here
+# This will likely work with newer versions of DocArrayInMemorySearch
+# as it needs the embedding function for its internal doc_index.
+vectorstore = DocArrayInMemorySearch(embedding=embeddings) if embeddings else None # <--- FIXED THIS LINE AGAIN
 text_splitter = RecursiveCharacterTextSplitter(
     chunk_size=1000,
     chunk_overlap=200,
 logger.info("Initialized in-memory DocArrayInMemorySearch vector store and RecursiveCharacterTextSplitter.")
 def add_document_to_vector_store(content: str, source: str, metadata: dict = None):
     """
     Adds content to the global vector store.
     Chunks the content and creates LangChain Documents.
     """
+    if vectorstore is None: # Embeddings check is less critical here if vectorstore is already None
+        logger.warning("Vector store not initialized. Cannot add document.")
         return
     try:
                 doc_metadata.update(metadata)
             docs.append(Document(page_content=chunk, metadata=doc_metadata))
+        # `vectorstore` was initialized with `embedding=embeddings`, so it
+        # already knows its embedding function; `add_documents` therefore
+        # no longer needs to be given `embedding=embeddings` explicitly,
+        # and the argument is dropped in the call below.
+        vectorstore.add_documents(docs) # Changed from vectorstore.add_documents(docs, embedding=embeddings)
+                                        # as it should now pick up the embedding from initialization.
         logger.info(f"Added {len(docs)} chunks from '{source}' to the vector store.")
     except Exception as e:
         logger.error(f"Error adding document from '{source}' to vector store: {e}")
+# --- Utility Functions ---
+def extract_youtube_id(url: str) -> str:
+    """Extract YouTube ID from various URL formats"""
+    patterns = [
+        r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/watch\?v=([^&]+)',
+        r'(?:https?:\/\/)?youtu\.be\/([^?]+)',
+        r'([a-zA-Z0-9_-]{11})'
+    ]
+    for pattern in patterns:
+        match = re.search(pattern, url)
+        if match:
+            return match.group(1)
+    return ""
 # --- Enhanced Tools ---
 class WikiSearchTool(Tool):
     """Enhanced Wikipedia search with better formatting and error handling"""
         return agent
 def __call__(self, question: str) -> str:
+        # Logging the initial receipt of the question
         logger.info(f"Received question: {question[:200]}...")
+        # print statement for immediate console feedback (optional, for debugging/display)
+        print(f"Agent received question (first 50 chars): {question[:50]}...")
         try:
             global vectorstore
+            # Re-initialize vectorstore for a new session, passing the embeddings
+            # This is crucial for newer versions of DocArrayInMemorySearch
             if embeddings:
+                vectorstore = DocArrayInMemorySearch(embedding=embeddings)
                 logger.info("DocArrayInMemorySearch re-initialized for new session.")
             else:
                 logger.warning("Embeddings not initialized, cannot re-initialize DocArrayInMemorySearch.")
                 return "Error: Embedding model not loaded, cannot process request."
             # --- Implement a timeout for the agent's run method ---
+            AGENT_TIMEOUT_SECONDS = 120  # Max time in seconds for the agent to respond
             with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
                 future = executor.submit(self.agent.run, question)
                     return f"Error processing your request: {str(e)}. Please try again or rephrase your question."
             logger.info(f"Response generated successfully for question: {question[:200]}")
+            # print statement for immediate console feedback of the final answer
+            print(f"Agent returning answer: {response}")
             return response
         except Exception as e:
             # This outer catch is for issues before agent.run is called or unhandled by the ThreadPoolExecutor