wt002 committed on
Commit
15e7b1c
·
verified ·
1 Parent(s): 40b3768

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -37
app.py CHANGED
@@ -42,8 +42,7 @@ if not HF_API_TOKEN:
42
 
43
  # --- Global Vector Store and Embeddings ---
44
  try:
45
- # Make sure to import HuggingFaceEmbeddings from the new package
46
- from langchain_huggingface import HuggingFaceEmbeddings # This is the correct import now
47
 
48
  embeddings = HuggingFaceEmbeddings(model_name=HF_EMBEDDING_MODEL_ID)
49
  logger.info(f"Initialized HuggingFaceEmbeddings with model: {HF_EMBEDDING_MODEL_ID}")
@@ -51,10 +50,9 @@ except Exception as e:
51
  logger.error(f"Failed to initialize HuggingFaceEmbeddings: {e}. Please ensure the model_id is correct and dependencies are installed.")
52
  embeddings = None
53
 
54
- # Initialize DocArrayInMemorySearch WITH the embedding function here
55
- # This will likely work with newer versions of DocArrayInMemorySearch
56
- # as it needs the embedding function for its internal doc_index.
57
- vectorstore = DocArrayInMemorySearch(embedding=embeddings) if embeddings else None # <--- FIXED THIS LINE AGAIN
58
  text_splitter = RecursiveCharacterTextSplitter(
59
  chunk_size=1000,
60
  chunk_overlap=200,
@@ -70,8 +68,8 @@ def add_document_to_vector_store(content: str, source: str, metadata: dict = Non
70
  Adds content to the global vector store.
71
  Chunks the content and creates LangChain Documents.
72
  """
73
- if vectorstore is None: # Embeddings check is less critical here if vectorstore is already None
74
- logger.warning("Vector store not initialized. Cannot add document.")
75
  return
76
 
77
  try:
@@ -83,18 +81,15 @@ def add_document_to_vector_store(content: str, source: str, metadata: dict = Non
83
  doc_metadata.update(metadata)
84
  docs.append(Document(page_content=chunk, metadata=doc_metadata))
85
 
86
- # When `vectorstore` was initialized with `embedding=embeddings`,
87
- # `add_documents` often doesn't *also* need `embedding=embeddings`
88
- # if the vectorstore already knows its embedding function.
89
- # However, passing it explicitly here doesn't hurt and provides clarity.
90
- vectorstore.add_documents(docs) # Changed from vectorstore.add_documents(docs, embedding=embeddings)
91
- # as it should now pick up the embedding from initialization.
92
  logger.info(f"Added {len(docs)} chunks from '{source}' to the vector store.")
93
  except Exception as e:
94
  logger.error(f"Error adding document from '{source}' to vector store: {e}")
95
 
96
 
97
-
98
  # --- Utility Functions ---
99
  def extract_youtube_id(url: str) -> str:
100
  """Extract YouTube ID from various URL formats"""
@@ -315,29 +310,21 @@ class RetrievalTool(Tool):
315
  }
316
  output_type = "string"
317
 
318
- def forward(self, query: str, k: int = 3) -> str:
319
- if vectorstore is None:
320
- return "Vector store is not initialized. No documents available for retrieval."
 
 
321
 
322
  try:
323
  logger.info(f"Retrieving {k} chunks from DocArrayInMemorySearch for query: {query}")
324
- retrieved_docs = vectorstore.similarity_search(query, k=k)
 
325
 
326
  if not retrieved_docs:
327
  return "No relevant information found in the vector store for this query."
328
 
329
- formatted_results = []
330
- for i, doc in enumerate(retrieved_docs):
331
- source = doc.metadata.get('source', 'Unknown Source')
332
- title = doc.metadata.get('title', 'N/A')
333
- chunk_index = doc.metadata.get('chunk_index', 'N/A')
334
- formatted_results.append(
335
- f"--- Retrieved Document Chunk {i+1} ---\n"
336
- f"Source: {source} (Chunk: {chunk_index})\n"
337
- f"Title: {title}\n"
338
- f"Content: {doc.page_content}\n"
339
- )
340
- return "\n\n".join(formatted_results)
341
  except Exception as e:
342
  logger.error(f"Error retrieving from vector store for query '{query}': {e}")
343
  return f"Error retrieving from vector store: {str(e)}"
@@ -562,23 +549,24 @@ You are an advanced, helpful, and highly analytical research assistant. Your goa
562
  agent.prompt_templates["system_prompt"] = system_prompt
563
  return agent
564
 
565
- def __call__(self, question: str) -> str:
566
- # Logging the initial receipt of the question
567
  logger.info(f"Received question: {question[:200]}...")
568
- # print statement for immediate console feedback (optional, for debugging/display)
569
  print(f"Agent received question (first 50 chars): {question[:50]}...")
570
 
571
  try:
572
  global vectorstore
573
- # Re-initialize vectorstore for a new session, passing the embeddings
574
- # This is crucial for newer versions of DocArrayInMemorySearch
575
  if embeddings:
576
- vectorstore = DocArrayInMemorySearch(embedding=embeddings)
577
  logger.info("DocArrayInMemorySearch re-initialized for new session.")
578
  else:
579
  logger.warning("Embeddings not initialized, cannot re-initialize DocArrayInMemorySearch.")
580
  return "Error: Embedding model not loaded, cannot process request."
581
 
 
 
582
  # --- Implement a timeout for the agent's run method ---
583
  AGENT_TIMEOUT_SECONDS = 120 # Max time in seconds for the agent to respond
584
 
 
42
 
43
  # --- Global Vector Store and Embeddings ---
44
  try:
45
+ from langchain_huggingface import HuggingFaceEmbeddings # Correct import for embeddings
 
46
 
47
  embeddings = HuggingFaceEmbeddings(model_name=HF_EMBEDDING_MODEL_ID)
48
  logger.info(f"Initialized HuggingFaceEmbeddings with model: {HF_EMBEDDING_MODEL_ID}")
 
50
  logger.error(f"Failed to initialize HuggingFaceEmbeddings: {e}. Please ensure the model_id is correct and dependencies are installed.")
51
  embeddings = None
52
 
53
+ # Initialize DocArrayInMemorySearch WITHOUT ANY arguments here.
54
+ # We'll rely on passing the embedding to add_documents and similarity_search explicitly.
55
+ vectorstore = DocArrayInMemorySearch() if embeddings else None # <--- REVERTED TO THIS SIMPLE INIT
 
56
  text_splitter = RecursiveCharacterTextSplitter(
57
  chunk_size=1000,
58
  chunk_overlap=200,
 
68
  Adds content to the global vector store.
69
  Chunks the content and creates LangChain Documents.
70
  """
71
+ if vectorstore is None or embeddings is None: # Explicitly check embeddings
72
+ logger.warning("Vector store or embeddings not initialized. Cannot add document.")
73
  return
74
 
75
  try:
 
81
  doc_metadata.update(metadata)
82
  docs.append(Document(page_content=chunk, metadata=doc_metadata))
83
 
84
+ # Pass the embeddings function here when adding documents.
85
+ # This is often the more reliable way for DocArrayInMemorySearch
86
+ # if its __init__ doesn't directly take `embedding`.
87
+ vectorstore.add_documents(docs, embedding=embeddings) # <--- IMPORTANT: Pass embeddings here
 
 
88
  logger.info(f"Added {len(docs)} chunks from '{source}' to the vector store.")
89
  except Exception as e:
90
  logger.error(f"Error adding document from '{source}' to vector store: {e}")
91
 
92
 
 
93
  # --- Utility Functions ---
94
  def extract_youtube_id(url: str) -> str:
95
  """Extract YouTube ID from various URL formats"""
 
310
  }
311
  output_type = "string"
312
 
313
+ class RetrievalTool(Tool):
314
+ # ... (rest of class definition) ...
315
+ def forward(self, query: str, k: int = 3) -> str:
316
+ if vectorstore is None or embeddings is None: # Added check for embeddings
317
+ return "Vector store is not initialized or embeddings are missing. No documents available for retrieval."
318
 
319
  try:
320
  logger.info(f"Retrieving {k} chunks from DocArrayInMemorySearch for query: {query}")
321
+ # Explicitly pass the embedding for similarity search if it's required for query embedding
322
+ retrieved_docs = vectorstore.similarity_search(query, k=k, embedding=embeddings) # <--- IMPORTANT: Pass embeddings here
323
 
324
  if not retrieved_docs:
325
  return "No relevant information found in the vector store for this query."
326
 
327
+ # ... (rest of the method) ...
 
 
 
 
 
 
 
 
 
 
 
328
  except Exception as e:
329
  logger.error(f"Error retrieving from vector store for query '{query}': {e}")
330
  return f"Error retrieving from vector store: {str(e)}"
 
549
  agent.prompt_templates["system_prompt"] = system_prompt
550
  return agent
551
 
552
+
553
+ def __call__(self, question: str) -> str:
554
  logger.info(f"Received question: {question[:200]}...")
 
555
  print(f"Agent received question (first 50 chars): {question[:50]}...")
556
 
557
  try:
558
  global vectorstore
559
+ # Re-initialize vectorstore for a new session without arguments
560
+ # This relies on the add_documents and similarity_search methods getting the embedding
561
  if embeddings:
562
+ vectorstore = DocArrayInMemorySearch() # <--- REVERTED TO THIS SIMPLE INIT HERE TOO
563
  logger.info("DocArrayInMemorySearch re-initialized for new session.")
564
  else:
565
  logger.warning("Embeddings not initialized, cannot re-initialize DocArrayInMemorySearch.")
566
  return "Error: Embedding model not loaded, cannot process request."
567
 
568
+
569
+
570
  # --- Implement a timeout for the agent's run method ---
571
  AGENT_TIMEOUT_SECONDS = 120 # Max time in seconds for the agent to respond
572