wt002 committed on
Commit
40b3768
·
verified ·
1 Parent(s): d5a8fef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -24
app.py CHANGED
@@ -43,8 +43,7 @@ if not HF_API_TOKEN:
43
  # --- Global Vector Store and Embeddings ---
44
  try:
45
  # Make sure to import HuggingFaceEmbeddings from the new package
46
- # if you followed the previous advice to resolve the deprecation warning
47
- from langchain_huggingface import HuggingFaceEmbeddings # Or keep langchain_community if you haven't migrated yet
48
 
49
  embeddings = HuggingFaceEmbeddings(model_name=HF_EMBEDDING_MODEL_ID)
50
  logger.info(f"Initialized HuggingFaceEmbeddings with model: {HF_EMBEDDING_MODEL_ID}")
@@ -52,8 +51,10 @@ except Exception as e:
52
  logger.error(f"Failed to initialize HuggingFaceEmbeddings: {e}. Please ensure the model_id is correct and dependencies are installed.")
53
  embeddings = None
54
 
55
- # Initialize DocArrayInMemorySearch WITHOUT the embedding_function argument here
56
- vectorstore = DocArrayInMemorySearch() if embeddings else None # <--- FIXED THIS LINE
 
 
57
  text_splitter = RecursiveCharacterTextSplitter(
58
  chunk_size=1000,
59
  chunk_overlap=200,
@@ -63,27 +64,14 @@ text_splitter = RecursiveCharacterTextSplitter(
63
  logger.info("Initialized in-memory DocArrayInMemorySearch vector store and RecursiveCharacterTextSplitter.")
64
 
65
 
66
- # --- Utility Functions ---
67
- def extract_youtube_id(url: str) -> str:
68
- """Extract YouTube ID from various URL formats"""
69
- patterns = [
70
- r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/watch\?v=([^&]+)',
71
- r'(?:https?:\/\/)?youtu\.be\/([^?]+)',
72
- r'([a-zA-Z0-9_-]{11})'
73
- ]
74
- for pattern in patterns:
75
- match = re.search(pattern, url)
76
- if match:
77
- return match.group(1)
78
- return ""
79
 
80
  def add_document_to_vector_store(content: str, source: str, metadata: dict = None):
81
  """
82
  Adds content to the global vector store.
83
  Chunks the content and creates LangChain Documents.
84
  """
85
- if vectorstore is None or embeddings is None: # Added check for embeddings too
86
- logger.warning("Vector store or embeddings not initialized. Cannot add document.")
87
  return
88
 
89
  try:
@@ -95,14 +83,33 @@ def add_document_to_vector_store(content: str, source: str, metadata: dict = Non
95
  doc_metadata.update(metadata)
96
  docs.append(Document(page_content=chunk, metadata=doc_metadata))
97
 
98
- # Pass the embeddings function here when adding documents
99
- vectorstore.add_documents(docs, embedding=embeddings) # <--- IMPORTANT: Pass embeddings here
 
 
 
 
100
  logger.info(f"Added {len(docs)} chunks from '{source}' to the vector store.")
101
  except Exception as e:
102
  logger.error(f"Error adding document from '{source}' to vector store: {e}")
103
 
104
 
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  # --- Enhanced Tools ---
107
  class WikiSearchTool(Tool):
108
  """Enhanced Wikipedia search with better formatting and error handling"""
@@ -556,19 +563,24 @@ You are an advanced, helpful, and highly analytical research assistant. Your goa
556
  return agent
557
 
558
  def __call__(self, question: str) -> str:
 
559
  logger.info(f"Received question: {question[:200]}...")
 
 
 
560
  try:
561
  global vectorstore
 
 
562
  if embeddings:
563
- vectorstore = DocArrayInMemorySearch(embedding_function=embeddings)
564
  logger.info("DocArrayInMemorySearch re-initialized for new session.")
565
  else:
566
  logger.warning("Embeddings not initialized, cannot re-initialize DocArrayInMemorySearch.")
567
  return "Error: Embedding model not loaded, cannot process request."
568
 
569
  # --- Implement a timeout for the agent's run method ---
570
- # Max time in seconds for the agent to respond
571
- AGENT_TIMEOUT_SECONDS = 120
572
 
573
  with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
574
  future = executor.submit(self.agent.run, question)
@@ -584,6 +596,8 @@ def __call__(self, question: str) -> str:
584
  return f"Error processing your request: {str(e)}. Please try again or rephrase your question."
585
 
586
  logger.info(f"Response generated successfully for question: {question[:200]}")
 
 
587
  return response
588
  except Exception as e:
589
  # This outer catch is for issues before agent.run is called or unhandled by the ThreadPoolExecutor
 
43
  # --- Global Vector Store and Embeddings ---
44
  try:
45
  # Make sure to import HuggingFaceEmbeddings from the new package
46
+ from langchain_huggingface import HuggingFaceEmbeddings # This is the correct import now
 
47
 
48
  embeddings = HuggingFaceEmbeddings(model_name=HF_EMBEDDING_MODEL_ID)
49
  logger.info(f"Initialized HuggingFaceEmbeddings with model: {HF_EMBEDDING_MODEL_ID}")
 
51
  logger.error(f"Failed to initialize HuggingFaceEmbeddings: {e}. Please ensure the model_id is correct and dependencies are installed.")
52
  embeddings = None
53
 
54
+ # Initialize DocArrayInMemorySearch WITH the embedding function here
55
+ # This will likely work with newer versions of DocArrayInMemorySearch
56
+ # as it needs the embedding function for its internal doc_index.
57
+ vectorstore = DocArrayInMemorySearch(embedding=embeddings) if embeddings else None # <--- FIXED THIS LINE AGAIN
58
  text_splitter = RecursiveCharacterTextSplitter(
59
  chunk_size=1000,
60
  chunk_overlap=200,
 
64
  logger.info("Initialized in-memory DocArrayInMemorySearch vector store and RecursiveCharacterTextSplitter.")
65
 
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  def add_document_to_vector_store(content: str, source: str, metadata: dict = None):
69
  """
70
  Adds content to the global vector store.
71
  Chunks the content and creates LangChain Documents.
72
  """
73
+ if vectorstore is None: # Embeddings check is less critical here if vectorstore is already None
74
+ logger.warning("Vector store not initialized. Cannot add document.")
75
  return
76
 
77
  try:
 
83
  doc_metadata.update(metadata)
84
  docs.append(Document(page_content=chunk, metadata=doc_metadata))
85
 
86
+ # When `vectorstore` was initialized with `embedding=embeddings`,
87
+ # `add_documents` often doesn't *also* need `embedding=embeddings`
88
+ # if the vectorstore already knows its embedding function.
89
+ # The explicit `embedding=embeddings` argument is therefore omitted from the call below.
90
+ vectorstore.add_documents(docs) # Changed from vectorstore.add_documents(docs, embedding=embeddings)
91
+ # as it should now pick up the embedding from initialization.
92
  logger.info(f"Added {len(docs)} chunks from '{source}' to the vector store.")
93
  except Exception as e:
94
  logger.error(f"Error adding document from '{source}' to vector store: {e}")
95
 
96
 
97
 
98
+ # --- Utility Functions ---
99
+ def extract_youtube_id(url: str) -> str:
100
+ """Extract YouTube ID from various URL formats"""
101
+ patterns = [
102
+ r'(?:https?:\/\/)?(?:www\.)?youtube\.com\/watch\?v=([^&]+)',
103
+ r'(?:https?:\/\/)?youtu\.be\/([^?]+)',
104
+ r'([a-zA-Z0-9_-]{11})'
105
+ ]
106
+ for pattern in patterns:
107
+ match = re.search(pattern, url)
108
+ if match:
109
+ return match.group(1)
110
+ return ""
111
+
112
+
113
  # --- Enhanced Tools ---
114
  class WikiSearchTool(Tool):
115
  """Enhanced Wikipedia search with better formatting and error handling"""
 
563
  return agent
564
 
565
  def __call__(self, question: str) -> str:
566
+ # Logging the initial receipt of the question
567
  logger.info(f"Received question: {question[:200]}...")
568
+ # print statement for immediate console feedback (optional, for debugging/display)
569
+ print(f"Agent received question (first 50 chars): {question[:50]}...")
570
+
571
  try:
572
  global vectorstore
573
+ # Re-initialize vectorstore for a new session, passing the embeddings
574
+ # This is crucial for newer versions of DocArrayInMemorySearch
575
  if embeddings:
576
+ vectorstore = DocArrayInMemorySearch(embedding=embeddings)
577
  logger.info("DocArrayInMemorySearch re-initialized for new session.")
578
  else:
579
  logger.warning("Embeddings not initialized, cannot re-initialize DocArrayInMemorySearch.")
580
  return "Error: Embedding model not loaded, cannot process request."
581
 
582
  # --- Implement a timeout for the agent's run method ---
583
+ AGENT_TIMEOUT_SECONDS = 120 # Max time in seconds for the agent to respond
 
584
 
585
  with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
586
  future = executor.submit(self.agent.run, question)
 
596
  return f"Error processing your request: {str(e)}. Please try again or rephrase your question."
597
 
598
  logger.info(f"Response generated successfully for question: {question[:200]}")
599
+ # print statement for immediate console feedback of the final answer
600
+ print(f"Agent returning answer: {response}")
601
  return response
602
  except Exception as e:
603
  # This outer catch is for issues before agent.run is called or unhandled by the ThreadPoolExecutor