Spaces:

bk-anupam
/

SpiritualChatBot

Building

SpiritualChatBot / RAG_BOT /context_retriever_tool.py

bk-anupam

Enhance RAG_BOT functionality with multilingual support and improved JSON parsing

b9ccd0b about 1 month ago

7.64 kB

	import datetime
	import datetime
	from langchain_core.tools import tool
	from langchain_chroma import Chroma
	from typing import Optional, Dict, Any, Callable, List, Tuple
	import os
	import sys
	# Add the project root to the Python path
	project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
	sys.path.insert(0, project_root)
	from RAG_BOT.logger import logger


	def _build_metadata_filter(date_filter: Optional[str], language: Optional[str]) -> Optional[Dict[str, Any]]:
	    """
	    Builds the Chroma metadata filter for the optional date and language constraints.

	    Args:
	        date_filter: Optional date string expected in 'YYYY-MM-DD' format. A value that
	            does not parse is logged as a warning and ignored.
	        language: Optional language code; it is lower-cased before being used.

	    Returns:
	        A filter dict to pass as search_kwargs["filter"], or None when no constraint applies.
	    """
	    lang_condition = {"language": language.lower()} if language else None

	    date_condition = None
	    if date_filter:
	        try:
	            # Round-trip through strptime/strftime to validate and zero-pad the date.
	            parsed_date = datetime.datetime.strptime(date_filter, '%Y-%m-%d')
	        except ValueError:
	            logger.warning(f"Invalid date format '{date_filter}'. Ignoring date filter.")
	        else:
	            date_condition = {"date": parsed_date.strftime('%Y-%m-%d')}  # Chroma uses implicit $eq

	    if date_condition and lang_condition:
	        # Two conditions have to be combined explicitly with $and.
	        logger.info(f"Applying date filter: {date_condition['date']}, language filter: {language.lower()}")
	        return {"$and": [date_condition, lang_condition]}
	    if date_condition:
	        logger.info(f"Applying date filter: {date_condition['date']}")
	        return date_condition
	    if lang_condition:
	        logger.info(f"Applying language filter: {language.lower()}")
	        return lang_condition
	    return None


	# Factory function to create the tool with the vectordb instance enclosed
	def create_context_retriever_tool(vectordb: Chroma, k: int = 25, search_type: str = "similarity") -> Callable:
	    """
	    Factory function that creates and returns the murli retriever tool.
	    The returned tool function has the vectordb instance enclosed via closure,
	    together with k and search_type, so callers of the tool only supply
	    query, date_filter and language.

	    Args:
	        vectordb: The initialized Chroma vector database instance.
	        k: The number of documents to retrieve (default: 25).
	        search_type: The type of search to perform ('similarity', 'mmr', etc. Default: 'similarity').

	    Returns:
	        A callable tool function suitable for LangChain/LangGraph.
	    """

	    @tool(response_format="content_and_artifact")
	    def retrieve_context(
	        query: str,
	        date_filter: Optional[str] = None,
	        language: Optional[str] = None,
	    ) -> Tuple[str, List[str]]:
	        """
	        Retrieves relevant context snippets from indexed Brahmakumaris murlis based on a user query,
	        optionally filtering by date and language. Use this tool when the user asks for summaries,
	        details, specific points, or content related to a particular date (YYYY-MM-DD), topic,
	        or concept mentioned within the murlis. The tool accesses an underlying vector database
	        containing the murli content.

	        Args:
	            query: The core semantic query about the murli content (e.g., 'summary of teachings', 'points about effort'). The LLM should formulate this based on the user's original question.
	            date_filter: An optional date string in 'YYYY-MM-DD' format extracted from the user's query to filter documents by date. Provide ONLY if a specific date is mentioned.
	            language: The language code ('en' for English, 'hi' for Hindi) inferred from the user's original query. ALWAYS try to infer and provide this parameter based on the user's input language.

	        Returns:
	            A tuple containing:
	            1. A status string indicating the outcome (e.g., number of documents retrieved).
	            2. A list of strings, where each string is the page content of a retrieved document chunk.
	            Returns ("Error during context retrieval.", []) if an error occurs.
	        """
	        logger.info(f"Executing context_retriever_tool for query: '{query}', date: {date_filter}, lang: {language}")
	        try:
	            # Normalize query
	            normalized_query = query.strip().lower()

	            # Prepare search kwargs; the "filter" key is only set when a constraint applies.
	            search_kwargs: Dict[str, Any] = {"k": k}
	            metadata_filter = _build_metadata_filter(date_filter, language)
	            if metadata_filter is not None:
	                search_kwargs["filter"] = metadata_filter

	            logger.debug(f"Using search_kwargs: {search_kwargs}")
	            # Create retriever using the enclosed vectordb
	            retriever = vectordb.as_retriever(
	                search_type=search_type,
	                search_kwargs=search_kwargs
	            )

	            # Retrieve documents and keep only their text content
	            retrieved_docs = retriever.invoke(normalized_query)
	            doc_contents = [doc.page_content for doc in retrieved_docs]

	            if not doc_contents:
	                logger.info("No documents found matching the query.")
	                content_string = "No relevant documents found matching the criteria."
	            else:
	                content_string = f"Successfully retrieved {len(retrieved_docs)} document chunks based on the query and filters."
	                logger.info(f"Retrieved {len(doc_contents)} chunks.")
	                logger.info(f"First retrieved doc content snippet: {doc_contents[0][:200]}...")

	            return content_string, doc_contents

	        except Exception as e:
	            # Tool boundary: report the failure as a status string instead of raising.
	            logger.error(f"Error during context retrieval: {e}", exc_info=True)
	            return "Error during context retrieval.", []  # Return error status and empty list

	    # Return the decorated inner function
	    return retrieve_context


	# Example usage (for testing purposes, requires a Chroma instance)
	if __name__ == '__main__':
	    # Manual smoke test: opens a persisted Chroma DB, builds the tool through the
	    # factory and runs one query. Replace the path with an actual Chroma DB location.
	    from langchain_huggingface import HuggingFaceEmbeddings
	    from RAG_BOT.config import Config
	    persist_directory = '/home/bk_anupam/code/LLM_agents/RAG_BOT/chroma_db'  # Example path
	    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
	    try:
	        vectordb_instance = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
	        test_query = "What is soul consciousness?"

	        # 1. Create the tool using the factory, passing config values
	        murli_tool_instance = create_context_retriever_tool(
	            vectordb=vectordb_instance,
	            k=Config.K,  # Use K from config
	            search_type=Config.SEARCH_TYPE  # Use search_type from config
	        )
	        logger.info(f"Test: Context retriever tool created with k={Config.K} and search_type='{Config.SEARCH_TYPE}'.")

	        # 2. Invoke the created tool
	        # The tool only expects arguments defined in its signature (query, date_filter, language);
	        # k and search_type are part of the tool's internal configuration via the factory.
	        # Because the tool uses response_format="content_and_artifact", invoking it with
	        # plain arguments returns only the status string. Invoking it with a ToolCall-shaped
	        # dict returns a ToolMessage that carries the document list in `.artifact`.
	        tool_call = {
	            "name": murli_tool_instance.name,
	            "args": {"query": test_query},
	            "id": "manual-test-call",
	            "type": "tool_call",
	        }
	        tool_message = murli_tool_instance.invoke(tool_call)
	        context_result_list = tool_message.artifact or []

	        print(f"Query: {test_query}")
	        print(f"Status: {tool_message.content}")
	        print(f"Retrieved {len(context_result_list)} Context Documents (using k={Config.K}, search_type='{Config.SEARCH_TYPE}'):")
	        if context_result_list:
	            print("First document content:")
	            print(context_result_list[0][:500] + "...")  # Print snippet of first doc
	        else:
	            print("No documents retrieved.")
	        print("\nFactory function defined. Tool created and tested.")

	    except Exception as e:
	        print(f"Error during test setup or execution: {e}")