File size: 7,642 Bytes
3f61806
24ae72d
3f61806
 
24ae72d
 
 
 
 
 
3f61806
24ae72d
3f61806
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24ae72d
3f61806
24ae72d
 
b9ccd0b
24ae72d
3f61806
b9ccd0b
 
 
 
 
3f61806
b9ccd0b
 
 
 
3f61806
b9ccd0b
 
 
 
 
3f61806
b9ccd0b
3f61806
 
 
 
 
 
 
b9ccd0b
3f61806
b9ccd0b
 
 
 
 
 
 
 
 
3f61806
b9ccd0b
 
 
 
 
 
 
3f61806
b9ccd0b
3f61806
 
 
 
 
 
 
 
24ae72d
 
b9ccd0b
 
24ae72d
b9ccd0b
24ae72d
b9ccd0b
 
24ae72d
 
b9ccd0b
3f61806
 
 
b9ccd0b
3f61806
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24ae72d
3f61806
 
24ae72d
 
 
 
 
 
3f61806
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import datetime
import datetime
from langchain_core.tools import tool
from langchain_chroma import Chroma
from typing import Optional, Dict, Any, Callable, List, Tuple 
import os
import sys
# Add the project root to the Python path
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, project_root)
from RAG_BOT.logger import logger


# Factory function to create the tool with the vectordb instance enclosed
def create_context_retriever_tool(vectordb: Chroma, k: int = 25, search_type: str = "similarity") -> Callable:
    """
    Factory function that creates and returns the murli retriever tool.
    The returned tool function has the vectordb instance enclosed via closure.

    Args:
        vectordb: The initialized Chroma vector database instance.
        k: The number of documents to retrieve (default: 25).
        search_type: The type of search to perform ('similarity', 'mmr', etc. Default: 'similarity').

    Returns:
        A callable tool function suitable for LangChain/LangGraph.
    """

    def _build_metadata_filter(date_filter: Optional[str], language: Optional[str]) -> Optional[Dict[str, Any]]:
        """
        Build a Chroma metadata filter from the optional date and language values.

        Returns None when no valid condition applies, a single condition dict when
        exactly one applies, or a {"$and": [...]} dict when both apply. An invalid
        date is logged and ignored (language-only filtering still applies).
        """
        conditions: List[Dict[str, Any]] = []
        if date_filter:
            try:
                # Validate the date; re-format to normalize it to 'YYYY-MM-DD'.
                filter_date = datetime.datetime.strptime(date_filter, '%Y-%m-%d')
                formatted_date = filter_date.strftime('%Y-%m-%d')
                # Chroma uses implicit $eq for a plain {key: value} condition.
                conditions.append({"date": formatted_date})
                logger.info(f"Applying date filter: {formatted_date}")
            except ValueError:
                logger.warning(f"Invalid date format '{date_filter}'. Ignoring date filter.")
        if language:
            lang_value = language.lower()
            conditions.append({"language": lang_value})
            logger.info(f"Applying language filter: {lang_value}")
        if not conditions:
            return None
        if len(conditions) == 1:
            return conditions[0]
        # Multiple conditions must be combined with an explicit $and for Chroma.
        return {"$and": conditions}

    @tool(response_format="content_and_artifact")
    def retrieve_context(
        query: str,
        date_filter: Optional[str] = None,
        language: Optional[str] = None,
    ) -> Tuple[str, List[str]]:
        """
        Retrieves relevant context snippets from indexed Brahmakumaris murlis based on a user query,
        optionally filtering by date and language. Use this tool when the user asks for summaries,
        details, specific points, or content related to a particular date (YYYY-MM-DD), topic,
        or concept mentioned within the murlis. The tool accesses an underlying vector database
        containing the murli content.

        Args:            
            query: The core semantic query about the murli content (e.g., 'summary of teachings', 'points about effort'). The LLM should formulate this based on the user's original question.
            date_filter: An optional date string in 'YYYY-MM-DD' format extracted from the user's query to filter documents by date. Provide ONLY if a specific date is mentioned.
            language: The language code ('en' for English, 'hi' for Hindi) inferred from the user's original query. ALWAYS try to infer and provide this parameter based on the user's input language.

        Returns:            
            A tuple containing:
            1. A status string indicating the outcome (e.g., number of documents retrieved).
            2. A list of strings, where each string is the page content of a retrieved document chunk.
            Returns ("Error during context retrieval.", []) if an error occurs.
        """
        logger.info(f"Executing context_retriever_tool for query: '{query}', date: {date_filter}, lang: {language}")
        try:
            # Normalize query for more consistent embedding lookups.
            normalized_query = query.strip().lower()
            search_kwargs: Dict[str, Any] = {"k": k}
            metadata_filter = _build_metadata_filter(date_filter, language)
            if metadata_filter is not None:
                search_kwargs["filter"] = metadata_filter

            logger.debug(f"Using search_kwargs: {search_kwargs}")
            # Create retriever using the enclosed vectordb
            retriever = vectordb.as_retriever(
                search_type=search_type,
                search_kwargs=search_kwargs
            )

            # Retrieve documents and keep only their page content for the artifact.
            retrieved_docs = retriever.invoke(normalized_query)
            doc_contents = [doc.page_content for doc in retrieved_docs]

            if not doc_contents:
                logger.info("No documents found matching the query.")
                content_string = "No relevant documents found matching the criteria."
            else:
                content_string = f"Successfully retrieved {len(retrieved_docs)} document chunks based on the query and filters."
                logger.info(f"Retrieved {len(doc_contents)} chunks.")
                logger.info(f"First retrieved doc content snippet: {doc_contents[0][:200]}...")

            return content_string, doc_contents

        except Exception as e:
            logger.error(f"Error during context retrieval: {e}", exc_info=True)
            return "Error during context retrieval.", []  # Status string matches the docstring contract

    # Return the decorated inner function
    return retrieve_context


# Example usage (for testing purposes, requires a Chroma instance)
if __name__ == '__main__':
    # This is a placeholder for testing. Replace with actual Chroma DB setup.
    from langchain_huggingface import HuggingFaceEmbeddings
    from RAG_BOT.config import Config 
    persist_directory = '/home/bk_anupam/code/LLM_agents/RAG_BOT/chroma_db' # Example path
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    try:
        vectordb_instance = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
        test_query = "What is soul consciousness?"

        # 1. Create the tool using the factory, passing config values
        # (k and search_type are part of the tool's internal configuration via the factory).
        murli_tool_instance = create_context_retriever_tool(
            vectordb=vectordb_instance,
            k=Config.K,                 # Use K from config
            search_type=Config.SEARCH_TYPE # Use search_type from config
        )
        logger.info(f"Test: Context retriever tool created with k={Config.K} and search_type='{Config.SEARCH_TYPE}'.")

        # 2. Invoke the created tool with only the arguments in its signature.
        tool_input = {"query": test_query}
        result = murli_tool_instance.invoke(tool_input)

        # NOTE(review): with response_format="content_and_artifact", a direct .invoke
        # with plain args typically returns only the content (status) string; the
        # document list (artifact) is attached when invoked via a ToolCall. Handle
        # both shapes defensively — confirm against the installed langchain version.
        if isinstance(result, tuple) and len(result) == 2:
            content_string, retrieved_docs = result
        else:
            content_string, retrieved_docs = result, []

        print(f"Query: {test_query}")
        print(f"Status: {content_string}")
        print(f"Retrieved {len(retrieved_docs)} Context Documents (using k={Config.K}, search_type='{Config.SEARCH_TYPE}'):")
        if retrieved_docs:
            print("First document content:")
            print(retrieved_docs[0][:500] + "...") # Print snippet of first doc
        else:
            print("No documents retrieved.")
        print("\nFactory function defined. Tool created and tested.")

    except Exception as e:
        print(f"Error during test setup or execution: {e}")