JoeArmani committed · c7c1b4e
Parent: 64e7c31

chat refinements

Files changed:
- chatbot_config.py +8 -4
- chatbot_model.py +28 -118
- cross_encoder_reranker.py +2 -1
- run_chatbot_chat.py +47 -23
- run_chatbot_validation.py +7 -16
- tf_data_pipeline.py +28 -38
chatbot_config.py CHANGED
@@ -4,19 +4,23 @@ from typing import Dict
 
 @dataclass
 class ChatbotConfig:
-    """
-
-
+    """
+    All config params for the chatbot
+    """
+    max_context_length: int = 512
+    embedding_dim: int = 384  # Sentence Transformer dim
     learning_rate: float = 0.0005
     min_text_length: int = 3
-    max_context_turns: int =
+    max_context_turns: int = 24
     pretrained_model: str = 'sentence-transformers/all-MiniLM-L6-v2'
     cross_encoder_model: str = 'cross-encoder/ms-marco-MiniLM-L-12-v2'
     summarizer_model: str = 't5-small'
     embedding_batch_size: int = 64
     search_batch_size: int = 64
     max_batch_size: int = 64
+    neg_samples: int = 10
     max_retries: int = 3
+    nlist: int = 100
 
     def to_dict(self) -> Dict:
         """Convert config to dictionary."""
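Note: a quick sketch of consuming the updated config. Field roles are inferred from how the rest of this commit uses them; nlist is presumably meant for IVF-style FAISS indices, since IndexFlatIP ignores it.

import json

from chatbot_config import ChatbotConfig

# Defaults introduced or changed in this commit.
config = ChatbotConfig()
assert config.max_context_length == 512   # token budget for contexts
assert config.embedding_dim == 384        # all-MiniLM-L6-v2 output size
assert config.neg_samples == 10           # hard negatives per query
assert config.nlist == 100                # FAISS IVF cell count (assumption)

# to_dict() makes the config easy to persist as the config.json
# that the run scripts look for next to the model checkpoint.
print(json.dumps(config.to_dict(), indent=2))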
chatbot_model.py CHANGED
@@ -22,6 +22,9 @@ from tqdm.auto import tqdm
 
 absl.logging.set_verbosity(absl.logging.WARNING)
 logger = config_logger(__name__)
+logger.setLevel("WARNING")
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+tqdm(disable=True)
 
 class RetrievalChatbot(DeviceAwareModel):
     """
@@ -59,7 +62,6 @@ class RetrievalChatbot(DeviceAwareModel):
             tokenizer=self.tokenizer,
             encoder=self.encoder,
             response_pool=[],
-            max_length=self.config.max_context_token_limit,
             query_embeddings_cache={},
         )
 
@@ -96,7 +98,7 @@ class RetrievalChatbot(DeviceAwareModel):
         return Summarizer(
             tokenizer=self.tokenizer,
             model_name=self.config.summarizer_model,
-            max_summary_length=self.config.
+            max_summary_length=self.config.max_context_length // 4,
             device=self.device,
             max_summary_rounds=2
         )
@@ -218,7 +220,6 @@ class RetrievalChatbot(DeviceAwareModel):
     ) -> List[Tuple[str, float]]:
         """
         Retrieve top-k responses using FAISS and cross-encoder re-ranking.
-
         Args:
             query: The user's input text.
             top_k: Number of responses to return.
@@ -226,7 +227,6 @@ class RetrievalChatbot(DeviceAwareModel):
             summarizer: Optional summarizer for long queries.
             summarize_threshold: Threshold to summarize long queries.
             boost_factor: Factor to boost scores for keyword matches.
-
         Returns:
             List of (response_text, final_score).
         """
@@ -241,18 +241,27 @@ class RetrievalChatbot(DeviceAwareModel):
 
         # Detect domain for query
         detected_domain = self.detect_domain_from_query(query)
+        #logger.info(f"Detected domain: {detected_domain}")
 
-        #
-        logger.info("Retrieving initial candidates from FAISS...")
+        # Retrieve candidates from FAISS
+        #logger.info("Retrieving initial candidates from FAISS...")
         faiss_candidates = self.data_pipeline.retrieve_responses(query, top_k=top_k * 10)
 
         if not faiss_candidates:
             logger.warning("No candidates retrieved from FAISS.")
             return []
 
-        #
-
-
+        # Filter out-of-domain responses
+        if detected_domain != 'other':
+            in_domain_candidates = [c for c in faiss_candidates if c[0]["domain"] == detected_domain]
+            if in_domain_candidates:
+                faiss_candidates = in_domain_candidates
+            else:
+                logger.info(f"No in-domain responses found for '{query}'. Using all candidates.")
+
+        # Re-rank candidates using Cross-Encoder
+        #logger.info("Re-ranking candidates using Cross-Encoder...")
+        texts = [item[0]["text"] for item in faiss_candidates]  # Extract response texts
         faiss_scores = [item[1] for item in faiss_candidates]
 
         if reranker is None:
@@ -277,9 +286,10 @@ class RetrievalChatbot(DeviceAwareModel):
 
             final_candidates.append((resp_text, length_adjusted_score))
 
-        #
+        # Sort and return top-k results
        final_candidates.sort(key=lambda x: x[1], reverse=True)
-        logger.info(f"Returning top-{top_k} re-ranked responses.")
+        #logger.info(f"Returning top-{top_k} re-ranked responses.")
+
         return final_candidates[:top_k]
 
     def extract_keywords(self, query: str) -> List[str]:
@@ -323,7 +333,7 @@ class RetrievalChatbot(DeviceAwareModel):
 
     def detect_domain_from_query(self, query: str) -> str:
         """
-        Detect the domain of the query based on keywords. Used for
+        Detect the domain of the query based on keywords. Used for filtering FAISS search.
         """
         domain_patterns = {
             'restaurant': r'\b(restaurant|restaurants?|dining|food|foods?|dine|reservation|reservations?|table|tables?|menu|menus?|cuisine|cuisines?|eat|eats?|place\s?to\s?eat|places\s?to\s?eat|hungry|chef|chefs?|dish|dishes?|meal|meals?|fork|forks?|knife|knives?|spoon|spoons?|brunch|bistro|buffet|buffets?|catering|caterings?|gourmet|fast\s?food|fine\s?dining|takeaway|takeaways?|delivery|deliveries|restaurant\s?booking)\b',
@@ -348,85 +358,6 @@ class RetrievalChatbot(DeviceAwareModel):
         pattern = r'^[\s]*[\d]+([\s.,\d]+)*[\s]*$'
         return bool(re.match(pattern, text.strip()))
 
-    def faiss_search(
-        self,
-        query: str,
-        domain: str = 'other',
-        top_k: int = 10,
-        boost_factor: float = 1.15
-    ) -> List[Tuple[str, float]]:
-        """
-        Retrieve top-k responses from the FAISS index (IndexFlatIP) given a user query.
-        Args:
-            query (str): The user input text.
-            domain (str): The detected domain from possible domains: ['restaurant', 'movie', 'ride_share', 'coffee', 'pizza', 'auto', 'other']
-            top_k (int): Number of top results to return.
-            boost_factor (float, optional): Factor to boost scores for keyword matches.
-        Returns:
-            List[Tuple[str, float]]: List of (response_text, similarity) sorted by descending similarity.
-        """
-        # Encode the query
-        q_emb = self.data_pipeline.encode_query(query)
-        q_emb_np = q_emb.reshape(1, -1).astype('float32')
-
-        # Search the index
-        distances, indices = self.data_pipeline.index.search(q_emb_np, top_k * 10)
-
-        # IndexFlatIP: 'distances' are inner products (cosine similarities for normalized vectors).
-        candidates = []
-        for rank, idx in enumerate(indices[0]):
-            if idx < 0:
-                continue
-            text_dict = self.data_pipeline.response_pool[idx]
-            text = text_dict.get('text', '').strip()
-            cand_domain = text_dict.get('domain', 'other')
-            score = distances[0][rank]
-
-            # Skip purely numeric or extremely short text (fewer than 3 words):
-            words = text.split()
-            if len(words) < 4:
-                continue
-            if self.is_numeric_response(text):
-                continue
-
-            candidates.append((text, cand_domain, score))
-
-        if not candidates:
-            logger.warning("No valid candidates found after initial numeric/length filtering.")
-            return []
-
-        # Sort candidates by score descending
-        candidates.sort(key=lambda x: x[2], reverse=True)
-
-        # Filter in-domain responses
-        in_domain = [c for c in candidates if c[1] == domain]
-        if not in_domain:
-            logger.info(f"No in-domain responses found for '{domain}'. Using all candidates.")
-            in_domain = candidates
-
-        # Boost responses containing query keywords
-        query_keywords = self.extract_keywords(query)
-        boosted = []
-        for (resp_text, resp_domain, score) in in_domain:
-            new_score = score
-            # If the domain is known AND the response text shares any query keywords, boost it
-            if query_keywords and any(kw in resp_text.lower() for kw in query_keywords):
-                new_score *= boost_factor
-
-            # Apply length penalty/bonus
-            new_score = self.length_adjust_score(resp_text, new_score)
-
-            boosted.append((resp_text, new_score))
-
-        # Sort boosted responses
-        boosted.sort(key=lambda x: x[1], reverse=True)
-
-        # Debug logging (see FAISS responses)
-        # for resp, score in boosted[:100]:
-        #     logger.debug(f"Candidate: '{resp}' with score {score}")
-
-        return boosted[:top_k]
-
     def introduction_message(self) -> None:
         """Print an introduction message to introduce the chatbot."""
         print(
@@ -453,7 +384,7 @@ class RetrievalChatbot(DeviceAwareModel):
                 print("\nAssistant: Goodbye!")
                 break
 
-            response, candidates, metrics = self.chat(
+            response, candidates, metrics, top_response_score = self.chat(
                 query=user_input,
                 conversation_history=None,
                 quality_checker=quality_checker,
@@ -466,7 +397,7 @@ class RetrievalChatbot(DeviceAwareModel):
                 print("\n Alternative responses:")
                 for resp, score in candidates[1:4]:
                     print(f"  Score: {score:.4f} - {resp}")
-
+            elif top_response_score < 0.7:
                 print("\n[Low Confidence]: Consider rephrasing your query for better assistance.")
 
     def chat(
@@ -504,10 +435,10 @@ class RetrievalChatbot(DeviceAwareModel):
 
             # if uncertain, ask for clarification
            if not is_confident or top_response_score < 0.5:
-                return ("I need more information to provide a good answer. Could you please clarify?", responses, metrics)
+                return ("I need more information to provide a good answer. Could you please clarify?", responses, metrics, top_response_score)
 
             # Return the top response
-            return responses[0][0], responses, metrics
+            return responses[0][0], responses, metrics, top_response_score
 
         return get_response(self, query)
 
@@ -535,27 +466,6 @@ class RetrievalChatbot(DeviceAwareModel):
         conversation_parts.append(f"{USER_TOKEN} {query}")
         return "\n".join(conversation_parts)
 
-    # def _build_conversation_context(
-    #     self,
-    #     query: str,
-    #     conversation_history: Optional[List[Tuple[str, str]]]
-    # ) -> str:
-    #     """
-    #     Build conversation context string from conversation history.
-    #     """
-    #     if not conversation_history:
-    #         return f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {query}"
-
-    #     conversation_parts = []
-    #     for user_txt, assistant_txt in conversation_history:
-    #         conversation_parts.extend([
-    #             f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {user_txt}",
-    #             f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<ASSISTANT>')]} {assistant_txt}"
-    #         ])
-
-    #     conversation_parts.append(f"{self.tokenizer.additional_special_tokens[self.tokenizer.additional_special_tokens.index('<USER>')]} {query}")
-    #     return "\n".join(conversation_parts)
-
     def train_model(
         self,
         tfrecord_file_path: str,
@@ -633,7 +543,7 @@ class RetrievalChatbot(DeviceAwareModel):
         logger.info("Using fixed learning rate.")
 
         # Dummy step to force initialization
-        dummy_input = tf.zeros((1, self.config.
+        dummy_input = tf.zeros((1, self.config.max_context_length), dtype=tf.int32)
         with tf.GradientTape() as tape:
             dummy_output = self.encoder(dummy_input)
             dummy_loss = tf.cast(tf.reduce_mean(dummy_output), tf.float32)
@@ -747,7 +657,7 @@ class RetrievalChatbot(DeviceAwareModel):
         logger.info(f"New validation pairs: {val_size}")
 
         dataset = dataset.map(
-            lambda x: parse_tfrecord_fn(x, self.config.
+            lambda x: parse_tfrecord_fn(x, self.config.max_context_length, self.data_pipeline.neg_samples),
             num_parallel_calls=tf.data.AUTOTUNE
         )
 
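Note on the new retrieval flow in retrieve_top_responses: FAISS recall at top_k * 10, an optional domain filter, then cross-encoder re-ranking. Below is a self-contained sketch of that pipeline under stated assumptions: response_pool entries are {"text", "domain"} dicts (as the diff indexes them), embeddings are normalized for IndexFlatIP, and sentence_transformers.CrossEncoder stands in for this repo's TF-based CrossEncoderReranker. The keyword-boost and length-adjustment steps are omitted for brevity.

import faiss
from sentence_transformers import SentenceTransformer, CrossEncoder

# Assumed pool layout, matching how this commit treats candidates
# as (candidate_dict, faiss_score) pairs.
response_pool = [
    {"text": "I can book a table for two at 7pm.", "domain": "restaurant"},
    {"text": "Your ride is three minutes away.", "domain": "ride_share"},
]

encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2")

# Inner-product index over normalized embeddings = cosine similarity.
emb = encoder.encode([r["text"] for r in response_pool], normalize_embeddings=True)
index = faiss.IndexFlatIP(emb.shape[1])
index.add(emb.astype("float32"))

def retrieve(query: str, top_k: int = 5, domain: str = "other"):
    q = encoder.encode([query], normalize_embeddings=True).astype("float32")
    scores, ids = index.search(q, min(top_k * 10, len(response_pool)))
    candidates = [(response_pool[i], float(s)) for i, s in zip(ids[0], scores[0]) if i >= 0]

    # Domain filter with fall-back to all candidates, as in the diff.
    if domain != "other":
        in_domain = [c for c in candidates if c[0]["domain"] == domain]
        candidates = in_domain or candidates

    # Cross-encoder re-rank on (query, response) pairs.
    texts = [c[0]["text"] for c in candidates]
    ce_scores = reranker.predict([(query, t) for t in texts])
    ranked = sorted(zip(texts, ce_scores), key=lambda x: x[1], reverse=True)
    return ranked[:top_k]

print(retrieve("good place to eat nearby?", top_k=1, domain="restaurant"))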
cross_encoder_reranker.py CHANGED
@@ -42,7 +42,8 @@ class CrossEncoderReranker:
             padding=True,
             truncation=True,
             max_length=max_length,
-            return_tensors="tf"
+            return_tensors="tf",
+            verbose=False
         )
 
         # Forward pass, logits shape [batch_size, 1]
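For context, a minimal sketch of the scoring step around this call: batch-tokenize (query, candidate) pairs, run a TF sequence-classification head, and read the [batch_size, 1] logits. Only the tokenizer call appears in the diff; the surrounding wiring here is an assumption, and verbose=False simply silences tokenizer warnings.

import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

model_name = "cross-encoder/ms-marco-MiniLM-L-12-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

query = "good pizza nearby?"
candidates = ["There is a pizzeria two blocks away.", "Your car is a blue sedan."]

enc = tokenizer(
    [query] * len(candidates),
    candidates,
    padding=True,
    truncation=True,
    max_length=256,
    return_tensors="tf",
    verbose=False,
)

# Forward pass, logits shape [batch_size, 1]; a higher logit means a better match.
logits = model(dict(enc)).logits
scores = tf.squeeze(logits, axis=-1).numpy()
print(dict(zip(candidates, scores)))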
run_chatbot_chat.py CHANGED
@@ -1,12 +1,19 @@
 import os
 import json
-from
+from tqdm.auto import tqdm
 from chatbot_config import ChatbotConfig
+from chatbot_model import RetrievalChatbot
+from sentence_transformers import SentenceTransformer
+from tf_data_pipeline import TFDataPipeline
 from response_quality_checker import ResponseQualityChecker
 from environment_setup import EnvironmentSetup
 from logger_config import config_logger
 
 logger = config_logger(__name__)
+logger.setLevel("WARNING")
+
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+tqdm(disable=True)
 
 def run_chatbot_chat():
     env = EnvironmentSetup()
@@ -37,38 +44,55 @@ def run_chatbot_chat():
     config = ChatbotConfig()
     logger.warning("No config.json found. Using default ChatbotConfig.")
 
-    #
+    # Init SentenceTransformer
     try:
-
+        encoder = SentenceTransformer(config.pretrained_model)
+        logger.info(f"Loaded SentenceTransformer model: {config.pretrained_model}")
     except Exception as e:
-        logger.error(f"Failed to load
-        return
-
-    # Confirm FAISS index & response pool exist
-    if not os.path.exists(FAISS_INDEX_PATH) or not os.path.exists(RESPONSE_POOL_PATH):
-        logger.error("FAISS index or response pool file is missing.")
+        logger.error(f"Failed to load SentenceTransformer: {e}")
         return
-
+
     # Load FAISS index and response pool
     try:
-
-
+        # Initialize TFDataPipeline
+        data_pipeline = TFDataPipeline(
+            config=config,
+            tokenizer=encoder.tokenizer,
+            encoder=encoder,
+            response_pool=[],
+            query_embeddings_cache={},
+            index_type='IndexFlatIP',
+            faiss_index_file_path=FAISS_INDEX_PATH
+        )
+
+        if not os.path.exists(FAISS_INDEX_PATH) or not os.path.exists(RESPONSE_POOL_PATH):
+            logger.error("FAISS index or response pool file is missing.")
+            return
+
+        data_pipeline.load_faiss_index(FAISS_INDEX_PATH)
+        logger.info(f"FAISS index loaded from {FAISS_INDEX_PATH}.")
+
         with open(RESPONSE_POOL_PATH, "r", encoding="utf-8") as f:
-
-
+            data_pipeline.response_pool = json.load(f)
+        logger.info(f"Response pool loaded from {RESPONSE_POOL_PATH}.")
+        logger.info(f"Total responses in pool: {len(data_pipeline.response_pool)}")
+
         # Validate dimension consistency
-
-
+        data_pipeline.validate_faiss_index()
+        logger.info("FAISS index and response pool validated successfully.")
     except Exception as e:
         logger.error(f"Failed to load or validate FAISS index: {e}")
         return
-
-    #
-
-
-
-
-
+
+    # Run interactive chat
+    try:
+        chatbot = RetrievalChatbot.load_model(load_dir=MODEL_DIR, mode="inference")
+        quality_checker = ResponseQualityChecker(data_pipeline=data_pipeline)
+
+        logger.info("\nStarting interactive chat session...")
+        chatbot.run_interactive_chat(quality_checker=quality_checker, show_alternatives=False)
+    except Exception as e:
+        logger.error(f"Interactive chat session failed: {e}")
 
 if __name__ == "__main__":
     run_chatbot_chat()
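The "Validate dimension consistency" step amounts to checking that the index, the encoder, and the response pool agree. A standalone sketch of the assumed logic (validate_faiss_index itself is not shown in this diff):

import faiss
import numpy as np

def validate_faiss_index_sketch(index: faiss.Index, response_pool: list, embedding_dim: int) -> None:
    """Assumed validation logic: dimensions and counts must line up."""
    if index.d != embedding_dim:
        raise ValueError(f"Index dim {index.d} != expected embedding dim {embedding_dim}")
    if index.ntotal != len(response_pool):
        raise ValueError(f"Index has {index.ntotal} vectors but pool has {len(response_pool)} responses")

# Example: a 384-dim IndexFlatIP with two vectors and a two-entry pool passes.
idx = faiss.IndexFlatIP(384)
idx.add(np.random.rand(2, 384).astype("float32"))
validate_faiss_index_sketch(idx, [{"text": "a"}, {"text": "b"}], 384)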
run_chatbot_validation.py CHANGED
@@ -44,9 +44,8 @@ def run_chatbot_validation():
 
     # Init SentenceTransformer
     try:
-
-
-        logger.info(f"Loaded SentenceTransformer model: {model_name}")
+        encoder = SentenceTransformer(config.pretrained_model)
+        logger.info(f"Loaded SentenceTransformer model: {config.pretrained_model}")
     except Exception as e:
         logger.error(f"Failed to load SentenceTransformer: {e}")
         return
@@ -108,18 +107,10 @@ def run_chatbot_validation():
     # Run interactive chat loop
     try:
         logger.info("\nStarting interactive chat session...")
-
-
-
-
-
-
-        responses = data_pipeline.retrieve_responses(user_input, top_k=3)
-        print("Top Responses:")
-        for i, (response, score) in enumerate(responses, start=1):
-            print(f"{i}. {response} (Score: {score:.4f})")
-    except KeyboardInterrupt:
-        logger.info("Interactive chat session interrupted by user.")
-
+        chatbot.run_interactive_chat(quality_checker=quality_checker, show_alternatives=True)
+    except Exception as e:
+        logger.error(f"Interactive chat session failed: {e}")
+
+
 if __name__ == "__main__":
     run_chatbot_validation()
tf_data_pipeline.py CHANGED
@@ -6,7 +6,7 @@ import h5py
 import math
 import random
 import gc
-from tqdm import tqdm
+from tqdm.auto import tqdm
 import json
 from pathlib import Path
 from typing import Union, Optional, Dict, List, Tuple, Generator
@@ -28,31 +28,25 @@ class TFDataPipeline:
         encoder: SentenceTransformer,
         response_pool: List[str],
         query_embeddings_cache: dict,
-        model_name: str = 'sentence-transformers/all-MiniLM-L6-v2',
-        max_length: int = 512,
-        neg_samples: int = 10,
         index_type: str = 'IndexFlatIP',
         faiss_index_file_path: str = 'models/faiss_indices/faiss_index_production.index',
-        dimension: int = 384,
-        nlist: int = 100,
-        max_retries: int = 3
     ):
         self.config = config
         self.tokenizer = tokenizer
         self.encoder = encoder
-        self.model = SentenceTransformer(
+        self.model = SentenceTransformer(config.pretrained_model)
         self.faiss_index_file_path = faiss_index_file_path
         self.response_pool = response_pool
-        self.max_length = max_length
-        self.neg_samples = neg_samples
         self.query_embeddings_cache = query_embeddings_cache  # In-memory cache for embeddings
-        self.dimension = config.embedding_dim
         self.index_type = index_type
-        self.
-        self.
-        self.
-        self.
-        self.
+        self.neg_samples = config.neg_samples
+        self.nlist = config.nlist
+        self.dimension = config.embedding_dim
+        self.max_context_length = config.max_context_length
+        self.embedding_batch_size = config.embedding_batch_size
+        self.search_batch_size = config.search_batch_size
+        self.max_batch_size = config.max_batch_size
+        self.max_retries = config.max_retries
 
         # Build text -> domain map for O(1) domain lookups (hard negative sampling)
         self._text_domain_map = {}
@@ -159,7 +153,7 @@ class TFDataPipeline:
             speaker = turn.get('speaker')
             text = turn.get('text', '').strip()
             if speaker == 'assistant' and text:
-                if len(text) <= self.
+                if len(text) <= self.max_context_length:
                     # Use tuple as set key to ensure uniqueness
                     key = (domain, text)
                     if key not in response_set:
@@ -388,7 +382,7 @@ class TFDataPipeline:
                 # f"Collision detected: text '{stripped_text}' found with domains "
                 # f"'{existing_domain}' and '{domain}'. Keeping the first."
                 # )
-                # By default, keep the first domain or overwrite.
+                # By default, keep the first domain or overwrite. Skip overwriting:
                 continue
             else:
                 # Insert into the dict
@@ -434,7 +428,7 @@ class TFDataPipeline:
             prepared,
             padding='max_length',
             truncation=True,
-            max_length=self.
+            max_length=self.max_context_length,
            return_tensors='np'
         )
         input_ids = encodings['input_ids']
@@ -454,23 +448,19 @@ class TFDataPipeline:
     def retrieve_responses(self, query: str, top_k: int = 10) -> List[Tuple[str, float]]:
         """
         Retrieve top-k responses for a query using FAISS.
-
-        Args:
-            query: User's query text.
-            top_k: Number of responses to return.
-
-        Returns:
-            List of tuples (response text, similarity score).
         """
         query_embedding = self.encode_query(query).reshape(1, -1).astype("float32")
         distances, indices = self.index.search(query_embedding, top_k)
 
         results = []
-        for idx, dist in
+        for idx, dist in tqdm(
+            zip(indices[0], distances[0]),
+            disable=True  # Silence tqdm
+        ):
             if idx < 0:
                 continue
             response = self.response_pool[idx]
-            results.append((response
+            results.append((response, dist))
 
         return results
 
@@ -496,7 +486,7 @@ class TFDataPipeline:
         for dialogue in batch_dialogues:
             pairs = self._extract_pairs_from_dialogue(dialogue)
             for query, positive in pairs:
-                if len(query) <= self.
+                if len(query) <= self.max_context_length and len(positive) <= self.max_context_length:
                     queries.append(query)
                     positives.append(positive)
 
@@ -524,14 +514,14 @@ class TFDataPipeline:
         try:
             encoded_queries = self.tokenizer.batch_encode_plus(
                 queries,
-                max_length=self.config.
+                max_length=self.config.max_context_length,
                 truncation=True,
                 padding='max_length',
                 return_tensors='tf'
             )
             encoded_positives = self.tokenizer.batch_encode_plus(
                 positives,
-                max_length=self.config.
+                max_length=self.config.max_context_length,
                 truncation=True,
                 padding='max_length',
                 return_tensors='tf'
@@ -547,7 +537,7 @@ class TFDataPipeline:
             flattened_negatives = [neg for sublist in hard_negatives for neg in sublist]
             encoded_negatives = self.tokenizer.batch_encode_plus(
                 flattened_negatives,
-                max_length=self.config.
+                max_length=self.config.max_context_length,
                 truncation=True,
                 padding='max_length',
                 return_tensors='tf'
@@ -555,7 +545,7 @@ class TFDataPipeline:
 
             # Reshape to [num_queries, num_negatives, max_length]
             num_negatives = self.config.neg_samples
-            reshaped_negatives = encoded_negatives['input_ids'].numpy().reshape(-1, num_negatives, self.config.
+            reshaped_negatives = encoded_negatives['input_ids'].numpy().reshape(-1, num_negatives, self.config.max_context_length)
         except Exception as e:
             logger.error(f"Error during negatives tokenization: {e}")
             pbar.update(1)
@@ -600,7 +590,7 @@ class TFDataPipeline:
             batch_queries,
             padding=True,
             truncation=True,
-            max_length=self.
+            max_length=self.max_context_length,
            return_tensors='tf'
         )
         batch_embeddings = self.encoder(encoded['input_ids'], training=False).numpy()
@@ -667,14 +657,14 @@ class TFDataPipeline:
         # Use tf.py_function, limit parallelism
         q_ids, p_ids, n_ids = tf.py_function(
            func=self._tokenize_triple_py,
-            inp=[q, p, n, tf.constant(self.
+            inp=[q, p, n, tf.constant(self.max_context_length), tf.constant(self.neg_samples)],
             Tout=[tf.int32, tf.int32, tf.int32]
         )
 
         # Set shape info for the output tensors
-        q_ids.set_shape([None, self.
-        p_ids.set_shape([None, self.
-        n_ids.set_shape([None, self.neg_samples, self.
+        q_ids.set_shape([None, self.max_context_length])  # [batch_size, max_length]
+        p_ids.set_shape([None, self.max_context_length])  # [batch_size, max_length]
+        n_ids.set_shape([None, self.neg_samples, self.max_context_length])  # [batch_size, neg_samples, max_length]
 
         return q_ids, p_ids, n_ids
 
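Note on the shape bookkeeping at the end of this file: tf.py_function erases static shape information, so the set_shape calls restore [batch, max_len] and [batch, neg_samples, max_len] for downstream graph code. A toy, runnable sketch of the same pattern; the zero-filled stand-in tokenizer is an assumption (the real code calls the HF tokenizer):

import numpy as np
import tensorflow as tf

MAX_LEN = 512      # config.max_context_length in this commit
NEG_SAMPLES = 10   # config.neg_samples

def _tokenize_triple_py(q, p, n, max_len, neg_samples):
    """Toy stand-in for the HF tokenizer: returns int32 token-id arrays."""
    batch = int(q.shape[0])
    q_ids = np.zeros((batch, int(max_len)), dtype=np.int32)
    p_ids = np.zeros((batch, int(max_len)), dtype=np.int32)
    n_ids = np.zeros((batch, int(neg_samples), int(max_len)), dtype=np.int32)
    return q_ids, p_ids, n_ids

def tokenize_triple(q, p, n):
    q_ids, p_ids, n_ids = tf.py_function(
        func=_tokenize_triple_py,
        inp=[q, p, n, tf.constant(MAX_LEN), tf.constant(NEG_SAMPLES)],
        Tout=[tf.int32, tf.int32, tf.int32],
    )
    # py_function loses shape info; restore it for downstream graph code.
    q_ids.set_shape([None, MAX_LEN])
    p_ids.set_shape([None, MAX_LEN])
    n_ids.set_shape([None, NEG_SAMPLES, MAX_LEN])
    return q_ids, p_ids, n_ids

ds = tf.data.Dataset.from_tensor_slices((["hi"], ["hello"], [["a"] * NEG_SAMPLES]))
ds = ds.batch(1).map(tokenize_triple, num_parallel_calls=tf.data.AUTOTUNE)
print(ds.element_spec)  # shows the pinned (None, 512) / (None, 10, 512) specs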