JoeArmani committed
Commit 71ca212 · 1 Parent(s): e5be70f

restructuring

build_faiss_index.py DELETED
@@ -1,161 +0,0 @@
-import os
-import json
-from pathlib import Path
-
-import faiss
-import numpy as np
-import tensorflow as tf
-from transformers import AutoTokenizer, TFAutoModel
-from tqdm.auto import tqdm
-
-from chatbot_model import ChatbotConfig, EncoderModel
-from tf_data_pipeline import TFDataPipeline
-from logger_config import config_logger
-
-logger = config_logger(__name__)
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
-def sanity_check(encoder: EncoderModel, tokenizer: AutoTokenizer, config: ChatbotConfig):
-    """
-    Perform a quick sanity check to ensure the model is loaded correctly.
-    """
-    sample_response = "This is a test response."
-    encoded_sample = tokenizer(
-        [sample_response],
-        padding=True,
-        truncation=True,
-        max_length=config.max_context_token_limit,
-        return_tensors='tf'
-    )
-
-    # Get embedding
-    sample_embedding = encoder(encoded_sample['input_ids'], training=False).numpy()
-
-    # Check shape
-    if sample_embedding.shape[1] != config.embedding_dim:
-        logger.error(
-            f"Embedding dimension mismatch: Expected {config.embedding_dim}, "
-            f"got {sample_embedding.shape[1]}"
-        )
-        raise ValueError("Embedding dimension mismatch.")
-    else:
-        logger.info("Embedding dimension matches the configuration.")
-
-    # Check normalization
-    embedding_norm = np.linalg.norm(sample_embedding, axis=1)
-    if not np.allclose(embedding_norm, 1.0, atol=1e-5):
-        logger.error("Embeddings are not properly normalized.")
-        raise ValueError("Embeddings are not normalized.")
-    else:
-        logger.info("Embeddings are properly normalized.")
-
-    logger.info("Sanity check passed: Model loaded correctly and outputs are as expected.")
-
-def build_faiss_index():
-    """
-    Rebuild the FAISS index by:
-    1) Loading your config.json
-    2) Initializing encoder + loading submodule & custom weights
-    3) Loading tokenizer from disk
-    4) Creating a TFDataPipeline
-    5) Setting the pipeline's response_pool from a JSON file
-    6) Using pipeline.compute_and_index_response_embeddings()
-    7) Saving the FAISS index
-    """
-    # Directories
-    MODELS_DIR = Path("models")
-    FAISS_DIR = MODELS_DIR / "faiss_indices"
-    FAISS_INDEX_PATH = FAISS_DIR / "faiss_index_production.index"
-    RESPONSES_PATH = FAISS_DIR / "faiss_index_production_responses.json"
-    TOKENIZER_DIR = MODELS_DIR / "tokenizer"
-    SHARED_ENCODER_DIR = MODELS_DIR / "shared_encoder"
-    CUSTOM_WEIGHTS_PATH = MODELS_DIR / "encoder_custom_weights.weights.h5"
-
-    # 1) Load ChatbotConfig
-    config_path = MODELS_DIR / "config.json"
-    if config_path.exists():
-        with open(config_path, "r", encoding="utf-8") as f:
-            config_dict = json.load(f)
-        config = ChatbotConfig.from_dict(config_dict)
-        logger.info(f"Loaded ChatbotConfig from {config_path}")
-    else:
-        config = ChatbotConfig()
-        logger.warning(f"No config.json found at {config_path}. Using default ChatbotConfig.")
-
-    # 2) Initialize the EncoderModel
-    encoder = EncoderModel(config=config)
-    logger.info("EncoderModel instantiated (empty).")
-
-    # Overwrite the submodule from 'shared_encoder' directory
-    if SHARED_ENCODER_DIR.exists():
-        logger.info(f"Loading DistilBERT submodule from {SHARED_ENCODER_DIR}...")
-        encoder.pretrained = TFAutoModel.from_pretrained(str(SHARED_ENCODER_DIR))
-        logger.info("Loaded HF submodule into encoder.pretrained.")
-    else:
-        logger.warning(f"No shared_encoder directory at {SHARED_ENCODER_DIR}. Using default pretrained model.")
-
-    # Build model once, then load custom weights (projection, etc.)
-    dummy_input = tf.zeros((1, config.max_context_token_limit), dtype=tf.int32)
-    _ = encoder(dummy_input, training=False)  # builds the layers
-
-    if CUSTOM_WEIGHTS_PATH.exists():
-        logger.info(f"Loading custom top-level weights from {CUSTOM_WEIGHTS_PATH}")
-        encoder.load_weights(str(CUSTOM_WEIGHTS_PATH))
-        logger.info("Custom top-level weights loaded successfully.")
-    else:
-        logger.warning(f"Custom weights file not found at {CUSTOM_WEIGHTS_PATH}.")
-
-    # 3) Load tokenizer
-    if TOKENIZER_DIR.exists():
-        logger.info(f"Loading tokenizer from {TOKENIZER_DIR}")
-        tokenizer = AutoTokenizer.from_pretrained(str(TOKENIZER_DIR))
-    else:
-        logger.warning(f"No tokenizer dir at {TOKENIZER_DIR}, falling back to default HF tokenizer.")
-        tokenizer = AutoTokenizer.from_pretrained(config.pretrained_model)
-        #tokenizer.add_special_tokens({'additional_special_tokens': ['<EMPTY_NEGATIVE>']})
-
-    # 4) Quick sanity check
-    sanity_check(encoder, tokenizer, config)
-
-    # 5) Prepare a TFDataPipeline
-    pipeline = TFDataPipeline(
-        config=config,
-        tokenizer=tokenizer,
-        encoder=encoder,
-        index_file_path=str(FAISS_INDEX_PATH),
-        response_pool=[],
-        max_length=config.max_context_token_limit,
-        query_embeddings_cache={},
-        neg_samples=config.neg_samples,
-        index_type='IndexFlatIP',
-        nlist=100,
-        max_retries=config.max_retries
-    )
-
-    # 6) Load the existing response pool
-    if not RESPONSES_PATH.exists():
-        logger.error(f"Response pool JSON file not found at {RESPONSES_PATH}")
-        raise FileNotFoundError(f"No response pool JSON at {RESPONSES_PATH}")
-
-    with open(RESPONSES_PATH, "r", encoding="utf-8") as f:
-        response_pool = json.load(f)
-    logger.info(f"Loaded {len(response_pool)} responses from {RESPONSES_PATH}")
-
-    pipeline.response_pool = response_pool  # assign to pipeline
-
-    # 7) Build (or rebuild) the FAISS index from pipeline method
-    #    This does all the compute-embeddings + index.add in one place
-    logger.info("Starting to compute and index response embeddings via TFDataPipeline...")
-    pipeline.compute_and_index_response_embeddings()
-
-    # 8) Save the rebuilt FAISS index
-    pipeline.save_faiss_index(str(FAISS_INDEX_PATH))
-
-    # Verify
-    loaded_index = faiss.read_index(str(FAISS_INDEX_PATH))
-    logger.info(f"Verified the rebuilt FAISS index has {loaded_index.ntotal} vectors.")
-
-    return loaded_index, pipeline.response_pool
-
-if __name__ == "__main__":
-    build_faiss_index()
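
Note: a commented-out copy of this script is added under unused/build_faiss_index.py later in this commit, and the same rebuild is now handled inside prepare_data.py. The FAISS flow it automated is standard. A minimal sketch, assuming L2-normalized float32 embeddings so that inner product equals cosine similarity; the shapes and file name below are illustrative, not the project's real config:

import faiss
import numpy as np

# Stand-in for encoder output: 1000 response embeddings of dimension 512.
embeddings = np.random.rand(1000, 512).astype("float32")
faiss.normalize_L2(embeddings)                   # normalize in place, as the sanity check expects

index = faiss.IndexFlatIP(embeddings.shape[1])   # inner product over unit vectors = cosine
index.add(embeddings)
faiss.write_index(index, "faiss_index_production.index")

loaded = faiss.read_index("faiss_index_production.index")
scores, ids = loaded.search(embeddings[:1], 10)  # top-10 neighbors of the first vector
assert loaded.ntotal == 1000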
chatbot_model.py CHANGED
@@ -15,7 +15,6 @@ from tf_data_pipeline import TFDataPipeline
 from response_quality_checker import ResponseQualityChecker
 from cross_encoder_reranker import CrossEncoderReranker
 from conversation_summarizer import DeviceAwareModel, Summarizer
-from gpu_monitor import GPUMemoryMonitor
 import absl.logging
 from logger_config import config_logger
 from tqdm.auto import tqdm
@@ -147,7 +146,6 @@ class RetrievalChatbot(DeviceAwareModel):
         self.tokenizer = self._initialize_tokenizer()
         self.encoder = self._initialize_encoder()
         self.summarizer = summarizer or self._initialize_summarizer()
-        self.memory_monitor = GPUMemoryMonitor()

         # Initialize data pipeline
         logger.info("Initializing TFDataPipeline.")
@@ -566,7 +564,7 @@ class RetrievalChatbot(DeviceAwareModel):
         boosted.sort(key=lambda x: x[1], reverse=True)

         # Log top candidates
-        for resp, score in boosted[:100]:
+        for resp, score in boosted[:150]:
             logger.debug(f"Candidate: '{resp}' with score {score}")

         # 8) Return top_k
chatbot_validator.py CHANGED
@@ -10,17 +10,12 @@ logger = config_logger(__name__)
 class ChatbotValidator:
     """
     Handles automated validation and performance analysis for the chatbot.
-
-    This validator executes domain-specific test queries, obtains candidate
-    responses via the chatbot, then evaluates them with a quality checker.
-    It aggregates metrics across queries and domains, logs intermediate
-    results, and returns a comprehensive summary.
+    This testing module executes domain-specific queries, obtains chatbot responses, and evaluates them with a quality checker.
     """

     def __init__(self, chatbot, quality_checker):
         """
         Initialize the validator.
-
         Args:
             chatbot: RetrievalChatbot instance for inference
             quality_checker: ResponseQualityChecker instance
@@ -28,75 +23,60 @@
         self.chatbot = chatbot
         self.quality_checker = quality_checker

-        # Basic domain-specific test queries (easy examples)
-        # Taskmaster-1 and Schema-Guided style
+        # Domain-specific test queries (aligns with Taskmaster-1 dataset)
        self.domain_queries = {
-            # 'restaurant': [
-            #     "Hi, I have a question about your restaurant. Do they take reservations?",
-            #     "I'd like to make a reservation for dinner tonight after 6pm. Is that time available?",
-            #     "Can you recommend an Italian restaurant with wood-fired pizza?",
-            #     "Is there parking available if we dine at your restaurant tomorrow evening?",
-            #     "What's the average cost per plate at your restaurant?"
-            # # ],
+            'restaurant': [
+                "Hi, I have a question about your restaurant. Do they take reservations?",
+                "I'd like to make a reservation for dinner tonight after 6pm. Is that time available?",
+                "Can you recommend an Italian restaurant with wood-fired pizza?",
+            ],
             'movie': [
                 "How much are movie tickets for two people?",
                 "I'm looking for showings after 6pm?",
                 "Is this at the new theater with reclining seats?",
-                "Hi, I'm thinking about reserving tickets for the new movie.",
-                "What is the price for your largest popcorn?"
             ],
-            # 'ride_share': [
-            #     "I need a ride from the airport to downtown.",
-            #     "How much would it cost to get to the mall?",
-            #     "Can you book a car for tomorrow morning?",
-            #     "Is there a driver available right now?",
-            #     "What's the estimated arrival time for the driver?"
-            # ],
-            # 'coffee': [
-            #     "Can I get a latte with almond milk?",
-            #     "Can I get a cappuccino with oat milk?",
-            #     "Can I get a mocha with coconut milk?",
-            #     "Can I get a cappuccino with almond milk?",
-            #     "Can I get a mocha with oat milk?",
-            # ],
-            # 'pizza': [
-            #     "Can I get a pizza with extra cheese?",
-            #     "Can I get a pizza with mushrooms?",
-            #     "Can I get a pizza with bell peppers?",
-            #     "Can I get a pizza with onions?",
-            #     "Can I get a pizza with olives?"
-            # ],
-            # 'auto': [
-            #     "I need to schedule an oil change for my car.",
-            #     "When can I bring my car in for maintenance?",
-            #     "Do you have any openings for auto repair today?",
-            #     "How long will the service take?",
-            #     "Can I get an estimate for brake repair?"
-            #],
+            'ride_share': [
+                "I need a ride from the airport to downtown.",
+                "What is the cost for Lyft? How about Uber XL?",
+                "Can you book a car for tomorrow morning?",
+            ],
+            'coffee': [
+                "Can I customize my coffee?",
+                "Can I order a mocha from you?",
+                "Can I get my usual venti vanilla latte?",
+            ],
+            'pizza': [
+                "Do you have any pizza specials or deals available?",
+                "How long is the wait until the pizza is ready and delivered to me?",
+                "Please repeat my pizza order for two medium pizzas with thick crust.",
+            ],
+            'auto': [
+                "The car is making a funny noise when I turn, and I'm due for an oil change.",
+                "Is my buddy John available to work on my car?",
+                "My Jeep needs a repair. Can you help me with that?",
+            ],
        }

     def run_validation(
         self,
-        num_examples: int = 5,
+        num_examples: int = 3,
         top_k: int = 10,
         domains: Optional[List[str]] = None,
         randomize: bool = False,
         seed: int = 42
     ) -> Dict[str, Any]:
         """
-        Run comprehensive validation across specified domains.
-
+        Run validation across testable domains.
         Args:
             num_examples: Number of test queries per domain
             top_k: Number of responses to retrieve for each query
             domains: Optional list of domain keys to test. If None, test all.
             randomize: If True, randomly select queries from the domain lists
             seed: Random seed for consistent sampling if randomize=True
-
         Returns:
-            Dict containing detailed validation metrics and domain-specific performance
+            Dict with validation metrics
         """
-        logger.info("\n=== Running Enhanced Automatic Validation ===")
+        logger.info("\n=== Running Automatic Validation ===")

         # Select which domains to test
         test_domains = domains if domains else list(self.domain_queries.keys())
@@ -105,6 +85,7 @@
         metrics_history = []
         domain_metrics = {}

+        # Init the cross-encoder reranker to pass to the chatbot
         reranker = CrossEncoderReranker(model_name="cross-encoder/ms-marco-MiniLM-L-12-v2")

         # Prepare random selection if needed
@@ -131,26 +112,21 @@
             for i, query in enumerate(queries, 1):
                 logger.info(f"\nTest Case {i}: {query}")

-                # Retrieve top_k responses (including cross-encoder re-ranking if available)
+                # Retrieve top_k responses, then evaluate with quality checker
                 responses = self.chatbot.retrieve_responses_cross_encoder(query, top_k=top_k, reranker=reranker)
-
-                # Evaluate with quality checker
                 quality_metrics = self.quality_checker.check_response_quality(query, responses)

-                # Save domain info
+                # Aggregate metrics and log
                 quality_metrics['domain'] = domain
                 metrics_history.append(quality_metrics)
                 domain_metrics[domain].append(quality_metrics)
-
-                # Detailed logging
-                self._log_validation_results(query, responses, quality_metrics, i)
+                self._log_validation_results(query, responses, quality_metrics)

         # Final aggregation
         aggregate_metrics = self._calculate_aggregate_metrics(metrics_history)
         domain_analysis = self._analyze_domain_performance(domain_metrics)
         confidence_analysis = self._analyze_confidence_distribution(metrics_history)

-        # Combine into one dictionary
         aggregate_metrics.update({
             'domain_performance': domain_analysis,
             'confidence_analysis': confidence_analysis
@@ -161,7 +137,7 @@

     def _calculate_aggregate_metrics(self, metrics_history: List[Dict]) -> Dict[str, float]:
         """
-        Calculate comprehensive aggregate metrics over all tested queries.
+        Calculate aggregate metrics over tested queries.
         """
         if not metrics_history:
             logger.warning("No metrics to aggregate. Returning empty summary.")
@@ -169,7 +145,6 @@

         top_scores = [m.get('top_score', 0.0) for m in metrics_history]

-        # The length-based metrics are robust to missing or zero-length data
         metrics = {
             'num_queries_tested': len(metrics_history),
             'avg_top_response_score': np.mean(top_scores),
@@ -177,10 +152,7 @@
             'avg_relevance': np.mean([m.get('query_response_relevance', 0.0) for m in metrics_history]),
             'avg_length_score': np.mean([m.get('response_length_score', 0.0) for m in metrics_history]),
             'avg_score_gap': np.mean([m.get('top_3_score_gap', 0.0) for m in metrics_history]),
-            'confidence_rate': np.mean([1.0 if m.get('is_confident', False) else 0.0
-                                        for m in metrics_history]),
-
-            # Additional statistical metrics
+            'confidence_rate': np.mean([1.0 if m.get('is_confident', False) else 0.0 for m in metrics_history]),
             'median_top_score': np.median(top_scores),
             'score_std': np.std(top_scores),
             'min_score': np.min(top_scores),
@@ -202,12 +174,9 @@
             top_scores = [m.get('top_score', 0.0) for m in metrics_list]

             analysis[domain] = {
-                'confidence_rate': np.mean([1.0 if m.get('is_confident', False) else 0.0
-                                            for m in metrics_list]),
-                'avg_relevance': np.mean([m.get('query_response_relevance', 0.0)
-                                          for m in metrics_list]),
-                'avg_diversity': np.mean([m.get('response_diversity', 0.0)
-                                          for m in metrics_list]),
+                'confidence_rate': np.mean([1.0 if m.get('is_confident', False) else 0.0 for m in metrics_list]),
+                'avg_relevance': np.mean([m.get('query_response_relevance', 0.0) for m in metrics_list]),
+                'avg_diversity': np.mean([m.get('response_diversity', 0.0) for m in metrics_list]),
                 'avg_top_score': np.mean(top_scores),
                 'num_samples': len(metrics_list)
             }
@@ -235,7 +204,6 @@
         query: str,
         responses: List[Tuple[str, float]],
         metrics: Dict[str, Any],
-        case_num: int
     ):
         """
         Log detailed validation results for each test case.
@@ -249,8 +217,8 @@
         #     if isinstance(v, (int, float)):
         #         logger.info(f"  {k}: {v:.4f}")

-        logger.info("Top 10 Responses:")
-        for i, (resp_text, score) in enumerate(responses[:10], 1):
+        logger.info("Top 3 Responses:")
+        for i, (resp_text, score) in enumerate(responses[:3], 1):
             logger.info(f"{i}) Score: {score:.4f} | {resp_text}")
             if i == 1 and not is_confident:
                 logger.info("   [Low Confidence on Top Response]")
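
As a quick illustration of the aggregation that _calculate_aggregate_metrics performs: each entry in metrics_history is one query's quality dict, and confidence_rate is simply the mean of the is_confident flags. A minimal sketch with hypothetical values:

import numpy as np

metrics_history = [
    {"top_score": 0.82, "is_confident": True},
    {"top_score": 0.64, "is_confident": False},
    {"top_score": 0.91, "is_confident": True},
]

confidence_rate = np.mean([1.0 if m.get("is_confident", False) else 0.0 for m in metrics_history])
print(round(confidence_rate, 3))  # 0.667 -> 2 of 3 queries cleared the confidence threshold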
processing_pipeline.py → data_augmentation/augmentation_processing_pipeline.py RENAMED
@@ -7,13 +7,13 @@ import hashlib
 import spacy
 import torch
 from tqdm import tqdm
-from pipeline_config import PipelineConfig
-from dialogue_augmenter import DialogueAugmenter
+from data_augmentation.pipeline_config import PipelineConfig
+from data_augmentation.dialogue_augmenter import DialogueAugmenter
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from typing import Set

-class ProcessingPipeline:
+class AugmentationProcessingPipeline:
     """
     Complete pipeline combining validation, optimization, and augmentation.
     """
back_translator.py → data_augmentation/back_translator.py RENAMED
File without changes
dialogue_augmenter.py → data_augmentation/dialogue_augmenter.py RENAMED
@@ -3,9 +3,9 @@ import numpy as np
 import torch
 import tensorflow as tf
 import tensorflow_hub as hub
-from pipeline_config import PipelineConfig
-from quality_metrics import QualityMetrics
-from paraphraser import Paraphraser
+from data_augmentation.pipeline_config import PipelineConfig
+from data_augmentation.quality_metrics import QualityMetrics
+from data_augmentation.paraphraser import Paraphraser
 import nlpaug.augmenter.word as naw
 from functools import lru_cache
 from sklearn.metrics.pairwise import cosine_similarity
main.py → data_augmentation/main.py RENAMED
@@ -5,10 +5,10 @@ Description and References in the README.md file.
 import json
 import tensorflow as tf
 from typing import List, Dict
-from pipeline_config import PipelineConfig
-from processing_pipeline import ProcessingPipeline
-from taskmaster_processor import TaskmasterProcessor
-from schema_guided_dialogue_processor import SchemaGuidedProcessor
+from data_augmentation.pipeline_config import PipelineConfig
+from data_augmentation.augmentation_processing_pipeline import AugmentationProcessingPipeline
+from data_augmentation.taskmaster_processor import TaskmasterProcessor
+from data_augmentation.schema_guided_dialogue_processor import SchemaGuidedProcessor

 def combine_datasets(taskmaster_dialogues: List[Dict],
                      schema_guided_dialogues: List[Dict]) -> List[Dict]:
@@ -99,7 +99,7 @@ def main():

     # Process through augmentation pipeline
     print("Processing combined dataset")
-    pipeline = ProcessingPipeline(config)
+    pipeline = AugmentationProcessingPipeline(config)
     output_path = pipeline.process_dataset(combined_dialogues)
     print(f"Processing complete. Results saved to {output_path}")
     pipeline.cleanup()
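
All of the RENAMED hunks in this commit follow one pattern: the augmentation modules moved into a data_augmentation/ package, so flat imports become package-qualified ones. The assumed layout after the move (the __init__.py is an assumption, not shown in this diff; without it, the imports would resolve via namespace-package semantics instead):

data_augmentation/
    __init__.py                          # assumed, not shown in the diff
    augmentation_processing_pipeline.py  # was processing_pipeline.py
    back_translator.py
    dialogue_augmenter.py
    main.py
    paraphraser.py
    pipeline_config.py
    quality_metrics.py
    schema_guided_dialogue_processor.py
    taskmaster_processor.py

# Before: from pipeline_config import PipelineConfig
# After:  from data_augmentation.pipeline_config import PipelineConfig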
paraphraser.py → data_augmentation/paraphraser.py RENAMED
File without changes
pipeline_config.py → data_augmentation/pipeline_config.py RENAMED
File without changes
quality_metrics.py → data_augmentation/quality_metrics.py RENAMED
@@ -2,7 +2,7 @@ import tensorflow_hub as hub
 import spacy
 from sklearn.metrics.pairwise import cosine_similarity
 from typing import Dict
-from pipeline_config import PipelineConfig
+from data_augmentation.pipeline_config import PipelineConfig

 class QualityMetrics:
     """
schema_guided_dialogue_processor.py → data_augmentation/schema_guided_dialogue_processor.py RENAMED
@@ -3,7 +3,7 @@ from typing import List, Dict, Optional, Any
 import json
 import glob
 from pathlib import Path
-from pipeline_config import PipelineConfig
+from data_augmentation.pipeline_config import PipelineConfig

 @dataclass
 class SchemaGuidedDialogue:
taskmaster_processor.py → data_augmentation/taskmaster_processor.py RENAMED
@@ -3,7 +3,7 @@ from typing import List, Dict, Optional, Any
 import json
 import re
 from pathlib import Path
-from pipeline_config import PipelineConfig
+from data_augmentation.pipeline_config import PipelineConfig

 @dataclass
 class TaskmasterDialogue:
deduplicate_augmented_dialogues.py CHANGED
@@ -2,13 +2,16 @@ import json
 from pathlib import Path
 import logging
 from typing import List, Dict
-from collections import defaultdict

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

+"""
+Standalone script to deduplicate dialogues from multiple JSON files.
+"""
+
 def load_json_file(file_path: str) -> List[Dict]:
-    """Load and parse a JSON file."""
+    """Load and parse JSON file."""
     try:
         with open(file_path, 'r', encoding='utf-8') as f:
             return json.load(f)
@@ -21,13 +24,12 @@ def load_json_file(file_path: str) -> List[Dict]:

 def combine_json_files(input_directory: str, output_file: str):
     """
-    Combine multiple JSON files while removing duplicates based on dialogue_id.
-
+    Combine multiple JSON files and remove duplicate dialogues based on dialogue_id.
     Args:
         input_directory: Directory containing JSON files to process
         output_file: Path to save the combined output
     """
-    # Track unique dialogues and their source files
+    # Track unique dialogues
     dialogue_map = {}
     duplicate_count = 0

@@ -66,7 +68,6 @@ def combine_json_files(input_directory: str, output_file: str):
     except Exception as e:
         logger.error(f"Error writing output file: {e}")

-# Usage example
 if __name__ == "__main__":
     combine_json_files(
         input_directory="/Users/joe/Desktop/Grad School/CSC525/CSC525_mod8_option2_joseph_armani/processed_outputs",
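
The dedup logic reduces to a first-write-wins map keyed on dialogue_id. A minimal sketch under that assumption (the field name comes from the docstring above; the dedupe helper itself is illustrative, not the script's API):

from typing import Dict, List

def dedupe(dialogues: List[Dict]) -> List[Dict]:
    """Keep the first occurrence of each dialogue_id; drop the rest."""
    seen: Dict[str, Dict] = {}
    for d in dialogues:
        seen.setdefault(d["dialogue_id"], d)  # first write wins
    return list(seen.values())

# The second record with id "a" is discarded:
print(len(dedupe([{"dialogue_id": "a"}, {"dialogue_id": "a"}, {"dialogue_id": "b"}])))  # 2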
environment_setup.py CHANGED
@@ -90,7 +90,7 @@ class EnvironmentSetup:
         return None

     def setup_devices(self) -> Tuple[str, tf.distribute.Strategy]:
-        """Configure available compute devices with Colab-specific optimizations."""
+        """Configure available compute devices with Colab optimizations."""
         logger.info("Checking available compute devices...")

         # Colab-specific setup
@@ -128,7 +128,7 @@ class EnvironmentSetup:
             except Exception as e:
                 logger.error(f"Error configuring Colab GPU: {str(e)}")

-        # Non-Colab setup (same as before)
+        # Non-Colab setup
         else:
             # Check for TPU
             try:
@@ -166,11 +166,11 @@ class EnvironmentSetup:
             return "CPU", strategy

     def optimize_batch_size(self, base_batch_size: int = 16) -> int:
-        """Apply Colab-specific optimizations for training."""
+        """Colab-specific optimizations for training."""
         if not self.is_colab():
             return base_batch_size

-        # Colab-specific batch size optimization
+        # Colab batch size optimization
         if self.device_type == "GPU":
             try:
                 gpu_name = subprocess.check_output(
@@ -179,15 +179,12 @@
                 ).decode('utf-8').strip()

                 if "A100" in gpu_name:
-                    # A100 optimizations - has 40GB or 80GB variants
                     logger.info("Optimizing for Colab A100 GPU")
-                    base_batch_size = min(base_batch_size * 8, 128)  # A100 can handle much larger batches
+                    base_batch_size = min(base_batch_size * 8, 64)
                 elif "T4" in gpu_name:
-                    # T4 optimizations
                     logger.info("Optimizing for Colab T4 GPU")
                     base_batch_size = min(base_batch_size * 2, 32)
                 elif "V100" in gpu_name:
-                    # V100 optimizations
                     logger.info("Optimizing for Colab V100 GPU")
                     base_batch_size = min(base_batch_size * 3, 48)
             except (subprocess.SubprocessError, FileNotFoundError):
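
For reference, the per-GPU scaling above works out to: base 16 -> min(16*8, 64) = 64 on an A100 (the cap was lowered from 128 in this commit), min(16*2, 32) = 32 on a T4, and min(16*3, 48) = 48 on a V100. A standalone restatement of that logic (scaled_batch is a hypothetical helper, not part of the project's API):

def scaled_batch(base: int, gpu_name: str) -> int:
    # Mirrors the multiplier/cap pairs in optimize_batch_size above.
    caps = {"A100": (8, 64), "T4": (2, 32), "V100": (3, 48)}
    mult, cap = caps.get(gpu_name, (1, base))
    return min(base * mult, cap)

assert scaled_batch(16, "A100") == 64
assert scaled_batch(16, "T4") == 32
assert scaled_batch(16, "V100") == 48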
new_iteration/run_taskmaster_processor.py CHANGED
@@ -2,8 +2,8 @@ import json
 from datetime import datetime
 from pathlib import Path

-from pipeline_config import PipelineConfig
-from taskmaster_processor import TaskmasterProcessor
+from data_augmentation.pipeline_config import PipelineConfig
+from data_augmentation.taskmaster_processor import TaskmasterProcessor

 def main():
     # 1) Setup config
plotter.py CHANGED
@@ -10,11 +10,10 @@ class Plotter:
         self.save_dir.mkdir(parents=True, exist_ok=True)

     def plot_training_history(self, history: Dict[str, List[float]], title: str = "Training History"):
-        """Plot and optionally save training metrics history.
-
+        """Plot and save training metrics history
         Args:
-            history: Dictionary containing training metrics
-            title: Title for the plot
+            history: Dict with training metrics
+            title: Plot title
         """
         # Create figure with subplots
         fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 12))
@@ -28,7 +27,7 @@
         ax1.legend()
         ax1.grid(True)

-        # Plot learning rate if available
+        # Plot learning rate
         if 'learning_rate' in history:
             ax2.plot(history['learning_rate'], label='Learning Rate')
             ax2.set_xlabel('Step')
@@ -40,7 +39,7 @@
         plt.suptitle(title)
         plt.tight_layout()

-        # Save if directory provided
+        # Save
         if self.save_dir:
             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
             save_path = self.save_dir / f'training_history_{timestamp}.png'
@@ -49,25 +48,23 @@
         plt.show()

     def plot_validation_metrics(self, metrics: Dict[str, float]):
-        """Plot validation metrics as a bar chart.
-
+        """Plot validation metrics as a bar chart
         Args:
             metrics: Dictionary of validation metrics. Can handle nested dictionaries.
         """

-        # Flatten nested metrics dictionary
+        # Flatten nested metrics dict
         flat_metrics = {}
         for key, value in metrics.items():
-            # Skip num_queries_tested
             if key == 'num_queries_tested':
                 continue

+            # Flatten dict values, use numerical values only
             if isinstance(value, dict):
-                # If value is a dictionary, flatten it with key prefix
                 for subkey, subvalue in value.items():
-                    if isinstance(subvalue, (int, float)):  # Only include numeric values
+                    if isinstance(subvalue, (int, float)):
                         flat_metrics[f"{key}_{subkey}"] = subvalue
-            elif isinstance(value, (int, float)):  # Only include numeric values
+            elif isinstance(value, (int, float)):
                 flat_metrics[key] = value

         if not flat_metrics:
@@ -87,20 +84,18 @@
         plt.xticks(range(len(metric_names)), metric_names, rotation=45, ha='right')
         plt.ylabel('Value')

-        # Add value labels on top of bars
+        # Add value labels on bars
         for bar in bars:
             height = bar.get_height()
             plt.text(bar.get_x() + bar.get_width()/2., height,
                      f'{height:.3f}',
                      ha='center', va='bottom')

-        # Set y-axis limits to focus on metrics between 0 and 1
-        plt.ylim(0, 1.1)  # Slight padding above 1 for label visibility
-
-        # Adjust layout to prevent label cutoff
+        # Set y-axis limits and adjust layout
+        plt.ylim(0, 1.1)
         plt.tight_layout()

-        # Save if directory provided
+        # Save
         if self.save_dir:
             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
             save_path = self.save_dir / f'validation_metrics_{timestamp}.png'
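
The flattening step in plot_validation_metrics turns one level of nesting into prefixed keys so nested domain stats can share a single bar chart. A standalone sketch of that behavior, with made-up numbers:

metrics = {
    "avg_top_response_score": 0.81,
    "num_queries_tested": 18,  # skipped by design
    "domain_performance": {"movie": 0.77, "auto": 0.74},
}

flat_metrics = {}
for key, value in metrics.items():
    if key == "num_queries_tested":
        continue
    if isinstance(value, dict):
        for subkey, subvalue in value.items():
            if isinstance(subvalue, (int, float)):
                flat_metrics[f"{key}_{subkey}"] = subvalue
    elif isinstance(value, (int, float)):
        flat_metrics[key] = value

# flat_metrics == {"avg_top_response_score": 0.81,
#                  "domain_performance_movie": 0.77,
#                  "domain_performance_auto": 0.74}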
prepare_data.py CHANGED
@@ -7,8 +7,6 @@ import tensorflow as tf
 from transformers import AutoTokenizer, TFAutoModel
 from tqdm.auto import tqdm
 from pathlib import Path
-
-# Your existing modules
 from chatbot_model import ChatbotConfig, EncoderModel
 from tf_data_pipeline import TFDataPipeline
 from logger_config import config_logger
@@ -18,7 +16,6 @@ logger = config_logger(__name__)
 os.environ["TOKENIZERS_PARALLELISM"] = "false"

 def main():
-    # Constants
     MODELS_DIR = 'new_iteration/data_prep_iterative_models'
     PROCESSED_DATA_DIR = 'new_iteration/processed_outputs'
     CACHE_DIR = 'new_iteration/cache'
@@ -30,9 +27,9 @@ def main():
     CACHE_FILE = os.path.join(CACHE_DIR, 'query_embeddings_cache.pkl')
     TF_RECORD_PATH = os.path.join(TF_RECORD_DIR, 'training_data_3.tfrecord')

-    # Decide whether to load the **custom** fine-tuned model or just base DistilBERT.
+    # Decide whether to load the **custom** model or base DistilBERT (Base used for first iteration).
     # True for custom, False for base DistilBERT.
-    LOAD_CUSTOM_MODEL = True
+    LOAD_CUSTOM_MODEL = True
     NUM_NEG_SAMPLES = 10

     # Ensure output directories exist
@@ -43,7 +40,7 @@ def main():
     os.makedirs(FAISS_INDICES_DIR, exist_ok=True)
     os.makedirs(TF_RECORD_DIR, exist_ok=True)

-    # Initialize config
+    # Init config
     config_json = Path(MODELS_DIR) / "config.json"
     if config_json.exists():
         with open(config_json, "r", encoding="utf-8") as f:
@@ -54,20 +51,18 @@
         config = ChatbotConfig()
         logger.warning("No config.json found. Using default ChatbotConfig.")

+    # Ensure negative samples are set
     config.neg_samples = NUM_NEG_SAMPLES

-    # Load or initialize tokenizer
+    # Load or init tokenizer
     try:
-        # If the directory has a valid tokenizer
         if Path(TOKENIZER_DIR).exists() and list(Path(TOKENIZER_DIR).iterdir()):
             logger.info(f"Loading tokenizer from {TOKENIZER_DIR}")
             tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
         else:
-            # Initialize from base DistilBERT
             logger.info(f"Loading base tokenizer for {config.pretrained_model}")
             tokenizer = AutoTokenizer.from_pretrained(config.pretrained_model)

-            # Save to disk
             Path(TOKENIZER_DIR).mkdir(parents=True, exist_ok=True)
             tokenizer.save_pretrained(TOKENIZER_DIR)
             logger.info(f"New tokenizer saved to {TOKENIZER_DIR}")
@@ -75,7 +70,7 @@
         logger.error(f"Failed to load or create tokenizer: {e}")
         sys.exit(1)

-    # Initialize the encoder
+    # Init the encoder
     try:
         encoder = EncoderModel(config=config)
         logger.info("EncoderModel initialized successfully.")
@@ -89,22 +84,24 @@
         else:
             logger.warning(f"No shared_encoder found at {shared_encoder_path}, using base DistilBERT instead.")

-        # Load top-level custom .weights.h5 (projection, dropout, etc.)
+        # Load custom .weights.h5 (projection, dropout, etc.)
         custom_weights_path = Path(MODELS_DIR) / "encoder_custom_weights.weights.h5"
         if custom_weights_path.exists():
             logger.info(f"Loading custom top-level weights from {custom_weights_path}")
-            # Build model layers with a dummy forward pass
+
+            # Dummy forward pass forces model build to ensure all layers are built
             dummy_input = tf.zeros((1, config.max_context_token_limit), dtype=tf.int32)
             _ = encoder(dummy_input, training=False)
+
             encoder.load_weights(str(custom_weights_path))
             logger.info("Custom encoder weights loaded successfully.")
         else:
             logger.warning(f"Custom weights file not found at {custom_weights_path}. Using only submodule weights.")
     else:
-        # Just base DistilBERT with special tokens resized
+        # Base DistilBERT with special tokens
         logger.info("Using the base DistilBERT without loading custom weights.")

-    # Resize token embeddings in case we added special tokens
+    # Resize token embeddings in case we added special tokens (EncoderModel class)
     encoder.pretrained.resize_token_embeddings(len(tokenizer))
     logger.info(f"Token embeddings resized to: {len(tokenizer)}")

@@ -124,7 +121,7 @@
         logger.error(f"Failed to load dialogues: {e}")
         sys.exit(1)

-    # Load or initialize query_embeddings_cache
+    # Load or init query_embeddings_cache. NOTE: recompute after each training. This was a bug source.
     query_embeddings_cache = {}
     if os.path.exists(CACHE_FILE):
         try:
@@ -138,20 +135,18 @@

     # Initialize TFDataPipeline
     try:
-        # Determine if FAISS index should be loaded or initialized
+        # Load or init FAISS index
         if Path(FAISS_INDEX_PRODUCTION_PATH).exists():
-            # Load existing index
             logger.info(f"Loading existing FAISS index from {FAISS_INDEX_PRODUCTION_PATH}...")
             faiss_index = faiss.read_index(FAISS_INDEX_PRODUCTION_PATH)
             logger.info("FAISS index loaded successfully.")
         else:
-            # Initialize a new FAISS index
             logger.info("No existing FAISS index found. Initializing a new index.")
             dimension = config.embedding_dim  # Ensure this matches your encoder's output
             faiss_index = faiss.IndexFlatIP(dimension)  # Using Inner Product for cosine similarity
             logger.info(f"Initialized new FAISS index with dimension {dimension}.")

-        # Initialize TFDataPipeline with the FAISS index
+        # Init TFDataPipeline with the FAISS index
         data_pipeline = TFDataPipeline(
             config=config,
             tokenizer=tokenizer,
@@ -162,7 +157,7 @@
             neg_samples=config.neg_samples,
             query_embeddings_cache=query_embeddings_cache,
             index_type='IndexFlatIP',
-            nlist=100,
+            nlist=100,  # Not used for IndexFlatIP. Retained for future use of IndexIVFFlat
             max_retries=config.max_retries
         )
         logger.info("TFDataPipeline initialized successfully.")
@@ -170,7 +165,7 @@
         logger.error(f"Failed to initialize TFDataPipeline: {e}")
         sys.exit(1)

-    # 7) Collect unique assistant responses from dialogues
+    # Collect response pool from dialogues
     try:
         if dialogues:
             response_pool = data_pipeline.collect_responses_with_domain(dialogues)
@@ -182,8 +177,7 @@
         logger.error(f"Failed to collect responses: {e}")
         sys.exit(1)

-    # 8) Build the FAISS index with response embeddings
-    #    Instead of manually computing embeddings, we use the pipeline method
+    # Build FAISS index with response embeddings
     try:
         if data_pipeline.response_pool:
             data_pipeline.build_text_to_domain_map()
@@ -191,10 +185,10 @@
             data_pipeline.compute_and_index_response_embeddings()
             logger.info("Response embeddings computed and added to FAISS index.")

-            # Save the updated FAISS index
+            # Save the FAISS index
             data_pipeline.save_faiss_index(FAISS_INDEX_PRODUCTION_PATH)

-            # Also save the response pool JSON
+            # Also save response pool JSON
             response_pool_path = FAISS_INDEX_PRODUCTION_PATH.replace('.index', '_responses.json')
             with open(response_pool_path, 'w', encoding='utf-8') as f:
                 json.dump(data_pipeline.response_pool, f, indent=2)
@@ -206,7 +200,7 @@
         logger.error(f"Failed to compute or add response embeddings: {e}")
         sys.exit(1)

-    # 9) Prepare and save training data as TFRecords
+    # Prepare training data as TFRecords (TensorFlow Record format)
     try:
         if dialogues:
             logger.info("Starting data preparation and saving as TFRecord...")
@@ -218,7 +212,7 @@
         logger.error(f"Failed during data preparation and saving: {e}")
         sys.exit(1)

-    # 10) Save query embeddings cache
+    # Save query embeddings cache
     try:
         with open(CACHE_FILE, 'wb') as f:
             pickle.dump(data_pipeline.query_embeddings_cache, f)
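
One detail worth noting in the hunks above: IndexFlatIP only behaves as cosine similarity when every vector is L2-normalized first, which is exactly what the (now retired) sanity check asserted with np.linalg.norm. A short illustration of the identity, with made-up vectors:

import numpy as np

a = np.array([3.0, 4.0]); b = np.array([4.0, 3.0])
a, b = a / np.linalg.norm(a), b / np.linalg.norm(b)  # unit vectors
assert np.isclose(np.dot(a, b), 24 / 25)  # inner product == cosine similarity (0.96)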
tf_data_pipeline.py CHANGED
@@ -1,5 +1,4 @@
 import os
-import gc
 import numpy as np
 import faiss
 import tensorflow as tf
@@ -12,7 +11,6 @@ from typing import Union, Optional, Dict, List, Tuple, Generator
 from transformers import AutoTokenizer
 from typing import List, Tuple, Generator
 from transformers import AutoTokenizer
-from gpu_monitor import GPUMemoryMonitor
 import random

 from logger_config import config_logger
@@ -46,7 +44,6 @@ class TFDataPipeline:
         self.embedding_batch_size = 16 if len(response_pool) < 100 else 64
         self.search_batch_size = 16 if len(response_pool) < 100 else 64
         self.max_batch_size = 16 if len(response_pool) < 100 else 64
-        self.memory_monitor = GPUMemoryMonitor()
         self.max_retries = max_retries

         # Build a quick text->domain map for O(1) domain lookups
unused/build_faiss_index.py ADDED
@@ -0,0 +1,160 @@
+# import os
+# import json
+# from pathlib import Path
+
+# import faiss
+# import numpy as np
+# import tensorflow as tf
+# from transformers import AutoTokenizer, TFAutoModel
+# from tqdm.auto import tqdm
+
+# from chatbot_model import ChatbotConfig, EncoderModel
+# from tf_data_pipeline import TFDataPipeline
+# from logger_config import config_logger
+
+# logger = config_logger(__name__)
+# os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+# def sanity_check(encoder: EncoderModel, tokenizer: AutoTokenizer, config: ChatbotConfig):
+#     """
+#     Perform a quick sanity check to ensure the model is loaded correctly.
+#     """
+#     sample_response = "This is a test response."
+#     encoded_sample = tokenizer(
+#         [sample_response],
+#         padding=True,
+#         truncation=True,
+#         max_length=config.max_context_token_limit,
+#         return_tensors='tf'
+#     )
+
+#     # Get embedding
+#     sample_embedding = encoder(encoded_sample['input_ids'], training=False).numpy()
+
+#     # Check shape
+#     if sample_embedding.shape[1] != config.embedding_dim:
+#         logger.error(
+#             f"Embedding dimension mismatch: Expected {config.embedding_dim}, "
+#             f"got {sample_embedding.shape[1]}"
+#         )
+#         raise ValueError("Embedding dimension mismatch.")
+#     else:
+#         logger.info("Embedding dimension matches the configuration.")
+
+#     # Check normalization
+#     embedding_norm = np.linalg.norm(sample_embedding, axis=1)
+#     if not np.allclose(embedding_norm, 1.0, atol=1e-5):
+#         logger.error("Embeddings are not properly normalized.")
+#         raise ValueError("Embeddings are not normalized.")
+#     else:
+#         logger.info("Embeddings are properly normalized.")
+
+#     logger.info("Sanity check passed: Model loaded correctly and outputs are as expected.")
+
+# def build_faiss_index():
+#     """
+#     Rebuild the FAISS index by:
+#     1) Loading your config.json
+#     2) Initializing encoder + loading submodule & custom weights
+#     3) Loading tokenizer from disk
+#     4) Creating a TFDataPipeline
+#     5) Setting the pipeline's response_pool from a JSON file
+#     6) Using pipeline.compute_and_index_response_embeddings()
+#     7) Saving the FAISS index
+#     """
+#     # Directories
+#     MODELS_DIR = Path("models")
+#     FAISS_DIR = MODELS_DIR / "faiss_indices"
+#     FAISS_INDEX_PATH = FAISS_DIR / "faiss_index_production.index"
+#     RESPONSES_PATH = FAISS_DIR / "faiss_index_production_responses.json"
+#     TOKENIZER_DIR = MODELS_DIR / "tokenizer"
+#     SHARED_ENCODER_DIR = MODELS_DIR / "shared_encoder"
+#     CUSTOM_WEIGHTS_PATH = MODELS_DIR / "encoder_custom_weights.weights.h5"
+
+#     # 1) Load ChatbotConfig
+#     config_path = MODELS_DIR / "config.json"
+#     if config_path.exists():
+#         with open(config_path, "r", encoding="utf-8") as f:
+#             config_dict = json.load(f)
+#         config = ChatbotConfig.from_dict(config_dict)
+#         logger.info(f"Loaded ChatbotConfig from {config_path}")
+#     else:
+#         config = ChatbotConfig()
+#         logger.warning(f"No config.json found at {config_path}. Using default ChatbotConfig.")
+
+#     # 2) Initialize the EncoderModel
+#     encoder = EncoderModel(config=config)
+#     logger.info("EncoderModel instantiated (empty).")
+
+#     # Overwrite the submodule from 'shared_encoder' directory
+#     if SHARED_ENCODER_DIR.exists():
+#         logger.info(f"Loading DistilBERT submodule from {SHARED_ENCODER_DIR}...")
+#         encoder.pretrained = TFAutoModel.from_pretrained(str(SHARED_ENCODER_DIR))
+#         logger.info("Loaded HF submodule into encoder.pretrained.")
+#     else:
+#         logger.warning(f"No shared_encoder directory at {SHARED_ENCODER_DIR}. Using default pretrained model.")
+
+#     # Build model once, then load custom weights (projection, etc.)
+#     dummy_input = tf.zeros((1, config.max_context_token_limit), dtype=tf.int32)
+#     _ = encoder(dummy_input, training=False)  # builds the layers
+
+#     if CUSTOM_WEIGHTS_PATH.exists():
+#         logger.info(f"Loading custom top-level weights from {CUSTOM_WEIGHTS_PATH}")
+#         encoder.load_weights(str(CUSTOM_WEIGHTS_PATH))
+#         logger.info("Custom top-level weights loaded successfully.")
+#     else:
+#         logger.warning(f"Custom weights file not found at {CUSTOM_WEIGHTS_PATH}.")
+
+#     # 3) Load tokenizer
+#     if TOKENIZER_DIR.exists():
+#         logger.info(f"Loading tokenizer from {TOKENIZER_DIR}")
+#         tokenizer = AutoTokenizer.from_pretrained(str(TOKENIZER_DIR))
+#     else:
+#         logger.warning(f"No tokenizer dir at {TOKENIZER_DIR}, falling back to default HF tokenizer.")
+#         tokenizer = AutoTokenizer.from_pretrained(config.pretrained_model)
+
+#     # 4) Quick sanity check
+#     sanity_check(encoder, tokenizer, config)
+
+#     # 5) Prepare a TFDataPipeline
+#     pipeline = TFDataPipeline(
+#         config=config,
+#         tokenizer=tokenizer,
+#         encoder=encoder,
+#         index_file_path=str(FAISS_INDEX_PATH),
+#         response_pool=[],
+#         max_length=config.max_context_token_limit,
+#         query_embeddings_cache={},
+#         neg_samples=config.neg_samples,
+#         index_type='IndexFlatIP',
+#         nlist=100,
+#         max_retries=config.max_retries
+#     )
+
+#     # 6) Load the existing response pool
+#     if not RESPONSES_PATH.exists():
+#         logger.error(f"Response pool JSON file not found at {RESPONSES_PATH}")
+#         raise FileNotFoundError(f"No response pool JSON at {RESPONSES_PATH}")
+
+#     with open(RESPONSES_PATH, "r", encoding="utf-8") as f:
+#         response_pool = json.load(f)
+#     logger.info(f"Loaded {len(response_pool)} responses from {RESPONSES_PATH}")
+
+#     pipeline.response_pool = response_pool  # assign to pipeline
+
+#     # 7) Build (or rebuild) the FAISS index from pipeline method
+#     #    This does all the compute-embeddings + index.add in one place
+#     logger.info("Starting to compute and index response embeddings via TFDataPipeline...")
+#     pipeline.compute_and_index_response_embeddings()
+
+#     # 8) Save the rebuilt FAISS index
+#     pipeline.save_faiss_index(str(FAISS_INDEX_PATH))
+
+#     # Verify
+#     loaded_index = faiss.read_index(str(FAISS_INDEX_PATH))
+#     logger.info(f"Verified the rebuilt FAISS index has {loaded_index.ntotal} vectors.")
+
+#     return loaded_index, pipeline.response_pool
+
+# if __name__ == "__main__":
+#     build_faiss_index()
gpu_monitor.py → unused/gpu_monitor.py RENAMED
@@ -1,17 +1,8 @@
-import numpy as np
 import tensorflow as tf
-import faiss
-import json
-from pathlib import Path
-from typing import List, Dict, Tuple, Optional, Generator
+from typing import List, Dict, Optional
 from dataclasses import dataclass
-import threading
-from queue import Queue
-import gc
-try:
-    from tqdm.notebook import tqdm
-except ImportError:
-    from tqdm import tqdm
+
+from tqdm.auto import tqdm

 @dataclass
 class GPUMemoryStats:
@@ -63,6 +54,6 @@ class GPUMemoryMonitor:
     def can_increase_batch_size(self) -> bool:
         """Check if batch size can be increased based on memory usage."""
         if not self.has_gpu:
-            return True  # Allow increase on CPU
+            return True
         usage = self.get_memory_usage()
         return usage < 0.70
validate_model.py CHANGED
@@ -71,11 +71,6 @@ def validate_chatbot():
         logger.warning("No config.json found. Using default ChatbotConfig.")

     # Load RetrievalChatbot in 'inference' mode using the classmethod
-    # This:
-    #  - Loads shared_encoder submodule
-    #  - Loads encoder_custom_weights.weights.h5
-    #  - Loads tokenizer
-    #  - Prepares the model for inference
     try:
         chatbot = RetrievalChatbot.load_model(load_dir=MODEL_DIR, mode="inference")
         logger.info("RetrievalChatbot loaded in 'inference' mode successfully.")