JoeArmani committed
Commit a763857
Parent(s): 4aec49f

implement chat features

- chatbot_model.py +55 -6
- chatbot_validator.py +46 -45
- response_quality_checker.py +3 -5
- run_chatbot_chat.py +73 -0
- run_chatbot_validation.py +7 -37
chatbot_model.py
CHANGED
@@ -546,6 +546,48 @@ class RetrievalChatbot(DeviceAwareModel):

        return boosted[:top_k]

+    def introduction_message(self) -> None:
+        """Print an introduction message to introduce the chatbot."""
+        print(
+            "\nAssistant: Hello! I'm a simple chatbot assistant. I've been trained to answer "
+            "basic questions about topics including restaurants, movies, ride sharing, coffee, and pizza. "
+            "Please ask me a question and I'll do my best to assist you."
+        )
+
+    def run_interactive_chat(self, quality_checker, show_alternatives=False):
+        """Separate function for interactive chat loop."""
+
+        # Chatbot introduction
+        self.introduction_message()
+
+        # Chat loop
+        while True:
+            try:
+                user_input = input("\nYou: ")
+            except (KeyboardInterrupt, EOFError):
+                print("\nAssistant: Goodbye!")
+                break
+
+            if user_input.lower() in ["quit", "exit", "bye"]:
+                print("\nAssistant: Goodbye!")
+                break
+
+            response, candidates, metrics = self.chat(
+                query=user_input,
+                conversation_history=None,
+                quality_checker=quality_checker,
+                top_k=10
+            )
+
+            print(f"\nAssistant: {response}")
+
+            if show_alternatives and candidates and metrics.get("is_confident", False):
+                print("\nAlternative responses:")
+                for resp, score in candidates[1:4]:
+                    print(f"  Score: {score:.4f} - {resp}")
+            else:
+                print("\n[Low Confidence]: Consider rephrasing your query for better assistance.")
+
    def chat(
        self,
        query: str,

@@ -562,7 +604,7 @@ class RetrievalChatbot(DeviceAwareModel):
            conversation_str = self_arg._build_conversation_context(query_arg, conversation_history)

            # Retrieve and re-rank
+            responses = self_arg.retrieve_responses(
                query=conversation_str,
                top_k=top_k,
                reranker=self_arg.reranker,

@@ -571,13 +613,20 @@ class RetrievalChatbot(DeviceAwareModel):
            )

            # Handle low confidence or empty responses
+            if not responses:
                return ("I'm sorry, but I couldn't find a relevant response.", [], {})

+            # Analyze is_confident and computed score when returning the top response
+            metrics = quality_checker.check_response_quality(query_arg, responses)
+            is_confident = metrics.get('is_confident', False)
+            top_response_score = responses[0][1]
+
+            # if uncertain, ask for clarification
+            if not is_confident or top_response_score < 0.5:
+                return ("I need more information to provide a good answer. Could you please clarify?", responses, metrics)
+
+            # Return the top response
+            return responses[0][0], responses, metrics

        return get_response(self, query)
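For orientation, a minimal usage sketch of the updated chat() API. This is not part of the commit: the example query is made up, and a loaded RetrievalChatbot plus a ResponseQualityChecker built on its data pipeline are assumed (as in run_chatbot_chat.py below).

    # Hedged sketch, not from the repo. Assumes `chatbot` and `quality_checker` are already
    # loaded the same way run_chatbot_chat.py loads them; the query string is hypothetical.
    response, candidates, metrics = chatbot.chat(
        query="Can you recommend a pizza place nearby?",
        conversation_history=None,
        quality_checker=quality_checker,
        top_k=10,
    )
    print(response)                               # top response, clarification prompt, or fallback
    print(metrics.get("is_confident", False))     # confidence flag from the quality checker
    print(candidates[:3])                         # (response_text, score) pairs from retrieval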
chatbot_validator.py
CHANGED
@@ -12,7 +12,7 @@ class ChatbotValidator:
    Handles automated validation and performance analysis for the chatbot.
    This testing module executes domain-specific queries, obtains chatbot responses, and evaluates them with a quality checker.
    """
+
    def __init__(self, chatbot, quality_checker):
        """
        Initialize the validator.

@@ -22,7 +22,7 @@ class ChatbotValidator:
        """
        self.chatbot = chatbot
        self.quality_checker = quality_checker
+
        # Domain-specific test queries (aligns with Taskmaster-1 dataset)
        self.domain_queries = {
            'restaurant': [

@@ -56,7 +56,7 @@ class ChatbotValidator:
                "My Jeep needs a repair. Can you help me with that?",
            ],
        }
+
    def run_validation(
        self,
        num_examples: int = 3,

@@ -77,64 +77,65 @@ class ChatbotValidator:
            Dict with validation metrics
        """
        logger.info("\n=== Running Automatic Validation ===")
+
        # Select which domains to test
        test_domains = domains if domains else list(self.domain_queries.keys())
+
        # Initialize results
        metrics_history = []
        domain_metrics = {}

        # Init the cross-encoder reranker to pass to the chatbot
        reranker = CrossEncoderReranker(model_name=self.chatbot.config.cross_encoder_model)
+
        # Prepare random selection if needed
        rng = random.Random(seed)
+
        # Run validation for each domain
        for domain in test_domains:
            # Avoid errors if domain key missing
            if domain not in self.domain_queries:
                logger.warning(f"Domain '{domain}' not found in domain_queries. Skipping.")
                continue
+
            all_queries = self.domain_queries[domain]
            if randomize:
                queries = rng.sample(all_queries, min(num_examples, len(all_queries)))
            else:
                queries = all_queries[:num_examples]
+
            # Store domain-level metrics
            domain_metrics[domain] = []
+
+            logger.info(f"\n=== Testing {domain.title()} Domain ===\n")
+
            for i, query in enumerate(queries, 1):
+                logger.info(f"TEST CASE {i}: QUERY: {query}")
+
                # Retrieve top_k responses, then evaluate with quality checker
                responses = self.chatbot.retrieve_responses(query, top_k=top_k, reranker=reranker)
                quality_metrics = self.quality_checker.check_response_quality(query, responses)
+
                # Aggregate metrics and log
                quality_metrics['domain'] = domain
                metrics_history.append(quality_metrics)
                domain_metrics[domain].append(quality_metrics)
                self._log_validation_results(query, responses, quality_metrics)
+                logger.info(f"Quality metrics: {quality_metrics}\n")
+
        # Final aggregation
        aggregate_metrics = self._calculate_aggregate_metrics(metrics_history)
        domain_analysis = self._analyze_domain_performance(domain_metrics)
        confidence_analysis = self._analyze_confidence_distribution(metrics_history)
+
        aggregate_metrics.update({
            'domain_performance': domain_analysis,
            'confidence_analysis': confidence_analysis
        })
+
        self._log_validation_summary(aggregate_metrics)
        return aggregate_metrics
+
    def _calculate_aggregate_metrics(self, metrics_history: List[Dict]) -> Dict[str, float]:
        """
        Calculate aggregate metrics over tested queries.

@@ -142,9 +143,9 @@ class ChatbotValidator:
        if not metrics_history:
            logger.warning("No metrics to aggregate. Returning empty summary.")
            return {}
+
        top_scores = [m.get('top_score', 0.0) for m in metrics_history]
+
        metrics = {
            'num_queries_tested': len(metrics_history),
            'avg_top_response_score': np.mean(top_scores),

@@ -159,20 +160,20 @@ class ChatbotValidator:
            'max_score': np.max(top_scores)
        }
        return metrics
+
    def _analyze_domain_performance(self, domain_metrics: Dict[str, List[Dict]]) -> Dict[str, Dict[str, float]]:
        """
        Analyze performance by domain, returning a nested dict.
        """
        analysis = {}
+
        for domain, metrics_list in domain_metrics.items():
            if not metrics_list:
                analysis[domain] = {}
                continue
+
            top_scores = [m.get('top_score', 0.0) for m in metrics_list]
+
            analysis[domain] = {
                'confidence_rate': np.mean([1.0 if m.get('is_confident', False) else 0.0 for m in metrics_list]),
                'avg_relevance': np.mean([m.get('query_response_relevance', 0.0) for m in metrics_list]),

@@ -180,9 +181,9 @@ class ChatbotValidator:
                'avg_top_score': np.mean(top_scores),
                'num_samples': len(metrics_list)
            }
+
        return analysis
+
    def _analyze_confidence_distribution(self, metrics_history: List[Dict]) -> Dict[str, float]:
        """
        Analyze the distribution of top scores to gauge system confidence levels.

@@ -190,7 +191,7 @@ class ChatbotValidator:
        if not metrics_history:
            return {'percentile_25': 0.0, 'percentile_50': 0.0,
                    'percentile_75': 0.0, 'percentile_90': 0.0}
+
        scores = [m.get('top_score', 0.0) for m in metrics_history]
        return {
            'percentile_25': float(np.percentile(scores, 25)),

@@ -198,7 +199,7 @@ class ChatbotValidator:
            'percentile_75': float(np.percentile(scores, 75)),
            'percentile_90': float(np.percentile(scores, 90))
        }
+
    def _log_validation_results(
        self,
        query: str,

@@ -210,19 +211,18 @@ class ChatbotValidator:
        """
        domain = metrics.get('domain', 'Unknown')
        is_confident = metrics.get('is_confident', False)
+
+        logger.info(f"DOMAIN: {domain} | CONFIDENCE: {'Yes' if is_confident else 'No'}")
+
+        if is_confident or responses[0][1] >= 0.5:
+            logger.info(f"SELECTED RESPONSE: '{responses[0][0]}'")
+        else:
+            logger.info("SELECTED RESPONSE: NONE (Low Confidence)")
+
+        logger.info(" Top 3 Responses:")
        for i, (resp_text, score) in enumerate(responses[:3], 1):
+            logger.info(f"  {i}) Score: {score:.4f} | {resp_text}")
+
    def _log_validation_summary(self, metrics: Dict[str, Any]):
        """
        Log a summary of all validation metrics and domain performance.

@@ -230,16 +230,16 @@ class ChatbotValidator:
        if not metrics:
            logger.info("No metrics to summarize.")
            return
+
        logger.info("\n=== Validation Summary ===")
+
        # Overall
        logger.info("\nOverall Metrics:")
        for metric, value in metrics.items():
            # Skip sub-dicts here
            if isinstance(value, (int, float)):
                logger.info(f"{metric}: {value:.4f}")
+
        # Domain performance
        domain_perf = metrics.get('domain_performance', {})
        logger.info("\nDomain Performance:")

@@ -247,9 +247,10 @@ class ChatbotValidator:
            logger.info(f"\n{domain.title()}:")
            for metric, value in domain_stats.items():
                logger.info(f"  {metric}: {value:.4f}")
+
        # Confidence distribution
        conf_analysis = metrics.get('confidence_analysis', {})
        logger.info("\nConfidence Distribution:")
        for pct, val in conf_analysis.items():
            logger.info(f"  {pct}: {val:.4f}")
+
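A hedged sketch of how the validator might be driven after this change. The chatbot and quality checker are assumed to be loaded already (as in run_chatbot_validation.py), and keyword names other than num_examples are inferred from the function body above, so they may differ slightly.

    # Assumed setup: chatbot and quality_checker already initialized.
    validator = ChatbotValidator(chatbot, quality_checker)
    # 'restaurant' is one of the domain keys shown in the diff; other keyword names are inferred.
    results = validator.run_validation(num_examples=3, domains=["restaurant"])
    print(results.get("avg_top_response_score"))
    print(results.get("domain_performance"))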
response_quality_checker.py
CHANGED
@@ -18,10 +18,10 @@ class ResponseQualityChecker:
    def __init__(
        self,
        data_pipeline: "TFDataPipeline",
+        confidence_threshold: float = 0.45,
+        diversity_threshold: float = 0.10,
        min_response_length: int = 5,
+        similarity_cap: float = 0.90
    ):
        """
        Args:

@@ -74,10 +74,8 @@ class ResponseQualityChecker:
        metrics['response_length_score'] = self._calc_length_score(responses)
        metrics['top_score'] = responses[0][1]
        metrics['top_3_score_gap'] = self._calc_score_gap([score for _, score in responses])
-
        metrics['is_confident'] = self._determine_confidence(metrics)

-        logger.info(f"Quality metrics: {metrics}")
        return metrics

    def _calc_diversity(self, responses: List[Tuple[str, float]]) -> float:
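The quality thresholds now have concrete defaults. A small construction sketch mirroring them (the data pipeline is assumed to come from an already-loaded chatbot, as in the scripts below; this is illustrative, not part of the commit):

    # Sketch only: values mirror the new defaults in this diff; chatbot.data_pipeline is assumed.
    quality_checker = ResponseQualityChecker(
        data_pipeline=chatbot.data_pipeline,
        confidence_threshold=0.45,   # new default
        diversity_threshold=0.10,    # new default
        min_response_length=5,
        similarity_cap=0.90,         # new default
    )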
run_chatbot_chat.py
ADDED
@@ -0,0 +1,73 @@
import os
import json
from chatbot_model import ChatbotConfig, RetrievalChatbot
from response_quality_checker import ResponseQualityChecker
from environment_setup import EnvironmentSetup
from logger_config import config_logger

logger = config_logger(__name__)

def run_chatbot_chat():
    env = EnvironmentSetup()
    env.initialize()

    MODEL_DIR = "models"
    FAISS_INDICES_DIR = os.path.join(MODEL_DIR, "faiss_indices")
    FAISS_INDEX_PRODUCTION_PATH = os.path.join(FAISS_INDICES_DIR, "faiss_index_production.index")
    FAISS_INDEX_TEST_PATH = os.path.join(FAISS_INDICES_DIR, "faiss_index_test.index")

    # Toggle 'production' or 'test' env
    ENVIRONMENT = "production"
    if ENVIRONMENT == "test":
        FAISS_INDEX_PATH = FAISS_INDEX_TEST_PATH
        RESPONSE_POOL_PATH = FAISS_INDEX_TEST_PATH.replace(".index", "_responses.json")
    else:
        FAISS_INDEX_PATH = FAISS_INDEX_PRODUCTION_PATH
        RESPONSE_POOL_PATH = FAISS_INDEX_PRODUCTION_PATH.replace(".index", "_responses.json")

    # Load the config
    config_path = os.path.join(MODEL_DIR, "config.json")
    if os.path.exists(config_path):
        with open(config_path, "r", encoding="utf-8") as f:
            config_dict = json.load(f)
        config = ChatbotConfig.from_dict(config_dict)
        logger.info(f"Loaded ChatbotConfig from {config_path}")
    else:
        config = ChatbotConfig()
        logger.warning("No config.json found. Using default ChatbotConfig.")

    # Load RetrievalChatbot in 'inference' mode
    try:
        chatbot = RetrievalChatbot.load_model(load_dir=MODEL_DIR, mode="inference")
    except Exception as e:
        logger.error(f"Failed to load RetrievalChatbot: {e}")
        return

    # Confirm FAISS index & response pool exist
    if not os.path.exists(FAISS_INDEX_PATH) or not os.path.exists(RESPONSE_POOL_PATH):
        logger.error("FAISS index or response pool file is missing.")
        return

    # Load FAISS index and response pool
    try:
        chatbot.data_pipeline.load_faiss_index(FAISS_INDEX_PATH)

        with open(RESPONSE_POOL_PATH, "r", encoding="utf-8") as f:
            chatbot.data_pipeline.response_pool = json.load(f)
        logger.info(f"FAISS index loaded from {FAISS_INDEX_PATH}.")

        # Validate dimension consistency
        chatbot.data_pipeline.validate_faiss_index()

    except Exception as e:
        logger.error(f"Failed to load or validate FAISS index: {e}")
        return

    # Init QualityChecker and Validator
    quality_checker = ResponseQualityChecker(data_pipeline=chatbot.data_pipeline)

    # Run interactive chat loop
    logger.info("\nStarting interactive chat session...")
    chatbot.run_interactive_chat(quality_checker)

if __name__ == "__main__":
    run_chatbot_chat()
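Since the script is guarded by if __name__ == "__main__":, the interactive session can be launched directly (for example, python run_chatbot_chat.py); flipping ENVIRONMENT to "test" points the loader at the test FAISS index and response pool instead of the production files.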
run_chatbot_validation.py
CHANGED
@@ -8,36 +8,6 @@ from environment_setup import EnvironmentSetup
from logger_config import config_logger

logger = config_logger(__name__)
-
-def run_interactive_chat(chatbot, quality_checker):
-    """Separate function for interactive chat loop."""
-    while True:
-        try:
-            user_input = input("You: ")
-        except (KeyboardInterrupt, EOFError):
-            print("\nAssistant: Goodbye!")
-            break
-
-        if user_input.lower() in ["quit", "exit", "bye"]:
-            print("Assistant: Goodbye!")
-            break
-
-        response, candidates, metrics = chatbot.chat(
-            query=user_input,
-            conversation_history=None,
-            quality_checker=quality_checker,
-            top_k=10
-        )
-
-        print(f"Assistant: {response}")
-
-        # Show alternative responses if confident
-        if metrics.get("is_confident", False):
-            print("\nAlternative responses:")
-            for resp, score in candidates[1:4]:
-                print(f"Score: {score:.4f} - {resp}")
-        else:
-            print("\n[Low Confidence]: Consider rephrasing your query for better assistance.")

def run_chatbot_validation():
    # Initialize environment

@@ -118,16 +88,16 @@ def run_chatbot_validation():
        return

    # Plot metrics
+    try:
+        plotter = Plotter(save_dir=env.training_dirs["plots"])
+        plotter.plot_validation_metrics(validation_metrics)
+        logger.info("Validation metrics plotted successfully.")
+    except Exception as e:
+        logger.error(f"Failed to plot validation metrics: {e}")

    # Run interactive chat loop
    logger.info("\nStarting interactive chat session...")
+    chatbot.run_interactive_chat(quality_checker, show_alternatives=True)

if __name__ == "__main__":
    run_chatbot_validation()