import numpy as np
from typing import List, Tuple, Dict, Any, TYPE_CHECKING
from sklearn.metrics.pairwise import cosine_similarity

from logger_config import config_logger
logger = config_logger(__name__)

if TYPE_CHECKING:
    from tf_data_pipeline import TFDataPipeline

class ResponseQualityChecker:
    """Enhanced quality checking with dynamic thresholds."""

    def __init__(
        self,
        data_pipeline: 'TFDataPipeline',
        confidence_threshold: float = 0.6,
        diversity_threshold: float = 0.15,
        min_response_length: int = 5,
        similarity_cap: float = 0.85  # Renamed from max_similarity_ratio and used in diversity calc
    ):
        self.confidence_threshold = confidence_threshold
        self.diversity_threshold = diversity_threshold
        self.min_response_length = min_response_length
        self.similarity_cap = similarity_cap
        self.data_pipeline = data_pipeline  # Reference to TFDataPipeline

        # Dynamic thresholds based on response patterns
        self.thresholds = {
            'relevance': 0.35,
            'length_score': 0.85,
            'score_gap': 0.07
        }

    def check_response_quality(
        self,
        query: str,
        responses: List[Tuple[str, float]]
    ) -> Dict[str, Any]:
        """
        Evaluate the quality of responses based on various metrics.

        Args:
            query: The user's query
            responses: List of (response_text, score) tuples

        Returns:
            Dict containing quality metrics and confidence assessment
        """
        if not responses:
            return {
                'response_diversity': 0.0,
                'query_response_relevance': 0.0,
                'is_confident': False,
                'top_score': 0.0,
                'response_length_score': 0.0,
                'top_3_score_gap': 0.0
            }

        # Calculate core metrics
        metrics = {
            'response_diversity': self.calculate_diversity(responses),
            'query_response_relevance': self.calculate_relevance(query, responses),
            'response_length_score': np.mean([
                self._calculate_length_score(response) for response, _ in responses
            ]),
            'top_score': responses[0][1],
            'top_3_score_gap': self._calculate_score_gap([score for _, score in responses], top_n=3)
        }

        # Determine confidence using thresholds
        metrics['is_confident'] = self._determine_confidence(metrics)

        logger.info(f"Quality metrics: {metrics}")
        return metrics

    def calculate_relevance(self, query: str, responses: List[Tuple[str, float]]) -> float:
        """Calculate relevance as weighted similarity between query and responses."""
        if not responses:
            return 0.0

        # Get embeddings
        query_embedding = self.data_pipeline.encode_query(query)
        response_texts = [resp for resp, _ in responses]
        response_embeddings = self.data_pipeline.encode_responses(response_texts)

        # Compute similarities
        similarities = cosine_similarity([query_embedding], response_embeddings)[0]

        # Apply decreasing weights for later responses
        weights = np.array([1.0 / (i + 1) for i in range(len(similarities))])

        return np.average(similarities, weights=weights)

    def calculate_diversity(self, responses: List[Tuple[str, float]]) -> float:
        """Calculate diversity with length normalization and similarity capping."""
        if not responses:
            return 0.0

        response_texts = [resp for resp, _ in responses]
        embeddings = self.data_pipeline.encode_responses(response_texts)
        if len(embeddings) < 2:
            return 1.0

        # Calculate pairwise cosine similarities
        similarity_matrix = cosine_similarity(embeddings)
        np.fill_diagonal(similarity_matrix, 0)  # Exclude self-similarity

        # Apply similarity cap
        similarity_matrix = np.minimum(similarity_matrix, self.similarity_cap)

        # Calculate average similarity
        sum_similarities = np.sum(similarity_matrix)
        num_pairs = len(embeddings) * (len(embeddings) - 1)
        avg_similarity = sum_similarities / num_pairs if num_pairs > 0 else 0.0

        # Diversity is inversely related to average similarity
        diversity_score = 1 - avg_similarity
        return diversity_score

    def _determine_confidence(self, metrics: Dict[str, float]) -> bool:
        """Determine confidence using primary and secondary conditions."""
        # Primary conditions (must all be met)
        primary_conditions = [
            metrics['top_score'] >= self.confidence_threshold,
            metrics['response_diversity'] >= self.diversity_threshold,
            metrics['response_length_score'] >= self.thresholds['length_score']
        ]

        # Secondary conditions (majority must be met)
        secondary_conditions = [
            metrics['query_response_relevance'] >= self.thresholds['relevance'],
            metrics['top_3_score_gap'] >= self.thresholds['score_gap'],
            metrics['top_score'] >= (self.confidence_threshold * 1.1)  # Extra confidence boost
        ]

        return all(primary_conditions) and sum(secondary_conditions) >= 2

    def _calculate_length_score(self, response: str) -> float:
        """Calculate length score with penalty for very short or long responses."""
        words = len(response.split())

        if words < self.min_response_length:
            return words / self.min_response_length
        elif words > 50:  # Penalty for very long responses
            return min(1.0, 50 / words)
        return 1.0

    def _calculate_score_gap(self, scores: List[float], top_n: int = 3) -> float:
        """Calculate average gap between top N scores."""
        if len(scores) < top_n + 1:
            return 0.0
        gaps = [scores[i] - scores[i + 1] for i in range(min(len(scores) - 1, top_n))]
        return np.mean(gaps)
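

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): a minimal example of driving the checker.
# `_StubPipeline` is a hypothetical stand-in for TFDataPipeline; it exposes
# only the encode_query / encode_responses interface the checker relies on,
# producing deterministic pseudo-embeddings instead of real encoder output.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    class _StubPipeline:
        """Hypothetical stand-in for TFDataPipeline (illustration only)."""

        def __init__(self, dim: int = 64):
            self._dim = dim

        def _embed(self, text: str) -> np.ndarray:
            # Deterministic pseudo-embedding seeded from the text contents.
            seed = sum(ord(c) for c in text) % (2 ** 32)
            vec = np.random.default_rng(seed).normal(size=self._dim)
            return vec / np.linalg.norm(vec)

        def encode_query(self, query: str) -> np.ndarray:
            return self._embed(query)

        def encode_responses(self, texts: List[str]) -> np.ndarray:
            return np.stack([self._embed(t) for t in texts])

    checker = ResponseQualityChecker(data_pipeline=_StubPipeline())
    candidate_responses = [
        ("You can reset your password from the account settings page.", 0.72),
        ("Password resets are handled under Settings > Security.", 0.64),
        ("Try turning the device off and on again first.", 0.41),
        ("I am not sure about that, sorry.", 0.22),
    ]
    metrics = checker.check_response_quality(
        query="How do I reset my password?",
        responses=candidate_responses,
    )
    print(metrics)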