import numpy as np
from typing import List, Tuple, Dict, Any, TYPE_CHECKING
from sklearn.metrics.pairwise import cosine_similarity

from logger_config import config_logger

logger = config_logger(__name__)

if TYPE_CHECKING:
    from tf_data_pipeline import TFDataPipeline


class ResponseQualityChecker:
    """Enhanced quality checking with dynamic thresholds."""

    def __init__(
        self,
        data_pipeline: 'TFDataPipeline',
        confidence_threshold: float = 0.6,
        diversity_threshold: float = 0.15,
        min_response_length: int = 5,
        similarity_cap: float = 0.85  # Renamed from max_similarity_ratio; used in the diversity calc
    ):
        self.confidence_threshold = confidence_threshold
        self.diversity_threshold = diversity_threshold
        self.min_response_length = min_response_length
        self.similarity_cap = similarity_cap
        self.data_pipeline = data_pipeline  # Reference to TFDataPipeline

        # Fixed thresholds for the secondary confidence checks
        self.thresholds = {
            'relevance': 0.35,
            'length_score': 0.85,
            'score_gap': 0.07
        }

    def check_response_quality(
        self,
        query: str,
        responses: List[Tuple[str, float]]
    ) -> Dict[str, Any]:
        """
        Evaluate the quality of responses based on various metrics.

        Args:
            query: The user's query.
            responses: List of (response_text, score) tuples, assumed sorted by score descending.

        Returns:
            Dict containing quality metrics and a confidence assessment.
        """
        if not responses:
            return {
                'response_diversity': 0.0,
                'query_response_relevance': 0.0,
                'is_confident': False,
                'top_score': 0.0,
                'response_length_score': 0.0,
                'top_3_score_gap': 0.0
            }

        # Calculate core metrics
        metrics = {
            'response_diversity': self.calculate_diversity(responses),
            'query_response_relevance': self.calculate_relevance(query, responses),
            'response_length_score': np.mean([
                self._calculate_length_score(response) for response, _ in responses
            ]),
            'top_score': responses[0][1],
            'top_3_score_gap': self._calculate_score_gap([score for _, score in responses], top_n=3)
        }

        # Determine confidence using thresholds
        metrics['is_confident'] = self._determine_confidence(metrics)
        logger.info(f"Quality metrics: {metrics}")
        return metrics

    def calculate_relevance(self, query: str, responses: List[Tuple[str, float]]) -> float:
        """Calculate relevance as the weighted similarity between the query and responses."""
        if not responses:
            return 0.0

        # Get embeddings
        query_embedding = self.data_pipeline.encode_query(query)
        response_texts = [resp for resp, _ in responses]
        response_embeddings = self.data_pipeline.encode_responses(response_texts)

        # Compute query-to-response cosine similarities
        similarities = cosine_similarity([query_embedding], response_embeddings)[0]

        # Apply harmonically decreasing weights (1, 1/2, 1/3, ...) so
        # higher-ranked responses dominate the average
        weights = np.array([1.0 / (i + 1) for i in range(len(similarities))])
        return float(np.average(similarities, weights=weights))

    def calculate_diversity(self, responses: List[Tuple[str, float]]) -> float:
        """Calculate diversity as one minus the capped average pairwise similarity."""
        if not responses:
            return 0.0

        response_texts = [resp for resp, _ in responses]
        embeddings = self.data_pipeline.encode_responses(response_texts)
        if len(embeddings) < 2:
            return 1.0

        # Calculate pairwise cosine similarities
        similarity_matrix = cosine_similarity(embeddings)
        np.fill_diagonal(similarity_matrix, 0)  # Exclude self-similarity

        # Apply similarity cap
        similarity_matrix = np.minimum(similarity_matrix, self.similarity_cap)

        # Average similarity over all ordered pairs (diagonal excluded)
        sum_similarities = np.sum(similarity_matrix)
        num_pairs = len(embeddings) * (len(embeddings) - 1)
        avg_similarity = sum_similarities / num_pairs if num_pairs > 0 else 0.0

        # Diversity is inversely related to average similarity
        return 1.0 - avg_similarity
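    # Note: capping each pairwise similarity at similarity_cap (0.85 by
    # default) bounds avg_similarity above by the cap, so the diversity
    # score never falls below 1 - similarity_cap = 0.15, which equals the
    # default diversity_threshold.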

    def _determine_confidence(self, metrics: Dict[str, float]) -> bool:
        """Determine confidence using primary and secondary conditions."""
        # Primary conditions (must all be met)
        primary_conditions = [
            metrics['top_score'] >= self.confidence_threshold,
            metrics['response_diversity'] >= self.diversity_threshold,
            metrics['response_length_score'] >= self.thresholds['length_score']
        ]

        # Secondary conditions (majority must be met)
        secondary_conditions = [
            metrics['query_response_relevance'] >= self.thresholds['relevance'],
            metrics['top_3_score_gap'] >= self.thresholds['score_gap'],
            metrics['top_score'] >= (self.confidence_threshold * 1.1)  # Extra confidence boost
        ]

        return all(primary_conditions) and sum(secondary_conditions) >= 2
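    # Worked example with the defaults: top_score=0.65, diversity=0.20, and
    # length_score=0.90 pass all three primary conditions; relevance=0.40
    # and score_gap=0.08 pass two of the three secondary conditions (the
    # boosted-score check fails, since 0.65 < 0.6 * 1.1), so the method
    # returns True.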

    def _calculate_length_score(self, response: str) -> float:
        """Calculate length score with a penalty for very short or very long responses."""
        words = len(response.split())
        if words < self.min_response_length:
            return words / self.min_response_length
        elif words > 50:  # Penalty for very long responses
            return min(1.0, 50 / words)
        return 1.0
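    # For example, with min_response_length=5: a 3-word response scores
    # 3/5 = 0.6, a 100-word response scores 50/100 = 0.5, and anything
    # from 5 to 50 words scores 1.0.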

    def _calculate_score_gap(self, scores: List[float], top_n: int = 3) -> float:
        """Calculate the average gap between the top N scores."""
        if len(scores) < top_n + 1:
            return 0.0
        gaps = [scores[i] - scores[i + 1] for i in range(min(len(scores) - 1, top_n))]
        return float(np.mean(gaps))
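

# --- Usage sketch ------------------------------------------------------------
# A minimal smoke test, assuming only the two pipeline hooks this class
# actually calls: encode_query() and encode_responses(). _StubPipeline is a
# hypothetical stand-in for TFDataPipeline; in practice the real pipeline
# supplies trained embeddings.
if __name__ == "__main__":

    class _StubPipeline:
        """Hypothetical stand-in that returns random fixed-dimension embeddings."""

        def __init__(self, dim: int = 8, seed: int = 0):
            self._rng = np.random.default_rng(seed)
            self._dim = dim

        def encode_query(self, query: str) -> np.ndarray:
            return self._rng.random(self._dim)

        def encode_responses(self, texts: List[str]) -> np.ndarray:
            return self._rng.random((len(texts), self._dim))

    checker = ResponseQualityChecker(data_pipeline=_StubPipeline())
    ranked_responses = [
        ("You can reset your password from the account settings page.", 0.72),
        ("Password resets are handled under account settings.", 0.61),
        ("Try logging out and logging back in first.", 0.48),
        ("Contact support if the problem persists.", 0.40),
    ]
    metrics = checker.check_response_quality("How do I reset my password?", ranked_responses)
    print(f"Confident: {metrics['is_confident']}, top score: {metrics['top_score']}")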