import numpy as np
from typing import List, Tuple, Dict, Any, TYPE_CHECKING
from sklearn.metrics.pairwise import cosine_similarity

from logger_config import config_logger

logger = config_logger(__name__)

if TYPE_CHECKING:
    from chatbot_model import RetrievalChatbot


class ResponseQualityChecker:
    """Enhanced quality checking with dynamic thresholds."""

    def __init__(
        self,
        chatbot: 'RetrievalChatbot',
        confidence_threshold: float = 0.6,
        diversity_threshold: float = 0.15,
        min_response_length: int = 5,
        similarity_cap: float = 0.85  # Caps pairwise similarities in the diversity calculation (formerly max_similarity_ratio)
    ):
        self.confidence_threshold = confidence_threshold
        self.diversity_threshold = diversity_threshold
        self.min_response_length = min_response_length
        self.similarity_cap = similarity_cap
        self.chatbot = chatbot

        # Dynamic thresholds based on response patterns
        self.thresholds = {
            'relevance': 0.35,
            'length_score': 0.85,
            'score_gap': 0.07
        }

    def check_response_quality(
        self,
        query: str,
        responses: List[Tuple[str, float]]
    ) -> Dict[str, Any]:
        """
        Evaluate the quality of responses based on various metrics.

        Args:
            query: The user's query
            responses: List of (response_text, score) tuples

        Returns:
            Dict containing quality metrics and confidence assessment
        """
        if not responses:
            return {
                'response_diversity': 0.0,
                'query_response_relevance': 0.0,
                'is_confident': False,
                'top_score': 0.0,
                'response_length_score': 0.0,
                'top_3_score_gap': 0.0
            }

        # Calculate core metrics
        metrics = {
            'response_diversity': self.calculate_diversity(responses),
            'query_response_relevance': self.calculate_relevance(query, responses),
            'response_length_score': np.mean([
                self._calculate_length_score(response) for response, _ in responses
            ]),
            'top_score': responses[0][1],
            'top_3_score_gap': self._calculate_score_gap([score for _, score in responses], top_n=3)
        }

        # Determine confidence using thresholds
        metrics['is_confident'] = self._determine_confidence(metrics)

        logger.info(f"Quality metrics: {metrics}")
        return metrics
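
    # Example (illustrative values only): for a reasonably well-matched query,
    # check_response_quality might return something like
    #   {'response_diversity': 0.42, 'query_response_relevance': 0.51,
    #    'response_length_score': 0.9, 'top_score': 0.68,
    #    'top_3_score_gap': 0.08, 'is_confident': True}
    # where is_confident reflects the threshold logic in _determine_confidence below.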

    def calculate_relevance(self, query: str, responses: List[Tuple[str, float]]) -> float:
        """Calculate relevance as weighted similarity between query and responses."""
        if not responses:
            return 0.0

        # Get embeddings
        query_embedding = self.encode_query(query)
        response_embeddings = [self.encode_text(response) for response, _ in responses]

        # Compute similarities with decreasing weights for later responses
        similarities = cosine_similarity([query_embedding], response_embeddings)[0]
        weights = np.array([1.0 / (i + 1) for i in range(len(similarities))])
        return np.average(similarities, weights=weights)
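
    # Worked example (hypothetical numbers): with cosine similarities of
    # [0.8, 0.6, 0.4] against the top three responses, the rank weights are
    # [1, 1/2, 1/3], so relevance = (0.8*1 + 0.6*0.5 + 0.4/3) / (1 + 0.5 + 1/3) ≈ 0.67;
    # the top-ranked response dominates the weighted average.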

    def calculate_diversity(self, responses: List[Tuple[str, float]]) -> float:
        """Calculate diversity with length normalization and similarity capping."""
        if not responses:
            return 0.0

        embeddings = [self.encode_text(response) for response, _ in responses]
        if len(embeddings) < 2:
            return 1.0

        # Calculate pairwise similarities and cap them to limit the impact of near-duplicates
        similarity_matrix = cosine_similarity(embeddings)
        similarity_matrix = np.minimum(similarity_matrix, self.similarity_cap)

        # Apply length normalization: pairs with similar word counts score closer to 1
        lengths = [len(resp[0].split()) for resp in responses]
        length_ratios = np.array([min(a, b) / max(a, b) for a in lengths for b in lengths])
        length_ratios = length_ratios.reshape(len(responses), len(responses))

        # Combine factors with weights
        adjusted_similarity = (similarity_matrix * 0.7 + length_ratios * 0.3)

        # Average over off-diagonal pairs only; subtract the trace rather than
        # len(responses) because capping and weighting leave diagonal entries below 1.0
        sum_similarities = np.sum(adjusted_similarity) - np.trace(adjusted_similarity)
        num_pairs = len(responses) * (len(responses) - 1)
        avg_similarity = sum_similarities / num_pairs if num_pairs > 0 else 0.0

        return 1 - avg_similarity
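
    # Worked example (hypothetical numbers): for two responses with raw cosine
    # similarity 0.9 (capped at 0.85) and a length ratio of 0.8, the adjusted
    # similarity is 0.85*0.7 + 0.8*0.3 = 0.835, giving a diversity score of
    # 1 - 0.835 = 0.165, just above the default diversity_threshold of 0.15.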

    def _determine_confidence(self, metrics: Dict[str, float]) -> bool:
        """Determine confidence using primary and secondary conditions."""
        # Primary conditions (all must be met)
        primary_conditions = [
            metrics['top_score'] >= self.confidence_threshold,
            metrics['response_diversity'] >= self.diversity_threshold,
            metrics['response_length_score'] >= self.thresholds['length_score']
        ]

        # Secondary conditions (at least two of the three must be met)
        secondary_conditions = [
            metrics['query_response_relevance'] >= self.thresholds['relevance'],
            metrics['top_3_score_gap'] >= self.thresholds['score_gap'],
            metrics['top_score'] >= (self.confidence_threshold * 1.1)  # Stricter margin above the base threshold
        ]

        return all(primary_conditions) and sum(secondary_conditions) >= 2

    def _calculate_length_score(self, response: str) -> float:
        """Calculate length score with penalty for very short or long responses."""
        words = len(response.split())
        if words < self.min_response_length:
            return words / self.min_response_length
        elif words > 50:  # Penalty for very long responses
            return min(1.0, 50 / words)
        return 1.0

    def _calculate_score_gap(self, scores: List[float], top_n: int = 3) -> float:
        """Calculate average gap between top N scores."""
        if len(scores) < top_n + 1:
            return 0.0
        gaps = [scores[i] - scores[i + 1] for i in range(min(len(scores) - 1, top_n))]
        return np.mean(gaps)

    def encode_text(self, text: str) -> np.ndarray:
        """Encode response text to embedding."""
        embedding_tensor = self.chatbot.encode_responses([text])
        embedding = embedding_tensor.numpy()[0].astype('float32')
        return self._normalize_embedding(embedding)

    def encode_query(self, query: str) -> np.ndarray:
        """Encode query text to embedding."""
        embedding_tensor = self.chatbot.encode_query(query)
        embedding = embedding_tensor.numpy()[0].astype('float32')
        return self._normalize_embedding(embedding)

    def _normalize_embedding(self, embedding: np.ndarray) -> np.ndarray:
        """Normalize embedding vector."""
        norm = np.linalg.norm(embedding)
        return embedding / norm if norm > 0 else embedding
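

# Minimal usage sketch, assuming a trained RetrievalChatbot instance and a
# retrieval method returning (response_text, score) tuples; the
# retrieve_responses name and the default construction below are assumptions
# for illustration, not part of this module.
if __name__ == "__main__":
    from chatbot_model import RetrievalChatbot

    chatbot = RetrievalChatbot()  # assumed construction; adjust to the real constructor
    checker = ResponseQualityChecker(chatbot)

    query = "How do I reset my password?"
    responses = chatbot.retrieve_responses(query, top_k=5)  # hypothetical retrieval call

    metrics = checker.check_response_quality(query, responses)
    if metrics['is_confident']:
        print(f"Top response: {responses[0][0]} (score={metrics['top_score']:.2f})")
    else:
        print("Low confidence; consider falling back to a default response.")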