JoeArmani committed · Commit 3ea7670 · Parent(s): 71ca212

update ranking

Browse files:
- chatbot_model.py +5 -4
- chatbot_validator.py +3 -3
- response_quality_checker.py +137 -295
chatbot_model.py
CHANGED

@@ -36,6 +36,7 @@ class ChatbotConfig:
     max_context_turns: int = 5
     warmup_steps: int = 200
     pretrained_model: str = 'distilbert-base-uncased'
+    cross_encoder_model: str = 'cross-encoder/ms-marco-MiniLM-L-12-v2'
     dtype: str = 'float32'
     freeze_embeddings: bool = False
     embedding_batch_size: int = 64

@@ -190,7 +191,7 @@ class RetrievalChatbot(DeviceAwareModel):
     def _initialize_reranker(self) -> CrossEncoderReranker:
         """Initialize the CrossEncoderReranker."""
         logger.info("Initializing default CrossEncoderReranker...")
-        return CrossEncoderReranker(model_name=…
+        return CrossEncoderReranker(model_name=self.config.cross_encoder_model)

     def _initialize_summarizer(self) -> Summarizer:
         """Initialize the Summarizer."""

@@ -392,7 +393,7 @@ class RetrievalChatbot(DeviceAwareModel):

         # Re-rank these boosted candidates
         if not reranker:
-            reranker = CrossEncoderReranker(model_name=…
+            reranker = CrossEncoderReranker(model_name=self.config.cross_encoder_model)

         ce_scores = reranker.rerank(query, texts, max_length=256)

@@ -564,8 +565,8 @@ class RetrievalChatbot(DeviceAwareModel):
         boosted.sort(key=lambda x: x[1], reverse=True)

         # Print top 10
-        for resp, score in boosted[:150]:
-            logger.debug(f"Candidate: '{resp}' with score {score}")
+        # for resp, score in boosted[:150]:
+        #     logger.debug(f"Candidate: '{resp}' with score {score}")

         # 8) Return top_k
         return boosted[:top_k]
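With the model name now read from ChatbotConfig, the cross-encoder can be swapped per deployment instead of being hard-coded in two places. A minimal sketch, assuming ChatbotConfig accepts its fields as keyword arguments (a dataclass-style config, as the field defaults above suggest); the lighter MiniLM-L-6 name is only an illustration of an override, not something this commit uses:

    from chatbot_model import ChatbotConfig

    # Default reranker model comes from the config field added above
    config = ChatbotConfig()
    assert config.cross_encoder_model == 'cross-encoder/ms-marco-MiniLM-L-12-v2'

    # Hypothetical override with a lighter cross-encoder
    fast_config = ChatbotConfig(cross_encoder_model='cross-encoder/ms-marco-MiniLM-L-6-v2')
    # RetrievalChatbot._initialize_reranker() then builds
    # CrossEncoderReranker(model_name=fast_config.cross_encoder_model)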
chatbot_validator.py
CHANGED

@@ -17,8 +17,8 @@ class ChatbotValidator:
         """
         Initialize the validator.
         Args:
-            chatbot: RetrievalChatbot
-            quality_checker: ResponseQualityChecker
+            chatbot: RetrievalChatbot for inference
+            quality_checker: ResponseQualityChecker
         """
         self.chatbot = chatbot
         self.quality_checker = quality_checker

@@ -86,7 +86,7 @@ class ChatbotValidator:
         domain_metrics = {}

         # Init the cross-encoder reranker to pass to the chatbot
-        reranker = CrossEncoderReranker(model_name=…
+        reranker = CrossEncoderReranker(model_name=self.chatbot.config.cross_encoder_model)

         # Prepare random selection if needed
         rng = random.Random(seed)
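The validator now pulls the reranker model from the chatbot's own config rather than a hard-coded name, so the two stay in sync automatically. A usage sketch, assuming the two-argument constructor documented above and already-built chatbot and quality_checker instances:

    from chatbot_validator import ChatbotValidator

    validator = ChatbotValidator(chatbot=chatbot, quality_checker=quality_checker)
    # Internally a validation run creates:
    #   CrossEncoderReranker(model_name=self.chatbot.config.cross_encoder_model)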
response_quality_checker.py
CHANGED

@@ -1,352 +1,194 @@

Old version (removed; "…" marks text truncated in the source):

import numpy as np
from typing import List, Tuple, Dict, Any
from …

from logger_config import config_logger
logger = config_logger(__name__)

from tf_data_pipeline import TFDataPipeline

class ResponseQualityChecker:
    """
    …
    - Relevance
    - Diversity among top responses
    …
    """

    def __init__(
        self,
        data_pipeline: …,
        confidence_threshold: float = 0.…,
        diversity_threshold: float = 0.15,
        min_response_length: int = 5,
        similarity_cap: float = 0.85
    ):
        """
        Args:
            data_pipeline: …
            confidence_threshold: …
            diversity_threshold: …
            min_response_length: …
            similarity_cap: Cap …
        """
        self.confidence_threshold = confidence_threshold
        self.diversity_threshold = diversity_threshold
        self.min_response_length = min_response_length
        self.similarity_cap = similarity_cap

        # Additional thresholds for more refined checks
        self.thresholds = {
            'relevance': 0.30,
            'length_score': 0.80,
            'score_gap': 0.05
        }

    def check_response_quality(
        self,
        query: str,
        responses: List[Tuple[str, float]]
    ) -> Dict[str, Any]:
        """
        Evaluate the quality of …
        …
            Dictionary of metrics, including 'is_confident' and others
        """
        if not responses:
            return {
                'response_diversity': 0.0,
                'query_response_relevance': 0.0,
                'is_confident': False,
                'top_score': 0.0,
                'response_length_score': 0.0,
                …
            }

        # 1) Calculate relevant metrics
        metrics = {}
        metrics['response_diversity'] = self.…
        metrics['query_response_relevance'] = self.…
        metrics['response_length_score'] = self.…
        metrics['top_score'] = responses[0][1]
        metrics['top_3_score_gap'] = self.…

        # 2) Determine confidence
        metrics['is_confident'] = self._determine_confidence(metrics)
        logger.info(f"Quality metrics: {metrics}")
        return metrics

    def calculate_relevance(self, query: str, responses: List[Tuple[str, float]]) -> float:
        """
        …
        Uses an exponential transform on the similarity to penalize weaker matches.
        """
        if not responses:
            return 0.0

        # Encode query and responses
        query_emb = self.data_pipeline.encode_query(query)
        resp_texts = [r for r, _ in responses]
        resp_embs = self.data_pipeline.encode_responses(resp_texts)

        # Normalize embeddings
        query_emb = query_emb / (np.linalg.norm(query_emb) + 1e-12)
        resp_norms = np.linalg.norm(resp_embs, axis=1, keepdims=True) + 1e-12
        resp_embs = resp_embs / resp_norms

        # Cosine similarity
        sims = cosine_similarity([query_emb], resp_embs)[0]

        # Exponential transform: higher sims remain close to 1, lower sims drop quickly
        sims = np.exp(sims - 1.0)

        # Weighted average: give heavier weighting to higher-ranked items
        weights = np.exp(-np.arange(len(sims)) / 2.0)
        weighted_avg = np.average(sims, weights=weights)
        return float(weighted_avg)

    def calculate_diversity(self, responses: List[Tuple[str, float]]) -> float:
        """
        Calculate how 'different' the top responses are from each other.
        Diversity = 1 - avg_cosine_similarity (capped).
        """
        if len(responses) < 2:
            return 1.0  # Single response

        embs = self.data_pipeline.encode_responses(…
        …
        np.fill_diagonal(sim_matrix, 0.0)

        # Cap similarity
        sim_matrix = np.minimum(sim_matrix, self.similarity_cap)

        # Mean off-diagonal similarity
        sum_sims = np.sum(sim_matrix)
        avg_sim = sum_sims / …

        # Invert to get diversity
        return 1.0 - avg_sim

    def _determine_confidence(self, metrics: Dict[str, float]) -> bool:
        """
        Decide if we're 'confident' based on multiple metric thresholds.
        """
        primary_conditions = [
            metrics['top_score'] >= self.confidence_threshold,
            metrics['response_diversity'] >= self.diversity_threshold,
            metrics['response_length_score'] >= self.thresholds['length_score']
        ]

        secondary_conditions = [
            metrics['query_response_relevance'] >= self.thresholds['relevance'],
            metrics['top_3_score_gap'] >= self.thresholds['score_gap'],
            metrics['top_score'] >= (self.confidence_threshold + 0.05)  # Extra buffer
        ]

        # Must pass all primary checks, and at least 2 of the 3 secondary
        return all(primary_conditions) and (sum(secondary_conditions) >= 2)

    def _average_length_score(self, responses: List[Tuple[str, float]]) -> float:
        """
        …
        """
        …

    def …  # score-gap helper; signature truncated in the source
        """
        …
        """
        if len(scores) < 2:
            return 0.0
        top_n = min(len(scores)…
        gaps = []
        for i in range(top_n - 1):
            gaps.append(scores[i] - scores[i + 1])
        return float(np.mean(gaps)) if gaps else 0.0

    # … followed by ~160 lines of commented-out legacy code (an earlier draft with thresholds
    # {'relevance': 0.35, 'length_score': 0.85, 'score_gap': 0.04}, sklearn-style
    # cosine_similarity, a top-score buffer of confidence_threshold * 1.1, and separate
    # length/score-gap helpers), all deleted in this commit.
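The rewrite below keeps the same metrics but drops the external cosine-similarity import (the truncated `from …` line above; the old call `cosine_similarity([query_emb], resp_embs)[0]` matches sklearn's API) in favor of a small manual helper, raises the normalization epsilon from 1e-12 to 1e-8, and renames the helpers with a `_calc_` prefix. A quick equivalence check for the manual cosine matrix, assuming scikit-learn is available:

    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    rng = np.random.default_rng(0)
    a = rng.random((4, 8))

    # Manual version, mirroring the new ResponseQualityChecker._cosine_similarity
    a_norm = a / (np.linalg.norm(a, axis=1, keepdims=True) + 1e-8)
    manual = np.dot(a_norm, a_norm.T)

    assert np.allclose(manual, cosine_similarity(a), atol=1e-6)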
New version:

import numpy as np
from typing import List, Tuple, Dict, Any
from tf_data_pipeline import TFDataPipeline
from logger_config import config_logger

logger = config_logger(__name__)

class ResponseQualityChecker:
    """
    The Response Quality Checker measures:
    - Relevance (embedding or cross-encoder)
    - Diversity among top responses
    - Length
    - Score gap
    - Confidence
    """

    def __init__(
        self,
        data_pipeline: "TFDataPipeline",
        confidence_threshold: float = 0.40,
        diversity_threshold: float = 0.15,
        min_response_length: int = 5,
        similarity_cap: float = 0.85
    ):
        """
        Args:
            data_pipeline: TFDataPipeline for encoding
            confidence_threshold: Min top_score for 'confident'
            diversity_threshold: Min average diversity for top responses
            min_response_length: Min word count - 'valid length'
            similarity_cap: Cap pairwise similarity to reduce outliers
        """
        self.data_pipeline = data_pipeline
        self.confidence_threshold = confidence_threshold
        self.diversity_threshold = diversity_threshold
        self.min_response_length = min_response_length
        self.similarity_cap = similarity_cap

        # Additional thresholds
        self.thresholds = {
            'relevance': 0.30,
            'length_score': 0.80,
            'score_gap': 0.05
        }

    def check_response_quality(
        self,
        query: str,
        responses: List[Tuple[str, float]]
    ) -> Dict[str, Any]:
        """
        Evaluate the quality of top-k responses:
        - response_diversity
        - query_response_relevance
        - response_length_score
        - top_score
        - top_3_score_gap
        - is_confident
        """
        if not responses:
            return {
                'response_diversity': 0.0,
                'query_response_relevance': 0.0,
                'response_length_score': 0.0,
                'top_score': 0.0,
                'top_3_score_gap': 0.0,
                'is_confident': False
            }

        metrics = {}
        metrics['response_diversity'] = self._calc_diversity(responses)
        metrics['query_response_relevance'] = self._calc_relevance(query, responses)
        metrics['response_length_score'] = self._calc_length_score(responses)
        metrics['top_score'] = responses[0][1]
        metrics['top_3_score_gap'] = self._calc_score_gap([score for _, score in responses])

        metrics['is_confident'] = self._determine_confidence(metrics)

        logger.info(f"Quality metrics: {metrics}")
        return metrics

    def _calc_diversity(self, responses: List[Tuple[str, float]]) -> float:
        """
        Average similarity among top response embeddings, capped by self.similarity_cap.
        """
        if len(responses) < 2:
            return 1.0  # Single response

        texts = [r for r, _ in responses]
        embs = self.data_pipeline.encode_responses(texts)
        sim_matrix = self._cosine_similarity(embs, embs)

        # Zero out diagonal
        np.fill_diagonal(sim_matrix, 0.0)

        # Cap similarity
        sim_matrix = np.minimum(sim_matrix, self.similarity_cap)

        sum_sims = np.sum(sim_matrix)
        count = len(responses) * (len(responses) - 1)
        avg_sim = sum_sims / count if count > 0 else 0.0

        return 1.0 - avg_sim

    def _calc_relevance(self, query: str, responses: List[Tuple[str, float]]) -> float:
        """
        Weighted average of exponential-transformed similarities for top-k.
        Encourages a high similarity with the top responses.
        """
        if not responses:
            return 0.0

        query_emb = self.data_pipeline.encode_query(query)
        texts = [r for r, _ in responses]
        resp_embs = self.data_pipeline.encode_responses(texts)

        query_emb = query_emb / (np.linalg.norm(query_emb) + 1e-8)
        norms = (np.linalg.norm(resp_embs, axis=1, keepdims=True) + 1e-8)
        resp_embs = resp_embs / norms

        # Cosine similarity, then exponential transform
        sims = np.sum(query_emb[np.newaxis, :] * resp_embs, axis=1)  # shape [k]
        sims = np.exp(sims - 1.0)

        # Weighted average to boost top responses
        weights = np.exp(-np.arange(len(responses)) / 2.0)
        weighted_avg = np.average(sims, weights=weights)
        return float(weighted_avg)

    def _calc_length_score(self, responses: List[Tuple[str, float]]) -> float:
        """
        Average length-based score across top responses.
        """
        scores = []
        for text, _ in responses:
            words = len(text.strip().split())
            if words < self.min_response_length:
                # Penalty for too short
                s = words / float(self.min_response_length)
            elif words > 50:
                # Penalty for excessive length
                s = max(0.5, 50.0 / words)
            else:
                s = 1.0
            scores.append(s)

        return float(np.mean(scores)) if scores else 0.0

    def _calc_score_gap(self, scores: List[float], top_n: int = 3) -> float:
        """
        Average difference between consecutive ranks for top_n.
        """
        if len(scores) < 2:
            return 0.0
        top_n = min(top_n, len(scores))
        gaps = []
        for i in range(top_n - 1):
            gaps.append(scores[i] - scores[i + 1])
        return float(np.mean(gaps)) if gaps else 0.0

    def _determine_confidence(self, m: Dict[str, float]) -> bool:
        """
        Require:
        - top_score >= self.confidence_threshold
        - response_diversity >= self.diversity_threshold
        - response_length_score >= self.thresholds['length_score']

        Secondary conditions (2 of 3 required):
        - query_response_relevance >= self.thresholds['relevance']
        - top_3_score_gap >= self.thresholds['score_gap']
        - top_score >= (confidence_threshold + 0.05)
        """
        primary = [
            m['top_score'] >= self.confidence_threshold,
            m['response_diversity'] >= self.diversity_threshold,
            m['response_length_score'] >= self.thresholds['length_score']
        ]
        secondary = [
            m['query_response_relevance'] >= self.thresholds['relevance'],
            m['top_3_score_gap'] >= self.thresholds['score_gap'],
            m['top_score'] >= (self.confidence_threshold + 0.05)
        ]

        if all(primary) and sum(secondary) >= 2:
            return True
        return False

    def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
        """Manual cosine sim matrix: a -> shape [N, d], b -> shape [M, d]. Return shape [N, M]."""
        a_norm = a / (np.linalg.norm(a, axis=1, keepdims=True) + 1e-8)
        b_norm = b / (np.linalg.norm(b, axis=1, keepdims=True) + 1e-8)
        return np.dot(a_norm, b_norm.T)
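A runnable sketch of the new checker end to end. The stub below stands in for TFDataPipeline's encoders (deterministic pseudo-embeddings, purely illustrative), so the printed values are not meaningful; it only demonstrates the call shape and the returned keys, assuming the repo modules are importable:

    import numpy as np
    from response_quality_checker import ResponseQualityChecker

    class StubPipeline:
        """Stand-in for TFDataPipeline: deterministic pseudo-embeddings per text."""
        def encode_query(self, text):
            seed = sum(ord(c) for c in text) % (2**32)
            return np.random.default_rng(seed).random(64).astype(np.float32)

        def encode_responses(self, texts):
            return np.stack([self.encode_query(t) for t in texts])

    checker = ResponseQualityChecker(data_pipeline=StubPipeline())
    responses = [
        ("You can reset your password from the account settings page.", 0.52),
        ("Head to Settings, then Security, to change your password.", 0.47),
        ("Our support team is available around the clock.", 0.33),
    ]
    metrics = checker.check_response_quality("How do I reset my password?", responses)
    print(metrics['top_3_score_gap'], metrics['is_confident'])

With min_response_length=5 and the 50-word cap, all three sample responses score 1.0 on length, so 'is_confident' here hinges on top_score (0.52 >= 0.40) and on the diversity and relevance of the stub embeddings.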