JoeArmani committed
Commit 3ea7670 · 1 Parent(s): 71ca212

update ranking

chatbot_model.py CHANGED
@@ -36,6 +36,7 @@ class ChatbotConfig:
     max_context_turns: int = 5
     warmup_steps: int = 200
     pretrained_model: str = 'distilbert-base-uncased'
+    cross_encoder_model: str = 'cross-encoder/ms-marco-MiniLM-L-12-v2'
     dtype: str = 'float32'
     freeze_embeddings: bool = False
     embedding_batch_size: int = 64
@@ -190,7 +191,7 @@ class RetrievalChatbot(DeviceAwareModel):
     def _initialize_reranker(self) -> CrossEncoderReranker:
         """Initialize the CrossEncoderReranker."""
         logger.info("Initializing default CrossEncoderReranker...")
-        return CrossEncoderReranker(model_name="cross-encoder/ms-marco-MiniLM-L-12-v2")
+        return CrossEncoderReranker(model_name=self.config.cross_encoder_model)
 
     def _initialize_summarizer(self) -> Summarizer:
         """Initialize the Summarizer."""
@@ -392,7 +393,7 @@ class RetrievalChatbot(DeviceAwareModel):
 
         # Re-rank these boosted candidates
         if not reranker:
-            reranker = CrossEncoderReranker(model_name="cross-encoder/ms-marco-MiniLM-L-12-v2")
+            reranker = CrossEncoderReranker(model_name=self.config.cross_encoder_model)
 
         ce_scores = reranker.rerank(query, texts, max_length=256)
 
@@ -564,8 +565,8 @@ class RetrievalChatbot(DeviceAwareModel):
         boosted.sort(key=lambda x: x[1], reverse=True)
 
         # Print top 10
-        for resp, score in boosted[:150]:
-            logger.debug(f"Candidate: '{resp}' with score {score}")
+        # for resp, score in boosted[:150]:
+        #     logger.debug(f"Candidate: '{resp}' with score {score}")
 
         # 8) Return top_k
         return boosted[:top_k]
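With the cross-encoder name promoted into ChatbotConfig, the reranker model can be swapped without touching the retrieval code. A minimal sketch of how that might look, assuming ChatbotConfig accepts keyword overrides (as its field defaults suggest) and that chatbot_model is importable; the alternative model id below is only an illustration:

    from chatbot_model import ChatbotConfig

    # Point the config at a different cross-encoder (illustrative model id).
    config = ChatbotConfig(cross_encoder_model='cross-encoder/ms-marco-MiniLM-L-6-v2')

    # Both _initialize_reranker() and the retrieval fallback above now resolve
    # the model name from the config instead of a hard-coded string:
    #   CrossEncoderReranker(model_name=config.cross_encoder_model)
    print(config.cross_encoder_model)

Keeping the name in one field means the default reranker and the on-the-fly fallback can no longer drift apart.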
chatbot_validator.py CHANGED
@@ -17,8 +17,8 @@ class ChatbotValidator:
         """
         Initialize the validator.
         Args:
-            chatbot: RetrievalChatbot instance for inference
-            quality_checker: ResponseQualityChecker instance
+            chatbot: RetrievalChatbot for inference
+            quality_checker: ResponseQualityChecker
         """
         self.chatbot = chatbot
         self.quality_checker = quality_checker
@@ -86,7 +86,7 @@ class ChatbotValidator:
         domain_metrics = {}
 
         # Init the cross-encoder reranker to pass to the chatbot
-        reranker = CrossEncoderReranker(model_name="cross-encoder/ms-marco-MiniLM-L-12-v2")
+        reranker = CrossEncoderReranker(model_name=self.chatbot.config.cross_encoder_model)
 
         # Prepare random selection if needed
         rng = random.Random(seed)
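Reading the model name from the chatbot's config keeps validation and inference on the same cross-encoder. A rough wiring sketch under stated assumptions (the module path and the pre-built chatbot/quality_checker objects are not taken from this commit; the constructor arguments follow the docstring above):

    from chatbot_validator import ChatbotValidator

    def build_validator(chatbot, quality_checker):
        # The validator itself is unchanged; only the reranker it creates during
        # a validation run now resolves to
        #   CrossEncoderReranker(model_name=chatbot.config.cross_encoder_model)
        return ChatbotValidator(chatbot=chatbot, quality_checker=quality_checker)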
response_quality_checker.py CHANGED
@@ -1,352 +1,194 @@
 import numpy as np
-from typing import List, Tuple, Dict, Any, TYPE_CHECKING
-from sklearn.metrics.pairwise import cosine_similarity
-
+from typing import List, Tuple, Dict, Any
+from tf_data_pipeline import TFDataPipeline
 from logger_config import config_logger
-logger = config_logger(__name__)
 
-if TYPE_CHECKING:
-    from tf_data_pipeline import TFDataPipeline
+logger = config_logger(__name__)
 
 class ResponseQualityChecker:
     """
-    Enhanced quality checking that calculates:
-    - Relevance between query & responses
+    The Response Quality Checker measures:
+    - Relevance (embedding or cross-encoder)
     - Diversity among top responses
-    - Response length scoring
-    - Confidence determination based on multiple thresholds
+    - Length
+    - Score gap
+    - Confidence
     """
-
+
     def __init__(
         self,
-        data_pipeline: 'TFDataPipeline',
-        confidence_threshold: float = 0.45,
+        data_pipeline: "TFDataPipeline",
+        confidence_threshold: float = 0.40,
         diversity_threshold: float = 0.15,
         min_response_length: int = 5,
-        similarity_cap: float = 0.85,
+        similarity_cap: float = 0.85
     ):
         """
         Args:
-            data_pipeline: Reference to TFDataPipeline for encoding
-            confidence_threshold: Minimum top_score for a 'confident' result
-            diversity_threshold: Minimum required diversity among top responses
-            min_response_length: Minimum words for a decent response
-            similarity_cap: Cap on pairwise similarity for diversity calc
+            data_pipeline: TFDataPipeline for encoding
+            confidence_threshold: Min top_score for 'confident'
+            diversity_threshold: Min average diversity for top responses
+            min_response_length: Min word count - 'valid length'
+            similarity_cap: Cap pairwise similarity to reduce outliers
         """
+        self.data_pipeline = data_pipeline
         self.confidence_threshold = confidence_threshold
         self.diversity_threshold = diversity_threshold
         self.min_response_length = min_response_length
         self.similarity_cap = similarity_cap
-        self.data_pipeline = data_pipeline
-
-        # Additional thresholds for more refined checks
+
+        # Additional thresholds
         self.thresholds = {
-            'relevance': 0.30,     # Slightly relaxed
-            'length_score': 0.80,  # Stricter length requirement
-            'score_gap': 0.05      # Gap between top scores
+            'relevance': 0.30,
+            'length_score': 0.80,
+            'score_gap': 0.05
         }
-
+
     def check_response_quality(
         self,
         query: str,
         responses: List[Tuple[str, float]]
     ) -> Dict[str, Any]:
         """
-        Evaluate the quality of a set of ranked responses for a given query.
-
-        Args:
-            query: The user's original query
-            responses: List of (response_text, score) sorted by descending score
-
-        Returns:
-            Dictionary of metrics, including 'is_confident' and others
+        Evaluate the quality of top-k responses:
+        - response_diversity
+        - query_response_relevance
+        - response_length_score
+        - top_score
+        - top_3_score_gap
+        - is_confident
         """
         if not responses:
             return {
                 'response_diversity': 0.0,
                 'query_response_relevance': 0.0,
-                'is_confident': False,
-                'top_score': 0.0,
                 'response_length_score': 0.0,
-                'top_3_score_gap': 0.0
+                'top_score': 0.0,
+                'top_3_score_gap': 0.0,
+                'is_confident': False
             }
-
-        # 1) Calculate relevant metrics
+
         metrics = {}
-        metrics['response_diversity'] = self.calculate_diversity(responses)
-        metrics['query_response_relevance'] = self.calculate_relevance(query, responses)
-        metrics['response_length_score'] = self._average_length_score(responses)
+        metrics['response_diversity'] = self._calc_diversity(responses)
+        metrics['query_response_relevance'] = self._calc_relevance(query, responses)
+        metrics['response_length_score'] = self._calc_length_score(responses)
         metrics['top_score'] = responses[0][1]
-        metrics['top_3_score_gap'] = self._calculate_score_gap([s for _, s in responses], top_n=3)
-
-        # 2) Determine confidence
+        metrics['top_3_score_gap'] = self._calc_score_gap([score for _, score in responses])
+
         metrics['is_confident'] = self._determine_confidence(metrics)
+
         logger.info(f"Quality metrics: {metrics}")
         return metrics
-
-    def calculate_relevance(self, query: str, responses: List[Tuple[str, float]]) -> float:
+
+    def _calc_diversity(self, responses: List[Tuple[str, float]]) -> float:
         """
-        Compute an overall 'relevance' metric between the query and the top responses.
-        Uses an exponential transform on the similarity to penalize weaker matches.
-        """
-        if not responses:
-            return 0.0
-
-        # Encode query and responses
-        query_emb = self.data_pipeline.encode_query(query)
-        resp_texts = [r for r, _ in responses]
-        resp_embs = self.data_pipeline.encode_responses(resp_texts)
-
-        # Normalize embeddings
-        query_emb = query_emb / (np.linalg.norm(query_emb) + 1e-12)
-        resp_norms = np.linalg.norm(resp_embs, axis=1, keepdims=True) + 1e-12
-        resp_embs = resp_embs / resp_norms
-
-        # Cosine similarity
-        sims = cosine_similarity([query_emb], resp_embs)[0]
-
-        # Exponential transform: higher sims remain close to 1, lower sims drop quickly
-        sims = np.exp(sims - 1.0)
-
-        # Weighted average: give heavier weighting to higher-ranked items
-        weights = np.exp(-np.arange(len(sims)) / 2.0)
-        weighted_avg = np.average(sims, weights=weights)
-        return float(weighted_avg)
-
-    def calculate_diversity(self, responses: List[Tuple[str, float]]) -> float:
-        """
-        Calculate how 'different' the top responses are from each other.
-        Diversity = 1 - avg_cosine_similarity (capped).
+        Average similarity among top response embeddings, capped by self.similarity_cap.
         """
         if len(responses) < 2:
-            return 1.0  # Single response is trivially 'unique'
-
-        resp_texts = [r for r, _ in responses]
-        embs = self.data_pipeline.encode_responses(resp_texts)
-
-        # Pairwise similarity
-        sim_matrix = cosine_similarity(embs, embs)
+            return 1.0  # Single response
+
+        texts = [r for r, _ in responses]
+        embs = self.data_pipeline.encode_responses(texts)
+        sim_matrix = self._cosine_similarity(embs, embs)
+
+        # Zero out diagonal
         np.fill_diagonal(sim_matrix, 0.0)
-
-        # Cap similarity to avoid outliers
+
+        # Cap similarity
         sim_matrix = np.minimum(sim_matrix, self.similarity_cap)
-
-        # Mean off-diagonal similarity
+
         sum_sims = np.sum(sim_matrix)
-        num_pairs = len(resp_texts) * (len(resp_texts) - 1)
-        avg_sim = sum_sims / num_pairs if num_pairs > 0 else 0.0
-
-        # Invert to get diversity
+        count = len(responses) * (len(responses) - 1)
+        avg_sim = sum_sims / count if count > 0 else 0.0
+
         return 1.0 - avg_sim
-
-    def _determine_confidence(self, metrics: Dict[str, float]) -> bool:
-        """
-        Decide if we're 'confident' based on multiple metric thresholds.
-        """
-        primary_conditions = [
-            metrics['top_score'] >= self.confidence_threshold,
-            metrics['response_diversity'] >= self.diversity_threshold,
-            metrics['response_length_score'] >= self.thresholds['length_score']
-        ]
-
-        secondary_conditions = [
-            metrics['query_response_relevance'] >= self.thresholds['relevance'],
-            metrics['top_3_score_gap'] >= self.thresholds['score_gap'],
-            metrics['top_score'] >= (self.confidence_threshold + 0.05)  # Extra buffer
-        ]
-
-        # Must pass all primary checks, and at least 2 of the 3 secondary
-        return all(primary_conditions) and (sum(secondary_conditions) >= 2)
-
-    def _average_length_score(self, responses: List[Tuple[str, float]]) -> float:
+
+    def _calc_relevance(self, query: str, responses: List[Tuple[str, float]]) -> float:
         """
-        Compute an average length score across all responses.
+        Weighted average of exponential-transformed similarities for top-k.
+        Encourages a high similarity with the top responses.
         """
-        length_scores = []
-        for response, _ in responses:
-            length_scores.append(self._length_score(response))
-        return float(np.mean(length_scores)) if length_scores else 0.0
-
-    def _length_score(self, text: str) -> float:
-        """
-        Calculate how well the text meets our length requirement.
-        Scores 1.0 if text is >= min_response_length and not too long,
-        else it scales down.
-        """
-        words = len(text.split())
-        if words < self.min_response_length:
-            return words / float(self.min_response_length)
-        elif words > 60:
-            return max(0.5, 60.0 / words)  # Slight penalty for very long
-        return 1.0
-
-    def _calculate_score_gap(self, scores: List[float], top_n: int = 3) -> float:
-        """
-        Calculate the average gap between consecutive scores in the top N.
+        if not responses:
+            return 0.0
+
+        query_emb = self.data_pipeline.encode_query(query)
+        texts = [r for r, _ in responses]
+        resp_embs = self.data_pipeline.encode_responses(texts)
+
+        query_emb = query_emb / (np.linalg.norm(query_emb) + 1e-8)
+        norms = (np.linalg.norm(resp_embs, axis=1, keepdims=True) + 1e-8)
+        resp_embs = resp_embs / norms
+
+        # Cosine similarity, then exponential transform
+        sims = np.sum(query_emb[np.newaxis, :] * resp_embs, axis=1)  # shape [k]
+        sims = np.exp(sims - 1.0)
+
+        # Weighted average to boost top responses
+        weights = np.exp(-np.arange(len(responses)) / 2.0)
+        weighted_avg = np.average(sims, weights=weights)
+        return float(weighted_avg)
+
+    def _calc_length_score(self, responses: List[Tuple[str, float]]) -> float:
+        """
+        Average length-based score across top responses.
+        """
+        scores = []
+        for text, _ in responses:
+            words = len(text.strip().split())
+            if words < self.min_response_length:
+                # Penalty for too short
+                s = words / float(self.min_response_length)
+            elif words > 50:
+                # Penalty for excessive length
+                s = max(0.5, 50.0 / words)
+            else:
+                s = 1.0
+            scores.append(s)
+
+        return float(np.mean(scores)) if scores else 0.0
+
+    def _calc_score_gap(self, scores: List[float], top_n: int = 3) -> float:
+        """
+        Average difference between consecutive ranks for top_n.
         """
         if len(scores) < 2:
             return 0.0
-        top_n = min(len(scores), top_n)
+        top_n = min(top_n, len(scores))
         gaps = []
         for i in range(top_n - 1):
             gaps.append(scores[i] - scores[i + 1])
         return float(np.mean(gaps)) if gaps else 0.0
-
-# import numpy as np
-# from typing import List, Tuple, Dict, Any, TYPE_CHECKING
-# from sklearn.metrics.pairwise import cosine_similarity
-
-# from logger_config import config_logger
-# logger = config_logger(__name__)
-
-# if TYPE_CHECKING:
-#     from tf_data_pipeline import TFDataPipeline
-
-# class ResponseQualityChecker:
-#     """Enhanced quality checking with dynamic thresholds."""
-
-#     def __init__(
-#         self,
-#         data_pipeline: 'TFDataPipeline',
-#         confidence_threshold: float = 0.4,
-#         diversity_threshold: float = 0.15,
-#         min_response_length: int = 5,
-#         similarity_cap: float = 0.85  # Renamed from max_similarity_ratio and used in diversity calc
-#     ):
-#         self.confidence_threshold = confidence_threshold
-#         self.diversity_threshold = diversity_threshold
-#         self.min_response_length = min_response_length
-#         self.similarity_cap = similarity_cap
-#         self.data_pipeline = data_pipeline  # Reference to TFDataPipeline
-
-#         # Dynamic thresholds based on response patterns
-#         self.thresholds = {
-#             'relevance': 0.35,
-#             'length_score': 0.85,
-#             'score_gap': 0.04
-#         }
-
-#     def check_response_quality(
-#         self,
-#         query: str,
-#         responses: List[Tuple[str, float]]
-#     ) -> Dict[str, Any]:
-#         """
-#         Evaluate the quality of responses based on various metrics.
-
-#         Args:
-#             query: The user's query
-#             responses: List of (response_text, score) tuples
-
-#         Returns:
-#             Dict containing quality metrics and confidence assessment
-#         """
-#         if not responses:
-#             return {
-#                 'response_diversity': 0.0,
-#                 'query_response_relevance': 0.0,
-#                 'is_confident': False,
-#                 'top_score': 0.0,
-#                 'response_length_score': 0.0,
-#                 'top_3_score_gap': 0.0
-#             }
-
-#         # Calculate core metrics
-#         metrics = {
-#             'response_diversity': self.calculate_diversity(responses),
-#             'query_response_relevance': self.calculate_relevance(query, responses),
-#             'response_length_score': np.mean([
-#                 self._calculate_length_score(response) for response, _ in responses
-#             ]),
-#             'top_score': responses[0][1],
-#             'top_3_score_gap': self._calculate_score_gap([score for _, score in responses], top_n=3)
-#         }
-
-#         # Determine confidence using thresholds
-#         metrics['is_confident'] = self._determine_confidence(metrics)
-
-#         logger.info(f"Quality metrics: {metrics}")
-#         return metrics
-
-#     def calculate_relevance(self, query: str, responses: List[Tuple[str, float]]) -> float:
-#         """Calculate relevance with stricter scoring."""
-#         if not responses:
-#             return 0.0
-
-#         query_embedding = self.data_pipeline.encode_query(query)
-#         response_texts = [resp for resp, _ in responses]
-#         response_embeddings = self.data_pipeline.encode_responses(response_texts)
-
-#         # Normalize embeddings
-#         query_embedding = query_embedding / np.linalg.norm(query_embedding)
-#         response_embeddings = response_embeddings / np.linalg.norm(response_embeddings, axis=1)[:, np.newaxis]
-
-#         # Compute similarities with exponential decay for far matches
-#         similarities = cosine_similarity([query_embedding], response_embeddings)[0]
-#         similarities = np.exp(similarities - 1)  # Penalize lower similarities more strongly
-
-#         # Apply stronger position weighting
-#         weights = np.exp(-np.arange(len(similarities)) / 2)
+
+    def _determine_confidence(self, m: Dict[str, float]) -> bool:
+        """
+        Require:
+        - top_score >= self.confidence_threshold
+        - response_diversity >= self.diversity_threshold
+        - response_length_score >= self.thresholds['length_score']
+
+        Secondary conditions (2 of 3 required):
+        - query_response_relevance >= self.thresholds['relevance']
+        - top_3_score_gap >= self.thresholds['score_gap']
+        - top_score >= (confidence_threshold + 0.05)
+        """
+        primary = [
+            m['top_score'] >= self.confidence_threshold,
+            m['response_diversity'] >= self.diversity_threshold,
+            m['response_length_score'] >= self.thresholds['length_score']
+        ]
+        secondary = [
+            m['query_response_relevance'] >= self.thresholds['relevance'],
+            m['top_3_score_gap'] >= self.thresholds['score_gap'],
+            m['top_score'] >= (self.confidence_threshold + 0.05)
+        ]
 
-#         return float(np.average(similarities, weights=weights))
-
-#     def calculate_diversity(self, responses: List[Tuple[str, float]]) -> float:
-#         """Calculate diversity with length normalization and similarity capping."""
-#         if not responses:
-#             return 0.0
-
-#         response_texts = [resp for resp, _ in responses]
-#         embeddings = self.data_pipeline.encode_responses(response_texts)
-#         if len(embeddings) < 2:
-#             return 1.0
-
-#         # Calculate pairwise cosine similarities
-#         similarity_matrix = cosine_similarity(embeddings)
-#         np.fill_diagonal(similarity_matrix, 0)  # Exclude self-similarity
-
-#         # Apply similarity cap
-#         similarity_matrix = np.minimum(similarity_matrix, self.similarity_cap)
-
-#         # Calculate average similarity
-#         sum_similarities = np.sum(similarity_matrix)
-#         num_pairs = len(embeddings) * (len(embeddings) - 1)
-#         avg_similarity = sum_similarities / num_pairs if num_pairs > 0 else 0.0
-
-#         # Diversity is inversely related to average similarity
-#         diversity_score = 1 - avg_similarity
-#         return diversity_score
-
-#     def _determine_confidence(self, metrics: Dict[str, float]) -> bool:
-#         """Determine confidence using primary and secondary conditions."""
-#         # Primary conditions (must all be met)
-#         primary_conditions = [
-#             metrics['top_score'] >= self.confidence_threshold,
-#             metrics['response_diversity'] >= self.diversity_threshold,
-#             metrics['response_length_score'] >= self.thresholds['length_score']
-#         ]
-
-#         # Secondary conditions (majority must be met)
-#         secondary_conditions = [
-#             metrics['query_response_relevance'] >= self.thresholds['relevance'],
-#             metrics['top_3_score_gap'] >= self.thresholds['score_gap'],
-#             metrics['top_score'] >= (self.confidence_threshold * 1.1)  # Extra confidence boost
-#         ]
-
-#         return all(primary_conditions) and sum(secondary_conditions) >= 2
-
-#     def _calculate_length_score(self, response: str) -> float:
-#         """Calculate length score with penalty for very short or long responses."""
-#         words = len(response.split())
-
-#         if words < self.min_response_length:
-#             return words / self.min_response_length
-#         elif words > 50:  # Penalty for very long responses
-#             return min(1.0, 50 / words)
-#         return 1.0
-
-#     def _calculate_score_gap(self, scores: List[float], top_n: int = 3) -> float:
-#         """Calculate average gap between top N scores."""
-#         if len(scores) < top_n + 1:
-#             return 0.0
-#         gaps = [scores[i] - scores[i + 1] for i in range(min(len(scores) - 1, top_n))]
-#         return np.mean(gaps)
+        if all(primary) and sum(secondary) >= 2:
+            return True
+        return False
+
+    def _cosine_similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
+        """Manual cosine sim matrix: a-> shape [N, d], b-> shape [M, d]. Return shape [N, M]."""
+        a_norm = a / (np.linalg.norm(a, axis=1, keepdims=True) + 1e-8)
+        b_norm = b / (np.linalg.norm(b, axis=1, keepdims=True) + 1e-8)
+        return np.dot(a_norm, b_norm.T)
+
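The rewritten checker only needs an object exposing encode_query and encode_responses, which makes it easy to smoke-test in isolation. A minimal usage sketch, assuming the project's modules (response_quality_checker, logger_config, tf_data_pipeline) are importable; StubPipeline and the example query/responses are invented for illustration and merely stand in for the real TFDataPipeline:

    import numpy as np
    from response_quality_checker import ResponseQualityChecker

    class StubPipeline:
        """Illustrative stand-in for TFDataPipeline: deterministic pseudo-embeddings."""
        def encode_query(self, query: str) -> np.ndarray:
            rng = np.random.default_rng(sum(query.encode()))
            return rng.normal(size=128)

        def encode_responses(self, texts):
            return np.stack([self.encode_query(t) for t in texts])

    checker = ResponseQualityChecker(data_pipeline=StubPipeline())

    ranked = [
        ("You can reset your password from the account settings page.", 0.62),
        ("Try signing out and back in, then request a reset email.", 0.48),
        ("Password reset emails usually arrive within a few minutes.", 0.41),
    ]
    metrics = checker.check_response_quality("How do I reset my password?", ranked)

    # Returned keys: response_diversity, query_response_relevance,
    # response_length_score, top_score, top_3_score_gap, is_confident.
    print(metrics["top_score"], metrics["is_confident"])

With the defaults set in this commit (confidence_threshold 0.40, diversity_threshold 0.15, length_score 0.80, relevance 0.30, score_gap 0.05), is_confident comes back True only when all three primary checks pass and at least two of the three secondary checks do.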