Testys committed
Commit 3c95d1f · 1 Parent(s): 2bff6ee

Update search_utils.py

Files changed (1)
  1. search_utils.py +17 -17
search_utils.py CHANGED
@@ -230,6 +230,7 @@ class SemanticSearch:
         self.metadata_mgr = MetadataManager()
         self.shard_sizes = []
         self.cumulative_offsets = None
+        self.total_vectors = 0
         self.logger = logging.getLogger("SemanticSearch")
         self.logger.info("Initializing SemanticSearch")
 
@@ -270,8 +271,8 @@ class SemanticSearch:
                 self.logger.info(f"Loaded index {shard_path.name} with {size} vectors")
             except Exception as e:
                 self.logger.error(f"Error loading index {shard_path}: {str(e)}")
-        total_vectors = sum(self.shard_sizes)
-        self.logger.info(f"Total loaded vectors: {total_vectors} across {len(self.index_shards)} shards")
+        self.total_vectors = sum(self.shard_sizes)
+        self.logger.info(f"Total loaded vectors: {self.total_vectors} across {len(self.index_shards)} shards")
         self.cumulative_offsets = np.cumsum([0] + self.shard_sizes)
 
     def _load_single_index(self, shard_path):
@@ -348,52 +349,51 @@ class SemanticSearch:
             except Exception as e:
                 self.logger.error(f"Search failed in shard {shard_idx}: {str(e)}")
                 return None
-
+
     def _process_results(self, distances, global_indices, top_k):
         """Process raw search results: retrieve metadata, calculate similarity, and deduplicate."""
         process_start = time.time()
         if global_indices.size == 0 or distances.size == 0:
             self.logger.warning("No search results to process")
             return pd.DataFrame(columns=["title", "summary", "source", "authors", "similarity"])
+
         try:
             self.logger.info(f"Retrieving metadata for {len(global_indices)} indices")
             metadata_start = time.time()
             results = self.metadata_mgr.get_metadata(global_indices)
             self.logger.info(f"Metadata retrieved in {time.time() - metadata_start:.2f}s, got {len(results)} records")
-
+
             if len(results) == 0:
                 self.logger.warning("No metadata found for indices")
                 return pd.DataFrame(columns=["title", "summary", "source", "authors", "similarity"])
+
             if len(results) != len(distances):
                 self.logger.warning(f"Mismatch between distances ({len(distances)}) and results ({len(results)})")
                 if len(results) < len(distances):
                     distances = distances[:len(results)]
                 else:
                     distances = np.pad(distances, (0, len(results) - len(distances)), 'constant', constant_values=1.0)
-
+
             self.logger.debug("Calculating similarity scores")
             results['similarity'] = 1 - (distances / 2)
-            if not results.empty:
-                self.logger.debug(f"Similarity stats: min={results['similarity'].min():.3f}, " +
-                                  f"max={results['similarity'].max():.3f}, " +
-                                  f"mean={results['similarity'].mean():.3f}")
+
+            # Ensure all required columns
             results['source'] = results["source"]
-
-            # Ensure we have all required columns
+
             required_columns = ["title", "summary", "authors", "source", "similarity"]
             for col in required_columns:
                 if col not in results.columns:
                     results[col] = None # Fill missing columns with None
-
+
             pre_dedup = len(results)
-            results = results.drop_duplicates(subset=["title","authors", "source"]).sort_values("similarity", ascending=False).head(top_k)
-
+            results = results.drop_duplicates(subset=["title", "authors", "source"]).sort_values("similarity", ascending=False).head(top_k)
+
             post_dedup = len(results)
             if pre_dedup > post_dedup:
                 self.logger.info(f"Removed {pre_dedup - post_dedup} duplicate results")
+
             self.logger.info(f"Results processed in {time.time() - process_start:.2f}s, returning {len(results)} items")
-
-            return results.reset_index(drop=True)
+            return results[required_columns].reset_index(drop=True)
         except Exception as e:
             self.logger.error(f"Result processing failed: {str(e)}", exc_info=True)
-            return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
+            return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
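Note on the similarity line kept unchanged in this hunk: results['similarity'] = 1 - (distances / 2) is the usual conversion from squared L2 distance to cosine similarity when the embeddings are L2-normalized, since for unit vectors ||a - b||^2 = 2 - 2*cos(a, b). The snippet below only verifies that identity with NumPy; that the index shards return squared L2 distances over normalized vectors is an assumption, not something this commit shows.

import numpy as np

# Check of the identity assumed above: for unit vectors a and b,
# ||a - b||^2 = 2 - 2 * cos(a, b), so cosine similarity = 1 - d2 / 2.
rng = np.random.default_rng(0)
a = rng.normal(size=384)
b = rng.normal(size=384)
a /= np.linalg.norm(a)
b /= np.linalg.norm(b)
d2 = float(np.sum((a - b) ** 2))   # squared L2 distance
cos = float(a @ b)                 # cosine similarity of unit vectors
assert np.isclose(1 - d2 / 2, cos)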
 
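The new self.total_vectors attribute sits alongside the cumulative_offsets array built from shard_sizes, which is how global vector indices can be mapped back to a shard and a shard-local index. The helper below is a hypothetical sketch of that mapping and is not code from search_utils.py; the example shard sizes and the searchsorted lookup are assumptions about how cumulative_offsets is meant to be used.

import numpy as np

# Hypothetical sketch: map a global vector id to (shard, local id) using
# offsets built like self.cumulative_offsets = np.cumsum([0] + self.shard_sizes).
shard_sizes = [1000, 2500, 1800]             # assumed example sizes
cumulative_offsets = np.cumsum([0] + shard_sizes)
total_vectors = int(cumulative_offsets[-1])  # what self.total_vectors would hold

def to_shard_local(global_idx: int) -> tuple[int, int]:
    if not 0 <= global_idx < total_vectors:
        raise IndexError(f"global index {global_idx} out of range")
    shard = int(np.searchsorted(cumulative_offsets, global_idx, side="right")) - 1
    return shard, global_idx - int(cumulative_offsets[shard])

assert to_shard_local(0) == (0, 0)
assert to_shard_local(1000) == (1, 0)
assert to_shard_local(3500) == (2, 0)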