Update search_utils.py
search_utils.py  +17 -17
search_utils.py
CHANGED
@@ -230,6 +230,7 @@ class SemanticSearch:
         self.metadata_mgr = MetadataManager()
         self.shard_sizes = []
         self.cumulative_offsets = None
+        self.total_vectors = 0
         self.logger = logging.getLogger("SemanticSearch")
         self.logger.info("Initializing SemanticSearch")
 
@@ -270,8 +271,8 @@ class SemanticSearch:
                 self.logger.info(f"Loaded index {shard_path.name} with {size} vectors")
             except Exception as e:
                 self.logger.error(f"Error loading index {shard_path}: {str(e)}")
-        total_vectors = sum(self.shard_sizes)
-        self.logger.info(f"Total loaded vectors: {total_vectors} across {len(self.index_shards)} shards")
+        self.total_vectors = sum(self.shard_sizes)
+        self.logger.info(f"Total loaded vectors: {self.total_vectors} across {len(self.index_shards)} shards")
         self.cumulative_offsets = np.cumsum([0] + self.shard_sizes)
 
     def _load_single_index(self, shard_path):
@@ -348,52 +349,51 @@ class SemanticSearch:
         except Exception as e:
             self.logger.error(f"Search failed in shard {shard_idx}: {str(e)}")
             return None
-
+
     def _process_results(self, distances, global_indices, top_k):
         """Process raw search results: retrieve metadata, calculate similarity, and deduplicate."""
         process_start = time.time()
         if global_indices.size == 0 or distances.size == 0:
             self.logger.warning("No search results to process")
             return pd.DataFrame(columns=["title", "summary", "source", "authors", "similarity"])
+
         try:
             self.logger.info(f"Retrieving metadata for {len(global_indices)} indices")
             metadata_start = time.time()
             results = self.metadata_mgr.get_metadata(global_indices)
             self.logger.info(f"Metadata retrieved in {time.time() - metadata_start:.2f}s, got {len(results)} records")
-
+
             if len(results) == 0:
                 self.logger.warning("No metadata found for indices")
                 return pd.DataFrame(columns=["title", "summary", "source", "authors", "similarity"])
+
             if len(results) != len(distances):
                 self.logger.warning(f"Mismatch between distances ({len(distances)}) and results ({len(results)})")
                 if len(results) < len(distances):
                     distances = distances[:len(results)]
                 else:
                     distances = np.pad(distances, (0, len(results) - len(distances)), 'constant', constant_values=1.0)
-
+
             self.logger.debug("Calculating similarity scores")
             results['similarity'] = 1 - (distances / 2)
-
-
-                              f"max={results['similarity'].max():.3f}, " +
-                              f"mean={results['similarity'].mean():.3f}")
+
+            # Ensure all required columns
             results['source'] = results["source"]
-
-            # Ensure we have all required columns
+
             required_columns = ["title", "summary", "authors", "source", "similarity"]
             for col in required_columns:
                 if col not in results.columns:
                     results[col] = None  # Fill missing columns with None
-
+
             pre_dedup = len(results)
-            results = results.drop_duplicates(subset=["title","authors", "source"]).sort_values("similarity", ascending=False).head(top_k)
-
+            results = results.drop_duplicates(subset=["title", "authors", "source"]).sort_values("similarity", ascending=False).head(top_k)
+
             post_dedup = len(results)
             if pre_dedup > post_dedup:
                 self.logger.info(f"Removed {pre_dedup - post_dedup} duplicate results")
+
             self.logger.info(f"Results processed in {time.time() - process_start:.2f}s, returning {len(results)} items")
-
-            return results.reset_index(drop=True)
+            return results[required_columns].reset_index(drop=True)
         except Exception as e:
             self.logger.error(f"Result processing failed: {str(e)}", exc_info=True)
-            return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
+            return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
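
The new self.total_vectors attribute records the combined size of all loaded shards, and self.cumulative_offsets, built from the same shard sizes, is what lets a global result index be traced back to a particular shard. A minimal sketch of that mapping, assuming NumPy searchsorted semantics; the helper name and the example sizes below are invented for illustration and are not part of this file:

import numpy as np

# Hypothetical helper showing how cumulative offsets map a global vector
# index back to (shard_index, local_index); the real class may do this
# differently.
def global_to_local(global_idx, cumulative_offsets):
    # cumulative_offsets = np.cumsum([0] + shard_sizes), so offsets[i] is
    # the global index of the first vector in shard i.
    shard_idx = int(np.searchsorted(cumulative_offsets, global_idx, side="right")) - 1
    local_idx = int(global_idx - cumulative_offsets[shard_idx])
    return shard_idx, local_idx

shard_sizes = [1000, 500, 750]                      # example sizes only
offsets = np.cumsum([0] + shard_sizes)              # [0, 1000, 1500, 2250]
assert global_to_local(1200, offsets) == (1, 200)   # falls in the second shard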
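results['similarity'] = 1 - (distances / 2) matches the identity for unit-normalized embeddings compared with squared L2 distance (what a FAISS IndexFlatL2 search returns): for unit vectors a and b, ||a - b||^2 = 2 - 2(a . b), so 1 - d/2 recovers the cosine similarity. A quick standalone check of that identity, assuming the embeddings really are normalized (the diff itself does not show the indexing side):

import numpy as np

rng = np.random.default_rng(0)
a = rng.normal(size=384)
b = rng.normal(size=384)
a /= np.linalg.norm(a)           # unit-normalize, as assumed above
b /= np.linalg.norm(b)

sq_l2 = np.sum((a - b) ** 2)     # what a squared-L2 index would report
cosine = float(a @ b)

# 1 - d/2 equals cosine similarity when d is a squared L2 distance
# between unit vectors.
assert np.isclose(1 - sq_l2 / 2, cosine)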
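The return statement now selects required_columns before reset_index, which fixes the column order and drops any extra metadata columns, and the drop_duplicates / sort_values / head chain keeps at most top_k unique records. Because deduplication runs before the sort, the surviving copy of a duplicate is the first one returned by the metadata lookup, not necessarily the most similar one. A small illustration with invented rows (column names follow the diff; the data is made up):

import pandas as pd

results = pd.DataFrame({
    "title":      ["A", "A", "B"],
    "summary":    ["s1", "s1", "s2"],
    "authors":    ["x", "x", "y"],
    "source":     ["arxiv", "arxiv", "arxiv"],
    "similarity": [0.91, 0.90, 0.85],
})

required_columns = ["title", "summary", "authors", "source", "similarity"]
top_k = 5

# Same chain as the diff: deduplicate on identity columns, rank by
# similarity, keep the best top_k, then return only the known columns.
deduped = (results.drop_duplicates(subset=["title", "authors", "source"])
                  .sort_values("similarity", ascending=False)
                  .head(top_k))
print(deduped[required_columns].reset_index(drop=True))
# Keeps the first-seen 0.91 copy of "A" and drops its 0.90 duplicate.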