Testys committed · Commit 70d8022 · verified · 1 Parent(s): bc90a96

Update search_utils.py

Files changed (1): search_utils.py +6 -56
search_utils.py CHANGED
@@ -38,7 +38,6 @@ class MetadataManager:
         try:
             # Load the parquet file
             self.df = pd.read_parquet(self.metadata_path)
-
             # Clean and format the data
             self.df['source'] = self.df['source'].apply(
                 lambda x: [
@@ -47,8 +46,12 @@ class MetadataManager:
                     if url.strip()
                 ]
             )
+            # Convert list of sources to a single string
+            self.df['source'] = self.df['source'].apply(lambda x: ' | '.join(x) if isinstance(x, list) else x)
+
             self.total_docs = len(self.df)
 
+
             logger.info(f"Successfully loaded {self.total_docs} documents")
         except Exception as e:
             logger.error(f"Failed to load metadata: {str(e)}")
@@ -238,59 +241,6 @@ class SemanticSearch:
             self.logger.error(f"Search failed in shard {shard_idx}: {str(e)}")
             return None
 
-    def _process_results(self, distances, global_indices, top_k):
-        """Process raw search results with correct similarity calculation for cosine similarity."""
-        process_start = time.time()
-        if global_indices.size == 0 or distances.size == 0:
-            self.logger.warning("No search results to process")
-            return pd.DataFrame(columns=["title", "summary", "source", "authors", "similarity"])
-
-        try:
-            self.logger.info(f"Retrieving metadata for {len(global_indices)} indices")
-            metadata_start = time.time()
-            results = self.metadata_mgr.get_metadata(global_indices)
-            self.logger.info(f"Metadata retrieved in {time.time() - metadata_start:.2f}s, got {len(results)} records")
-
-            if len(results) == 0:
-                self.logger.warning("No metadata found for indices")
-                return pd.DataFrame(columns=["title", "summary", "source", "authors", "similarity"])
-
-            # Handle distance-results alignment
-            if len(results) != len(distances):
-                self.logger.warning(f"Mismatch between distances ({len(distances)}) and results ({len(results)})")
-                min_len = min(len(results), len(distances))
-                results = results.iloc[:min_len]
-                distances = distances[:min_len]
-
-            # For inner product with normalized vectors, similarity is directly the distance
-            # (FAISS IP search already returns higher scores for more similar items)
-            results['similarity'] = 1 - (distances/2)
-
-            # Deduplicate and sort
-            required_columns = ["title", "summary", "authors", "source", "similarity"]
-            pre_dedup = len(results)
-            results = (
-                results.drop_duplicates(subset=["title", "authors"])
-                .sort_values("similarity", ascending=False)
-                .head(top_k)
-            )
-            post_dedup = len(results)
-
-            if pre_dedup > post_dedup:
-                self.logger.info(f"Removed {pre_dedup - post_dedup} duplicate results")
-
-            self.logger.info(f"Results processed in {time.time() - process_start:.2f}s")
-            return results[required_columns].reset_index(drop=True)
-
-        except Exception as e:
-            self.logger.error(f"Result processing failed: {str(e)}", exc_info=True)
-            return pd.DataFrame(columns=["title", "summary", "source", "authors", "similarity"])
-
-
-
-
-
-
     def _process_results(self, distances, global_indices, top_k):
         """Process raw search results into formatted DataFrame"""
         process_start = time.time()
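A side note on the `1 - (distances/2)` line in the removed method: for unit-normalized vectors, squared L2 distance and cosine similarity satisfy ||a - b||^2 = 2 - 2*cos(a, b), so cos = 1 - d/2 when d is the squared distance that FAISS L2 indexes report. A quick numpy check (a standalone sketch, not part of the commit):

import numpy as np

a = np.random.randn(8); a /= np.linalg.norm(a)
b = np.random.randn(8); b /= np.linalg.norm(b)

d = np.sum((a - b) ** 2)  # squared L2 distance between unit vectors
cos = float(a @ b)        # cosine similarity

assert np.isclose(1 - d / 2, cos)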
@@ -337,7 +287,7 @@ class SemanticSearch:
 
             # Deduplicate and sort results
             pre_dedup = len(results)
-            results = results.drop_duplicates(subset=["title", "source"]).sort_values("similarity", ascending=False).head(top_k)
+            results = results.drop_duplicates(subset=["title"]).sort_values("similarity", ascending=False).head(top_k)
             post_dedup = len(results)
 
             if pre_dedup > post_dedup:
@@ -359,4 +309,4 @@ class SemanticSearch:
 
         except Exception as e:
             self.logger.error(f"Result processing failed: {str(e)}", exc_info=True)
-            return pd.DataFrame(columns=["title", "summary", "similarity"])
+            return pd.DataFrame(columns=["title", "summary", "similarity", 'authors'])
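To illustrate the dedup-by-title step kept in the surviving `_process_results` (toy data of my own, assuming columns like those above):

import pandas as pd

results = pd.DataFrame({
    "title":      ["A", "A", "B"],
    "summary":    ["s1", "s1 dup", "s2"],
    "similarity": [0.9, 0.8, 0.7],
    "authors":    ["x", "x", "y"],
})

top_k = 2
out = (results.drop_duplicates(subset=["title"])  # keeps the first 'A' row
              .sort_values("similarity", ascending=False)
              .head(top_k))
print(out["title"].tolist())  # ['A', 'B']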
 