Testys committed on
Commit
391339a
·
1 Parent(s): c065565

Update search_utils.py

Browse files
Files changed (1) hide show
  1. search_utils.py +20 -5
search_utils.py CHANGED
@@ -176,7 +176,7 @@ class MetadataManager:
176
  # Check for empty numpy array properly
177
  if isinstance(global_indices, np.ndarray) and global_indices.size == 0:
178
  logger.warning("Empty indices array passed to get_metadata")
179
- return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
180
 
181
  # Convert numpy array to list for processing
182
  indices_list = global_indices.tolist() if isinstance(global_indices, np.ndarray) else global_indices
@@ -190,7 +190,7 @@ class MetadataManager:
190
 
191
  if not valid_indices:
192
  logger.warning("No valid indices remain after filtering")
193
- return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
194
 
195
  # Group indices by shard with boundary check
196
  shard_groups = {}
@@ -236,7 +236,7 @@ class MetadataManager:
236
  try:
237
  self.loaded_shards[shard] = pd.read_parquet(
238
  shard_path,
239
- columns=["title", "summary", "source"]
240
  )
241
  logger.info(f"Successfully loaded shard {shard} with {len(self.loaded_shards[shard])} rows")
242
  except Exception as e:
@@ -277,7 +277,7 @@ class MetadataManager:
277
  return combined
278
  else:
279
  logger.warning("No metadata records retrieved")
280
- return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
281
 
282
 
283
  def _resolve_paper_url(self, title):
@@ -554,10 +554,25 @@ class SemanticSearch:
554
 
555
  self.logger.info(f"Results processed in {time.time() - process_start:.2f}s, returning {len(results)} items")
556
  return results.reset_index(drop=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
557
 
558
  except Exception as e:
559
  self.logger.error(f"Result processing failed: {str(e)}", exc_info=True)
560
- return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
561
 
562
 
563
  def _format_source_links(self, links):
 
176
  # Check for empty numpy array properly
177
  if isinstance(global_indices, np.ndarray) and global_indices.size == 0:
178
  logger.warning("Empty indices array passed to get_metadata")
179
+ return pd.DataFrame(columns=["title", "summary", "similarity"])
180
 
181
  # Convert numpy array to list for processing
182
  indices_list = global_indices.tolist() if isinstance(global_indices, np.ndarray) else global_indices
 
190
 
191
  if not valid_indices:
192
  logger.warning("No valid indices remain after filtering")
193
+ return pd.DataFrame(columns=["title", "summary", "similarity"])
194
 
195
  # Group indices by shard with boundary check
196
  shard_groups = {}
 
236
  try:
237
  self.loaded_shards[shard] = pd.read_parquet(
238
  shard_path,
239
+ columns=["title", "summary"]
240
  )
241
  logger.info(f"Successfully loaded shard {shard} with {len(self.loaded_shards[shard])} rows")
242
  except Exception as e:
 
277
  return combined
278
  else:
279
  logger.warning("No metadata records retrieved")
280
+ return pd.DataFrame(columns=["title", "summary", "similarity"])
281
 
282
 
283
  def _resolve_paper_url(self, title):
 
554
 
555
  self.logger.info(f"Results processed in {time.time() - process_start:.2f}s, returning {len(results)} items")
556
  return results.reset_index(drop=True)
557
+
558
+ # Add URL resolution for final results only
559
+ final_results = results.sort_values("similarity", ascending=False).head(top_k)
560
+
561
+ # Resolve URLs for top results only
562
+ final_results['source'] = final_results['title'].apply(
563
+ lambda title: self._format_source_links(
564
+ self.metadata_mgr._resolve_paper_url(title)
565
+ )
566
+ )
567
+
568
+ # Deduplicate based on title only
569
+ final_results = final_results.drop_duplicates(subset=["title"]).head(top_k)
570
+
571
+ return final_results.reset_index(drop=True)
572
 
573
  except Exception as e:
574
  self.logger.error(f"Result processing failed: {str(e)}", exc_info=True)
575
+ return pd.DataFrame(columns=["title", "summary", "similarity"])
576
 
577
 
578
  def _format_source_links(self, links):