Spaces:
Running
Running
Update search_utils.py
Browse files - search_utils.py +20 -5
search_utils.py
CHANGED
@@ -176,7 +176,7 @@ class MetadataManager:
|
|
176 |
# Check for empty numpy array properly
|
177 |
if isinstance(global_indices, np.ndarray) and global_indices.size == 0:
|
178 |
logger.warning("Empty indices array passed to get_metadata")
|
179 |
-
return pd.DataFrame(columns=["title", "summary", "
|
180 |
|
181 |
# Convert numpy array to list for processing
|
182 |
indices_list = global_indices.tolist() if isinstance(global_indices, np.ndarray) else global_indices
|
@@ -190,7 +190,7 @@ class MetadataManager:
|
|
190 |
|
191 |
if not valid_indices:
|
192 |
logger.warning("No valid indices remain after filtering")
|
193 |
-
return pd.DataFrame(columns=["title", "summary", "
|
194 |
|
195 |
# Group indices by shard with boundary check
|
196 |
shard_groups = {}
|
@@ -236,7 +236,7 @@ class MetadataManager:
|
|
236 |
try:
|
237 |
self.loaded_shards[shard] = pd.read_parquet(
|
238 |
shard_path,
|
239 |
-
columns=["title", "summary"
|
240 |
)
|
241 |
logger.info(f"Successfully loaded shard {shard} with {len(self.loaded_shards[shard])} rows")
|
242 |
except Exception as e:
|
@@ -277,7 +277,7 @@ class MetadataManager:
|
|
277 |
return combined
|
278 |
else:
|
279 |
logger.warning("No metadata records retrieved")
|
280 |
-
return pd.DataFrame(columns=["title", "summary", "
|
281 |
|
282 |
|
283 |
def _resolve_paper_url(self, title):
|
@@ -554,10 +554,25 @@ class SemanticSearch:
|
|
554 |
|
555 |
self.logger.info(f"Results processed in {time.time() - process_start:.2f}s, returning {len(results)} items")
|
556 |
return results.reset_index(drop=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
557 |
|
558 |
except Exception as e:
|
559 |
self.logger.error(f"Result processing failed: {str(e)}", exc_info=True)
|
560 |
-
return pd.DataFrame(columns=["title", "summary", "
|
561 |
|
562 |
|
563 |
def _format_source_links(self, links):
|
|
|
176 |
# Check for empty numpy array properly
|
177 |
if isinstance(global_indices, np.ndarray) and global_indices.size == 0:
|
178 |
logger.warning("Empty indices array passed to get_metadata")
|
179 |
+
return pd.DataFrame(columns=["title", "summary", "similarity"])
|
180 |
|
181 |
# Convert numpy array to list for processing
|
182 |
indices_list = global_indices.tolist() if isinstance(global_indices, np.ndarray) else global_indices
|
|
|
190 |
|
191 |
if not valid_indices:
|
192 |
logger.warning("No valid indices remain after filtering")
|
193 |
+
return pd.DataFrame(columns=["title", "summary", "similarity"])
|
194 |
|
195 |
# Group indices by shard with boundary check
|
196 |
shard_groups = {}
|
|
|
236 |
try:
|
237 |
self.loaded_shards[shard] = pd.read_parquet(
|
238 |
shard_path,
|
239 |
+
columns=["title", "summary"]
|
240 |
)
|
241 |
logger.info(f"Successfully loaded shard {shard} with {len(self.loaded_shards[shard])} rows")
|
242 |
except Exception as e:
|
|
|
277 |
return combined
|
278 |
else:
|
279 |
logger.warning("No metadata records retrieved")
|
280 |
+
return pd.DataFrame(columns=["title", "summary", "similarity"])
|
281 |
|
282 |
|
283 |
def _resolve_paper_url(self, title):
|
|
|
554 |
|
555 |
self.logger.info(f"Results processed in {time.time() - process_start:.2f}s, returning {len(results)} items")
|
556 |
return results.reset_index(drop=True)
|
557 |
+
|
558 |
+
# Add URL resolution for final results only
|
559 |
+
final_results = results.sort_values("similarity", ascending=False).head(top_k)
|
560 |
+
|
561 |
+
# Resolve URLs for top results only
|
562 |
+
final_results['source'] = final_results['title'].apply(
|
563 |
+
lambda title: self._format_source_links(
|
564 |
+
self.metadata_mgr._resolve_paper_url(title)
|
565 |
+
)
|
566 |
+
)
|
567 |
+
|
568 |
+
# Deduplicate based on title only
|
569 |
+
final_results = final_results.drop_duplicates(subset=["title"]).head(top_k)
|
570 |
+
|
571 |
+
return final_results.reset_index(drop=True)
|
572 |
|
573 |
except Exception as e:
|
574 |
self.logger.error(f"Result processing failed: {str(e)}", exc_info=True)
|
575 |
+
return pd.DataFrame(columns=["title", "summary", "similarity"])
|
576 |
|
577 |
|
578 |
def _format_source_links(self, links):
|