Spaces:

Testys
/

semantic-search

Running

App Files Community

Testys committed on Mar 18

Commit

391339a

1 Parent(s): c065565

Update search_utils.py

Browse files

Files changed (1) hide show

search_utils.py +20 -5

search_utils.py CHANGED Viewed

@@ -176,7 +176,7 @@ class MetadataManager:
         # Check for empty numpy array properly
         if isinstance(global_indices, np.ndarray) and global_indices.size == 0:
             logger.warning("Empty indices array passed to get_metadata")
-            return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
         # Convert numpy array to list for processing
         indices_list = global_indices.tolist() if isinstance(global_indices, np.ndarray) else global_indices
@@ -190,7 +190,7 @@ class MetadataManager:
         if not valid_indices:
             logger.warning("No valid indices remain after filtering")
-            return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
         # Group indices by shard with boundary check
         shard_groups = {}
@@ -236,7 +236,7 @@ class MetadataManager:
                     try:
                         self.loaded_shards[shard] = pd.read_parquet(
                             shard_path,
-                            columns=["title", "summary", "source"]
                         )
                         logger.info(f"Successfully loaded shard {shard} with {len(self.loaded_shards[shard])} rows")
                     except Exception as e:
@@ -277,7 +277,7 @@ class MetadataManager:
             return combined
         else:
             logger.warning("No metadata records retrieved")
-            return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
     def _resolve_paper_url(self, title):
@@ -554,10 +554,25 @@ class SemanticSearch:
             self.logger.info(f"Results processed in {time.time() - process_start:.2f}s, returning {len(results)} items")
             return results.reset_index(drop=True)
         except Exception as e:
             self.logger.error(f"Result processing failed: {str(e)}", exc_info=True)
-            return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
     def _format_source_links(self, links):

         # Check for empty numpy array properly
         if isinstance(global_indices, np.ndarray) and global_indices.size == 0:
             logger.warning("Empty indices array passed to get_metadata")
+            return pd.DataFrame(columns=["title", "summary", "similarity"])
         # Convert numpy array to list for processing
         indices_list = global_indices.tolist() if isinstance(global_indices, np.ndarray) else global_indices
         if not valid_indices:
             logger.warning("No valid indices remain after filtering")
+            return pd.DataFrame(columns=["title", "summary", "similarity"])
         # Group indices by shard with boundary check
         shard_groups = {}
                     try:
                         self.loaded_shards[shard] = pd.read_parquet(
                             shard_path,
+                            columns=["title", "summary"]
                         )
                         logger.info(f"Successfully loaded shard {shard} with {len(self.loaded_shards[shard])} rows")
                     except Exception as e:
             return combined
         else:
             logger.warning("No metadata records retrieved")
+            return pd.DataFrame(columns=["title", "summary", "similarity"])
     def _resolve_paper_url(self, title):
             self.logger.info(f"Results processed in {time.time() - process_start:.2f}s, returning {len(results)} items")
             return results.reset_index(drop=True)
+            # Add URL resolution for final results only
+            final_results = results.sort_values("similarity", ascending=False).head(top_k)
+            # Resolve URLs for top results only
+            final_results['source'] = final_results['title'].apply(
+                lambda title: self._format_source_links(
+                    self.metadata_mgr._resolve_paper_url(title)
+                )
+            )
+            # Deduplicate based on title only
+            final_results = final_results.drop_duplicates(subset=["title"]).head(top_k)
+            return final_results.reset_index(drop=True)
         except Exception as e:
             self.logger.error(f"Result processing failed: {str(e)}", exc_info=True)
+            return pd.DataFrame(columns=["title", "summary", "similarity"])
     def _format_source_links(self, links):