Spaces:
Sleeping
Sleeping
Update search_utils.py
Browse files- search_utils.py +26 -1
search_utils.py
CHANGED
|
@@ -141,4 +141,29 @@ class SemanticSearch:
|
|
| 141 |
np.array(all_distances[:min_length]),
|
| 142 |
np.array(all_global_indices[:min_length]),
|
| 143 |
top_k
|
| 144 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
np.array(all_distances[:min_length]),
|
| 142 |
np.array(all_global_indices[:min_length]),
|
| 143 |
top_k
|
| 144 |
+
)
|
| 145 |
+
|
| 146 |
+
def _process_results(self, distances, global_indices, top_k):
    """Turn raw search hits into a ranked, de-duplicated DataFrame.

    Args:
        distances: Distances of the hits, one per entry in
            ``global_indices``. Any array-like is accepted.
        global_indices: Indices passed to
            ``self.metadata_mgr.get_metadata`` to fetch the hit metadata.
        top_k: Maximum number of rows to return.

    Returns:
        A DataFrame ordered by descending ``similarity`` with a fresh
        0..n-1 index and at most ``top_k`` rows. When there are no hits, or
        when processing raises, an empty DataFrame with the columns
        ``title``, ``summary``, ``source`` and ``similarity`` is returned.
    """
    empty_columns = ["title", "summary", "source", "similarity"]

    # len() rather than truthiness: the inputs may be NumPy arrays, whose
    # truth value is ambiguous.
    if len(global_indices) == 0 or len(distances) == 0:
        return pd.DataFrame(columns=empty_columns)

    try:
        # Copy so that adding a column never writes into a frame the
        # metadata manager may still be holding on to.
        results = self.metadata_mgr.get_metadata(global_indices).copy()

        # NOTE(review): 1 - d/2 equals cosine similarity only when d is a
        # *squared* L2 distance between unit-length vectors -- confirm
        # against the index that produces `distances`.
        # NOTE(review): this assumes get_metadata returns exactly one row
        # per index, in order; a shorter frame raises here and falls into
        # the handler below.
        results["similarity"] = 1 - (np.asarray(distances) / 2)

        # Rank first, then de-duplicate: drop_duplicates keeps the first
        # row of each (title, source) group, so sorting beforehand makes
        # the most similar duplicate the survivor. A stable sort keeps tied
        # rows in their incoming order.
        ranked = results.sort_values(
            "similarity", ascending=False, kind="mergesort"
        )
        unique = ranked.drop_duplicates(subset=["title", "source"])

        # Reset the index for clean display.
        return unique.head(top_k).reset_index(drop=True)

    except Exception as e:
        # Deliberate best-effort boundary: report the problem through
        # st.error and hand back an empty result instead of propagating.
        st.error(f"Error processing results: {str(e)}")
        return pd.DataFrame(columns=empty_columns)
|