Spaces:
Running
Running
Update search_utils.py
Browse files- search_utils.py +26 -1
search_utils.py
CHANGED
@@ -141,4 +141,29 @@ class SemanticSearch:
|
|
141 |
np.array(all_distances[:min_length]),
|
142 |
np.array(all_global_indices[:min_length]),
|
143 |
top_k
|
144 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
141 |
np.array(all_distances[:min_length]),
|
142 |
np.array(all_global_indices[:min_length]),
|
143 |
top_k
|
144 |
+
)
|
145 |
+
|
146 |
+
def _process_results(self, distances, global_indices, top_k):
    """Turn raw search hits into a ranked, de-duplicated DataFrame.

    Args:
        distances: Per-hit distances, positionally aligned with
            ``global_indices`` (array-like; plain lists are accepted).
        global_indices: Indices passed to ``self.metadata_mgr.get_metadata``
            to look up one metadata row per hit.
        top_k: Maximum number of rows to return.

    Returns:
        A DataFrame ordered by descending ``similarity`` with a fresh
        0..n-1 index and at most ``top_k`` rows. When either input is empty,
        or when processing fails, an empty DataFrame with the columns
        ``title``, ``summary``, ``source`` and ``similarity`` is returned.
    """
    empty_columns = ["title", "summary", "source", "similarity"]

    # len() rather than truthiness: the inputs may be NumPy arrays, whose
    # truth value is ambiguous.
    if len(global_indices) == 0 or len(distances) == 0:
        return pd.DataFrame(columns=empty_columns)

    try:
        metadata = self.metadata_mgr.get_metadata(global_indices)

        # Convert distance to a similarity score.
        # NOTE(review): 1 - d/2 equals cosine similarity only if d is a
        # squared L2 distance between unit-normalised vectors -- confirm
        # against the index that produces `distances`.
        similarity = 1 - np.asarray(distances) / 2

        # assign() builds a new frame, so the DataFrame handed back by the
        # metadata manager is not modified. A length mismatch between the
        # metadata rows and the scores raises here and is reported below.
        scored = metadata.assign(similarity=similarity)

        # Rank first, then de-duplicate: drop_duplicates keeps the first
        # occurrence, so sorting beforehand guarantees the most similar
        # copy of each (title, source) pair is the one that survives.
        ranked = scored.sort_values(
            "similarity", ascending=False, kind="stable"
        )
        unique_hits = ranked.drop_duplicates(subset=["title", "source"])

        # Reset the index for clean display.
        return unique_hits.head(top_k).reset_index(drop=True)

    except Exception as e:
        # UI boundary: surface the problem to the user and degrade to an
        # empty result instead of crashing the app.
        st.error(f"Error processing results: {e}")
        return pd.DataFrame(columns=empty_columns)
|