Spaces:
Running
Running
Update search_utils.py
Browse files- search_utils.py +22 -12
search_utils.py
CHANGED
@@ -37,11 +37,15 @@ class MetadataManager:
|
|
37 |
|
38 |
def get_metadata(self, global_indices):
|
39 |
"""Retrieve metadata with validation"""
|
40 |
-
|
|
|
41 |
return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
|
42 |
|
|
|
|
|
|
|
43 |
# Filter valid indices
|
44 |
-
valid_indices = [idx for idx in
|
45 |
if not valid_indices:
|
46 |
return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
|
47 |
|
@@ -145,25 +149,31 @@ class SemanticSearch:
|
|
145 |
|
146 |
def _process_results(self, distances, global_indices, top_k):
|
147 |
"""Process raw search results into formatted DataFrame"""
|
148 |
-
|
|
|
149 |
return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
|
150 |
|
151 |
try:
|
152 |
-
#
|
153 |
-
|
|
|
|
|
|
|
154 |
|
155 |
-
#
|
|
|
|
|
|
|
|
|
156 |
results['similarity'] = 1 - (distances / 2)
|
157 |
|
158 |
-
# Deduplicate
|
159 |
results = results.drop_duplicates(subset=["title", "source"])
|
|
|
|
|
160 |
|
161 |
-
# Sort by similarity and select top results
|
162 |
-
results = results.sort_values("similarity", ascending=False).head(top_k)
|
163 |
-
|
164 |
-
# Reset index for clean display
|
165 |
return results.reset_index(drop=True)
|
166 |
|
167 |
except Exception as e:
|
168 |
-
st.error(f"
|
169 |
return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
|
|
|
37 |
|
38 |
def get_metadata(self, global_indices):
|
39 |
"""Retrieve metadata with validation"""
|
40 |
+
# Check for empty numpy array properly
|
41 |
+
if isinstance(global_indices, np.ndarray) and global_indices.size == 0:
|
42 |
return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
|
43 |
|
44 |
+
# Convert numpy array to list for processing
|
45 |
+
indices_list = global_indices.tolist() if isinstance(global_indices, np.ndarray) else global_indices
|
46 |
+
|
47 |
# Filter valid indices
|
48 |
+
valid_indices = [idx for idx in indices_list if 0 <= idx < self.total_docs]
|
49 |
if not valid_indices:
|
50 |
return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
|
51 |
|
|
|
149 |
|
150 |
def _process_results(self, distances, global_indices, top_k):
|
151 |
"""Process raw search results into formatted DataFrame"""
|
152 |
+
# Proper numpy array emptiness checks
|
153 |
+
if global_indices.size == 0 or distances.size == 0:
|
154 |
return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
|
155 |
|
156 |
try:
|
157 |
+
# Convert numpy indices to Python list for metadata retrieval
|
158 |
+
indices_list = global_indices.tolist()
|
159 |
+
|
160 |
+
# Get metadata for matched indices
|
161 |
+
results = self.metadata_mgr.get_metadata(indices_list)
|
162 |
|
163 |
+
# Ensure distances match results length
|
164 |
+
if len(results) != len(distances):
|
165 |
+
distances = distances[:len(results)]
|
166 |
+
|
167 |
+
# Calculate similarity scores
|
168 |
results['similarity'] = 1 - (distances / 2)
|
169 |
|
170 |
+
# Deduplicate and sort results
|
171 |
results = results.drop_duplicates(subset=["title", "source"])
|
172 |
+
.sort_values("similarity", ascending=False)
|
173 |
+
.head(top_k)
|
174 |
|
|
|
|
|
|
|
|
|
175 |
return results.reset_index(drop=True)
|
176 |
|
177 |
except Exception as e:
|
178 |
+
st.error(f"Result processing failed: {str(e)}")
|
179 |
return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
|