Testys committed on
Commit
2dec497
·
1 Parent(s): b73a811

Update search_utils.py

Browse files
Files changed (1) hide show
  1. search_utils.py +22 -12
search_utils.py CHANGED
@@ -37,11 +37,15 @@ class MetadataManager:
37
 
38
  def get_metadata(self, global_indices):
39
  """Retrieve metadata with validation"""
40
- if not global_indices:
 
41
  return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
42
 
 
 
 
43
  # Filter valid indices
44
- valid_indices = [idx for idx in global_indices if 0 <= idx < self.total_docs]
45
  if not valid_indices:
46
  return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
47
 
@@ -145,25 +149,31 @@ class SemanticSearch:
145
 
146
  def _process_results(self, distances, global_indices, top_k):
147
  """Process raw search results into formatted DataFrame"""
148
- if len(global_indices) == 0 or len(distances) == 0:
 
149
  return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
150
 
151
  try:
152
- # Get metadata for valid indices
153
- results = self.metadata_mgr.get_metadata(global_indices)
 
 
 
154
 
155
- # Calculate similarity scores (convert L2 distance to cosine similarity approximation)
 
 
 
 
156
  results['similarity'] = 1 - (distances / 2)
157
 
158
- # Deduplicate results based on title and source
159
  results = results.drop_duplicates(subset=["title", "source"])
 
 
160
 
161
- # Sort by similarity and select top results
162
- results = results.sort_values("similarity", ascending=False).head(top_k)
163
-
164
- # Reset index for clean display
165
  return results.reset_index(drop=True)
166
 
167
  except Exception as e:
168
- st.error(f"Error processing results: {str(e)}")
169
  return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
 
37
 
38
  def get_metadata(self, global_indices):
39
  """Retrieve metadata with validation"""
40
+ # Check for empty numpy array properly
41
+ if isinstance(global_indices, np.ndarray) and global_indices.size == 0:
42
  return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
43
 
44
+ # Convert numpy array to list for processing
45
+ indices_list = global_indices.tolist() if isinstance(global_indices, np.ndarray) else global_indices
46
+
47
  # Filter valid indices
48
+ valid_indices = [idx for idx in indices_list if 0 <= idx < self.total_docs]
49
  if not valid_indices:
50
  return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
51
 
 
149
 
150
  def _process_results(self, distances, global_indices, top_k):
151
  """Process raw search results into formatted DataFrame"""
152
+ # Proper numpy array emptiness checks
153
+ if global_indices.size == 0 or distances.size == 0:
154
  return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
155
 
156
  try:
157
+ # Convert numpy indices to Python list for metadata retrieval
158
+ indices_list = global_indices.tolist()
159
+
160
+ # Get metadata for matched indices
161
+ results = self.metadata_mgr.get_metadata(indices_list)
162
 
163
+ # Ensure distances match results length
164
+ if len(results) != len(distances):
165
+ distances = distances[:len(results)]
166
+
167
+ # Calculate similarity scores
168
  results['similarity'] = 1 - (distances / 2)
169
 
170
+ # Deduplicate and sort results
171
  results = results.drop_duplicates(subset=["title", "source"])
172
+ .sort_values("similarity", ascending=False)
173
+ .head(top_k)
174
 
 
 
 
 
175
  return results.reset_index(drop=True)
176
 
177
  except Exception as e:
178
+ st.error(f"Result processing failed: {str(e)}")
179
  return pd.DataFrame(columns=["title", "summary", "source", "similarity"])