Update search_utils.py
search_utils.py  CHANGED  +6 -56
@@ -38,7 +38,6 @@ class MetadataManager:
         try:
             # Load the parquet file
             self.df = pd.read_parquet(self.metadata_path)
-
             # Clean and format the data
             self.df['source'] = self.df['source'].apply(
                 lambda x: [
@@ -47,8 +46,12 @@ class MetadataManager:
                     if url.strip()
                 ]
             )
+            # Convert list of sources to a single string
+            self.df['source'] = self.df['source'].apply(lambda x: ' | '.join(x) if isinstance(x, list) else x)
+
             self.total_docs = len(self.df)
 
+
             logger.info(f"Successfully loaded {self.total_docs} documents")
         except Exception as e:
             logger.error(f"Failed to load metadata: {str(e)}")
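The added lines flatten the list-valued `source` column into a single pipe-separated string, so downstream code can treat it as plain text. A minimal sketch of the same transformation on toy data (only the lambda comes from the diff; the frame contents are invented):

```python
import pandas as pd

# Toy stand-in for the metadata frame; the real one is read from parquet
# by MetadataManager.
df = pd.DataFrame({
    "title": ["Paper A", "Paper B"],
    "source": [
        ["https://example.org/a1", "https://example.org/a2"],  # list of URLs
        "https://example.org/b",                               # already a string
    ],
})

# The normalization the commit adds: join lists with ' | ',
# pass scalar values through unchanged.
df["source"] = df["source"].apply(lambda x: " | ".join(x) if isinstance(x, list) else x)

print(df["source"].tolist())
# ['https://example.org/a1 | https://example.org/a2', 'https://example.org/b']
```

The `isinstance` guard makes the step idempotent: re-running the cleanup cannot corrupt rows that were already joined.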
@@ -238,59 +241,6 @@ class SemanticSearch:
             self.logger.error(f"Search failed in shard {shard_idx}: {str(e)}")
             return None
 
-    def _process_results(self, distances, global_indices, top_k):
-        """Process raw search results with correct similarity calculation for cosine similarity."""
-        process_start = time.time()
-        if global_indices.size == 0 or distances.size == 0:
-            self.logger.warning("No search results to process")
-            return pd.DataFrame(columns=["title", "summary", "source", "authors", "similarity"])
-
-        try:
-            self.logger.info(f"Retrieving metadata for {len(global_indices)} indices")
-            metadata_start = time.time()
-            results = self.metadata_mgr.get_metadata(global_indices)
-            self.logger.info(f"Metadata retrieved in {time.time() - metadata_start:.2f}s, got {len(results)} records")
-
-            if len(results) == 0:
-                self.logger.warning("No metadata found for indices")
-                return pd.DataFrame(columns=["title", "summary", "source", "authors", "similarity"])
-
-            # Handle distance-results alignment
-            if len(results) != len(distances):
-                self.logger.warning(f"Mismatch between distances ({len(distances)}) and results ({len(results)})")
-                min_len = min(len(results), len(distances))
-                results = results.iloc[:min_len]
-                distances = distances[:min_len]
-
-            # For inner product with normalized vectors, similarity is directly the distance
-            # (FAISS IP search already returns higher scores for more similar items)
-            results['similarity'] = 1 - (distances/2)
-
-            # Deduplicate and sort
-            required_columns = ["title", "summary", "authors", "source", "similarity"]
-            pre_dedup = len(results)
-            results = (
-                results.drop_duplicates(subset=["title", "authors"])
-                .sort_values("similarity", ascending=False)
-                .head(top_k)
-            )
-            post_dedup = len(results)
-
-            if pre_dedup > post_dedup:
-                self.logger.info(f"Removed {pre_dedup - post_dedup} duplicate results")
-
-            self.logger.info(f"Results processed in {time.time() - process_start:.2f}s")
-            return results[required_columns].reset_index(drop=True)
-
-        except Exception as e:
-            self.logger.error(f"Result processing failed: {str(e)}", exc_info=True)
-            return pd.DataFrame(columns=["title", "summary", "source", "authors", "similarity"])
-
-
-
-
-
-
-
     def _process_results(self, distances, global_indices, top_k):
         """Process raw search results into formatted DataFrame"""
         process_start = time.time()
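This hunk deletes the first of two `_process_results` definitions. Since Python binds the last definition of a method name in a class body, the earlier copy was dead code and removing it changes nothing at runtime. Its `1 - (distances / 2)` mapping is still worth noting: despite the comment about inner-product search, that formula is the standard conversion from squared L2 distance between unit-norm vectors to cosine similarity (the diff does not show which FAISS metric the index actually uses). A small check of the identity, with made-up vectors:

```python
import numpy as np

# Two random vectors, normalized to unit length (toy values; real
# embeddings come from the search model, which this diff does not show).
rng = np.random.default_rng(0)
a, b = rng.normal(size=(2, 8))
a /= np.linalg.norm(a)
b /= np.linalg.norm(b)

cosine = float(a @ b)                  # inner product of unit vectors
sq_l2 = float(np.sum((a - b) ** 2))    # what an L2 FAISS index would return

# For unit vectors ||a - b||^2 = 2 - 2*cos(a, b), hence cos = 1 - d^2 / 2,
# i.e. exactly the removed method's `1 - (distances / 2)`.
assert np.isclose(1 - sq_l2 / 2, cosine)
print(cosine, 1 - sq_l2 / 2)
```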
@@ -337,7 +287,7 @@ class SemanticSearch:
 
         # Deduplicate and sort results
         pre_dedup = len(results)
-        results = results.drop_duplicates(subset=["title"
+        results = results.drop_duplicates(subset=["title"]).sort_values("similarity", ascending=False).head(top_k)
         post_dedup = len(results)
 
         if pre_dedup > post_dedup:
@@ -359,4 +309,4 @@ class SemanticSearch:
 
         except Exception as e:
             self.logger.error(f"Result processing failed: {str(e)}", exc_info=True)
-            return pd.DataFrame(columns=["title", "summary", "similarity"])
+            return pd.DataFrame(columns=["title", "summary", "similarity", 'authors'])
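Two behavioral details in these last hunks: deduplication now keys on `title` alone (the deleted method used `["title", "authors"]`), and the error-path DataFrame gains an `authors` column so callers see the same schema whether processing succeeds or fails. A sketch of the new dedup/rank/truncate chain on invented rows:

```python
import pandas as pd

# Invented result rows; the column names follow the diff.
results = pd.DataFrame({
    "title":      ["A", "A", "B", "C"],
    "summary":    ["s1", "s1 again", "s2", "s3"],
    "authors":    ["x", "x", "y", "z"],
    "similarity": [0.91, 0.88, 0.95, 0.40],
})
top_k = 2

# Same chain as the + line: drop repeated titles (first occurrence wins),
# rank by similarity, keep the top_k rows.
results = (
    results.drop_duplicates(subset=["title"])
    .sort_values("similarity", ascending=False)
    .head(top_k)
)
print(results[["title", "similarity"]].values.tolist())
# [['B', 0.95], ['A', 0.91]]
```

Note that `drop_duplicates` keeps the first occurrence in the current row order, not necessarily the highest-similarity one; if rows for the same title can arrive unsorted, sorting before deduplicating would be the safer order.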