Testys committed
Commit a138102 · 1 parent: f848328

Update search_utils.py

Files changed (1)
  1. search_utils.py +44 -32
search_utils.py CHANGED
@@ -12,7 +12,6 @@ from urllib.parse import quote
 import requests
 import shutil
 import concurrent.futures
-# Optional: Uncomment if you want to use lru_cache for instance methods
 from functools import lru_cache
 
 # Configure logging
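Note on the dropped comment: a bare @lru_cache on an instance method caches on self, which pins every instance (and the DataFrames it holds) in a module-level cache. If per-instance caching is wanted later, one common pattern is to build the cached callable in __init__. A minimal sketch, assuming a loader method like the one in this file (names are illustrative, not this repo's API):

    from functools import lru_cache

    class ShardLoader:
        def __init__(self):
            # Per-instance cache: nothing global holds a reference to `self`,
            # and each instance gets its own LRU of loaded shards.
            self._load_cached = lru_cache(maxsize=32)(self._load_shard_impl)

        def _load_shard_impl(self, shard):
            ...  # the pd.read_parquet body from the diff would live here

        def load_shard(self, shard):
            return self._load_cached(shard)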
@@ -144,22 +143,28 @@ class MetadataManager:
         shard_path = self.shard_dir / shard
         if not shard_path.exists():
             logger.error(f"Shard file not found: {shard_path}")
-            return pd.DataFrame(columns=["title", "summary", "similarity","authors", "source"])
+            return pd.DataFrame(columns=["title", "summary", "similarity", "authors", "source"])
 
         file_size_mb = os.path.getsize(shard_path) / (1024 * 1024)
         logger.info(f"Loading shard file: {shard} (size: {file_size_mb:.2f} MB)")
 
         try:
-            self.loaded_shards[shard] = pd.read_parquet(shard_path, columns=["title", "summary", "source", "authors"])
+            # Load only the needed columns (pd.read_parquet has no dtype kwarg;
+            # the source column is normalized to lists below)
+            self.loaded_shards[shard] = pd.read_parquet(
+                shard_path,
+                columns=["title", "summary", "source", "authors"],
+            )
+            # Convert "; "-joined source strings to lists
+            self.loaded_shards[shard]['source'] = self.loaded_shards[shard]['source'].apply(
+                lambda x: x.split("; ") if isinstance(x, str) else []
+            )
+            # Handle missing summaries
+            self.loaded_shards[shard]['summary'] = self.loaded_shards[shard]['summary'].fillna("")
             logger.info(f"Loaded shard {shard} with {len(self.loaded_shards[shard])} rows")
 
         except Exception as e:
             logger.error(f"Failed to read parquet file {shard}: {str(e)}")
-            try:
-                schema = pd.read_parquet(shard_path, engine='pyarrow').dtypes
-                logger.info(f"Parquet schema: {schema}")
-            except Exception:
-                pass
             return pd.DataFrame(columns=["title", "summary", "similarity", "source", "authors"])
         df = self.loaded_shards[shard]
         df_len = len(df)
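The reworked loader normalizes source from a "; "-joined string into a list and blanks missing summaries. For reference, the same round-trip on a toy frame (illustrative data only):

    import pandas as pd

    df = pd.DataFrame({
        "title": ["A", "B"],
        "summary": ["text", None],
        "source": ["http://x.org/1; http://x.org/2", None],
    })

    # Same normalization as the loader above
    df["source"] = df["source"].apply(lambda x: x.split("; ") if isinstance(x, str) else [])
    df["summary"] = df["summary"].fillna("")

    print(df["source"].tolist())   # [['http://x.org/1', 'http://x.org/2'], []]
    print(df["summary"].tolist())  # ['text', '']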
@@ -220,8 +225,8 @@ class MetadataManager:
         else:
             logger.warning("No metadata records retrieved")
             return pd.DataFrame(columns=["title", "summary", "similarity", "source"])
-
-
+
+
 class SemanticSearch:
     def __init__(self):
         self.shard_dir = Path("compressed_shards")
@@ -310,7 +315,6 @@ class SemanticSearch:
 
         all_distances = []
         all_global_indices = []
-        # Run shard searches in parallel
        with concurrent.futures.ThreadPoolExecutor() as executor:
             futures = {
                 executor.submit(self._search_shard, shard_idx, index, query_embedding, top_k): shard_idx
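The executor block fans out one _search_shard call per loaded FAISS index and gathers results as they complete; shard-local ids are then mapped back to global indices. A minimal sketch of the same fan-out/fan-in pattern (stand-in objects, not this repo's API; a FAISS index.search returns a (distances, ids) pair):

    import concurrent.futures

    def search_shards(indices, query_embedding, top_k):
        """Run one search per shard in parallel, skipping shards that fail."""
        all_distances, all_ids = [], []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = {
                executor.submit(index.search, query_embedding, top_k): n
                for n, index in enumerate(indices)
            }
            for future in concurrent.futures.as_completed(futures):
                try:
                    distances, ids = future.result()
                    all_distances.append(distances)
                    all_ids.append(ids)
                except Exception as exc:
                    print(f"shard {futures[future]} failed: {exc}")
        return all_distances, all_ids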
@@ -351,7 +355,7 @@
         return None
 
     def _process_results(self, distances, global_indices, top_k):
-        """Process raw search results: retrieve metadata, calculate similarity, and deduplicate."""
+        """Process raw search results with correct similarity calculation."""
         process_start = time.time()
         if global_indices.size == 0 or distances.size == 0:
             self.logger.warning("No search results to process")
@@ -367,33 +371,41 @@
                 self.logger.warning("No metadata found for indices")
                 return pd.DataFrame(columns=["title", "summary", "source", "authors", "similarity"])
 
+            # Handle distance-results alignment
             if len(results) != len(distances):
                 self.logger.warning(f"Mismatch between distances ({len(distances)}) and results ({len(results)})")
-                if len(results) < len(distances):
-                    distances = distances[:len(results)]
-                else:
-                    distances = np.pad(distances, (0, len(results) - len(distances)), 'constant', constant_values=1.0)
-
-            self.logger.debug("Calculating similarity scores")
-            results['similarity'] = 1 - (distances / 2)
-
-            # Ensure all required columns
-            results['source'] = results["source"]
-
+                min_len = min(len(results), len(distances))
+                results = results.iloc[:min_len]
+                distances = distances[:min_len]
+
+            # Calculate similarity (cosine similarity = inner product for normalized embeddings)
+            results['similarity'] = distances
+
+            # Ensure URL lists are properly formatted
+            results['source'] = results['source'].apply(
+                lambda x: [
+                    url.strip().rstrip(')')  # Trim whitespace and stray ')'
+                    for url in (x if isinstance(x, list) else str(x).split(';'))
+                    if url.strip()  # Drop empty entries
+                ] if isinstance(x, (str, list)) else []
+            )
+
+            # Deduplicate and sort
             required_columns = ["title", "summary", "authors", "source", "similarity"]
-            for col in required_columns:
-                if col not in results.columns:
-                    results[col] = None  # Fill missing columns with None
-
             pre_dedup = len(results)
-            results = results.drop_duplicates(subset=["title", "authors", "source"]).sort_values("similarity", ascending=False).head(top_k)
-
+            results = (
+                results.drop_duplicates(subset=["title", "authors"])
+                .sort_values("similarity", ascending=False)
+                .head(top_k)
+            )
             post_dedup = len(results)
+
             if pre_dedup > post_dedup:
                 self.logger.info(f"Removed {pre_dedup - post_dedup} duplicate results")
-
-            self.logger.info(f"Results processed in {time.time() - process_start:.2f}s, returning {len(results)} items")
+
+            self.logger.info(f"Results processed in {time.time() - process_start:.2f}s")
             return results[required_columns].reset_index(drop=True)
+
         except Exception as e:
             self.logger.error(f"Result processing failed: {str(e)}", exc_info=True)
-            return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
+            return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
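On the similarity change: FAISS L2 indexes report squared distances, and for unit-length vectors squared L2 distance equals 2 * (1 - cosine), which is where the old 1 - d/2 conversion came from. With an inner-product index over L2-normalized embeddings, the returned scores already are cosine similarities, as the new code assumes. A toy check, assuming the faiss package and random data:

    import numpy as np
    import faiss

    dim = 8
    vectors = np.random.rand(100, dim).astype("float32")
    faiss.normalize_L2(vectors)           # unit-normalize so inner product == cosine
    index = faiss.IndexFlatIP(dim)        # inner-product index
    index.add(vectors)

    query = np.random.rand(1, dim).astype("float32")
    faiss.normalize_L2(query)
    scores, ids = index.search(query, 5)  # scores are cosine similarities
    print(scores, ids)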
 
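The source cleanup in _process_results is easiest to read pulled out as a helper; a standalone version of the same logic (clean_sources is an illustrative name, not in the repo):

    def clean_sources(x):
        """Normalize a source field to a list of URL strings."""
        if not isinstance(x, (str, list)):
            return []
        parts = x if isinstance(x, list) else str(x).split(";")
        return [url.strip().rstrip(")") for url in parts if url.strip()]

    print(clean_sources("http://a.org/1; http://a.org/2)"))  # ['http://a.org/1', 'http://a.org/2']
    print(clean_sources(["http://a.org/1 ", ""]))            # ['http://a.org/1']
    print(clean_sources(None))                               # []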