Testys committed on
Commit 2906fee · 1 Parent(s): 500f692

Update search_utils.py

Files changed (1)
  1. search_utils.py +9 -81
search_utils.py CHANGED
@@ -145,7 +145,7 @@ class MetadataManager:
         shard_path = self.shard_dir / shard
         if not shard_path.exists():
             logger.error(f"Shard file not found: {shard_path}")
-            return pd.DataFrame(columns=["title", "summary", "similarity"])
+            return pd.DataFrame(columns=["title", "summary", "similarity", "source"])
         file_size_mb = os.path.getsize(shard_path) / (1024 * 1024)
         logger.info(f"Loading shard file: {shard} (size: {file_size_mb:.2f} MB)")
         try:
@@ -158,7 +158,7 @@ class MetadataManager:
                 logger.info(f"Parquet schema: {schema}")
             except Exception:
                 pass
-                return pd.DataFrame(columns=["title", "summary", "similarity"])
+                return pd.DataFrame(columns=["title", "summary", "similarity", "source"])
             df = self.loaded_shards[shard]
             df_len = len(df)
             valid_local_indices = [idx for idx in local_indices if 0 <= idx < df_len]
@@ -170,13 +170,13 @@ class MetadataManager:
             return chunk
         except Exception as e:
             logger.error(f"Error processing shard {shard}: {str(e)}", exc_info=True)
-            return pd.DataFrame(columns=["title", "summary", "similarity"])
+            return pd.DataFrame(columns=["title", "summary", "similarity", "source"])
 
     def get_metadata(self, global_indices):
         """Retrieve metadata for a batch of global indices using parallel shard processing."""
         if isinstance(global_indices, np.ndarray) and global_indices.size == 0:
             logger.warning("Empty indices array passed to get_metadata")
-            return pd.DataFrame(columns=["title", "summary", "similarity"])
+            return pd.DataFrame(columns=["title", "summary", "similarity", "source"])
 
         indices_list = global_indices.tolist() if isinstance(global_indices, np.ndarray) else global_indices
         logger.info(f"Retrieving metadata for {len(indices_list)} indices")
@@ -186,7 +186,7 @@ class MetadataManager:
             logger.warning(f"Filtered out {invalid_count} invalid indices")
         if not valid_indices:
             logger.warning("No valid indices remain after filtering")
-            return pd.DataFrame(columns=["title", "summary", "similarity"])
+            return pd.DataFrame(columns=["title", "summary", "similarity", "source"])
 
         # Group indices by shard
         shard_groups = {}
@@ -216,69 +216,9 @@ class MetadataManager:
             return combined
         else:
             logger.warning("No metadata records retrieved")
-            return pd.DataFrame(columns=["title", "summary", "similarity"])
+            return pd.DataFrame(columns=["title", "summary", "similarity", "source"])
 
-    def _init_url_resolver(self):
-        """Initialize API session and cache."""
-        self.session = requests.Session()
-        adapter = requests.adapters.HTTPAdapter(
-            pool_connections=10,
-            pool_maxsize=10,
-            max_retries=3
-        )
-        self.session.mount("https://", adapter)
-
-    def resolve_url(self, title: str) -> str:
-        """Optimized URL resolution with fail-fast."""
-        if title in self.api_cache:
-            return self.api_cache[title]
-
-        links = {}
-        arxiv_url = self._get_arxiv_url(title)
-        if arxiv_url:
-            links["arxiv"] = arxiv_url
-        semantic_url = self._get_semantic_url(title)
-        if semantic_url:
-            links["semantic"] = semantic_url
-        scholar_url = f"https://scholar.google.com/scholar?q={quote(title)}"
-        links["google"] = scholar_url
-
-        self.api_cache[title] = links
-        return links
-
-    def _get_arxiv_url(self, title: str) -> str:
-        """Fast arXiv lookup with timeout."""
-        with self.session.get(
-            "http://export.arxiv.org/api/query",
-            params={"search_query": f'ti:"{title}"', "max_results": 1, "sortBy": "relevance"},
-            timeout=2
-        ) as response:
-            if response.ok:
-                return self._parse_arxiv_response(response.text)
-            return ""
-
-    def _parse_arxiv_response(self, xml: str) -> str:
-        """Fast XML parsing using string operations."""
-        if "<entry>" not in xml:
-            return ""
-        start = xml.find("<id>") + 4
-        end = xml.find("</id>", start)
-        return xml[start:end].replace("http:", "https:") if start > 3 else ""
-
-    def _get_semantic_url(self, title: str) -> str:
-        """Batch-friendly Semantic Scholar lookup."""
-        with self.session.get(
-            "https://api.semanticscholar.org/graph/v1/paper/search",
-            params={"query": title[:200], "limit": 1},
-            timeout=2
-        ) as response:
-            if response.ok:
-                data = response.json()
-                if data.get("data"):
-                    return data["data"][0].get("url", "")
-            return ""
 
-
 class SemanticSearch:
     def __init__(self):
         self.shard_dir = Path("compressed_shards")
@@ -429,9 +369,8 @@ class SemanticSearch:
             self.logger.debug(f"Similarity stats: min={results['similarity'].min():.3f}, " +
                               f"max={results['similarity'].max():.3f}, " +
                               f"mean={results['similarity'].mean():.3f}")
-            results['source'] = results['title'].apply(
-                lambda title: self._format_source_links(self.metadata_mgr.resolve_url(title))
-            )
+            results['source'] = results["source"]
+
             pre_dedup = len(results)
             results = results.drop_duplicates(subset=["title", "source"]).sort_values("similarity", ascending=False).head(top_k)
             post_dedup = len(results)
@@ -441,15 +380,4 @@ class SemanticSearch:
             return results.reset_index(drop=True)
         except Exception as e:
             self.logger.error(f"Result processing failed: {str(e)}", exc_info=True)
-            return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
-
-    def _format_source_links(self, links):
-        """Generate an HTML snippet for the available source links."""
-        html_parts = []
-        if "arxiv" in links:
-            html_parts.append(f"<a class='source-link' href='{links['arxiv']}' target='_blank' rel='noopener noreferrer'> 📜 arXiv</a>")
-        if "semantic" in links:
-            html_parts.append(f"<a class='source-link' href='{links['semantic']}' target='_blank' rel='noopener noreferrer'> 🌐 Semantic Scholar</a>")
-        if "google" in links:
-            html_parts.append(f"<a class='source-link' href='{links['google']}' target='_blank' rel='noopener noreferrer'> 🔍 Google Scholar</a>")
-        return " | ".join(html_parts)
+            return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
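
For context on the change: the commit removes the live arXiv / Semantic Scholar / Google Scholar URL resolution and instead relies on each metadata shard already carrying a pre-formatted `source` column, so every error path now returns the same four-column empty frame. Below is a minimal sketch of that post-change flow; the `RESULT_COLUMNS`, `_empty_results`, and `process_results` names are illustrative only and are not part of the commit.

import pandas as pd

# Shared fallback schema used by every error path in the updated file (assumed).
RESULT_COLUMNS = ["title", "summary", "similarity", "source"]

def _empty_results() -> pd.DataFrame:
    """Empty frame with the columns downstream consumers expect."""
    return pd.DataFrame(columns=RESULT_COLUMNS)

def process_results(results: pd.DataFrame, top_k: int = 5) -> pd.DataFrame:
    """Sketch: 'source' comes straight from shard metadata, with no per-title API calls."""
    if results.empty:
        return _empty_results()
    return (
        results.drop_duplicates(subset=["title", "source"])
               .sort_values("similarity", ascending=False)
               .head(top_k)
               .reset_index(drop=True)
    )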