Testys committed on
Commit f9e4fd2 · 1 Parent(s): 1160873

Update search_utils.py

Files changed (1)
  1. search_utils.py +115 -0
search_utils.py CHANGED
@@ -26,6 +26,7 @@ class MetadataManager:
         self.shard_map = {}
         self.loaded_shards = {}
         self.total_docs = 0
+        self.api_cache = {}
 
         logger.info("Initializing MetadataManager")
         self._ensure_directories()
 
@@ -275,6 +276,95 @@ class MetadataManager:
         logger.warning("No metadata records retrieved")
         return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
 
+    def _resolve_paper_url(self, title):
+        """Find paper URLs using multiple strategies."""
+        # Check the cache first
+        if title in self.api_cache:
+            return self.api_cache[title]
+
+        links = {}
+
+        # Try arXiv first
+        arxiv_url = self._get_arxiv_url(title)
+        if arxiv_url:
+            links["arxiv"] = arxiv_url
+
+        # Then attempt a direct link via Semantic Scholar's API
+        semantic_url = self._get_semantic_scholar_url(title)
+        if semantic_url:
+            links["semantic"] = semantic_url
+
+        # Fall back to a Google Scholar search
+        scholar_url = f"https://scholar.google.com/scholar?q={quote(title)}"
+        links["google"] = scholar_url
+
+        self.api_cache[title] = links
+        return links
+
+    def _get_arxiv_url(self, title):
+        """Search the arXiv API for the paper and return its abstract URL."""
+        try:
+            response = requests.get(
+                "http://export.arxiv.org/api/query",
+                params={
+                    "search_query": f'ti:"{title}"',
+                    "max_results": 1,
+                    "sortBy": "relevance"
+                },
+                timeout=5
+            )
+            response.raise_for_status()
+
+            # Parse the Atom XML response
+            from xml.etree import ElementTree as ET
+            root = ET.fromstring(response.content)
+            entry = root.find('{http://www.w3.org/2005/Atom}entry')
+            if entry is not None:
+                arxiv_url = entry.find('{http://www.w3.org/2005/Atom}id').text
+                return arxiv_url.replace('http:', 'https:')  # Force HTTPS
+        except Exception as e:
+            logger.error(f"arXiv API failed for '{title}': {str(e)}")
+        return None
+
+    def _get_semantic_scholar_url(self, title):
+        """Search the Semantic Scholar API for a paper by title and return its URL.
+
+        Queries the Semantic Scholar Graph API for the best title match.
+        If the API provides a URL, it is returned directly; otherwise, if a
+        paperId is available, the URL is constructed from the paperId.
+        """
+        try:
+            response = requests.get(
+                "https://api.semanticscholar.org/graph/v1/paper/search",
+                params={
+                    "query": title,
+                    "limit": 1,
+                    "fields": "paperId,url,title"
+                },
+                timeout=5
+            )
+            response.raise_for_status()
+            data = response.json()
+
+            # Check that we got at least one result
+            if "data" in data and len(data["data"]) > 0:
+                paper = data["data"][0]
+                # Prefer the URL provided by the API
+                if paper.get("url"):
+                    return paper["url"]
+                # Otherwise build one from the paperId if available
+                elif paper.get("paperId"):
+                    return f"https://www.semanticscholar.org/paper/{paper['paperId']}"
+        except Exception as e:
+            logger.error(f"Semantic Scholar API failed for '{title}': {str(e)}")
+        return None
+
+
 class SemanticSearch:
     def __init__(self):
         self.shard_dir = Path("compressed_shards")
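
Usage sketch for the new resolver (`mgr` is a stand-in name for an already-initialized MetadataManager; the methods also assume `requests` and `urllib.parse.quote` are imported at the top of search_utils.py):

# `mgr` is hypothetical; any initialized MetadataManager works.
links = mgr._resolve_paper_url("Attention Is All You Need")

# "google" is always present; "arxiv" and "semantic" appear only when the
# corresponding API lookup succeeded.
print(links["google"])
# https://scholar.google.com/scholar?q=Attention%20Is%20All%20You%20Need
print(links.get("arxiv"))  # e.g. https://arxiv.org/abs/1706.03762v5, or None

# Repeat calls for the same title are served from api_cache, so no further
# network I/O; the very same dict object is returned.
assert mgr._resolve_paper_url("Attention Is All You Need") is links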
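
For reference, a self-contained sketch of the two response shapes being parsed; the arXiv values are illustrative, and the Semantic Scholar fields mirror the `fields=paperId,url,title` request (runs without network access):

from xml.etree import ElementTree as ET

# arXiv answers with an Atom feed; <id> carries the abstract URL.
atom = '''<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <id>http://arxiv.org/abs/1706.03762v5</id>
    <title>Attention Is All You Need</title>
  </entry>
</feed>'''
entry = ET.fromstring(atom).find('{http://www.w3.org/2005/Atom}entry')
if entry is not None:
    url = entry.find('{http://www.w3.org/2005/Atom}id').text
    print(url.replace('http:', 'https:'))  # https://arxiv.org/abs/1706.03762v5

# Semantic Scholar's paper/search returns JSON shaped like this (values made up).
payload = {
    "total": 1,
    "data": [{
        "paperId": "0123456789abcdef0123456789abcdef01234567",
        "url": None,  # can be missing or empty, hence the paperId fallback
        "title": "Attention Is All You Need",
    }],
}
if "data" in payload and len(payload["data"]) > 0:
    paper = payload["data"][0]
    print(paper.get("url") or f"https://www.semanticscholar.org/paper/{paper['paperId']}")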
 
@@ -444,6 +534,13 @@ class SemanticSearch:
         self.logger.debug(f"Similarity stats: min={results['similarity'].min():.3f}, " +
                           f"max={results['similarity'].max():.3f}, " +
                           f"mean={results['similarity'].mean():.3f}")
+
+        # Resolve source links for each title and render them as HTML
+        results['source'] = results['title'].apply(
+            lambda title: self._format_source_links(
+                self.metadata_mgr._resolve_paper_url(title)
+            )
+        )
 
         # Deduplicate and sort results
         pre_dedup = len(results)
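
Note the `.apply` above makes up to two HTTP requests per unseen title on a cold cache; repeated titles are served from `api_cache`. A minimal stubbed sketch of the column transformation (hypothetical data, no network):

import pandas as pd

results = pd.DataFrame({"title": ["Paper A", "Paper B"]})

# Stubs standing in for _resolve_paper_url and _format_source_links.
def resolve_stub(title):
    return {"google": f"https://scholar.google.com/scholar?q={title}"}

def format_stub(links):
    return " | ".join(f"<a href='{url}'>{name}</a>" for name, url in links.items())

results['source'] = results['title'].apply(lambda t: format_stub(resolve_stub(t)))
print(results['source'].iloc[0])
# <a href='https://scholar.google.com/scholar?q=Paper A'>google</a>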
 
@@ -459,3 +556,21 @@ class SemanticSearch:
         except Exception as e:
             self.logger.error(f"Result processing failed: {str(e)}", exc_info=True)
             return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
+
+    def _format_source_links(self, links):
+        """Generate an HTML snippet for the available source links."""
+        html_parts = []
+        if "arxiv" in links:
+            html_parts.append(
+                f"<a class='source-link' href='{links['arxiv']}' target='_blank' rel='noopener noreferrer'> 📜 arXiv</a>"
+            )
+        if "semantic" in links:
+            html_parts.append(
+                f"<a class='source-link' href='{links['semantic']}' target='_blank' rel='noopener noreferrer'> 🌐 Semantic Scholar</a>"
+            )
+        if "google" in links:
+            html_parts.append(
+                f"<a class='source-link' href='{links['google']}' target='_blank' rel='noopener noreferrer'> 🔍 Google Scholar</a>"
+            )
+        return " | ".join(html_parts)
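
For a fully resolved links dict, the helper emits one anchor per source joined with " | "; a sketch (placeholder URLs, `search` standing in for an initialized SemanticSearch):

links = {
    "arxiv": "https://arxiv.org/abs/1706.03762",
    "semantic": "https://www.semanticscholar.org/paper/0123456789abcdef",
    "google": "https://scholar.google.com/scholar?q=Attention%20Is%20All%20You%20Need",
}
html = search._format_source_links(links)
# One line in practice; wrapped here for readability:
# <a class='source-link' href='https://arxiv.org/abs/1706.03762' ...> 📜 arXiv</a>
#  | <a class='source-link' href='...'> 🌐 Semantic Scholar</a>
#  | <a class='source-link' href='...'> 🔍 Google Scholar</a>

Since `_resolve_paper_url` always sets the "google" key, the joined string is never empty when fed from the resolver.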