Spaces:

Testys
/

semantic-search

Running

App Files Files Community

Testys committed on Mar 17

Commit

f9e4fd2

1 Parent(s): 1160873

Update search_utils.py

Browse files

Files changed (1) hide show

search_utils.py +115 -0

search_utils.py CHANGED Viewed

@@ -26,6 +26,7 @@ class MetadataManager:
         self.shard_map = {}
         self.loaded_shards = {}
         self.total_docs = 0
         logger.info("Initializing MetadataManager")
         self._ensure_directories()
@@ -275,6 +276,95 @@ class MetadataManager:
             logger.warning("No metadata records retrieved")
             return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
 class SemanticSearch:
     def __init__(self):
         self.shard_dir = Path("compressed_shards")
@@ -444,6 +534,13 @@ class SemanticSearch:
                 self.logger.debug(f"Similarity stats: min={results['similarity'].min():.3f}, " +
                                  f"max={results['similarity'].max():.3f}, " +
                                  f"mean={results['similarity'].mean():.3f}")
             # Deduplicate and sort results
             pre_dedup = len(results)
@@ -459,3 +556,21 @@ class SemanticSearch:
         except Exception as e:
             self.logger.error(f"Result processing failed: {str(e)}", exc_info=True)
             return pd.DataFrame(columns=["title", "summary", "source", "similarity"])

         self.shard_map = {}
         self.loaded_shards = {}
         self.total_docs = 0
+        self.api_cache = {}
         logger.info("Initializing MetadataManager")
         self._ensure_directories()
             logger.warning("No metadata records retrieved")
             return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
+    def _resolve_paper_url(self, title):
+        """Collect candidate source links for the paper called ``title``.
+
+        Results are memoised in ``self.api_cache`` keyed by the title, so a
+        title already present there is returned without any remote lookup.
+
+        Args:
+            title: Paper title to look up.
+
+        Returns:
+            dict mapping a provider key to a URL string. ``"arxiv"`` and
+            ``"semantic_search"`` are present only when the respective
+            lookup returned a URL; ``"google"`` (a Google Scholar search
+            URL) is always present for a usable title. An empty dict is
+            returned when ``title`` is not a non-blank string.
+        """
+        # Missing titles (None, NaN, "") cannot be searched for, and
+        # ``quote`` raises TypeError on non-string input; bail out before
+        # any network call instead of raising out of this method.
+        if not isinstance(title, str) or not title.strip():
+            return {}
+        # Check cache first
+        if title in self.api_cache:
+            return self.api_cache[title]
+        links = {}
+        # Providers are queried in this order (arXiv first); a provider
+        # contributes a link only when its lookup returned a truthy URL.
+        for key, lookup in (
+            ("arxiv", self._get_arxiv_url),
+            ("semantic_search", self._get_semantic_scholar_url),
+        ):
+            url = lookup(title)
+            if url:
+                links[key] = url
+        # Always offer a Google Scholar search as the fallback link.
+        links["google"] = f"https://scholar.google.com/scholar?q={quote(title)}"
+        self.api_cache[title] = links
+        return links
+    def _get_arxiv_url(self, title):
+        """Query the arXiv API for ``title`` and return the top hit's URL.
+
+        Args:
+            title: Paper title; sent as a double-quoted ``ti:`` (title
+                field) query with at most one result requested.
+
+        Returns:
+            The text of the first Atom entry's ``<id>`` element with
+            ``http:`` rewritten to ``https:``, or ``None`` when there is no
+            entry, the entry has no id, or the request/parsing fails
+            (failures are logged, not raised).
+        """
+        from xml.etree import ElementTree as ET
+        atom_ns = "{http://www.w3.org/2005/Atom}"
+        # The title is wrapped in double quotes below, so a double quote
+        # inside the title would terminate the quoted phrase early; replace
+        # embedded quotes with spaces before building the query.
+        phrase = str(title).replace('"', " ").strip()
+        try:
+            response = requests.get(
+                "http://export.arxiv.org/api/query",
+                params={
+                    "search_query": f'ti:"{phrase}"',
+                    "max_results": 1,
+                    "sortBy": "relevance",
+                },
+                timeout=5,
+            )
+            response.raise_for_status()
+            # Parse the Atom XML response
+            root = ET.fromstring(response.content)
+            entry = root.find(f"{atom_ns}entry")
+            if entry is None:
+                return None
+            # findtext yields None when <id> is absent, so a malformed
+            # entry is treated as "no result" rather than as an error.
+            entry_id = entry.findtext(f"{atom_ns}id")
+            if not entry_id:
+                return None
+            return entry_id.strip().replace("http:", "https:")  # Force HTTPS
+        except Exception as e:
+            logger.error(f"arXiv API failed for '{title}': {str(e)}")
+        return None
+    def _get_semantic_scholar_url(self, title):
+        """Return a Semantic Scholar URL for the best match of ``title``.
+
+        Sends ``title`` as the query to the Semantic Scholar paper-search
+        endpoint, asking for a single result with the ``paperId``, ``url``
+        and ``title`` fields.
+
+        Returns:
+            The result's ``url`` when it is non-empty; otherwise a URL built
+            from its ``paperId`` when that is non-empty; otherwise ``None``.
+            ``None`` is also returned when the response has no results or
+            when the request fails (the failure is logged, not raised).
+        """
+        endpoint = "https://api.semanticscholar.org/graph/v1/paper/search"
+        query_params = {
+            "query": title,
+            "limit": 1,
+            "fields": "paperId,url,title"
+        }
+        try:
+            response = requests.get(endpoint, params=query_params, timeout=5)
+            response.raise_for_status()
+            payload = response.json()
+            # Nothing to return unless at least one result came back.
+            if "data" not in payload or len(payload["data"]) == 0:
+                return None
+            top_hit = payload["data"][0]
+            # The API-provided URL takes precedence.
+            direct_url = top_hit.get("url")
+            if direct_url:
+                return direct_url
+            # Otherwise fall back to a URL derived from the paper id.
+            paper_id = top_hit.get("paperId")
+            if paper_id:
+                return f"https://www.semanticscholar.org/paper/{paper_id}"
+        except Exception as e:
+            logger.error(f"Semantic Scholar API failed for '{title}': {str(e)}")
+        return None
 class SemanticSearch:
     def __init__(self):
         self.shard_dir = Path("compressed_shards")
                 self.logger.debug(f"Similarity stats: min={results['similarity'].min():.3f}, " +
                                  f"max={results['similarity'].max():.3f}, " +
                                  f"mean={results['similarity'].mean():.3f}")
+            results['source'] = results['title'].apply(
+                lambda title: self._format_source_links(
+                    self.metadata_mgr._resolve_paper_url(title)
+                )
+            )
             # Deduplicate and sort results
             pre_dedup = len(results)
         except Exception as e:
             self.logger.error(f"Result processing failed: {str(e)}", exc_info=True)
             return pd.DataFrame(columns=["title", "summary", "source", "similarity"])
+    def _format_source_links(self, links):
+        """Render the available source links as an HTML snippet.
+
+        Args:
+            links: Mapping of provider key to URL. Recognised keys are
+                ``"arxiv"``, ``"semantic"`` / ``"semantic_search"`` and
+                ``"google"``; other keys are ignored. ``None`` or an empty
+                mapping is accepted.
+
+        Returns:
+            Anchor tags for the providers that have a non-empty URL, in the
+            order arXiv, Semantic Scholar, Google Scholar, joined by
+            ``" | "``; an empty string when there is nothing to show.
+        """
+        from html import escape
+        if not links:
+            return ""
+        # (accepted keys, label) per provider, in display order.
+        # ``_resolve_paper_url`` stores the Semantic Scholar link under
+        # "semantic_search", so that key must be accepted alongside
+        # "semantic" or the link is never rendered.
+        providers = (
+            (("arxiv",), "📜 arXiv"),
+            (("semantic", "semantic_search"), "🌐 Semantic Scholar"),
+            (("google",), "🔍 Google Scholar"),
+        )
+        html_parts = []
+        for keys, label in providers:
+            url = next((links[k] for k in keys if links.get(k)), None)
+            if not url:
+                continue
+            # URLs come from external APIs; escape them so a quote or angle
+            # bracket cannot break out of the single-quoted href attribute.
+            html_parts.append(
+                f"<a class='source-link' href='{escape(str(url), quote=True)}' "
+                f"target='_blank' rel='noopener noreferrer'> {label}</a>"
+            )
+        return " | ".join(html_parts)