dylanebert committed on
Commit · c3e313a
1 Parent(s): be9d670

app.py CHANGED
@@ -24,9 +24,7 @@ from typing import List, Dict, Any, Optional
 24    import gradio as gr
 25    import requests
 26    import feedparser
 27 -  import spacy
 28    from bs4 import BeautifulSoup
 29 -  from fuzzywuzzy import fuzz
 30    
 31    # Configure logging
 32    logging.basicConfig(
@@ -45,9 +43,6 @@ GITHUB_AUTH = os.environ.get("GITHUB_AUTH")
 45    if not HF_TOKEN:
 46        logger.warning("HF_TOKEN not found in environment variables")
 47    
 48 -  # Global spaCy model (loaded lazily)
 49 -  nlp = None
 50 -  
 51    
 52    # Utility functions
 53    def get_arxiv_id(paper_url: str) -> Optional[str]:
@@ -67,6 +62,60 @@ def extract_links_from_soup(soup, text):
 67        return html_links + markdown_links
 68    
 69    
 70    def create_row_data(input_data: str) -> Dict[str, Any]:
 71        """Create standardized row data structure from input."""
 72        row_data = {
@@ -112,10 +161,25 @@ def infer_paper_from_row(row_data: Dict[str, Any]) -> Optional[str]:
112        try:
113            url = urlparse(row_data["Paper"])
114            if url.scheme in ["http", "https"]:
115                if "arxiv.org/pdf/" in row_data["Paper"]:
116                    new_url = row_data["Paper"].replace("/pdf/", "/abs/").replace(".pdf", "")
117                    logger.info(f"Paper {new_url} inferred from {row_data['Paper']}")
118                    return new_url
119            return row_data["Paper"]
120        except Exception:
121            pass
@@ -246,7 +310,28 @@ def infer_code_from_row(row_data: Dict[str, Any]) -> Optional[str]:
246        except Exception:
247            pass
248    
249 -      # Try
250        if row_data.get("Paper") is not None and "arxiv.org" in row_data["Paper"] and GITHUB_AUTH:
251            try:
252                arxiv_id = get_arxiv_id(row_data["Paper"])
@@ -327,10 +412,29 @@ def infer_model_from_row(row_data: Dict[str, Any]) -> Optional[str]:
327    
328        if row_data.get("Paper") is not None:
329            arxiv_id = get_arxiv_id(row_data["Paper"])
330            if arxiv_id is not None and arxiv_id in known_model_mappings:
331                model_url = known_model_mappings[arxiv_id]
332                logger.info(f"Model {model_url} inferred from Paper (known mapping)")
333                return model_url
334    
335        return None
336    
@@ -347,16 +451,57 @@ def infer_dataset_from_row(row_data: Dict[str, Any]) -> Optional[str]:
347    
348        if row_data.get("Paper") is not None:
349            arxiv_id = get_arxiv_id(row_data["Paper"])
350            if arxiv_id is not None and arxiv_id in known_dataset_mappings:
351                dataset_url = known_dataset_mappings[arxiv_id]
352                logger.info(f"Dataset {dataset_url} inferred from Paper (known mapping)")
353                return dataset_url
354    
355        return None
356    
357    
358    def infer_space_from_row(row_data: Dict[str, Any]) -> Optional[str]:
359        """Infer HuggingFace space from row data"""
360        if row_data.get("Model") is not None:
361            try:
362                model_id = row_data["Model"].split("huggingface.co/")[1]
@@ -393,36 +538,6 @@ def infer_license_from_row(row_data: Dict[str, Any]) -> Optional[str]:
393        return None
394    
395    
396 -  def infer_orgs_from_row(row_data: Dict[str, Any]) -> List[str]:
397 -      """Infer organizations from row data"""
398 -      global nlp
399 -      if nlp is None:
400 -          try:
401 -              nlp = spacy.load("en_core_web_sm")
402 -          except OSError as e:
403 -              logger.warning(f"Could not load spaCy model 'en_core_web_sm': {e}")
404 -              return row_data.get("Orgs", [])
405 -  
406 -      orgs_input = row_data.get("Orgs", [])
407 -      if not orgs_input or not isinstance(orgs_input, list):
408 -          return []
409 -  
410 -      orgs = []
411 -      for org in orgs_input:
412 -          if not org or not isinstance(org, str):
413 -              continue
414 -          doc = nlp(org)
415 -          for ent in doc.ents:
416 -              if ent.label_ == "ORG":
417 -                  if ent.text == org and ent.text not in orgs:
418 -                      orgs.append(ent.text)
419 -                      break
420 -                  if fuzz.ratio(ent.text, org) > 80 and ent.text not in orgs:
421 -                      orgs.append(ent.text)
422 -                      logger.info(f"Org {ent.text} inferred from {org}")
423 -                      break
424 -  
425 -      return orgs
426    
427    
428    def infer_field_type(value: str) -> str:
@@ -575,27 +690,6 @@ def classify_research_url(input_data: str) -> str:
575        return "Unknown"
576    
577    
578 -  def infer_organizations(input_data: str) -> List[str]:
579 -      """
580 -      Infer affiliated organizations from research paper or project information.
581 -  
582 -      Args:
583 -          input_data (str): A URL, paper title, or other research-related input
584 -  
585 -      Returns:
586 -          List[str]: A list of organization names, or empty list if no organizations found
587 -      """
588 -      if not input_data or not input_data.strip():
589 -          return []
590 -  
591 -      try:
592 -          row_data = create_row_data(input_data.strip())
593 -          orgs = infer_orgs_from_row(row_data)
594 -          return orgs if isinstance(orgs, list) else []
595 -  
596 -      except Exception as e:
597 -          logger.error(f"Error inferring organizations: {e}")
598 -          return []
599    
600    
601    def infer_publication_date(input_data: str) -> str:
@@ -734,7 +828,6 @@ def find_research_relationships(input_data: str) -> Dict[str, Any]:
734            "code": None,
735            "name": None,
736            "authors": [],
737 -          "organizations": [],
738            "date": None,
739            "model": None,
740            "dataset": None,
@@ -742,7 +835,7 @@ def find_research_relationships(input_data: str) -> Dict[str, Any]:
742            "license": None,
743            "field_type": None,
744            "success_count": 0,
745 -          "total_inferences":
746        }
747    
748        inferences = [
@@ -750,7 +843,6 @@ def find_research_relationships(input_data: str) -> Dict[str, Any]:
750            ("code", infer_code_repository),
751            ("name", infer_research_name),
752            ("authors", infer_authors),
753 -          ("organizations", infer_organizations),
754            ("date", infer_publication_date),
755            ("model", infer_model),
756            ("dataset", infer_dataset),
@@ -783,41 +875,109 @@ def find_research_relationships(input_data: str) -> Dict[str, Any]:
783            return {"error": str(e), "success_count": 0, "total_inferences": 0}
784    
785    
786 -  
787    with gr.Blocks(title="Research Tracker MCP Server") as demo:
788 -      gr.Markdown("# Research Tracker
789        gr.Markdown("""
790 -  
791 -  
792 -      **
793 -      -
794 -      -
795 -      - `infer_code_repository` - Discover code repository links
796 -      - `infer_research_name` - Extract research project names
797 -      - `classify_research_url` - Classify URL types (paper/code/model/etc.)
798 -      - `infer_organizations` - Identify affiliated organizations
799 -      - `infer_publication_date` - Extract publication dates
800 -      - `infer_model` - Find associated HuggingFace models
801 -      - `infer_dataset` - Find associated HuggingFace datasets
802 -      - `infer_space` - Find associated HuggingFace spaces
803 -      - `infer_license` - Extract license information
804 -      - `find_research_relationships` - Comprehensive research ecosystem analysis
805 -  
806 -      **Input Support:**
807 -      - arXiv paper URLs (https://arxiv.org/abs/...)
808        - GitHub repository URLs (https://github.com/...)
809        - HuggingFace model/dataset/space URLs
810        - Research paper titles and project names
811        - Project page URLs
812        """)
813    
814        # Expose all core functions as MCP tools
815        gr.api(infer_authors)
816        gr.api(infer_paper_url)
817        gr.api(infer_code_repository)
818        gr.api(infer_research_name)
819        gr.api(classify_research_url)
820 -      gr.api(infer_organizations)
821        gr.api(infer_publication_date)
822        gr.api(infer_model)
823        gr.api(infer_dataset)
@@ -828,4 +988,4 @@ with gr.Blocks(title="Research Tracker MCP Server") as demo:
828    
829    if __name__ == "__main__":
830        logger.info("Starting Research Tracker MCP Server")
831 -      demo.launch(mcp_server=True, share=False)
 24    import gradio as gr
 25    import requests
 26    import feedparser
 27    from bs4 import BeautifulSoup
 28    
 29    # Configure logging
 30    logging.basicConfig(
 43    if not HF_TOKEN:
 44        logger.warning("HF_TOKEN not found in environment variables")
 45    
 46    
 47    # Utility functions
 48    def get_arxiv_id(paper_url: str) -> Optional[str]:
 62        return html_links + markdown_links
 63    
 64    
 65 +  def scrape_huggingface_paper_page(paper_url: str) -> Dict[str, Any]:
 66 +      """
 67 +      Scrape HuggingFace paper page to find associated resources
 68 +  
 69 +      Returns:
 70 +          Dict containing found resources: {
 71 +              "models": [], "datasets": [], "spaces": [], "code": []
 72 +          }
 73 +      """
 74 +      resources = {"models": [], "datasets": [], "spaces": [], "code": []}
 75 +  
 76 +      if not paper_url or "huggingface.co/papers" not in paper_url:
 77 +          return resources
 78 +  
 79 +      try:
 80 +          r = requests.get(paper_url, timeout=REQUEST_TIMEOUT)
 81 +          if r.status_code != 200:
 82 +              return resources
 83 +  
 84 +          soup = BeautifulSoup(r.text, "html.parser")
 85 +  
 86 +          # Find all links on the page
 87 +          links = []
 88 +          for link in soup.find_all("a"):
 89 +              href = link.get("href")
 90 +              if href:
 91 +                  # Convert relative URLs to absolute
 92 +                  if href.startswith("/"):
 93 +                      href = "https://huggingface.co" + href
 94 +                  elif href.startswith("huggingface.co"):
 95 +                      href = "https://" + href
 96 +                  links.append(href)
 97 +  
 98 +          # Categorize links
 99 +          for link in links:
100 +              if "huggingface.co/" in link:
101 +                  if "/models/" in link and link not in resources["models"]:
102 +                      resources["models"].append(link)
103 +                  elif "/datasets/" in link and link not in resources["datasets"]:
104 +                      resources["datasets"].append(link)
105 +                  elif "/spaces/" in link and link not in resources["spaces"]:
106 +                      resources["spaces"].append(link)
107 +              elif "github.com" in link and link not in resources["code"]:
108 +                  resources["code"].append(link)
109 +  
110 +          logger.info(f"Found {len(resources['models'])} models, {len(resources['datasets'])} datasets, "
111 +                      f"{len(resources['spaces'])} spaces, {len(resources['code'])} code repos from HF paper page")
112 +  
113 +      except Exception as e:
114 +          logger.warning(f"Failed to scrape HuggingFace paper page {paper_url}: {e}")
115 +  
116 +      return resources
117 +  
118 +  
119    def create_row_data(input_data: str) -> Dict[str, Any]:
120        """Create standardized row data structure from input."""
121        row_data = {
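For reference, a minimal sketch (not part of the commit) of how the new scraper can be exercised on its own. It assumes the module is app.py, so REQUEST_TIMEOUT and logger from that module are in scope, and it reuses the paper ID that the UI below uses as its placeholder:

# Sketch only: exercise scrape_huggingface_paper_page() directly.
from app import scrape_huggingface_paper_page  # assumption: this module is app.py

resources = scrape_huggingface_paper_page("https://huggingface.co/papers/2506.18787")
for kind, links in resources.items():
    print(kind, len(links))   # e.g. "models 2", "code 1"
    for url in links[:3]:
        print("  ", url)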
161        try:
162            url = urlparse(row_data["Paper"])
163            if url.scheme in ["http", "https"]:
164 +              # Convert arXiv PDF to abs format
165                if "arxiv.org/pdf/" in row_data["Paper"]:
166                    new_url = row_data["Paper"].replace("/pdf/", "/abs/").replace(".pdf", "")
167                    logger.info(f"Paper {new_url} inferred from {row_data['Paper']}")
168                    return new_url
169 +  
170 +              # If this is an arXiv URL, try HuggingFace papers first for better resource discovery
171 +              if "arxiv.org/abs/" in row_data["Paper"]:
172 +                  arxiv_id = row_data["Paper"].split("arxiv.org/abs/")[1]
173 +                  hf_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
174 +                  try:
175 +                      # Test if HuggingFace paper page exists and has content
176 +                      r = requests.get(hf_paper_url, timeout=10)
177 +                      if r.status_code == 200 and len(r.text) > 1000:  # Basic check for content
178 +                          logger.info(f"Paper {hf_paper_url} inferred from arXiv (HuggingFace preferred)")
179 +                          return hf_paper_url
180 +                  except Exception:
181 +                      pass  # Fall back to original arXiv URL
182 +  
183            return row_data["Paper"]
184        except Exception:
185            pass
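The rewriting above is purely string-based; as a rough worked example (assuming the corresponding HuggingFace paper page exists and returns enough content to pass the length check):

# https://arxiv.org/pdf/2506.18787.pdf -> https://arxiv.org/abs/2506.18787           (PDF rewritten to abs)
# https://arxiv.org/abs/2506.18787     -> https://huggingface.co/papers/2506.18787   (preferred when the page resolves)
# any other http(s) URL                -> returned unchanged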
310        except Exception:
311            pass
312    
313 +      # Try scraping HuggingFace paper page for code links
314 +      if row_data.get("Paper") is not None:
315 +          arxiv_id = get_arxiv_id(row_data["Paper"])
316 +  
317 +          # Try scraping HuggingFace paper page
318 +          if "huggingface.co/papers" in row_data["Paper"]:
319 +              resources = scrape_huggingface_paper_page(row_data["Paper"])
320 +              if resources["code"]:
321 +                  code_url = resources["code"][0]  # Take first code repo found
322 +                  logger.info(f"Code {code_url} inferred from HuggingFace paper page")
323 +                  return code_url
324 +  
325 +          # If we have arXiv URL, try the HuggingFace version first
326 +          elif "arxiv.org/abs/" in row_data["Paper"] and arxiv_id:
327 +              hf_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
328 +              resources = scrape_huggingface_paper_page(hf_paper_url)
329 +              if resources["code"]:
330 +                  code_url = resources["code"][0]
331 +                  logger.info(f"Code {code_url} inferred from HuggingFace paper page (via arXiv)")
332 +                  return code_url
333 +  
334 +      # Fallback: Try GitHub search for papers
335        if row_data.get("Paper") is not None and "arxiv.org" in row_data["Paper"] and GITHUB_AUTH:
336            try:
337                arxiv_id = get_arxiv_id(row_data["Paper"])
412    
413        if row_data.get("Paper") is not None:
414            arxiv_id = get_arxiv_id(row_data["Paper"])
415 +  
416 +          # First check known mappings
417            if arxiv_id is not None and arxiv_id in known_model_mappings:
418                model_url = known_model_mappings[arxiv_id]
419                logger.info(f"Model {model_url} inferred from Paper (known mapping)")
420                return model_url
421 +  
422 +          # Try scraping HuggingFace paper page
423 +          if "huggingface.co/papers" in row_data["Paper"]:
424 +              resources = scrape_huggingface_paper_page(row_data["Paper"])
425 +              if resources["models"]:
426 +                  model_url = resources["models"][0]  # Take first model found
427 +                  logger.info(f"Model {model_url} inferred from HuggingFace paper page")
428 +                  return model_url
429 +  
430 +          # If we have arXiv URL, try the HuggingFace version
431 +          elif "arxiv.org/abs/" in row_data["Paper"] and arxiv_id:
432 +              hf_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
433 +              resources = scrape_huggingface_paper_page(hf_paper_url)
434 +              if resources["models"]:
435 +                  model_url = resources["models"][0]
436 +                  logger.info(f"Model {model_url} inferred from HuggingFace paper page (via arXiv)")
437 +                  return model_url
438    
439        return None
440    
451    
452        if row_data.get("Paper") is not None:
453            arxiv_id = get_arxiv_id(row_data["Paper"])
454 +  
455 +          # First check known mappings
456            if arxiv_id is not None and arxiv_id in known_dataset_mappings:
457                dataset_url = known_dataset_mappings[arxiv_id]
458                logger.info(f"Dataset {dataset_url} inferred from Paper (known mapping)")
459                return dataset_url
460 +  
461 +          # Try scraping HuggingFace paper page
462 +          if "huggingface.co/papers" in row_data["Paper"]:
463 +              resources = scrape_huggingface_paper_page(row_data["Paper"])
464 +              if resources["datasets"]:
465 +                  dataset_url = resources["datasets"][0]  # Take first dataset found
466 +                  logger.info(f"Dataset {dataset_url} inferred from HuggingFace paper page")
467 +                  return dataset_url
468 +  
469 +          # If we have arXiv URL, try the HuggingFace version
470 +          elif "arxiv.org/abs/" in row_data["Paper"] and arxiv_id:
471 +              hf_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
472 +              resources = scrape_huggingface_paper_page(hf_paper_url)
473 +              if resources["datasets"]:
474 +                  dataset_url = resources["datasets"][0]
475 +                  logger.info(f"Dataset {dataset_url} inferred from HuggingFace paper page (via arXiv)")
476 +                  return dataset_url
477    
478        return None
479    
480    
481    def infer_space_from_row(row_data: Dict[str, Any]) -> Optional[str]:
482        """Infer HuggingFace space from row data"""
483 +      # Try scraping HuggingFace paper page first (most reliable)
484 +      if row_data.get("Paper") is not None:
485 +          arxiv_id = get_arxiv_id(row_data["Paper"])
486 +  
487 +          # Try scraping HuggingFace paper page
488 +          if "huggingface.co/papers" in row_data["Paper"]:
489 +              resources = scrape_huggingface_paper_page(row_data["Paper"])
490 +              if resources["spaces"]:
491 +                  space_url = resources["spaces"][0]  # Take first space found
492 +                  logger.info(f"Space {space_url} inferred from HuggingFace paper page")
493 +                  return space_url
494 +  
495 +          # If we have arXiv URL, try the HuggingFace version
496 +          elif "arxiv.org/abs/" in row_data["Paper"] and arxiv_id:
497 +              hf_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
498 +              resources = scrape_huggingface_paper_page(hf_paper_url)
499 +              if resources["spaces"]:
500 +                  space_url = resources["spaces"][0]
501 +                  logger.info(f"Space {space_url} inferred from HuggingFace paper page (via arXiv)")
502 +                  return space_url
503 +  
504 +      # Fallback: try to infer from model
505        if row_data.get("Model") is not None:
506            try:
507                model_id = row_data["Model"].split("huggingface.co/")[1]
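infer_code_from_row, infer_model_from_row, infer_dataset_from_row, and infer_space_from_row now share the same lookup order: a known mapping where one exists, then the HuggingFace paper page itself, then the page derived from the arXiv ID. A hedged sketch of that shared step, using a hypothetical helper name that does not appear in the commit (the commit keeps this logic inline in each function):

def _first_resource_from_hf_paper(paper_url, arxiv_id, key):
    # Hypothetical refactor sketch of the repeated lookup; key is "code", "models", "datasets", or "spaces".
    if "huggingface.co/papers" in paper_url:
        resources = scrape_huggingface_paper_page(paper_url)
    elif "arxiv.org/abs/" in paper_url and arxiv_id:
        resources = scrape_huggingface_paper_page(f"https://huggingface.co/papers/{arxiv_id}")
    else:
        return None
    return resources[key][0] if resources[key] else None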
538        return None
539    
540    
541    
542    
543    def infer_field_type(value: str) -> str:
690        return "Unknown"
691    
692    
693    
694    
695    def infer_publication_date(input_data: str) -> str:
828            "code": None,
829            "name": None,
830            "authors": [],
831            "date": None,
832            "model": None,
833            "dataset": None,
835            "license": None,
836            "field_type": None,
837            "success_count": 0,
838 +          "total_inferences": 10
839        }
840    
841        inferences = [
843            ("code", infer_code_repository),
844            ("name", infer_research_name),
845            ("authors", infer_authors),
846            ("date", infer_publication_date),
847            ("model", infer_model),
848            ("dataset", infer_dataset),
875            return {"error": str(e), "success_count": 0, "total_inferences": 0}
876    
877    
878 +  def format_list_output(items):
879 +      """Format list items for display"""
880 +      if not items or not isinstance(items, list):
881 +          return "None"
882 +      return "\n".join([f"• {item}" for item in items])
883 +  
884 +  def process_research_relationships(input_data):
885 +      """Process research input and return formatted results"""
886 +      if not input_data or not input_data.strip():
887 +          return "Please enter a valid URL or research name", "", "", "", "", "", "", "", "", ""
888 +  
889 +      try:
890 +          result = find_research_relationships(input_data.strip())
891 +  
892 +          # Extract individual fields with fallback to empty string
893 +          paper = result.get("paper", "") or ""
894 +          code = result.get("code", "") or ""
895 +          name = result.get("name", "") or ""
896 +          authors = format_list_output(result.get("authors", []))
897 +          date = result.get("date", "") or ""
898 +          model = result.get("model", "") or ""
899 +          dataset = result.get("dataset", "") or ""
900 +          space = result.get("space", "") or ""
901 +          license_info = result.get("license", "") or ""
902 +          field_type = result.get("field_type", "") or ""
903 +  
904 +          return paper, code, name, authors, date, model, dataset, space, license_info, field_type
905 +  
906 +      except Exception as e:
907 +          error_msg = f"Error processing input: {str(e)}"
908 +          return error_msg, "", "", "", "", "", "", "", "", ""
909 +  
910 +  # Create Gradio interface with both UI and MCP tool exposure
911    with gr.Blocks(title="Research Tracker MCP Server") as demo:
912 +      gr.Markdown("# Research Tracker - Find Research Relationships")
913        gr.Markdown("""
914 +      Enter a research paper URL, GitHub repository, or research name to discover all related resources across platforms.
915 +  
916 +      **Supported inputs:**
917 +      - arXiv paper URLs (https://arxiv.org/abs/...) - automatically checks HuggingFace papers first
918 +      - HuggingFace paper URLs (https://huggingface.co/papers/...) - preferred for better resource discovery
919        - GitHub repository URLs (https://github.com/...)
920        - HuggingFace model/dataset/space URLs
921        - Research paper titles and project names
922        - Project page URLs
923        """)
924    
925 +      with gr.Row():
926 +          with gr.Column():
927 +              input_text = gr.Textbox(
928 +                  label="Paper URL, Repository URL, or Research Name",
929 +                  placeholder="https://arxiv.org/abs/2506.18787",
930 +                  lines=2
931 +              )
932 +              submit_btn = gr.Button("Find Research Relationships", variant="primary")
933 +  
934 +      gr.Markdown("## Research Relationships")
935 +  
936 +      with gr.Row():
937 +          with gr.Column():
938 +              paper_output = gr.Textbox(label="Paper URL", interactive=False)
939 +              code_output = gr.Textbox(label="Code Repository", interactive=False)
940 +              name_output = gr.Textbox(label="Research Name", interactive=False)
941 +              authors_output = gr.Textbox(label="Authors", lines=3, interactive=False)
942 +  
943 +          with gr.Column():
944 +              date_output = gr.Textbox(label="Publication Date", interactive=False)
945 +              model_output = gr.Textbox(label="HuggingFace Model", interactive=False)
946 +              dataset_output = gr.Textbox(label="HuggingFace Dataset", interactive=False)
947 +  
948 +          with gr.Column():
949 +              space_output = gr.Textbox(label="HuggingFace Space", interactive=False)
950 +              license_output = gr.Textbox(label="License", interactive=False)
951 +              field_type_output = gr.Textbox(label="Field Type", interactive=False)
952 +  
953 +      # Connect the interface
954 +      submit_btn.click(
955 +          fn=process_research_relationships,
956 +          inputs=[input_text],
957 +          outputs=[
958 +              paper_output, code_output, name_output, authors_output,
959 +              date_output, model_output, dataset_output,
960 +              space_output, license_output, field_type_output
961 +          ]
962 +      )
963 +  
964 +      # Also trigger on Enter key
965 +      input_text.submit(
966 +          fn=process_research_relationships,
967 +          inputs=[input_text],
968 +          outputs=[
969 +              paper_output, code_output, name_output, authors_output,
970 +              date_output, model_output, dataset_output,
971 +              space_output, license_output, field_type_output
972 +          ]
973 +      )
974 +  
975        # Expose all core functions as MCP tools
976        gr.api(infer_authors)
977        gr.api(infer_paper_url)
978        gr.api(infer_code_repository)
979        gr.api(infer_research_name)
980        gr.api(classify_research_url)
981        gr.api(infer_publication_date)
982        gr.api(infer_model)
983        gr.api(infer_dataset)
988    
989    if __name__ == "__main__":
990        logger.info("Starting Research Tracker MCP Server")
991 +      demo.launch(mcp_server=True, share=False)
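A small end-to-end sketch (not part of the commit) of what the UI handler ultimately calls; the field names follow the result dict built in find_research_relationships above, and the import assumes this module is app.py:

# Sketch only: call the aggregate inference directly instead of going through the Gradio UI.
from app import find_research_relationships  # assumption: this module is app.py

result = find_research_relationships("https://arxiv.org/abs/2506.18787")
print(f"{result['success_count']} of {result['total_inferences']} fields inferred")
for key in ("paper", "code", "model", "dataset", "space", "license"):
    print(key, "->", result.get(key))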