# Spaces:   (extraction artifact — hosting-UI residue, not code)
# Sleeping
# Sleeping
import requests
import time
import json
import sys
from pathlib import Path
from tqdm import tqdm
import pandas as pd
# Contact email appended as `mailto=` to every OpenAlex request; identifies
# the caller for OpenAlex's "polite pool" (better rate limits than anonymous).
MAIL_TO = "[email protected]"
def get_openalex_ids(dois, batch_size=50):
    """Retrieve the OpenAlex IDs for a list of DOIs.

    Queries the OpenAlex ``/works`` endpoint in batches, using a
    pipe-separated ``doi:`` filter.

    Parameters
    ----------
    dois : list[str]
        DOIs, bare ("10.x/y") or as full "https://doi.org/..." URLs.
    batch_size : int, optional
        DOIs per request, by default 50 (OpenAlex caps OR-filter values at 50).

    Returns
    -------
    dict
        Maps each bare DOI (without the "https://doi.org/" prefix) to its
        OpenAlex work ID. DOIs unknown to OpenAlex are simply absent; failed
        batches are reported on stdout and skipped.
    """
    results = {}
    for i in range(0, len(dois), batch_size):
        batch = dois[i:i + batch_size]
        pipe_separated_dois = "|".join(batch)
        url = (
            f"https://api.openalex.org/works?filter=doi:{pipe_separated_dois}"
            f"&per-page={batch_size}&select=id,doi&mailto={MAIL_TO}"
        )
        response = requests.get(url)
        time.sleep(0.1)  # Respect API rate limits
        if response.status_code == 200:
            for work in response.json().get("results", []):
                doi = work.get("doi")
                # Some works come back with a null DOI; skip them instead of
                # crashing on None.replace(...).
                if doi:
                    results[doi.replace("https://doi.org/", "")] = work.get("id")
        else:
            print(f"response failed with code: {response.status_code}")
    return results
def get_outgoing_citations(openalex_id):
    """Retrieve the list of outgoing citations for multiple articles given their OpenAlex IDs."""
    # Fields requested for each citing work; per-page is the OpenAlex maximum.
    fields = "id,doi,title,keywords,authorships,abstract_inverted_index,publication_year,primary_location,language"
    url = (
        f"https://api.openalex.org/works?filter=cited_by:{openalex_id}"
        f"&select={fields}"
        f"&per-page=200"
        f"&mailto={MAIL_TO}"
    )
    resp = requests.get(url)
    # Guard clause: report non-200 responses and fall back to an empty list.
    if resp.status_code != 200:
        print(f"response failed with code: {resp.status_code}")
        return []
    return resp.json().get("results", [])
def extract_citation_data(citing_articles):
    """Extracts relevant metadata from the citing articles.

    Parameters
    ----------
    citing_articles : list[dict]
        OpenAlex work records as returned by the API.

    Returns
    -------
    list[dict]
        One dict per article with keys: id, doi, title,
        authors (list of {"name", "id"}), abstract (inverted index),
        year, venue, language. Missing/null fields map to None.
    """
    citations = []
    for article in citing_articles:
        # OpenAlex returns JSON null (a present key with value None) for
        # primary_location, source, and author on some records, so
        # `.get(key, {})` is not enough — guard with `or {}` to avoid
        # AttributeError on None.
        source = (article.get("primary_location") or {}).get("source") or {}
        authors = []
        for authorship in article.get("authorships", []):
            author = authorship.get("author") or {}
            authors.append({"name": author.get("display_name"), "id": author.get("id")})
        citations.append({
            "id": article.get("id"),
            "doi": article.get("doi"),
            "title": article.get("title"),
            "authors": authors,
            "abstract": article.get("abstract_inverted_index"),
            "year": article.get("publication_year"),
            "venue": source.get("display_name"),
            "language": article.get("language"),
        })
    return citations
def fetch_citations_for_dois(doi_list):
    """Main function to fetch outgoing citations for a list of DOIs."""
    openalex_ids = get_openalex_ids(doi_list)
    print(len(openalex_ids))
    all_citations = {}
    for doi, oa_id in tqdm(openalex_ids.items()):
        citing = get_outgoing_citations(oa_id)
        all_citations[doi] = citing
        # per-page is capped at 200, so exactly 200 results likely means
        # the citation list was truncated — flag it.
        if len(citing) == 200:
            print(">= 200 citations:", doi, oa_id)
        time.sleep(0.1)  # Respect API rate limits
    return all_citations
def save_to_file(citations, fn):
    """Serialize *citations* to *fn* as JSON.

    Parameters
    ----------
    citations : object
        Any JSON-serializable structure (here: dict of DOI -> citation lists).
    fn : str
        Output file path.
    """
    # Explicit encoding so the output is identical across platforms/locales.
    with open(fn, "w", encoding="utf-8") as f:
        json.dump(citations, f)
if __name__ == "__main__":
    # Usage: script.py <input.parquet> <output.json>
    data = pd.read_parquet(sys.argv[1])
    doi_list = data["OriginalPaperDOI"]

    # Pick the markdown directories once (they depend only on which dataset
    # the input parquet is — this was previously re-evaluated per DOI).
    if "retraction" in sys.argv[1]:
        md_dirs = [
            Path("/mnt/data1/retraction_data/pdf_articles_unpaywall_md"),
            Path("/mnt/data1/retraction_data/pdf_articles_md"),
            Path("/mnt/data1/retraction_data/pdf_articles_manual_md"),
        ]
    elif "reference" in sys.argv[1]:
        md_dirs = [Path("/mnt/data1/retraction_data/pdf_articles_reference_md")]
    else:
        # Unknown dataset: warn once (not once per DOI) and keep nothing.
        md_dirs = []
        print("Can't find any markdown files for these DOI's.")

    # Keep only DOIs for which a converted markdown full text already exists.
    dois_w_fulltext = []
    for doi in doi_list:
        md_fn = doi.replace("https://doi.org/", "").replace("/", "|") + ".md"
        if any((md_dir / md_fn).exists() for md_dir in md_dirs):
            dois_w_fulltext.append(doi)

    # dois_w_fulltext = dois_w_fulltext[:101]
    print(f"Fetching outgoing citations for {len(dois_w_fulltext)} articles.")
    out_fn = sys.argv[2]
    citations_data = fetch_citations_for_dois(dois_w_fulltext)
    save_to_file(citations_data, out_fn)
    print(f"Citations data saved to {out_fn}")