# Source: citation-integrity / extract_citations.py
# Last commit: "add citation lookup function" by Christof Bless (64d8f8c, unverified)
# Repository page residue: raw | history | blame | 4.26 kB
import requests
import time
import json
import sys
from pathlib import Path
from tqdm import tqdm
import pandas as pd
MAIL_TO = "[email protected]"
def get_openalex_ids(dois, batch_size=50):
    """Retrieve the OpenAlex IDs for a list of DOIs.

    The DOIs are sent to the OpenAlex ``/works`` endpoint in batches of
    ``batch_size``, joined with ``|`` inside a single ``doi:`` filter.

    Args:
        dois: Sequence of DOI strings (must support ``len`` and slicing).
        batch_size: Number of DOIs per request; also used as ``per-page``.

    Returns:
        Dict mapping each DOI returned by the API (with the
        ``https://doi.org/`` prefix removed) to its OpenAlex ID. DOIs that
        belong to a failed batch, or that the API does not return, are
        absent from the result.
    """
    results = {}
    for start in range(0, len(dois), batch_size):
        batch = dois[start:start + batch_size]
        pipe_separated_dois = "|".join(batch)
        url = (
            f"https://api.openalex.org/works?filter=doi:{pipe_separated_dois}"
            f"&per-page={batch_size}&select=id,doi&mailto={MAIL_TO}"
        )
        try:
            # A timeout keeps one stalled connection from hanging the run.
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            # Treat network errors like a bad status: log and move on to
            # the next batch instead of losing everything collected so far.
            print(f"request failed: {exc}")
            time.sleep(0.1)
            continue
        time.sleep(0.1)  # Respect API rate limits
        if response.status_code != 200:
            print(f"response failed with code: {response.status_code}")
            continue
        for work in response.json().get("results", []):
            doi = work.get("doi")
            if not doi:
                # A work without a DOI cannot be keyed; skip it rather than
                # crash on None.replace(...).
                continue
            results[doi.replace("https://doi.org/", "")] = work.get("id")
    return results
def get_outgoing_citations(openalex_id):
    """Retrieve the outgoing citations of one article given its OpenAlex ID.

    Queries the OpenAlex ``/works`` endpoint with a ``cited_by`` filter and
    follows cursor pagination so that reference lists longer than one page
    (200 entries) are returned in full rather than silently truncated.

    Args:
        openalex_id: OpenAlex ID of the citing article.

    Returns:
        List of work records (dicts) as returned by the API. If a request
        fails, the failure is printed and the records collected up to that
        point are returned (an empty list when the first page fails).
    """
    url = (
        f"https://api.openalex.org/works?filter=cited_by:{openalex_id}"
        f"&select=id,doi,title,keywords,authorships,abstract_inverted_index,publication_year,primary_location,language"
        f"&per-page=200"
        f"&mailto={MAIL_TO}"
    )
    works = []
    cursor = "*"  # "*" requests the first page of a cursor-paged query
    while cursor:
        try:
            # Passing the cursor via params lets requests URL-encode it.
            response = requests.get(url, params={"cursor": cursor}, timeout=30)
        except requests.RequestException as exc:
            print(f"request failed: {exc}")
            break
        if response.status_code != 200:
            print(f"response failed with code: {response.status_code}")
            break
        payload = response.json()
        page = payload.get("results", [])
        if not page:
            break
        works.extend(page)
        # next_cursor is null once the last page has been served.
        cursor = (payload.get("meta") or {}).get("next_cursor")
        if cursor:
            time.sleep(0.1)  # Respect API rate limits between pages
    return works
def extract_citation_data(citing_articles):
    """Extract relevant metadata from a list of OpenAlex work records.

    Args:
        citing_articles: Iterable of work dicts as returned by the API.

    Returns:
        List of dicts with the keys ``id``, ``doi``, ``title``, ``authors``
        (list of ``{"name", "id"}`` dicts), ``abstract`` (the raw
        ``abstract_inverted_index`` value), ``year``, ``venue`` and
        ``language``. Fields missing from a record are ``None``.
    """
    citations = []
    for article in citing_articles:
        authors = []
        # ``or`` fallbacks are used instead of ``.get(key, default)`` because
        # the default only applies to a missing key, not to an explicit
        # null value, which would otherwise raise on the chained ``.get``.
        for authorship in article.get("authorships") or []:
            author = authorship.get("author") or {}
            authors.append({
                "name": author.get("display_name"),
                "id": author.get("id"),
            })
        location = article.get("primary_location") or {}
        source = location.get("source") or {}
        citations.append({
            "id": article.get("id"),
            "doi": article.get("doi"),
            "title": article.get("title"),
            "authors": authors,
            "abstract": article.get("abstract_inverted_index"),
            "year": article.get("publication_year"),
            "venue": source.get("display_name"),
            "language": article.get("language"),
        })
    return citations
def fetch_citations_for_dois(doi_list):
    """Fetch the outgoing citations for every DOI in ``doi_list``.

    The DOIs are first resolved to OpenAlex IDs; the number of resolved IDs
    is printed. Each resolved article is then queried in turn, with a short
    pause after every lookup.

    Returns:
        Dict mapping each resolved DOI to its list of citation records.
    """
    id_by_doi = get_openalex_ids(doi_list)
    print(len(id_by_doi))

    collected = {}
    for doi, oa_id in tqdm(id_by_doi.items()):
        references = get_outgoing_citations(oa_id)
        collected[doi] = references
        # A result of exactly 200 entries is reported, since it may have
        # been capped by the API page size.
        if len(references) == 200:
            print(">= 200 citations:", doi, oa_id)
        time.sleep(0.1)  # Respect API rate limits
    return collected
def save_to_file(citations, fn):
    """Serialize ``citations`` as JSON into the file at path ``fn``.

    The file is opened in text write mode, so existing content is replaced.
    """
    with open(fn, mode="w") as out_handle:
        json.dump(citations, out_handle)
if __name__ == "__main__":
    # Usage: <script> <input.parquet> <output.json>
    # The input file name selects which markdown directories are searched:
    # it must contain "retraction" or "reference".
    in_fn = sys.argv[1]
    out_fn = sys.argv[2]

    data = pd.read_parquet(in_fn)
    doi_list = data["OriginalPaperDOI"]

    # The dataset kind depends only on the input file name, so the
    # directories are resolved once instead of once per DOI.
    if "retraction" in in_fn:
        md_dirs = [
            Path("/mnt/data1/retraction_data/pdf_articles_unpaywall_md"),
            Path("/mnt/data1/retraction_data/pdf_articles_md"),
            Path("/mnt/data1/retraction_data/pdf_articles_manual_md"),
        ]
    elif "reference" in in_fn:
        md_dirs = [Path("/mnt/data1/retraction_data/pdf_articles_reference_md")]
    else:
        md_dirs = []
        # Reported once rather than once per DOI.
        print("Can't find any markdown files for these DOI's.")

    # Keep only the DOIs for which a converted full-text markdown file exists.
    dois_w_fulltext = []
    for doi in doi_list:
        # File names use the bare DOI with "/" replaced by "|".
        md_fn = doi.replace("https://doi.org/", "").replace("/", "|") + ".md"
        if any((md_dir / md_fn).exists() for md_dir in md_dirs):
            dois_w_fulltext.append(doi)

    print(f"Fetching outgoing citations for {len(dois_w_fulltext)} articles.")
    citations_data = fetch_citations_for_dois(dois_w_fulltext)
    save_to_file(citations_data, out_fn)
    print(f"Citations data saved to {out_fn}")