# citation-integrity / extract_mentions.py
import json
import re
import sys
from pathlib import Path
from typing import NamedTuple

import pandas as pd
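# Spelled-out Greek letter names. Before fuzzy matching they are replaced with a
# non-alphanumeric placeholder so that a title written with the word ("alpha")
# and a reference written with the symbol ("α") normalize to the same "_" token.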
TITLE_NORMALIZE = [
"alpha", "beta", "gamma", "delta", "epsilon", "kappa", "lambda"
]
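# Text immediately to the left and right of a citation mention.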
class Context(NamedTuple):
left: str
right: str
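# Boundaries at which a mention's context is cut off: a Markdown heading or a
# neighbouring bracketed citation. split_left_pattern is the mirrored version,
# applied to the reversed left-hand context. Note that atomic groups "(?>...)"
# require Python 3.11 or newer.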
split_right_pattern = re.compile(r"(?:#+)|(?:\[(?>[^A-Za-z0-9\[\]\.]{0,4}\d{1,3}[^A-Za-z0-9\[\]\.]{0,4})+?\])")
split_left_pattern = re.compile(r"(?:#+)|(?:\](?>[^A-Za-z0-9\[\]\.]{0,4}\d{1,3}[^A-Za-z0-9\[\]\.]{0,4})+?\[)")
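# One or more adjacent numeric bracket citations, e.g. "[3], [5-7]".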
ieee_style_pattern = re.compile(r"(?>\[(?>[^A-Za-z0-9\[\]\.]*(\d{1,3})[^A-Za-z0-9\[\]\.]*)+\][^A-Za-z0-9\[\]]*)+")
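# One or more adjacent author-year citations, e.g. "(Smith, 2019; Lee, 2021)".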
auth_year_style_pattern = re.compile(r"(?>\((?>[^()]+?[,\s][1-2][0-9]{3})+\)[^()A-Za-z0-9]*)+")
def filter_page_breaks(content):
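    """Replace page-break blocks (a '-----' rule, the surrounding blank lines,
    and the header/footer line) in the extracted markdown with a single space."""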
find_page_breaks = re.compile(
r"""
\n*
\n # empty line
-----\n # 5 dashes
\n # empty line
        (?:.*?\n)? # the running header/footer line (matched so it is removed)
\n*
""",
re.VERBOSE | re.M
)
return re.sub(find_page_breaks, " ", content)
def get_author_title_year_patterns_from_citation(cite):
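    """Build loose regex fragments (first author's last name, normalized title,
    publication year) from a citation record with OpenAlex-style metadata
    fields ('title', 'publication_year', 'authorships')."""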
title = cite['title']
for w in TITLE_NORMALIZE:
title = title.replace(w, "$")
    title = re.sub(r"[^a-zA-Z0-9]+", "_", title)  # collapse every non-alphanumeric run to "_"
year = str(cite['publication_year'])
try:
first_author = cite['authorships'][0]['author']['display_name']
## only lastname
first_author = re.sub(r"[^a-zA-Z0-9]+", "_", first_author.split(" ")[-1])
    except (IndexError, TypeError):
first_author = None
return first_author, title, year
def extract_potential_citations(paper):
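    """Locate citation-like spans in the paper text.

    Returns two lists of (start, end, matched_text, possibilities) buckets:
    IEEE-style buckets carry the set of reference numbers a bracket group may
    point to; author-year buckets carry the author/year chunks found inside
    the parentheses."""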
ieee_style = ieee_style_pattern.finditer(paper)
ieee_style_buckets = []
for match in ieee_style:
        possible = set(int(n) for n in re.findall(r"\d{1,3}", match.group(0)))
        ## group(0) spans the whole bracket group; expand ranges such as "[3-7]"
        ranges = re.findall(r"(\d{1,3})[–—-]+(\d{1,3})", match.group(0))
if len(ranges)>0:
for start, end in ranges:
possible |= set(range(int(start),int(end)+1))
ieee_style_buckets.append((match.start(), match.end(), match.group(0), possible))
auth_year_style = auth_year_style_pattern.finditer(paper)
auth_year_style_buckets = []
for match in auth_year_style:
        ## pair each author fragment with the year that follows it, so a single
        ## chunk contains both the surname and the publication year
        parts = re.split(r"(\b[1-2]\d{3}\b)\W*", match.group(0))
        possible = set("".join(pair) for pair in zip(parts[0::2], parts[1::2]))
auth_year_style_buckets.append((match.start(), match.end(), match.group(0), possible))
return ieee_style_buckets, auth_year_style_buckets
def find_reference_in_reference_section(paper, cite, references):
"""
Searches for reference section entry matching citation paper title, year, first author, and journal in a markdown file
using fuzzy matching.
"""
patterns = get_author_title_year_patterns_from_citation(cite)
    if any(p is None for p in patterns):
        return paper, None
    author, title, year = patterns
    # Look for author, title, and year together inside a single reference entry.
    for full_ref, enum, ref_body in references:
        normalized = ref_body
        for w in TITLE_NORMALIZE:
            normalized = normalized.replace(w, "$")
        fuzzy_ref = re.sub(r"[^a-zA-Z0-9]+", "_", normalized)
        if all(re.search(pattern, fuzzy_ref, re.IGNORECASE | re.MULTILINE | re.DOTALL) for pattern in patterns):
match = (cite["id"], author, title, year, enum, ref_body)
# remove the reference from the paper so it can't be matched again
paper = paper.replace(full_ref, "")
return paper, match
return paper, (cite["id"], author, title, year, None, None)
def find_mentions_by_pointer(doi, ref, paper, ieee_possible):
"""
Match the links mentioning that reference in the text and extract context.
"""
mentions = []
(oa_id, _, _, _, ref_num, r) = ref
for start, end, match, possible_numbers in ieee_possible:
if int(ref_num) in possible_numbers:
context = create_context(start, end, paper)
mentions.append((doi, oa_id, ref_num, r, start, end, context.left, match, context.right))
return mentions
def find_mentions_direct(doi, ref, paper, auth_style_possible):
"""
Match the links mentioning that reference in the text and extract context.
"""
mentions = []
(oa_id, a, _, y, _, _) = ref
for start, end, match, possibilities in auth_style_possible:
for possibility in possibilities:
if y in possibility and a in possibility:
context = create_context(start, end, paper)
mentions.append((doi, oa_id, None, None, start, end, context.left, match, context.right))
return mentions
def create_context(start, end, paper):
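    """Take up to 500 characters on each side of a mention, cut off at the
    nearest heading or neighbouring citation."""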
left = paper[max(0, start - 500):start]
right = paper[end:end + min(len(paper) - end, 500)]
## only take context until a next section begins or another citation appears
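    ## the left side is searched on the reversed string so the boundary nearest
    ## the mention is found first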
splitleft = split_left_pattern.search(left[::-1])
if splitleft is not None:
left = left[len(left) - splitleft.start():]
splitright = split_right_pattern.search(right)
if splitright is not None:
right = right[:splitright.start()]
return Context(left=left, right=right)
def restore_inverted_abstract(inverted_abstr):
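    """Rebuild the abstract text from an inverted index that maps each word to
    the list of positions where it occurs."""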
all_indexes = [index for indexes in inverted_abstr.values() for index in indexes]
if len(all_indexes) > 0:
length = max(all_indexes) + 1
else:
return None
abstract_words = ["" for _ in range(length)]
for word, indexes in inverted_abstr.items():
for index in indexes:
abstract_words[index] = word
return " ".join(abstract_words)
def extract_title_abstract(oa_object):
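    """Return the title and reconstructed abstract of a work record; these are
    used downstream as embedding input."""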
abstract = oa_object["abstract_inverted_index"]
title_abstract_obj = {
"title": oa_object["title"],
"abstract": (None if abstract is None else restore_inverted_abstract(abstract))
}
return title_abstract_obj
def extract_citation_contexts(cites, paper):
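    """
    For every DOI in `cites`, match its citation records against the reference
    section of `paper`, locate their in-text mentions, and return the mentions
    with surrounding context as a DataFrame.
    """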
    counter = 0
    extracted_citations = []
    references_pattern = re.compile(
        r'(\n\W*(\d{1,3})\W(.+?)(?=(?:\n\n)|(?:\n\W*\d{1,3}\W)|\Z))',
        re.I | re.M | re.S
    )
    # paper-level clean-up only needs to happen once, not once per DOI
    paper = filter_page_breaks(paper)
    # drop the title and author block at the beginning of the paper
    paper = paper[750:]
    for doi in cites:
        counter += 1
        citations = cites[doi]
        references = references_pattern.findall(paper)
found = 0
n_mentions = 0
has_abstract_title = 0
in_ref_section_refs = []
for cite in citations:
embedding_input = extract_title_abstract(cite)
if embedding_input["abstract"] is None or embedding_input["title"] is None:
in_ref_section_refs.append(None)
continue
has_abstract_title+=1
paper, in_ref_section_ref = find_reference_in_reference_section(paper, cite, references)
in_ref_section_refs.append(in_ref_section_ref)
ieee, auth_year = extract_potential_citations(paper)
        for ref in in_ref_section_refs:
            if ref is None:
                continue
            if ref[4] is not None:
                mentions = find_mentions_by_pointer(doi, ref, paper, ieee)
            else:
                mentions = []
            mentions += find_mentions_direct(doi, ref, paper, auth_year)
            extracted_citations += mentions
            if len(mentions) > 0:
                n_mentions += len(mentions)
                found += 1
print(f"{counter}/{len(cites)} - {doi}: {len(citations)} citations, {has_abstract_title} embeddable citations and {found} references with {n_mentions} mentions found in markdown.")
    return pd.DataFrame(
        extracted_citations,
        columns=["cited_in_doi", "citation_id", "reference_marker", "reference_target",
                 "mention_start", "mention_end", "left_context", "mention", "right_context"]
    )
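# Example driver (usage sketch): the command-line arguments and the JSON layout
# below are assumptions, shown only to illustrate how extract_citation_contexts()
# can be called with a {doi: [citation records]} mapping and the markdown text
# of one extracted paper.
if __name__ == "__main__":
    cites_path, paper_path, out_path = sys.argv[1:4]
    with open(cites_path) as f:
        cites = json.load(f)  # assumed layout: {doi: [OpenAlex-style work records]}
    paper_text = Path(paper_path).read_text()
    mentions_df = extract_citation_contexts(cites, paper_text)
    mentions_df.to_csv(out_path, index=False)
    print(f"wrote {len(mentions_df)} mentions to {out_path}")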