# (Extraction artifact: "Spaces: / Sleeping / Sleeping" is a hosting-page
# status banner captured with the file; it is not part of the program.)
import json | |
import re | |
import sys | |
import numpy as np | |
from pathlib import Path | |
from typing import NamedTuple | |
import pandas as pd | |
# Greek-letter words that may be spelled out in one record and written as the
# actual glyph in another. During fuzzy matching each word is swapped for "$",
# and "$" (being non-alphanumeric) then folds into "_" exactly as the glyph
# would — presumably making the two spellings compare equal; see
# get_author_title_year_patterns_from_citation.
TITLE_NORMALIZE = [
    "alpha",
    "beta",
    "gamma",
    "delta",
    "epsilon",
    "kappa",
    "lambda",
]
class Context(NamedTuple):
    """Text surrounding a citation mention.

    `left` is the text immediately before the mention, `right` the text
    immediately after it (both already truncated by create_context).
    """

    left: str
    right: str
# Delimiters that bound the usable context window around a mention: stop at a
# Markdown heading ("#+") or at another bracketed numeric citation.
# NOTE: both use atomic groups "(?>...)", which require Python 3.11+.
# `split_left_pattern` is applied to a REVERSED string (see create_context),
# hence its mirrored "]...[" bracket order.
split_right_pattern = re.compile(r"(?:#+)|(?:\[(?>[^A-Za-z0-9\[\]\.]{0,4}\d{1,3}[^A-Za-z0-9\[\]\.]{0,4})+?\])")
split_left_pattern = re.compile(r"(?:#+)|(?:\](?>[^A-Za-z0-9\[\]\.]{0,4}\d{1,3}[^A-Za-z0-9\[\]\.]{0,4})+?\[)")
# IEEE-style citation clusters, e.g. "[1], [4], [7]": one or more bracketed
# groups of 1-3 digit numbers, optionally chained by punctuation. The inner
# capture group holds only the last number matched; callers scan group(0).
ieee_style_pattern = re.compile(r"(?>\[(?>[^A-Za-z0-9\[\]\.]*(\d{1,3})[^A-Za-z0-9\[\]\.]*)+\][^A-Za-z0-9\[\]]*)+")
# Author-year citation clusters, e.g. "(Smith et al., 2020; Jones, 2019)":
# parenthesized runs whose segments end in a 4-digit year (1000-2999).
auth_year_style_pattern = re.compile(r"(?>\((?>[^()]+?[,\s][1-2][0-9]{3})+\)[^()A-Za-z0-9]*)+")
# Compiled once at module level: filter_page_breaks is called inside the
# per-DOI loop of extract_citation_contexts, and recompiling the verbose
# pattern on every call was pure waste.
_PAGE_BREAK_PATTERN = re.compile(
    r"""
    \n*
    \n          # empty line
    -----\n     # 5 dashes
    \n          # empty line
    (?:.*?\n)?  # the footer/header line, if present
    \n*
    """,
    re.VERBOSE | re.M,
)


def filter_page_breaks(content):
    """Collapse page-break markers ("-----" plus surrounding blank lines and
    an optional header/footer line) in `content` into a single space.

    Returns the cleaned string; `content` must be a str.
    """
    return _PAGE_BREAK_PATTERN.sub(" ", content)
def get_author_title_year_patterns_from_citation(cite):
    """Build fuzzy match fragments from an OpenAlex citation record.

    Returns a `(first_author, title, year)` tuple of strings suitable for
    re.search against a normalized reference entry; `first_author` is None
    when the record has no usable authorship entry.
    """
    title = cite['title']
    # Spelled-out Greek letters become "$", which the next substitution folds
    # into "_" -- the same thing that happens to an actual Greek glyph.
    for w in TITLE_NORMALIZE:
        title = title.replace(w, "$")
    # Collapse every non-alphanumeric run into "_" so punctuation/whitespace
    # differences between records cannot break the match.
    title = re.sub(r"[^a-zA-Z0-9]+", "_", title)
    year = str(cite['publication_year'])
    try:
        first_author = cite['authorships'][0]['author']['display_name']
        # Keep only the last name, normalized the same way as the title.
        first_author = re.sub(r"[^a-zA-Z0-9]+", "_", first_author.split(" ")[-1])
    except (IndexError, TypeError, KeyError):
        # BUG FIX: the original `except IndexError or TypeError` evaluates the
        # `or` first and only ever caught IndexError. KeyError is added for
        # records missing the 'author'/'display_name' keys.
        first_author = None
    return first_author, title, year
def extract_potential_citations(paper):
    """Locate every citation-looking span in `paper`.

    Returns two lists of `(start, end, matched_text, possible)` tuples:
    - IEEE style: `possible` is the set of ints the bracketed cluster can
      refer to, with dash ranges expanded;
    - author-year style: `possible` is a set of text chunks, each containing
      an author fragment together with its 4-digit year.
    """
    ieee_style_buckets = []
    for match in ieee_style_pattern.finditer(paper):
        # BUG FIX: the original read match.group(1), which holds only the
        # LAST number captured by the repeated group, so clusters such as
        # "[1, 2, 3]" lost all but one number. group(0) is the full span.
        span = match.group(0)
        possible = {int(n) for n in re.findall(r"\d{1,3}", span)}
        # Expand dash ranges such as "[1-4]". The original character class
        # contained mojibake ("βββ"); restored to unicode dash variants.
        for lo, hi in re.findall(r"(\d{1,3})[–—−-]+(\d{1,3})", span):
            possible |= set(range(int(lo), int(hi) + 1))
        ieee_style_buckets.append((match.start(), match.end(), span, possible))
    auth_year_style_buckets = []
    for match in auth_year_style_pattern.finditer(paper):
        # BUG FIX: the original re.split on the year put author fragments and
        # years into *separate* set elements, so the downstream test
        # "year in chunk and author in chunk" could never succeed. findall
        # keeps each author fragment attached to its year.
        possible = set(re.findall(r"[^()]+?[,\s][1-2][0-9]{3}", match.group(0)))
        auth_year_style_buckets.append((match.start(), match.end(), match.group(0), possible))
    return ieee_style_buckets, auth_year_style_buckets
def find_reference_in_reference_section(paper, cite, references):
    """Find the reference-section entry matching `cite` by fuzzy first
    author, title, and year.

    `references` is a list of `(full_match, enum, body)` tuples produced by
    the reference-list regex. On a hit the matched entry is blanked out of
    `paper` so it cannot be matched twice, and
    `(paper, (cite_id, author, title, year, enum, body))` is returned. When
    the citation lacks usable metadata the result is `(paper, None)`; when no
    entry matches, the enum/body slots of the tuple are None.
    """
    patterns = get_author_title_year_patterns_from_citation(cite)
    if any(p is None for p in patterns):
        return paper, None
    author, title, year = patterns
    patterns = [author, title, year]
    for full_ref, enum, ref_body in references:
        # Normalize the candidate entry the same way the patterns were built.
        # BUG FIX: the original restarted from `ref_body` on every loop pass
        # (`normalized = ref_body.replace(...)`), so only the LAST word in
        # TITLE_NORMALIZE was ever substituted; now the replacements accumulate
        # as they do in get_author_title_year_patterns_from_citation.
        normalized = ref_body
        for w in TITLE_NORMALIZE:
            normalized = normalized.replace(w, "$")
        fuzzy_ref = re.sub(r"[^a-zA-Z0-9]+", "_", normalized)
        if all(re.search(pattern, fuzzy_ref, re.IGNORECASE | re.MULTILINE | re.DOTALL) for pattern in patterns):
            # Remove the entry so later citations cannot claim it again.
            paper = paper.replace(full_ref, "")
            return paper, (cite["id"], author, title, year, enum, ref_body)
    return paper, (cite["id"], author, title, year, None, None)
def find_mentions_by_pointer(doi, ref, paper, ieee_possible):
    """Collect IEEE-style in-text mentions of `ref`.

    A mention matches when the reference-list number of `ref` appears in an
    IEEE citation bucket; each hit is returned with its surrounding context.
    """
    (oa_id, _, _, _, ref_num, target) = ref
    wanted = int(ref_num)
    mentions = []
    for start, end, matched_text, candidates in ieee_possible:
        if wanted not in candidates:
            continue
        ctx = create_context(start, end, paper)
        mentions.append((doi, oa_id, ref_num, target, start, end, ctx.left, matched_text, ctx.right))
    return mentions
def find_mentions_direct(doi, ref, paper, auth_style_possible):
    """Collect author-year style in-text mentions of `ref`.

    A mention matches when the author last name and the publication year of
    `ref` both occur inside one candidate chunk of an author-year bucket; one
    row is appended per matching chunk, with surrounding context.
    """
    (oa_id, author, _, year, _, _) = ref
    mentions = []
    for start, end, matched_text, chunks in auth_style_possible:
        for chunk in chunks:
            if year not in chunk or author not in chunk:
                continue
            ctx = create_context(start, end, paper)
            mentions.append((doi, oa_id, None, None, start, end, ctx.left, matched_text, ctx.right))
    return mentions
def create_context(start, end, paper):
    """Return up to 500 characters of text on each side of `paper[start:end]`,
    truncated at the nearest section heading or neighboring citation marker.
    """
    # Python slicing clamps at the string ends, so no explicit bounds math is
    # needed on the right side.
    left = paper[max(0, start - 500):start]
    right = paper[end:end + 500]
    # The left side is searched reversed so the delimiter *closest* to the
    # mention wins; split_left_pattern is written with mirrored brackets for
    # exactly this purpose.
    delimiter = split_left_pattern.search(left[::-1])
    if delimiter is not None:
        left = left[len(left) - delimiter.start():]
    delimiter = split_right_pattern.search(right)
    if delimiter is not None:
        right = right[:delimiter.start()]
    return Context(left=left, right=right)
def restore_inverted_abstract(inverted_abstr):
    """Rebuild plain abstract text from an inverted index.

    `inverted_abstr` maps each word to the list of positions where it occurs
    (OpenAlex `abstract_inverted_index` layout). Returns the reconstructed
    space-joined string, or None when the index contains no positions at all.
    """
    positions = [i for idxs in inverted_abstr.values() for i in idxs]
    if not positions:
        return None
    words = [""] * (max(positions) + 1)
    for word, idxs in inverted_abstr.items():
        for i in idxs:
            words[i] = word
    return " ".join(words)
def extract_title_abstract(oa_object):
    """Pull the title and (de-inverted) abstract out of an OpenAlex work
    record.

    Returns a dict with keys "title" and "abstract"; either value may be None
    when the corresponding field is missing from the record.
    """
    inverted = oa_object["abstract_inverted_index"]
    return {
        "title": oa_object["title"],
        "abstract": None if inverted is None else restore_inverted_abstract(inverted),
    }
def extract_citation_contexts(cites, paper):
    """Extract every in-text citation mention from a markdown paper.

    `cites` maps DOI -> list of OpenAlex citation records; `paper` is the
    markdown text (or None when unreadable). For each citation the matching
    reference-list entry is located, then all IEEE-style and author-year
    mentions of it are collected with surrounding context.

    Returns a DataFrame with one row per mention.
    """
    columns = ["cited_in_doi", "citation_id", "reference_marker", "reference_target",
               "mention_start", "mention_end", "left_context", "mention", "right_context"]
    extracted_citations = []
    # BUG FIX: the None guard originally sat *after* filter_page_breaks(paper),
    # which would already have raised on None. Check first; with no text there
    # is nothing to extract.
    if paper is None:
        return pd.DataFrame(extracted_citations, columns=columns)
    # BUG FIX: page-break filtering and the header strip used to run inside the
    # per-DOI loop, chopping another 750 characters off the paper on every
    # iteration. Both are loop-invariant and now run exactly once.
    paper = filter_page_breaks(paper)
    # Drop the title/author block at the top of the paper so it cannot be
    # matched as a reference entry.
    paper = paper[750:]
    # One reference-list entry: leading punctuation, a 1-3 digit enumerator,
    # then a body running up to a blank line, the next enumerated entry, or EOF.
    references_pattern = re.compile(
        r'(\n\W*(\d{1,3})\W(.+?)(?=(?:\n\n)|(?:\n\W*\d{1,3}\W)|\Z))',
        re.VERBOSE | re.I | re.M | re.S,
    )
    for counter, doi in enumerate(cites, start=1):
        citations = cites[doi]
        # Re-scan every iteration: matched entries are blanked out of `paper`
        # by find_reference_in_reference_section below.
        references = references_pattern.findall(paper)
        found = 0
        n_mentions = 0
        has_abstract_title = 0
        in_ref_section_refs = []
        for cite in citations:
            embedding_input = extract_title_abstract(cite)
            # Skip citations without both a title and an abstract.
            if embedding_input["abstract"] is None or embedding_input["title"] is None:
                in_ref_section_refs.append(None)
                continue
            has_abstract_title += 1
            paper, in_ref_section_ref = find_reference_in_reference_section(paper, cite, references)
            in_ref_section_refs.append(in_ref_section_ref)
        ieee, auth_year = extract_potential_citations(paper)
        for ref in in_ref_section_refs:
            if ref is None:
                continue
            # ref[4] is the reference-list enumerator; without it only direct
            # author-year matching is possible.
            if ref[4] is not None:
                mentions = find_mentions_by_pointer(doi, ref, paper, ieee)
            else:
                mentions = []
            mentions += find_mentions_direct(doi, ref, paper, auth_year)
            extracted_citations += mentions
            if len(mentions) > 0:
                n_mentions += len(mentions)
                found += 1
        print(f"{counter}/{len(cites)} - {doi}: {len(citations)} citations, {has_abstract_title} embeddable citations and {found} references with {n_mentions} mentions found in markdown.")
    return pd.DataFrame(extracted_citations, columns=columns)