Spaces:

chrible
/

citation-integrity

Sleeping

File size: 8,253 Bytes

b23f8b6

import json
import re
import sys
import numpy as np
from pathlib import Path
from typing import NamedTuple

import pandas as pd

# Spelled-out Greek letter names. The matching helpers replace each of these
# with "$" and then collapse every run of non-alphanumeric characters to "_"
# (presumably so a spelled-out name and the actual Greek symbol end up as the
# same placeholder -- confirm against real data).
TITLE_NORMALIZE = [
    "alpha",
    "beta",
    "gamma",
    "delta",
    "epsilon",
    "kappa",
    "lambda",
]

class Context(NamedTuple):
    """Text surrounding a citation mention, as returned by create_context."""
    left: str  # text directly before the mention
    right: str  # text directly after the mention

# Module-level citation patterns.
# NOTE: the atomic groups `(?>...)` used below are only supported by the
# standard `re` module from Python 3.11 onwards.

# Right-hand context boundary: a run of '#' characters, or a bracketed group
# of 1-3 digit numbers such as "[12]" or "[3, 4]".
split_right_pattern = re.compile(r"(?:#+)|(?:\[(?>[^A-Za-z0-9\[\]\.]{0,4}\d{1,3}[^A-Za-z0-9\[\]\.]{0,4})+?\])")
# Mirror image of split_right_pattern (brackets swapped); create_context
# runs it against the *reversed* left-hand context.
split_left_pattern = re.compile(r"(?:#+)|(?:\](?>[^A-Za-z0-9\[\]\.]{0,4}\d{1,3}[^A-Za-z0-9\[\]\.]{0,4})+?\[)")
# One or more consecutive bracketed number groups (e.g. "[1]" or "[2, 5][7]")
# followed by any non-alphanumeric, non-bracket characters. Group 1 sits inside
# a repeated group, so after a match it holds only the LAST number matched.
ieee_style_pattern = re.compile(r"(?>\[(?>[^A-Za-z0-9\[\]\.]*(\d{1,3})[^A-Za-z0-9\[\]\.]*)+\][^A-Za-z0-9\[\]]*)+")
# One or more consecutive parenthesised groups whose content ends in a comma
# or whitespace followed by a four-digit number starting with 1 or 2 (a year),
# e.g. "(Smith et al., 2020)".
auth_year_style_pattern = re.compile(r"(?>\((?>[^()]+?[,\s][1-2][0-9]{3})+\)[^()A-Za-z0-9]*)+")

def filter_page_breaks(content):
    """Replace every page-break marker in *content* with a single space.

    A marker is a newline, a line of exactly five dashes and a following
    blank line. Any newlines directly around the marker and the single line
    right after it (treated as a page header/footer) are swallowed as well.
    """
    page_break_re = re.compile(
        r"""
        \n*
        \n # empty line
        -----\n  # 5 dashes
        \n # empty line
        (?:.*?\n)? # Capture the footer/header
        \n*
        """,
        re.M | re.VERBOSE,
    )
    return page_break_re.sub(" ", content)

def get_author_title_year_patterns_from_citation(cite):
    """Build the normalized author/title/year search strings for a citation.

    Args:
        cite: citation record; reads ``cite['title']``,
            ``cite['publication_year']`` and
            ``cite['authorships'][0]['author']['display_name']``.

    Returns:
        A ``(first_author, title, year)`` tuple. ``title`` and
        ``first_author`` have every run of non-alphanumeric characters
        collapsed to ``"_"``; ``first_author`` is only the last
        space-separated token of the display name, or ``None`` when the
        authorship lookup fails. ``year`` is ``str(publication_year)``.
    """
    title = cite['title']
    for w in TITLE_NORMALIZE:
        title = title.replace(w, "$")
    # Collapse each run of non-alphanumeric characters (the "$" placeholders
    # included) into a single underscore.
    title = re.sub(r"[^a-zA-Z0-9]+", "_", title)
    year = str(cite['publication_year'])
    try:
        display_name = cite['authorships'][0]['author']['display_name']
        # Keep only the last name.
        first_author = re.sub(r"[^a-zA-Z0-9]+", "_", display_name.split(" ")[-1])
    except (IndexError, TypeError):
        # Empty authorship list, or a None somewhere along the lookup chain.
        # (`except IndexError or TypeError` would only catch IndexError.)
        first_author = None
    return first_author, title, year

def extract_potential_citations(paper):
    """Locate numeric ("[1, 3-5]") and author-year ("(Smith, 2020)") citations.

    Args:
        paper: the text to scan.

    Returns:
        A tuple ``(ieee_style_buckets, auth_year_style_buckets)``. Each bucket
        entry is ``(start, end, matched_text, possible)`` where ``start`` and
        ``end`` are offsets into *paper*. For numeric citations ``possible``
        is the set of reference numbers the marker may point at, with ranges
        expanded. For author-year citations ``possible`` is a set of strings,
        each the text leading up to and including one year.
    """
    ieee_style_buckets = []
    for match in ieee_style_pattern.finditer(paper):
        # Use the whole match: capture group 1 belongs to a repeated group and
        # therefore only ever holds the last number of the marker.
        marker = match.group(0)
        possible = {int(n) for n in re.findall(r"\d{1,3}", marker)}
        # Expand ranges such as "3-6" into every number they cover.
        for start, end in re.findall(r"(\d{1,3})[–——-]+(\d{1,3})", marker):
            possible.update(range(int(start), int(end) + 1))
        ieee_style_buckets.append((match.start(), match.end(), marker, possible))

    auth_year_style_buckets = []
    for match in auth_year_style_pattern.finditer(paper):
        # re.split with a capturing group yields text, year, text, year, ...,
        # trailing text. Re-join each text piece with the year that follows it
        # so a candidate contains the author AND the year; kept apart, no
        # single candidate could ever contain both.
        pieces = re.split(r"(\b[1-2]\d{3}\b)\W*", match.group(0))
        possible = {pieces[i] + pieces[i + 1] for i in range(0, len(pieces) - 1, 2)}
        if pieces[-1]:
            possible.add(pieces[-1])
        auth_year_style_buckets.append((match.start(), match.end(), match.group(0), possible))

    return ieee_style_buckets, auth_year_style_buckets

def find_reference_in_reference_section(paper, cite, references):
    """
    Find the reference-list entry that matches a citation's first author,
    title and publication year.

    Both the citation fields and each reference body are normalized the same
    way (TITLE_NORMALIZE words replaced, non-alphanumeric runs collapsed to
    "_") and compared case-insensitively.

    Args:
        paper: text of the citing paper.
        cite: citation record handed to
            get_author_title_year_patterns_from_citation; ``cite["id"]`` is
            also read.
        references: iterable of ``(full_ref, enum, ref_body)`` tuples.

    Returns:
        ``(paper, None)`` if author, title or year is unavailable. Otherwise
        ``(paper, (cite_id, author, title, year, enum, ref_body))``; on a hit
        the matched entry's text is removed from the returned paper, and on a
        miss ``enum`` and ``ref_body`` are ``None``.
    """
    author, title, year = get_author_title_year_patterns_from_citation(cite)
    if author is None or title is None or year is None:
        return paper, None
    patterns = (author, title, year)
    flags = re.IGNORECASE | re.MULTILINE | re.DOTALL
    for full_ref, enum, ref_body in references:
        # Apply every replacement cumulatively; starting from ref_body on each
        # iteration would keep only the last word's replacement.
        normalized = ref_body
        for w in TITLE_NORMALIZE:
            normalized = normalized.replace(w, "$")
        fuzzy_ref = re.sub(r"[^a-zA-Z0-9]+", "_", normalized)
        if all(re.search(pattern, fuzzy_ref, flags) for pattern in patterns):
            match = (cite["id"], author, title, year, enum, ref_body)
            # remove the reference from the paper so it can't be matched again
            paper = paper.replace(full_ref, "")
            return paper, match

    return paper, (cite["id"], author, title, year, None, None)
        
    
def find_mentions_by_pointer(doi, ref, paper, ieee_possible):
    """
    Collect the numeric citation markers that point at one reference.

    ``ref`` is the 6-tuple produced by find_reference_in_reference_section;
    its fifth element is the reference number. A marker from
    ``ieee_possible`` counts as a mention when that number is in the marker's
    set of candidate numbers. Each mention is returned as a 9-tuple:
    (doi, citation id, reference number, reference text, start, end,
    left context, marker text, right context).
    """
    oa_id, _, _, _, ref_num, ref_text = ref
    found = []
    for span_start, span_end, marker, numbers in ieee_possible:
        if int(ref_num) not in numbers:
            continue
        ctx = create_context(span_start, span_end, paper)
        found.append(
            (doi, oa_id, ref_num, ref_text, span_start, span_end, ctx.left, marker, ctx.right)
        )
    return found

def find_mentions_direct(doi, ref, paper, auth_style_possible):
    """
    Collect the author-year citation markers that name one reference.

    ``ref`` is the 6-tuple produced by find_reference_in_reference_section;
    its second and fourth elements are the author and year strings. For
    every marker in ``auth_style_possible`` one mention is recorded per
    candidate string that contains both the year and the author. Each
    mention is a 9-tuple: (doi, citation id, None, None, start, end,
    left context, marker text, right context).
    """
    oa_id, author, _, year, _, _ = ref
    found = []
    for span_start, span_end, marker, candidates in auth_style_possible:
        for candidate in candidates:
            if year not in candidate or author not in candidate:
                continue
            ctx = create_context(span_start, span_end, paper)
            found.append(
                (doi, oa_id, None, None, span_start, span_end, ctx.left, marker, ctx.right)
            )
    return found

def create_context(start, end, paper):
    """Return the text around ``paper[start:end]`` as a Context.

    Up to 500 characters are taken on each side. Each side is then cut at the
    nearest boundary matched by split_left_pattern / split_right_pattern
    (a run of '#' or another bracketed numeric citation), so the context
    stops where a new section begins or another citation appears.
    """
    before = paper[max(0, start - 500):start]
    # Slicing clamps at the end of the string, so no explicit length check.
    after = paper[end:end + 500]

    # The left side is searched back-to-front: the first hit in the reversed
    # text is the boundary closest to the mention.
    left_boundary = split_left_pattern.search(before[::-1])
    if left_boundary is not None:
        before = before[len(before) - left_boundary.start():]

    right_boundary = split_right_pattern.search(after)
    if right_boundary is not None:
        after = after[:right_boundary.start()]

    return Context(left=before, right=after)

def restore_inverted_abstract(inverted_abstr):
    """Rebuild plain abstract text from a word -> positions mapping.

    Returns ``None`` when the mapping holds no positions at all. Positions
    that no word claims are left as empty strings, so they show up as extra
    spaces in the joined result.
    """
    highest = max(
        (pos for positions in inverted_abstr.values() for pos in positions),
        default=None,
    )
    if highest is None:
        return None

    words = [""] * (highest + 1)
    for word, positions in inverted_abstr.items():
        for pos in positions:
            words[pos] = word
    return " ".join(words)

def extract_title_abstract(oa_object):
    """Return ``{"title": ..., "abstract": ...}`` for one record.

    The abstract is rebuilt from ``oa_object["abstract_inverted_index"]`` via
    restore_inverted_abstract; it is ``None`` when that index is ``None``.
    """
    inverted = oa_object["abstract_inverted_index"]
    title = oa_object["title"]
    if inverted is None:
        abstract_text = None
    else:
        abstract_text = restore_inverted_abstract(inverted)
    return {"title": title, "abstract": abstract_text}

def extract_citation_contexts(cites, paper, *, header_chars=750):
    """Extract every in-text mention of the given citations from a paper.

    For each DOI in *cites*, every citation that has both a title and an
    abstract is looked up in the paper's reference list. Its mentions are
    then collected twice: through numeric markers (when a reference number
    was found) and through author-year markers. One progress line is printed
    per DOI.

    Args:
        cites: mapping of DOI -> list of citation records.
        paper: text of the citing paper, or ``None``.
        header_chars: number of leading characters dropped after page-break
            filtering, to skip the title/author block. Defaults to 750.

    Returns:
        A pandas DataFrame with one row per mention. ``mention_start`` and
        ``mention_end`` are offsets into the processed text (page breaks
        filtered, header dropped, matched reference entries removed), not
        into the original *paper*. The frame is empty when *paper* is
        ``None``.
    """
    columns = [
        "cited_in_doi", "citation_id", "reference_marker", "reference_target",
        "mention_start", "mention_end", "left_context", "mention", "right_context",
    ]
    # Check for a missing paper BEFORE touching it; filter_page_breaks cannot
    # handle None.
    if paper is None:
        return pd.DataFrame([], columns=columns)

    references_pattern = re.compile(r'(\n\W*(\d{1,3})\W(.+?)(?=(?:\n\n)|(?:\n\W*\d{1,3}\W)|\Z))', re.VERBOSE | re.I | re.M | re.S)

    # Pre-process once. Doing this inside the loop on the `paper` variable
    # would strip another `header_chars` characters for every DOI.
    body = filter_page_breaks(paper)[header_chars:]
    references = references_pattern.findall(body)

    extracted_citations = []
    for counter, doi in enumerate(cites, start=1):
        citations = cites[doi]
        # Per-DOI working copy: matched reference entries are removed from it
        # without affecting the next DOI.
        text = body
        found = 0
        n_mentions = 0
        has_abstract_title = 0
        in_ref_section_refs = []
        for cite in citations:
            embedding_input = extract_title_abstract(cite)
            if embedding_input["abstract"] is None or embedding_input["title"] is None:
                in_ref_section_refs.append(None)
                continue
            has_abstract_title += 1
            text, in_ref_section_ref = find_reference_in_reference_section(text, cite, references)
            in_ref_section_refs.append(in_ref_section_ref)

        # Scan for markers only after the reference entries have been removed.
        ieee, auth_year = extract_potential_citations(text)

        for ref in in_ref_section_refs:
            if ref is None:
                continue
            # ref[4] is the reference number; without it only author-year
            # matching is possible.
            if ref[4] is not None:
                mentions = find_mentions_by_pointer(doi, ref, text, ieee)
            else:
                mentions = []
            mentions += find_mentions_direct(doi, ref, text, auth_year)
            extracted_citations += mentions

            if mentions:
                n_mentions += len(mentions)
                found += 1

        print(f"{counter}/{len(cites)} - {doi}: {len(citations)} citations, {has_abstract_title} embeddable citations and {found} references with {n_mentions} mentions found in markdown.")

    return pd.DataFrame(extracted_citations, columns=columns)