import json
import re
import sys
import numpy as np
from pathlib import Path
from typing import NamedTuple

import pandas as pd

# Words that are replaced with a "$" placeholder before fuzzy matching, so that
# spelled-out Greek letters and their symbols collapse to the same underscore
# run during normalisation.
TITLE_NORMALIZE = [
    "alpha", "beta", "gamma", "delta", "epsilon", "kappa", "lambda"
]

class Context(NamedTuple):
    left: str
    right: str

split_right_pattern = re.compile(r"(?:#+)|(?:\[(?>[^A-Za-z0-9\[\]\.]{0,4}\d{1,3}[^A-Za-z0-9\[\]\.]{0,4})+?\])")
split_left_pattern = re.compile(r"(?:#+)|(?:\](?>[^A-Za-z0-9\[\]\.]{0,4}\d{1,3}[^A-Za-z0-9\[\]\.]{0,4})+?\[)")
ieee_style_pattern = re.compile(r"(?>\[(?>[^A-Za-z0-9\[\]\.]*(\d{1,3})[^A-Za-z0-9\[\]\.]*)+\][^A-Za-z0-9\[\]]*)+")
auth_year_style_pattern = re.compile(r"(?>\((?>[^()]+?[,\s][1-2][0-9]{3})+\)[^()A-Za-z0-9]*)+")
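# What the patterns above are meant to match (illustrative, not exhaustive):
#   split_right_pattern / split_left_pattern  mark context boundaries: a markdown
#       heading marker ("#", "##", ...) or another bracketed numeric citation; the
#       "left" variant is written against a reversed string (see create_context below).
#   ieee_style_pattern       numeric citation markers such as "[3]", "[1, 2]" or "[5-7]"
#   auth_year_style_pattern  author-year markers such as "(Smith et al., 2020; Jones, 2019)"
# Note: the (?>...) atomic groups require Python 3.11 or newer.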

def filter_page_breaks(content):
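    # A typical page break emitted by the markdown conversion looks roughly like
    # the following (illustrative; the footer/header line after the dashes is optional):
    #
    #     ...end of page text
    #
    #     -----
    #
    #     Some Journal, 12(3), 2021
    #     start of next page...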
    find_page_breaks = re.compile(
        r"""
        \n*
        \n # empty line
        -----\n  # 5 dashes
        \n # empty line
        (?:.*?\n)? # Capture the footer/header
        \n*
        """,
        re.VERBOSE | re.M
    )
    return find_page_breaks.sub(" ", content)

def get_author_title_year_patterns_from_citation(cite):
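    # Hypothetical example of the normalisation performed below (the dict mimics
    # the OpenAlex work fields this function reads; values are made up):
    #   cite = {"title": "Effects of beta-blockers on cardiac function",
    #           "publication_year": 2019,
    #           "authorships": [{"author": {"display_name": "Maria Garcia-Lopez"}}]}
    #   -> ("Garcia_Lopez", "Effects_of_blockers_on_cardiac_function", "2019")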
    title = cite['title']
    for w in TITLE_NORMALIZE:
        title = title.replace(w, "$")
    title = re.sub(r"[^a-zA-Z0-9]+", "_", title)  # collapse runs of non-alphanumeric characters to "_"
    year = str(cite['publication_year'])
    try:
        first_author = cite['authorships'][0]['author']['display_name']
        # keep only the last name, normalised the same way as the title
        first_author = re.sub(r"[^a-zA-Z0-9]+", "_", first_author.split(" ")[-1])
    except (IndexError, TypeError):
        first_author = None
    return first_author, title, year

def extract_potential_citations(paper):
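    # Collect every potential in-text citation marker, bucketed by style.
    # Illustrative (hypothetical) example: for the snippet
    #   "as shown in [3], [5-7] and (Smith et al., 2020; Jones, 2019)"
    # the buckets would look roughly like
    #   ieee:      (start, end, "[3], [5-7] ", {3, 5, 6, 7})
    #   auth-year: (start, end, "(Smith et al., 2020; Jones, 2019)",
    #               {"Smith et al., 2020", " Jones, 2019"})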
    ieee_style = ieee_style_pattern.finditer(paper)
    ieee_style_buckets = []
    for match in ieee_style:
        # use the full matched span; a repeated capture group retains only its last number
        possible = set(int(n) for n in re.findall(r"\d{1,3}", match.group(0)))
        # expand ranges such as "3-7" or "5–9"
        for start, end in re.findall(r"(\d{1,3})[–——-]+(\d{1,3})", match.group(0)):
            possible |= set(range(int(start), int(end) + 1))
        ieee_style_buckets.append((match.start(), match.end(), match.group(0), possible))
    
    auth_year_style = auth_year_style_pattern.finditer(paper)
    auth_year_style_buckets = []
    for match in auth_year_style:
        # split on parentheses and semicolons so each chunk keeps an author
        # fragment together with its year, which is what find_mentions_direct checks
        possible = set(chunk for chunk in re.split(r"[();]+", match.group(0)) if chunk.strip())
        auth_year_style_buckets.append((match.start(), match.end(), match.group(0), possible))

    return ieee_style_buckets, auth_year_style_buckets

def find_reference_in_reference_section(paper, cite, references):
    """
    Searches for reference section entry matching citation paper title, year, first author, and journal in a markdown file
    using fuzzy matching.
    """
    author, title, year = get_author_title_year_patterns_from_citation(cite)
    if any(p is None for p in (author, title, year)):
        return paper, None
    patterns = [author, title, year]
    # try to find all three patterns inside a single reference-list entry
    for full_ref, enum, ref_body in references:
        # normalise the reference body the same way the citation fields were normalised
        normalized = ref_body
        for w in TITLE_NORMALIZE:
            normalized = normalized.replace(w, "$")
        fuzzy_ref = re.sub(r"[^a-zA-Z0-9]+", "_", normalized)
        if all(re.search(pattern, fuzzy_ref, re.IGNORECASE | re.MULTILINE | re.DOTALL) for pattern in patterns):
            match = (cite["id"], author, title, year, enum, ref_body)
            # remove the reference from the paper so it can't be matched again
            paper = paper.replace(full_ref, "")
            return paper, match

    return paper, (cite["id"], author, title, year, None, None)
        
    
def find_mentions_by_pointer(doi, ref, paper, ieee_possible):
    """
    Match the links mentioning that reference in the text and extract context.
    """
    mentions = []
    (oa_id, _, _, _, ref_num, r) = ref
    for start, end, match, possible_numbers in ieee_possible:
        if int(ref_num) in possible_numbers:
            context = create_context(start, end, paper)
            mentions.append((doi, oa_id, ref_num, r, start, end, context.left, match, context.right))
    return mentions

def find_mentions_direct(doi, ref, paper, auth_style_possible):
    """
    Match the links mentioning that reference in the text and extract context.
    """
    mentions = []
    (oa_id, a, _, y, _, _) = ref 
    for start, end, match, possibilities in auth_style_possible:
        for possibility in possibilities:
            if y in possibility and a in possibility:
                context = create_context(start, end, paper)
                mentions.append((doi, oa_id, None, None, start, end, context.left, match, context.right))
    return mentions

def create_context(start, end, paper):
    left = paper[max(0, start - 500):start]
    right = paper[end:end + min(len(paper) - end, 500)]
    ## only take context until a next section begins or another citation appears
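    # the left-hand side is scanned right-to-left, which is why the string is
    # reversed and split_left_pattern is written against reversed text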
    splitleft = split_left_pattern.search(left[::-1])
    if splitleft is not None:
        left = left[len(left) - splitleft.start():]
    splitright = split_right_pattern.search(right)
    if splitright is not None:
        right = right[:splitright.start()]
    return Context(left=left, right=right)

def restore_inverted_abstract(inverted_abstr):
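    # Rebuild the abstract from an OpenAlex-style inverted index, e.g.
    #   restore_inverted_abstract({"Deep": [0], "learning": [1], "works": [2]})
    #   -> "Deep learning works"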
    all_indexes = [index for indexes in inverted_abstr.values() for index in indexes]
    if len(all_indexes) > 0:
        length = max(all_indexes) + 1
    else:
        return None
    abstract_words = ["" for _ in range(length)]
    for word, indexes in inverted_abstr.items():
        for index in indexes:
            abstract_words[index] = word
    return " ".join(abstract_words)

def extract_title_abstract(oa_object):
    abstract = oa_object["abstract_inverted_index"]
    title_abstract_obj = {
        "title": oa_object["title"],
        "abstract": (None if abstract is None else restore_inverted_abstract(abstract))
    }
    return title_abstract_obj

def extract_citation_contexts(cites, paper):
    counter = 0
    extracted_citations = []
    references_pattern = re.compile(r'(\n\W*(\d{1,3})\W(.+?)(?=(?:\n\n)|(?:\n\W*\d{1,3}\W)|\Z))', re.VERBOSE | re.I | re.M | re.S)
    for doi in cites:
        counter += 1
        # skip missing papers before trying to filter them
        if paper is None:
            continue
        paper = filter_page_breaks(paper)
        # remove title and authors from the beginning of the paper (fixed-offset heuristic)
        paper = paper[750:]
        citations = cites[doi]
        references = references_pattern.findall(paper)
        found = 0
        n_mentions = 0
        has_abstract_title = 0
        in_ref_section_refs = []
        for cite in citations:
            embedding_input = extract_title_abstract(cite)
            if embedding_input["abstract"] is None or embedding_input["title"] is None:
                in_ref_section_refs.append(None)
                continue
            has_abstract_title+=1
            paper, in_ref_section_ref = find_reference_in_reference_section(paper, cite, references)
            in_ref_section_refs.append(in_ref_section_ref)
        ieee, auth_year = extract_potential_citations(paper)

        for ref in in_ref_section_refs:
            if ref is None:
                continue
            # numbered mentions are only possible if the reference was found in the
            # reference section (ref[4] holds its enumeration number)
            mentions = find_mentions_by_pointer(doi, ref, paper, ieee) if ref[4] is not None else []
            mentions += find_mentions_direct(doi, ref, paper, auth_year)
            extracted_citations += mentions

            if len(mentions) > 0:
                n_mentions += len(mentions)
                found += 1

        print(f"{counter}/{len(cites)} - {doi}: {len(citations)} citations, {has_abstract_title} embeddable citations and {found} references with {n_mentions} mentions found in markdown.")
        
    return pd.DataFrame(extracted_citations, columns = ["cited_in_doi", "citation_id", "reference_marker", "reference_target", "mention_start", "mention_end", "left_context", "mention", "right_context"])
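
if __name__ == "__main__":
    # Minimal driver sketch (assumed usage, not part of the original pipeline):
    # expects a JSON file mapping each DOI to a list of OpenAlex-style works, a
    # markdown file of the parsed paper, and an output CSV path. File names and
    # formats here are illustrative assumptions.
    cites_path, paper_path, out_path = sys.argv[1], sys.argv[2], sys.argv[3]
    cites = json.loads(Path(cites_path).read_text(encoding="utf-8"))
    paper = Path(paper_path).read_text(encoding="utf-8")
    mentions = extract_citation_contexts(cites, paper)
    mentions.to_csv(out_path, index=False)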