# citation-integrity/extract_mentions.py
import json
import re
import sys
import numpy as np
from pathlib import Path
from typing import NamedTuple
import pandas as pd
TITLE_NORMALIZE = [
"alpha", "beta", "gamma", "delta", "epsilon", "kappa", "lambda"
]
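# Spelled-out Greek letter names are swapped for a "$" placeholder before fuzzy
# normalization; since "$" is itself non-alphanumeric it is later absorbed into an
# underscore run, so e.g. "alpha" in a title and "α" in a reference entry normalize
# the same way (my reading of the intent here).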
class Context(NamedTuple):
left: str
right: str
split_right_pattern = re.compile(r"(?:#+)|(?:\[(?>[^A-Za-z0-9\[\]\.]{0,4}\d{1,3}[^A-Za-z0-9\[\]\.]{0,4})+?\])")
split_left_pattern = re.compile(r"(?:#+)|(?:\](?>[^A-Za-z0-9\[\]\.]{0,4}\d{1,3}[^A-Za-z0-9\[\]\.]{0,4})+?\[)")
ieee_style_pattern = re.compile(r"(?>\[(?>[^A-Za-z0-9\[\]\.]*(\d{1,3})[^A-Za-z0-9\[\]\.]*)+\][^A-Za-z0-9\[\]]*)+")
auth_year_style_pattern = re.compile(r"(?>\((?>[^()]+?[,\s][1-2][0-9]{3})+\)[^()A-Za-z0-9]*)+")
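# Note: the (?>...) atomic groups above require Python >= 3.11 in the stdlib `re` module.
# Illustrative reading of the patterns (examples are mine, not exhaustive):
#   ieee_style_pattern       -- runs of bracketed numeric markers, e.g. "[3]", "[1, 2]", "[4]-[7]"
#   auth_year_style_pattern  -- parenthetical author-year citations, e.g. "(Smith et al., 2019)"
#                               or "(Doe, 2018; Lee and Kim, 2020)"
#   split_left/right_pattern -- used by create_context to cut a context window at a Markdown
#                               heading ("#", "##", ...) or at the next bracketed numeric citation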
def filter_page_breaks(content):
find_page_breaks = re.compile(
r"""
\n*
\n # empty line
-----\n # 5 dashes
\n # empty line
(?:.*?\n)? # Capture the footer/header
\n*
""",
re.VERBOSE | re.M
)
return re.sub(find_page_breaks, " ", content)
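# Example (my own, assuming the Markdown was produced by a PDF converter that renders page
# breaks as a blank line, five dashes, a blank line and an optional running header/footer):
#   "end of page.\n\n-----\n\nJournal of Examples, vol. 3\n\nstart of next page"
#   -> "end of page. start of next page"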
def get_author_title_year_patterns_from_citation(cite):
title = cite['title']
for w in TITLE_NORMALIZE:
title = title.replace(w, "$")
    title = re.sub(r"[^a-zA-Z0-9]+", "_", title)  # collapse punctuation/whitespace runs into "_" so the title becomes a fuzzy match pattern
# title = title.replace(" ", r"[^a-zA-Z0-9]+?")
year = str(cite['publication_year'])
try:
first_author = cite['authorships'][0]['author']['display_name']
## only lastname
first_author = re.sub(r"[^a-zA-Z0-9]+", "_", first_author.split(" ")[-1])
    except (IndexError, TypeError):  # no authorships listed, or authorships/author is None
first_author = None
return first_author, title, year
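# Example (hypothetical OpenAlex-style work object, trimmed to the fields used here):
#   cite = {"title": "Deep learning for beta decay", "publication_year": 2020,
#           "authorships": [{"author": {"display_name": "Jane Doe"}}]}
#   get_author_title_year_patterns_from_citation(cite)
#   -> ("Doe", "Deep_learning_for_decay", "2020")   # "beta" collapses into the "_" run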
def extract_potential_citations(paper):
ieee_style = ieee_style_pattern.finditer(paper)
ieee_style_buckets = []
for match in ieee_style:
        # use the full bracketed match so comma lists and dash ranges are all captured
        possible = set(int(n) for n in re.findall(r"\d{1,3}", match.group(0)))
        ## expand ranges such as "[5-7]" into the individual reference numbers
        ranges = re.findall(r"(\d{1,3})[–——-]+(\d{1,3})", match.group(0))
if len(ranges)>0:
for start, end in ranges:
possible |= set(range(int(start),int(end)+1))
ieee_style_buckets.append((match.start(), match.end(), match.group(0), possible))
auth_year_style = auth_year_style_pattern.finditer(paper)
auth_year_style_buckets = []
for match in auth_year_style:
        # split on parentheses/semicolons so each piece keeps an author together with its
        # year, which is what find_mentions_direct checks for
        possible = set(p.strip() for p in re.split(r"[();]", match.group(0)) if p.strip())
auth_year_style_buckets.append((match.start(), match.end(), match.group(0), possible))
return ieee_style_buckets, auth_year_style_buckets
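# Example (my own): for the snippet "see [3], [5-7] and (Doe, 2018; Lee and Kim, 2020)" the
# IEEE buckets hold one entry whose `possible` set is {3, 5, 6, 7}, and the author-year
# buckets one entry whose pieces include "Doe, 2018" and "Lee and Kim, 2020".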
def find_reference_in_reference_section(paper, cite, references):
"""
    Search the reference-section entries of the markdown paper for the one matching the
    cited work's title, publication year and first author, using fuzzy matching on
    normalized text. Returns the paper text (with the matched entry removed so it cannot
    be matched again) and a match tuple, or (paper, None) if the citation lacks author metadata.
    """
patterns = get_author_title_year_patterns_from_citation(cite)
if any([p is None for p in patterns]):
return paper, None
author, title, year = patterns
patterns = [author, title, year]
# Try finding all the patterns between two enumeration items starting from the back of the string
# for i,s in enumerate(references):
for full_ref, enum, ref_body in references:
        # apply the same Greek-letter/punctuation normalization as for the citation title
        normalized = ref_body
        for w in TITLE_NORMALIZE:
            normalized = normalized.replace(w, "$")
        fuzzy_ref = re.sub(r"[^a-zA-Z0-9]+", "_", normalized)
        if all(re.search(pattern, fuzzy_ref, re.IGNORECASE | re.MULTILINE | re.DOTALL) for pattern in patterns):
match = (cite["id"], author, title, year, enum, ref_body)
# remove the reference from the paper so it can't be matched again
paper = paper.replace(full_ref, "")
return paper, match
return paper, (cite["id"], author, title, year, None, None)
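# Example (my own): an entry produced by `references_pattern` (defined in
# extract_citation_contexts below) is a tuple of (full reference string, enumeration number,
# reference body), e.g. roughly ("\n[12] J. Doe, ...", "12", " J. Doe, ..."); a citation whose
# normalized author/title/year patterns all occur in the normalized body is matched to it.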
def find_mentions_by_pointer(doi, ref, paper, ieee_possible):
"""
    Find in-text numeric citation markers (e.g. "[12]") that point to this reference
    entry and extract the surrounding context for each mention.
    """
mentions = []
(oa_id, _, _, _, ref_num, r) = ref
for start, end, match, possible_numbers in ieee_possible:
if int(ref_num) in possible_numbers:
context = create_context(start, end, paper)
mentions.append((doi, oa_id, ref_num, r, start, end, context.left, match, context.right))
return mentions
def find_mentions_direct(doi, ref, paper, auth_style_possible):
"""
    Find in-text author-year citations (e.g. "(Doe, 2020)") that mention this reference
    and extract the surrounding context for each mention.
    """
mentions = []
(oa_id, a, _, y, _, _) = ref
for start, end, match, possibilities in auth_style_possible:
for possibility in possibilities:
if y in possibility and a in possibility:
context = create_context(start, end, paper)
mentions.append((doi, oa_id, None, None, start, end, context.left, match, context.right))
return mentions
def create_context(start, end, paper):
left = paper[max(0, start - 500):start]
right = paper[end:end + min(len(paper) - end, 500)]
## only take context until a next section begins or another citation appears
splitleft = split_left_pattern.search(left[::-1])
if splitleft is not None:
left = left[len(left) - splitleft.start():]
splitright = split_right_pattern.search(right)
if splitright is not None:
right = right[:splitright.start()]
return Context(left=left, right=right)
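# Example (my own): if start/end delimit "[4]" in "... as shown in [12]. Prior work [4] found ...",
# the left context is cut right after the earlier "[12]" marker and the right context runs until
# the next heading or bracketed citation (or the 500-character window ends).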
def restore_inverted_abstract(inverted_abstr):
all_indexes = [index for indexes in inverted_abstr.values() for index in indexes]
if len(all_indexes) > 0:
length = max(all_indexes) + 1
else:
return None
abstract_words = ["" for _ in range(length)]
for word, indexes in inverted_abstr.items():
for index in indexes:
abstract_words[index] = word
return " ".join(abstract_words)
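# Example (OpenAlex-style inverted abstract index; values are word positions):
#   restore_inverted_abstract({"Citation": [0], "contexts": [1], "matter": [2]})
#   -> "Citation contexts matter"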
def extract_title_abstract(oa_object):
abstract = oa_object["abstract_inverted_index"]
title_abstract_obj = {
"title": oa_object["title"],
"abstract": (None if abstract is None else restore_inverted_abstract(abstract))
}
return title_abstract_obj
def extract_citation_contexts(cites, paper):
counter=0
extracted_citations = []
references_pattern = re.compile(r'(\n\W*(\d{1,3})\W(.+?)(?=(?:\n\n)|(?:\n\W*\d{1,3}\W)|\Z))', re.VERBOSE | re.I | re.M | re.S)
    # preprocess once: strip page-break artifacts and skip the title/author block
    if paper is not None:
        paper = filter_page_breaks(paper)
        # crude heuristic: drop the first 750 characters (title and author list)
        paper = paper[750:]
    for doi in cites:
        counter += 1
        if paper is None:
            continue
        citations = cites[doi]
# references = re.findall(r'\n\s*(\d+)\.(.*?)(?=(?:\n\s*\d+\.)|\Z)', paper, re.VERBOSE | re.I | re.M | re.S)
references = references_pattern.findall(paper)
found = 0
n_mentions = 0
has_abstract_title = 0
in_ref_section_refs = []
for cite in citations:
embedding_input = extract_title_abstract(cite)
if embedding_input["abstract"] is None or embedding_input["title"] is None:
in_ref_section_refs.append(None)
continue
has_abstract_title+=1
paper, in_ref_section_ref = find_reference_in_reference_section(paper, cite, references)
in_ref_section_refs.append(in_ref_section_ref)
ieee, auth_year = extract_potential_citations(paper)
for ref in in_ref_section_refs:
if ref is not None:
                if ref[4] is not None:
                    # ref[4] is the reference-list number found in the reference section
                    mentions = find_mentions_by_pointer(doi, ref, paper, ieee)
                else:
                    mentions = []
mentions += find_mentions_direct(doi, ref, paper, auth_year)
extracted_citations+=mentions
if len(mentions)>0:
n_mentions+=len(mentions)
found+=1
print(f"{counter}/{len(cites)} - {doi}: {len(citations)} citations, {has_abstract_title} embeddable citations and {found} references with {n_mentions} mentions found in markdown.")
return pd.DataFrame(extracted_citations, columns = ["cited_in_doi", "citation_id", "reference_marker", "reference_target", "mention_start", "mention_end", "left_context", "mention", "right_context"])
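# Hypothetical driver, sketched here for illustration only: assumes a JSON file mapping the
# citing paper's DOI(s) to lists of OpenAlex work objects, a Markdown version of the citing
# paper, and an output CSV path. File formats and argument order are assumptions, not part
# of the original pipeline.
if __name__ == "__main__":
    cites_path, paper_path, out_path = sys.argv[1], sys.argv[2], sys.argv[3]
    with open(cites_path) as f:
        cites = json.load(f)
    paper = Path(paper_path).read_text(encoding="utf-8")
    mentions_df = extract_citation_contexts(cites, paper)
    mentions_df.to_csv(out_path, index=False)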