#!/usr/bin/env python
import os
import re
import gzip
import json
import math
from multiprocessing import Pool, cpu_count

from arxiv_public_data.regex_arxiv import REGEX_ARXIV_FLEXIBLE, clean
from arxiv_public_data.config import DIR_FULLTEXT, DIR_OUTPUT, LOGGER

log = LOGGER.getChild('fulltext')

RE_FLEX = re.compile(REGEX_ARXIV_FLEXIBLE)
RE_OLDNAME_SPLIT = re.compile(r"([a-z\-]+)(\d+)")


def path_to_id(path):
    """ Convert filepath name of ArXiv file to ArXiv ID """
    name = os.path.splitext(os.path.basename(path))[0]
    if '.' in name:  # new-style ID, e.g. '1711.04105'
        return name
    # old-style ID: reinsert the '/' between archive name and number
    split = [a for a in RE_OLDNAME_SPLIT.split(name) if a]
    return "/".join(split)


def all_articles(directory=DIR_FULLTEXT):
    """ Find all *.txt files in directory """
    out = []
    # make sure the path is absolute for os.walk
    directory = os.path.abspath(os.path.expanduser(directory))
    for root, dirs, files in os.walk(directory):
        for f in files:
            # match the '*.txt' promise of the docstring; the looser
            # test 'txt' in f would also pick up names like 'notes.txt.bak'
            if f.endswith('.txt'):
                out.append(os.path.join(root, f))
    return out
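
# Illustrative call (paths are hypothetical; the layout under DIR_FULLTEXT
# depends on how the full-text dump was generated):
#   all_articles('~/data/fulltext')
#   -> ['/home/user/data/fulltext/arxiv/0704/0704.0001.txt', ...]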


def extract_references(filename, pattern=RE_FLEX):
    """
    Search a full-text file for arXiv IDs matching pattern

    Parameters
    ----------
    filename : str
        name of file to search for pattern
    pattern : re pattern object
        compiled regex pattern

    Returns
    -------
    citations : list
        list of found arXiv IDs
    """
    out = []
    with open(filename, 'r') as fn:
        txt = fn.read()

    for matches in pattern.findall(txt):
        # each match is a tuple of capture groups; keep the non-empty ones
        out.extend([clean(a) for a in matches if a])
    return list(set(out))
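
# Hypothetical usage (filename and output are illustrative only):
#   extract_references('/data/fulltext/hep-th9901001.txt')
#   -> ['hep-th/9802109', 'hep-th/9805114', ...]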


def citation_list_inner(articles):
    """ Find references in all the input articles

    Parameters
    ----------
    articles : list of str
        list of paths to article text

    Returns
    -------
    citations : dict[arXiv ID] = list of arXiv IDs
        dictionary of articles and their references
    """
    cites = {}
    for i, article in enumerate(articles):
        if i > 0 and i % 1000 == 0:
            log.info('Completed {} articles'.format(i))
        try:
            refs = extract_references(article)
            cites[path_to_id(article)] = refs
        except Exception:
            # a bare except would also swallow KeyboardInterrupt;
            # log the failing article and move on
            log.error("Error in {}".format(article))
            continue
    return cites
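
# The returned mapping pairs each article with the arXiv IDs found in its
# text, e.g. (IDs illustrative):
#   {'hep-th/9901001': ['hep-th/9802109', ...], '1711.04105': [...]}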


def citation_list_parallel(N=cpu_count(), directory=DIR_FULLTEXT):
    """
    Split the task of checking for citations across some number of processes

    Parameters
    ----------
    N : int
        number of processes
    directory : str
        directory where full text files are stored

    Returns
    -------
    citations : dict[arXiv ID] = list of arXiv IDs
        all arXiv citations in all articles
    """
    articles = all_articles(directory)
    log.info('Calculating citation network for {} articles'.format(len(articles)))

    # split the article list into N roughly equal, contiguous chunks
    A = len(articles)
    divs = list(range(0, A, math.ceil(A / N))) + [A]
    chunks = [articles[s:e] for s, e in zip(divs[:-1], divs[1:])]

    # the context manager makes sure the worker processes are cleaned up
    with Pool(N) as pool:
        cites = pool.map(citation_list_inner, chunks)

    allcites = {}
    for c in cites:
        allcites.update(c)
    return allcites
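
# Chunking example: with A = 10 articles and N = 4 processes,
# math.ceil(A / N) = 3, so divs = [0, 3, 6, 9, 10] and the chunk
# sizes are 3, 3, 3, 1 -- every article lands in exactly one chunk.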


def default_filename():
    return os.path.join(DIR_OUTPUT, 'internal-citations.json.gz')


def save_to_default_location(citations):
    filename = default_filename()

    log.info('Saving to "{}"'.format(filename))
    with gzip.open(filename, 'wb') as fn:
        fn.write(json.dumps(citations).encode('utf-8'))
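

if __name__ == "__main__":
    # Minimal driver sketch: build the citation graph with one worker per
    # CPU and write it to DIR_OUTPUT/internal-citations.json.gz.
    cites = citation_list_parallel()
    save_to_default_location(cites)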