Spaces:

jskim
/

paper-matching

Runtime error

App Files Files Community

paper-matching / input_format.py

jskim

init files

6eff5e7 over 2 years ago

raw

history blame

3.14 kB

	import numpy as np
	from pypdf import PdfReader
	from urllib.parse import urlparse
	import requests
	from semanticscholar import SemanticScholar

	### Input Formatting Module

	## Input formatting for the given paper
	# Extracting text from a pdf or a link

	def get_text_from_pdf(file_path):
	    """
	    Read the pdf at file_path and return a list holding the extracted
	    text of each page, in the order the reader yields the pages
	    """
	    pdf = PdfReader(file_path)
	    return [page.extract_text() for page in pdf.pages]

	def get_text_from_url(url, file_path='paper.pdf'):
	    """
	    Get text of the paper from a url

	    Only arXiv links are handled. A link to an abstract page
	    (".../abs/<id>") is rewritten to the matching pdf link; a link whose
	    path already contains "pdf" is downloaded as given. The pdf is saved
	    to file_path and then read back with get_text_from_pdf, whose result
	    is returned.

	    Raises ValueError('invalid url') when the host is not arxiv.org (or a
	    subdomain of it), when the path is neither an abstract nor a pdf link,
	    or when an abstract link carries no paper id.
	    """
	    # TODO check for other valid urls (e.g. semantic scholar)
	    url_parts = urlparse(url)
	    # Compare the parsed host name rather than searching netloc for a
	    # substring, so a look-alike host such as "arxiv.example.com" is
	    # rejected instead of being downloaded from
	    host = (url_parts.hostname or '').lower()
	    if host != 'arxiv.org' and not host.endswith('.arxiv.org'):
	        raise ValueError('invalid url')
	    segments = [part for part in url_parts.path.split('/') if part]
	    if 'abs' in segments:
	        # abstract page, change the url to pdf link.
	        # Everything after "abs" is the id: old-style ids contain a slash
	        # (e.g. "math/0211159"), so keeping only the last segment would
	        # drop the archive name
	        paper_id = '/'.join(segments[segments.index('abs') + 1:])
	        if not paper_id:
	            # e.g. ".../abs/" - nothing to build a pdf link from
	            raise ValueError('invalid url')
	        url = 'https://www.arxiv.org/pdf/%s.pdf' % paper_id
	    elif 'pdf' not in url_parts.path:
	        raise ValueError('invalid url')
	    # download the file
	    download_pdf(url, file_path)
	    # get the text from the pdf file
	    return get_text_from_pdf(file_path)

	def download_pdf(url, file_name, timeout=30):
	    """
	    Download the pdf file from given url and save it as file_name

	    url: address to send the GET request to
	    file_name: path the response body is written to (binary mode)
	    timeout: seconds passed to requests.get as its timeout

	    Raises ValueError when the response status is not 200, so that a
	    failed download is never followed by reading a missing or outdated
	    file_name.
	    """
	    # Send GET request; without a timeout an unresponsive server would
	    # block the call indefinitely
	    response = requests.get(url, timeout=timeout)
	    if response.status_code == 404:
	        raise ValueError('cannot download the file')
	    if response.status_code != 200:
	        # any other status also means there is no pdf to save
	        raise ValueError(
	            'cannot download the file (HTTP status %d)' % response.status_code
	        )
	    # Save the PDF
	    with open(file_name, "wb") as f:
	        f.write(response.content)

	## Input formatting for the given author (reviewer)
	# Extracting the author's papers (title, abstract) from a Semantic Scholar author ID

	def get_text_from_author_id(author_id, max_count=100, timeout=30):
	    """
	    Get the name and the papers of an author from the Semantic Scholar
	    Graph API

	    author_id: author ID, converted with str() (e.g. '1737249')
	    max_count: keep at most this many entries of the returned paper list
	    timeout: seconds passed to requests.get as its timeout

	    Returns (name, papers), read from the 'name' and 'papers' fields of
	    the JSON response; the request asks for the title and abstract of
	    each paper.

	    Raises ValueError when author_id is None or the API answers 404, and
	    requests.HTTPError for any other 4xx/5xx answer.
	    """
	    if author_id is None:
	        raise ValueError('Input valid author ID')
	    author_id = str(author_id)
	    url = "https://api.semanticscholar.org/graph/v1/author/%s?fields=url,name,paperCount,papers,papers.title,papers.abstract"%author_id
	    r = requests.get(url, timeout=timeout)
	    if r.status_code == 404:
	        raise ValueError('Input valid author ID')
	    # Fail with the HTTP status for other error answers instead of
	    # continuing into the JSON lookups below with an error payload
	    r.raise_for_status()
	    data = r.json()
	    papers = data['papers'][:max_count]
	    name = data['name']
	    return name, papers

	## TODO Preprocess Extracted Texts from PDFs
	# Get a portion of the text for actual task

	def get_title(text):
	    """
	    Placeholder for title extraction: not implemented yet, returns None
	    """
	    return None

	def get_abstract(text):
	    """
	    Placeholder for abstract extraction: not implemented yet, returns None
	    """
	    return None

	def get_introduction(text):
	    """
	    Placeholder for introduction extraction: not implemented yet, returns None
	    """
	    return None

	def get_conclusion(text):
	    """
	    Placeholder for conclusion extraction: not implemented yet, returns None
	    """
	    return None


	if __name__ == '__main__':
	    def run_sample():
	        """
	        Fetch one arXiv paper through its abstract link and then through
	        its pdf link, checking each time that the first line of the first
	        page is the expected title
	        """
	        expected_title = 'Sanity Simulations for Saliency Methods'
	        sample_urls = (
	            'https://arxiv.org/abs/2105.06506',
	            'https://arxiv.org/pdf/2105.06506.pdf',
	        )
	        for sample_url in sample_urls:
	            pages = get_text_from_url(sample_url)
	            first_line = pages[0].split('\n')[0]
	            assert first_line == expected_title
	        # text = get_text_from_url('https://arxiv.org/paetseths.pdf')

	    # test the code
	    run_sample()