Spaces:
Runtime error
Runtime error
| import numpy as np | |
| from pypdf import PdfReader | |
| from urllib.parse import urlparse | |
| import requests | |
| from semanticscholar import SemanticScholar | |
| ### Input Formatting Module | |
| ## Input formatting for the given paper | |
| # Extracting text from a pdf or a link | |
def get_text_from_pdf(file_path):
    """
    Read a pdf and return its text as a list with one entry per page
    """
    pdf = PdfReader(file_path)
    # One extract_text() result per page, kept in page order.
    return [page.extract_text() for page in pdf.pages]
def get_text_from_url(url, file_path='paper.pdf'):
    """
    Get text of the paper from a url

    Only arXiv links are accepted. An abstract page (``/abs/<id>``) is
    rewritten to the matching pdf link; a link whose path contains ``pdf``
    is downloaded unchanged.

    Args:
        url: link to an arXiv abstract page or pdf file.
        file_path: where the downloaded pdf is saved before it is parsed.

    Returns:
        The result of ``get_text_from_pdf`` on the downloaded file.

    Raises:
        ValueError: if the url is not a recognised arXiv abstract/pdf link.
    """
    # TODO check for other valid urls (e.g. semantic scholar)
    ## Check for different URL cases
    url_parts = urlparse(url)

    # arxiv
    # Compare the parsed hostname instead of searching netloc for a
    # substring, so look-alike hosts (e.g. "arxiv.evil.example") and
    # credentials/ports embedded in netloc cannot pass the check.
    host = (url_parts.hostname or '').lower()
    if host != 'arxiv.org' and not host.endswith('.arxiv.org'):
        raise ValueError('invalid url')

    path = url_parts.path
    abs_prefix = '/abs/'
    if path.startswith(abs_prefix):
        # abstract page, change the url to pdf link.
        # Keep everything after "/abs/" so old-style ids that contain a
        # slash ("hep-th/9901001") survive, and drop a trailing slash so
        # the id is never empty.
        paper_id = path[len(abs_prefix):].strip('/')
        if not paper_id:
            raise ValueError('invalid url')
        url = 'https://www.arxiv.org/pdf/%s.pdf' % paper_id
    elif 'pdf' in path:
        # pdf file, use the url as given
        pass
    else:
        raise ValueError('invalid url')

    # download the file
    download_pdf(url, file_path)

    # get the text from the pdf file
    return get_text_from_pdf(file_path)
def download_pdf(url, file_name):
    """
    Download the pdf file from given url and save it as file_name

    Args:
        url: address of the pdf file.
        file_name: path the response body is written to (binary mode).

    Raises:
        ValueError: if the server answers with any status other than 200.
            Nothing is written in that case, so a caller never goes on to
            parse a missing or stale file.
    """
    # Send GET request; the timeout keeps a stalled server from hanging
    # the caller indefinitely.
    response = requests.get(url, timeout=30)

    if response.status_code == 404:
        raise ValueError('cannot download the file')
    if response.status_code != 200:
        # Previously this case was only printed and execution continued.
        raise ValueError(
            'cannot download the file (HTTP status %d)' % response.status_code
        )

    # Save the PDF
    with open(file_name, "wb") as f:
        f.write(response.content)
| ## Input formatting for the given author (reviewer) | |
| # Extracting text from a link | |
def get_text_from_author_id(author_id, max_count=100):
    """
    Fetch an author's name and papers from the Semantic Scholar graph API

    Args:
        author_id: Semantic Scholar author ID (converted with ``str``).
        max_count: maximum number of papers to return.

    Returns:
        A ``(name, papers)`` tuple, where ``papers`` is the first
        ``max_count`` entries of the response's ``papers`` list.

    Raises:
        ValueError: if the ID is None/blank or the API answers 404.
        requests.HTTPError: for any other 4xx/5xx response.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote

    if author_id is None:
        raise ValueError('Input valid author ID')
    author_id = str(author_id).strip()
    if not author_id:
        # A blank ID would otherwise hit the bare ".../author/" endpoint.
        raise ValueError('Input valid author ID')

    # Quote the ID so characters such as "/" or "?" cannot alter the
    # request path or query string.
    url = "https://api.semanticscholar.org/graph/v1/author/%s?fields=url,name,paperCount,papers,papers.title,papers.abstract" % quote(author_id, safe='')
    r = requests.get(url, timeout=30)
    if r.status_code == 404:
        raise ValueError('Input valid author ID')
    # Surface other HTTP failures here rather than as a confusing
    # KeyError/JSON error further down.
    r.raise_for_status()

    data = r.json()
    papers = (data.get('papers') or [])[:max_count]
    name = data['name']
    return name, papers
| ## TODO Preprocess Extracted Texts from PDFs | |
| # Get a portion of the text for actual task | |
def get_title(text):
    """
    Placeholder for title extraction (not implemented yet); returns None
    """
    return None
def get_abstract(text):
    """
    Placeholder for abstract extraction (not implemented yet); returns None
    """
    return None
def get_introduction(text):
    """
    Placeholder for introduction extraction (not implemented yet); returns None
    """
    return None
def get_conclusion(text):
    """
    Placeholder for conclusion extraction (not implemented yet); returns None
    """
    return None
if __name__ == '__main__':
    def run_sample():
        """
        Sanity check: the abstract link and the pdf link of the same paper
        must both yield a first page whose first line is the paper title
        """
        expected_title = 'Sanity Simulations for Saliency Methods'
        sample_urls = (
            'https://arxiv.org/abs/2105.06506',
            'https://arxiv.org/pdf/2105.06506.pdf',
        )
        for sample_url in sample_urls:
            pages = get_text_from_url(sample_url)
            first_line = pages[0].split('\n')[0]
            assert(first_line == expected_title)
        # An invalid link such as 'https://arxiv.org/paetseths.pdf' is
        # deliberately not exercised here.

    # test the code
    run_sample()