Spaces:
Build error
Build error
arxiv id list support
Browse files- app.py +52 -23
- arxiv_public_data/config.py +1 -1
- requirements.txt +2 -0
- src/Surveyor.py +4 -69
- survey.py +6 -3
app.py
CHANGED
|
@@ -1,43 +1,71 @@
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
import numpy as np
|
| 4 |
|
| 5 |
from src.Surveyor import Surveyor
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
-
|
|
|
|
| 8 |
zip_file_name, survey_file_name = surveyor.survey(research_keywords,
|
|
|
|
| 9 |
max_search=max_search,
|
| 10 |
num_papers=num_papers
|
| 11 |
)
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
for line in file.readlines():
|
| 27 |
-
st.write(line)
|
| 28 |
|
| 29 |
|
| 30 |
-
def survey_space(surveyor):
|
|
|
|
| 31 |
form = st.sidebar.form(key='survey_form')
|
| 32 |
-
research_keywords = form.text_input("What would you like to research in today?")
|
| 33 |
max_search = form.number_input("num_papers_to_search", help="maximium number of papers to glance through - defaults to 20",
|
| 34 |
-
min_value=1, max_value=
|
| 35 |
num_papers = form.number_input("num_papers_to_select", help="maximium number of papers to select and analyse - defaults to 8",
|
| 36 |
-
min_value=1, max_value=
|
| 37 |
submit = form.form_submit_button('Submit')
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
if submit:
|
| 40 |
-
run_survey(surveyor, research_keywords, max_search, num_papers)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
|
| 43 |
if __name__ == '__main__':
|
|
@@ -45,6 +73,7 @@ if __name__ == '__main__':
|
|
| 45 |
std_col, survey_col = st.columns(2)
|
| 46 |
std_col.header('execution log:')
|
| 47 |
survey_col.header('Generated_survey:')
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from turtle import down
|
| 3 |
import streamlit as st
|
| 4 |
import pandas as pd
|
| 5 |
import numpy as np
|
| 6 |
|
| 7 |
from src.Surveyor import Surveyor
|
| 8 |
+
from streamlit_tags import st_tags_sidebar
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@st.experimental_singleton
|
| 12 |
+
def get_surveyor_instance(_print_fn, _survey_print_fn):
|
| 13 |
+
with st.spinner('Loading The-Surveyor ...'):
|
| 14 |
+
return Surveyor(_print_fn, _survey_print_fn, refresh_models=True)
|
| 15 |
|
| 16 |
+
|
| 17 |
+
def run_survey(surveyor, download_placeholder, research_keywords=None, arxiv_ids=None, max_search=None, num_papers=None):
|
| 18 |
zip_file_name, survey_file_name = surveyor.survey(research_keywords,
|
| 19 |
+
arxiv_ids,
|
| 20 |
max_search=max_search,
|
| 21 |
num_papers=num_papers
|
| 22 |
)
|
| 23 |
+
show_survey_download(zip_file_name, survey_file_name, download_placeholder)
|
| 24 |
+
|
| 25 |
|
| 26 |
+
def show_survey_download(zip_file_name, survey_file_name, download_placeholder):
|
| 27 |
+
download_placeholder.empty()
|
| 28 |
+
with download_placeholder.container():
|
| 29 |
+
with open(str(zip_file_name), "rb") as file:
|
| 30 |
+
btn = st.download_button(
|
| 31 |
+
label="Download extracted topic-clustered-highlights, images and tables as zip",
|
| 32 |
+
data=file,
|
| 33 |
+
file_name=str(zip_file_name)
|
| 34 |
+
)
|
| 35 |
|
| 36 |
+
with open(str(survey_file_name), "rb") as file:
|
| 37 |
+
btn = st.download_button(
|
| 38 |
+
label="Download detailed generated survey file",
|
| 39 |
+
data=file,
|
| 40 |
+
file_name=str(survey_file_name)
|
| 41 |
+
)
|
|
|
|
|
|
|
| 42 |
|
| 43 |
|
| 44 |
+
def survey_space(surveyor, download_placeholder):
|
| 45 |
+
|
| 46 |
form = st.sidebar.form(key='survey_form')
|
| 47 |
+
research_keywords = form.text_input("What would you like to research in today?", key='research_keywords')
|
| 48 |
max_search = form.number_input("num_papers_to_search", help="maximium number of papers to glance through - defaults to 20",
|
| 49 |
+
min_value=1, max_value=50, value=10, step=1, key='max_search')
|
| 50 |
num_papers = form.number_input("num_papers_to_select", help="maximium number of papers to select and analyse - defaults to 8",
|
| 51 |
+
min_value=1, max_value=8, value=2, step=1, key='num_papers')
|
| 52 |
submit = form.form_submit_button('Submit')
|
| 53 |
|
| 54 |
+
st.sidebar.write('or')
|
| 55 |
+
|
| 56 |
+
arxiv_ids = st_tags_sidebar(
|
| 57 |
+
label='# Enter Keywords:',
|
| 58 |
+
value=[],
|
| 59 |
+
text='Press enter to add more',
|
| 60 |
+
maxtags = 6,
|
| 61 |
+
key='arxiv_ids')
|
| 62 |
+
|
| 63 |
if submit:
|
| 64 |
+
run_survey(surveyor, download_placeholder, research_keywords, max_search, num_papers)
|
| 65 |
+
elif len(arxiv_ids):
|
| 66 |
+
run_survey(surveyor, download_placeholder, arxiv_ids)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
|
| 70 |
|
| 71 |
if __name__ == '__main__':
|
|
|
|
| 73 |
std_col, survey_col = st.columns(2)
|
| 74 |
std_col.header('execution log:')
|
| 75 |
survey_col.header('Generated_survey:')
|
| 76 |
+
download_placeholder = survey_col.container()
|
| 77 |
+
download_placeholder = st.empty()
|
| 78 |
+
surveyor_obj = get_surveyor_instance(_print_fn=std_col.write, _survey_print_fn=survey_col.write)
|
| 79 |
+
survey_space(surveyor_obj, survey_col)
|
arxiv_public_data/config.py
CHANGED
|
@@ -9,7 +9,7 @@ logging.basicConfig(
|
|
| 9 |
baselog = logging.getLogger('arxivdata')
|
| 10 |
logger = baselog.getChild('config')
|
| 11 |
|
| 12 |
-
DEFAULT_PATH = os.path.join(os.path.abspath('
|
| 13 |
JSONFILE = './config.json'
|
| 14 |
KEY = 'ARXIV_DATA'
|
| 15 |
|
|
|
|
| 9 |
baselog = logging.getLogger('arxivdata')
|
| 10 |
logger = baselog.getChild('config')
|
| 11 |
|
| 12 |
+
DEFAULT_PATH = os.path.join(os.path.abspath('.'), 'arxiv-data')
|
| 13 |
JSONFILE = './config.json'
|
| 14 |
KEY = 'ARXIV_DATA'
|
| 15 |
|
requirements.txt
CHANGED
|
@@ -3,6 +3,7 @@ arxiv
|
|
| 3 |
arxiv2bib
|
| 4 |
boto3==1.9.118
|
| 5 |
bert-extractive-summarizer
|
|
|
|
| 6 |
joblib
|
| 7 |
keybert
|
| 8 |
numpy
|
|
@@ -22,6 +23,7 @@ scispacy
|
|
| 22 |
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_scibert-0.5.0.tar.gz
|
| 23 |
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_lg-0.5.0.tar.gz
|
| 24 |
streamlit
|
|
|
|
| 25 |
summarizer
|
| 26 |
tabula
|
| 27 |
tabula_py
|
|
|
|
| 3 |
arxiv2bib
|
| 4 |
boto3==1.9.118
|
| 5 |
bert-extractive-summarizer
|
| 6 |
+
fitz==0.0.1.dev2
|
| 7 |
joblib
|
| 8 |
keybert
|
| 9 |
numpy
|
|
|
|
| 23 |
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_scibert-0.5.0.tar.gz
|
| 24 |
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_lg-0.5.0.tar.gz
|
| 25 |
streamlit
|
| 26 |
+
streamlit-tags
|
| 27 |
summarizer
|
| 28 |
tabula
|
| 29 |
tabula_py
|
src/Surveyor.py
CHANGED
|
@@ -1355,16 +1355,18 @@ class Surveyor:
|
|
| 1355 |
zipdir(dump_dir, zipf)
|
| 1356 |
return zip_name
|
| 1357 |
|
| 1358 |
-
def survey(self, query, max_search=None, num_papers=None, debug=False, weigh_authors=False):
|
| 1359 |
import joblib
|
| 1360 |
import os, shutil
|
| 1361 |
if not max_search:
|
| 1362 |
max_search = self.DEFAULTS['max_search']
|
| 1363 |
if not num_papers:
|
| 1364 |
num_papers = self.DEFAULTS['num_papers']
|
|
|
|
|
|
|
| 1365 |
# arxiv api relevance search and data preparation
|
| 1366 |
self.print_fn("\n-searching arXiv for top 100 papers.. ")
|
| 1367 |
-
results, searched_papers = self.search(query, max_search=max_search)
|
| 1368 |
joblib.dump(searched_papers, self.dump_dir + 'papers_metadata.dmp')
|
| 1369 |
self.print_fn("\n-found " + str(len(searched_papers)) + " papers")
|
| 1370 |
|
|
@@ -1485,70 +1487,3 @@ class Surveyor:
|
|
| 1485 |
survey_file) + "\nAll outputs zip path :" + os.path.abspath(self.dump_dir + output_zip))
|
| 1486 |
|
| 1487 |
return os.path.abspath(self.dump_dir + output_zip), os.path.abspath(survey_file)
|
| 1488 |
-
|
| 1489 |
-
|
| 1490 |
-
if __name__ == '__main__':
|
| 1491 |
-
import argparse
|
| 1492 |
-
|
| 1493 |
-
parser = argparse.ArgumentParser(description='Generate a survey just from a query !!')
|
| 1494 |
-
parser.add_argument('query', metavar='query_string', type=str,
|
| 1495 |
-
help='your research query/keywords')
|
| 1496 |
-
parser.add_argument('--max_search', metavar='max_metadata_papers', type=int, default=None,
|
| 1497 |
-
help='maximium number of papers to gaze at - defaults to 100')
|
| 1498 |
-
parser.add_argument('--num_papers', metavar='max_num_papers', type=int, default=None,
|
| 1499 |
-
help='maximium number of papers to download and analyse - defaults to 25')
|
| 1500 |
-
parser.add_argument('--pdf_dir', metavar='pdf_dir', type=str, default=None,
|
| 1501 |
-
help='pdf paper storage directory - defaults to arxiv_data/tarpdfs/')
|
| 1502 |
-
parser.add_argument('--txt_dir', metavar='txt_dir', type=str, default=None,
|
| 1503 |
-
help='text-converted paper storage directory - defaults to arxiv_data/fulltext/')
|
| 1504 |
-
parser.add_argument('--img_dir', metavar='img_dir', type=str, default=None,
|
| 1505 |
-
help='image storage directory - defaults to arxiv_data/images/')
|
| 1506 |
-
parser.add_argument('--tab_dir', metavar='tab_dir', type=str, default=None,
|
| 1507 |
-
help='tables storage directory - defaults to arxiv_data/tables/')
|
| 1508 |
-
parser.add_argument('--dump_dir', metavar='dump_dir', type=str, default=None,
|
| 1509 |
-
help='all_output_dir - defaults to arxiv_dumps/')
|
| 1510 |
-
parser.add_argument('--models_dir', metavar='save_models_dir', type=str, default=None,
|
| 1511 |
-
help='directory to save models (> 5GB) - defaults to saved_models/')
|
| 1512 |
-
parser.add_argument('--title_model_name', metavar='title_model_name', type=str, default=None,
|
| 1513 |
-
help='title model name/tag in hugging-face, defaults to \'Callidior/bert2bert-base-arxiv-titlegen\'')
|
| 1514 |
-
parser.add_argument('--ex_summ_model_name', metavar='extractive_summ_model_name', type=str, default=None,
|
| 1515 |
-
help='extractive summary model name/tag in hugging-face, defaults to \'allenai/scibert_scivocab_uncased\'')
|
| 1516 |
-
parser.add_argument('--ledmodel_name', metavar='ledmodel_name', type=str, default=None,
|
| 1517 |
-
help='led model(for abstractive summary) name/tag in hugging-face, defaults to \'allenai/led-large-16384-arxiv\'')
|
| 1518 |
-
parser.add_argument('--embedder_name', metavar='sentence_embedder_name', type=str, default=None,
|
| 1519 |
-
help='sentence embedder name/tag in hugging-face, defaults to \'paraphrase-MiniLM-L6-v2\'')
|
| 1520 |
-
parser.add_argument('--nlp_name', metavar='spacy_model_name', type=str, default=None,
|
| 1521 |
-
help='spacy model name/tag in hugging-face (if changed - needs to be spacy-installed prior), defaults to \'en_core_sci_scibert\'')
|
| 1522 |
-
parser.add_argument('--similarity_nlp_name', metavar='similarity_nlp_name', type=str, default=None,
|
| 1523 |
-
help='spacy downstream model(for similarity) name/tag in hugging-face (if changed - needs to be spacy-installed prior), defaults to \'en_core_sci_lg\'')
|
| 1524 |
-
parser.add_argument('--kw_model_name', metavar='kw_model_name', type=str, default=None,
|
| 1525 |
-
help='keyword extraction model name/tag in hugging-face, defaults to \'distilbert-base-nli-mean-tokens\'')
|
| 1526 |
-
parser.add_argument('--refresh_models', metavar='refresh_models', type=str, default=None,
|
| 1527 |
-
help='Refresh model downloads with given names (needs atleast one model name param above), defaults to False')
|
| 1528 |
-
parser.add_argument('--high_gpu', metavar='high_gpu', type=str, default=None,
|
| 1529 |
-
help='High GPU usage permitted, defaults to False')
|
| 1530 |
-
|
| 1531 |
-
args = parser.parse_args()
|
| 1532 |
-
|
| 1533 |
-
surveyor = Surveyor(
|
| 1534 |
-
pdf_dir=args.pdf_dir,
|
| 1535 |
-
txt_dir=args.txt_dir,
|
| 1536 |
-
img_dir=args.img_dir,
|
| 1537 |
-
tab_dir=args.tab_dir,
|
| 1538 |
-
dump_dir=args.dump_dir,
|
| 1539 |
-
models_dir=args.models_dir,
|
| 1540 |
-
title_model_name=args.title_model_name,
|
| 1541 |
-
ex_summ_model_name=args.ex_summ_model_name,
|
| 1542 |
-
ledmodel_name=args.ledmodel_name,
|
| 1543 |
-
embedder_name=args.embedder_name,
|
| 1544 |
-
nlp_name=args.nlp_name,
|
| 1545 |
-
similarity_nlp_name=args.similarity_nlp_name,
|
| 1546 |
-
kw_model_name=args.kw_model_name,
|
| 1547 |
-
refresh_models=args.refresh_models,
|
| 1548 |
-
high_gpu=args.high_gpu
|
| 1549 |
-
|
| 1550 |
-
)
|
| 1551 |
-
|
| 1552 |
-
surveyor.survey(args.query, max_search=args.max_search, num_papers=args.num_papers,
|
| 1553 |
-
debug=False, weigh_authors=False)
|
| 1554 |
-
|
|
|
|
| 1355 |
zipdir(dump_dir, zipf)
|
| 1356 |
return zip_name
|
| 1357 |
|
| 1358 |
+
def survey(self, query=None, id_list=None, max_search=None, num_papers=None, debug=False, weigh_authors=False):
|
| 1359 |
import joblib
|
| 1360 |
import os, shutil
|
| 1361 |
if not max_search:
|
| 1362 |
max_search = self.DEFAULTS['max_search']
|
| 1363 |
if not num_papers:
|
| 1364 |
num_papers = self.DEFAULTS['num_papers']
|
| 1365 |
+
if (query is None) and (id_list is None):
|
| 1366 |
+
raise ValueError('please provide a base to survey on: list of arxiv IDs or a few research keywords')
|
| 1367 |
# arxiv api relevance search and data preparation
|
| 1368 |
self.print_fn("\n-searching arXiv for top 100 papers.. ")
|
| 1369 |
+
results, searched_papers = self.search(query, id_list, max_search=max_search)
|
| 1370 |
joblib.dump(searched_papers, self.dump_dir + 'papers_metadata.dmp')
|
| 1371 |
self.print_fn("\n-found " + str(len(searched_papers)) + " papers")
|
| 1372 |
|
|
|
|
| 1487 |
survey_file) + "\nAll outputs zip path :" + os.path.abspath(self.dump_dir + output_zip))
|
| 1488 |
|
| 1489 |
return os.path.abspath(self.dump_dir + output_zip), os.path.abspath(survey_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
survey.py
CHANGED
|
@@ -9,8 +9,11 @@ if __name__ == '__main__':
|
|
| 9 |
import argparse
|
| 10 |
|
| 11 |
parser = argparse.ArgumentParser(description='Generate a survey just from a query !!')
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
| 14 |
parser.add_argument('--max_search', metavar='max_metadata_papers', type=int, default=None,
|
| 15 |
help='maximium number of papers to gaze at - defaults to 100')
|
| 16 |
parser.add_argument('--num_papers', metavar='max_num_papers', type=int, default=None,
|
|
@@ -67,6 +70,6 @@ if __name__ == '__main__':
|
|
| 67 |
|
| 68 |
)
|
| 69 |
|
| 70 |
-
surveyor.survey(args.query, max_search=args.max_search, num_papers=args.num_papers,
|
| 71 |
debug=False, weigh_authors=False)
|
| 72 |
|
|
|
|
| 9 |
import argparse
|
| 10 |
|
| 11 |
parser = argparse.ArgumentParser(description='Generate a survey just from a query !!')
|
| 12 |
+
|
| 13 |
+
data = parser.add_mutually_exclusive_group(required=True)
|
| 14 |
+
data.add_argument('--query', type=str, help='your research query/keywords')
|
| 15 |
+
data.add_argument('--arxiv_ids', nargs='+', help='arxiv ids for your curated set of papers')
|
| 16 |
+
|
| 17 |
parser.add_argument('--max_search', metavar='max_metadata_papers', type=int, default=None,
|
| 18 |
help='maximium number of papers to gaze at - defaults to 100')
|
| 19 |
parser.add_argument('--num_papers', metavar='max_num_papers', type=int, default=None,
|
|
|
|
| 70 |
|
| 71 |
)
|
| 72 |
|
| 73 |
+
surveyor.survey(query=args.query, id_list=args.arxiv_ids, max_search=args.max_search, num_papers=args.num_papers,
|
| 74 |
debug=False, weigh_authors=False)
|
| 75 |
|