Create app.py
app.py
ADDED
@@ -0,0 +1,130 @@
import datetime

from datasets import load_dataset
from sentence_transformers import SentenceTransformer

import gradio as gr

def boolean_search(paragraph, query):
    # Build a lookup of the words appearing in the abstract
    words = paragraph.lower().split()
    words_dict = dict.fromkeys(words, True)

    # Split the query into tokens; fold "and not" into a single "not" operator
    # so queries such as "chatgpt AND NOT gpt3" evaluate cleanly
    query_words = query.lower().replace(" and not ", " not ").split()

    # Evaluate left to right, starting from the first term
    result = words_dict.get(query_words[0], False)

    for i in range(1, len(query_words) - 1, 2):
        operator = query_words[i]
        operand = words_dict.get(query_words[i + 1], False)

        if operator == 'and':
            result = result and operand
        elif operator == 'or':
            result = result or operand
        elif operator == 'not':
            result = result and not operand

    return result

def parse_retrieved(retrieved_examples, scores, filters, k):
    # Build result dicts from the raw FAISS hits and apply the user's filters
    results = []
    # Counters reported back in the UI: papers with a repo link, papers in
    # the date range, and papers meeting the boolean terms
    repo_avail, in_date, boolmet = len(scores), len(scores), len(scores)

    for i in range(len(scores)):

        resdict = {}
        for key in keys:
            resdict[key] = retrieved_examples[key][i]
        resdict['arxiv_url'] = "https://arxiv.org/abs/{}".format(retrieved_examples['id'][i])
        resdict['pdf_url'] = "https://arxiv.org/pdf/{}.pdf".format(retrieved_examples['id'][i])
        # Date of the first arXiv version of this paper
        resdict['published'] = retrieved_examples['versions'][i][0]['created']
        resdict['year'] = datetime.datetime.strptime(resdict['published'], "%a, %d %b %Y %H:%M:%S %Z").year
        resdict['score'] = str(round(scores[i], 3))[:5]
        relevant = True

        if resdict['repo_url'] is None:
            repo_avail -= 1
            resdict['repo_url'] = ""
            if filters['limit2_pwc']:
                relevant = False

        if filters['sy'] > resdict['year'] or filters['ey'] < resdict['year']:
            relevant = False
            in_date -= 1

        if filters['boolean_terms'] != "":
            if not boolean_search(resdict['abstract'], filters['boolean_terms']):
                relevant = False
                boolmet -= 1

        if relevant:
            results.append(resdict)

    return [results[:k], repo_avail, in_date, boolmet]

def create_metadata_html(metadata_dict):
    # Render one retrieved paper as an HTML card
    html = '''
    <div style="border: 1px solid #ccc; padding: 10px; background-color: #f9f9f9;">
    <h2>{title}</h2>
    <pre><p><strong>Relevance score:</strong> {score} <strong>Published:</strong> {published}</p></pre>
    <p><strong>Authors:</strong> {authors}</p>
    <pre><p><strong>Categories:</strong> {categories} <strong>Year:</strong> {year}</p></pre>
    <pre><p><a href="{arxiv_url}"><strong>ArXiv URL</strong></a> <a href="{pdf_url}"><strong>PDF URL</strong></a></p></pre>
    <p><strong>Abstract:</strong> {abstract}</p>
    <p><strong>Repo URL:</strong> <a href="{repo_url}">{repo_url}</a></p>
    </div>
    '''
    return html.format(**metadata_dict)

def search(query, boolean_terms, sy, ey, limit2_pwc):
    # Number of results to display
    k = 10

    # Embed the query and pull the 100 nearest abstracts from the FAISS index
    question_embedding = model.encode(query)
    scores, retrieved_examples = ds['train'].get_nearest_examples('embeddings', question_embedding, k=100)

    filters = {'limit2_pwc': limit2_pwc, 'sy': sy, 'ey': ey, 'boolean_terms': boolean_terms}

    results = parse_retrieved(retrieved_examples, scores, filters, k)

    divs = [create_metadata_html(r) for r in results[0]]
    divs.reverse()

    # Summary counts followed by one card per result
    html = ("<br><br><pre><strong>Articles with Repo:</strong> {} "
            "<strong>Articles in date range:</strong> {} "
            "<strong>Articles meeting boolean terms:</strong> {}</pre>"
            "<br><strong>Top 10 results returned</strong><br>").format(results[1], results[2], results[3]) + "<br>".join(divs)
    return html

# Dataset columns copied into each result dict
keys = ['title', 'authors', 'categories', 'abstract', 'repo_url', 'is_official', 'mentioned_in_paper']

# Load the pre-embedded arXiv CS dataset and index the embeddings with FAISS
ds = load_dataset("Corran/Arxiv_V12July23_Post2013CS_AllMiniV2L6")
ds['train'].add_faiss_index(column='embeddings')

# Sentence embedding model used to encode search queries
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

search_interface = gr.Blocks()

with search_interface:
    inputs = [
        gr.Textbox(label="Query", value="", info="Search Query"),
        gr.Textbox(label="Boolean Terms", value="", info="Simple boolean conditions on words contained in the abstract (AND, OR and NOT accepted for individual words; exact phrases aren't supported)"),
        gr.Slider(2013, 2023, step=1, value=2013, label="Start Year", info="Choose the earliest date for papers retrieved"),
        gr.Slider(2013, 2023, step=1, value=2023, label="End Year", info="Choose the latest date for papers retrieved"),
        gr.Checkbox(value=False, label="Limit results to those with a link to a github repo via pwc"),
    ]
    run = gr.Button("Search")
    examples = [
        ["We research the use of chatgpt on scientific article summarisation. Summaries are of scientific articles", "chatgpt AND NOT gpt3", 2013, 2023, True],
    ]
    gr.Examples(examples=examples, inputs=inputs)
    output = gr.HTML()
    run.click(fn=search, inputs=inputs, outputs=output, api_name="arxiv_semantic_search")

search_interface.launch()
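
For reference, the boolean filter above is a flat, left-to-right evaluation over the whitespace-separated words of an abstract, with no operator precedence or phrase matching. A minimal sketch of how the bundled example query behaves (the abstract text here is made up purely for illustration):

    abstract = "We study how ChatGPT summarises scientific articles."

    # Keep abstracts that mention chatgpt but not gpt3
    boolean_search(abstract, "chatgpt AND NOT gpt3")   # True
    boolean_search(abstract, "chatgpt AND gpt3")       # False

Because matching is word-for-word on the lowercased, whitespace-split abstract, a term with punctuation attached (e.g. "gpt3," with a trailing comma) counts as a different word from "gpt3".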