Corran committed on
Commit
c6adab2
·
1 Parent(s): 95b4b28

Create app.py

Files changed (1)
  1. app.py +130 -0
app.py ADDED
@@ -0,0 +1,130 @@
import datetime
import math

from datasets import load_dataset
from sentence_transformers import SentenceTransformer

import gradio as gr


def boolean_search(paragraph, query):
    # Build a membership table for the words in the paragraph
    words = paragraph.lower().split()
    words_dict = dict.fromkeys(words, True)

    # Split the query into alternating term and operator tokens
    query_words = query.lower().split()
    if not query_words:
        return True

    result = words_dict.get(query_words[0], False)

    i = 1
    while i + 1 < len(query_words):
        operator = query_words[i]

        # Allow "and not x" / "or not x" by consuming an optional leading 'not'
        j = i + 1
        negate = query_words[j] == 'not'
        if negate:
            j += 1
        operand = words_dict.get(query_words[j], False) if j < len(query_words) else False
        if negate:
            operand = not operand

        if operator == 'and':
            result = result and operand
        elif operator == 'or':
            result = result or operand
        elif operator == 'not':
            # Bare "x not y" is treated as "x and not y"
            result = result and not operand

        i = j + 1

    return result

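# Illustrative behaviour of the word-level matching above (a sketch; the paragraph
# and query strings are only examples). The example query shipped with the UI,
# "chatgpt AND NOT gpt3", matches abstracts that mention chatgpt but not gpt3:
#   boolean_search("we evaluate chatgpt for summarisation", "chatgpt and not gpt3")  # True
#   boolean_search("a gpt3 based summariser", "chatgpt and not gpt3")                # False
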
def parse_retrieved(retrieved_examples, scores, filters, k):
    results = []
    # Counters for how many retrieved articles satisfy each filter
    repo_avail, in_date, boolmet = len(scores), len(scores), len(scores)

    for i in range(len(scores)):
        resdict = {}
        for key in keys:
            resdict[key] = retrieved_examples[key][i]
        resdict['arxiv_url'] = "https://arxiv.org/abs/{}".format(retrieved_examples['id'][i])
        resdict['pdf_url'] = "https://arxiv.org/pdf/{}.pdf".format(retrieved_examples['id'][i])
        # Creation date of the first version of this result
        resdict['published'] = retrieved_examples['versions'][i][0]['created']
        resdict['year'] = datetime.datetime.strptime(resdict['published'], "%a, %d %b %Y %H:%M:%S %Z").year
        resdict['score'] = str(round(scores[i], 3))[:5]
        relevant = True

        if resdict['repo_url'] is None:
            repo_avail -= 1
            resdict['repo_url'] = ""
            if filters['limit2_pwc']:
                relevant = False

        if filters['sy'] > resdict['year'] or filters['ey'] < resdict['year']:
            relevant = False
            in_date -= 1

        if filters['boolean_terms'] != "":
            boolean_met = boolean_search(resdict['abstract'], filters['boolean_terms'])
            if not boolean_met:
                relevant = False
                boolmet -= 1

        if relevant:
            results.append(resdict)

    return [results[:k], repo_avail, in_date, boolmet]

def create_metadata_html(metadata_dict):
    html = '''
    <div style="border: 1px solid #ccc; padding: 10px; background-color: #f9f9f9;">
        <h2>{title}</h2>
        <pre><p><strong>Relevance score:</strong> {score}    <strong>Published:</strong> {published}</p></pre>
        <p><strong>Authors:</strong> {authors}</p>
        <pre><p><strong>Categories:</strong> {categories}    <strong>Year:</strong> {year}</p></pre>
        <pre><p><a href="{arxiv_url}"><strong>ArXiv URL</strong></a>    <a href="{pdf_url}"><strong>PDF URL</strong></a></p></pre>
        <p><strong>Abstract:</strong> {abstract}</p>
        <p><strong>Repo URL:</strong> <a href="{repo_url}">{repo_url}</a></p>
    </div>
    '''
    return html.format(**metadata_dict)

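# The placeholders in the template above are filled from the dicts assembled in
# parse_retrieved: the dataset columns listed in `keys` plus the derived
# arxiv_url, pdf_url, published, year and score fields.
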
def search(query, boolean_terms, sy, ey, limit2_pwc):
    k = 10

    # Embed the query and retrieve the 100 nearest abstracts, then filter down to the top k
    question_embedding = model.encode(query)
    scores, retrieved_examples = ds['train'].get_nearest_examples('embeddings', question_embedding, k=100)

    filters = {'limit2_pwc': limit2_pwc, 'sy': sy, 'ey': ey, 'boolean_terms': boolean_terms}

    results = parse_retrieved(retrieved_examples, scores, filters, k)

    divs = [create_metadata_html(r) for r in results[0]]
    divs.reverse()

    html = (
        "<br><br><pre><strong>Articles with Repo:</strong> {} "
        "<strong>Articles in date range:</strong> {} "
        "<strong>Articles meeting boolean terms:</strong> {}</pre>"
        "<br><strong>Top 10 results returned</strong><br>"
    ).format(results[1], results[2], results[3]) + "<br>".join(divs)
    return html

# Metadata fields copied from each retrieved example in parse_retrieved
keys = ['title', 'authors', 'categories', 'abstract', 'repo_url', 'is_official', 'mentioned_in_paper']


# Load the pre-embedded ArXiv dataset and index its embedding column with FAISS
ds = load_dataset("Corran/Arxiv_V12July23_Post2013CS_AllMiniV2L6")
ds['train'].add_faiss_index(column='embeddings')

# Sentence embedding model used to encode incoming queries
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

search_interface = gr.Blocks()

with search_interface:
    inputs = [
        gr.Textbox(label="Query", value="", info="Search Query"),
        gr.Textbox(label="Boolean Terms", value="", info="Simple boolean conditions on words contained in the abstract (AND, OR and NOT are accepted for individual words; exact phrases aren't supported)"),
        gr.Slider(2013, 2023, step=1, value=2013, label="Start Year", info="Choose the earliest date for papers retrieved"),
        gr.Slider(2013, 2023, step=1, value=2023, label="End Year", info="Choose the latest date for papers retrieved"),
        gr.Checkbox(value=False, label="Limit results to those with a link to a GitHub repo via Papers with Code"),
    ]
    run = gr.Button("Search")
    examples = [
        ["We research the use of chatgpt on scientific article summarisation. Summaries are of scientific articles", "chatgpt AND NOT gpt3", 2013, 2023, True],
    ]
    # Render the predefined example row so it can be clicked to fill the inputs
    gr.Examples(examples=examples, inputs=inputs)
    output = gr.HTML()
    run.click(fn=search, inputs=inputs, outputs=output, api_name="Arxiv Semantic Search")

search_interface.launch()
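A minimal sanity-check sketch, not part of app.py itself: it assumes the dataset, FAISS index and model above have finished loading, and the query text is only an example. Calling search directly reproduces what the Search button does, which is handy for testing the retrieval and filtering pipeline outside the Gradio UI.

# Run in a Python session after the loading steps above (before launch()):
html = search(
    query="chatgpt for scientific article summarisation",
    boolean_terms="chatgpt and not gpt3",
    sy=2013,
    ey=2023,
    limit2_pwc=True,
)
print(html[:500])  # preview the rendered HTML for the top results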