Spaces:
Running
Running
Commit
·
6477832
1
Parent(s):
0067690
Bug fix: find_result cache breaks down with concurrent users
Browse files
README.md
CHANGED
|
@@ -4,7 +4,7 @@ emoji: π
|
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: green
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 4.
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: cc-by-nc-sa-4.0
|
|
|
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: green
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.44.1
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
license: cc-by-nc-sa-4.0
|
app.py
CHANGED
|
@@ -150,16 +150,16 @@ def search_docs(index_desc, query, maxnum, max_disp_len, max_clause_freq, max_di
|
|
| 150 |
docs.append([])
|
| 151 |
return tuple([latency, tokenization_info, message] + metadatas + docs)
|
| 152 |
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
def search_docs_new(index_desc, query, max_disp_len, max_clause_freq, max_diff_tokens):
|
| 156 |
-
global find_result
|
| 157 |
if ' AND ' in query or ' OR ' in query: # CNF query
|
| 158 |
find_result = process('find_cnf', index_desc, query=query, max_clause_freq=max_clause_freq, max_diff_tokens=max_diff_tokens)
|
| 159 |
find_result['type'] = 'cnf'
|
| 160 |
else: # simple query
|
| 161 |
find_result = process('find', index_desc, query=query)
|
| 162 |
find_result['type'] = 'simple'
|
|
|
|
|
|
|
|
|
|
| 163 |
latency = '' if 'latency' not in find_result else f'{find_result["latency"]:.3f}'
|
| 164 |
tokenization_info = format_tokenization_info(find_result)
|
| 165 |
if 'error' in find_result:
|
|
@@ -167,7 +167,7 @@ def search_docs_new(index_desc, query, max_disp_len, max_clause_freq, max_diff_t
|
|
| 167 |
idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
|
| 168 |
metadata = ''
|
| 169 |
doc = []
|
| 170 |
-
return latency, tokenization_info, message, idx, metadata, doc
|
| 171 |
|
| 172 |
if ' AND ' in query or ' OR ' in query: # CNF query
|
| 173 |
ptrs_by_shard = find_result['ptrs_by_shard']
|
|
@@ -183,21 +183,20 @@ def search_docs_new(index_desc, query, max_disp_len, max_clause_freq, max_diff_t
|
|
| 183 |
idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
|
| 184 |
metadata = ''
|
| 185 |
doc = []
|
| 186 |
-
return latency, tokenization_info, message, idx, metadata, doc
|
| 187 |
idx = random.randint(0, cnt_retrievable-1)
|
| 188 |
-
metadata, doc = get_another_doc(index_desc, idx, max_disp_len)
|
| 189 |
idx = gr.Number(minimum=0, maximum=cnt_retrievable-1, step=1, value=idx, interactive=True)
|
| 190 |
-
return latency, tokenization_info, message, idx, metadata, doc
|
| 191 |
|
| 192 |
-
def clear_search_docs_new():
|
| 193 |
-
|
| 194 |
-
find_result = None
|
| 195 |
idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
|
| 196 |
-
return idx
|
| 197 |
|
| 198 |
-
def get_another_doc(index_desc, idx, max_disp_len):
|
| 199 |
-
|
| 200 |
-
if not (type(idx) == int and 0 <= idx and idx < find_result['cnt']):
|
| 201 |
metadata = ''
|
| 202 |
doc = []
|
| 203 |
return metadata, doc
|
|
@@ -230,10 +229,10 @@ def get_another_doc(index_desc, idx, max_disp_len):
|
|
| 230 |
with gr.Blocks() as demo:
|
| 231 |
with gr.Column():
|
| 232 |
gr.HTML(
|
| 233 |
-
'''<h1 text-align="center">Infini-gram: An Engine
|
| 234 |
|
| 235 |
-
<p style='font-size: 16px;'>This
|
| 236 |
-
<p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng
|
| 237 |
<p style='font-size: 16px;'><b>API Endpoint:</b> If you'd like to issue batch queries to infini-gram, you may invoke our API endpoint. Please refer to the <a href="https://infini-gram.io/api_doc">API documentation</a>.</p>
|
| 238 |
<p style='font-size: 16px;'><b>Note:</b> The query is <b>case-sensitive</b>. Your query will be tokenized with the Llama-2 tokenizer (unless otherwise specified).</p>
|
| 239 |
'''
|
|
@@ -482,10 +481,29 @@ with gr.Blocks() as demo:
|
|
| 482 |
search_docs_new_idx = gr.Slider(label='', minimum=0, maximum=0, step=1, value=0, interactive=False)
|
| 483 |
search_docs_new_metadata = gr.Textbox(label='Metadata', lines=3, max_lines=3, interactive=False)
|
| 484 |
search_docs_new_output = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
|
|
|
|
| 485 |
search_docs_new_clear.add([search_docs_new_query, search_docs_new_latency, search_docs_new_tokenized, search_docs_new_message, search_docs_new_idx, search_docs_new_metadata, search_docs_new_output])
|
| 486 |
-
search_docs_new_clear.click(
|
| 487 |
-
|
| 488 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 489 |
|
| 490 |
with gr.Row():
|
| 491 |
gr.Markdown('''
|
|
|
|
| 150 |
docs.append([])
|
| 151 |
return tuple([latency, tokenization_info, message] + metadatas + docs)
|
| 152 |
|
| 153 |
+
def search_docs_new(index_desc, query, max_disp_len, max_clause_freq, max_diff_tokens, state):
|
|
|
|
|
|
|
|
|
|
| 154 |
if ' AND ' in query or ' OR ' in query: # CNF query
|
| 155 |
find_result = process('find_cnf', index_desc, query=query, max_clause_freq=max_clause_freq, max_diff_tokens=max_diff_tokens)
|
| 156 |
find_result['type'] = 'cnf'
|
| 157 |
else: # simple query
|
| 158 |
find_result = process('find', index_desc, query=query)
|
| 159 |
find_result['type'] = 'simple'
|
| 160 |
+
|
| 161 |
+
state = find_result
|
| 162 |
+
|
| 163 |
latency = '' if 'latency' not in find_result else f'{find_result["latency"]:.3f}'
|
| 164 |
tokenization_info = format_tokenization_info(find_result)
|
| 165 |
if 'error' in find_result:
|
|
|
|
| 167 |
idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
|
| 168 |
metadata = ''
|
| 169 |
doc = []
|
| 170 |
+
return latency, tokenization_info, message, idx, metadata, doc, state
|
| 171 |
|
| 172 |
if ' AND ' in query or ' OR ' in query: # CNF query
|
| 173 |
ptrs_by_shard = find_result['ptrs_by_shard']
|
|
|
|
| 183 |
idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
|
| 184 |
metadata = ''
|
| 185 |
doc = []
|
| 186 |
+
return latency, tokenization_info, message, idx, metadata, doc, state
|
| 187 |
idx = random.randint(0, cnt_retrievable-1)
|
| 188 |
+
metadata, doc = get_another_doc(index_desc, idx, max_disp_len, state)
|
| 189 |
idx = gr.Number(minimum=0, maximum=cnt_retrievable-1, step=1, value=idx, interactive=True)
|
| 190 |
+
return latency, tokenization_info, message, idx, metadata, doc, state
|
| 191 |
|
| 192 |
+
def clear_search_docs_new(state):
|
| 193 |
+
state = None
|
|
|
|
| 194 |
idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
|
| 195 |
+
return idx, state
|
| 196 |
|
| 197 |
+
def get_another_doc(index_desc, idx, max_disp_len, state):
|
| 198 |
+
find_result = state
|
| 199 |
+
if find_result is None or not (type(idx) == int and 0 <= idx and idx < find_result['cnt']):
|
| 200 |
metadata = ''
|
| 201 |
doc = []
|
| 202 |
return metadata, doc
|
|
|
|
| 229 |
with gr.Blocks() as demo:
|
| 230 |
with gr.Column():
|
| 231 |
gr.HTML(
|
| 232 |
+
'''<h1 text-align="center">Infini-gram: An Efficient Search Engine over the Massive Pretraining Datasets of Language Models</h1>
|
| 233 |
|
| 234 |
+
<p style='font-size: 16px;'>This engine does exact-match search over several open pretraining datasets of language models. Please first select the corpus and the type of query, then enter your query and submit.</p>
|
| 235 |
+
<p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng Liu</a> and documented in our paper: <a href="https://huggingface.co/papers/2401.17377">Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens</a>. Feel free to check out our <a href="https://infini-gram.io">Project Homepage</a>.</p>
|
| 236 |
<p style='font-size: 16px;'><b>API Endpoint:</b> If you'd like to issue batch queries to infini-gram, you may invoke our API endpoint. Please refer to the <a href="https://infini-gram.io/api_doc">API documentation</a>.</p>
|
| 237 |
<p style='font-size: 16px;'><b>Note:</b> The query is <b>case-sensitive</b>. Your query will be tokenized with the Llama-2 tokenizer (unless otherwise specified).</p>
|
| 238 |
'''
|
|
|
|
| 481 |
search_docs_new_idx = gr.Slider(label='', minimum=0, maximum=0, step=1, value=0, interactive=False)
|
| 482 |
search_docs_new_metadata = gr.Textbox(label='Metadata', lines=3, max_lines=3, interactive=False)
|
| 483 |
search_docs_new_output = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
|
| 484 |
+
search_docs_state = gr.State(value=None)
|
| 485 |
search_docs_new_clear.add([search_docs_new_query, search_docs_new_latency, search_docs_new_tokenized, search_docs_new_message, search_docs_new_idx, search_docs_new_metadata, search_docs_new_output])
|
| 486 |
+
search_docs_new_clear.click(
|
| 487 |
+
clear_search_docs_new,
|
| 488 |
+
inputs=[search_docs_state],
|
| 489 |
+
outputs=[search_docs_new_idx, search_docs_state]
|
| 490 |
+
)
|
| 491 |
+
search_docs_new_submit.click(
|
| 492 |
+
search_docs_new,
|
| 493 |
+
inputs=[index_desc, search_docs_new_query, search_docs_new_max_disp_len,
|
| 494 |
+
search_docs_new_max_clause_freq, search_docs_new_max_diff_tokens,
|
| 495 |
+
search_docs_state],
|
| 496 |
+
outputs=[search_docs_new_latency, search_docs_new_tokenized,
|
| 497 |
+
search_docs_new_message, search_docs_new_idx,
|
| 498 |
+
search_docs_new_metadata, search_docs_new_output,
|
| 499 |
+
search_docs_state]
|
| 500 |
+
)
|
| 501 |
+
search_docs_new_idx.input(
|
| 502 |
+
get_another_doc,
|
| 503 |
+
inputs=[index_desc, search_docs_new_idx, search_docs_new_max_disp_len,
|
| 504 |
+
search_docs_state],
|
| 505 |
+
outputs=[search_docs_new_metadata, search_docs_new_output]
|
| 506 |
+
)
|
| 507 |
|
| 508 |
with gr.Row():
|
| 509 |
gr.Markdown('''
|