Pringled committed on
Commit
d54c792
·
1 Parent(s): 72c7e2c
Files changed (1) hide show
  1. app.py +5 -7
app.py CHANGED
@@ -156,9 +156,10 @@ def perform_deduplication(
156
  with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
157
  gr.Markdown("# Semantic Deduplication")
158
  gr.Markdown("""
159
- This demo showcases a semantic deduplication process where we identify duplicate texts within a single dataset or across two datasets.
160
- The deduplication is based on cosine similarity between the embeddings of the texts.
161
- You can adjust the similarity threshold to control the strictness of the deduplication.
 
162
  """)
163
 
164
  deduplication_type = gr.Radio(
@@ -181,8 +182,7 @@ with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
181
  dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
182
 
183
  threshold = gr.Slider(0.0, 1.0, value=default_threshold, label="Similarity Threshold")
184
- compute_button = gr.Button("Compute")
185
- stop_button = gr.Button("Stop")
186
  status_output = gr.Markdown(elem_id="status_output")
187
  result_output = gr.Markdown()
188
 
@@ -206,8 +206,6 @@ with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
206
  outputs=[status_output, result_output],
207
  )
208
 
209
- # Stop button functionality
210
- stop_button.click(lambda: demo.stop(), None, None)
211
 
212
  demo.launch()
213
 
 
156
  with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
157
  gr.Markdown("# Semantic Deduplication")
158
  gr.Markdown("""
159
+ This demo showcases semantic deduplication using Model2Vec for HuggingFace datasets.
160
+ It can be used to identify duplicate texts within a single dataset or across two datasets.
161
+ You can adjust the similarity threshold to control the strictness of the deduplication.\n
162
+ NOTE: this demo runs on a free CPU backend, so it may be slow for large datasets. For faster results, please run the code locally.
163
  """)
164
 
165
  deduplication_type = gr.Radio(
 
182
  dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
183
 
184
  threshold = gr.Slider(0.0, 1.0, value=default_threshold, label="Similarity Threshold")
185
+ compute_button = gr.Button("Deduplicate")
 
186
  status_output = gr.Markdown(elem_id="status_output")
187
  result_output = gr.Markdown()
188
 
 
206
  outputs=[status_output, result_output],
207
  )
208
 
 
 
209
 
210
  demo.launch()
211