Updates
Browse files
app.py
CHANGED
@@ -156,9 +156,10 @@ def perform_deduplication(
|
|
156 |
with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
|
157 |
gr.Markdown("# Semantic Deduplication")
|
158 |
gr.Markdown("""
|
159 |
-
This demo showcases
|
160 |
-
|
161 |
-
You can adjust the similarity threshold to control the strictness of the deduplication
|
|
|
162 |
""")
|
163 |
|
164 |
deduplication_type = gr.Radio(
|
@@ -181,8 +182,7 @@ with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
|
|
181 |
dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
|
182 |
|
183 |
threshold = gr.Slider(0.0, 1.0, value=default_threshold, label="Similarity Threshold")
|
184 |
-
compute_button = gr.Button("
|
185 |
-
stop_button = gr.Button("Stop")
|
186 |
status_output = gr.Markdown(elem_id="status_output")
|
187 |
result_output = gr.Markdown()
|
188 |
|
@@ -206,8 +206,6 @@ with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
|
|
206 |
outputs=[status_output, result_output],
|
207 |
)
|
208 |
|
209 |
-
# Stop button functionality
|
210 |
-
stop_button.click(lambda: demo.stop(), None, None)
|
211 |
|
212 |
demo.launch()
|
213 |
|
|
|
156 |
with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
|
157 |
gr.Markdown("# Semantic Deduplication")
|
158 |
gr.Markdown("""
|
159 |
+
This demo showcases semantic deduplication using Model2Vec for HuggingFace datasets.
|
160 |
+
It can be used to identify duplicate texts within a single dataset or across two datasets.
|
161 |
+
You can adjust the similarity threshold to control the strictness of the deduplication.\n
|
162 |
+
NOTE: this demo runs on a free CPU backend, so it may be slow for large datasets. For faster results, please run the code locally.
|
163 |
""")
|
164 |
|
165 |
deduplication_type = gr.Radio(
|
|
|
182 |
dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
|
183 |
|
184 |
threshold = gr.Slider(0.0, 1.0, value=default_threshold, label="Similarity Threshold")
|
185 |
+
compute_button = gr.Button("Deduplicate")
|
|
|
186 |
status_output = gr.Markdown(elem_id="status_output")
|
187 |
result_output = gr.Markdown()
|
188 |
|
|
|
206 |
outputs=[status_output, result_output],
|
207 |
)
|
208 |
|
|
|
|
|
209 |
|
210 |
demo.launch()
|
211 |
|