Updates
Browse files
app.py
CHANGED
|
@@ -156,9 +156,10 @@ def perform_deduplication(
|
|
| 156 |
with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
|
| 157 |
gr.Markdown("# Semantic Deduplication")
|
| 158 |
gr.Markdown("""
|
| 159 |
-
This demo showcases
|
| 160 |
-
|
| 161 |
-
You can adjust the similarity threshold to control the strictness of the deduplication
|
|
|
|
| 162 |
""")
|
| 163 |
|
| 164 |
deduplication_type = gr.Radio(
|
|
@@ -181,8 +182,7 @@ with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
|
|
| 181 |
dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
|
| 182 |
|
| 183 |
threshold = gr.Slider(0.0, 1.0, value=default_threshold, label="Similarity Threshold")
|
| 184 |
-
compute_button = gr.Button("
|
| 185 |
-
stop_button = gr.Button("Stop")
|
| 186 |
status_output = gr.Markdown(elem_id="status_output")
|
| 187 |
result_output = gr.Markdown()
|
| 188 |
|
|
@@ -206,8 +206,6 @@ with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
|
|
| 206 |
outputs=[status_output, result_output],
|
| 207 |
)
|
| 208 |
|
| 209 |
-
# Stop button functionality
|
| 210 |
-
stop_button.click(lambda: demo.stop(), None, None)
|
| 211 |
|
| 212 |
demo.launch()
|
| 213 |
|
|
|
|
| 156 |
with gr.Blocks(css="#status_output { height: 50px; overflow: auto; }") as demo:
|
| 157 |
gr.Markdown("# Semantic Deduplication")
|
| 158 |
gr.Markdown("""
|
| 159 |
+
This demo showcases semantic deduplication using Model2Vec for HuggingFace datasets.
|
| 160 |
+
It can be used to identify duplicate texts within a single dataset or across two datasets.
|
| 161 |
+
You can adjust the similarity threshold to control the strictness of the deduplication.\n
|
| 162 |
+
NOTE: this demo runs on a free CPU backend, so it may be slow for large datasets. For faster results, please run the code locally.
|
| 163 |
""")
|
| 164 |
|
| 165 |
deduplication_type = gr.Radio(
|
|
|
|
| 182 |
dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
|
| 183 |
|
| 184 |
threshold = gr.Slider(0.0, 1.0, value=default_threshold, label="Similarity Threshold")
|
| 185 |
+
compute_button = gr.Button("Deduplicate")
|
|
|
|
| 186 |
status_output = gr.Markdown(elem_id="status_output")
|
| 187 |
result_output = gr.Markdown()
|
| 188 |
|
|
|
|
| 206 |
outputs=[status_output, result_output],
|
| 207 |
)
|
| 208 |
|
|
|
|
|
|
|
| 209 |
|
| 210 |
demo.launch()
|
| 211 |
|