Spaces:

minishlab
/

semantic-deduplication

Running

App Files Files Community

Pringled committed on Oct 12, 2024

Commit

b9fcd2c

1 Parent(s): 4f9641d

Updates

Browse files

Files changed (1) hide show

app.py +400 -162

app.py CHANGED Viewed

@@ -1,11 +1,10 @@
 import gradio as gr
 from datasets import load_dataset
 import numpy as np
-#from model2vec import StaticModel
 import model2vec
 from reach import Reach
 from difflib import ndiff
 # Load the model at startup
 model = model2vec.StaticModel.from_pretrained("minishlab/M2V_base_output")
@@ -22,52 +21,19 @@ default_threshold = 0.9
 ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
 ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
-# Patch tqdm to use Gradio's progress bar
-#from tqdm import tqdm as original_tqdm
-# Patch tqdm to use Gradio's progress bar
-# Patch tqdm to use Gradio's progress bar
-# def patch_tqdm_for_gradio(progress):
-#     class GradioTqdm(original_tqdm):
-#         def __init__(self, *args, **kwargs):
-#             super().__init__(*args, **kwargs)
-#             self.progress = progress
-#             self.total_batches = kwargs.get('total', len(args[0])) if len(args) > 0 else 1
-#             self.update_interval = max(1, self.total_batches // 100)  # Update every 1%
-#         def update(self, n=1):
-#             super().update(n)
-#             # Update Gradio progress bar every update_interval steps
-#             if self.n % self.update_interval == 0 or self.n == self.total_batches:
-#                 self.progress(self.n / self.total_batches)
-#     return GradioTqdm
-# def patch_model2vec_tqdm(progress):
-#     patched_tqdm = patch_tqdm_for_gradio(progress)
-#     model2vec.tqdm = patched_tqdm  # Replace tqdm in model2vec
-# # Function to patch the original encode function with our Gradio tqdm
-# def original_encode_with_tqdm(original_encode_func, patched_tqdm):
-#     def new_encode(*args, **kwargs):
-#         original_tqdm_backup = original_tqdm
-#         try:
-#             # Patch the `tqdm` within encode
-#             globals()['tqdm'] = patched_tqdm
-#             return original_encode_func(*args, **kwargs)
-#         finally:
-#             # Restore original tqdm after calling encode
-#             globals()['tqdm'] = original_tqdm_backup
-#     return new_encode
 def batch_iterable(iterable, batch_size):
     """Yield consecutive slices of ``iterable``, each at most ``batch_size`` long.

     Args:
         iterable: A sized, sliceable sequence (it must support ``len()`` and
             slice indexing), e.g. a list or a string.
         batch_size: Maximum number of items per batch; must be at least 1.

     Yields:
         Slices of ``iterable`` in order; the last one may be shorter.

     Raises:
         ValueError: If ``batch_size`` is smaller than 1. Because this is a
             generator, the error surfaces when iteration starts.
     """
     # A negative step makes range() empty and would silently drop every item;
     # a zero step fails with an unrelated-looking range() error.
     if batch_size < 1:
         raise ValueError(f"batch_size must be at least 1, got {batch_size}")
     for start in range(0, len(iterable), batch_size):
         yield iterable[start:start + batch_size]
 def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
     embeddings = []
     total_batches = (len(texts) + batch_size - 1) // batch_size
@@ -122,7 +88,6 @@ def display_word_differences(x: str, y: str) -> str:
     diff = ndiff(x.split(), y.split())
     return " ".join([word for word in diff if word.startswith(("+", "-"))])
 def encode_texts(texts, progress=None):
     """Embed ``texts`` with the module-level ``model`` and return the result.

     ``progress`` is accepted but not used here; the model's own progress bar
     is switched off.
     """
     return model.encode(texts, show_progressbar=False)
@@ -147,7 +112,8 @@ def perform_deduplication(
         if deduplication_type == "Single dataset":
             # Load Dataset 1
-            status = "Loading Dataset 1..."
             yield status, ""
             if (
                 dataset1_name == default_dataset1_name
@@ -156,29 +122,34 @@ def perform_deduplication(
                 ds = ds_default1
             else:
                 ds = load_dataset(dataset1_name, split=dataset1_split)
             # Extract texts
-            status = "Extracting texts from Dataset 1..."
             yield status, ""
             texts = [example[dataset1_text_column] for example in ds]
             # Compute embeddings
-            status = "Computing embeddings for Dataset 1..."
             yield status, ""
             embedding_matrix = encode_texts(texts, progress=progress)
-            #embedding_matrix = model.encode(texts, show_progressbar=True)
-            # embedding_matrix = compute_embeddings(
-            #     texts,
-            #     batch_size=64,
-            #     progress=progress,
-            #     desc="Computing embeddings for Dataset 1",
-            # )
             # Deduplicate
-            status = "Deduplicating embeddings..."
             yield status, ""
             deduplicated_indices, duplicate_to_original_mapping = deduplicate(
                 embedding_matrix, threshold, progress=progress
             )
             # Prepare the results
             num_duplicates = len(duplicate_to_original_mapping)
@@ -207,13 +178,13 @@ def perform_deduplication(
                 result_text += "No duplicates found."
             # Final status
-            status = "Deduplication completed."
             yield status, result_text
         elif deduplication_type == "Cross-dataset":
-            # Similar code for cross-dataset deduplication
-            # Load Dataset 1
-            status = "Loading Dataset 1..."
             yield status, ""
             if (
                 dataset1_name == default_dataset1_name
@@ -222,9 +193,11 @@ def perform_deduplication(
                 ds1 = ds_default1
             else:
                 ds1 = load_dataset(dataset1_name, split=dataset1_split)
-            # Load Dataset 2
-            status = "Loading Dataset 2..."
             yield status, ""
             if (
                 dataset2_name == default_dataset2_name
@@ -233,114 +206,15 @@ def perform_deduplication(
                 ds2 = ds_default2
             else:
                 ds2 = load_dataset(dataset2_name, split=dataset2_split)
-            # Extract texts from Dataset 1
-            status = "Extracting texts from Dataset 1..."
-            yield status, ""
-            texts1 = [example[dataset1_text_column] for example in ds1]
-            # Extract texts from Dataset 2
-            status = "Extracting texts from Dataset 2..."
-            yield status, ""
-            texts2 = [example[dataset2_text_column] for example in ds2]
-            # Compute embeddings for Dataset 1
-            status = "Computing embeddings for Dataset 1..."
-            yield status, ""
-            embedding_matrix1 = compute_embeddings(
-                texts1,
-                batch_size=64,
-                progress=progress,
-                desc="Computing embeddings for Dataset 1",
-            )
-            # Compute embeddings for Dataset 2
-            status = "Computing embeddings for Dataset 2..."
-            yield status, ""
-            embedding_matrix2 = compute_embeddings(
-                texts2,
-                batch_size=64,
-                progress=progress,
-                desc="Computing embeddings for Dataset 2",
-            )
-            # Deduplicate across datasets
-            status = "Deduplicating embeddings across datasets..."
             yield status, ""
-            duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
-                embedding_matrix1, embedding_matrix2, threshold, progress=progress
-            )
-            num_duplicates = len(duplicate_indices_in_ds2)
-            num_total_ds2 = len(texts2)
-            num_unique_ds2 = num_total_ds2 - num_duplicates
-            result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
-            result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
-            result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
-            # Show deduplicated examples
-            if num_duplicates > 0:
-                result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
-                num_examples = min(5, num_duplicates)
-                for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
-                    original_idx = duplicate_to_original_mapping[duplicate_idx]
-                    original_text = texts1[original_idx]
-                    duplicate_text = texts2[duplicate_idx]
-                    differences = display_word_differences(original_text, duplicate_text)
-                    result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
-                    result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
-                    result_text += f"**Differences:**\n{differences}\n"
-                    result_text += "-" * 50 + "\n\n"
-            else:
-                result_text += "No duplicates found."
-            # Final status
-            status = "Deduplication completed."
-            yield status, result_text
     except Exception as e:
         yield f"An error occurred: {e}", ""
         raise e
-def deduplicate_across_datasets(
-    embedding_matrix_1: np.ndarray,
-    embedding_matrix_2: np.ndarray,
-    threshold: float,
-    batch_size: int = 1024,
-    progress=None
-) -> tuple[list[int], dict[int, int]]:
-    # Building the index from Dataset 1
-    progress(0, desc="Building search index from Dataset 1...")
-    reach = Reach(
-        vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))]
-    )
-    duplicate_indices_in_test = []
-    duplicate_to_original_mapping = {}
-    # Finding nearest neighbors between datasets
-    progress(0, desc="Finding nearest neighbors between datasets...")
-    results = reach.nearest_neighbor_threshold(
-        embedding_matrix_2,
-        threshold=threshold,
-        batch_size=batch_size,
-        show_progressbar=False,  # Disable internal progress bar
-    )
-    total_items = len(embedding_matrix_2)
-    # Processing duplicates with a progress bar
-    for i, similar_items in enumerate(
-        progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)
-    ):
-        similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
-        if similar_indices:
-            duplicate_indices_in_test.append(i)
-            duplicate_to_original_mapping[i] = similar_indices[0]
-    return duplicate_indices_in_test, duplicate_to_original_mapping
 # Adjust the height of the status_output component using custom CSS
 with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
     gr.Markdown("# Semantic Deduplication")
@@ -401,3 +275,367 @@ with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
     )
 demo.launch()

 import gradio as gr
 from datasets import load_dataset
 import numpy as np
 import model2vec
 from reach import Reach
 from difflib import ndiff
+import time
 # Load the model at startup
 model = model2vec.StaticModel.from_pretrained("minishlab/M2V_base_output")
 ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
 ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
 def batch_iterable(iterable, batch_size):
     """Yield consecutive slices of ``iterable``, each at most ``batch_size`` long.

     Args:
         iterable: A sized, sliceable sequence (it must support ``len()`` and
             slice indexing), e.g. a list or a string.
         batch_size: Maximum number of items per batch; must be at least 1.

     Yields:
         Slices of ``iterable`` in order; the last one may be shorter.

     Raises:
         ValueError: If ``batch_size`` is smaller than 1. Because this is a
             generator, the error surfaces when iteration starts.
     """
     # A negative step makes range() empty and would silently drop every item;
     # a zero step fails with an unrelated-looking range() error.
     if batch_size < 1:
         raise ValueError(f"batch_size must be at least 1, got {batch_size}")
     for start in range(0, len(iterable), batch_size):
         yield iterable[start:start + batch_size]
+def log_time(message, start_time=None):
+    """Format a status message for a timed step.
+
+    Despite the name, nothing is logged: the formatted string is returned
+    to the caller.
+
+    Args:
+        message: Text describing the step.
+        start_time: A ``time.time()`` timestamp taken when the step began,
+            or None (the default) when the step is only starting.
+
+    Returns:
+        ``"<message> - Took <elapsed> seconds"`` (elapsed shown with two
+        decimals) when ``start_time`` is given, otherwise
+        ``"<message> - Started"``.
+    """
+    current_time = time.time()
+    if start_time is not None:
+        elapsed = current_time - start_time
+        return f"{message} - Took {elapsed:.2f} seconds"
+    # No start time supplied: the message is reported as a step that is beginning.
+    return f"{message} - Started"
 def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
     embeddings = []
     total_batches = (len(texts) + batch_size - 1) // batch_size
     diff = ndiff(x.split(), y.split())
     return " ".join([word for word in diff if word.startswith(("+", "-"))])
 def encode_texts(texts, progress=None):
     """Embed ``texts`` with the module-level ``model`` and return the result.

     ``progress`` is accepted but not used here; the model's own progress bar
     is switched off.
     """
     return model.encode(texts, show_progressbar=False)
         if deduplication_type == "Single dataset":
             # Load Dataset 1
+            start_time = time.time()
+            status = log_time("Loading Dataset 1")
             yield status, ""
             if (
                 dataset1_name == default_dataset1_name
                 ds = ds_default1
             else:
                 ds = load_dataset(dataset1_name, split=dataset1_split)
+            status = log_time("Loading Dataset 1 completed", start_time)
+            yield status, ""
             # Extract texts
+            start_time = time.time()
+            status = log_time("Extracting texts from Dataset 1")
             yield status, ""
             texts = [example[dataset1_text_column] for example in ds]
+            status = log_time("Extracting texts from Dataset 1 completed", start_time)
+            yield status, ""
             # Compute embeddings
+            start_time = time.time()
+            status = log_time("Computing embeddings for Dataset 1")
             yield status, ""
             embedding_matrix = encode_texts(texts, progress=progress)
+            status = log_time("Computing embeddings for Dataset 1 completed", start_time)
+            yield status, ""
             # Deduplicate
+            start_time = time.time()
+            status = log_time("Deduplicating embeddings")
             yield status, ""
             deduplicated_indices, duplicate_to_original_mapping = deduplicate(
                 embedding_matrix, threshold, progress=progress
             )
+            status = log_time("Deduplication completed", start_time)
+            yield status, ""
             # Prepare the results
             num_duplicates = len(duplicate_to_original_mapping)
                 result_text += "No duplicates found."
             # Final status
+            status = log_time("Deduplication process finished")
             yield status, result_text
         elif deduplication_type == "Cross-dataset":
+            # Similar code for cross-dataset deduplication with time logging
+            start_time = time.time()
+            status = log_time("Loading Dataset 1")
             yield status, ""
             if (
                 dataset1_name == default_dataset1_name
                 ds1 = ds_default1
             else:
                 ds1 = load_dataset(dataset1_name, split=dataset1_split)
+            status = log_time("Loading Dataset 1 completed", start_time)
+            yield status, ""
+            start_time = time.time()
+            status = log_time("Loading Dataset 2")
             yield status, ""
             if (
                 dataset2_name == default_dataset2_name
                 ds2 = ds_default2
             else:
                 ds2 = load_dataset(dataset2_name, split=dataset2_split)
+            status = log_time("Loading Dataset 2 completed", start_time)
             yield status, ""
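+            # TODO: text extraction, embedding computation and deduplication are not
+            # implemented for the cross-dataset path; this branch currently only loads
+            # both datasets and yields status messages, so no result text is produced.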
     except Exception as e:
         yield f"An error occurred: {e}", ""
         raise e
 # Adjust the height of the status_output component using custom CSS
 with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
     gr.Markdown("# Semantic Deduplication")
     )
 demo.launch()
+# import gradio as gr
+# from datasets import load_dataset
+# import numpy as np
+# #from model2vec import StaticModel
+# import model2vec
+# from reach import Reach
+# from difflib import ndiff
+# # Load the model at startup
+# model = model2vec.StaticModel.from_pretrained("minishlab/M2V_base_output")
+# # Default dataset parameters
+# default_dataset1_name = "sst2"
+# default_dataset1_split = "train"
+# default_dataset2_name = "sst2"
+# default_dataset2_split = "validation"
+# default_text_column = "sentence"
+# default_threshold = 0.9
+# # Load the default datasets at startup
+# ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
+# ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
+# def batch_iterable(iterable, batch_size):
+#     """Helper function to create batches from an iterable."""
+#     for i in range(0, len(iterable), batch_size):
+#         yield iterable[i:i + batch_size]
+# def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
+#     embeddings = []
+#     total_batches = (len(texts) + batch_size - 1) // batch_size
+#     for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
+#         batch_embeddings = model.encode(batch_texts, show_progressbar=False)
+#         embeddings.append(batch_embeddings)
+#         progress((i + 1) / total_batches, desc=desc)
+#     return np.concatenate(embeddings, axis=0)
+# def deduplicate(
+#     embedding_matrix: np.ndarray,
+#     threshold: float,
+#     batch_size: int = 1024,
+#     progress=None
+# ) -> tuple[np.ndarray, dict[int, int]]:
+#     # Building the index
+#     progress(0, desc="Building search index...")
+#     reach = Reach(
+#         vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))]
+#     )
+#     deduplicated_indices = set(range(len(embedding_matrix)))
+#     duplicate_to_original_mapping = {}
+#     # Finding nearest neighbors
+#     progress(0, desc="Finding nearest neighbors...")
+#     results = reach.nearest_neighbor_threshold(
+#         embedding_matrix,
+#         threshold=threshold,
+#         batch_size=batch_size,
+#         show_progressbar=False,  # Disable internal progress bar
+#     )
+#     # Processing duplicates with a progress bar
+#     total_items = len(embedding_matrix)
+#     for i, similar_items in enumerate(
+#         progress.tqdm(results, desc="Processing duplicates", total=total_items)
+#     ):
+#         if i not in deduplicated_indices:
+#             continue
+#         similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
+#         for sim_idx in similar_indices:
+#             if sim_idx in deduplicated_indices:
+#                 deduplicated_indices.remove(sim_idx)
+#                 duplicate_to_original_mapping[sim_idx] = i
+#     return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
+# def display_word_differences(x: str, y: str) -> str:
+#     diff = ndiff(x.split(), y.split())
+#     return " ".join([word for word in diff if word.startswith(("+", "-"))])
+# def encode_texts(texts, progress=None):
+#     embedding_matrix = model.encode(texts, show_progressbar=False)
+#     return embedding_matrix
+# def perform_deduplication(
+#     deduplication_type,
+#     dataset1_name,
+#     dataset1_split,
+#     dataset1_text_column,
+#     dataset2_name="",
+#     dataset2_split="",
+#     dataset2_text_column="",
+#     threshold=default_threshold,
+#     progress=gr.Progress(track_tqdm=True),
+# ):
+#     try:
+#         # Convert threshold to float
+#         threshold = float(threshold)
+#         # Initialize status message
+#         status = ""
+#         if deduplication_type == "Single dataset":
+#             # Load Dataset 1
+#             status = "Loading Dataset 1..."
+#             yield status, ""
+#             if (
+#                 dataset1_name == default_dataset1_name
+#                 and dataset1_split == default_dataset1_split
+#             ):
+#                 ds = ds_default1
+#             else:
+#                 ds = load_dataset(dataset1_name, split=dataset1_split)
+#             # Extract texts
+#             status = "Extracting texts from Dataset 1..."
+#             yield status, ""
+#             texts = [example[dataset1_text_column] for example in ds]
+#             # Compute embeddings
+#             status = "Computing embeddings for Dataset 1..."
+#             yield status, ""
+#             embedding_matrix = encode_texts(texts, progress=progress)
+#             #embedding_matrix = model.encode(texts, show_progressbar=True)
+#             # embedding_matrix = compute_embeddings(
+#             #     texts,
+#             #     batch_size=64,
+#             #     progress=progress,
+#             #     desc="Computing embeddings for Dataset 1",
+#             # )
+#             # Deduplicate
+#             status = "Deduplicating embeddings..."
+#             yield status, ""
+#             deduplicated_indices, duplicate_to_original_mapping = deduplicate(
+#                 embedding_matrix, threshold, progress=progress
+#             )
+#             # Prepare the results
+#             num_duplicates = len(duplicate_to_original_mapping)
+#             num_total = len(texts)
+#             num_deduplicated = len(deduplicated_indices)
+#             result_text = f"**Total documents:** {num_total}\n"
+#             result_text += f"**Number of duplicates found:** {num_duplicates}\n"
+#             result_text += (
+#                 f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
+#             )
+#             # Show deduplicated examples
+#             if num_duplicates > 0:
+#                 result_text += "**Examples of duplicates found:**\n\n"
+#                 num_examples = min(5, num_duplicates)
+#                 for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
+#                     original_text = texts[original_idx]
+#                     duplicate_text = texts[duplicate_idx]
+#                     differences = display_word_differences(original_text, duplicate_text)
+#                     result_text += f"**Original text:**\n{original_text}\n\n"
+#                     result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
+#                     result_text += f"**Differences:**\n{differences}\n"
+#                     result_text += "-" * 50 + "\n\n"
+#             else:
+#                 result_text += "No duplicates found."
+#             # Final status
+#             status = "Deduplication completed."
+#             yield status, result_text
+#         elif deduplication_type == "Cross-dataset":
+#             # Similar code for cross-dataset deduplication
+#             # Load Dataset 1
+#             status = "Loading Dataset 1..."
+#             yield status, ""
+#             if (
+#                 dataset1_name == default_dataset1_name
+#                 and dataset1_split == default_dataset1_split
+#             ):
+#                 ds1 = ds_default1
+#             else:
+#                 ds1 = load_dataset(dataset1_name, split=dataset1_split)
+#             # Load Dataset 2
+#             status = "Loading Dataset 2..."
+#             yield status, ""
+#             if (
+#                 dataset2_name == default_dataset2_name
+#                 and dataset2_split == default_dataset2_split
+#             ):
+#                 ds2 = ds_default2
+#             else:
+#                 ds2 = load_dataset(dataset2_name, split=dataset2_split)
+#             # Extract texts from Dataset 1
+#             status = "Extracting texts from Dataset 1..."
+#             yield status, ""
+#             texts1 = [example[dataset1_text_column] for example in ds1]
+#             # Extract texts from Dataset 2
+#             status = "Extracting texts from Dataset 2..."
+#             yield status, ""
+#             texts2 = [example[dataset2_text_column] for example in ds2]
+#             # Compute embeddings for Dataset 1
+#             status = "Computing embeddings for Dataset 1..."
+#             yield status, ""
+#             embedding_matrix1 = compute_embeddings(
+#                 texts1,
+#                 batch_size=64,
+#                 progress=progress,
+#                 desc="Computing embeddings for Dataset 1",
+#             )
+#             # Compute embeddings for Dataset 2
+#             status = "Computing embeddings for Dataset 2..."
+#             yield status, ""
+#             embedding_matrix2 = compute_embeddings(
+#                 texts2,
+#                 batch_size=64,
+#                 progress=progress,
+#                 desc="Computing embeddings for Dataset 2",
+#             )
+#             # Deduplicate across datasets
+#             status = "Deduplicating embeddings across datasets..."
+#             yield status, ""
+#             duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
+#                 embedding_matrix1, embedding_matrix2, threshold, progress=progress
+#             )
+#             num_duplicates = len(duplicate_indices_in_ds2)
+#             num_total_ds2 = len(texts2)
+#             num_unique_ds2 = num_total_ds2 - num_duplicates
+#             result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
+#             result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
+#             result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
+#             # Show deduplicated examples
+#             if num_duplicates > 0:
+#                 result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
+#                 num_examples = min(5, num_duplicates)
+#                 for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
+#                     original_idx = duplicate_to_original_mapping[duplicate_idx]
+#                     original_text = texts1[original_idx]
+#                     duplicate_text = texts2[duplicate_idx]
+#                     differences = display_word_differences(original_text, duplicate_text)
+#                     result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
+#                     result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
+#                     result_text += f"**Differences:**\n{differences}\n"
+#                     result_text += "-" * 50 + "\n\n"
+#             else:
+#                 result_text += "No duplicates found."
+#             # Final status
+#             status = "Deduplication completed."
+#             yield status, result_text
+#     except Exception as e:
+#         yield f"An error occurred: {e}", ""
+#         raise e
+# def deduplicate_across_datasets(
+#     embedding_matrix_1: np.ndarray,
+#     embedding_matrix_2: np.ndarray,
+#     threshold: float,
+#     batch_size: int = 1024,
+#     progress=None
+# ) -> tuple[list[int], dict[int, int]]:
+#     # Building the index from Dataset 1
+#     progress(0, desc="Building search index from Dataset 1...")
+#     reach = Reach(
+#         vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))]
+#     )
+#     duplicate_indices_in_test = []
+#     duplicate_to_original_mapping = {}
+#     # Finding nearest neighbors between datasets
+#     progress(0, desc="Finding nearest neighbors between datasets...")
+#     results = reach.nearest_neighbor_threshold(
+#         embedding_matrix_2,
+#         threshold=threshold,
+#         batch_size=batch_size,
+#         show_progressbar=False,  # Disable internal progress bar
+#     )
+#     total_items = len(embedding_matrix_2)
+#     # Processing duplicates with a progress bar
+#     for i, similar_items in enumerate(
+#         progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)
+#     ):
+#         similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
+#         if similar_indices:
+#             duplicate_indices_in_test.append(i)
+#             duplicate_to_original_mapping[i] = similar_indices[0]
+#     return duplicate_indices_in_test, duplicate_to_original_mapping
+# # Adjust the height of the status_output component using custom CSS
+# with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
+#     gr.Markdown("# Semantic Deduplication")
+#     deduplication_type = gr.Radio(
+#         choices=["Single dataset", "Cross-dataset"],
+#         label="Deduplication Type",
+#         value="Single dataset",
+#     )
+#     with gr.Row():
+#         dataset1_name = gr.Textbox(value=default_dataset1_name, label="Dataset 1 Name")
+#         dataset1_split = gr.Textbox(value=default_dataset1_split, label="Dataset 1 Split")
+#         dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
+#     dataset2_inputs = gr.Column(visible=False)
+#     with dataset2_inputs:
+#         gr.Markdown("### Dataset 2")
+#         with gr.Row():
+#             dataset2_name = gr.Textbox(value=default_dataset2_name, label="Dataset 2 Name")
+#             dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
+#             dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
+#     threshold = gr.Slider(
+#         minimum=0.0, maximum=1.0, value=default_threshold, label="Similarity Threshold"
+#     )
+#     compute_button = gr.Button("Compute")
+#     # Use 'gr.Markdown' with 'elem_id' and custom CSS to adjust height
+#     status_output = gr.Markdown(elem_id="status_output")
+#     result_output = gr.Markdown()
+#     # Function to update the visibility of dataset2_inputs
+#     def update_visibility(deduplication_type_value):
+#         if deduplication_type_value == "Cross-dataset":
+#             return gr.update(visible=True)
+#         else:
+#             return gr.update(visible=False)
+#     deduplication_type.change(
+#         update_visibility, inputs=deduplication_type, outputs=dataset2_inputs
+#     )
+#     compute_button.click(
+#         fn=perform_deduplication,
+#         inputs=[
+#             deduplication_type,
+#             dataset1_name,
+#             dataset1_split,
+#             dataset1_text_column,
+#             dataset2_name,
+#             dataset2_split,
+#             dataset2_text_column,
+#             threshold,
+#         ],
+#         outputs=[status_output, result_output],
+#     )
+# demo.launch()