Pringled committed
Commit c58907b · 1 Parent(s): 1a5f99b
Files changed (1)
  1. app.py +897 -541
app.py CHANGED
@@ -4,12 +4,11 @@ import numpy as np
 from model2vec import StaticModel
 from reach import Reach
 from difflib import ndiff
-import tqdm
 
 # Load the model at startup
 model = StaticModel.from_pretrained("minishlab/M2V_base_output")
 
-# Update default dataset to 'sst2' and set default threshold to 0.9
 default_dataset1_name = "sst2"
 default_dataset1_split = "train"
 default_dataset2_name = "sst2"
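For context, a brief sketch (not part of the diff) of how these defaults are consumed later in app.py: the sst2 split is loaded with datasets.load_dataset, its "sentence" column is extracted, and the model encodes the texts. Every call below mirrors one that already appears in the file.

```python
from datasets import load_dataset
from model2vec import StaticModel

# Mirror of the startup code: load the model and the default dataset split.
model = StaticModel.from_pretrained("minishlab/M2V_base_output")
ds_default1 = load_dataset("sst2", split="train")          # default_dataset1_name / default_dataset1_split

# Pull the text column and embed it (same calls used later in the file).
texts = [example["sentence"] for example in ds_default1]   # default_text_column
embeddings = model.encode(texts, show_progressbar=False)   # numpy array, one row per sentence
```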
@@ -28,29 +27,42 @@ def batch_iterable(iterable, batch_size):
 
 def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
     embeddings = []
-    for batch in progress.tqdm(batch_iterable(texts, batch_size), total=(len(texts) + batch_size - 1) // batch_size, desc=desc):
-        batch_embeddings = model.encode(batch, show_progressbar=False)
         embeddings.append(batch_embeddings)
     return np.concatenate(embeddings, axis=0)
 
-def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
-    """
-    Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
-    """
-    reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
 
     deduplicated_indices = set(range(len(embedding_matrix)))
     duplicate_to_original_mapping = {}
 
     results = reach.nearest_neighbor_threshold(
         embedding_matrix,
         threshold=threshold,
         batch_size=batch_size,
-        show_progressbar=False
     )
 
     total_items = len(embedding_matrix)
-    for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
         if i not in deduplicated_indices:
             continue
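The deduplicate function being rewritten in this hunk keeps one representative per group of near-duplicates: it indexes the embeddings with Reach, queries them against themselves with a similarity threshold, and drops every neighbour that clears it. A minimal, self-contained sketch of that pattern, using only the Reach and model2vec calls the file already makes (the toy sentences are made up):

```python
import numpy as np
from model2vec import StaticModel
from reach import Reach

model = StaticModel.from_pretrained("minishlab/M2V_base_output")
texts = ["a great movie", "a great film", "terrible acting"]   # toy input
emb = model.encode(texts, show_progressbar=False)

# Index the embeddings and query them against themselves.
reach = Reach(vectors=emb, items=[str(i) for i in range(len(emb))])
results = reach.nearest_neighbor_threshold(
    emb, threshold=0.9, batch_size=1024, show_progressbar=False
)

kept = set(range(len(emb)))
duplicate_to_original = {}
for i, similar_items in enumerate(results):
    if i not in kept:
        continue  # already marked as a duplicate of an earlier item
    for item in similar_items:          # item is (index-as-string, similarity)
        j = int(item[0])
        if j != i and j in kept:
            kept.remove(j)
            duplicate_to_original[j] = i

print(sorted(kept), duplicate_to_original)
```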
 
@@ -63,35 +75,9 @@ def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int
 
     return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
 
-def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
-    """
-    Deduplicate embeddings across two datasets and return the indices of duplicates between them.
-    """
-    reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
-
-    duplicate_indices_in_test = []
-    duplicate_to_original_mapping = {}
-
-    results = reach.nearest_neighbor_threshold(
-        embedding_matrix_2,
-        threshold=threshold,
-        batch_size=batch_size,
-        show_progressbar=False
-    )
-
-    total_items = len(embedding_matrix_2)
-    for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
-        similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
-
-        if similar_indices:
-            duplicate_indices_in_test.append(i)
-            duplicate_to_original_mapping[i] = similar_indices[0]
-
-    return duplicate_indices_in_test, duplicate_to_original_mapping
-
 def display_word_differences(x: str, y: str) -> str:
     diff = ndiff(x.split(), y.split())
-    return " ".join([word for word in diff if word.startswith(('+', '-'))])
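display_word_differences relies on difflib.ndiff over the two word lists and keeps only the words tagged + or -. A small usage example:

```python
from difflib import ndiff

def display_word_differences(x: str, y: str) -> str:
    diff = ndiff(x.split(), y.split())
    return " ".join([word for word in diff if word.startswith(("+", "-"))])

print(display_word_differences("a great movie overall", "a great film overall"))
# -> "- movie + film"
```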
 
 def perform_deduplication(
     deduplication_type,
@@ -102,26 +88,61 @@ def perform_deduplication(
     dataset2_split="",
     dataset2_text_column="",
     threshold=default_threshold,
-    progress=gr.Progress(track_tqdm=True)
 ):
     try:
         threshold = float(threshold)
 
         if deduplication_type == "Single dataset":
-            ds = ds_default1 if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split else load_dataset(dataset1_name, split=dataset1_split)
-            texts = [example[dataset1_text_column] for example in ds]
 
-            embedding_matrix = compute_embeddings(texts, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
-            deduplicated_indices, duplicate_to_original_mapping = deduplicate(embedding_matrix, threshold, progress=progress)
 
             num_duplicates = len(duplicate_to_original_mapping)
             num_total = len(texts)
             num_deduplicated = len(deduplicated_indices)
 
             result_text = f"**Total documents:** {num_total}\n"
             result_text += f"**Number of duplicates found:** {num_duplicates}\n"
-            result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
 
             if num_duplicates > 0:
                 result_text += "**Examples of duplicates found:**\n\n"
                 num_examples = min(5, num_duplicates)
@@ -136,19 +157,70 @@ def perform_deduplication(
             else:
                 result_text += "No duplicates found."
 
-            yield result_text
 
         elif deduplication_type == "Cross-dataset":
-            ds1 = ds_default1 if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split else load_dataset(dataset1_name, split=dataset1_split)
-            ds2 = ds_default2 if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split else load_dataset(dataset2_name, split=dataset2_split)
 
             texts1 = [example[dataset1_text_column] for example in ds1]
-            texts2 = [example[dataset2_text_column] for example in ds2]
 
-            embedding_matrix1 = compute_embeddings(texts1, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
-            embedding_matrix2 = compute_embeddings(texts2, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 2")
 
-            duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(embedding_matrix1, embedding_matrix2, threshold, progress=progress)
 
             num_duplicates = len(duplicate_indices_in_ds2)
             num_total_ds2 = len(texts2)
@@ -158,6 +230,7 @@ def perform_deduplication(
             result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
             result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
 
             if num_duplicates > 0:
                 result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
                 num_examples = min(5, num_duplicates)
@@ -173,19 +246,60 @@ def perform_deduplication(
             else:
                 result_text += "No duplicates found."
 
-            yield result_text
 
     except Exception as e:
         yield f"An error occurred: {e}", ""
 
-# Adjust the height of the status_output and result_output components
-with gr.Blocks(css="#status_output { height: 300px; overflow: auto; } #result_output { height: 300px; overflow: auto; }") as demo:
     gr.Markdown("# Semantic Deduplication")
 
     deduplication_type = gr.Radio(
         choices=["Single dataset", "Cross-dataset"],
         label="Deduplication Type",
-        value="Single dataset"
     )
 
     with gr.Row():
@@ -202,17 +316,16 @@ with gr.Blocks(css="#status_output { height: 300px; overflow: auto; } #result_ou
             dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
 
     threshold = gr.Slider(
-        minimum=0.0,
-        maximum=1.0,
-        value=default_threshold,
-        label="Similarity Threshold"
     )
 
     compute_button = gr.Button("Compute")
 
     status_output = gr.Markdown(elem_id="status_output")
-    result_output = gr.Markdown(elem_id="result_output")
 
     def update_visibility(deduplication_type_value):
         if deduplication_type_value == "Cross-dataset":
             return gr.update(visible=True)
@@ -220,9 +333,7 @@ with gr.Blocks(css="#status_output { height: 300px; overflow: auto; } #result_ou
             return gr.update(visible=False)
 
     deduplication_type.change(
-        update_visibility,
-        inputs=deduplication_type,
-        outputs=dataset2_inputs
     )
 
     compute_button.click(
@@ -235,13 +346,14 @@ with gr.Blocks(css="#status_output { height: 300px; overflow: auto; } #result_ou
             dataset2_name,
             dataset2_split,
             dataset2_text_column,
-            threshold
         ],
-        outputs=[status_output, result_output]
     )
 
 demo.launch()
 
 # import gradio as gr
 # from datasets import load_dataset
 # import numpy as np
@@ -281,23 +393,18 @@ demo.launch()
281
  # """
282
  # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
283
  # """
284
- # # Building the index
285
- # progress(0, desc="Building search index...")
286
  # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
287
 
288
  # deduplicated_indices = set(range(len(embedding_matrix)))
289
  # duplicate_to_original_mapping = {}
290
 
291
- # # Finding nearest neighbors
292
- # progress(0, desc="Finding nearest neighbors...")
293
  # results = reach.nearest_neighbor_threshold(
294
  # embedding_matrix,
295
  # threshold=threshold,
296
  # batch_size=batch_size,
297
- # show_progressbar=False # Disable internal progress bar
298
  # )
299
 
300
- # # Processing duplicates with a progress bar
301
  # total_items = len(embedding_matrix)
302
  # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
303
  # if i not in deduplicated_indices:
@@ -316,24 +423,19 @@ demo.launch()
316
  # """
317
  # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
318
  # """
319
- # # Building the index from Dataset 1
320
- # progress(0, desc="Building search index from Dataset 1...")
321
  # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
322
 
323
  # duplicate_indices_in_test = []
324
  # duplicate_to_original_mapping = {}
325
 
326
- # # Finding nearest neighbors between datasets
327
- # progress(0, desc="Finding nearest neighbors between datasets...")
328
  # results = reach.nearest_neighbor_threshold(
329
  # embedding_matrix_2,
330
  # threshold=threshold,
331
  # batch_size=batch_size,
332
- # show_progressbar=False # Disable internal progress bar
333
  # )
334
 
335
  # total_items = len(embedding_matrix_2)
336
- # # Processing duplicates with a progress bar
337
  # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
338
  # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
339
 
@@ -359,39 +461,15 @@ demo.launch()
359
  # progress=gr.Progress(track_tqdm=True)
360
  # ):
361
  # try:
362
- # # Convert threshold to float
363
  # threshold = float(threshold)
364
 
365
- # # Initialize status message
366
- # status = ""
367
-
368
  # if deduplication_type == "Single dataset":
369
- # # Load Dataset 1
370
- # status = "Loading Dataset 1..."
371
- # yield status, ""
372
- # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
373
- # ds = ds_default1
374
- # else:
375
- # ds = load_dataset(dataset1_name, split=dataset1_split)
376
-
377
- # # Extract texts
378
- # status = "Extracting texts from Dataset 1..."
379
- # yield status, ""
380
  # texts = [example[dataset1_text_column] for example in ds]
381
 
382
- # # Compute embeddings
383
- # status = "Computing embeddings for Dataset 1..."
384
- # yield status, ""
385
  # embedding_matrix = compute_embeddings(texts, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
 
386
 
387
- # # Deduplicate
388
- # status = "Deduplicating embeddings..."
389
- # yield status, ""
390
- # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
391
- # embedding_matrix, threshold, progress=progress
392
- # )
393
-
394
- # # Prepare the results
395
  # num_duplicates = len(duplicate_to_original_mapping)
396
  # num_total = len(texts)
397
  # num_deduplicated = len(deduplicated_indices)
@@ -400,7 +478,6 @@ demo.launch()
400
  # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
401
  # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
402
 
403
- # # Show deduplicated examples
404
  # if num_duplicates > 0:
405
  # result_text += "**Examples of duplicates found:**\n\n"
406
  # num_examples = min(5, num_duplicates)
@@ -415,53 +492,19 @@ demo.launch()
415
  # else:
416
  # result_text += "No duplicates found."
417
 
418
- # # Final status
419
- # status = "Deduplication completed."
420
- # yield status, result_text
421
 
422
  # elif deduplication_type == "Cross-dataset":
423
- # # Load Dataset 1
424
- # status = "Loading Dataset 1..."
425
- # yield status, ""
426
- # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
427
- # ds1 = ds_default1
428
- # else:
429
- # ds1 = load_dataset(dataset1_name, split=dataset1_split)
430
-
431
- # # Load Dataset 2
432
- # status = "Loading Dataset 2..."
433
- # yield status, ""
434
- # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
435
- # ds2 = ds_default2
436
- # else:
437
- # ds2 = load_dataset(dataset2_name, split=dataset2_split)
438
 
439
- # # Extract texts from Dataset 1
440
- # status = "Extracting texts from Dataset 1..."
441
- # yield status, ""
442
  # texts1 = [example[dataset1_text_column] for example in ds1]
443
-
444
- # # Extract texts from Dataset 2
445
- # status = "Extracting texts from Dataset 2..."
446
- # yield status, ""
447
  # texts2 = [example[dataset2_text_column] for example in ds2]
448
 
449
- # # Compute embeddings for Dataset 1
450
- # status = "Computing embeddings for Dataset 1..."
451
- # yield status, ""
452
  # embedding_matrix1 = compute_embeddings(texts1, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
453
-
454
- # # Compute embeddings for Dataset 2
455
- # status = "Computing embeddings for Dataset 2..."
456
- # yield status, ""
457
  # embedding_matrix2 = compute_embeddings(texts2, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 2")
458
 
459
- # # Deduplicate across datasets
460
- # status = "Deduplicating embeddings across datasets..."
461
- # yield status, ""
462
- # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
463
- # embedding_matrix1, embedding_matrix2, threshold, progress=progress
464
- # )
465
 
466
  # num_duplicates = len(duplicate_indices_in_ds2)
467
  # num_total_ds2 = len(texts2)
@@ -471,7 +514,6 @@ demo.launch()
471
  # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
472
  # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
473
 
474
- # # Show deduplicated examples
475
  # if num_duplicates > 0:
476
  # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
477
  # num_examples = min(5, num_duplicates)
@@ -487,15 +529,13 @@ demo.launch()
487
  # else:
488
  # result_text += "No duplicates found."
489
 
490
- # # Final status
491
- # status = "Deduplication completed."
492
- # yield status, result_text
493
 
494
  # except Exception as e:
495
  # yield f"An error occurred: {e}", ""
496
- # raise e
497
 
498
- # with gr.Blocks() as demo:
 
499
  # gr.Markdown("# Semantic Deduplication")
500
 
501
  # deduplication_type = gr.Radio(
@@ -526,10 +566,9 @@ demo.launch()
526
 
527
  # compute_button = gr.Button("Compute")
528
 
529
- # status_output = gr.Markdown()
530
- # result_output = gr.Markdown()
531
 
532
- # # Function to update the visibility of dataset2_inputs
533
  # def update_visibility(deduplication_type_value):
534
  # if deduplication_type_value == "Cross-dataset":
535
  # return gr.update(visible=True)
@@ -559,178 +598,322 @@ demo.launch()
559
 
560
  # demo.launch()
561
 
 
 
 
 
 
 
 
562
 
563
- # import gradio as gr
564
- # from datasets import load_dataset
565
- # import numpy as np
566
- # import model2vec
567
- # from reach import Reach
568
- # from difflib import ndiff
569
 
570
- # # Load the model at startup
571
- # model = model2vec.StaticModel.from_pretrained("minishlab/M2V_base_output")
 
 
 
 
 
572
 
573
- # # Default dataset parameters
574
- # default_dataset1_name = "sst2"
575
- # default_dataset1_split = "train"
576
- # default_dataset2_name = "sst2"
577
- # default_dataset2_split = "validation"
578
- # default_text_column = "sentence"
579
- # default_threshold = 0.9
580
 
581
- # # Load the default datasets at startup
582
- # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
583
- # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
 
584
 
585
- # def batch_iterable(iterable, batch_size):
586
- # """Helper function to create batches from an iterable."""
587
- # for i in range(0, len(iterable), batch_size):
588
- # yield iterable[i:i + batch_size]
 
 
589
 
590
- # def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
591
- # embeddings = []
592
- # total_batches = (len(texts) + batch_size - 1) // batch_size
593
- # for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
594
- # batch_embeddings = model.encode(batch_texts, show_progressbar=False)
595
- # embeddings.append(batch_embeddings)
596
- # progress((i + 1) / total_batches, desc=desc)
597
- # return np.concatenate(embeddings, axis=0)
598
 
599
- # def deduplicate(
600
- # embedding_matrix: np.ndarray,
601
- # threshold: float,
602
- # batch_size: int = 1024,
603
- # progress=None
604
- # ) -> tuple[np.ndarray, dict[int, int]]:
605
- # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
606
 
607
- # deduplicated_indices = set(range(len(embedding_matrix)))
608
- # duplicate_to_original_mapping = {}
 
 
 
 
 
 
609
 
610
- # results = reach.nearest_neighbor_threshold(
611
- # embedding_matrix,
612
- # threshold=threshold,
613
- # batch_size=batch_size,
614
- # show_progressbar=False,
615
- # )
616
 
617
- # total_items = len(embedding_matrix)
618
- # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
619
- # if i not in deduplicated_indices:
620
- # continue
621
 
622
- # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
623
- # for sim_idx in similar_indices:
624
- # if sim_idx in deduplicated_indices:
625
- # deduplicated_indices.remove(sim_idx)
626
- # duplicate_to_original_mapping[sim_idx] = i
627
 
628
- # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
629
 
630
- # def display_word_differences(x: str, y: str) -> str:
631
- # diff = ndiff(x.split(), y.split())
632
- # return " ".join([word for word in diff if word.startswith(("+", "-"))])
 
 
 
 
633
 
634
- # def perform_deduplication(
635
- # deduplication_type,
636
- # dataset1_name,
637
- # dataset1_split,
638
- # dataset1_text_column,
639
- # dataset2_name="",
640
- # dataset2_split="",
641
- # dataset2_text_column="",
642
- # threshold=default_threshold,
643
- # progress=gr.Progress(track_tqdm=True),
644
- # ):
645
- # try:
646
- # threshold = float(threshold)
647
 
648
- # if deduplication_type == "Single dataset":
649
- # ds = ds_default1 if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split else load_dataset(dataset1_name, split=dataset1_split)
650
- # texts = [example[dataset1_text_column] for example in ds]
 
 
 
 
 
651
 
652
- # embedding_matrix = compute_embeddings(texts, batch_size=64, progress=progress)
653
- # deduplicated_indices, duplicate_to_original_mapping = deduplicate(embedding_matrix, threshold, progress=progress)
 
 
654
 
655
- # num_duplicates = len(duplicate_to_original_mapping)
656
- # num_total = len(texts)
657
- # num_deduplicated = len(deduplicated_indices)
658
 
659
- # result_text = f"**Total documents:** {num_total}\n"
660
- # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
661
- # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
662
 
663
- # if num_duplicates > 0:
664
- # result_text += "**Examples of duplicates found:**\n\n"
665
- # num_examples = min(5, num_duplicates)
666
- # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
667
- # original_text = texts[original_idx]
668
- # duplicate_text = texts[duplicate_idx]
669
- # differences = display_word_differences(original_text, duplicate_text)
670
- # result_text += f"**Original text:**\n{original_text}\n\n"
671
- # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
672
- # result_text += f"**Differences:**\n{differences}\n"
673
- # result_text += "-" * 50 + "\n\n"
674
- # else:
675
- # result_text += "No duplicates found."
676
 
677
- # yield result_text
 
 
 
 
 
 
 
 
 
 
 
 
 
678
 
679
- # except Exception as e:
680
- # yield f"An error occurred: {e}"
681
 
682
- # # Gradio interface setup
683
- # with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
684
- # gr.Markdown("# Semantic Deduplication")
 
 
 
 
 
685
 
686
- # deduplication_type = gr.Radio(
687
- # choices=["Single dataset", "Cross-dataset"],
688
- # label="Deduplication Type",
689
- # value="Single dataset",
690
- # )
691
 
692
- # with gr.Row():
693
- # dataset1_name = gr.Textbox(value=default_dataset1_name, label="Dataset 1 Name")
694
- # dataset1_split = gr.Textbox(value=default_dataset1_split, label="Dataset 1 Split")
695
- # dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
696
 
697
- # dataset2_inputs = gr.Column(visible=False)
698
- # with dataset2_inputs:
699
- # gr.Markdown("### Dataset 2")
700
- # with gr.Row():
701
- # dataset2_name = gr.Textbox(value=default_dataset2_name, label="Dataset 2 Name")
702
- # dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
703
- # dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
704
 
705
- # threshold = gr.Slider(minimum=0.0, maximum=1.0, value=default_threshold, label="Similarity Threshold")
 
 
 
706
 
707
- # compute_button = gr.Button("Compute")
 
 
708
 
709
- # result_output = gr.Markdown()
 
 
 
 
 
 
 
 
 
 
 
 
 
710
 
711
- # def update_visibility(deduplication_type_value):
712
- # return gr.update(visible=True) if deduplication_type_value == "Cross-dataset" else gr.update(visible=False)
 
713
 
714
- # deduplication_type.change(
715
- # update_visibility, inputs=deduplication_type, outputs=dataset2_inputs
716
- # )
 
 
 
 
 
717
 
718
- # compute_button.click(
719
- # fn=perform_deduplication,
720
- # inputs=[
721
- # deduplication_type,
722
- # dataset1_name,
723
- # dataset1_split,
724
- # dataset1_text_column,
725
- # dataset2_name,
726
- # dataset2_split,
727
- # dataset2_text_column,
728
- # threshold,
729
- # ],
730
- # outputs=[result_output],
731
- # )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
732
 
733
- # demo.launch()
734
 
735
 
736
  # # import gradio as gr
@@ -739,7 +922,6 @@ demo.launch()
739
  # # import model2vec
740
  # # from reach import Reach
741
  # # from difflib import ndiff
742
- # # import time
743
 
744
  # # # Load the model at startup
745
  # # model = model2vec.StaticModel.from_pretrained("minishlab/M2V_base_output")
@@ -761,19 +943,7 @@ demo.launch()
761
  # # for i in range(0, len(iterable), batch_size):
762
  # # yield iterable[i:i + batch_size]
763
 
764
- # # def log_time(message, start_time=None, logs=None):
765
- # # """Helper function to log the start and end times."""
766
- # # current_time = time.time()
767
- # # if start_time is not None:
768
- # # elapsed = current_time - start_time
769
- # # log_message = f"{message} - Took {elapsed:.2f} seconds"
770
- # # else:
771
- # # log_message = f"{message} - Started"
772
-
773
- # # if logs is not None:
774
- # # logs.append(log_message)
775
-
776
- # # def compute_embeddings(texts, batch_size, progress, logs, desc="Computing embeddings"):
777
  # # embeddings = []
778
  # # total_batches = (len(texts) + batch_size - 1) // batch_size
779
  # # for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
@@ -786,38 +956,26 @@ demo.launch()
786
  # # embedding_matrix: np.ndarray,
787
  # # threshold: float,
788
  # # batch_size: int = 1024,
789
- # # progress=None,
790
- # # logs=None
791
  # # ) -> tuple[np.ndarray, dict[int, int]]:
792
- # # # Building the index
793
- # # log_time("Building search index", logs=logs)
794
- # # reach = Reach(
795
- # # vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))]
796
- # # )
797
 
798
  # # deduplicated_indices = set(range(len(embedding_matrix)))
799
  # # duplicate_to_original_mapping = {}
800
 
801
- # # # Finding nearest neighbors
802
- # # log_time("Finding nearest neighbors", logs=logs)
803
  # # results = reach.nearest_neighbor_threshold(
804
  # # embedding_matrix,
805
  # # threshold=threshold,
806
  # # batch_size=batch_size,
807
- # # show_progressbar=False, # Disable internal progress bar
808
  # # )
809
 
810
- # # # Processing duplicates with a progress bar
811
  # # total_items = len(embedding_matrix)
812
- # # log_time("Processing duplicates", logs=logs)
813
- # # for i, similar_items in enumerate(
814
- # # progress.tqdm(results, desc="Processing duplicates", total=total_items)
815
- # # ):
816
  # # if i not in deduplicated_indices:
817
  # # continue
818
 
819
  # # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
820
-
821
  # # for sim_idx in similar_indices:
822
  # # if sim_idx in deduplicated_indices:
823
  # # deduplicated_indices.remove(sim_idx)
@@ -829,11 +987,6 @@ demo.launch()
829
  # # diff = ndiff(x.split(), y.split())
830
  # # return " ".join([word for word in diff if word.startswith(("+", "-"))])
831
 
832
- # # def encode_texts(texts, progress=None, logs=None):
833
- # # embedding_matrix = model.encode(texts, show_progressbar=False)
834
- # # log_time("Encoding texts completed", logs=logs)
835
- # # return embedding_matrix
836
-
837
  # # def perform_deduplication(
838
  # # deduplication_type,
839
  # # dataset1_name,
@@ -845,59 +998,24 @@ demo.launch()
845
  # # threshold=default_threshold,
846
  # # progress=gr.Progress(track_tqdm=True),
847
  # # ):
848
- # # logs = [] # To store log messages
849
  # # try:
850
- # # # Convert threshold to float
851
  # # threshold = float(threshold)
852
 
853
- # # # Initialize status message
854
- # # log_time("Deduplication started", logs=logs)
855
-
856
  # # if deduplication_type == "Single dataset":
857
- # # # Load Dataset 1
858
- # # start_time = time.time()
859
- # # log_time("Loading Dataset 1", logs=logs)
860
- # # if (
861
- # # dataset1_name == default_dataset1_name
862
- # # and dataset1_split == default_dataset1_split
863
- # # ):
864
- # # ds = ds_default1
865
- # # else:
866
- # # ds = load_dataset(dataset1_name, split=dataset1_split)
867
- # # log_time("Loading Dataset 1 completed", start_time=start_time, logs=logs)
868
-
869
- # # # Extract texts
870
- # # start_time = time.time()
871
- # # log_time("Extracting texts from Dataset 1", logs=logs)
872
  # # texts = [example[dataset1_text_column] for example in ds]
873
- # # log_time("Extracting texts from Dataset 1 completed", start_time=start_time, logs=logs)
874
-
875
- # # # Compute embeddings
876
- # # start_time = time.time()
877
- # # log_time("Computing embeddings for Dataset 1", logs=logs)
878
- # # embedding_matrix = encode_texts(texts, progress=progress, logs=logs)
879
- # # log_time("Computing embeddings for Dataset 1 completed", start_time=start_time, logs=logs)
880
 
881
- # # # Deduplicate
882
- # # start_time = time.time()
883
- # # log_time("Deduplicating embeddings", logs=logs)
884
- # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
885
- # # embedding_matrix, threshold, progress=progress, logs=logs
886
- # # )
887
- # # log_time("Deduplication completed", start_time=start_time, logs=logs)
888
 
889
- # # # Prepare the results
890
  # # num_duplicates = len(duplicate_to_original_mapping)
891
  # # num_total = len(texts)
892
  # # num_deduplicated = len(deduplicated_indices)
893
 
894
  # # result_text = f"**Total documents:** {num_total}\n"
895
  # # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
896
- # # result_text += (
897
- # # f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
898
- # # )
899
 
900
- # # # Show deduplicated examples
901
  # # if num_duplicates > 0:
902
  # # result_text += "**Examples of duplicates found:**\n\n"
903
  # # num_examples = min(5, num_duplicates)
@@ -912,16 +1030,12 @@ demo.launch()
912
  # # else:
913
  # # result_text += "No duplicates found."
914
 
915
- # # log_time("Deduplication process finished", logs=logs)
916
- # # full_log = "\n".join(logs) # Combine all logs into one output
917
- # # yield full_log, result_text
918
 
919
  # # except Exception as e:
920
- # # full_log = "\n".join(logs) # Combine all logs into one output in case of an error
921
- # # yield f"An error occurred: {e}", ""
922
- # # raise e
923
 
924
- # # # Adjust the height of the status_output component using custom CSS
925
  # # with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
926
  # # gr.Markdown("# Semantic Deduplication")
927
 
@@ -944,22 +1058,14 @@ demo.launch()
944
  # # dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
945
  # # dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
946
 
947
- # # threshold = gr.Slider(
948
- # # minimum=0.0, maximum=1.0, value=default_threshold, label="Similarity Threshold"
949
- # # )
950
 
951
  # # compute_button = gr.Button("Compute")
952
 
953
- # # # Use 'gr.Markdown' with 'elem_id' and custom CSS to adjust height
954
- # # status_output = gr.Markdown(elem_id="status_output")
955
  # # result_output = gr.Markdown()
956
 
957
- # # # Function to update the visibility of dataset2_inputs
958
  # # def update_visibility(deduplication_type_value):
959
- # # if deduplication_type_value == "Cross-dataset":
960
- # # return gr.update(visible=True)
961
- # # else:
962
- # # return gr.update(visible=False)
963
 
964
  # # deduplication_type.change(
965
  # # update_visibility, inputs=deduplication_type, outputs=dataset2_inputs
@@ -977,21 +1083,19 @@ demo.launch()
977
  # # dataset2_text_column,
978
  # # threshold,
979
  # # ],
980
- # # outputs=[status_output, result_output],
981
  # # )
982
 
983
  # # demo.launch()
984
 
985
 
986
-
987
  # # # import gradio as gr
988
  # # # from datasets import load_dataset
989
  # # # import numpy as np
990
- # # # #from model2vec import StaticModel
991
  # # # import model2vec
992
  # # # from reach import Reach
993
  # # # from difflib import ndiff
994
-
995
 
996
  # # # # Load the model at startup
997
  # # # model = model2vec.StaticModel.from_pretrained("minishlab/M2V_base_output")
@@ -1008,13 +1112,24 @@ demo.launch()
1008
  # # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
1009
  # # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
1010
 
1011
-
1012
  # # # def batch_iterable(iterable, batch_size):
1013
  # # # """Helper function to create batches from an iterable."""
1014
  # # # for i in range(0, len(iterable), batch_size):
1015
  # # # yield iterable[i:i + batch_size]
1016
 
1017
- # # # def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
 
 
 
 
 
 
 
 
 
 
 
 
1018
  # # # embeddings = []
1019
  # # # total_batches = (len(texts) + batch_size - 1) // batch_size
1020
  # # # for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
@@ -1027,10 +1142,11 @@ demo.launch()
1027
  # # # embedding_matrix: np.ndarray,
1028
  # # # threshold: float,
1029
  # # # batch_size: int = 1024,
1030
- # # # progress=None
 
1031
  # # # ) -> tuple[np.ndarray, dict[int, int]]:
1032
  # # # # Building the index
1033
- # # # progress(0, desc="Building search index...")
1034
  # # # reach = Reach(
1035
  # # # vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))]
1036
  # # # )
@@ -1039,7 +1155,7 @@ demo.launch()
1039
  # # # duplicate_to_original_mapping = {}
1040
 
1041
  # # # # Finding nearest neighbors
1042
- # # # progress(0, desc="Finding nearest neighbors...")
1043
  # # # results = reach.nearest_neighbor_threshold(
1044
  # # # embedding_matrix,
1045
  # # # threshold=threshold,
@@ -1049,6 +1165,7 @@ demo.launch()
1049
 
1050
  # # # # Processing duplicates with a progress bar
1051
  # # # total_items = len(embedding_matrix)
 
1052
  # # # for i, similar_items in enumerate(
1053
  # # # progress.tqdm(results, desc="Processing duplicates", total=total_items)
1054
  # # # ):
@@ -1068,9 +1185,9 @@ demo.launch()
1068
  # # # diff = ndiff(x.split(), y.split())
1069
  # # # return " ".join([word for word in diff if word.startswith(("+", "-"))])
1070
 
1071
-
1072
- # # # def encode_texts(texts, progress=None):
1073
  # # # embedding_matrix = model.encode(texts, show_progressbar=False)
 
1074
  # # # return embedding_matrix
1075
 
1076
  # # # def perform_deduplication(
@@ -1084,17 +1201,18 @@ demo.launch()
1084
  # # # threshold=default_threshold,
1085
  # # # progress=gr.Progress(track_tqdm=True),
1086
  # # # ):
 
1087
  # # # try:
1088
  # # # # Convert threshold to float
1089
  # # # threshold = float(threshold)
1090
 
1091
  # # # # Initialize status message
1092
- # # # status = ""
1093
 
1094
  # # # if deduplication_type == "Single dataset":
1095
  # # # # Load Dataset 1
1096
- # # # status = "Loading Dataset 1..."
1097
- # # # yield status, ""
1098
  # # # if (
1099
  # # # dataset1_name == default_dataset1_name
1100
  # # # and dataset1_split == default_dataset1_split
@@ -1102,29 +1220,27 @@ demo.launch()
1102
  # # # ds = ds_default1
1103
  # # # else:
1104
  # # # ds = load_dataset(dataset1_name, split=dataset1_split)
 
1105
 
1106
  # # # # Extract texts
1107
- # # # status = "Extracting texts from Dataset 1..."
1108
- # # # yield status, ""
1109
  # # # texts = [example[dataset1_text_column] for example in ds]
 
 
1110
  # # # # Compute embeddings
1111
- # # # status = "Computing embeddings for Dataset 1..."
1112
- # # # yield status, ""
1113
- # # # embedding_matrix = encode_texts(texts, progress=progress)
1114
- # # # #embedding_matrix = model.encode(texts, show_progressbar=True)
1115
- # # # # embedding_matrix = compute_embeddings(
1116
- # # # # texts,
1117
- # # # # batch_size=64,
1118
- # # # # progress=progress,
1119
- # # # # desc="Computing embeddings for Dataset 1",
1120
- # # # # )
1121
 
1122
  # # # # Deduplicate
1123
- # # # status = "Deduplicating embeddings..."
1124
- # # # yield status, ""
1125
  # # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
1126
- # # # embedding_matrix, threshold, progress=progress
1127
  # # # )
 
1128
 
1129
  # # # # Prepare the results
1130
  # # # num_duplicates = len(duplicate_to_original_mapping)
@@ -1152,141 +1268,15 @@ demo.launch()
1152
  # # # else:
1153
  # # # result_text += "No duplicates found."
1154
 
1155
- # # # # Final status
1156
- # # # status = "Deduplication completed."
1157
- # # # yield status, result_text
1158
-
1159
- # # # elif deduplication_type == "Cross-dataset":
1160
- # # # # Similar code for cross-dataset deduplication
1161
- # # # # Load Dataset 1
1162
- # # # status = "Loading Dataset 1..."
1163
- # # # yield status, ""
1164
- # # # if (
1165
- # # # dataset1_name == default_dataset1_name
1166
- # # # and dataset1_split == default_dataset1_split
1167
- # # # ):
1168
- # # # ds1 = ds_default1
1169
- # # # else:
1170
- # # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
1171
-
1172
- # # # # Load Dataset 2
1173
- # # # status = "Loading Dataset 2..."
1174
- # # # yield status, ""
1175
- # # # if (
1176
- # # # dataset2_name == default_dataset2_name
1177
- # # # and dataset2_split == default_dataset2_split
1178
- # # # ):
1179
- # # # ds2 = ds_default2
1180
- # # # else:
1181
- # # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
1182
-
1183
- # # # # Extract texts from Dataset 1
1184
- # # # status = "Extracting texts from Dataset 1..."
1185
- # # # yield status, ""
1186
- # # # texts1 = [example[dataset1_text_column] for example in ds1]
1187
-
1188
- # # # # Extract texts from Dataset 2
1189
- # # # status = "Extracting texts from Dataset 2..."
1190
- # # # yield status, ""
1191
- # # # texts2 = [example[dataset2_text_column] for example in ds2]
1192
-
1193
- # # # # Compute embeddings for Dataset 1
1194
- # # # status = "Computing embeddings for Dataset 1..."
1195
- # # # yield status, ""
1196
- # # # embedding_matrix1 = compute_embeddings(
1197
- # # # texts1,
1198
- # # # batch_size=64,
1199
- # # # progress=progress,
1200
- # # # desc="Computing embeddings for Dataset 1",
1201
- # # # )
1202
-
1203
- # # # # Compute embeddings for Dataset 2
1204
- # # # status = "Computing embeddings for Dataset 2..."
1205
- # # # yield status, ""
1206
- # # # embedding_matrix2 = compute_embeddings(
1207
- # # # texts2,
1208
- # # # batch_size=64,
1209
- # # # progress=progress,
1210
- # # # desc="Computing embeddings for Dataset 2",
1211
- # # # )
1212
-
1213
- # # # # Deduplicate across datasets
1214
- # # # status = "Deduplicating embeddings across datasets..."
1215
- # # # yield status, ""
1216
- # # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
1217
- # # # embedding_matrix1, embedding_matrix2, threshold, progress=progress
1218
- # # # )
1219
-
1220
- # # # num_duplicates = len(duplicate_indices_in_ds2)
1221
- # # # num_total_ds2 = len(texts2)
1222
- # # # num_unique_ds2 = num_total_ds2 - num_duplicates
1223
-
1224
- # # # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
1225
- # # # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
1226
- # # # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
1227
-
1228
- # # # # Show deduplicated examples
1229
- # # # if num_duplicates > 0:
1230
- # # # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
1231
- # # # num_examples = min(5, num_duplicates)
1232
- # # # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
1233
- # # # original_idx = duplicate_to_original_mapping[duplicate_idx]
1234
- # # # original_text = texts1[original_idx]
1235
- # # # duplicate_text = texts2[duplicate_idx]
1236
- # # # differences = display_word_differences(original_text, duplicate_text)
1237
- # # # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
1238
- # # # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
1239
- # # # result_text += f"**Differences:**\n{differences}\n"
1240
- # # # result_text += "-" * 50 + "\n\n"
1241
- # # # else:
1242
- # # # result_text += "No duplicates found."
1243
-
1244
- # # # # Final status
1245
- # # # status = "Deduplication completed."
1246
- # # # yield status, result_text
1247
 
1248
  # # # except Exception as e:
 
1249
  # # # yield f"An error occurred: {e}", ""
1250
  # # # raise e
1251
 
1252
- # # # def deduplicate_across_datasets(
1253
- # # # embedding_matrix_1: np.ndarray,
1254
- # # # embedding_matrix_2: np.ndarray,
1255
- # # # threshold: float,
1256
- # # # batch_size: int = 1024,
1257
- # # # progress=None
1258
- # # # ) -> tuple[list[int], dict[int, int]]:
1259
- # # # # Building the index from Dataset 1
1260
- # # # progress(0, desc="Building search index from Dataset 1...")
1261
- # # # reach = Reach(
1262
- # # # vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))]
1263
- # # # )
1264
-
1265
- # # # duplicate_indices_in_test = []
1266
- # # # duplicate_to_original_mapping = {}
1267
-
1268
- # # # # Finding nearest neighbors between datasets
1269
- # # # progress(0, desc="Finding nearest neighbors between datasets...")
1270
- # # # results = reach.nearest_neighbor_threshold(
1271
- # # # embedding_matrix_2,
1272
- # # # threshold=threshold,
1273
- # # # batch_size=batch_size,
1274
- # # # show_progressbar=False, # Disable internal progress bar
1275
- # # # )
1276
-
1277
- # # # total_items = len(embedding_matrix_2)
1278
- # # # # Processing duplicates with a progress bar
1279
- # # # for i, similar_items in enumerate(
1280
- # # # progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)
1281
- # # # ):
1282
- # # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
1283
-
1284
- # # # if similar_indices:
1285
- # # # duplicate_indices_in_test.append(i)
1286
- # # # duplicate_to_original_mapping[i] = similar_indices[0]
1287
-
1288
- # # # return duplicate_indices_in_test, duplicate_to_original_mapping
1289
-
1290
  # # # # Adjust the height of the status_output component using custom CSS
1291
  # # # with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
1292
  # # # gr.Markdown("# Semantic Deduplication")
@@ -1347,3 +1337,369 @@ demo.launch()
1347
  # # # )
1348
 
1349
  # # # demo.launch()
4
  from model2vec import StaticModel
5
  from reach import Reach
6
  from difflib import ndiff
 
7
 
8
  # Load the model at startup
9
  model = StaticModel.from_pretrained("minishlab/M2V_base_output")
10
 
11
+ # Default dataset parameters
12
  default_dataset1_name = "sst2"
13
  default_dataset1_split = "train"
14
  default_dataset2_name = "sst2"
 
27
 
28
  def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
29
  embeddings = []
30
+ total_batches = (len(texts) + batch_size - 1) // batch_size
31
+ for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
32
+ batch_embeddings = model.encode(batch_texts, show_progressbar=False)
33
  embeddings.append(batch_embeddings)
34
+ progress((i + 1) / total_batches, desc=desc)
35
  return np.concatenate(embeddings, axis=0)
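The rewritten compute_embeddings above reports progress by calling progress(fraction, desc=...) once per batch, so in the app it receives the gr.Progress object injected into perform_deduplication; outside Gradio any callable with that shape works. A small sketch for exercising it locally (the print-based stand-in is hypothetical, not part of the app, and assumes the compute_embeddings and model defined in this file):

```python
# Hypothetical stand-in for gr.Progress, just for a local smoke test.
def fake_progress(fraction, desc=""):
    print(f"{desc}: {fraction:.0%}")

sample_texts = ["first sentence", "second sentence", "third sentence"]
emb = compute_embeddings(sample_texts, batch_size=2, progress=fake_progress, desc="Computing embeddings")
print(emb.shape)  # (3, embedding dimension)
```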
36
 
37
+ def deduplicate(
38
+ embedding_matrix: np.ndarray,
39
+ threshold: float,
40
+ batch_size: int = 1024,
41
+ progress=None
42
+ ) -> tuple[np.ndarray, dict[int, int]]:
43
+ # Building the index
44
+ progress(0, desc="Building search index...")
45
+ reach = Reach(
46
+ vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))]
47
+ )
48
 
49
  deduplicated_indices = set(range(len(embedding_matrix)))
50
  duplicate_to_original_mapping = {}
51
 
52
+ # Finding nearest neighbors
53
+ progress(0, desc="Finding nearest neighbors...")
54
  results = reach.nearest_neighbor_threshold(
55
  embedding_matrix,
56
  threshold=threshold,
57
  batch_size=batch_size,
58
+ show_progressbar=False, # Disable internal progress bar
59
  )
60
 
61
+ # Processing duplicates with a progress bar
62
  total_items = len(embedding_matrix)
63
+ for i, similar_items in enumerate(
64
+ progress.tqdm(results, desc="Processing duplicates", total=total_items)
65
+ ):
66
  if i not in deduplicated_indices:
67
  continue
68
 
 
75
 
76
  return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
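The pair returned here is what perform_deduplication turns into the report further down: the indices to keep and a duplicate-to-original map. A short sketch of consuming that return value (variable names taken from the calling code below):

```python
# Assuming the call made later in perform_deduplication:
# deduplicated_indices, duplicate_to_original_mapping = deduplicate(embedding_matrix, threshold, progress=progress)
unique_texts = [texts[i] for i in sorted(deduplicated_indices)]     # documents kept
num_duplicates = len(duplicate_to_original_mapping)                 # documents removed
for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:5]:
    print("kept:   ", texts[original_idx])
    print("removed:", texts[duplicate_idx])
```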
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  def display_word_differences(x: str, y: str) -> str:
79
  diff = ndiff(x.split(), y.split())
80
+ return " ".join([word for word in diff if word.startswith(("+", "-"))])
81
 
82
  def perform_deduplication(
83
  deduplication_type,
 
88
  dataset2_split="",
89
  dataset2_text_column="",
90
  threshold=default_threshold,
91
+ progress=gr.Progress(track_tqdm=True),
92
  ):
93
  try:
94
+ # Convert threshold to float
95
  threshold = float(threshold)
96
 
97
+ # Initialize status message
98
+ status = ""
99
+
100
  if deduplication_type == "Single dataset":
101
+ # Load Dataset 1
102
+ status = "Loading Dataset 1..."
103
+ yield status, ""
104
+ if (
105
+ dataset1_name == default_dataset1_name
106
+ and dataset1_split == default_dataset1_split
107
+ ):
108
+ ds = ds_default1
109
+ else:
110
+ ds = load_dataset(dataset1_name, split=dataset1_split)
111
 
112
+ # Extract texts
113
+ status = "Extracting texts from Dataset 1..."
114
+ yield status, ""
115
+ texts = [example[dataset1_text_column] for example in ds]
116
 
117
+ # Compute embeddings
118
+ status = "Computing embeddings for Dataset 1..."
119
+ yield status, ""
120
+ embedding_matrix = compute_embeddings(
121
+ texts,
122
+ batch_size=64,
123
+ progress=progress,
124
+ desc="Computing embeddings for Dataset 1",
125
+ )
126
+
127
+ # Deduplicate
128
+ status = "Deduplicating embeddings..."
129
+ yield status, ""
130
+ deduplicated_indices, duplicate_to_original_mapping = deduplicate(
131
+ embedding_matrix, threshold, progress=progress
132
+ )
133
+
134
+ # Prepare the results
135
  num_duplicates = len(duplicate_to_original_mapping)
136
  num_total = len(texts)
137
  num_deduplicated = len(deduplicated_indices)
138
 
139
  result_text = f"**Total documents:** {num_total}\n"
140
  result_text += f"**Number of duplicates found:** {num_duplicates}\n"
141
+ result_text += (
142
+ f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
143
+ )
144
 
145
+ # Show deduplicated examples
146
  if num_duplicates > 0:
147
  result_text += "**Examples of duplicates found:**\n\n"
148
  num_examples = min(5, num_duplicates)
 
157
  else:
158
  result_text += "No duplicates found."
159
 
160
+ # Final status
161
+ status = "Deduplication completed."
162
+ yield status, result_text
163
 
164
  elif deduplication_type == "Cross-dataset":
165
+ # Similar code for cross-dataset deduplication
166
+ # Load Dataset 1
167
+ status = "Loading Dataset 1..."
168
+ yield status, ""
169
+ if (
170
+ dataset1_name == default_dataset1_name
171
+ and dataset1_split == default_dataset1_split
172
+ ):
173
+ ds1 = ds_default1
174
+ else:
175
+ ds1 = load_dataset(dataset1_name, split=dataset1_split)
176
+
177
+ # Load Dataset 2
178
+ status = "Loading Dataset 2..."
179
+ yield status, ""
180
+ if (
181
+ dataset2_name == default_dataset2_name
182
+ and dataset2_split == default_dataset2_split
183
+ ):
184
+ ds2 = ds_default2
185
+ else:
186
+ ds2 = load_dataset(dataset2_name, split=dataset2_split)
187
 
188
+ # Extract texts from Dataset 1
189
+ status = "Extracting texts from Dataset 1..."
190
+ yield status, ""
191
  texts1 = [example[dataset1_text_column] for example in ds1]
 
192
 
193
+ # Extract texts from Dataset 2
194
+ status = "Extracting texts from Dataset 2..."
195
+ yield status, ""
196
+ texts2 = [example[dataset2_text_column] for example in ds2]
197
 
198
+ # Compute embeddings for Dataset 1
199
+ status = "Computing embeddings for Dataset 1..."
200
+ yield status, ""
201
+ embedding_matrix1 = compute_embeddings(
202
+ texts1,
203
+ batch_size=64,
204
+ progress=progress,
205
+ desc="Computing embeddings for Dataset 1",
206
+ )
207
+
208
+ # Compute embeddings for Dataset 2
209
+ status = "Computing embeddings for Dataset 2..."
210
+ yield status, ""
211
+ embedding_matrix2 = compute_embeddings(
212
+ texts2,
213
+ batch_size=64,
214
+ progress=progress,
215
+ desc="Computing embeddings for Dataset 2",
216
+ )
217
+
218
+ # Deduplicate across datasets
219
+ status = "Deduplicating embeddings across datasets..."
220
+ yield status, ""
221
+ duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
222
+ embedding_matrix1, embedding_matrix2, threshold, progress=progress
223
+ )
224
 
225
  num_duplicates = len(duplicate_indices_in_ds2)
226
  num_total_ds2 = len(texts2)
 
230
  result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
231
  result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
232
 
233
+ # Show deduplicated examples
234
  if num_duplicates > 0:
235
  result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
236
  num_examples = min(5, num_duplicates)
 
246
  else:
247
  result_text += "No duplicates found."
248
 
249
+ # Final status
250
+ status = "Deduplication completed."
251
+ yield status, result_text
252
 
253
  except Exception as e:
254
  yield f"An error occurred: {e}", ""
255
+ raise e
256
+
257
+ def deduplicate_across_datasets(
258
+ embedding_matrix_1: np.ndarray,
259
+ embedding_matrix_2: np.ndarray,
260
+ threshold: float,
261
+ batch_size: int = 1024,
262
+ progress=None
263
+ ) -> tuple[list[int], dict[int, int]]:
264
+ # Building the index from Dataset 1
265
+ progress(0, desc="Building search index from Dataset 1...")
266
+ reach = Reach(
267
+ vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))]
268
+ )
269
+
270
+ duplicate_indices_in_test = []
271
+ duplicate_to_original_mapping = {}
272
+
273
+ # Finding nearest neighbors between datasets
274
+ progress(0, desc="Finding nearest neighbors between datasets...")
275
+ results = reach.nearest_neighbor_threshold(
276
+ embedding_matrix_2,
277
+ threshold=threshold,
278
+ batch_size=batch_size,
279
+ show_progressbar=False, # Disable internal progress bar
280
+ )
281
+
282
+ total_items = len(embedding_matrix_2)
283
+ # Processing duplicates with a progress bar
284
+ for i, similar_items in enumerate(
285
+ progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)
286
+ ):
287
+ similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
288
 
289
+ if similar_indices:
290
+ duplicate_indices_in_test.append(i)
291
+ duplicate_to_original_mapping[i] = similar_indices[0]
292
+
293
+ return duplicate_indices_in_test, duplicate_to_original_mapping
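deduplicate_across_datasets, restored here, returns the Dataset 2 row indices whose nearest Dataset 1 neighbour clears the threshold, plus a map back to that neighbour. A short sketch of interpreting its output, mirroring the report code in perform_deduplication (variable names assumed from that function):

```python
# Assuming embedding_matrix1/embedding_matrix2 and texts1/texts2 from perform_deduplication:
duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
    embedding_matrix1, embedding_matrix2, threshold, progress=progress
)

for duplicate_idx in duplicate_indices_in_ds2[:5]:
    original_idx = duplicate_to_original_mapping[duplicate_idx]
    print("Dataset 1:", texts1[original_idx])
    print("Dataset 2:", texts2[duplicate_idx])

# Dataset 2 rows that were not flagged are the unique ones.
flagged = set(duplicate_indices_in_ds2)
unique_in_ds2 = [texts2[i] for i in range(len(texts2)) if i not in flagged]
```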
294
+
295
+ # Adjust the height of the status_output component using custom CSS
296
+ with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
297
  gr.Markdown("# Semantic Deduplication")
298
 
299
  deduplication_type = gr.Radio(
300
  choices=["Single dataset", "Cross-dataset"],
301
  label="Deduplication Type",
302
+ value="Single dataset",
303
  )
304
 
305
  with gr.Row():
 
316
  dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
317
 
318
  threshold = gr.Slider(
319
+ minimum=0.0, maximum=1.0, value=default_threshold, label="Similarity Threshold"
 
 
 
320
  )
321
 
322
  compute_button = gr.Button("Compute")
323
 
324
+ # Use 'gr.Markdown' with 'elem_id' and custom CSS to adjust height
325
  status_output = gr.Markdown(elem_id="status_output")
326
+ result_output = gr.Markdown()
327
 
328
+ # Function to update the visibility of dataset2_inputs
329
  def update_visibility(deduplication_type_value):
330
  if deduplication_type_value == "Cross-dataset":
331
  return gr.update(visible=True)
 
333
  return gr.update(visible=False)
334
 
335
  deduplication_type.change(
336
+ update_visibility, inputs=deduplication_type, outputs=dataset2_inputs
 
 
337
  )
338
 
339
  compute_button.click(
 
346
  dataset2_name,
347
  dataset2_split,
348
  dataset2_text_column,
349
+ threshold,
350
  ],
351
+ outputs=[status_output, result_output],
352
  )
353
 
354
  demo.launch()
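Because perform_deduplication is a generator, every yield of a (status, result) pair streams straight into the two Markdown components wired up in compute_button.click above. A minimal standalone sketch of that same streaming pattern (toy callback and labels, not the app's):

```python
import gradio as gr

def stream_steps(n):
    # Toy generator: yields (status, result) pairs, like perform_deduplication does.
    n = int(n)
    for i in range(n):
        yield f"Step {i + 1}/{n}...", ""
    yield "Done.", f"Processed {n} steps."

with gr.Blocks() as toy_demo:
    steps = gr.Number(value=3, label="Steps")
    run = gr.Button("Run")
    status_md = gr.Markdown()
    result_md = gr.Markdown()
    run.click(fn=stream_steps, inputs=[steps], outputs=[status_md, result_md])

# toy_demo.launch()
```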
355
 
356
+
357
  # import gradio as gr
358
  # from datasets import load_dataset
359
  # import numpy as np
 
393
  # """
394
  # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
395
  # """
 
 
396
  # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
397
 
398
  # deduplicated_indices = set(range(len(embedding_matrix)))
399
  # duplicate_to_original_mapping = {}
400
 
 
 
401
  # results = reach.nearest_neighbor_threshold(
402
  # embedding_matrix,
403
  # threshold=threshold,
404
  # batch_size=batch_size,
405
+ # show_progressbar=False
406
  # )
407
 
 
408
  # total_items = len(embedding_matrix)
409
 # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
 # if i not in deduplicated_indices:

 # """
 # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
 # """
 # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])

 # duplicate_indices_in_test = []
 # duplicate_to_original_mapping = {}

 # results = reach.nearest_neighbor_threshold(
 # embedding_matrix_2,
 # threshold=threshold,
 # batch_size=batch_size,
+ # show_progressbar=False
 # )

 # total_items = len(embedding_matrix_2)
 # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
 # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]

 # progress=gr.Progress(track_tqdm=True)
 # ):
 # try:
 # threshold = float(threshold)

 # if deduplication_type == "Single dataset":
+ # ds = ds_default1 if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split else load_dataset(dataset1_name, split=dataset1_split)
 # texts = [example[dataset1_text_column] for example in ds]

 # embedding_matrix = compute_embeddings(texts, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
+ # deduplicated_indices, duplicate_to_original_mapping = deduplicate(embedding_matrix, threshold, progress=progress)

 # num_duplicates = len(duplicate_to_original_mapping)
 # num_total = len(texts)
 # num_deduplicated = len(deduplicated_indices)

 # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
 # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"

 # if num_duplicates > 0:
 # result_text += "**Examples of duplicates found:**\n\n"
 # num_examples = min(5, num_duplicates)

 # else:
 # result_text += "No duplicates found."

+ # yield result_text

 # elif deduplication_type == "Cross-dataset":
+ # ds1 = ds_default1 if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split else load_dataset(dataset1_name, split=dataset1_split)
+ # ds2 = ds_default2 if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split else load_dataset(dataset2_name, split=dataset2_split)

 # texts1 = [example[dataset1_text_column] for example in ds1]
 # texts2 = [example[dataset2_text_column] for example in ds2]

 # embedding_matrix1 = compute_embeddings(texts1, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
 # embedding_matrix2 = compute_embeddings(texts2, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 2")

+ # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(embedding_matrix1, embedding_matrix2, threshold, progress=progress)

 # num_duplicates = len(duplicate_indices_in_ds2)
 # num_total_ds2 = len(texts2)

 # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
 # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"

 # if num_duplicates > 0:
 # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
 # num_examples = min(5, num_duplicates)

 # else:
 # result_text += "No duplicates found."

+ # yield result_text

 # except Exception as e:
 # yield f"An error occurred: {e}", ""

+ # # Adjust the height of the status_output and result_output components
+ # with gr.Blocks(css="#status_output { height: 300px; overflow: auto; } #result_output { height: 300px; overflow: auto; }") as demo:
 # gr.Markdown("# Semantic Deduplication")

 # deduplication_type = gr.Radio(

 # compute_button = gr.Button("Compute")

+ # status_output = gr.Markdown(elem_id="status_output")
+ # result_output = gr.Markdown(elem_id="result_output")

 # def update_visibility(deduplication_type_value):
 # if deduplication_type_value == "Cross-dataset":
 # return gr.update(visible=True)

 # demo.launch()