Pringled committed
Commit 8f283dc · 1 Parent(s): 20f4a6e

Updated app with code for deduplication
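The core of the change is a small helper that replaces the inline embedding loops: texts are encoded in fixed-size batches and progress is reported through Gradio's progress.tqdm. Below is a minimal sketch of that pattern, assuming model is the model2vec StaticModel loaded at startup and progress is a gr.Progress instance (the explicit model argument is added here only to keep the sketch self-contained; the app itself uses a module-level model):

import numpy as np

def batch_iterable(iterable, batch_size):
    # Yield successive slices of `iterable` of length `batch_size`.
    for i in range(0, len(iterable), batch_size):
        yield iterable[i:i + batch_size]

def compute_embeddings_with_progress(texts, model, progress, batch_size=64, desc="Computing embeddings"):
    # Encode texts batch by batch so Gradio can show incremental progress,
    # then stack the per-batch arrays into one embedding matrix.
    embeddings = []
    total_batches = (len(texts) + batch_size - 1) // batch_size
    for batch in progress.tqdm(batch_iterable(texts, batch_size), desc=desc, total=total_batches):
        embeddings.append(model.encode(batch, show_progressbar=False))
    return np.concatenate(embeddings, axis=0)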

Files changed (1)
  1. app.py +991 -671
app.py CHANGED
@@ -26,6 +26,15 @@ def batch_iterable(iterable, batch_size):
26
  for i in range(0, len(iterable), batch_size):
27
  yield iterable[i:i + batch_size]
28
 
29
  def display_word_differences(x: str, y: str) -> str:
30
  diff = ndiff(x.split(), y.split())
31
  return " ".join([word for word in diff if word.startswith(('+', '-'))])
@@ -65,13 +74,7 @@ def perform_deduplication(
65
  # Compute embeddings
66
  status = "Computing embeddings for Dataset 1..."
67
  yield status, ""
68
- embeddings = []
69
- batch_size = 64
70
- total_batches = (len(texts) + batch_size - 1) // batch_size
71
- for batch_texts in progress.tqdm(batch_iterable(texts, batch_size), desc="Computing embeddings", total=total_batches):
72
- batch_embeddings = model.encode(batch_texts, show_progressbar=False)
73
- embeddings.append(batch_embeddings)
74
- embedding_matrix = np.concatenate(embeddings, axis=0)
75
 
76
  # Deduplicate
77
  status = "Deduplicating embeddings..."
@@ -138,23 +141,12 @@ def perform_deduplication(
138
  # Compute embeddings for Dataset 1
139
  status = "Computing embeddings for Dataset 1..."
140
  yield status, ""
141
- embeddings1 = []
142
- batch_size = 64
143
- total_batches1 = (len(texts1) + batch_size - 1) // batch_size
144
- for batch_texts in progress.tqdm(batch_iterable(texts1, batch_size), desc="Computing embeddings for Dataset 1", total=total_batches1):
145
- batch_embeddings = model.encode(batch_texts, show_progressbar=False)
146
- embeddings1.append(batch_embeddings)
147
- embedding_matrix1 = np.concatenate(embeddings1, axis=0)
148
 
149
  # Compute embeddings for Dataset 2
150
  status = "Computing embeddings for Dataset 2..."
151
  yield status, ""
152
- embeddings2 = []
153
- total_batches2 = (len(texts2) + batch_size - 1) // batch_size
154
- for batch_texts in progress.tqdm(batch_iterable(texts2, batch_size), desc="Computing embeddings for Dataset 2", total=total_batches2):
155
- batch_embeddings = model.encode(batch_texts, show_progressbar=False)
156
- embeddings2.append(batch_embeddings)
157
- embedding_matrix2 = np.concatenate(embeddings2, axis=0)
158
 
159
  # Deduplicate across datasets
160
  status = "Deduplicating embeddings across datasets..."
@@ -326,7 +318,6 @@ with gr.Blocks() as demo:
326
  demo.launch()
327
 
328
 
329
-
330
  # import gradio as gr
331
  # from datasets import load_dataset
332
  # import numpy as np
@@ -355,79 +346,6 @@ demo.launch()
355
  # for i in range(0, len(iterable), batch_size):
356
  # yield iterable[i:i + batch_size]
357
 
358
- # def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
359
- # embeddings = []
360
- # for batch in progress.tqdm(batch_iterable(texts, batch_size), total=(len(texts) + batch_size - 1) // batch_size, desc=desc):
361
- # batch_embeddings = model.encode(batch, show_progressbar=False)
362
- # embeddings.append(batch_embeddings)
363
- # return np.concatenate(embeddings, axis=0)
364
-
365
- # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
366
- # """
367
- # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
368
- # """
369
- # # Building the index
370
- # progress(0, desc="Building search index...")
371
- # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
372
-
373
- # deduplicated_indices = set(range(len(embedding_matrix)))
374
- # duplicate_to_original_mapping = {}
375
-
376
- # # Finding nearest neighbors
377
- # progress(0, desc="Finding nearest neighbors...")
378
- # results = reach.nearest_neighbor_threshold(
379
- # embedding_matrix,
380
- # threshold=threshold,
381
- # batch_size=batch_size,
382
- # show_progressbar=False # Disable internal progress bar
383
- # )
384
-
385
- # # Processing duplicates with a progress bar
386
- # total_items = len(embedding_matrix)
387
- # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
388
- # if i not in deduplicated_indices:
389
- # continue
390
-
391
- # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
392
-
393
- # for sim_idx in similar_indices:
394
- # if sim_idx in deduplicated_indices:
395
- # deduplicated_indices.remove(sim_idx)
396
- # duplicate_to_original_mapping[sim_idx] = i
397
-
398
- # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
399
-
400
- # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
401
- # """
402
- # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
403
- # """
404
- # # Building the index from Dataset 1
405
- # progress(0, desc="Building search index from Dataset 1...")
406
- # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
407
-
408
- # duplicate_indices_in_test = []
409
- # duplicate_to_original_mapping = {}
410
-
411
- # # Finding nearest neighbors between datasets
412
- # progress(0, desc="Finding nearest neighbors between datasets...")
413
- # results = reach.nearest_neighbor_threshold(
414
- # embedding_matrix_2,
415
- # threshold=threshold,
416
- # batch_size=batch_size,
417
- # show_progressbar=False # Disable internal progress bar
418
- # )
419
-
420
- # total_items = len(embedding_matrix_2)
421
- # # Processing duplicates with a progress bar
422
- # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
423
- # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
424
-
425
- # if similar_indices:
426
- # duplicate_indices_in_test.append(i)
427
- # duplicate_to_original_mapping[i] = similar_indices[0]
428
-
429
- # return duplicate_indices_in_test, duplicate_to_original_mapping
430
-
431
  # def display_word_differences(x: str, y: str) -> str:
432
  # diff = ndiff(x.split(), y.split())
433
  # return " ".join([word for word in diff if word.startswith(('+', '-'))])
@@ -467,7 +385,13 @@ demo.launch()
467
  # # Compute embeddings
468
  # status = "Computing embeddings for Dataset 1..."
469
  # yield status, ""
470
- # embedding_matrix = compute_embeddings(texts, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
471
 
472
  # # Deduplicate
473
  # status = "Deduplicating embeddings..."
@@ -534,12 +458,23 @@ demo.launch()
534
  # # Compute embeddings for Dataset 1
535
  # status = "Computing embeddings for Dataset 1..."
536
  # yield status, ""
537
- # embedding_matrix1 = compute_embeddings(texts1, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
538
 
539
  # # Compute embeddings for Dataset 2
540
  # status = "Computing embeddings for Dataset 2..."
541
  # yield status, ""
542
- # embedding_matrix2 = compute_embeddings(texts2, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 2")
543
 
544
  # # Deduplicate across datasets
545
  # status = "Deduplicating embeddings across datasets..."
@@ -552,8 +487,8 @@ demo.launch()
552
  # num_total_ds2 = len(texts2)
553
  # num_unique_ds2 = num_total_ds2 - num_duplicates
554
 
555
- # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n\n"
556
- # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n\n"
557
  # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
558
 
559
  # # Show deduplicated examples
@@ -580,6 +515,72 @@ demo.launch()
580
  # yield f"An error occurred: {e}", ""
581
  # raise e
582
 
583
  # with gr.Blocks() as demo:
584
  # gr.Markdown("# Semantic Deduplication")
585
 
@@ -644,25 +645,6 @@ demo.launch()
644
 
645
  # demo.launch()
646
 
647
-
648
-
649
-
650
-
651
-
652
-
653
-
654
-
655
-
656
-
657
-
658
-
659
-
660
-
661
-
662
-
663
-
664
-
665
-
666
 
667
 
668
  # # import gradio as gr
@@ -671,7 +653,6 @@ demo.launch()
671
  # # from model2vec import StaticModel
672
  # # from reach import Reach
673
  # # from difflib import ndiff
674
- # # import sys
675
  # # import tqdm
676
 
677
  # # # Load the model at startup
@@ -689,26 +670,41 @@ demo.launch()
689
  # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
690
  # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
691
 
692
- # # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024) -> tuple[np.ndarray, dict[int, int]]:
693
  # # """
694
  # # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
695
  # # """
696
  # # # Building the index
 
697
  # # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
698
 
699
  # # deduplicated_indices = set(range(len(embedding_matrix)))
700
  # # duplicate_to_original_mapping = {}
701
 
702
  # # # Finding nearest neighbors
 
703
  # # results = reach.nearest_neighbor_threshold(
704
  # # embedding_matrix,
705
  # # threshold=threshold,
706
  # # batch_size=batch_size,
707
- # # show_progressbar=True # Allow internal progress bar
708
  # # )
709
 
710
- # # # Processing duplicates
711
- # # for i, similar_items in enumerate(results):
 
712
  # # if i not in deduplicated_indices:
713
  # # continue
714
 
@@ -721,26 +717,29 @@ demo.launch()
721
 
722
  # # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
723
 
724
- # # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024) -> tuple[list[int], dict[int, int]]:
725
  # # """
726
  # # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
727
  # # """
728
  # # # Building the index from Dataset 1
 
729
  # # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
730
 
731
  # # duplicate_indices_in_test = []
732
  # # duplicate_to_original_mapping = {}
733
 
734
  # # # Finding nearest neighbors between datasets
 
735
  # # results = reach.nearest_neighbor_threshold(
736
  # # embedding_matrix_2,
737
  # # threshold=threshold,
738
  # # batch_size=batch_size,
739
- # # show_progressbar=True # Allow internal progress bar
740
  # # )
741
 
742
- # # # Processing duplicates
743
- # # for i, similar_items in enumerate(results):
 
744
  # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
745
 
746
  # # if similar_indices:
@@ -764,17 +763,10 @@ demo.launch()
764
  # # threshold=default_threshold,
765
  # # progress=gr.Progress(track_tqdm=True)
766
  # # ):
767
- # # # Deep Monkey-Patching of tqdm
768
- # # original_tqdm = tqdm.tqdm
769
- # # tqdm.tqdm = progress.tqdm
770
- # # for mod_name in list(sys.modules.keys()):
771
- # # if 'tqdm' in mod_name:
772
- # # sys.modules[mod_name].tqdm = progress.tqdm
773
-
774
  # # try:
775
  # # # Convert threshold to float
776
  # # threshold = float(threshold)
777
-
778
  # # # Initialize status message
779
  # # status = ""
780
 
@@ -786,33 +778,33 @@ demo.launch()
786
  # # ds = ds_default1
787
  # # else:
788
  # # ds = load_dataset(dataset1_name, split=dataset1_split)
789
-
790
  # # # Extract texts
791
  # # status = "Extracting texts from Dataset 1..."
792
  # # yield status, ""
793
  # # texts = [example[dataset1_text_column] for example in ds]
794
-
795
  # # # Compute embeddings
796
  # # status = "Computing embeddings for Dataset 1..."
797
  # # yield status, ""
798
- # # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
799
-
800
  # # # Deduplicate
801
  # # status = "Deduplicating embeddings..."
802
  # # yield status, ""
803
  # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
804
- # # embedding_matrix, threshold
805
  # # )
806
-
807
  # # # Prepare the results
808
  # # num_duplicates = len(duplicate_to_original_mapping)
809
  # # num_total = len(texts)
810
  # # num_deduplicated = len(deduplicated_indices)
811
-
812
  # # result_text = f"**Total documents:** {num_total}\n"
813
  # # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
814
  # # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
815
-
816
  # # # Show deduplicated examples
817
  # # if num_duplicates > 0:
818
  # # result_text += "**Examples of duplicates found:**\n\n"
@@ -827,11 +819,11 @@ demo.launch()
827
  # # result_text += "-" * 50 + "\n\n"
828
  # # else:
829
  # # result_text += "No duplicates found."
830
-
831
  # # # Final status
832
  # # status = "Deduplication completed."
833
  # # yield status, result_text
834
-
835
  # # elif deduplication_type == "Cross-dataset":
836
  # # # Load Dataset 1
837
  # # status = "Loading Dataset 1..."
@@ -840,7 +832,7 @@ demo.launch()
840
  # # ds1 = ds_default1
841
  # # else:
842
  # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
843
-
844
  # # # Load Dataset 2
845
  # # status = "Loading Dataset 2..."
846
  # # yield status, ""
@@ -848,42 +840,42 @@ demo.launch()
848
  # # ds2 = ds_default2
849
  # # else:
850
  # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
851
-
852
  # # # Extract texts from Dataset 1
853
  # # status = "Extracting texts from Dataset 1..."
854
  # # yield status, ""
855
  # # texts1 = [example[dataset1_text_column] for example in ds1]
856
-
857
  # # # Extract texts from Dataset 2
858
  # # status = "Extracting texts from Dataset 2..."
859
  # # yield status, ""
860
  # # texts2 = [example[dataset2_text_column] for example in ds2]
861
-
862
  # # # Compute embeddings for Dataset 1
863
  # # status = "Computing embeddings for Dataset 1..."
864
  # # yield status, ""
865
- # # embedding_matrix1 = model.encode(texts1, show_progressbar=True)
866
-
867
  # # # Compute embeddings for Dataset 2
868
  # # status = "Computing embeddings for Dataset 2..."
869
  # # yield status, ""
870
- # # embedding_matrix2 = model.encode(texts2, show_progressbar=True)
871
-
872
  # # # Deduplicate across datasets
873
  # # status = "Deduplicating embeddings across datasets..."
874
  # # yield status, ""
875
  # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
876
- # # embedding_matrix1, embedding_matrix2, threshold
877
  # # )
878
-
879
  # # num_duplicates = len(duplicate_indices_in_ds2)
880
  # # num_total_ds2 = len(texts2)
881
  # # num_unique_ds2 = num_total_ds2 - num_duplicates
882
-
883
- # # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
884
- # # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
885
  # # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
886
-
887
  # # # Show deduplicated examples
888
  # # if num_duplicates > 0:
889
  # # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
@@ -899,17 +891,14 @@ demo.launch()
899
  # # result_text += "-" * 50 + "\n\n"
900
  # # else:
901
  # # result_text += "No duplicates found."
902
-
903
  # # # Final status
904
  # # status = "Deduplication completed."
905
  # # yield status, result_text
906
 
907
- # # finally:
908
- # # # Restore original tqdm
909
- # # tqdm.tqdm = original_tqdm
910
- # # for mod_name in list(sys.modules.keys()):
911
- # # if 'tqdm' in mod_name:
912
- # # sys.modules[mod_name].tqdm = original_tqdm
913
 
914
  # # with gr.Blocks() as demo:
915
  # # gr.Markdown("# Semantic Deduplication")
@@ -961,605 +950,670 @@ demo.launch()
961
  # # compute_button.click(
962
  # # fn=perform_deduplication,
963
  # # inputs=[
964
- # # deduplication_type,
965
- # # dataset1_name,
966
- # # dataset1_split,
967
  # # dataset1_text_column,
968
- # # dataset2_name,
969
- # # dataset2_split,
970
  # # dataset2_text_column,
971
  # # threshold
972
  # # ],
973
  # # outputs=[status_output, result_output]
974
  # # )
975
-
976
  # # demo.launch()
977
 
978
 
979
- # # import gradio as gr
980
- # # from datasets import load_dataset
981
- # # import numpy as np
982
- # # from model2vec import StaticModel
983
- # # from reach import Reach
984
- # # from difflib import ndiff
985
- # # import sys
986
- # # import tqdm
987
 
988
- # # # Load the model at startup
989
- # # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
990
 
991
- # # # Update default dataset to 'sst2' and set default threshold to 0.9
992
- # # default_dataset1_name = "sst2"
993
- # # default_dataset1_split = "train"
994
- # # default_dataset2_name = "sst2"
995
- # # default_dataset2_split = "validation"
996
- # # default_text_column = "sentence"
997
- # # default_threshold = 0.9
998
 
999
- # # # Load the default datasets at startup
1000
- # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
1001
- # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
1002
 
1003
- # # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
1004
- # # """
1005
- # # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
1006
- # # """
1007
- # # # Update progress to indicate building the index
1008
- # # progress(0, desc="Building search index...")
1009
- # # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
1010
 
1011
- # # deduplicated_indices = set(range(len(embedding_matrix)))
1012
- # # duplicate_to_original_mapping = {}
1013
 
1014
- # # # Finding nearest neighbors
1015
- # # progress(0, desc="Finding nearest neighbors...")
1016
- # # results = reach.nearest_neighbor_threshold(
1017
- # # embedding_matrix,
1018
- # # threshold=threshold,
1019
- # # batch_size=batch_size,
1020
- # # show_progressbar=True # Allow internal progress bar
1021
- # # )
1022
 
1023
- # # # Processing duplicates with a progress bar
1024
- # # total_items = len(embedding_matrix)
1025
- # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
1026
- # # if i not in deduplicated_indices:
1027
- # # continue
1028
 
1029
- # # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
1030
 
1031
- # # for sim_idx in similar_indices:
1032
- # # if sim_idx in deduplicated_indices:
1033
- # # deduplicated_indices.remove(sim_idx)
1034
- # # duplicate_to_original_mapping[sim_idx] = i
1035
 
1036
- # # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
1037
 
1038
- # # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
1039
- # # """
1040
- # # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
1041
- # # """
1042
- # # # Update progress to indicate building the index
1043
- # # progress(0, desc="Building search index from Dataset 1...")
1044
- # # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
1045
 
1046
- # # duplicate_indices_in_test = []
1047
- # # duplicate_to_original_mapping = {}
1048
 
1049
- # # # Finding nearest neighbors between datasets
1050
- # # progress(0, desc="Finding nearest neighbors between datasets...")
1051
- # # results = reach.nearest_neighbor_threshold(
1052
- # # embedding_matrix_2,
1053
- # # threshold=threshold,
1054
- # # batch_size=batch_size,
1055
- # # show_progressbar=True # Allow internal progress bar
1056
- # # )
1057
 
1058
- # # total_items = len(embedding_matrix_2)
1059
- # # # Processing duplicates with a progress bar
1060
- # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
1061
- # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
1062
 
1063
- # # if similar_indices:
1064
- # # duplicate_indices_in_test.append(i)
1065
- # # duplicate_to_original_mapping[i] = similar_indices[0]
1066
 
1067
- # # return duplicate_indices_in_test, duplicate_to_original_mapping
1068
 
1069
- # # def display_word_differences(x: str, y: str) -> str:
1070
- # # diff = ndiff(x.split(), y.split())
1071
- # # return " ".join([word for word in diff if word.startswith(('+', '-'))])
1072
 
1073
- # # def perform_deduplication(
1074
- # # deduplication_type,
1075
- # # dataset1_name,
1076
- # # dataset1_split,
1077
- # # dataset1_text_column,
1078
- # # dataset2_name="",
1079
- # # dataset2_split="",
1080
- # # dataset2_text_column="",
1081
- # # threshold=default_threshold,
1082
- # # progress=gr.Progress(track_tqdm=True)
1083
- # # ):
1084
- # # # Monkey-patch tqdm
1085
- # # original_tqdm = tqdm.tqdm
1086
- # # original_reach_tqdm = Reach.__dict__['tqdm'] if 'tqdm' in Reach.__dict__ else None
1087
- # # tqdm.tqdm = progress.tqdm
1088
- # # sys.modules['tqdm'].tqdm = progress.tqdm
1089
- # # sys.modules['tqdm.auto'].tqdm = progress.tqdm
1090
- # # Reach.tqdm = progress.tqdm # Monkey-patch reach's tqdm
1091
 
1092
- # # try:
1093
- # # # Convert threshold to float
1094
- # # threshold = float(threshold)
1095
 
1096
- # # if deduplication_type == "Single dataset":
1097
- # # # Load Dataset 1
1098
- # # progress(0, desc="Loading Dataset 1...")
1099
- # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1100
- # # ds = ds_default1
1101
- # # else:
1102
- # # ds = load_dataset(dataset1_name, split=dataset1_split)
1103
 
1104
- # # # Extract texts
1105
- # # progress(0, desc="Extracting texts from Dataset 1...")
1106
- # # texts = [example[dataset1_text_column] for example in ds]
 
1107
 
1108
- # # # Compute embeddings
1109
- # # progress(0, desc="Computing embeddings for Dataset 1...")
1110
- # # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
 
1111
 
1112
- # # # Deduplicate
1113
- # # result_text = deduplicate_and_prepare_results_single(
1114
- # # embedding_matrix, texts, threshold, progress
1115
- # # )
1116
 
1117
- # # return result_text
1118
 
1119
- # # elif deduplication_type == "Cross-dataset":
1120
- # # # Load Dataset 1
1121
- # # progress(0, desc="Loading Dataset 1...")
1122
- # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1123
- # # ds1 = ds_default1
1124
- # # else:
1125
- # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
 
1126
 
1127
- # # # Load Dataset 2
1128
- # # progress(0, desc="Loading Dataset 2...")
1129
- # # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
1130
- # # ds2 = ds_default2
1131
- # # else:
1132
- # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
 
1133
 
1134
- # # # Extract texts from Dataset 1
1135
- # # progress(0, desc="Extracting texts from Dataset 1...")
1136
- # # texts1 = [example[dataset1_text_column] for example in ds1]
 
1137
 
1138
- # # # Extract texts from Dataset 2
1139
- # # progress(0, desc="Extracting texts from Dataset 2...")
1140
- # # texts2 = [example[dataset2_text_column] for example in ds2]
 
1141
 
1142
- # # # Compute embeddings for Dataset 1
1143
- # # progress(0, desc="Computing embeddings for Dataset 1...")
1144
- # # embedding_matrix1 = model.encode(texts1, show_progressbar=True)
 
1145
 
1146
- # # # Compute embeddings for Dataset 2
1147
- # # progress(0, desc="Computing embeddings for Dataset 2...")
1148
- # # embedding_matrix2 = model.encode(texts2, show_progressbar=True)
 
1149
 
1150
- # # # Deduplicate across datasets
1151
- # # result_text = deduplicate_and_prepare_results_cross(
1152
- # # embedding_matrix1, embedding_matrix2, texts1, texts2, threshold, progress, dataset2_name, dataset2_split
1153
- # # )
 
 
1154
 
1155
- # # return result_text
1156
-
1157
- # # finally:
1158
- # # # Restore original tqdm
1159
- # # tqdm.tqdm = original_tqdm
1160
- # # sys.modules['tqdm'].tqdm = original_tqdm
1161
- # # sys.modules['tqdm.auto'].tqdm = original_tqdm
1162
-
1163
- # # # Restore reach's original tqdm
1164
- # # if original_reach_tqdm is not None:
1165
- # # Reach.tqdm = original_reach_tqdm
1166
- # # else:
1167
- # # del Reach.tqdm # If it wasn't originally in Reach's __dict__
1168
-
1169
- # # def deduplicate_and_prepare_results_single(embedding_matrix, texts, threshold, progress):
1170
- # # # Deduplicate
1171
- # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
1172
- # # embedding_matrix, threshold, progress=progress
1173
- # # )
1174
-
1175
- # # # Prepare the results
1176
- # # num_duplicates = len(duplicate_to_original_mapping)
1177
- # # num_total = len(texts)
1178
- # # num_deduplicated = len(deduplicated_indices)
1179
-
1180
- # # result_text = f"**Total documents:** {num_total}\n"
1181
- # # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
1182
- # # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
1183
-
1184
- # # # Show deduplicated examples
1185
- # # if num_duplicates > 0:
1186
- # # result_text += "**Examples of duplicates found:**\n\n"
1187
- # # num_examples = min(5, num_duplicates)
1188
- # # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
1189
- # # original_text = texts[original_idx]
1190
- # # duplicate_text = texts[duplicate_idx]
1191
- # # differences = display_word_differences(original_text, duplicate_text)
1192
- # # result_text += f"**Original text:**\n{original_text}\n\n"
1193
- # # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
1194
- # # result_text += f"**Differences:**\n{differences}\n"
1195
- # # result_text += "-" * 50 + "\n\n"
1196
- # # else:
1197
- # # result_text += "No duplicates found."
1198
-
1199
- # # return result_text
1200
-
1201
- # # def deduplicate_and_prepare_results_cross(embedding_matrix1, embedding_matrix2, texts1, texts2, threshold, progress, dataset2_name, dataset2_split):
1202
- # # # Deduplicate across datasets
1203
- # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
1204
- # # embedding_matrix1, embedding_matrix2, threshold, progress=progress
1205
- # # )
1206
-
1207
- # # num_duplicates = len(duplicate_indices_in_ds2)
1208
- # # num_total_ds2 = len(texts2)
1209
- # # num_unique_ds2 = num_total_ds2 - num_duplicates
1210
-
1211
- # # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
1212
- # # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
1213
- # # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
1214
-
1215
- # # # Show deduplicated examples
1216
- # # if num_duplicates > 0:
1217
- # # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
1218
- # # num_examples = min(5, num_duplicates)
1219
- # # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
1220
- # # original_idx = duplicate_to_original_mapping[duplicate_idx]
1221
- # # original_text = texts1[original_idx]
1222
- # # duplicate_text = texts2[duplicate_idx]
1223
- # # differences = display_word_differences(original_text, duplicate_text)
1224
- # # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
1225
- # # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
1226
- # # result_text += f"**Differences:**\n{differences}\n"
1227
- # # result_text += "-" * 50 + "\n\n"
1228
- # # else:
1229
- # # result_text += "No duplicates found."
1230
-
1231
- # # return result_text
1232
-
1233
- # # with gr.Blocks() as demo:
1234
- # # gr.Markdown("# Semantic Deduplication")
1235
 
1236
- # # deduplication_type = gr.Radio(
1237
- # # choices=["Single dataset", "Cross-dataset"],
1238
- # # label="Deduplication Type",
1239
- # # value="Single dataset"
1240
- # # )
 
1241
 
1242
- # # with gr.Row():
1243
- # # dataset1_name = gr.Textbox(value=default_dataset1_name, label="Dataset 1 Name")
1244
- # # dataset1_split = gr.Textbox(value=default_dataset1_split, label="Dataset 1 Split")
1245
- # # dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
1246
 
1247
- # # dataset2_inputs = gr.Column(visible=False)
1248
- # # with dataset2_inputs:
1249
- # # gr.Markdown("### Dataset 2")
1250
- # # with gr.Row():
1251
- # # dataset2_name = gr.Textbox(value=default_dataset2_name, label="Dataset 2 Name")
1252
- # # dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
1253
- # # dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
1254
 
1255
- # # threshold = gr.Slider(
1256
- # # minimum=0.0,
1257
- # # maximum=1.0,
1258
- # # value=default_threshold,
1259
- # # label="Similarity Threshold"
1260
- # # )
1261
 
1262
- # # compute_button = gr.Button("Compute")
1263
 
1264
- # # output = gr.Markdown()
1265
 
1266
- # # # Function to update the visibility of dataset2_inputs
1267
- # # def update_visibility(deduplication_type_value):
1268
- # # if deduplication_type_value == "Cross-dataset":
1269
- # # return gr.update(visible=True)
1270
- # # else:
1271
- # # return gr.update(visible=False)
1272
 
1273
- # # deduplication_type.change(
1274
- # # update_visibility,
1275
- # # inputs=deduplication_type,
1276
- # # outputs=dataset2_inputs
1277
- # # )
1278
 
1279
- # # compute_button.click(
1280
- # # fn=perform_deduplication,
1281
- # # inputs=[
1282
- # # deduplication_type,
1283
- # # dataset1_name,
1284
- # # dataset1_split,
1285
- # # dataset1_text_column,
1286
- # # dataset2_name,
1287
- # # dataset2_split,
1288
- # # dataset2_text_column,
1289
- # # threshold
1290
- # # ],
1291
- # # outputs=output
1292
- # # )
1293
-
1294
- # # demo.launch()
1295
 
 
1296
 
1297
 
1298
 
1299
- # # import gradio as gr
1300
- # # from datasets import load_dataset
1301
- # # import numpy as np
1302
- # # from model2vec import StaticModel
1303
- # # from reach import Reach
1304
- # # from difflib import ndiff
1305
- # # import sys
1306
- # # import tqdm
1307
 
1308
- # # # Load the model at startup
1309
- # # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
1310
 
1311
- # # # Load the default datasets at startup
1312
- # # default_dataset1_name = "ag_news"
1313
- # # default_dataset1_split = "train"
1314
- # # default_dataset2_name = "ag_news"
1315
- # # default_dataset2_split = "test"
 
 
1316
 
1317
- # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
1318
- # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
 
1319
 
1320
- # # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
1321
- # # """
1322
- # # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
1323
- # # """
1324
- # # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
 
 
1325
 
1326
- # # deduplicated_indices = set(range(len(embedding_matrix)))
1327
- # # duplicate_to_original_mapping = {}
1328
 
1329
- # # results = reach.nearest_neighbor_threshold(
1330
- # # embedding_matrix,
1331
- # # threshold=threshold,
1332
- # # batch_size=batch_size,
1333
- # # show_progressbar=True # Allow internal progress bar
1334
- # # )
 
 
1335
 
1336
- # # # Process duplicates
1337
- # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=len(embedding_matrix))):
1338
- # # if i not in deduplicated_indices:
1339
- # # continue
 
1340
 
1341
- # # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
1342
 
1343
- # # for sim_idx in similar_indices:
1344
- # # if sim_idx in deduplicated_indices:
1345
- # # deduplicated_indices.remove(sim_idx)
1346
- # # duplicate_to_original_mapping[sim_idx] = i
1347
 
1348
- # # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
1349
 
1350
- # # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
1351
- # # """
1352
- # # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
1353
- # # """
1354
- # # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
 
 
1355
 
1356
- # # duplicate_indices_in_test = []
1357
- # # duplicate_to_original_mapping = {}
1358
 
1359
- # # results = reach.nearest_neighbor_threshold(
1360
- # # embedding_matrix_2,
1361
- # # threshold=threshold,
1362
- # # batch_size=batch_size,
1363
- # # show_progressbar=True # Allow internal progress bar
1364
- # # )
 
 
1365
 
1366
- # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=len(embedding_matrix_2))):
1367
- # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
 
 
1368
 
1369
- # # if similar_indices:
1370
- # # duplicate_indices_in_test.append(i)
1371
- # # duplicate_to_original_mapping[i] = similar_indices[0]
1372
 
1373
- # # return duplicate_indices_in_test, duplicate_to_original_mapping
1374
 
1375
- # # def display_word_differences(x: str, y: str) -> str:
1376
- # # diff = ndiff(x.split(), y.split())
1377
- # # return " ".join([word for word in diff if word.startswith(('+', '-'))])
1378
 
1379
- # # def perform_deduplication(
1380
- # # deduplication_type,
1381
- # # dataset1_name,
1382
- # # dataset1_split,
1383
- # # dataset1_text_column,
1384
- # # dataset2_name="",
1385
- # # dataset2_split="",
1386
- # # dataset2_text_column="",
1387
- # # threshold=0.8,
1388
- # # progress=gr.Progress(track_tqdm=True)
1389
- # # ):
1390
- # # # Monkey-patch tqdm
1391
- # # original_tqdm = tqdm.tqdm
1392
- # # original_reach_tqdm = Reach.__dict__['tqdm'] if 'tqdm' in Reach.__dict__ else None
1393
- # # tqdm.tqdm = progress.tqdm
1394
- # # sys.modules['tqdm'].tqdm = progress.tqdm
1395
- # # sys.modules['tqdm.auto'].tqdm = progress.tqdm
1396
- # # Reach.tqdm = progress.tqdm # Monkey-patch reach's tqdm
1397
 
1398
- # # try:
1399
- # # # Convert threshold to float
1400
- # # threshold = float(threshold)
1401
 
1402
- # # if deduplication_type == "Single dataset":
1403
- # # # Check if the dataset is the default one
1404
- # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1405
- # # ds = ds_default1
1406
- # # else:
1407
- # # ds = load_dataset(dataset1_name, split=dataset1_split)
1408
-
1409
- # # # Extract texts
1410
- # # texts = [example[dataset1_text_column] for example in ds]
1411
-
1412
- # # # Compute embeddings
1413
- # # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
1414
-
1415
- # # # Deduplicate
1416
- # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(embedding_matrix, threshold, progress=progress)
1417
 
1418
- # # # Prepare the results
1419
- # # num_duplicates = len(duplicate_to_original_mapping)
1420
- # # num_total = len(texts)
1421
- # # num_deduplicated = len(deduplicated_indices)
1422
 
1423
- # # result_text = f"**Total documents:** {num_total}\n"
1424
- # # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
1425
- # # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
1426
 
1427
- # # # Show deduplicated examples
1428
- # # result_text += "**Examples of duplicates found:**\n\n"
1429
- # # num_examples = min(5, num_duplicates)
1430
- # # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
1431
- # # original_text = texts[original_idx]
1432
- # # duplicate_text = texts[duplicate_idx]
1433
- # # differences = display_word_differences(original_text, duplicate_text)
1434
- # # result_text += f"**Original text:**\n{original_text}\n\n"
1435
- # # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
1436
- # # result_text += f"**Differences:**\n{differences}\n"
1437
- # # result_text += "-" * 50 + "\n\n"
1438
 
1439
- # # return result_text
1440
 
1441
- # # elif deduplication_type == "Cross-dataset":
1442
- # # # Dataset 1
1443
- # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1444
- # # ds1 = ds_default1
1445
- # # else:
1446
- # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
1447
-
1448
- # # # Dataset 2
1449
- # # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
1450
- # # ds2 = ds_default2
1451
- # # else:
1452
- # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
1453
 
1454
- # # # Extract texts
1455
- # # texts1 = [example[dataset1_text_column] for example in ds1]
1456
- # # texts2 = [example[dataset2_text_column] for example in ds2]
1457
 
1458
- # # # Compute embeddings
1459
- # # embedding_matrix1 = model.encode(texts1, show_progressbar=True) # Enable internal progress bar
1460
- # # embedding_matrix2 = model.encode(texts2, show_progressbar=True) # Enable internal progress bar
1461
 
1462
- # # # Deduplicate across datasets
1463
- # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
1464
- # # embedding_matrix1, embedding_matrix2, threshold, progress=progress)
1465
 
1466
- # # num_duplicates = len(duplicate_indices_in_ds2)
1467
- # # num_total_ds2 = len(texts2)
1468
- # # num_unique_ds2 = num_total_ds2 - num_duplicates
1469
 
1470
- # # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
1471
- # # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
1472
- # # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
1473
 
1474
- # # # Show deduplicated examples
1475
- # # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
1476
- # # num_examples = min(5, num_duplicates)
1477
- # # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
1478
- # # original_idx = duplicate_to_original_mapping[duplicate_idx]
1479
- # # original_text = texts1[original_idx]
1480
- # # duplicate_text = texts2[duplicate_idx]
1481
- # # differences = display_word_differences(original_text, duplicate_text)
1482
- # # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
1483
- # # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
1484
- # # result_text += f"**Differences:**\n{differences}\n"
1485
- # # result_text += "-" * 50 + "\n\n"
1486
 
1487
- # # return result_text
1488
 
1489
- # # finally:
1490
- # # # Restore original tqdm
1491
- # # tqdm.tqdm = original_tqdm
1492
- # # sys.modules['tqdm'].tqdm = original_tqdm
1493
- # # sys.modules['tqdm.auto'].tqdm = original_tqdm
1494
 
1495
- # # # Restore reach's original tqdm
1496
- # # if original_reach_tqdm is not None:
1497
- # # Reach.tqdm = original_reach_tqdm
1498
- # # else:
1499
- # # del Reach.tqdm # If it wasn't originally in Reach's __dict__
1500
 
1501
- # # with gr.Blocks() as demo:
1502
- # # gr.Markdown("# Semantic Deduplication")
1503
-
1504
- # # deduplication_type = gr.Radio(
1505
- # # choices=["Single dataset", "Cross-dataset"],
1506
- # # label="Deduplication Type",
1507
- # # value="Single dataset"
1508
- # # )
1509
 
1510
- # # with gr.Row():
1511
- # # dataset1_name = gr.Textbox(value="ag_news", label="Dataset 1 Name")
1512
- # # dataset1_split = gr.Textbox(value="train", label="Dataset 1 Split")
1513
- # # dataset1_text_column = gr.Textbox(value="text", label="Text Column Name")
1514
 
1515
- # # dataset2_inputs = gr.Column(visible=False)
1516
- # # with dataset2_inputs:
1517
- # # gr.Markdown("### Dataset 2")
1518
- # # with gr.Row():
1519
- # # dataset2_name = gr.Textbox(value="ag_news", label="Dataset 2 Name")
1520
- # # dataset2_split = gr.Textbox(value="test", label="Dataset 2 Split")
1521
- # # dataset2_text_column = gr.Textbox(value="text", label="Text Column Name")
1522
 
1523
- # # threshold = gr.Slider(
1524
- # # minimum=0.0,
1525
- # # maximum=1.0,
1526
- # # value=0.8,
1527
- # # label="Similarity Threshold"
1528
- # # )
1529
 
1530
- # # compute_button = gr.Button("Compute")
1531
 
1532
- # # output = gr.Markdown()
 
 
1533
 
1534
- # # # Function to update the visibility of dataset2_inputs
1535
- # # def update_visibility(deduplication_type_value):
1536
- # # if deduplication_type_value == "Cross-dataset":
1537
- # # return gr.update(visible=True)
1538
- # # else:
1539
- # # return gr.update(visible=False)
1540
 
1541
- # # deduplication_type.change(
1542
- # # update_visibility,
1543
- # # inputs=deduplication_type,
1544
- # # outputs=dataset2_inputs
1545
- # # )
1546
 
1547
- # # compute_button.click(
1548
- # # fn=perform_deduplication,
1549
- # # inputs=[
1550
- # # deduplication_type,
1551
- # # dataset1_name,
1552
- # # dataset1_split,
1553
- # # dataset1_text_column,
1554
- # # dataset2_name,
1555
- # # dataset2_split,
1556
- # # dataset2_text_column,
1557
- # # threshold
1558
- # # ],
1559
- # # outputs=output
1560
- # # )
1561
 
1562
- # # demo.launch()
 
 
1563
 
1564
 
1565
  # # # import gradio as gr
@@ -1600,7 +1654,7 @@ demo.launch()
1600
  # # # )
1601
 
1602
  # # # # Process duplicates
1603
- # # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates")):
1604
  # # # if i not in deduplicated_indices:
1605
  # # # continue
1606
 
@@ -1629,8 +1683,7 @@ demo.launch()
1629
  # # # show_progressbar=True # Allow internal progress bar
1630
  # # # )
1631
 
1632
- # # # # Process duplicates
1633
- # # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets")):
1634
  # # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
1635
 
1636
  # # # if similar_indices:
@@ -1656,9 +1709,11 @@ demo.launch()
1656
  # # # ):
1657
  # # # # Monkey-patch tqdm
1658
  # # # original_tqdm = tqdm.tqdm
 
1659
  # # # tqdm.tqdm = progress.tqdm
1660
  # # # sys.modules['tqdm'].tqdm = progress.tqdm
1661
  # # # sys.modules['tqdm.auto'].tqdm = progress.tqdm
 
1662
 
1663
  # # # try:
1664
  # # # # Convert threshold to float
@@ -1725,7 +1780,8 @@ demo.launch()
1725
  # # # embedding_matrix2 = model.encode(texts2, show_progressbar=True) # Enable internal progress bar
1726
 
1727
  # # # # Deduplicate across datasets
1728
- # # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(embedding_matrix1, embedding_matrix2, threshold, progress=progress)
 
1729
 
1730
  # # # num_duplicates = len(duplicate_indices_in_ds2)
1731
  # # # num_total_ds2 = len(texts2)
@@ -1756,6 +1812,12 @@ demo.launch()
1756
  # # # sys.modules['tqdm'].tqdm = original_tqdm
1757
  # # # sys.modules['tqdm.auto'].tqdm = original_tqdm
1758
 
1759
  # # # with gr.Blocks() as demo:
1760
  # # # gr.Markdown("# Semantic Deduplication")
1761
 
@@ -1818,3 +1880,261 @@ demo.launch()
1818
  # # # )
1819
 
1820
  # # # demo.launch()
26
  for i in range(0, len(iterable), batch_size):
27
  yield iterable[i:i + batch_size]
28
 
29
+ def compute_embeddings_with_progress(texts, batch_size, progress, desc="Computing embeddings"):
30
+ embeddings = []
31
+ total_batches = (len(texts) + batch_size - 1) // batch_size
32
+ for batch_texts in progress.tqdm(batch_iterable(texts, batch_size), desc=desc, total=total_batches):
33
+ batch_embeddings = model.encode(batch_texts, show_progressbar=False)
34
+ embeddings.append(batch_embeddings)
35
+ embedding_matrix = np.concatenate(embeddings, axis=0)
36
+ return embedding_matrix
37
+
38
  def display_word_differences(x: str, y: str) -> str:
39
  diff = ndiff(x.split(), y.split())
40
  return " ".join([word for word in diff if word.startswith(('+', '-'))])
 
74
  # Compute embeddings
75
  status = "Computing embeddings for Dataset 1..."
76
  yield status, ""
77
+ embedding_matrix = compute_embeddings_with_progress(texts, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
78
 
79
  # Deduplicate
80
  status = "Deduplicating embeddings..."
 
141
  # Compute embeddings for Dataset 1
142
  status = "Computing embeddings for Dataset 1..."
143
  yield status, ""
144
+ embedding_matrix1 = compute_embeddings_with_progress(texts1, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
145
 
146
  # Compute embeddings for Dataset 2
147
  status = "Computing embeddings for Dataset 2..."
148
  yield status, ""
149
+ embedding_matrix2 = compute_embeddings_with_progress(texts2, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 2")
150
 
151
  # Deduplicate across datasets
152
  status = "Deduplicating embeddings across datasets..."
 
318
  demo.launch()
319
 
320
 
 
321
  # import gradio as gr
322
  # from datasets import load_dataset
323
  # import numpy as np
 
346
  # for i in range(0, len(iterable), batch_size):
347
  # yield iterable[i:i + batch_size]
348
 
349
  # def display_word_differences(x: str, y: str) -> str:
350
  # diff = ndiff(x.split(), y.split())
351
  # return " ".join([word for word in diff if word.startswith(('+', '-'))])
 
385
  # # Compute embeddings
386
  # status = "Computing embeddings for Dataset 1..."
387
  # yield status, ""
388
+ # embeddings = []
389
+ # batch_size = 64
390
+ # total_batches = (len(texts) + batch_size - 1) // batch_size
391
+ # for batch_texts in progress.tqdm(batch_iterable(texts, batch_size), desc="Computing embeddings", total=total_batches):
392
+ # batch_embeddings = model.encode(batch_texts, show_progressbar=False)
393
+ # embeddings.append(batch_embeddings)
394
+ # embedding_matrix = np.concatenate(embeddings, axis=0)
395
 
396
  # # Deduplicate
397
  # status = "Deduplicating embeddings..."
 
458
  # # Compute embeddings for Dataset 1
459
  # status = "Computing embeddings for Dataset 1..."
460
  # yield status, ""
461
+ # embeddings1 = []
462
+ # batch_size = 64
463
+ # total_batches1 = (len(texts1) + batch_size - 1) // batch_size
464
+ # for batch_texts in progress.tqdm(batch_iterable(texts1, batch_size), desc="Computing embeddings for Dataset 1", total=total_batches1):
465
+ # batch_embeddings = model.encode(batch_texts, show_progressbar=False)
466
+ # embeddings1.append(batch_embeddings)
467
+ # embedding_matrix1 = np.concatenate(embeddings1, axis=0)
468
 
469
  # # Compute embeddings for Dataset 2
470
  # status = "Computing embeddings for Dataset 2..."
471
  # yield status, ""
472
+ # embeddings2 = []
473
+ # total_batches2 = (len(texts2) + batch_size - 1) // batch_size
474
+ # for batch_texts in progress.tqdm(batch_iterable(texts2, batch_size), desc="Computing embeddings for Dataset 2", total=total_batches2):
475
+ # batch_embeddings = model.encode(batch_texts, show_progressbar=False)
476
+ # embeddings2.append(batch_embeddings)
477
+ # embedding_matrix2 = np.concatenate(embeddings2, axis=0)
478
 
479
  # # Deduplicate across datasets
480
  # status = "Deduplicating embeddings across datasets..."
 
487
  # num_total_ds2 = len(texts2)
488
  # num_unique_ds2 = num_total_ds2 - num_duplicates
489
 
490
+ # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
491
+ # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
492
  # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
493
 
494
  # # Show deduplicated examples
 
515
  # yield f"An error occurred: {e}", ""
516
  # raise e
517
 
518
+ # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
519
+ # """
520
+ # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
521
+ # """
522
+ # # Building the index
523
+ # progress(0, desc="Building search index...")
524
+ # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
525
+
526
+ # deduplicated_indices = set(range(len(embedding_matrix)))
527
+ # duplicate_to_original_mapping = {}
528
+
529
+ # # Finding nearest neighbors
530
+ # progress(0, desc="Finding nearest neighbors...")
531
+ # results = reach.nearest_neighbor_threshold(
532
+ # embedding_matrix,
533
+ # threshold=threshold,
534
+ # batch_size=batch_size,
535
+ # show_progressbar=False # Disable internal progress bar
536
+ # )
537
+
538
+ # # Processing duplicates with a progress bar
539
+ # total_items = len(embedding_matrix)
540
+ # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
541
+ # if i not in deduplicated_indices:
542
+ # continue
543
+
544
+ # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
545
+
546
+ # for sim_idx in similar_indices:
547
+ # if sim_idx in deduplicated_indices:
548
+ # deduplicated_indices.remove(sim_idx)
549
+ # duplicate_to_original_mapping[sim_idx] = i
550
+
551
+ # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
552
+
553
+ # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
554
+ # """
555
+ # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
556
+ # """
557
+ # # Building the index from Dataset 1
558
+ # progress(0, desc="Building search index from Dataset 1...")
559
+ # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
560
+
561
+ # duplicate_indices_in_test = []
562
+ # duplicate_to_original_mapping = {}
563
+
564
+ # # Finding nearest neighbors between datasets
565
+ # progress(0, desc="Finding nearest neighbors between datasets...")
566
+ # results = reach.nearest_neighbor_threshold(
567
+ # embedding_matrix_2,
568
+ # threshold=threshold,
569
+ # batch_size=batch_size,
570
+ # show_progressbar=False # Disable internal progress bar
571
+ # )
572
+
573
+ # total_items = len(embedding_matrix_2)
574
+ # # Processing duplicates with a progress bar
575
+ # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
576
+ # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
577
+
578
+ # if similar_indices:
579
+ # duplicate_indices_in_test.append(i)
580
+ # duplicate_to_original_mapping[i] = similar_indices[0]
581
+
582
+ # return duplicate_indices_in_test, duplicate_to_original_mapping
583
+
584
  # with gr.Blocks() as demo:
585
  # gr.Markdown("# Semantic Deduplication")
586
 
 
645
 
646
  # demo.launch()
647
 
648
 
649
 
650
  # # import gradio as gr
 
653
  # # from model2vec import StaticModel
654
  # # from reach import Reach
655
  # # from difflib import ndiff
 
656
  # # import tqdm
657
 
658
  # # # Load the model at startup
 
670
  # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
671
  # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
672
 
673
+ # # def batch_iterable(iterable, batch_size):
674
+ # # """Helper function to create batches from an iterable."""
675
+ # # for i in range(0, len(iterable), batch_size):
676
+ # # yield iterable[i:i + batch_size]
677
+
678
+ # # def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
679
+ # # embeddings = []
680
+ # # for batch in progress.tqdm(batch_iterable(texts, batch_size), total=(len(texts) + batch_size - 1) // batch_size, desc=desc):
681
+ # # batch_embeddings = model.encode(batch, show_progressbar=False)
682
+ # # embeddings.append(batch_embeddings)
683
+ # # return np.concatenate(embeddings, axis=0)
684
+
685
+ # # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
686
  # # """
687
  # # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
688
  # # """
689
  # # # Building the index
690
+ # # progress(0, desc="Building search index...")
691
  # # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
692
 
693
  # # deduplicated_indices = set(range(len(embedding_matrix)))
694
  # # duplicate_to_original_mapping = {}
695
 
696
  # # # Finding nearest neighbors
697
+ # # progress(0, desc="Finding nearest neighbors...")
698
  # # results = reach.nearest_neighbor_threshold(
699
  # # embedding_matrix,
700
  # # threshold=threshold,
701
  # # batch_size=batch_size,
702
+ # # show_progressbar=False # Disable internal progress bar
703
  # # )
704
 
705
+ # # # Processing duplicates with a progress bar
706
+ # # total_items = len(embedding_matrix)
707
+ # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
708
  # # if i not in deduplicated_indices:
709
  # # continue
710
 
 
717
 
718
  # # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
719
 
720
+ # # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
  # # """
  # # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
  # # """
  # # # Building the index from Dataset 1
+ # # progress(0, desc="Building search index from Dataset 1...")
  # # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
 
  # # duplicate_indices_in_test = []
  # # duplicate_to_original_mapping = {}
 
  # # # Finding nearest neighbors between datasets
+ # # progress(0, desc="Finding nearest neighbors between datasets...")
  # # results = reach.nearest_neighbor_threshold(
  # # embedding_matrix_2,
  # # threshold=threshold,
  # # batch_size=batch_size,
+ # # show_progressbar=False # Disable internal progress bar
  # # )
 
+ # # total_items = len(embedding_matrix_2)
+ # # # Processing duplicates with a progress bar
+ # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
  # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
 
  # # if similar_indices:
 
  # # threshold=default_threshold,
  # # progress=gr.Progress(track_tqdm=True)
  # # ):
 
  # # try:
  # # # Convert threshold to float
  # # threshold = float(threshold)
+
  # # # Initialize status message
  # # status = ""
 
  # # ds = ds_default1
  # # else:
  # # ds = load_dataset(dataset1_name, split=dataset1_split)
+
  # # # Extract texts
  # # status = "Extracting texts from Dataset 1..."
  # # yield status, ""
  # # texts = [example[dataset1_text_column] for example in ds]
+
  # # # Compute embeddings
  # # status = "Computing embeddings for Dataset 1..."
  # # yield status, ""
+ # # embedding_matrix = compute_embeddings(texts, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
+
  # # # Deduplicate
  # # status = "Deduplicating embeddings..."
  # # yield status, ""
  # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
+ # # embedding_matrix, threshold, progress=progress
  # # )
+
  # # # Prepare the results
  # # num_duplicates = len(duplicate_to_original_mapping)
  # # num_total = len(texts)
  # # num_deduplicated = len(deduplicated_indices)
+
  # # result_text = f"**Total documents:** {num_total}\n"
  # # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
  # # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
+
  # # # Show deduplicated examples
  # # if num_duplicates > 0:
  # # result_text += "**Examples of duplicates found:**\n\n"
 
  # # result_text += "-" * 50 + "\n\n"
  # # else:
  # # result_text += "No duplicates found."
+
  # # # Final status
  # # status = "Deduplication completed."
  # # yield status, result_text
+
  # # elif deduplication_type == "Cross-dataset":
  # # # Load Dataset 1
  # # status = "Loading Dataset 1..."
 
  # # ds1 = ds_default1
  # # else:
  # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
+
  # # # Load Dataset 2
  # # status = "Loading Dataset 2..."
  # # yield status, ""
 
  # # ds2 = ds_default2
  # # else:
  # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
+
  # # # Extract texts from Dataset 1
  # # status = "Extracting texts from Dataset 1..."
  # # yield status, ""
  # # texts1 = [example[dataset1_text_column] for example in ds1]
+
  # # # Extract texts from Dataset 2
  # # status = "Extracting texts from Dataset 2..."
  # # yield status, ""
  # # texts2 = [example[dataset2_text_column] for example in ds2]
+
  # # # Compute embeddings for Dataset 1
  # # status = "Computing embeddings for Dataset 1..."
  # # yield status, ""
+ # # embedding_matrix1 = compute_embeddings(texts1, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
+
  # # # Compute embeddings for Dataset 2
  # # status = "Computing embeddings for Dataset 2..."
  # # yield status, ""
+ # # embedding_matrix2 = compute_embeddings(texts2, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 2")
+
  # # # Deduplicate across datasets
  # # status = "Deduplicating embeddings across datasets..."
  # # yield status, ""
  # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
+ # # embedding_matrix1, embedding_matrix2, threshold, progress=progress
  # # )
+
  # # num_duplicates = len(duplicate_indices_in_ds2)
  # # num_total_ds2 = len(texts2)
  # # num_unique_ds2 = num_total_ds2 - num_duplicates
+
+ # # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n\n"
+ # # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n\n"
  # # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
+
  # # # Show deduplicated examples
  # # if num_duplicates > 0:
  # # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
 
  # # result_text += "-" * 50 + "\n\n"
  # # else:
  # # result_text += "No duplicates found."
+
  # # # Final status
  # # status = "Deduplication completed."
  # # yield status, result_text
 
+ # # except Exception as e:
+ # # yield f"An error occurred: {e}", ""
+ # # raise e
 
  # # with gr.Blocks() as demo:
  # # gr.Markdown("# Semantic Deduplication")
 
  # # compute_button.click(
  # # fn=perform_deduplication,
  # # inputs=[
+ # # deduplication_type,
+ # # dataset1_name,
+ # # dataset1_split,
  # # dataset1_text_column,
+ # # dataset2_name,
+ # # dataset2_split,
  # # dataset2_text_column,
  # # threshold
  # # ],
  # # outputs=[status_output, result_output]
  # # )
+
  # # demo.launch()
 
+
+ # # # import gradio as gr
989
+ # # # from datasets import load_dataset
990
+ # # # import numpy as np
991
+ # # # from model2vec import StaticModel
992
+ # # # from reach import Reach
993
+ # # # from difflib import ndiff
994
+ # # # import sys
995
+ # # # import tqdm
996
+
997
+ # # # # Load the model at startup
998
+ # # # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
999
+
1000
+ # # # # Update default dataset to 'sst2' and set default threshold to 0.9
1001
+ # # # default_dataset1_name = "sst2"
1002
+ # # # default_dataset1_split = "train"
1003
+ # # # default_dataset2_name = "sst2"
1004
+ # # # default_dataset2_split = "validation"
1005
+ # # # default_text_column = "sentence"
1006
+ # # # default_threshold = 0.9
1007
+
1008
+ # # # # Load the default datasets at startup
1009
+ # # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
1010
+ # # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
1011
+
1012
+ # # # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024) -> tuple[np.ndarray, dict[int, int]]:
1013
+ # # # """
1014
+ # # # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
1015
+ # # # """
1016
+ # # # # Building the index
1017
+ # # # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
1018
+
1019
+ # # # deduplicated_indices = set(range(len(embedding_matrix)))
1020
+ # # # duplicate_to_original_mapping = {}
1021
+
1022
+ # # # # Finding nearest neighbors
1023
+ # # # results = reach.nearest_neighbor_threshold(
1024
+ # # # embedding_matrix,
1025
+ # # # threshold=threshold,
1026
+ # # # batch_size=batch_size,
1027
+ # # # show_progressbar=True # Allow internal progress bar
1028
+ # # # )
1029
+
1030
+ # # # # Processing duplicates
1031
+ # # # for i, similar_items in enumerate(results):
1032
+ # # # if i not in deduplicated_indices:
1033
+ # # # continue
1034
+
1035
+ # # # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
1036
+
1037
+ # # # for sim_idx in similar_indices:
1038
+ # # # if sim_idx in deduplicated_indices:
1039
+ # # # deduplicated_indices.remove(sim_idx)
1040
+ # # # duplicate_to_original_mapping[sim_idx] = i
1041
+
1042
+ # # # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
1043
+
1044
+ # # # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024) -> tuple[list[int], dict[int, int]]:
1045
+ # # # """
1046
+ # # # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
1047
+ # # # """
1048
+ # # # # Building the index from Dataset 1
1049
+ # # # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
1050
+
1051
+ # # # duplicate_indices_in_test = []
1052
+ # # # duplicate_to_original_mapping = {}
1053
+
1054
+ # # # # Finding nearest neighbors between datasets
1055
+ # # # results = reach.nearest_neighbor_threshold(
1056
+ # # # embedding_matrix_2,
1057
+ # # # threshold=threshold,
1058
+ # # # batch_size=batch_size,
1059
+ # # # show_progressbar=True # Allow internal progress bar
1060
+ # # # )
1061
+
1062
+ # # # # Processing duplicates
1063
+ # # # for i, similar_items in enumerate(results):
1064
+ # # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
1065
+
1066
+ # # # if similar_indices:
1067
+ # # # duplicate_indices_in_test.append(i)
1068
+ # # # duplicate_to_original_mapping[i] = similar_indices[0]
1069
+
1070
+ # # # return duplicate_indices_in_test, duplicate_to_original_mapping
1071
+
1072
+ # # # def display_word_differences(x: str, y: str) -> str:
1073
+ # # # diff = ndiff(x.split(), y.split())
1074
+ # # # return " ".join([word for word in diff if word.startswith(('+', '-'))])
1075
+
1076
+ # # # def perform_deduplication(
1077
+ # # # deduplication_type,
1078
+ # # # dataset1_name,
1079
+ # # # dataset1_split,
1080
+ # # # dataset1_text_column,
1081
+ # # # dataset2_name="",
1082
+ # # # dataset2_split="",
1083
+ # # # dataset2_text_column="",
1084
+ # # # threshold=default_threshold,
1085
+ # # # progress=gr.Progress(track_tqdm=True)
1086
+ # # # ):
1087
+ # # # # Deep Monkey-Patching of tqdm
1088
+ # # # original_tqdm = tqdm.tqdm
1089
+ # # # tqdm.tqdm = progress.tqdm
1090
+ # # # for mod_name in list(sys.modules.keys()):
1091
+ # # # if 'tqdm' in mod_name:
1092
+ # # # sys.modules[mod_name].tqdm = progress.tqdm
1093
+
1094
+ # # # try:
1095
+ # # # # Convert threshold to float
1096
+ # # # threshold = float(threshold)
1097
 
1098
+ # # # # Initialize status message
1099
+ # # # status = ""
1100
+
1101
+ # # # if deduplication_type == "Single dataset":
1102
+ # # # # Load Dataset 1
1103
+ # # # status = "Loading Dataset 1..."
1104
+ # # # yield status, ""
1105
+ # # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1106
+ # # # ds = ds_default1
1107
+ # # # else:
1108
+ # # # ds = load_dataset(dataset1_name, split=dataset1_split)
1109
 
1110
+ # # # # Extract texts
1111
+ # # # status = "Extracting texts from Dataset 1..."
1112
+ # # # yield status, ""
1113
+ # # # texts = [example[dataset1_text_column] for example in ds]
1114
 
1115
+ # # # # Compute embeddings
1116
+ # # # status = "Computing embeddings for Dataset 1..."
1117
+ # # # yield status, ""
1118
+ # # # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
1119
 
1120
+ # # # # Deduplicate
1121
+ # # # status = "Deduplicating embeddings..."
1122
+ # # # yield status, ""
1123
+ # # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
1124
+ # # # embedding_matrix, threshold
1125
+ # # # )
1126
+
1127
+ # # # # Prepare the results
1128
+ # # # num_duplicates = len(duplicate_to_original_mapping)
1129
+ # # # num_total = len(texts)
1130
+ # # # num_deduplicated = len(deduplicated_indices)
1131
+
1132
+ # # # result_text = f"**Total documents:** {num_total}\n"
1133
+ # # # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
1134
+ # # # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
1135
 
1136
+ # # # # Show deduplicated examples
1137
+ # # # if num_duplicates > 0:
1138
+ # # # result_text += "**Examples of duplicates found:**\n\n"
1139
+ # # # num_examples = min(5, num_duplicates)
1140
+ # # # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
1141
+ # # # original_text = texts[original_idx]
1142
+ # # # duplicate_text = texts[duplicate_idx]
1143
+ # # # differences = display_word_differences(original_text, duplicate_text)
1144
+ # # # result_text += f"**Original text:**\n{original_text}\n\n"
1145
+ # # # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
1146
+ # # # result_text += f"**Differences:**\n{differences}\n"
1147
+ # # # result_text += "-" * 50 + "\n\n"
1148
+ # # # else:
1149
+ # # # result_text += "No duplicates found."
1150
+
1151
+ # # # # Final status
1152
+ # # # status = "Deduplication completed."
1153
+ # # # yield status, result_text
1154
 
1155
+ # # # elif deduplication_type == "Cross-dataset":
1156
+ # # # # Load Dataset 1
1157
+ # # # status = "Loading Dataset 1..."
1158
+ # # # yield status, ""
1159
+ # # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1160
+ # # # ds1 = ds_default1
1161
+ # # # else:
1162
+ # # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
1163
 
1164
+ # # # # Load Dataset 2
1165
+ # # # status = "Loading Dataset 2..."
1166
+ # # # yield status, ""
1167
+ # # # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
1168
+ # # # ds2 = ds_default2
1169
+ # # # else:
1170
+ # # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
1171
 
1172
+ # # # # Extract texts from Dataset 1
1173
+ # # # status = "Extracting texts from Dataset 1..."
1174
+ # # # yield status, ""
1175
+ # # # texts1 = [example[dataset1_text_column] for example in ds1]
1176
 
1177
+ # # # # Extract texts from Dataset 2
1178
+ # # # status = "Extracting texts from Dataset 2..."
1179
+ # # # yield status, ""
1180
+ # # # texts2 = [example[dataset2_text_column] for example in ds2]
1181
 
1182
+ # # # # Compute embeddings for Dataset 1
1183
+ # # # status = "Computing embeddings for Dataset 1..."
1184
+ # # # yield status, ""
1185
+ # # # embedding_matrix1 = model.encode(texts1, show_progressbar=True)
1186
 
1187
+ # # # # Compute embeddings for Dataset 2
1188
+ # # # status = "Computing embeddings for Dataset 2..."
1189
+ # # # yield status, ""
1190
+ # # # embedding_matrix2 = model.encode(texts2, show_progressbar=True)
1191
 
1192
+ # # # # Deduplicate across datasets
1193
+ # # # status = "Deduplicating embeddings across datasets..."
1194
+ # # # yield status, ""
1195
+ # # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
1196
+ # # # embedding_matrix1, embedding_matrix2, threshold
1197
+ # # # )
1198
 
1199
+ # # # num_duplicates = len(duplicate_indices_in_ds2)
1200
+ # # # num_total_ds2 = len(texts2)
1201
+ # # # num_unique_ds2 = num_total_ds2 - num_duplicates
1202
+
1203
+ # # # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
1204
+ # # # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
1205
+ # # # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
1206
+
1207
+ # # # # Show deduplicated examples
1208
+ # # # if num_duplicates > 0:
1209
+ # # # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
1210
+ # # # num_examples = min(5, num_duplicates)
1211
+ # # # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
1212
+ # # # original_idx = duplicate_to_original_mapping[duplicate_idx]
1213
+ # # # original_text = texts1[original_idx]
1214
+ # # # duplicate_text = texts2[duplicate_idx]
1215
+ # # # differences = display_word_differences(original_text, duplicate_text)
1216
+ # # # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
1217
+ # # # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
1218
+ # # # result_text += f"**Differences:**\n{differences}\n"
1219
+ # # # result_text += "-" * 50 + "\n\n"
1220
+ # # # else:
1221
+ # # # result_text += "No duplicates found."
1222
+
1223
+ # # # # Final status
1224
+ # # # status = "Deduplication completed."
+ # # # yield status, result_text
 
+ # # # finally:
1228
+ # # # # Restore original tqdm
1229
+ # # # tqdm.tqdm = original_tqdm
1230
+ # # # for mod_name in list(sys.modules.keys()):
1231
+ # # # if 'tqdm' in mod_name:
1232
+ # # # sys.modules[mod_name].tqdm = original_tqdm
1233
 
1234
+ # # # with gr.Blocks() as demo:
1235
+ # # # gr.Markdown("# Semantic Deduplication")
 
 
1236
 
1237
+ # # # deduplication_type = gr.Radio(
1238
+ # # # choices=["Single dataset", "Cross-dataset"],
1239
+ # # # label="Deduplication Type",
1240
+ # # # value="Single dataset"
1241
+ # # # )
 
 
1242
 
1243
+ # # # with gr.Row():
1244
+ # # # dataset1_name = gr.Textbox(value=default_dataset1_name, label="Dataset 1 Name")
1245
+ # # # dataset1_split = gr.Textbox(value=default_dataset1_split, label="Dataset 1 Split")
1246
+ # # # dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
 
 
1247
 
1248
+ # # # dataset2_inputs = gr.Column(visible=False)
1249
+ # # # with dataset2_inputs:
1250
+ # # # gr.Markdown("### Dataset 2")
1251
+ # # # with gr.Row():
1252
+ # # # dataset2_name = gr.Textbox(value=default_dataset2_name, label="Dataset 2 Name")
1253
+ # # # dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
1254
+ # # # dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
1255
 
1256
+ # # # threshold = gr.Slider(
1257
+ # # # minimum=0.0,
1258
+ # # # maximum=1.0,
1259
+ # # # value=default_threshold,
1260
+ # # # label="Similarity Threshold"
1261
+ # # # )
1262
 
1263
+ # # # compute_button = gr.Button("Compute")
 
 
 
 
 
1264
 
1265
+ # # # status_output = gr.Markdown()
1266
+ # # # result_output = gr.Markdown()
 
 
 
1267
 
1268
+ # # # # Function to update the visibility of dataset2_inputs
1269
+ # # # def update_visibility(deduplication_type_value):
1270
+ # # # if deduplication_type_value == "Cross-dataset":
1271
+ # # # return gr.update(visible=True)
1272
+ # # # else:
1273
+ # # # return gr.update(visible=False)
 
 
 
 
 
 
 
 
 
 
1274
 
1275
+ # # # deduplication_type.change(
1276
+ # # # update_visibility,
1277
+ # # # inputs=deduplication_type,
1278
+ # # # outputs=dataset2_inputs
1279
+ # # # )
1280
 
1281
+ # # # compute_button.click(
1282
+ # # # fn=perform_deduplication,
1283
+ # # # inputs=[
1284
+ # # # deduplication_type,
1285
+ # # # dataset1_name,
1286
+ # # # dataset1_split,
1287
+ # # # dataset1_text_column,
1288
+ # # # dataset2_name,
1289
+ # # # dataset2_split,
1290
+ # # # dataset2_text_column,
1291
+ # # # threshold
1292
+ # # # ],
1293
+ # # # outputs=[status_output, result_output]
1294
+ # # # )
1295
+
1296
+ # # # demo.launch()
1297
 
1298
 
1299
+ # # # import gradio as gr
1300
+ # # # from datasets import load_dataset
1301
+ # # # import numpy as np
1302
+ # # # from model2vec import StaticModel
1303
+ # # # from reach import Reach
1304
+ # # # from difflib import ndiff
1305
+ # # # import sys
1306
+ # # # import tqdm
1307
 
1308
+ # # # # Load the model at startup
1309
+ # # # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
1310
 
1311
+ # # # # Update default dataset to 'sst2' and set default threshold to 0.9
1312
+ # # # default_dataset1_name = "sst2"
1313
+ # # # default_dataset1_split = "train"
1314
+ # # # default_dataset2_name = "sst2"
1315
+ # # # default_dataset2_split = "validation"
1316
+ # # # default_text_column = "sentence"
1317
+ # # # default_threshold = 0.9
1318
 
1319
+ # # # # Load the default datasets at startup
1320
+ # # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
1321
+ # # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
1322
 
1323
+ # # # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
1324
+ # # # """
1325
+ # # # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
1326
+ # # # """
1327
+ # # # # Update progress to indicate building the index
1328
+ # # # progress(0, desc="Building search index...")
1329
+ # # # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
1330
 
1331
+ # # # deduplicated_indices = set(range(len(embedding_matrix)))
1332
+ # # # duplicate_to_original_mapping = {}
1333
 
1334
+ # # # # Finding nearest neighbors
1335
+ # # # progress(0, desc="Finding nearest neighbors...")
1336
+ # # # results = reach.nearest_neighbor_threshold(
1337
+ # # # embedding_matrix,
1338
+ # # # threshold=threshold,
1339
+ # # # batch_size=batch_size,
1340
+ # # # show_progressbar=True # Allow internal progress bar
1341
+ # # # )
1342
 
1343
+ # # # # Processing duplicates with a progress bar
1344
+ # # # total_items = len(embedding_matrix)
1345
+ # # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
1346
+ # # # if i not in deduplicated_indices:
1347
+ # # # continue
1348
 
1349
+ # # # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
1350
 
1351
+ # # # for sim_idx in similar_indices:
1352
+ # # # if sim_idx in deduplicated_indices:
1353
+ # # # deduplicated_indices.remove(sim_idx)
1354
+ # # # duplicate_to_original_mapping[sim_idx] = i
1355
 
1356
+ # # # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
1357
 
1358
+ # # # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
1359
+ # # # """
1360
+ # # # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
1361
+ # # # """
1362
+ # # # # Update progress to indicate building the index
1363
+ # # # progress(0, desc="Building search index from Dataset 1...")
1364
+ # # # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
1365
 
1366
+ # # # duplicate_indices_in_test = []
1367
+ # # # duplicate_to_original_mapping = {}
1368
 
1369
+ # # # # Finding nearest neighbors between datasets
1370
+ # # # progress(0, desc="Finding nearest neighbors between datasets...")
1371
+ # # # results = reach.nearest_neighbor_threshold(
1372
+ # # # embedding_matrix_2,
1373
+ # # # threshold=threshold,
1374
+ # # # batch_size=batch_size,
1375
+ # # # show_progressbar=True # Allow internal progress bar
1376
+ # # # )
1377
 
1378
+ # # # total_items = len(embedding_matrix_2)
1379
+ # # # # Processing duplicates with a progress bar
1380
+ # # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
1381
+ # # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
1382
 
1383
+ # # # if similar_indices:
1384
+ # # # duplicate_indices_in_test.append(i)
1385
+ # # # duplicate_to_original_mapping[i] = similar_indices[0]
1386
 
1387
+ # # # return duplicate_indices_in_test, duplicate_to_original_mapping
1388
 
1389
+ # # # def display_word_differences(x: str, y: str) -> str:
1390
+ # # # diff = ndiff(x.split(), y.split())
1391
+ # # # return " ".join([word for word in diff if word.startswith(('+', '-'))])
1392
 
1393
+ # # # def perform_deduplication(
1394
+ # # # deduplication_type,
1395
+ # # # dataset1_name,
1396
+ # # # dataset1_split,
1397
+ # # # dataset1_text_column,
1398
+ # # # dataset2_name="",
1399
+ # # # dataset2_split="",
1400
+ # # # dataset2_text_column="",
1401
+ # # # threshold=default_threshold,
1402
+ # # # progress=gr.Progress(track_tqdm=True)
1403
+ # # # ):
1404
+ # # # # Monkey-patch tqdm
1405
+ # # # original_tqdm = tqdm.tqdm
1406
+ # # # original_reach_tqdm = Reach.__dict__['tqdm'] if 'tqdm' in Reach.__dict__ else None
1407
+ # # # tqdm.tqdm = progress.tqdm
1408
+ # # # sys.modules['tqdm'].tqdm = progress.tqdm
1409
+ # # # sys.modules['tqdm.auto'].tqdm = progress.tqdm
1410
+ # # # Reach.tqdm = progress.tqdm # Monkey-patch reach's tqdm
1411
 
1412
+ # # # try:
1413
+ # # # # Convert threshold to float
1414
+ # # # threshold = float(threshold)
1415
 
1416
+ # # # if deduplication_type == "Single dataset":
1417
+ # # # # Load Dataset 1
1418
+ # # # progress(0, desc="Loading Dataset 1...")
1419
+ # # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1420
+ # # # ds = ds_default1
1421
+ # # # else:
1422
+ # # # ds = load_dataset(dataset1_name, split=dataset1_split)
 
 
 
 
 
 
 
 
1423
 
1424
+ # # # # Extract texts
1425
+ # # # progress(0, desc="Extracting texts from Dataset 1...")
1426
+ # # # texts = [example[dataset1_text_column] for example in ds]
 
1427
 
1428
+ # # # # Compute embeddings
1429
+ # # # progress(0, desc="Computing embeddings for Dataset 1...")
1430
+ # # # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
1431
 
1432
+ # # # # Deduplicate
1433
+ # # # result_text = deduplicate_and_prepare_results_single(
1434
+ # # # embedding_matrix, texts, threshold, progress
1435
+ # # # )
 
 
 
 
 
 
 
1436
 
1437
+ # # # return result_text
1438
 
1439
+ # # # elif deduplication_type == "Cross-dataset":
1440
+ # # # # Load Dataset 1
1441
+ # # # progress(0, desc="Loading Dataset 1...")
1442
+ # # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1443
+ # # # ds1 = ds_default1
1444
+ # # # else:
1445
+ # # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
 
 
 
 
 
1446
 
1447
+ # # # # Load Dataset 2
1448
+ # # # progress(0, desc="Loading Dataset 2...")
1449
+ # # # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
1450
+ # # # ds2 = ds_default2
1451
+ # # # else:
1452
+ # # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
1453
 
1454
+ # # # # Extract texts from Dataset 1
1455
+ # # # progress(0, desc="Extracting texts from Dataset 1...")
1456
+ # # # texts1 = [example[dataset1_text_column] for example in ds1]
1457
 
1458
+ # # # # Extract texts from Dataset 2
1459
+ # # # progress(0, desc="Extracting texts from Dataset 2...")
1460
+ # # # texts2 = [example[dataset2_text_column] for example in ds2]
1461
 
1462
+ # # # # Compute embeddings for Dataset 1
1463
+ # # # progress(0, desc="Computing embeddings for Dataset 1...")
1464
+ # # # embedding_matrix1 = model.encode(texts1, show_progressbar=True)
1465
 
1466
+ # # # # Compute embeddings for Dataset 2
1467
+ # # # progress(0, desc="Computing embeddings for Dataset 2...")
1468
+ # # # embedding_matrix2 = model.encode(texts2, show_progressbar=True)
1469
 
1470
+ # # # # Deduplicate across datasets
1471
+ # # # result_text = deduplicate_and_prepare_results_cross(
1472
+ # # # embedding_matrix1, embedding_matrix2, texts1, texts2, threshold, progress, dataset2_name, dataset2_split
1473
+ # # # )
 
 
 
 
 
 
 
 
1474
 
1475
+ # # # return result_text
1476
 
1477
+ # # # finally:
1478
+ # # # # Restore original tqdm
1479
+ # # # tqdm.tqdm = original_tqdm
1480
+ # # # sys.modules['tqdm'].tqdm = original_tqdm
1481
+ # # # sys.modules['tqdm.auto'].tqdm = original_tqdm
1482
 
1483
+ # # # # Restore reach's original tqdm
1484
+ # # # if original_reach_tqdm is not None:
1485
+ # # # Reach.tqdm = original_reach_tqdm
1486
+ # # # else:
1487
+ # # # del Reach.tqdm # If it wasn't originally in Reach's __dict__
1488
 
1489
+ # # # def deduplicate_and_prepare_results_single(embedding_matrix, texts, threshold, progress):
1490
+ # # # # Deduplicate
1491
+ # # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
1492
+ # # # embedding_matrix, threshold, progress=progress
1493
+ # # # )
 
 
 
1494
 
1495
+ # # # # Prepare the results
1496
+ # # # num_duplicates = len(duplicate_to_original_mapping)
1497
+ # # # num_total = len(texts)
1498
+ # # # num_deduplicated = len(deduplicated_indices)
1499
 
1500
+ # # # result_text = f"**Total documents:** {num_total}\n"
1501
+ # # # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
1502
+ # # # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
 
 
 
 
1503
 
1504
+ # # # # Show deduplicated examples
1505
+ # # # if num_duplicates > 0:
1506
+ # # # result_text += "**Examples of duplicates found:**\n\n"
1507
+ # # # num_examples = min(5, num_duplicates)
1508
+ # # # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
1509
+ # # # original_text = texts[original_idx]
1510
+ # # # duplicate_text = texts[duplicate_idx]
1511
+ # # # differences = display_word_differences(original_text, duplicate_text)
1512
+ # # # result_text += f"**Original text:**\n{original_text}\n\n"
1513
+ # # # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
1514
+ # # # result_text += f"**Differences:**\n{differences}\n"
1515
+ # # # result_text += "-" * 50 + "\n\n"
1516
+ # # # else:
1517
+ # # # result_text += "No duplicates found."
1518
 
1519
+ # # # return result_text
1520
+
1521
+ # # # def deduplicate_and_prepare_results_cross(embedding_matrix1, embedding_matrix2, texts1, texts2, threshold, progress, dataset2_name, dataset2_split):
1522
+ # # # # Deduplicate across datasets
1523
+ # # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
1524
+ # # # embedding_matrix1, embedding_matrix2, threshold, progress=progress
1525
+ # # # )
1526
 
1527
+ # # # num_duplicates = len(duplicate_indices_in_ds2)
1528
+ # # # num_total_ds2 = len(texts2)
1529
+ # # # num_unique_ds2 = num_total_ds2 - num_duplicates
1530
 
1531
+ # # # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
1532
+ # # # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
1533
+ # # # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
 
 
 
1534
 
1535
+ # # # # Show deduplicated examples
1536
+ # # # if num_duplicates > 0:
1537
+ # # # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
1538
+ # # # num_examples = min(5, num_duplicates)
1539
+ # # # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
1540
+ # # # original_idx = duplicate_to_original_mapping[duplicate_idx]
1541
+ # # # original_text = texts1[original_idx]
1542
+ # # # duplicate_text = texts2[duplicate_idx]
1543
+ # # # differences = display_word_differences(original_text, duplicate_text)
1544
+ # # # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
1545
+ # # # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
1546
+ # # # result_text += f"**Differences:**\n{differences}\n"
1547
+ # # # result_text += "-" * 50 + "\n\n"
1548
+ # # # else:
1549
+ # # # result_text += "No duplicates found."
1550
 
1551
+ # # # return result_text
1552
+
1553
+ # # # with gr.Blocks() as demo:
1554
+ # # # gr.Markdown("# Semantic Deduplication")
1555
+
1556
+ # # # deduplication_type = gr.Radio(
1557
+ # # # choices=["Single dataset", "Cross-dataset"],
1558
+ # # # label="Deduplication Type",
1559
+ # # # value="Single dataset"
1560
+ # # # )
1561
+
1562
+ # # # with gr.Row():
1563
+ # # # dataset1_name = gr.Textbox(value=default_dataset1_name, label="Dataset 1 Name")
1564
+ # # # dataset1_split = gr.Textbox(value=default_dataset1_split, label="Dataset 1 Split")
1565
+ # # # dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
1566
+
1567
+ # # # dataset2_inputs = gr.Column(visible=False)
1568
+ # # # with dataset2_inputs:
1569
+ # # # gr.Markdown("### Dataset 2")
1570
+ # # # with gr.Row():
1571
+ # # # dataset2_name = gr.Textbox(value=default_dataset2_name, label="Dataset 2 Name")
1572
+ # # # dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
1573
+ # # # dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
1574
+
1575
+ # # # threshold = gr.Slider(
1576
+ # # # minimum=0.0,
1577
+ # # # maximum=1.0,
1578
+ # # # value=default_threshold,
1579
+ # # # label="Similarity Threshold"
1580
+ # # # )
1581
+
1582
+ # # # compute_button = gr.Button("Compute")
1583
+
1584
+ # # # output = gr.Markdown()
1585
+
1586
+ # # # # Function to update the visibility of dataset2_inputs
1587
+ # # # def update_visibility(deduplication_type_value):
1588
+ # # # if deduplication_type_value == "Cross-dataset":
1589
+ # # # return gr.update(visible=True)
1590
+ # # # else:
1591
+ # # # return gr.update(visible=False)
1592
+
1593
+ # # # deduplication_type.change(
1594
+ # # # update_visibility,
1595
+ # # # inputs=deduplication_type,
1596
+ # # # outputs=dataset2_inputs
1597
+ # # # )
1598
+
1599
+ # # # compute_button.click(
1600
+ # # # fn=perform_deduplication,
1601
+ # # # inputs=[
1602
+ # # # deduplication_type,
1603
+ # # # dataset1_name,
1604
+ # # # dataset1_split,
1605
+ # # # dataset1_text_column,
1606
+ # # # dataset2_name,
1607
+ # # # dataset2_split,
1608
+ # # # dataset2_text_column,
1609
+ # # # threshold
1610
+ # # # ],
1611
+ # # # outputs=output
1612
+ # # # )
1613
 
1614
+ # # # demo.launch()
1615
+
1616
+
1617
 
1618
 
1619
  # # # import gradio as gr
 
1654
  # # # )
1655
 
1656
  # # # # Process duplicates
1657
+ # # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=len(embedding_matrix))):
1658
  # # # if i not in deduplicated_indices:
1659
  # # # continue
1660
 
 
1683
  # # # show_progressbar=True # Allow internal progress bar
1684
  # # # )
1685
 
1686
+ # # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=len(embedding_matrix_2))):
 
1687
  # # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
1688
 
1689
  # # # if similar_indices:
 
1709
  # # # ):
1710
  # # # # Monkey-patch tqdm
1711
  # # # original_tqdm = tqdm.tqdm
1712
+ # # # original_reach_tqdm = Reach.__dict__['tqdm'] if 'tqdm' in Reach.__dict__ else None
1713
  # # # tqdm.tqdm = progress.tqdm
1714
  # # # sys.modules['tqdm'].tqdm = progress.tqdm
1715
  # # # sys.modules['tqdm.auto'].tqdm = progress.tqdm
1716
+ # # # Reach.tqdm = progress.tqdm # Monkey-patch reach's tqdm
1717
 
1718
  # # # try:
1719
  # # # # Convert threshold to float
 
1780
  # # # embedding_matrix2 = model.encode(texts2, show_progressbar=True) # Enable internal progress bar
1781
 
1782
  # # # # Deduplicate across datasets
1783
+ # # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
1784
+ # # # embedding_matrix1, embedding_matrix2, threshold, progress=progress)
1785
 
1786
  # # # num_duplicates = len(duplicate_indices_in_ds2)
1787
  # # # num_total_ds2 = len(texts2)
 
1812
  # # # sys.modules['tqdm'].tqdm = original_tqdm
1813
  # # # sys.modules['tqdm.auto'].tqdm = original_tqdm
1814
 
1815
+ # # # # Restore reach's original tqdm
1816
+ # # # if original_reach_tqdm is not None:
1817
+ # # # Reach.tqdm = original_reach_tqdm
1818
+ # # # else:
1819
+ # # # del Reach.tqdm # If it wasn't originally in Reach's __dict__
1820
+
1821
  # # # with gr.Blocks() as demo:
1822
  # # # gr.Markdown("# Semantic Deduplication")
1823
 
 
1880
  # # # )
1881
 
1882
  # # # demo.launch()
1883
+
1884
+
1885
+ # # # # import gradio as gr
1886
+ # # # # from datasets import load_dataset
1887
+ # # # # import numpy as np
1888
+ # # # # from model2vec import StaticModel
1889
+ # # # # from reach import Reach
1890
+ # # # # from difflib import ndiff
1891
+ # # # # import sys
1892
+ # # # # import tqdm
1893
+
1894
+ # # # # # Load the model at startup
1895
+ # # # # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
1896
+
1897
+ # # # # # Load the default datasets at startup
1898
+ # # # # default_dataset1_name = "ag_news"
1899
+ # # # # default_dataset1_split = "train"
1900
+ # # # # default_dataset2_name = "ag_news"
1901
+ # # # # default_dataset2_split = "test"
1902
+
1903
+ # # # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
1904
+ # # # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
1905
+
1906
+ # # # # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
1907
+ # # # # """
1908
+ # # # # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
1909
+ # # # # """
1910
+ # # # # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
1911
+
1912
+ # # # # deduplicated_indices = set(range(len(embedding_matrix)))
1913
+ # # # # duplicate_to_original_mapping = {}
1914
+
1915
+ # # # # results = reach.nearest_neighbor_threshold(
1916
+ # # # # embedding_matrix,
1917
+ # # # # threshold=threshold,
1918
+ # # # # batch_size=batch_size,
1919
+ # # # # show_progressbar=True # Allow internal progress bar
1920
+ # # # # )
1921
+
1922
+ # # # # # Process duplicates
1923
+ # # # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates")):
1924
+ # # # # if i not in deduplicated_indices:
1925
+ # # # # continue
1926
+
1927
+ # # # # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
1928
+
1929
+ # # # # for sim_idx in similar_indices:
1930
+ # # # # if sim_idx in deduplicated_indices:
1931
+ # # # # deduplicated_indices.remove(sim_idx)
1932
+ # # # # duplicate_to_original_mapping[sim_idx] = i
1933
+
1934
+ # # # # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
1935
+
1936
+ # # # # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
1937
+ # # # # """
1938
+ # # # # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
1939
+ # # # # """
1940
+ # # # # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
1941
+
1942
+ # # # # duplicate_indices_in_test = []
1943
+ # # # # duplicate_to_original_mapping = {}
1944
+
1945
+ # # # # results = reach.nearest_neighbor_threshold(
1946
+ # # # # embedding_matrix_2,
1947
+ # # # # threshold=threshold,
1948
+ # # # # batch_size=batch_size,
1949
+ # # # # show_progressbar=True # Allow internal progress bar
1950
+ # # # # )
1951
+
1952
+ # # # # # Process duplicates
1953
+ # # # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets")):
1954
+ # # # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
1955
+
1956
+ # # # # if similar_indices:
1957
+ # # # # duplicate_indices_in_test.append(i)
1958
+ # # # # duplicate_to_original_mapping[i] = similar_indices[0]
1959
+
1960
+ # # # # return duplicate_indices_in_test, duplicate_to_original_mapping
1961
+
1962
+ # # # # def display_word_differences(x: str, y: str) -> str:
1963
+ # # # # diff = ndiff(x.split(), y.split())
1964
+ # # # # return " ".join([word for word in diff if word.startswith(('+', '-'))])
1965
+
1966
+ # # # # def perform_deduplication(
1967
+ # # # # deduplication_type,
1968
+ # # # # dataset1_name,
1969
+ # # # # dataset1_split,
1970
+ # # # # dataset1_text_column,
1971
+ # # # # dataset2_name="",
1972
+ # # # # dataset2_split="",
1973
+ # # # # dataset2_text_column="",
1974
+ # # # # threshold=0.8,
1975
+ # # # # progress=gr.Progress(track_tqdm=True)
1976
+ # # # # ):
1977
+ # # # # # Monkey-patch tqdm
1978
+ # # # # original_tqdm = tqdm.tqdm
1979
+ # # # # tqdm.tqdm = progress.tqdm
1980
+ # # # # sys.modules['tqdm'].tqdm = progress.tqdm
1981
+ # # # # sys.modules['tqdm.auto'].tqdm = progress.tqdm
1982
+
1983
+ # # # # try:
1984
+ # # # # # Convert threshold to float
1985
+ # # # # threshold = float(threshold)
1986
+
1987
+ # # # # if deduplication_type == "Single dataset":
1988
+ # # # # # Check if the dataset is the default one
1989
+ # # # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1990
+ # # # # ds = ds_default1
1991
+ # # # # else:
1992
+ # # # # ds = load_dataset(dataset1_name, split=dataset1_split)
1993
+
1994
+ # # # # # Extract texts
1995
+ # # # # texts = [example[dataset1_text_column] for example in ds]
1996
+
1997
+ # # # # # Compute embeddings
1998
+ # # # # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
1999
+
2000
+ # # # # # Deduplicate
2001
+ # # # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(embedding_matrix, threshold, progress=progress)
2002
+
2003
+ # # # # # Prepare the results
2004
+ # # # # num_duplicates = len(duplicate_to_original_mapping)
2005
+ # # # # num_total = len(texts)
2006
+ # # # # num_deduplicated = len(deduplicated_indices)
2007
+
2008
+ # # # # result_text = f"**Total documents:** {num_total}\n"
2009
+ # # # # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
2010
+ # # # # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
2011
+
2012
+ # # # # # Show deduplicated examples
2013
+ # # # # result_text += "**Examples of duplicates found:**\n\n"
2014
+ # # # # num_examples = min(5, num_duplicates)
2015
+ # # # # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
2016
+ # # # # original_text = texts[original_idx]
2017
+ # # # # duplicate_text = texts[duplicate_idx]
2018
+ # # # # differences = display_word_differences(original_text, duplicate_text)
2019
+ # # # # result_text += f"**Original text:**\n{original_text}\n\n"
2020
+ # # # # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
2021
+ # # # # result_text += f"**Differences:**\n{differences}\n"
2022
+ # # # # result_text += "-" * 50 + "\n\n"
2023
+
2024
+ # # # # return result_text
2025
+
2026
+ # # # # elif deduplication_type == "Cross-dataset":
2027
+ # # # # # Dataset 1
2028
+ # # # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
2029
+ # # # # ds1 = ds_default1
2030
+ # # # # else:
2031
+ # # # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
2032
+
2033
+ # # # # # Dataset 2
2034
+ # # # # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
2035
+ # # # # ds2 = ds_default2
2036
+ # # # # else:
2037
+ # # # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
2038
+
2039
+ # # # # # Extract texts
2040
+ # # # # texts1 = [example[dataset1_text_column] for example in ds1]
2041
+ # # # # texts2 = [example[dataset2_text_column] for example in ds2]
2042
+
2043
+ # # # # # Compute embeddings
2044
+ # # # # embedding_matrix1 = model.encode(texts1, show_progressbar=True) # Enable internal progress bar
2045
+ # # # # embedding_matrix2 = model.encode(texts2, show_progressbar=True) # Enable internal progress bar
2046
+
2047
+ # # # # # Deduplicate across datasets
2048
+ # # # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(embedding_matrix1, embedding_matrix2, threshold, progress=progress)
2049
+
2050
+ # # # # num_duplicates = len(duplicate_indices_in_ds2)
2051
+ # # # # num_total_ds2 = len(texts2)
2052
+ # # # # num_unique_ds2 = num_total_ds2 - num_duplicates
2053
+
2054
+ # # # # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
2055
+ # # # # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
2056
+ # # # # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
2057
+
2058
+ # # # # # Show deduplicated examples
2059
+ # # # # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
2060
+ # # # # num_examples = min(5, num_duplicates)
2061
+ # # # # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
2062
+ # # # # original_idx = duplicate_to_original_mapping[duplicate_idx]
2063
+ # # # # original_text = texts1[original_idx]
2064
+ # # # # duplicate_text = texts2[duplicate_idx]
2065
+ # # # # differences = display_word_differences(original_text, duplicate_text)
2066
+ # # # # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
2067
+ # # # # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
2068
+ # # # # result_text += f"**Differences:**\n{differences}\n"
2069
+ # # # # result_text += "-" * 50 + "\n\n"
2070
+
2071
+ # # # # return result_text
2072
+
2073
+ # # # # finally:
2074
+ # # # # # Restore original tqdm
2075
+ # # # # tqdm.tqdm = original_tqdm
2076
+ # # # # sys.modules['tqdm'].tqdm = original_tqdm
2077
+ # # # # sys.modules['tqdm.auto'].tqdm = original_tqdm
2078
+
2079
+ # # # # with gr.Blocks() as demo:
2080
+ # # # # gr.Markdown("# Semantic Deduplication")
2081
+
2082
+ # # # # deduplication_type = gr.Radio(
2083
+ # # # # choices=["Single dataset", "Cross-dataset"],
2084
+ # # # # label="Deduplication Type",
2085
+ # # # # value="Single dataset"
2086
+ # # # # )
2087
+
2088
+ # # # # with gr.Row():
2089
+ # # # # dataset1_name = gr.Textbox(value="ag_news", label="Dataset 1 Name")
2090
+ # # # # dataset1_split = gr.Textbox(value="train", label="Dataset 1 Split")
2091
+ # # # # dataset1_text_column = gr.Textbox(value="text", label="Text Column Name")
2092
+
2093
+ # # # # dataset2_inputs = gr.Column(visible=False)
2094
+ # # # # with dataset2_inputs:
2095
+ # # # # gr.Markdown("### Dataset 2")
2096
+ # # # # with gr.Row():
2097
+ # # # # dataset2_name = gr.Textbox(value="ag_news", label="Dataset 2 Name")
2098
+ # # # # dataset2_split = gr.Textbox(value="test", label="Dataset 2 Split")
2099
+ # # # # dataset2_text_column = gr.Textbox(value="text", label="Text Column Name")
2100
+
2101
+ # # # # threshold = gr.Slider(
2102
+ # # # # minimum=0.0,
2103
+ # # # # maximum=1.0,
2104
+ # # # # value=0.8,
2105
+ # # # # label="Similarity Threshold"
2106
+ # # # # )
2107
+
2108
+ # # # # compute_button = gr.Button("Compute")
2109
+
2110
+ # # # # output = gr.Markdown()
2111
+
2112
+ # # # # # Function to update the visibility of dataset2_inputs
2113
+ # # # # def update_visibility(deduplication_type_value):
2114
+ # # # # if deduplication_type_value == "Cross-dataset":
2115
+ # # # # return gr.update(visible=True)
2116
+ # # # # else:
2117
+ # # # # return gr.update(visible=False)
2118
+
2119
+ # # # # deduplication_type.change(
2120
+ # # # # update_visibility,
2121
+ # # # # inputs=deduplication_type,
2122
+ # # # # outputs=dataset2_inputs
2123
+ # # # # )
2124
+
2125
+ # # # # compute_button.click(
2126
+ # # # # fn=perform_deduplication,
2127
+ # # # # inputs=[
2128
+ # # # # deduplication_type,
2129
+ # # # # dataset1_name,
2130
+ # # # # dataset1_split,
2131
+ # # # # dataset1_text_column,
2132
+ # # # # dataset2_name,
2133
+ # # # # dataset2_split,
2134
+ # # # # dataset2_text_column,
2135
+ # # # # threshold
2136
+ # # # # ],
2137
+ # # # # outputs=output
2138
+ # # # # )
2139
+
2140
+ # # # # demo.launch()