Spaces:

minishlab
/

semantic-deduplication

Running

App Files Files Community

Pringled committed on Oct 12, 2024

Commit

39a5b1c

1 Parent(s): a847bef

Updates

Browse files

Files changed (1) hide show

app.py +600 -281

app.py CHANGED Viewed

@@ -1,14 +1,16 @@
 import gradio as gr
 from datasets import load_dataset
 import numpy as np
-import model2vec
 from reach import Reach
 from difflib import ndiff
 # Load the model at startup
-model = model2vec.StaticModel.from_pretrained("minishlab/M2V_base_output")
-# Default dataset parameters
 default_dataset1_name = "sst2"
 default_dataset1_split = "train"
 default_dataset2_name = "sst2"
@@ -27,37 +29,39 @@ def batch_iterable(iterable, batch_size):
 def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
     embeddings = []
-    total_batches = (len(texts) + batch_size - 1) // batch_size
-    for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
-        batch_embeddings = model.encode(batch_texts, show_progressbar=False)
         embeddings.append(batch_embeddings)
-        progress((i + 1) / total_batches, desc=desc)
     return np.concatenate(embeddings, axis=0)
-def deduplicate(
-    embedding_matrix: np.ndarray,
-    threshold: float,
-    batch_size: int = 1024,
-    progress=None
-) -> tuple[np.ndarray, dict[int, int]]:
     reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
     deduplicated_indices = set(range(len(embedding_matrix)))
     duplicate_to_original_mapping = {}
     results = reach.nearest_neighbor_threshold(
         embedding_matrix,
         threshold=threshold,
         batch_size=batch_size,
-        show_progressbar=False,
     )
     total_items = len(embedding_matrix)
     for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
         if i not in deduplicated_indices:
             continue
         similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
         for sim_idx in similar_indices:
             if sim_idx in deduplicated_indices:
                 deduplicated_indices.remove(sim_idx)
@@ -65,9 +69,40 @@ def deduplicate(
     return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
 def display_word_differences(x: str, y: str) -> str:
     diff = ndiff(x.split(), y.split())
-    return " ".join([word for word in diff if word.startswith(("+", "-"))])
 def perform_deduplication(
     deduplication_type,
@@ -78,18 +113,42 @@ def perform_deduplication(
     dataset2_split="",
     dataset2_text_column="",
     threshold=default_threshold,
-    progress=gr.Progress(track_tqdm=True),
 ):
     try:
         threshold = float(threshold)
         if deduplication_type == "Single dataset":
-            ds = ds_default1 if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split else load_dataset(dataset1_name, split=dataset1_split)
             texts = [example[dataset1_text_column] for example in ds]
-            embedding_matrix = compute_embeddings(texts, batch_size=64, progress=progress)
-            deduplicated_indices, duplicate_to_original_mapping = deduplicate(embedding_matrix, threshold, progress=progress)
             num_duplicates = len(duplicate_to_original_mapping)
             num_total = len(texts)
             num_deduplicated = len(deduplicated_indices)
@@ -98,6 +157,7 @@ def perform_deduplication(
             result_text += f"**Number of duplicates found:** {num_duplicates}\n"
             result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
             if num_duplicates > 0:
                 result_text += "**Examples of duplicates found:**\n\n"
                 num_examples = min(5, num_duplicates)
@@ -112,19 +172,93 @@ def perform_deduplication(
             else:
                 result_text += "No duplicates found."
-            yield result_text
     except Exception as e:
-        yield f"An error occurred: {e}"
-# Gradio interface setup
-with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
     gr.Markdown("# Semantic Deduplication")
     deduplication_type = gr.Radio(
         choices=["Single dataset", "Cross-dataset"],
         label="Deduplication Type",
-        value="Single dataset",
     )
     with gr.Row():
@@ -140,17 +274,29 @@ with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
             dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
             dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
-    threshold = gr.Slider(minimum=0.0, maximum=1.0, value=default_threshold, label="Similarity Threshold")
     compute_button = gr.Button("Compute")
     result_output = gr.Markdown()
     def update_visibility(deduplication_type_value):
-        return gr.update(visible=True) if deduplication_type_value == "Cross-dataset" else gr.update(visible=False)
     deduplication_type.change(
-        update_visibility, inputs=deduplication_type, outputs=dataset2_inputs
     )
     compute_button.click(
@@ -163,9 +309,9 @@ with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
             dataset2_name,
             dataset2_split,
             dataset2_text_column,
-            threshold,
         ],
-        outputs=[result_output],
     )
 demo.launch()
@@ -177,7 +323,6 @@ demo.launch()
 # import model2vec
 # from reach import Reach
 # from difflib import ndiff
-# import time
 # # Load the model at startup
 # model = model2vec.StaticModel.from_pretrained("minishlab/M2V_base_output")
@@ -199,19 +344,7 @@ demo.launch()
 #     for i in range(0, len(iterable), batch_size):
 #         yield iterable[i:i + batch_size]
-# def log_time(message, start_time=None, logs=None):
-#     """Helper function to log the start and end times."""
-#     current_time = time.time()
-#     if start_time is not None:
-#         elapsed = current_time - start_time
-#         log_message = f"{message} - Took {elapsed:.2f} seconds"
-#     else:
-#         log_message = f"{message} - Started"
-#     if logs is not None:
-#         logs.append(log_message)
-# def compute_embeddings(texts, batch_size, progress, logs, desc="Computing embeddings"):
 #     embeddings = []
 #     total_batches = (len(texts) + batch_size - 1) // batch_size
 #     for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
@@ -224,38 +357,26 @@ demo.launch()
 #     embedding_matrix: np.ndarray,
 #     threshold: float,
 #     batch_size: int = 1024,
-#     progress=None,
-#     logs=None
 # ) -> tuple[np.ndarray, dict[int, int]]:
-#     # Building the index
-#     log_time("Building search index", logs=logs)
-#     reach = Reach(
-#         vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))]
-#     )
 #     deduplicated_indices = set(range(len(embedding_matrix)))
 #     duplicate_to_original_mapping = {}
-#     # Finding nearest neighbors
-#     log_time("Finding nearest neighbors", logs=logs)
 #     results = reach.nearest_neighbor_threshold(
 #         embedding_matrix,
 #         threshold=threshold,
 #         batch_size=batch_size,
-#         show_progressbar=False,  # Disable internal progress bar
 #     )
-#     # Processing duplicates with a progress bar
 #     total_items = len(embedding_matrix)
-#     log_time("Processing duplicates", logs=logs)
-#     for i, similar_items in enumerate(
-#         progress.tqdm(results, desc="Processing duplicates", total=total_items)
-#     ):
 #         if i not in deduplicated_indices:
 #             continue
 #         similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
 #         for sim_idx in similar_indices:
 #             if sim_idx in deduplicated_indices:
 #                 deduplicated_indices.remove(sim_idx)
@@ -267,11 +388,6 @@ demo.launch()
 #     diff = ndiff(x.split(), y.split())
 #     return " ".join([word for word in diff if word.startswith(("+", "-"))])
-# def encode_texts(texts, progress=None, logs=None):
-#     embedding_matrix = model.encode(texts, show_progressbar=False)
-#     log_time("Encoding texts completed", logs=logs)
-#     return embedding_matrix
 # def perform_deduplication(
 #     deduplication_type,
 #     dataset1_name,
@@ -283,59 +399,24 @@ demo.launch()
 #     threshold=default_threshold,
 #     progress=gr.Progress(track_tqdm=True),
 # ):
-#     logs = []  # To store log messages
 #     try:
-#         # Convert threshold to float
 #         threshold = float(threshold)
-#         # Initialize status message
-#         log_time("Deduplication started", logs=logs)
 #         if deduplication_type == "Single dataset":
-#             # Load Dataset 1
-#             start_time = time.time()
-#             log_time("Loading Dataset 1", logs=logs)
-#             if (
-#                 dataset1_name == default_dataset1_name
-#                 and dataset1_split == default_dataset1_split
-#             ):
-#                 ds = ds_default1
-#             else:
-#                 ds = load_dataset(dataset1_name, split=dataset1_split)
-#             log_time("Loading Dataset 1 completed", start_time=start_time, logs=logs)
-#             # Extract texts
-#             start_time = time.time()
-#             log_time("Extracting texts from Dataset 1", logs=logs)
 #             texts = [example[dataset1_text_column] for example in ds]
-#             log_time("Extracting texts from Dataset 1 completed", start_time=start_time, logs=logs)
-#             # Compute embeddings
-#             start_time = time.time()
-#             log_time("Computing embeddings for Dataset 1", logs=logs)
-#             embedding_matrix = encode_texts(texts, progress=progress, logs=logs)
-#             log_time("Computing embeddings for Dataset 1 completed", start_time=start_time, logs=logs)
-#             # Deduplicate
-#             start_time = time.time()
-#             log_time("Deduplicating embeddings", logs=logs)
-#             deduplicated_indices, duplicate_to_original_mapping = deduplicate(
-#                 embedding_matrix, threshold, progress=progress, logs=logs
-#             )
-#             log_time("Deduplication completed", start_time=start_time, logs=logs)
-#             # Prepare the results
 #             num_duplicates = len(duplicate_to_original_mapping)
 #             num_total = len(texts)
 #             num_deduplicated = len(deduplicated_indices)
 #             result_text = f"**Total documents:** {num_total}\n"
 #             result_text += f"**Number of duplicates found:** {num_duplicates}\n"
-#             result_text += (
-#                 f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
-#             )
-#             # Show deduplicated examples
 #             if num_duplicates > 0:
 #                 result_text += "**Examples of duplicates found:**\n\n"
 #                 num_examples = min(5, num_duplicates)
@@ -350,16 +431,12 @@ demo.launch()
 #             else:
 #                 result_text += "No duplicates found."
-#             log_time("Deduplication process finished", logs=logs)
-#             full_log = "\n".join(logs)  # Combine all logs into one output
-#             yield full_log, result_text
 #     except Exception as e:
-#         full_log = "\n".join(logs)  # Combine all logs into one output in case of an error
-#         yield f"An error occurred: {e}", ""
-#         raise e
-# # Adjust the height of the status_output component using custom CSS
 # with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
 #     gr.Markdown("# Semantic Deduplication")
@@ -382,22 +459,14 @@ demo.launch()
 #             dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
 #             dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
-#     threshold = gr.Slider(
-#         minimum=0.0, maximum=1.0, value=default_threshold, label="Similarity Threshold"
-#     )
 #     compute_button = gr.Button("Compute")
-#     # Use 'gr.Markdown' with 'elem_id' and custom CSS to adjust height
-#     status_output = gr.Markdown(elem_id="status_output")
 #     result_output = gr.Markdown()
-#     # Function to update the visibility of dataset2_inputs
 #     def update_visibility(deduplication_type_value):
-#         if deduplication_type_value == "Cross-dataset":
-#             return gr.update(visible=True)
-#         else:
-#             return gr.update(visible=False)
 #     deduplication_type.change(
 #         update_visibility, inputs=deduplication_type, outputs=dataset2_inputs
@@ -415,21 +484,19 @@ demo.launch()
 #             dataset2_text_column,
 #             threshold,
 #         ],
-#         outputs=[status_output, result_output],
 #     )
 # demo.launch()
 # # import gradio as gr
 # # from datasets import load_dataset
 # # import numpy as np
-# # #from model2vec import StaticModel
 # # import model2vec
 # # from reach import Reach
 # # from difflib import ndiff
 # # # Load the model at startup
 # # model = model2vec.StaticModel.from_pretrained("minishlab/M2V_base_output")
@@ -446,13 +513,24 @@ demo.launch()
 # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
 # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
 # # def batch_iterable(iterable, batch_size):
 # #     """Helper function to create batches from an iterable."""
 # #     for i in range(0, len(iterable), batch_size):
 # #         yield iterable[i:i + batch_size]
-# # def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
 # #     embeddings = []
 # #     total_batches = (len(texts) + batch_size - 1) // batch_size
 # #     for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
@@ -465,10 +543,11 @@ demo.launch()
 # #     embedding_matrix: np.ndarray,
 # #     threshold: float,
 # #     batch_size: int = 1024,
-# #     progress=None
 # # ) -> tuple[np.ndarray, dict[int, int]]:
 # #     # Building the index
-# #     progress(0, desc="Building search index...")
 # #     reach = Reach(
 # #         vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))]
 # #     )
@@ -477,7 +556,7 @@ demo.launch()
 # #     duplicate_to_original_mapping = {}
 # #     # Finding nearest neighbors
-# #     progress(0, desc="Finding nearest neighbors...")
 # #     results = reach.nearest_neighbor_threshold(
 # #         embedding_matrix,
 # #         threshold=threshold,
@@ -487,6 +566,7 @@ demo.launch()
 # #     # Processing duplicates with a progress bar
 # #     total_items = len(embedding_matrix)
 # #     for i, similar_items in enumerate(
 # #         progress.tqdm(results, desc="Processing duplicates", total=total_items)
 # #     ):
@@ -506,9 +586,9 @@ demo.launch()
 # #     diff = ndiff(x.split(), y.split())
 # #     return " ".join([word for word in diff if word.startswith(("+", "-"))])
-# # def encode_texts(texts, progress=None):
 # #     embedding_matrix = model.encode(texts, show_progressbar=False)
 # #     return embedding_matrix
 # # def perform_deduplication(
@@ -522,17 +602,18 @@ demo.launch()
 # #     threshold=default_threshold,
 # #     progress=gr.Progress(track_tqdm=True),
 # # ):
 # #     try:
 # #         # Convert threshold to float
 # #         threshold = float(threshold)
 # #         # Initialize status message
-# #         status = ""
 # #         if deduplication_type == "Single dataset":
 # #             # Load Dataset 1
-# #             status = "Loading Dataset 1..."
-# #             yield status, ""
 # #             if (
 # #                 dataset1_name == default_dataset1_name
 # #                 and dataset1_split == default_dataset1_split
@@ -540,29 +621,27 @@ demo.launch()
 # #                 ds = ds_default1
 # #             else:
 # #                 ds = load_dataset(dataset1_name, split=dataset1_split)
 # #             # Extract texts
-# #             status = "Extracting texts from Dataset 1..."
-# #             yield status, ""
 # #             texts = [example[dataset1_text_column] for example in ds]
 # #             # Compute embeddings
-# #             status = "Computing embeddings for Dataset 1..."
-# #             yield status, ""
-# #             embedding_matrix = encode_texts(texts, progress=progress)
-# #             #embedding_matrix = model.encode(texts, show_progressbar=True)
-# #             # embedding_matrix = compute_embeddings(
-# #             #     texts,
-# #             #     batch_size=64,
-# #             #     progress=progress,
-# #             #     desc="Computing embeddings for Dataset 1",
-# #             # )
 # #             # Deduplicate
-# #             status = "Deduplicating embeddings..."
-# #             yield status, ""
 # #             deduplicated_indices, duplicate_to_original_mapping = deduplicate(
-# #                 embedding_matrix, threshold, progress=progress
 # #             )
 # #             # Prepare the results
 # #             num_duplicates = len(duplicate_to_original_mapping)
@@ -590,141 +669,15 @@ demo.launch()
 # #             else:
 # #                 result_text += "No duplicates found."
-# #             # Final status
-# #             status = "Deduplication completed."
-# #             yield status, result_text
-# #         elif deduplication_type == "Cross-dataset":
-# #             # Similar code for cross-dataset deduplication
-# #             # Load Dataset 1
-# #             status = "Loading Dataset 1..."
-# #             yield status, ""
-# #             if (
-# #                 dataset1_name == default_dataset1_name
-# #                 and dataset1_split == default_dataset1_split
-# #             ):
-# #                 ds1 = ds_default1
-# #             else:
-# #                 ds1 = load_dataset(dataset1_name, split=dataset1_split)
-# #             # Load Dataset 2
-# #             status = "Loading Dataset 2..."
-# #             yield status, ""
-# #             if (
-# #                 dataset2_name == default_dataset2_name
-# #                 and dataset2_split == default_dataset2_split
-# #             ):
-# #                 ds2 = ds_default2
-# #             else:
-# #                 ds2 = load_dataset(dataset2_name, split=dataset2_split)
-# #             # Extract texts from Dataset 1
-# #             status = "Extracting texts from Dataset 1..."
-# #             yield status, ""
-# #             texts1 = [example[dataset1_text_column] for example in ds1]
-# #             # Extract texts from Dataset 2
-# #             status = "Extracting texts from Dataset 2..."
-# #             yield status, ""
-# #             texts2 = [example[dataset2_text_column] for example in ds2]
-# #             # Compute embeddings for Dataset 1
-# #             status = "Computing embeddings for Dataset 1..."
-# #             yield status, ""
-# #             embedding_matrix1 = compute_embeddings(
-# #                 texts1,
-# #                 batch_size=64,
-# #                 progress=progress,
-# #                 desc="Computing embeddings for Dataset 1",
-# #             )
-# #             # Compute embeddings for Dataset 2
-# #             status = "Computing embeddings for Dataset 2..."
-# #             yield status, ""
-# #             embedding_matrix2 = compute_embeddings(
-# #                 texts2,
-# #                 batch_size=64,
-# #                 progress=progress,
-# #                 desc="Computing embeddings for Dataset 2",
-# #             )
-# #             # Deduplicate across datasets
-# #             status = "Deduplicating embeddings across datasets..."
-# #             yield status, ""
-# #             duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
-# #                 embedding_matrix1, embedding_matrix2, threshold, progress=progress
-# #             )
-# #             num_duplicates = len(duplicate_indices_in_ds2)
-# #             num_total_ds2 = len(texts2)
-# #             num_unique_ds2 = num_total_ds2 - num_duplicates
-# #             result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
-# #             result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
-# #             result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
-# #             # Show deduplicated examples
-# #             if num_duplicates > 0:
-# #                 result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
-# #                 num_examples = min(5, num_duplicates)
-# #                 for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
-# #                     original_idx = duplicate_to_original_mapping[duplicate_idx]
-# #                     original_text = texts1[original_idx]
-# #                     duplicate_text = texts2[duplicate_idx]
-# #                     differences = display_word_differences(original_text, duplicate_text)
-# #                     result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
-# #                     result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
-# #                     result_text += f"**Differences:**\n{differences}\n"
-# #                     result_text += "-" * 50 + "\n\n"
-# #             else:
-# #                 result_text += "No duplicates found."
-# #             # Final status
-# #             status = "Deduplication completed."
-# #             yield status, result_text
 # #     except Exception as e:
 # #         yield f"An error occurred: {e}", ""
 # #         raise e
-# # def deduplicate_across_datasets(
-# #     embedding_matrix_1: np.ndarray,
-# #     embedding_matrix_2: np.ndarray,
-# #     threshold: float,
-# #     batch_size: int = 1024,
-# #     progress=None
-# # ) -> tuple[list[int], dict[int, int]]:
-# #     # Building the index from Dataset 1
-# #     progress(0, desc="Building search index from Dataset 1...")
-# #     reach = Reach(
-# #         vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))]
-# #     )
-# #     duplicate_indices_in_test = []
-# #     duplicate_to_original_mapping = {}
-# #     # Finding nearest neighbors between datasets
-# #     progress(0, desc="Finding nearest neighbors between datasets...")
-# #     results = reach.nearest_neighbor_threshold(
-# #         embedding_matrix_2,
-# #         threshold=threshold,
-# #         batch_size=batch_size,
-# #         show_progressbar=False,  # Disable internal progress bar
-# #     )
-# #     total_items = len(embedding_matrix_2)
-# #     # Processing duplicates with a progress bar
-# #     for i, similar_items in enumerate(
-# #         progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)
-# #     ):
-# #         similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
-# #         if similar_indices:
-# #             duplicate_indices_in_test.append(i)
-# #             duplicate_to_original_mapping[i] = similar_indices[0]
-# #     return duplicate_indices_in_test, duplicate_to_original_mapping
 # # # Adjust the height of the status_output component using custom CSS
 # # with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
 # #     gr.Markdown("# Semantic Deduplication")
@@ -785,3 +738,369 @@ demo.launch()
 # #     )
 # # demo.launch()

 import gradio as gr
 from datasets import load_dataset
 import numpy as np
+from model2vec import StaticModel
 from reach import Reach
 from difflib import ndiff
+import tqdm
 # Load the model at startup
+model = StaticModel.from_pretrained("minishlab/M2V_base_output")
+# Update default dataset to 'sst2' and set default threshold to 0.9
 default_dataset1_name = "sst2"
 default_dataset1_split = "train"
 default_dataset2_name = "sst2"
 def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
     embeddings = []
+    for batch in progress.tqdm(batch_iterable(texts, batch_size), total=(len(texts) + batch_size - 1) // batch_size, desc=desc):  # total = number of batches, i.e. len(texts) / batch_size rounded up
+        batch_embeddings = model.encode(batch, show_progressbar=False)  # encoder's own progress bar is disabled; the progress.tqdm wrapper above tracks the batches instead
         embeddings.append(batch_embeddings)
     return np.concatenate(embeddings, axis=0)
+def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
+    """
+    Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
+    """
+    # Building the index
+    progress(0, desc="Building search index...")
     reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
     deduplicated_indices = set(range(len(embedding_matrix)))
     duplicate_to_original_mapping = {}
+    # Finding nearest neighbors
+    progress(0, desc="Finding nearest neighbors...")
     results = reach.nearest_neighbor_threshold(
         embedding_matrix,
         threshold=threshold,
         batch_size=batch_size,
+        show_progressbar=False  # Disable internal progress bar
     )
+    # Processing duplicates with a progress bar
     total_items = len(embedding_matrix)
     for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
         if i not in deduplicated_indices:
             continue
         similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
         for sim_idx in similar_indices:
             if sim_idx in deduplicated_indices:
                 deduplicated_indices.remove(sim_idx)
     return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
+def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
+    """
+    Deduplicate embeddings across two datasets and return the indices of duplicates between them.
+    """
+    # Build the search index over Dataset 1; each item is its row index stored as a string
+    progress(0, desc="Building search index from Dataset 1...")  # NOTE(review): progress defaults to None but is called unconditionally, so callers must pass one
+    reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
+    duplicate_indices_in_test = []
+    duplicate_to_original_mapping = {}
+    # Query every Dataset 2 embedding against the Dataset 1 index
+    progress(0, desc="Finding nearest neighbors between datasets...")
+    results = reach.nearest_neighbor_threshold(
+        embedding_matrix_2,
+        threshold=threshold,
+        batch_size=batch_size,
+        show_progressbar=False  # Disable internal progress bar
+    )
+    total_items = len(embedding_matrix_2)
+    # Walk the per-row results with a progress bar; i is the row index in Dataset 2
+    for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
+        similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]  # NOTE(review): presumably item is (index_str, similarity); this filter looks redundant with the thresholded query above - confirm against Reach
+        if similar_indices:
+            duplicate_indices_in_test.append(i)
+            duplicate_to_original_mapping[i] = similar_indices[0]  # only the first returned Dataset 1 match is recorded for this Dataset 2 row
+    return duplicate_indices_in_test, duplicate_to_original_mapping
 def display_word_differences(x: str, y: str) -> str:
     diff = ndiff(x.split(), y.split())
+    return " ".join([word for word in diff if word.startswith(('+', '-'))])  # keep only ndiff entries marked '-' (word only in x) or '+' (word only in y); unchanged words are dropped
 def perform_deduplication(
     deduplication_type,
     dataset2_split="",
     dataset2_text_column="",
     threshold=default_threshold,
+    progress=gr.Progress(track_tqdm=True)
 ):
     try:
+        # Convert threshold to float
         threshold = float(threshold)
+        # Initialize status message
+        status = ""
         if deduplication_type == "Single dataset":
+            # Load Dataset 1
+            status = "Loading Dataset 1..."
+            yield status, ""
+            if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
+                ds = ds_default1
+            else:
+                ds = load_dataset(dataset1_name, split=dataset1_split)
+            # Extract texts
+            status = "Extracting texts from Dataset 1..."
+            yield status, ""
             texts = [example[dataset1_text_column] for example in ds]
+            # Compute embeddings
+            status = "Computing embeddings for Dataset 1..."
+            yield status, ""
+            embedding_matrix = compute_embeddings(texts, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
+            # Deduplicate
+            status = "Deduplicating embeddings..."
+            yield status, ""
+            deduplicated_indices, duplicate_to_original_mapping = deduplicate(
+                embedding_matrix, threshold, progress=progress
+            )
+            # Prepare the results
             num_duplicates = len(duplicate_to_original_mapping)
             num_total = len(texts)
             num_deduplicated = len(deduplicated_indices)
             result_text += f"**Number of duplicates found:** {num_duplicates}\n"
             result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
+            # Show deduplicated examples
             if num_duplicates > 0:
                 result_text += "**Examples of duplicates found:**\n\n"
                 num_examples = min(5, num_duplicates)
             else:
                 result_text += "No duplicates found."
+            # Final status
+            status = "Deduplication completed."
+            yield status, result_text
+        elif deduplication_type == "Cross-dataset":
+            # Load Dataset 1
+            status = "Loading Dataset 1..."
+            yield status, ""
+            if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
+                ds1 = ds_default1
+            else:
+                ds1 = load_dataset(dataset1_name, split=dataset1_split)
+            # Load Dataset 2
+            status = "Loading Dataset 2..."
+            yield status, ""
+            if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
+                ds2 = ds_default2
+            else:
+                ds2 = load_dataset(dataset2_name, split=dataset2_split)
+            # Extract texts from Dataset 1
+            status = "Extracting texts from Dataset 1..."
+            yield status, ""
+            texts1 = [example[dataset1_text_column] for example in ds1]
+            # Extract texts from Dataset 2
+            status = "Extracting texts from Dataset 2..."
+            yield status, ""
+            texts2 = [example[dataset2_text_column] for example in ds2]
+            # Compute embeddings for Dataset 1
+            status = "Computing embeddings for Dataset 1..."
+            yield status, ""
+            embedding_matrix1 = compute_embeddings(texts1, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
+            # Compute embeddings for Dataset 2
+            status = "Computing embeddings for Dataset 2..."
+            yield status, ""
+            embedding_matrix2 = compute_embeddings(texts2, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 2")
+            # Deduplicate across datasets
+            status = "Deduplicating embeddings across datasets..."
+            yield status, ""
+            duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
+                embedding_matrix1, embedding_matrix2, threshold, progress=progress
+            )
+            num_duplicates = len(duplicate_indices_in_ds2)
+            num_total_ds2 = len(texts2)
+            num_unique_ds2 = num_total_ds2 - num_duplicates
+            result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
+            result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
+            result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
+            # Show deduplicated examples
+            if num_duplicates > 0:
+                result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
+                num_examples = min(5, num_duplicates)
+                for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
+                    original_idx = duplicate_to_original_mapping[duplicate_idx]
+                    original_text = texts1[original_idx]
+                    duplicate_text = texts2[duplicate_idx]
+                    differences = display_word_differences(original_text, duplicate_text)
+                    result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
+                    result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
+                    result_text += f"**Differences:**\n{differences}\n"
+                    result_text += "-" * 50 + "\n\n"
+            else:
+                result_text += "No duplicates found."
+            # Final status
+            status = "Deduplication completed."
+            yield status, result_text
     except Exception as e:
+        yield f"An error occurred: {e}", ""
+        raise e
+with gr.Blocks() as demo:
     # Build the Gradio UI for the deduplication demo; every component
     # created inside this block is attached to `demo`.
     # NOTE(review): this excerpt is a diff view with elided hunks, so some
     # components referenced below (e.g. dataset2_inputs, dataset2_name) are
     # defined on lines that are not visible here.
     gr.Markdown("# Semantic Deduplication")
     # Mode selector; starts on "Single dataset".
     deduplication_type = gr.Radio(
         choices=["Single dataset", "Cross-dataset"],
         label="Deduplication Type",
+        value="Single dataset"
     )
     with gr.Row():
             # NOTE(review): the lines between the row header and these two
             # textboxes are elided in this view — confirm the enclosing
             # layout against the full file.
             dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
             dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
     # Similarity threshold slider over [0.0, 1.0], initialised to
     # default_threshold.
+    threshold = gr.Slider(
+        minimum=0.0,
+        maximum=1.0,
+        value=default_threshold,
+        label="Similarity Threshold"
+    )
     compute_button = gr.Button("Compute")
     # Two Markdown outputs: one for the status message, one for the results.
+    status_output = gr.Markdown()
     result_output = gr.Markdown()
+    # Function to update the visibility of dataset2_inputs
     def update_visibility(deduplication_type_value):
         # Returns a gr.update that makes the target visible only when the
         # selected mode is "Cross-dataset"; any other value hides it.
+        if deduplication_type_value == "Cross-dataset":
+            return gr.update(visible=True)
+        else:
+            return gr.update(visible=False)
     # Re-evaluate visibility of dataset2_inputs whenever the mode changes.
     deduplication_type.change(
+        update_visibility,
+        inputs=deduplication_type,
+        outputs=dataset2_inputs
     )
     # Clicking "Compute" passes the listed input components to the handler
     # and writes its two outputs to status_output and result_output.
     # NOTE(review): the handler argument and the leading entries of the
     # inputs list are elided in this view — presumably
     # fn=perform_deduplication; confirm against the full file.
     compute_button.click(
             dataset2_name,
             dataset2_split,
             dataset2_text_column,
+            threshold
         ],
+        outputs=[status_output, result_output]
     )
 # Start the Gradio app.
 demo.launch()
 # import model2vec
 # from reach import Reach
 # from difflib import ndiff
 # # Load the model at startup
 # model = model2vec.StaticModel.from_pretrained("minishlab/M2V_base_output")
 #     for i in range(0, len(iterable), batch_size):
 #         yield iterable[i:i + batch_size]
+# def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
 #     embeddings = []
 #     total_batches = (len(texts) + batch_size - 1) // batch_size
 #     for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
 #     embedding_matrix: np.ndarray,
 #     threshold: float,
 #     batch_size: int = 1024,
+#     progress=None
 # ) -> tuple[np.ndarray, dict[int, int]]:
+#     reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
 #     deduplicated_indices = set(range(len(embedding_matrix)))
 #     duplicate_to_original_mapping = {}
 #     results = reach.nearest_neighbor_threshold(
 #         embedding_matrix,
 #         threshold=threshold,
 #         batch_size=batch_size,
+#         show_progressbar=False,
 #     )
 #     total_items = len(embedding_matrix)
+#     for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
 #         if i not in deduplicated_indices:
 #             continue
 #         similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
 #         for sim_idx in similar_indices:
 #             if sim_idx in deduplicated_indices:
 #                 deduplicated_indices.remove(sim_idx)
 #     diff = ndiff(x.split(), y.split())
 #     return " ".join([word for word in diff if word.startswith(("+", "-"))])
 # def perform_deduplication(
 #     deduplication_type,
 #     dataset1_name,
 #     threshold=default_threshold,
 #     progress=gr.Progress(track_tqdm=True),
 # ):
 #     try:
 #         threshold = float(threshold)
 #         if deduplication_type == "Single dataset":
+#             ds = ds_default1 if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split else load_dataset(dataset1_name, split=dataset1_split)
 #             texts = [example[dataset1_text_column] for example in ds]
+#             embedding_matrix = compute_embeddings(texts, batch_size=64, progress=progress)
+#             deduplicated_indices, duplicate_to_original_mapping = deduplicate(embedding_matrix, threshold, progress=progress)
 #             num_duplicates = len(duplicate_to_original_mapping)
 #             num_total = len(texts)
 #             num_deduplicated = len(deduplicated_indices)
 #             result_text = f"**Total documents:** {num_total}\n"
 #             result_text += f"**Number of duplicates found:** {num_duplicates}\n"
+#             result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
 #             if num_duplicates > 0:
 #                 result_text += "**Examples of duplicates found:**\n\n"
 #                 num_examples = min(5, num_duplicates)
 #             else:
 #                 result_text += "No duplicates found."
+#             yield result_text
 #     except Exception as e:
+#         yield f"An error occurred: {e}"
+# # Gradio interface setup
 # with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
 #     gr.Markdown("# Semantic Deduplication")
 #             dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
 #             dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
+#     threshold = gr.Slider(minimum=0.0, maximum=1.0, value=default_threshold, label="Similarity Threshold")
 #     compute_button = gr.Button("Compute")
 #     result_output = gr.Markdown()
 #     def update_visibility(deduplication_type_value):
+#         return gr.update(visible=True) if deduplication_type_value == "Cross-dataset" else gr.update(visible=False)
 #     deduplication_type.change(
 #         update_visibility, inputs=deduplication_type, outputs=dataset2_inputs
 #             dataset2_text_column,
 #             threshold,
 #         ],
+#         outputs=[result_output],
 #     )
 # demo.launch()
 # # import gradio as gr
 # # from datasets import load_dataset
 # # import numpy as np
 # # import model2vec
 # # from reach import Reach
 # # from difflib import ndiff
+# # import time
 # # # Load the model at startup
 # # model = model2vec.StaticModel.from_pretrained("minishlab/M2V_base_output")
 # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
 # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
 # # def batch_iterable(iterable, batch_size):
 # #     """Helper function to create batches from an iterable."""
 # #     for i in range(0, len(iterable), batch_size):
 # #         yield iterable[i:i + batch_size]
+# # def log_time(message, start_time=None, logs=None):
+# #     """Helper function to log the start and end times."""
+# #     current_time = time.time()
+# #     if start_time is not None:
+# #         elapsed = current_time - start_time
+# #         log_message = f"{message} - Took {elapsed:.2f} seconds"
+# #     else:
+# #         log_message = f"{message} - Started"
+# #     if logs is not None:
+# #         logs.append(log_message)
+# # def compute_embeddings(texts, batch_size, progress, logs, desc="Computing embeddings"):
 # #     embeddings = []
 # #     total_batches = (len(texts) + batch_size - 1) // batch_size
 # #     for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
 # #     embedding_matrix: np.ndarray,
 # #     threshold: float,
 # #     batch_size: int = 1024,
+# #     progress=None,
+# #     logs=None
 # # ) -> tuple[np.ndarray, dict[int, int]]:
 # #     # Building the index
+# #     log_time("Building search index", logs=logs)
 # #     reach = Reach(
 # #         vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))]
 # #     )
 # #     duplicate_to_original_mapping = {}
 # #     # Finding nearest neighbors
+# #     log_time("Finding nearest neighbors", logs=logs)
 # #     results = reach.nearest_neighbor_threshold(
 # #         embedding_matrix,
 # #         threshold=threshold,
 # #     # Processing duplicates with a progress bar
 # #     total_items = len(embedding_matrix)
+# #     log_time("Processing duplicates", logs=logs)
 # #     for i, similar_items in enumerate(
 # #         progress.tqdm(results, desc="Processing duplicates", total=total_items)
 # #     ):
 # #     diff = ndiff(x.split(), y.split())
 # #     return " ".join([word for word in diff if word.startswith(("+", "-"))])
+# # def encode_texts(texts, progress=None, logs=None):
 # #     embedding_matrix = model.encode(texts, show_progressbar=False)
+# #     log_time("Encoding texts completed", logs=logs)
 # #     return embedding_matrix
 # # def perform_deduplication(
 # #     threshold=default_threshold,
 # #     progress=gr.Progress(track_tqdm=True),
 # # ):
+# #     logs = []  # To store log messages
 # #     try:
 # #         # Convert threshold to float
 # #         threshold = float(threshold)
 # #         # Initialize status message
+# #         log_time("Deduplication started", logs=logs)
 # #         if deduplication_type == "Single dataset":
 # #             # Load Dataset 1
+# #             start_time = time.time()
+# #             log_time("Loading Dataset 1", logs=logs)
 # #             if (
 # #                 dataset1_name == default_dataset1_name
 # #                 and dataset1_split == default_dataset1_split
 # #                 ds = ds_default1
 # #             else:
 # #                 ds = load_dataset(dataset1_name, split=dataset1_split)
+# #             log_time("Loading Dataset 1 completed", start_time=start_time, logs=logs)
 # #             # Extract texts
+# #             start_time = time.time()
+# #             log_time("Extracting texts from Dataset 1", logs=logs)
 # #             texts = [example[dataset1_text_column] for example in ds]
+# #             log_time("Extracting texts from Dataset 1 completed", start_time=start_time, logs=logs)
 # #             # Compute embeddings
+# #             start_time = time.time()
+# #             log_time("Computing embeddings for Dataset 1", logs=logs)
+# #             embedding_matrix = encode_texts(texts, progress=progress, logs=logs)
+# #             log_time("Computing embeddings for Dataset 1 completed", start_time=start_time, logs=logs)
 # #             # Deduplicate
+# #             start_time = time.time()
+# #             log_time("Deduplicating embeddings", logs=logs)
 # #             deduplicated_indices, duplicate_to_original_mapping = deduplicate(
+# #                 embedding_matrix, threshold, progress=progress, logs=logs
 # #             )
+# #             log_time("Deduplication completed", start_time=start_time, logs=logs)
 # #             # Prepare the results
 # #             num_duplicates = len(duplicate_to_original_mapping)
 # #             else:
 # #                 result_text += "No duplicates found."
+# #             log_time("Deduplication process finished", logs=logs)
+# #             full_log = "\n".join(logs)  # Combine all logs into one output
+# #             yield full_log, result_text
 # #     except Exception as e:
+# #         full_log = "\n".join(logs)  # Combine all logs into one output in case of an error
 # #         yield f"An error occurred: {e}", ""
 # #         raise e
 # # # Adjust the height of the status_output component using custom CSS
 # # with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
 # #     gr.Markdown("# Semantic Deduplication")
 # #     )
 # # demo.launch()
+# # # import gradio as gr
+# # # from datasets import load_dataset
+# # # import numpy as np
+# # # #from model2vec import StaticModel
+# # # import model2vec
+# # # from reach import Reach
+# # # from difflib import ndiff
+# # # # Load the model at startup
+# # # model = model2vec.StaticModel.from_pretrained("minishlab/M2V_base_output")
+# # # # Default dataset parameters
+# # # default_dataset1_name = "sst2"
+# # # default_dataset1_split = "train"
+# # # default_dataset2_name = "sst2"
+# # # default_dataset2_split = "validation"
+# # # default_text_column = "sentence"
+# # # default_threshold = 0.9
+# # # # Load the default datasets at startup
+# # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
+# # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
+# # # def batch_iterable(iterable, batch_size):
+# # #     """Helper function to create batches from an iterable."""
+# # #     for i in range(0, len(iterable), batch_size):
+# # #         yield iterable[i:i + batch_size]
+# # # def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
+# # #     embeddings = []
+# # #     total_batches = (len(texts) + batch_size - 1) // batch_size
+# # #     for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
+# # #         batch_embeddings = model.encode(batch_texts, show_progressbar=False)
+# # #         embeddings.append(batch_embeddings)
+# # #         progress((i + 1) / total_batches, desc=desc)
+# # #     return np.concatenate(embeddings, axis=0)
+# # # def deduplicate(
+# # #     embedding_matrix: np.ndarray,
+# # #     threshold: float,
+# # #     batch_size: int = 1024,
+# # #     progress=None
+# # # ) -> tuple[np.ndarray, dict[int, int]]:
+# # #     # Building the index
+# # #     progress(0, desc="Building search index...")
+# # #     reach = Reach(
+# # #         vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))]
+# # #     )
+# # #     deduplicated_indices = set(range(len(embedding_matrix)))
+# # #     duplicate_to_original_mapping = {}
+# # #     # Finding nearest neighbors
+# # #     progress(0, desc="Finding nearest neighbors...")
+# # #     results = reach.nearest_neighbor_threshold(
+# # #         embedding_matrix,
+# # #         threshold=threshold,
+# # #         batch_size=batch_size,
+# # #         show_progressbar=False,  # Disable internal progress bar
+# # #     )
+# # #     # Processing duplicates with a progress bar
+# # #     total_items = len(embedding_matrix)
+# # #     for i, similar_items in enumerate(
+# # #         progress.tqdm(results, desc="Processing duplicates", total=total_items)
+# # #     ):
+# # #         if i not in deduplicated_indices:
+# # #             continue
+# # #         similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
+# # #         for sim_idx in similar_indices:
+# # #             if sim_idx in deduplicated_indices:
+# # #                 deduplicated_indices.remove(sim_idx)
+# # #                 duplicate_to_original_mapping[sim_idx] = i
+# # #     return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
+# # # def display_word_differences(x: str, y: str) -> str:
+# # #     diff = ndiff(x.split(), y.split())
+# # #     return " ".join([word for word in diff if word.startswith(("+", "-"))])
+# # # def encode_texts(texts, progress=None):
+# # #     embedding_matrix = model.encode(texts, show_progressbar=False)
+# # #     return embedding_matrix
+# # # def perform_deduplication(
+# # #     deduplication_type,
+# # #     dataset1_name,
+# # #     dataset1_split,
+# # #     dataset1_text_column,
+# # #     dataset2_name="",
+# # #     dataset2_split="",
+# # #     dataset2_text_column="",
+# # #     threshold=default_threshold,
+# # #     progress=gr.Progress(track_tqdm=True),
+# # # ):
+# # #     try:
+# # #         # Convert threshold to float
+# # #         threshold = float(threshold)
+# # #         # Initialize status message
+# # #         status = ""
+# # #         if deduplication_type == "Single dataset":
+# # #             # Load Dataset 1
+# # #             status = "Loading Dataset 1..."
+# # #             yield status, ""
+# # #             if (
+# # #                 dataset1_name == default_dataset1_name
+# # #                 and dataset1_split == default_dataset1_split
+# # #             ):
+# # #                 ds = ds_default1
+# # #             else:
+# # #                 ds = load_dataset(dataset1_name, split=dataset1_split)
+# # #             # Extract texts
+# # #             status = "Extracting texts from Dataset 1..."
+# # #             yield status, ""
+# # #             texts = [example[dataset1_text_column] for example in ds]
+# # #             # Compute embeddings
+# # #             status = "Computing embeddings for Dataset 1..."
+# # #             yield status, ""
+# # #             embedding_matrix = encode_texts(texts, progress=progress)
+# # #             #embedding_matrix = model.encode(texts, show_progressbar=True)
+# # #             # embedding_matrix = compute_embeddings(
+# # #             #     texts,
+# # #             #     batch_size=64,
+# # #             #     progress=progress,
+# # #             #     desc="Computing embeddings for Dataset 1",
+# # #             # )
+# # #             # Deduplicate
+# # #             status = "Deduplicating embeddings..."
+# # #             yield status, ""
+# # #             deduplicated_indices, duplicate_to_original_mapping = deduplicate(
+# # #                 embedding_matrix, threshold, progress=progress
+# # #             )
+# # #             # Prepare the results
+# # #             num_duplicates = len(duplicate_to_original_mapping)
+# # #             num_total = len(texts)
+# # #             num_deduplicated = len(deduplicated_indices)
+# # #             result_text = f"**Total documents:** {num_total}\n"
+# # #             result_text += f"**Number of duplicates found:** {num_duplicates}\n"
+# # #             result_text += (
+# # #                 f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
+# # #             )
+# # #             # Show deduplicated examples
+# # #             if num_duplicates > 0:
+# # #                 result_text += "**Examples of duplicates found:**\n\n"
+# # #                 num_examples = min(5, num_duplicates)
+# # #                 for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
+# # #                     original_text = texts[original_idx]
+# # #                     duplicate_text = texts[duplicate_idx]
+# # #                     differences = display_word_differences(original_text, duplicate_text)
+# # #                     result_text += f"**Original text:**\n{original_text}\n\n"
+# # #                     result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
+# # #                     result_text += f"**Differences:**\n{differences}\n"
+# # #                     result_text += "-" * 50 + "\n\n"
+# # #             else:
+# # #                 result_text += "No duplicates found."
+# # #             # Final status
+# # #             status = "Deduplication completed."
+# # #             yield status, result_text
+# # #         elif deduplication_type == "Cross-dataset":
+# # #             # Similar code for cross-dataset deduplication
+# # #             # Load Dataset 1
+# # #             status = "Loading Dataset 1..."
+# # #             yield status, ""
+# # #             if (
+# # #                 dataset1_name == default_dataset1_name
+# # #                 and dataset1_split == default_dataset1_split
+# # #             ):
+# # #                 ds1 = ds_default1
+# # #             else:
+# # #                 ds1 = load_dataset(dataset1_name, split=dataset1_split)
+# # #             # Load Dataset 2
+# # #             status = "Loading Dataset 2..."
+# # #             yield status, ""
+# # #             if (
+# # #                 dataset2_name == default_dataset2_name
+# # #                 and dataset2_split == default_dataset2_split
+# # #             ):
+# # #                 ds2 = ds_default2
+# # #             else:
+# # #                 ds2 = load_dataset(dataset2_name, split=dataset2_split)
+# # #             # Extract texts from Dataset 1
+# # #             status = "Extracting texts from Dataset 1..."
+# # #             yield status, ""
+# # #             texts1 = [example[dataset1_text_column] for example in ds1]
+# # #             # Extract texts from Dataset 2
+# # #             status = "Extracting texts from Dataset 2..."
+# # #             yield status, ""
+# # #             texts2 = [example[dataset2_text_column] for example in ds2]
+# # #             # Compute embeddings for Dataset 1
+# # #             status = "Computing embeddings for Dataset 1..."
+# # #             yield status, ""
+# # #             embedding_matrix1 = compute_embeddings(
+# # #                 texts1,
+# # #                 batch_size=64,
+# # #                 progress=progress,
+# # #                 desc="Computing embeddings for Dataset 1",
+# # #             )
+# # #             # Compute embeddings for Dataset 2
+# # #             status = "Computing embeddings for Dataset 2..."
+# # #             yield status, ""
+# # #             embedding_matrix2 = compute_embeddings(
+# # #                 texts2,
+# # #                 batch_size=64,
+# # #                 progress=progress,
+# # #                 desc="Computing embeddings for Dataset 2",
+# # #             )
+# # #             # Deduplicate across datasets
+# # #             status = "Deduplicating embeddings across datasets..."
+# # #             yield status, ""
+# # #             duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
+# # #                 embedding_matrix1, embedding_matrix2, threshold, progress=progress
+# # #             )
+# # #             num_duplicates = len(duplicate_indices_in_ds2)
+# # #             num_total_ds2 = len(texts2)
+# # #             num_unique_ds2 = num_total_ds2 - num_duplicates
+# # #             result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
+# # #             result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
+# # #             result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
+# # #             # Show deduplicated examples
+# # #             if num_duplicates > 0:
+# # #                 result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
+# # #                 num_examples = min(5, num_duplicates)
+# # #                 for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
+# # #                     original_idx = duplicate_to_original_mapping[duplicate_idx]
+# # #                     original_text = texts1[original_idx]
+# # #                     duplicate_text = texts2[duplicate_idx]
+# # #                     differences = display_word_differences(original_text, duplicate_text)
+# # #                     result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
+# # #                     result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
+# # #                     result_text += f"**Differences:**\n{differences}\n"
+# # #                     result_text += "-" * 50 + "\n\n"
+# # #             else:
+# # #                 result_text += "No duplicates found."
+# # #             # Final status
+# # #             status = "Deduplication completed."
+# # #             yield status, result_text
+# # #     except Exception as e:
+# # #         yield f"An error occurred: {e}", ""
+# # #         raise e
+# # # def deduplicate_across_datasets(
+# # #     embedding_matrix_1: np.ndarray,
+# # #     embedding_matrix_2: np.ndarray,
+# # #     threshold: float,
+# # #     batch_size: int = 1024,
+# # #     progress=None
+# # # ) -> tuple[list[int], dict[int, int]]:
+# # #     # Building the index from Dataset 1
+# # #     progress(0, desc="Building search index from Dataset 1...")
+# # #     reach = Reach(
+# # #         vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))]
+# # #     )
+# # #     duplicate_indices_in_test = []
+# # #     duplicate_to_original_mapping = {}
+# # #     # Finding nearest neighbors between datasets
+# # #     progress(0, desc="Finding nearest neighbors between datasets...")
+# # #     results = reach.nearest_neighbor_threshold(
+# # #         embedding_matrix_2,
+# # #         threshold=threshold,
+# # #         batch_size=batch_size,
+# # #         show_progressbar=False,  # Disable internal progress bar
+# # #     )
+# # #     total_items = len(embedding_matrix_2)
+# # #     # Processing duplicates with a progress bar
+# # #     for i, similar_items in enumerate(
+# # #         progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)
+# # #     ):
+# # #         similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
+# # #         if similar_indices:
+# # #             duplicate_indices_in_test.append(i)
+# # #             duplicate_to_original_mapping[i] = similar_indices[0]
+# # #     return duplicate_indices_in_test, duplicate_to_original_mapping
+# # # # Adjust the height of the status_output component using custom CSS
+# # # with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
+# # #     gr.Markdown("# Semantic Deduplication")
+# # #     deduplication_type = gr.Radio(
+# # #         choices=["Single dataset", "Cross-dataset"],
+# # #         label="Deduplication Type",
+# # #         value="Single dataset",
+# # #     )
+# # #     with gr.Row():
+# # #         dataset1_name = gr.Textbox(value=default_dataset1_name, label="Dataset 1 Name")
+# # #         dataset1_split = gr.Textbox(value=default_dataset1_split, label="Dataset 1 Split")
+# # #         dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
+# # #     dataset2_inputs = gr.Column(visible=False)
+# # #     with dataset2_inputs:
+# # #         gr.Markdown("### Dataset 2")
+# # #         with gr.Row():
+# # #             dataset2_name = gr.Textbox(value=default_dataset2_name, label="Dataset 2 Name")
+# # #             dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
+# # #             dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
+# # #     threshold = gr.Slider(
+# # #         minimum=0.0, maximum=1.0, value=default_threshold, label="Similarity Threshold"
+# # #     )
+# # #     compute_button = gr.Button("Compute")
+# # #     # Use 'gr.Markdown' with 'elem_id' and custom CSS to adjust height
+# # #     status_output = gr.Markdown(elem_id="status_output")
+# # #     result_output = gr.Markdown()
+# # #     # Function to update the visibility of dataset2_inputs
+# # #     def update_visibility(deduplication_type_value):
+# # #         if deduplication_type_value == "Cross-dataset":
+# # #             return gr.update(visible=True)
+# # #         else:
+# # #             return gr.update(visible=False)
+# # #     deduplication_type.change(
+# # #         update_visibility, inputs=deduplication_type, outputs=dataset2_inputs
+# # #     )
+# # #     compute_button.click(
+# # #         fn=perform_deduplication,
+# # #         inputs=[
+# # #             deduplication_type,
+# # #             dataset1_name,
+# # #             dataset1_split,
+# # #             dataset1_text_column,
+# # #             dataset2_name,
+# # #             dataset2_split,
+# # #             dataset2_text_column,
+# # #             threshold,
+# # #         ],
+# # #         outputs=[status_output, result_output],
+# # #     )
+# # # demo.launch()