Pringled committed
Commit 8f283dc · 1 Parent(s): 20f4a6e

Updated app with code for deduplication
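The core of the change is a small helper that replaces the inline embedding loops: texts are encoded in fixed-size batches and progress is reported through Gradio's progress.tqdm. Below is a minimal sketch of that pattern, assuming model is the model2vec StaticModel loaded at startup and progress is a gr.Progress instance (the explicit model argument is added here only to keep the sketch self-contained; the app itself uses a module-level model):

import numpy as np

def batch_iterable(iterable, batch_size):
    # Yield successive slices of `iterable` of length `batch_size`.
    for i in range(0, len(iterable), batch_size):
        yield iterable[i:i + batch_size]

def compute_embeddings_with_progress(texts, model, progress, batch_size=64, desc="Computing embeddings"):
    # Encode texts batch by batch so Gradio can show incremental progress,
    # then stack the per-batch arrays into one embedding matrix.
    embeddings = []
    total_batches = (len(texts) + batch_size - 1) // batch_size
    for batch in progress.tqdm(batch_iterable(texts, batch_size), desc=desc, total=total_batches):
        embeddings.append(model.encode(batch, show_progressbar=False))
    return np.concatenate(embeddings, axis=0)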

Files changed (1)
  1. app.py +991 -671
app.py CHANGED
@@ -26,6 +26,15 @@ def batch_iterable(iterable, batch_size):
26
  for i in range(0, len(iterable), batch_size):
27
  yield iterable[i:i + batch_size]
28
 
29
  def display_word_differences(x: str, y: str) -> str:
30
  diff = ndiff(x.split(), y.split())
31
  return " ".join([word for word in diff if word.startswith(('+', '-'))])
@@ -65,13 +74,7 @@ def perform_deduplication(
65
  # Compute embeddings
66
  status = "Computing embeddings for Dataset 1..."
67
  yield status, ""
68
- embeddings = []
69
- batch_size = 64
70
- total_batches = (len(texts) + batch_size - 1) // batch_size
71
- for batch_texts in progress.tqdm(batch_iterable(texts, batch_size), desc="Computing embeddings", total=total_batches):
72
- batch_embeddings = model.encode(batch_texts, show_progressbar=False)
73
- embeddings.append(batch_embeddings)
74
- embedding_matrix = np.concatenate(embeddings, axis=0)
75
 
76
  # Deduplicate
77
  status = "Deduplicating embeddings..."
@@ -138,23 +141,12 @@ def perform_deduplication(
138
  # Compute embeddings for Dataset 1
139
  status = "Computing embeddings for Dataset 1..."
140
  yield status, ""
141
- embeddings1 = []
142
- batch_size = 64
143
- total_batches1 = (len(texts1) + batch_size - 1) // batch_size
144
- for batch_texts in progress.tqdm(batch_iterable(texts1, batch_size), desc="Computing embeddings for Dataset 1", total=total_batches1):
145
- batch_embeddings = model.encode(batch_texts, show_progressbar=False)
146
- embeddings1.append(batch_embeddings)
147
- embedding_matrix1 = np.concatenate(embeddings1, axis=0)
148
 
149
  # Compute embeddings for Dataset 2
150
  status = "Computing embeddings for Dataset 2..."
151
  yield status, ""
152
- embeddings2 = []
153
- total_batches2 = (len(texts2) + batch_size - 1) // batch_size
154
- for batch_texts in progress.tqdm(batch_iterable(texts2, batch_size), desc="Computing embeddings for Dataset 2", total=total_batches2):
155
- batch_embeddings = model.encode(batch_texts, show_progressbar=False)
156
- embeddings2.append(batch_embeddings)
157
- embedding_matrix2 = np.concatenate(embeddings2, axis=0)
158
 
159
  # Deduplicate across datasets
160
  status = "Deduplicating embeddings across datasets..."
@@ -326,7 +318,6 @@ with gr.Blocks() as demo:
326
  demo.launch()
327
 
328
 
329
-
330
  # import gradio as gr
331
  # from datasets import load_dataset
332
  # import numpy as np
@@ -355,79 +346,6 @@ demo.launch()
355
  # for i in range(0, len(iterable), batch_size):
356
  # yield iterable[i:i + batch_size]
357
 
358
- # def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
359
- # embeddings = []
360
- # for batch in progress.tqdm(batch_iterable(texts, batch_size), total=(len(texts) + batch_size - 1) // batch_size, desc=desc):
361
- # batch_embeddings = model.encode(batch, show_progressbar=False)
362
- # embeddings.append(batch_embeddings)
363
- # return np.concatenate(embeddings, axis=0)
364
-
365
- # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
366
- # """
367
- # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
368
- # """
369
- # # Building the index
370
- # progress(0, desc="Building search index...")
371
- # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
372
-
373
- # deduplicated_indices = set(range(len(embedding_matrix)))
374
- # duplicate_to_original_mapping = {}
375
-
376
- # # Finding nearest neighbors
377
- # progress(0, desc="Finding nearest neighbors...")
378
- # results = reach.nearest_neighbor_threshold(
379
- # embedding_matrix,
380
- # threshold=threshold,
381
- # batch_size=batch_size,
382
- # show_progressbar=False # Disable internal progress bar
383
- # )
384
-
385
- # # Processing duplicates with a progress bar
386
- # total_items = len(embedding_matrix)
387
- # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
388
- # if i not in deduplicated_indices:
389
- # continue
390
-
391
- # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
392
-
393
- # for sim_idx in similar_indices:
394
- # if sim_idx in deduplicated_indices:
395
- # deduplicated_indices.remove(sim_idx)
396
- # duplicate_to_original_mapping[sim_idx] = i
397
-
398
- # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
399
-
400
- # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
401
- # """
402
- # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
403
- # """
404
- # # Building the index from Dataset 1
405
- # progress(0, desc="Building search index from Dataset 1...")
406
- # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
407
-
408
- # duplicate_indices_in_test = []
409
- # duplicate_to_original_mapping = {}
410
-
411
- # # Finding nearest neighbors between datasets
412
- # progress(0, desc="Finding nearest neighbors between datasets...")
413
- # results = reach.nearest_neighbor_threshold(
414
- # embedding_matrix_2,
415
- # threshold=threshold,
416
- # batch_size=batch_size,
417
- # show_progressbar=False # Disable internal progress bar
418
- # )
419
-
420
- # total_items = len(embedding_matrix_2)
421
- # # Processing duplicates with a progress bar
422
- # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
423
- # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
424
-
425
- # if similar_indices:
426
- # duplicate_indices_in_test.append(i)
427
- # duplicate_to_original_mapping[i] = similar_indices[0]
428
-
429
- # return duplicate_indices_in_test, duplicate_to_original_mapping
430
-
431
  # def display_word_differences(x: str, y: str) -> str:
432
  # diff = ndiff(x.split(), y.split())
433
  # return " ".join([word for word in diff if word.startswith(('+', '-'))])
@@ -467,7 +385,13 @@ demo.launch()
467
  # # Compute embeddings
468
  # status = "Computing embeddings for Dataset 1..."
469
  # yield status, ""
470
- # embedding_matrix = compute_embeddings(texts, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
471
 
472
  # # Deduplicate
473
  # status = "Deduplicating embeddings..."
@@ -534,12 +458,23 @@ demo.launch()
534
  # # Compute embeddings for Dataset 1
535
  # status = "Computing embeddings for Dataset 1..."
536
  # yield status, ""
537
- # embedding_matrix1 = compute_embeddings(texts1, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
538
 
539
  # # Compute embeddings for Dataset 2
540
  # status = "Computing embeddings for Dataset 2..."
541
  # yield status, ""
542
- # embedding_matrix2 = compute_embeddings(texts2, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 2")
543
 
544
  # # Deduplicate across datasets
545
  # status = "Deduplicating embeddings across datasets..."
@@ -552,8 +487,8 @@ demo.launch()
552
  # num_total_ds2 = len(texts2)
553
  # num_unique_ds2 = num_total_ds2 - num_duplicates
554
 
555
- # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n\n"
556
- # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n\n"
557
  # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
558
 
559
  # # Show deduplicated examples
@@ -580,6 +515,72 @@ demo.launch()
580
  # yield f"An error occurred: {e}", ""
581
  # raise e
582
 
583
  # with gr.Blocks() as demo:
584
  # gr.Markdown("# Semantic Deduplication")
585
 
@@ -644,25 +645,6 @@ demo.launch()
644
 
645
  # demo.launch()
646
 
647
-
648
-
649
-
650
-
651
-
652
-
653
-
654
-
655
-
656
-
657
-
658
-
659
-
660
-
661
-
662
-
663
-
664
-
665
-
666
 
667
 
668
  # # import gradio as gr
@@ -671,7 +653,6 @@ demo.launch()
671
  # # from model2vec import StaticModel
672
  # # from reach import Reach
673
  # # from difflib import ndiff
674
- # # import sys
675
  # # import tqdm
676
 
677
  # # # Load the model at startup
@@ -689,26 +670,41 @@ demo.launch()
689
  # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
690
  # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
691
 
692
- # # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024) -> tuple[np.ndarray, dict[int, int]]:
693
  # # """
694
  # # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
695
  # # """
696
  # # # Building the index
 
697
  # # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
698
 
699
  # # deduplicated_indices = set(range(len(embedding_matrix)))
700
  # # duplicate_to_original_mapping = {}
701
 
702
  # # # Finding nearest neighbors
 
703
  # # results = reach.nearest_neighbor_threshold(
704
  # # embedding_matrix,
705
  # # threshold=threshold,
706
  # # batch_size=batch_size,
707
- # # show_progressbar=True # Allow internal progress bar
708
  # # )
709
 
710
- # # # Processing duplicates
711
- # # for i, similar_items in enumerate(results):
 
712
  # # if i not in deduplicated_indices:
713
  # # continue
714
 
@@ -721,26 +717,29 @@ demo.launch()
721
 
722
  # # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
723
 
724
- # # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024) -> tuple[list[int], dict[int, int]]:
725
  # # """
726
  # # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
727
  # # """
728
  # # # Building the index from Dataset 1
 
729
  # # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
730
 
731
  # # duplicate_indices_in_test = []
732
  # # duplicate_to_original_mapping = {}
733
 
734
  # # # Finding nearest neighbors between datasets
 
735
  # # results = reach.nearest_neighbor_threshold(
736
  # # embedding_matrix_2,
737
  # # threshold=threshold,
738
  # # batch_size=batch_size,
739
- # # show_progressbar=True # Allow internal progress bar
740
  # # )
741
 
742
- # # # Processing duplicates
743
- # # for i, similar_items in enumerate(results):
 
744
  # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
745
 
746
  # # if similar_indices:
@@ -764,17 +763,10 @@ demo.launch()
764
  # # threshold=default_threshold,
765
  # # progress=gr.Progress(track_tqdm=True)
766
  # # ):
767
- # # # Deep Monkey-Patching of tqdm
768
- # # original_tqdm = tqdm.tqdm
769
- # # tqdm.tqdm = progress.tqdm
770
- # # for mod_name in list(sys.modules.keys()):
771
- # # if 'tqdm' in mod_name:
772
- # # sys.modules[mod_name].tqdm = progress.tqdm
773
-
774
  # # try:
775
  # # # Convert threshold to float
776
  # # threshold = float(threshold)
777
-
778
  # # # Initialize status message
779
  # # status = ""
780
 
@@ -786,33 +778,33 @@ demo.launch()
786
  # # ds = ds_default1
787
  # # else:
788
  # # ds = load_dataset(dataset1_name, split=dataset1_split)
789
-
790
  # # # Extract texts
791
  # # status = "Extracting texts from Dataset 1..."
792
  # # yield status, ""
793
  # # texts = [example[dataset1_text_column] for example in ds]
794
-
795
  # # # Compute embeddings
796
  # # status = "Computing embeddings for Dataset 1..."
797
  # # yield status, ""
798
- # # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
799
-
800
  # # # Deduplicate
801
  # # status = "Deduplicating embeddings..."
802
  # # yield status, ""
803
  # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
804
- # # embedding_matrix, threshold
805
  # # )
806
-
807
  # # # Prepare the results
808
  # # num_duplicates = len(duplicate_to_original_mapping)
809
  # # num_total = len(texts)
810
  # # num_deduplicated = len(deduplicated_indices)
811
-
812
  # # result_text = f"**Total documents:** {num_total}\n"
813
  # # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
814
  # # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
815
-
816
  # # # Show deduplicated examples
817
  # # if num_duplicates > 0:
818
  # # result_text += "**Examples of duplicates found:**\n\n"
@@ -827,11 +819,11 @@ demo.launch()
827
  # # result_text += "-" * 50 + "\n\n"
828
  # # else:
829
  # # result_text += "No duplicates found."
830
-
831
  # # # Final status
832
  # # status = "Deduplication completed."
833
  # # yield status, result_text
834
-
835
  # # elif deduplication_type == "Cross-dataset":
836
  # # # Load Dataset 1
837
  # # status = "Loading Dataset 1..."
@@ -840,7 +832,7 @@ demo.launch()
840
  # # ds1 = ds_default1
841
  # # else:
842
  # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
843
-
844
  # # # Load Dataset 2
845
  # # status = "Loading Dataset 2..."
846
  # # yield status, ""
@@ -848,42 +840,42 @@ demo.launch()
848
  # # ds2 = ds_default2
849
  # # else:
850
  # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
851
-
852
  # # # Extract texts from Dataset 1
853
  # # status = "Extracting texts from Dataset 1..."
854
  # # yield status, ""
855
  # # texts1 = [example[dataset1_text_column] for example in ds1]
856
-
857
  # # # Extract texts from Dataset 2
858
  # # status = "Extracting texts from Dataset 2..."
859
  # # yield status, ""
860
  # # texts2 = [example[dataset2_text_column] for example in ds2]
861
-
862
  # # # Compute embeddings for Dataset 1
863
  # # status = "Computing embeddings for Dataset 1..."
864
  # # yield status, ""
865
- # # embedding_matrix1 = model.encode(texts1, show_progressbar=True)
866
-
867
  # # # Compute embeddings for Dataset 2
868
  # # status = "Computing embeddings for Dataset 2..."
869
  # # yield status, ""
870
- # # embedding_matrix2 = model.encode(texts2, show_progressbar=True)
871
-
872
  # # # Deduplicate across datasets
873
  # # status = "Deduplicating embeddings across datasets..."
874
  # # yield status, ""
875
  # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
876
- # # embedding_matrix1, embedding_matrix2, threshold
877
  # # )
878
-
879
  # # num_duplicates = len(duplicate_indices_in_ds2)
880
  # # num_total_ds2 = len(texts2)
881
  # # num_unique_ds2 = num_total_ds2 - num_duplicates
882
-
883
- # # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
884
- # # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
885
  # # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
886
-
887
  # # # Show deduplicated examples
888
  # # if num_duplicates > 0:
889
  # # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
@@ -899,17 +891,14 @@ demo.launch()
899
  # # result_text += "-" * 50 + "\n\n"
900
  # # else:
901
  # # result_text += "No duplicates found."
902
-
903
  # # # Final status
904
  # # status = "Deduplication completed."
905
  # # yield status, result_text
906
 
907
- # # finally:
908
- # # # Restore original tqdm
909
- # # tqdm.tqdm = original_tqdm
910
- # # for mod_name in list(sys.modules.keys()):
911
- # # if 'tqdm' in mod_name:
912
- # # sys.modules[mod_name].tqdm = original_tqdm
913
 
914
  # # with gr.Blocks() as demo:
915
  # # gr.Markdown("# Semantic Deduplication")
@@ -961,605 +950,670 @@ demo.launch()
961
  # # compute_button.click(
962
  # # fn=perform_deduplication,
963
  # # inputs=[
964
- # # deduplication_type,
965
- # # dataset1_name,
966
- # # dataset1_split,
967
  # # dataset1_text_column,
968
- # # dataset2_name,
969
- # # dataset2_split,
970
  # # dataset2_text_column,
971
  # # threshold
972
  # # ],
973
  # # outputs=[status_output, result_output]
974
  # # )
975
-
976
  # # demo.launch()
977
 
978
 
979
- # # import gradio as gr
980
- # # from datasets import load_dataset
981
- # # import numpy as np
982
- # # from model2vec import StaticModel
983
- # # from reach import Reach
984
- # # from difflib import ndiff
985
- # # import sys
986
- # # import tqdm
987
 
988
- # # # Load the model at startup
989
- # # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
990
 
991
- # # # Update default dataset to 'sst2' and set default threshold to 0.9
992
- # # default_dataset1_name = "sst2"
993
- # # default_dataset1_split = "train"
994
- # # default_dataset2_name = "sst2"
995
- # # default_dataset2_split = "validation"
996
- # # default_text_column = "sentence"
997
- # # default_threshold = 0.9
998
 
999
- # # # Load the default datasets at startup
1000
- # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
1001
- # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
1002
 
1003
- # # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
1004
- # # """
1005
- # # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
1006
- # # """
1007
- # # # Update progress to indicate building the index
1008
- # # progress(0, desc="Building search index...")
1009
- # # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
1010
 
1011
- # # deduplicated_indices = set(range(len(embedding_matrix)))
1012
- # # duplicate_to_original_mapping = {}
1013
 
1014
- # # # Finding nearest neighbors
1015
- # # progress(0, desc="Finding nearest neighbors...")
1016
- # # results = reach.nearest_neighbor_threshold(
1017
- # # embedding_matrix,
1018
- # # threshold=threshold,
1019
- # # batch_size=batch_size,
1020
- # # show_progressbar=True # Allow internal progress bar
1021
- # # )
1022
 
1023
- # # # Processing duplicates with a progress bar
1024
- # # total_items = len(embedding_matrix)
1025
- # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
1026
- # # if i not in deduplicated_indices:
1027
- # # continue
1028
 
1029
- # # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
1030
 
1031
- # # for sim_idx in similar_indices:
1032
- # # if sim_idx in deduplicated_indices:
1033
- # # deduplicated_indices.remove(sim_idx)
1034
- # # duplicate_to_original_mapping[sim_idx] = i
1035
 
1036
- # # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
1037
 
1038
- # # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
1039
- # # """
1040
- # # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
1041
- # # """
1042
- # # # Update progress to indicate building the index
1043
- # # progress(0, desc="Building search index from Dataset 1...")
1044
- # # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
1045
 
1046
- # # duplicate_indices_in_test = []
1047
- # # duplicate_to_original_mapping = {}
1048
 
1049
- # # # Finding nearest neighbors between datasets
1050
- # # progress(0, desc="Finding nearest neighbors between datasets...")
1051
- # # results = reach.nearest_neighbor_threshold(
1052
- # # embedding_matrix_2,
1053
- # # threshold=threshold,
1054
- # # batch_size=batch_size,
1055
- # # show_progressbar=True # Allow internal progress bar
1056
- # # )
1057
 
1058
- # # total_items = len(embedding_matrix_2)
1059
- # # # Processing duplicates with a progress bar
1060
- # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
1061
- # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
1062
 
1063
- # # if similar_indices:
1064
- # # duplicate_indices_in_test.append(i)
1065
- # # duplicate_to_original_mapping[i] = similar_indices[0]
1066
 
1067
- # # return duplicate_indices_in_test, duplicate_to_original_mapping
1068
 
1069
- # # def display_word_differences(x: str, y: str) -> str:
1070
- # # diff = ndiff(x.split(), y.split())
1071
- # # return " ".join([word for word in diff if word.startswith(('+', '-'))])
1072
 
1073
- # # def perform_deduplication(
1074
- # # deduplication_type,
1075
- # # dataset1_name,
1076
- # # dataset1_split,
1077
- # # dataset1_text_column,
1078
- # # dataset2_name="",
1079
- # # dataset2_split="",
1080
- # # dataset2_text_column="",
1081
- # # threshold=default_threshold,
1082
- # # progress=gr.Progress(track_tqdm=True)
1083
- # # ):
1084
- # # # Monkey-patch tqdm
1085
- # # original_tqdm = tqdm.tqdm
1086
- # # original_reach_tqdm = Reach.__dict__['tqdm'] if 'tqdm' in Reach.__dict__ else None
1087
- # # tqdm.tqdm = progress.tqdm
1088
- # # sys.modules['tqdm'].tqdm = progress.tqdm
1089
- # # sys.modules['tqdm.auto'].tqdm = progress.tqdm
1090
- # # Reach.tqdm = progress.tqdm # Monkey-patch reach's tqdm
1091
 
1092
- # # try:
1093
- # # # Convert threshold to float
1094
- # # threshold = float(threshold)
1095
 
1096
- # # if deduplication_type == "Single dataset":
1097
- # # # Load Dataset 1
1098
- # # progress(0, desc="Loading Dataset 1...")
1099
- # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1100
- # # ds = ds_default1
1101
- # # else:
1102
- # # ds = load_dataset(dataset1_name, split=dataset1_split)
1103
 
1104
- # # # Extract texts
1105
- # # progress(0, desc="Extracting texts from Dataset 1...")
1106
- # # texts = [example[dataset1_text_column] for example in ds]
 
1107
 
1108
- # # # Compute embeddings
1109
- # # progress(0, desc="Computing embeddings for Dataset 1...")
1110
- # # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
 
1111
 
1112
- # # # Deduplicate
1113
- # # result_text = deduplicate_and_prepare_results_single(
1114
- # # embedding_matrix, texts, threshold, progress
1115
- # # )
1116
 
1117
- # # return result_text
1118
 
1119
- # # elif deduplication_type == "Cross-dataset":
1120
- # # # Load Dataset 1
1121
- # # progress(0, desc="Loading Dataset 1...")
1122
- # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1123
- # # ds1 = ds_default1
1124
- # # else:
1125
- # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
 
1126
 
1127
- # # # Load Dataset 2
1128
- # # progress(0, desc="Loading Dataset 2...")
1129
- # # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
1130
- # # ds2 = ds_default2
1131
- # # else:
1132
- # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
 
1133
 
1134
- # # # Extract texts from Dataset 1
1135
- # # progress(0, desc="Extracting texts from Dataset 1...")
1136
- # # texts1 = [example[dataset1_text_column] for example in ds1]
 
1137
 
1138
- # # # Extract texts from Dataset 2
1139
- # # progress(0, desc="Extracting texts from Dataset 2...")
1140
- # # texts2 = [example[dataset2_text_column] for example in ds2]
 
1141
 
1142
- # # # Compute embeddings for Dataset 1
1143
- # # progress(0, desc="Computing embeddings for Dataset 1...")
1144
- # # embedding_matrix1 = model.encode(texts1, show_progressbar=True)
 
1145
 
1146
- # # # Compute embeddings for Dataset 2
1147
- # # progress(0, desc="Computing embeddings for Dataset 2...")
1148
- # # embedding_matrix2 = model.encode(texts2, show_progressbar=True)
 
1149
 
1150
- # # # Deduplicate across datasets
1151
- # # result_text = deduplicate_and_prepare_results_cross(
1152
- # # embedding_matrix1, embedding_matrix2, texts1, texts2, threshold, progress, dataset2_name, dataset2_split
1153
- # # )
 
 
1154
 
1155
- # # return result_text
1156
-
1157
- # # finally:
1158
- # # # Restore original tqdm
1159
- # # tqdm.tqdm = original_tqdm
1160
- # # sys.modules['tqdm'].tqdm = original_tqdm
1161
- # # sys.modules['tqdm.auto'].tqdm = original_tqdm
1162
-
1163
- # # # Restore reach's original tqdm
1164
- # # if original_reach_tqdm is not None:
1165
- # # Reach.tqdm = original_reach_tqdm
1166
- # # else:
1167
- # # del Reach.tqdm # If it wasn't originally in Reach's __dict__
1168
-
1169
- # # def deduplicate_and_prepare_results_single(embedding_matrix, texts, threshold, progress):
1170
- # # # Deduplicate
1171
- # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
1172
- # # embedding_matrix, threshold, progress=progress
1173
- # # )
1174
-
1175
- # # # Prepare the results
1176
- # # num_duplicates = len(duplicate_to_original_mapping)
1177
- # # num_total = len(texts)
1178
- # # num_deduplicated = len(deduplicated_indices)
1179
-
1180
- # # result_text = f"**Total documents:** {num_total}\n"
1181
- # # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
1182
- # # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
1183
-
1184
- # # # Show deduplicated examples
1185
- # # if num_duplicates > 0:
1186
- # # result_text += "**Examples of duplicates found:**\n\n"
1187
- # # num_examples = min(5, num_duplicates)
1188
- # # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
1189
- # # original_text = texts[original_idx]
1190
- # # duplicate_text = texts[duplicate_idx]
1191
- # # differences = display_word_differences(original_text, duplicate_text)
1192
- # # result_text += f"**Original text:**\n{original_text}\n\n"
1193
- # # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
1194
- # # result_text += f"**Differences:**\n{differences}\n"
1195
- # # result_text += "-" * 50 + "\n\n"
1196
- # # else:
1197
- # # result_text += "No duplicates found."
1198
-
1199
- # # return result_text
1200
-
1201
- # # def deduplicate_and_prepare_results_cross(embedding_matrix1, embedding_matrix2, texts1, texts2, threshold, progress, dataset2_name, dataset2_split):
1202
- # # # Deduplicate across datasets
1203
- # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
1204
- # # embedding_matrix1, embedding_matrix2, threshold, progress=progress
1205
- # # )
1206
-
1207
- # # num_duplicates = len(duplicate_indices_in_ds2)
1208
- # # num_total_ds2 = len(texts2)
1209
- # # num_unique_ds2 = num_total_ds2 - num_duplicates
1210
-
1211
- # # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
1212
- # # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
1213
- # # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
1214
-
1215
- # # # Show deduplicated examples
1216
- # # if num_duplicates > 0:
1217
- # # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
1218
- # # num_examples = min(5, num_duplicates)
1219
- # # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
1220
- # # original_idx = duplicate_to_original_mapping[duplicate_idx]
1221
- # # original_text = texts1[original_idx]
1222
- # # duplicate_text = texts2[duplicate_idx]
1223
- # # differences = display_word_differences(original_text, duplicate_text)
1224
- # # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
1225
- # # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
1226
- # # result_text += f"**Differences:**\n{differences}\n"
1227
- # # result_text += "-" * 50 + "\n\n"
1228
- # # else:
1229
- # # result_text += "No duplicates found."
1230
-
1231
- # # return result_text
1232
-
1233
- # # with gr.Blocks() as demo:
1234
- # # gr.Markdown("# Semantic Deduplication")
1235
 
1236
- # # deduplication_type = gr.Radio(
1237
- # # choices=["Single dataset", "Cross-dataset"],
1238
- # # label="Deduplication Type",
1239
- # # value="Single dataset"
1240
- # # )
 
1241
 
1242
- # # with gr.Row():
1243
- # # dataset1_name = gr.Textbox(value=default_dataset1_name, label="Dataset 1 Name")
1244
- # # dataset1_split = gr.Textbox(value=default_dataset1_split, label="Dataset 1 Split")
1245
- # # dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
1246
 
1247
- # # dataset2_inputs = gr.Column(visible=False)
1248
- # # with dataset2_inputs:
1249
- # # gr.Markdown("### Dataset 2")
1250
- # # with gr.Row():
1251
- # # dataset2_name = gr.Textbox(value=default_dataset2_name, label="Dataset 2 Name")
1252
- # # dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
1253
- # # dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
1254
 
1255
- # # threshold = gr.Slider(
1256
- # # minimum=0.0,
1257
- # # maximum=1.0,
1258
- # # value=default_threshold,
1259
- # # label="Similarity Threshold"
1260
- # # )
1261
 
1262
- # # compute_button = gr.Button("Compute")
1263
 
1264
- # # output = gr.Markdown()
1265
 
1266
- # # # Function to update the visibility of dataset2_inputs
1267
- # # def update_visibility(deduplication_type_value):
1268
- # # if deduplication_type_value == "Cross-dataset":
1269
- # # return gr.update(visible=True)
1270
- # # else:
1271
- # # return gr.update(visible=False)
1272
 
1273
- # # deduplication_type.change(
1274
- # # update_visibility,
1275
- # # inputs=deduplication_type,
1276
- # # outputs=dataset2_inputs
1277
- # # )
1278
 
1279
- # # compute_button.click(
1280
- # # fn=perform_deduplication,
1281
- # # inputs=[
1282
- # # deduplication_type,
1283
- # # dataset1_name,
1284
- # # dataset1_split,
1285
- # # dataset1_text_column,
1286
- # # dataset2_name,
1287
- # # dataset2_split,
1288
- # # dataset2_text_column,
1289
- # # threshold
1290
- # # ],
1291
- # # outputs=output
1292
- # # )
1293
-
1294
- # # demo.launch()
1295
 
 
1296
 
1297
 
1298
 
1299
- # # import gradio as gr
1300
- # # from datasets import load_dataset
1301
- # # import numpy as np
1302
- # # from model2vec import StaticModel
1303
- # # from reach import Reach
1304
- # # from difflib import ndiff
1305
- # # import sys
1306
- # # import tqdm
1307
 
1308
- # # # Load the model at startup
1309
- # # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
1310
 
1311
- # # # Load the default datasets at startup
1312
- # # default_dataset1_name = "ag_news"
1313
- # # default_dataset1_split = "train"
1314
- # # default_dataset2_name = "ag_news"
1315
- # # default_dataset2_split = "test"
 
 
1316
 
1317
- # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
1318
- # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
 
1319
 
1320
- # # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
1321
- # # """
1322
- # # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
1323
- # # """
1324
- # # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
 
 
1325
 
1326
- # # deduplicated_indices = set(range(len(embedding_matrix)))
1327
- # # duplicate_to_original_mapping = {}
1328
 
1329
- # # results = reach.nearest_neighbor_threshold(
1330
- # # embedding_matrix,
1331
- # # threshold=threshold,
1332
- # # batch_size=batch_size,
1333
- # # show_progressbar=True # Allow internal progress bar
1334
- # # )
 
 
1335
 
1336
- # # # Process duplicates
1337
- # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=len(embedding_matrix))):
1338
- # # if i not in deduplicated_indices:
1339
- # # continue
 
1340
 
1341
- # # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
1342
 
1343
- # # for sim_idx in similar_indices:
1344
- # # if sim_idx in deduplicated_indices:
1345
- # # deduplicated_indices.remove(sim_idx)
1346
- # # duplicate_to_original_mapping[sim_idx] = i
1347
 
1348
- # # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
1349
 
1350
- # # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
1351
- # # """
1352
- # # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
1353
- # # """
1354
- # # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
 
 
1355
 
1356
- # # duplicate_indices_in_test = []
1357
- # # duplicate_to_original_mapping = {}
1358
 
1359
- # # results = reach.nearest_neighbor_threshold(
1360
- # # embedding_matrix_2,
1361
- # # threshold=threshold,
1362
- # # batch_size=batch_size,
1363
- # # show_progressbar=True # Allow internal progress bar
1364
- # # )
 
 
1365
 
1366
- # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=len(embedding_matrix_2))):
1367
- # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
 
 
1368
 
1369
- # # if similar_indices:
1370
- # # duplicate_indices_in_test.append(i)
1371
- # # duplicate_to_original_mapping[i] = similar_indices[0]
1372
 
1373
- # # return duplicate_indices_in_test, duplicate_to_original_mapping
1374
 
1375
- # # def display_word_differences(x: str, y: str) -> str:
1376
- # # diff = ndiff(x.split(), y.split())
1377
- # # return " ".join([word for word in diff if word.startswith(('+', '-'))])
1378
 
1379
- # # def perform_deduplication(
1380
- # # deduplication_type,
1381
- # # dataset1_name,
1382
- # # dataset1_split,
1383
- # # dataset1_text_column,
1384
- # # dataset2_name="",
1385
- # # dataset2_split="",
1386
- # # dataset2_text_column="",
1387
- # # threshold=0.8,
1388
- # # progress=gr.Progress(track_tqdm=True)
1389
- # # ):
1390
- # # # Monkey-patch tqdm
1391
- # # original_tqdm = tqdm.tqdm
1392
- # # original_reach_tqdm = Reach.__dict__['tqdm'] if 'tqdm' in Reach.__dict__ else None
1393
- # # tqdm.tqdm = progress.tqdm
1394
- # # sys.modules['tqdm'].tqdm = progress.tqdm
1395
- # # sys.modules['tqdm.auto'].tqdm = progress.tqdm
1396
- # # Reach.tqdm = progress.tqdm # Monkey-patch reach's tqdm
1397
 
1398
- # # try:
1399
- # # # Convert threshold to float
1400
- # # threshold = float(threshold)
1401
 
1402
- # # if deduplication_type == "Single dataset":
1403
- # # # Check if the dataset is the default one
1404
- # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1405
- # # ds = ds_default1
1406
- # # else:
1407
- # # ds = load_dataset(dataset1_name, split=dataset1_split)
1408
-
1409
- # # # Extract texts
1410
- # # texts = [example[dataset1_text_column] for example in ds]
1411
-
1412
- # # # Compute embeddings
1413
- # # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
1414
-
1415
- # # # Deduplicate
1416
- # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(embedding_matrix, threshold, progress=progress)
1417
 
1418
- # # # Prepare the results
1419
- # # num_duplicates = len(duplicate_to_original_mapping)
1420
- # # num_total = len(texts)
1421
- # # num_deduplicated = len(deduplicated_indices)
1422
 
1423
- # # result_text = f"**Total documents:** {num_total}\n"
1424
- # # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
1425
- # # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
1426
 
1427
- # # # Show deduplicated examples
1428
- # # result_text += "**Examples of duplicates found:**\n\n"
1429
- # # num_examples = min(5, num_duplicates)
1430
- # # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
1431
- # # original_text = texts[original_idx]
1432
- # # duplicate_text = texts[duplicate_idx]
1433
- # # differences = display_word_differences(original_text, duplicate_text)
1434
- # # result_text += f"**Original text:**\n{original_text}\n\n"
1435
- # # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
1436
- # # result_text += f"**Differences:**\n{differences}\n"
1437
- # # result_text += "-" * 50 + "\n\n"
1438
 
1439
- # # return result_text
1440
 
1441
- # # elif deduplication_type == "Cross-dataset":
1442
- # # # Dataset 1
1443
- # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1444
- # # ds1 = ds_default1
1445
- # # else:
1446
- # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
1447
-
1448
- # # # Dataset 2
1449
- # # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
1450
- # # ds2 = ds_default2
1451
- # # else:
1452
- # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
1453
 
1454
- # # # Extract texts
1455
- # # texts1 = [example[dataset1_text_column] for example in ds1]
1456
- # # texts2 = [example[dataset2_text_column] for example in ds2]
1457
 
1458
- # # # Compute embeddings
1459
- # # embedding_matrix1 = model.encode(texts1, show_progressbar=True) # Enable internal progress bar
1460
- # # embedding_matrix2 = model.encode(texts2, show_progressbar=True) # Enable internal progress bar
1461
 
1462
- # # # Deduplicate across datasets
1463
- # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
1464
- # # embedding_matrix1, embedding_matrix2, threshold, progress=progress)
1465
 
1466
- # # num_duplicates = len(duplicate_indices_in_ds2)
1467
- # # num_total_ds2 = len(texts2)
1468
- # # num_unique_ds2 = num_total_ds2 - num_duplicates
1469
 
1470
- # # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
1471
- # # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
1472
- # # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
1473
 
1474
- # # # Show deduplicated examples
1475
- # # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
1476
- # # num_examples = min(5, num_duplicates)
1477
- # # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
1478
- # # original_idx = duplicate_to_original_mapping[duplicate_idx]
1479
- # # original_text = texts1[original_idx]
1480
- # # duplicate_text = texts2[duplicate_idx]
1481
- # # differences = display_word_differences(original_text, duplicate_text)
1482
- # # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
1483
- # # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
1484
- # # result_text += f"**Differences:**\n{differences}\n"
1485
- # # result_text += "-" * 50 + "\n\n"
1486
 
1487
- # # return result_text
1488
 
1489
- # # finally:
1490
- # # # Restore original tqdm
1491
- # # tqdm.tqdm = original_tqdm
1492
- # # sys.modules['tqdm'].tqdm = original_tqdm
1493
- # # sys.modules['tqdm.auto'].tqdm = original_tqdm
1494
 
1495
- # # # Restore reach's original tqdm
1496
- # # if original_reach_tqdm is not None:
1497
- # # Reach.tqdm = original_reach_tqdm
1498
- # # else:
1499
- # # del Reach.tqdm # If it wasn't originally in Reach's __dict__
1500
 
1501
- # # with gr.Blocks() as demo:
1502
- # # gr.Markdown("# Semantic Deduplication")
1503
-
1504
- # # deduplication_type = gr.Radio(
1505
- # # choices=["Single dataset", "Cross-dataset"],
1506
- # # label="Deduplication Type",
1507
- # # value="Single dataset"
1508
- # # )
1509
 
1510
- # # with gr.Row():
1511
- # # dataset1_name = gr.Textbox(value="ag_news", label="Dataset 1 Name")
1512
- # # dataset1_split = gr.Textbox(value="train", label="Dataset 1 Split")
1513
- # # dataset1_text_column = gr.Textbox(value="text", label="Text Column Name")
1514
 
1515
- # # dataset2_inputs = gr.Column(visible=False)
1516
- # # with dataset2_inputs:
1517
- # # gr.Markdown("### Dataset 2")
1518
- # # with gr.Row():
1519
- # # dataset2_name = gr.Textbox(value="ag_news", label="Dataset 2 Name")
1520
- # # dataset2_split = gr.Textbox(value="test", label="Dataset 2 Split")
1521
- # # dataset2_text_column = gr.Textbox(value="text", label="Text Column Name")
1522
 
1523
- # # threshold = gr.Slider(
1524
- # # minimum=0.0,
1525
- # # maximum=1.0,
1526
- # # value=0.8,
1527
- # # label="Similarity Threshold"
1528
- # # )
1529
 
1530
- # # compute_button = gr.Button("Compute")
1531
 
1532
- # # output = gr.Markdown()
 
 
1533
 
1534
- # # # Function to update the visibility of dataset2_inputs
1535
- # # def update_visibility(deduplication_type_value):
1536
- # # if deduplication_type_value == "Cross-dataset":
1537
- # # return gr.update(visible=True)
1538
- # # else:
1539
- # # return gr.update(visible=False)
1540
 
1541
- # # deduplication_type.change(
1542
- # # update_visibility,
1543
- # # inputs=deduplication_type,
1544
- # # outputs=dataset2_inputs
1545
- # # )
1546
 
1547
- # # compute_button.click(
1548
- # # fn=perform_deduplication,
1549
- # # inputs=[
1550
- # # deduplication_type,
1551
- # # dataset1_name,
1552
- # # dataset1_split,
1553
- # # dataset1_text_column,
1554
- # # dataset2_name,
1555
- # # dataset2_split,
1556
- # # dataset2_text_column,
1557
- # # threshold
1558
- # # ],
1559
- # # outputs=output
1560
- # # )
1561
 
1562
- # # demo.launch()
 
 
1563
 
1564
 
1565
  # # # import gradio as gr
@@ -1600,7 +1654,7 @@ demo.launch()
1600
  # # # )
1601
 
1602
  # # # # Process duplicates
1603
- # # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates")):
1604
  # # # if i not in deduplicated_indices:
1605
  # # # continue
1606
 
@@ -1629,8 +1683,7 @@ demo.launch()
1629
  # # # show_progressbar=True # Allow internal progress bar
1630
  # # # )
1631
 
1632
- # # # # Process duplicates
1633
- # # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets")):
1634
  # # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
1635
 
1636
  # # # if similar_indices:
@@ -1656,9 +1709,11 @@ demo.launch()
1656
  # # # ):
1657
  # # # # Monkey-patch tqdm
1658
  # # # original_tqdm = tqdm.tqdm
 
1659
  # # # tqdm.tqdm = progress.tqdm
1660
  # # # sys.modules['tqdm'].tqdm = progress.tqdm
1661
  # # # sys.modules['tqdm.auto'].tqdm = progress.tqdm
 
1662
 
1663
  # # # try:
1664
  # # # # Convert threshold to float
@@ -1725,7 +1780,8 @@ demo.launch()
1725
  # # # embedding_matrix2 = model.encode(texts2, show_progressbar=True) # Enable internal progress bar
1726
 
1727
  # # # # Deduplicate across datasets
1728
- # # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(embedding_matrix1, embedding_matrix2, threshold, progress=progress)
 
1729
 
1730
  # # # num_duplicates = len(duplicate_indices_in_ds2)
1731
  # # # num_total_ds2 = len(texts2)
@@ -1756,6 +1812,12 @@ demo.launch()
1756
  # # # sys.modules['tqdm'].tqdm = original_tqdm
1757
  # # # sys.modules['tqdm.auto'].tqdm = original_tqdm
1758
 
1759
  # # # with gr.Blocks() as demo:
1760
  # # # gr.Markdown("# Semantic Deduplication")
1761
 
@@ -1818,3 +1880,261 @@ demo.launch()
1818
  # # # )
1819
 
1820
  # # # demo.launch()
26
  for i in range(0, len(iterable), batch_size):
27
  yield iterable[i:i + batch_size]
28
 
29
+ def compute_embeddings_with_progress(texts, batch_size, progress, desc="Computing embeddings"):
30
+ embeddings = []
31
+ total_batches = (len(texts) + batch_size - 1) // batch_size
32
+ for batch_texts in progress.tqdm(batch_iterable(texts, batch_size), desc=desc, total=total_batches):
33
+ batch_embeddings = model.encode(batch_texts, show_progressbar=False)
34
+ embeddings.append(batch_embeddings)
35
+ embedding_matrix = np.concatenate(embeddings, axis=0)
36
+ return embedding_matrix
37
+
38
  def display_word_differences(x: str, y: str) -> str:
39
  diff = ndiff(x.split(), y.split())
40
  return " ".join([word for word in diff if word.startswith(('+', '-'))])
 
74
  # Compute embeddings
75
  status = "Computing embeddings for Dataset 1..."
76
  yield status, ""
77
+ embedding_matrix = compute_embeddings_with_progress(texts, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
78
 
79
  # Deduplicate
80
  status = "Deduplicating embeddings..."
 
141
  # Compute embeddings for Dataset 1
142
  status = "Computing embeddings for Dataset 1..."
143
  yield status, ""
144
+ embedding_matrix1 = compute_embeddings_with_progress(texts1, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
145
 
146
  # Compute embeddings for Dataset 2
147
  status = "Computing embeddings for Dataset 2..."
148
  yield status, ""
149
+ embedding_matrix2 = compute_embeddings_with_progress(texts2, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 2")
150
 
151
  # Deduplicate across datasets
152
  status = "Deduplicating embeddings across datasets..."
 
318
  demo.launch()
319
 
320
 
 
321
  # import gradio as gr
322
  # from datasets import load_dataset
323
  # import numpy as np
 
346
  # for i in range(0, len(iterable), batch_size):
347
  # yield iterable[i:i + batch_size]
348
 
349
  # def display_word_differences(x: str, y: str) -> str:
350
  # diff = ndiff(x.split(), y.split())
351
  # return " ".join([word for word in diff if word.startswith(('+', '-'))])
 
385
  # # Compute embeddings
386
  # status = "Computing embeddings for Dataset 1..."
387
  # yield status, ""
388
+ # embeddings = []
389
+ # batch_size = 64
390
+ # total_batches = (len(texts) + batch_size - 1) // batch_size
391
+ # for batch_texts in progress.tqdm(batch_iterable(texts, batch_size), desc="Computing embeddings", total=total_batches):
392
+ # batch_embeddings = model.encode(batch_texts, show_progressbar=False)
393
+ # embeddings.append(batch_embeddings)
394
+ # embedding_matrix = np.concatenate(embeddings, axis=0)
395
 
396
  # # Deduplicate
397
  # status = "Deduplicating embeddings..."
 
458
  # # Compute embeddings for Dataset 1
459
  # status = "Computing embeddings for Dataset 1..."
460
  # yield status, ""
461
+ # embeddings1 = []
462
+ # batch_size = 64
463
+ # total_batches1 = (len(texts1) + batch_size - 1) // batch_size
464
+ # for batch_texts in progress.tqdm(batch_iterable(texts1, batch_size), desc="Computing embeddings for Dataset 1", total=total_batches1):
465
+ # batch_embeddings = model.encode(batch_texts, show_progressbar=False)
466
+ # embeddings1.append(batch_embeddings)
467
+ # embedding_matrix1 = np.concatenate(embeddings1, axis=0)
468
 
469
  # # Compute embeddings for Dataset 2
470
  # status = "Computing embeddings for Dataset 2..."
471
  # yield status, ""
472
+ # embeddings2 = []
473
+ # total_batches2 = (len(texts2) + batch_size - 1) // batch_size
474
+ # for batch_texts in progress.tqdm(batch_iterable(texts2, batch_size), desc="Computing embeddings for Dataset 2", total=total_batches2):
475
+ # batch_embeddings = model.encode(batch_texts, show_progressbar=False)
476
+ # embeddings2.append(batch_embeddings)
477
+ # embedding_matrix2 = np.concatenate(embeddings2, axis=0)
478
 
479
  # # Deduplicate across datasets
480
  # status = "Deduplicating embeddings across datasets..."
 
487
  # num_total_ds2 = len(texts2)
488
  # num_unique_ds2 = num_total_ds2 - num_duplicates
489
 
490
+ # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
491
+ # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
492
  # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
493
 
494
  # # Show deduplicated examples
 
515
  # yield f"An error occurred: {e}", ""
516
  # raise e
517
 
518
+ # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
519
+ # """
520
+ # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
521
+ # """
522
+ # # Building the index
523
+ # progress(0, desc="Building search index...")
524
+ # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
525
+
526
+ # deduplicated_indices = set(range(len(embedding_matrix)))
527
+ # duplicate_to_original_mapping = {}
528
+
529
+ # # Finding nearest neighbors
530
+ # progress(0, desc="Finding nearest neighbors...")
531
+ # results = reach.nearest_neighbor_threshold(
532
+ # embedding_matrix,
533
+ # threshold=threshold,
534
+ # batch_size=batch_size,
535
+ # show_progressbar=False # Disable internal progress bar
536
+ # )
537
+
538
+ # # Processing duplicates with a progress bar
539
+ # total_items = len(embedding_matrix)
540
+ # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
541
+ # if i not in deduplicated_indices:
542
+ # continue
543
+
544
+ # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
545
+
546
+ # for sim_idx in similar_indices:
547
+ # if sim_idx in deduplicated_indices:
548
+ # deduplicated_indices.remove(sim_idx)
549
+ # duplicate_to_original_mapping[sim_idx] = i
550
+
551
+ # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
552
+
553
+ # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
554
+ # """
555
+ # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
556
+ # """
557
+ # # Building the index from Dataset 1
558
+ # progress(0, desc="Building search index from Dataset 1...")
559
+ # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
560
+
561
+ # duplicate_indices_in_test = []
562
+ # duplicate_to_original_mapping = {}
563
+
564
+ # # Finding nearest neighbors between datasets
565
+ # progress(0, desc="Finding nearest neighbors between datasets...")
566
+ # results = reach.nearest_neighbor_threshold(
567
+ # embedding_matrix_2,
568
+ # threshold=threshold,
569
+ # batch_size=batch_size,
570
+ # show_progressbar=False # Disable internal progress bar
571
+ # )
572
+
573
+ # total_items = len(embedding_matrix_2)
574
+ # # Processing duplicates with a progress bar
575
+ # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
576
+ # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
577
+
578
+ # if similar_indices:
579
+ # duplicate_indices_in_test.append(i)
580
+ # duplicate_to_original_mapping[i] = similar_indices[0]
581
+
582
+ # return duplicate_indices_in_test, duplicate_to_original_mapping
583
+
584
  # with gr.Blocks() as demo:
585
  # gr.Markdown("# Semantic Deduplication")
586
 
 
645
 
646
  # demo.launch()
647
 
648
 
649
 
650
  # # import gradio as gr
 
653
  # # from model2vec import StaticModel
654
  # # from reach import Reach
655
  # # from difflib import ndiff
 
656
  # # import tqdm
657
 
658
  # # # Load the model at startup
 
670
  # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
671
  # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
672
 
673
+ # # def batch_iterable(iterable, batch_size):
674
+ # # """Helper function to create batches from an iterable."""
675
+ # # for i in range(0, len(iterable), batch_size):
676
+ # # yield iterable[i:i + batch_size]
677
+
678
+ # # def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
679
+ # # embeddings = []
680
+ # # for batch in progress.tqdm(batch_iterable(texts, batch_size), total=(len(texts) + batch_size - 1) // batch_size, desc=desc):
681
+ # # batch_embeddings = model.encode(batch, show_progressbar=False)
682
+ # # embeddings.append(batch_embeddings)
683
+ # # return np.concatenate(embeddings, axis=0)
684
+
685
+ # # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
686
  # # """
687
  # # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
688
  # # """
689
  # # # Building the index
690
+ # # progress(0, desc="Building search index...")
691
  # # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
692
 
693
  # # deduplicated_indices = set(range(len(embedding_matrix)))
694
  # # duplicate_to_original_mapping = {}
695
 
696
  # # # Finding nearest neighbors
697
+ # # progress(0, desc="Finding nearest neighbors...")
698
  # # results = reach.nearest_neighbor_threshold(
699
  # # embedding_matrix,
700
  # # threshold=threshold,
701
  # # batch_size=batch_size,
702
+ # # show_progressbar=False # Disable internal progress bar
703
  # # )
704
 
705
+ # # # Processing duplicates with a progress bar
706
+ # # total_items = len(embedding_matrix)
707
+ # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
708
  # # if i not in deduplicated_indices:
709
  # # continue
710
 
 
717
 
718
  # # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
719
 
720
+ # # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
  # # """
  # # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
  # # """
  # # # Building the index from Dataset 1
+ # # progress(0, desc="Building search index from Dataset 1...")
  # # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
 
  # # duplicate_indices_in_test = []
  # # duplicate_to_original_mapping = {}
 
  # # # Finding nearest neighbors between datasets
+ # # progress(0, desc="Finding nearest neighbors between datasets...")
  # # results = reach.nearest_neighbor_threshold(
  # # embedding_matrix_2,
  # # threshold=threshold,
  # # batch_size=batch_size,
+ # # show_progressbar=False # Disable internal progress bar
  # # )
 
+ # # total_items = len(embedding_matrix_2)
+ # # # Processing duplicates with a progress bar
+ # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
  # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
 
  # # if similar_indices:
 
  # # threshold=default_threshold,
  # # progress=gr.Progress(track_tqdm=True)
  # # ):
 
  # # try:
  # # # Convert threshold to float
  # # threshold = float(threshold)
+
  # # # Initialize status message
  # # status = ""
 
  # # ds = ds_default1
  # # else:
  # # ds = load_dataset(dataset1_name, split=dataset1_split)
+
  # # # Extract texts
  # # status = "Extracting texts from Dataset 1..."
  # # yield status, ""
  # # texts = [example[dataset1_text_column] for example in ds]
+
  # # # Compute embeddings
  # # status = "Computing embeddings for Dataset 1..."
  # # yield status, ""
+ # # embedding_matrix = compute_embeddings(texts, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
+
  # # # Deduplicate
  # # status = "Deduplicating embeddings..."
  # # yield status, ""
  # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
+ # # embedding_matrix, threshold, progress=progress
  # # )
+
  # # # Prepare the results
  # # num_duplicates = len(duplicate_to_original_mapping)
  # # num_total = len(texts)
  # # num_deduplicated = len(deduplicated_indices)
+
  # # result_text = f"**Total documents:** {num_total}\n"
  # # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
  # # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
+
  # # # Show deduplicated examples
  # # if num_duplicates > 0:
  # # result_text += "**Examples of duplicates found:**\n\n"
 
  # # result_text += "-" * 50 + "\n\n"
  # # else:
  # # result_text += "No duplicates found."
+
  # # # Final status
  # # status = "Deduplication completed."
  # # yield status, result_text
+
  # # elif deduplication_type == "Cross-dataset":
  # # # Load Dataset 1
  # # status = "Loading Dataset 1..."
 
  # # ds1 = ds_default1
  # # else:
  # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
+
  # # # Load Dataset 2
  # # status = "Loading Dataset 2..."
  # # yield status, ""
 
  # # ds2 = ds_default2
  # # else:
  # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
+
  # # # Extract texts from Dataset 1
  # # status = "Extracting texts from Dataset 1..."
  # # yield status, ""
  # # texts1 = [example[dataset1_text_column] for example in ds1]
+
  # # # Extract texts from Dataset 2
  # # status = "Extracting texts from Dataset 2..."
  # # yield status, ""
  # # texts2 = [example[dataset2_text_column] for example in ds2]
+
  # # # Compute embeddings for Dataset 1
  # # status = "Computing embeddings for Dataset 1..."
  # # yield status, ""
+ # # embedding_matrix1 = compute_embeddings(texts1, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
+
  # # # Compute embeddings for Dataset 2
  # # status = "Computing embeddings for Dataset 2..."
  # # yield status, ""
+ # # embedding_matrix2 = compute_embeddings(texts2, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 2")
+
  # # # Deduplicate across datasets
  # # status = "Deduplicating embeddings across datasets..."
  # # yield status, ""
  # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
+ # # embedding_matrix1, embedding_matrix2, threshold, progress=progress
  # # )
+
  # # num_duplicates = len(duplicate_indices_in_ds2)
  # # num_total_ds2 = len(texts2)
  # # num_unique_ds2 = num_total_ds2 - num_duplicates
+
+ # # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n\n"
+ # # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n\n"
  # # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
+
  # # # Show deduplicated examples
  # # if num_duplicates > 0:
  # # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
 
  # # result_text += "-" * 50 + "\n\n"
  # # else:
  # # result_text += "No duplicates found."
+
  # # # Final status
  # # status = "Deduplication completed."
  # # yield status, result_text
 
+ # # except Exception as e:
+ # # yield f"An error occurred: {e}", ""
+ # # raise e
 
  # # with gr.Blocks() as demo:
  # # gr.Markdown("# Semantic Deduplication")
 
  # # compute_button.click(
  # # fn=perform_deduplication,
  # # inputs=[
+ # # deduplication_type,
+ # # dataset1_name,
+ # # dataset1_split,
  # # dataset1_text_column,
+ # # dataset2_name,
+ # # dataset2_split,
  # # dataset2_text_column,
  # # threshold
  # # ],
  # # outputs=[status_output, result_output]
  # # )
+
  # # demo.launch()
 
+
+ # # # import gradio as gr
989
+ # # # from datasets import load_dataset
990
+ # # # import numpy as np
991
+ # # # from model2vec import StaticModel
992
+ # # # from reach import Reach
993
+ # # # from difflib import ndiff
994
+ # # # import sys
995
+ # # # import tqdm
996
+
997
+ # # # # Load the model at startup
998
+ # # # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
999
+
1000
+ # # # # Update default dataset to 'sst2' and set default threshold to 0.9
1001
+ # # # default_dataset1_name = "sst2"
1002
+ # # # default_dataset1_split = "train"
1003
+ # # # default_dataset2_name = "sst2"
1004
+ # # # default_dataset2_split = "validation"
1005
+ # # # default_text_column = "sentence"
1006
+ # # # default_threshold = 0.9
1007
+
1008
+ # # # # Load the default datasets at startup
1009
+ # # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
1010
+ # # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
1011
+
1012
+ # # # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024) -> tuple[np.ndarray, dict[int, int]]:
1013
+ # # # """
1014
+ # # # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
1015
+ # # # """
1016
+ # # # # Building the index
1017
+ # # # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
1018
+
1019
+ # # # deduplicated_indices = set(range(len(embedding_matrix)))
1020
+ # # # duplicate_to_original_mapping = {}
1021
+
1022
+ # # # # Finding nearest neighbors
1023
+ # # # results = reach.nearest_neighbor_threshold(
1024
+ # # # embedding_matrix,
1025
+ # # # threshold=threshold,
1026
+ # # # batch_size=batch_size,
1027
+ # # # show_progressbar=True # Allow internal progress bar
1028
+ # # # )
1029
+
1030
+ # # # # Processing duplicates
1031
+ # # # for i, similar_items in enumerate(results):
1032
+ # # # if i not in deduplicated_indices:
1033
+ # # # continue
1034
+
1035
+ # # # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
1036
+
1037
+ # # # for sim_idx in similar_indices:
1038
+ # # # if sim_idx in deduplicated_indices:
1039
+ # # # deduplicated_indices.remove(sim_idx)
1040
+ # # # duplicate_to_original_mapping[sim_idx] = i
1041
+
1042
+ # # # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
1043
+
1044
+ # # # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024) -> tuple[list[int], dict[int, int]]:
1045
+ # # # """
1046
+ # # # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
1047
+ # # # """
1048
+ # # # # Building the index from Dataset 1
1049
+ # # # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
1050
+
1051
+ # # # duplicate_indices_in_test = []
1052
+ # # # duplicate_to_original_mapping = {}
1053
+
1054
+ # # # # Finding nearest neighbors between datasets
1055
+ # # # results = reach.nearest_neighbor_threshold(
1056
+ # # # embedding_matrix_2,
1057
+ # # # threshold=threshold,
1058
+ # # # batch_size=batch_size,
1059
+ # # # show_progressbar=True # Allow internal progress bar
1060
+ # # # )
1061
+
1062
+ # # # # Processing duplicates
1063
+ # # # for i, similar_items in enumerate(results):
1064
+ # # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
1065
+
1066
+ # # # if similar_indices:
1067
+ # # # duplicate_indices_in_test.append(i)
1068
+ # # # duplicate_to_original_mapping[i] = similar_indices[0]
1069
+
1070
+ # # # return duplicate_indices_in_test, duplicate_to_original_mapping
1071
+
1072
+ # # # def display_word_differences(x: str, y: str) -> str:
1073
+ # # # diff = ndiff(x.split(), y.split())
1074
+ # # # return " ".join([word for word in diff if word.startswith(('+', '-'))])
1075
+
1076
+ # # # def perform_deduplication(
1077
+ # # # deduplication_type,
1078
+ # # # dataset1_name,
1079
+ # # # dataset1_split,
1080
+ # # # dataset1_text_column,
1081
+ # # # dataset2_name="",
1082
+ # # # dataset2_split="",
1083
+ # # # dataset2_text_column="",
1084
+ # # # threshold=default_threshold,
1085
+ # # # progress=gr.Progress(track_tqdm=True)
1086
+ # # # ):
1087
+ # # # # Deep Monkey-Patching of tqdm
1088
+ # # # original_tqdm = tqdm.tqdm
1089
+ # # # tqdm.tqdm = progress.tqdm
1090
+ # # # for mod_name in list(sys.modules.keys()):
1091
+ # # # if 'tqdm' in mod_name:
1092
+ # # # sys.modules[mod_name].tqdm = progress.tqdm
1093
+
1094
+ # # # try:
1095
+ # # # # Convert threshold to float
1096
+ # # # threshold = float(threshold)
1097
 
1098
+ # # # # Initialize status message
1099
+ # # # status = ""
1100
+
1101
+ # # # if deduplication_type == "Single dataset":
1102
+ # # # # Load Dataset 1
1103
+ # # # status = "Loading Dataset 1..."
1104
+ # # # yield status, ""
1105
+ # # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1106
+ # # # ds = ds_default1
1107
+ # # # else:
1108
+ # # # ds = load_dataset(dataset1_name, split=dataset1_split)
1109
 
1110
+ # # # # Extract texts
1111
+ # # # status = "Extracting texts from Dataset 1..."
1112
+ # # # yield status, ""
1113
+ # # # texts = [example[dataset1_text_column] for example in ds]
1114
 
1115
+ # # # # Compute embeddings
1116
+ # # # status = "Computing embeddings for Dataset 1..."
1117
+ # # # yield status, ""
1118
+ # # # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
1119
 
1120
+ # # # # Deduplicate
1121
+ # # # status = "Deduplicating embeddings..."
1122
+ # # # yield status, ""
1123
+ # # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
1124
+ # # # embedding_matrix, threshold
1125
+ # # # )
1126
+
1127
+ # # # # Prepare the results
1128
+ # # # num_duplicates = len(duplicate_to_original_mapping)
1129
+ # # # num_total = len(texts)
1130
+ # # # num_deduplicated = len(deduplicated_indices)
1131
+
1132
+ # # # result_text = f"**Total documents:** {num_total}\n"
1133
+ # # # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
1134
+ # # # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
1135
 
1136
+ # # # # Show deduplicated examples
1137
+ # # # if num_duplicates > 0:
1138
+ # # # result_text += "**Examples of duplicates found:**\n\n"
1139
+ # # # num_examples = min(5, num_duplicates)
1140
+ # # # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
1141
+ # # # original_text = texts[original_idx]
1142
+ # # # duplicate_text = texts[duplicate_idx]
1143
+ # # # differences = display_word_differences(original_text, duplicate_text)
1144
+ # # # result_text += f"**Original text:**\n{original_text}\n\n"
1145
+ # # # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
1146
+ # # # result_text += f"**Differences:**\n{differences}\n"
1147
+ # # # result_text += "-" * 50 + "\n\n"
1148
+ # # # else:
1149
+ # # # result_text += "No duplicates found."
1150
+
1151
+ # # # # Final status
1152
+ # # # status = "Deduplication completed."
1153
+ # # # yield status, result_text
1154
 
1155
+ # # # elif deduplication_type == "Cross-dataset":
1156
+ # # # # Load Dataset 1
1157
+ # # # status = "Loading Dataset 1..."
1158
+ # # # yield status, ""
1159
+ # # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1160
+ # # # ds1 = ds_default1
1161
+ # # # else:
1162
+ # # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
1163
 
1164
+ # # # # Load Dataset 2
1165
+ # # # status = "Loading Dataset 2..."
1166
+ # # # yield status, ""
1167
+ # # # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
1168
+ # # # ds2 = ds_default2
1169
+ # # # else:
1170
+ # # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
1171
 
1172
+ # # # # Extract texts from Dataset 1
1173
+ # # # status = "Extracting texts from Dataset 1..."
1174
+ # # # yield status, ""
1175
+ # # # texts1 = [example[dataset1_text_column] for example in ds1]
1176
 
1177
+ # # # # Extract texts from Dataset 2
1178
+ # # # status = "Extracting texts from Dataset 2..."
1179
+ # # # yield status, ""
1180
+ # # # texts2 = [example[dataset2_text_column] for example in ds2]
1181
 
1182
+ # # # # Compute embeddings for Dataset 1
1183
+ # # # status = "Computing embeddings for Dataset 1..."
1184
+ # # # yield status, ""
1185
+ # # # embedding_matrix1 = model.encode(texts1, show_progressbar=True)
1186
 
1187
+ # # # # Compute embeddings for Dataset 2
1188
+ # # # status = "Computing embeddings for Dataset 2..."
1189
+ # # # yield status, ""
1190
+ # # # embedding_matrix2 = model.encode(texts2, show_progressbar=True)
1191
 
1192
+ # # # # Deduplicate across datasets
1193
+ # # # status = "Deduplicating embeddings across datasets..."
1194
+ # # # yield status, ""
1195
+ # # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
1196
+ # # # embedding_matrix1, embedding_matrix2, threshold
1197
+ # # # )
1198
 
1199
+ # # # num_duplicates = len(duplicate_indices_in_ds2)
1200
+ # # # num_total_ds2 = len(texts2)
1201
+ # # # num_unique_ds2 = num_total_ds2 - num_duplicates
1202
+
1203
+ # # # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
1204
+ # # # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
1205
+ # # # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
1206
+
1207
+ # # # # Show deduplicated examples
1208
+ # # # if num_duplicates > 0:
1209
+ # # # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
1210
+ # # # num_examples = min(5, num_duplicates)
1211
+ # # # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
1212
+ # # # original_idx = duplicate_to_original_mapping[duplicate_idx]
1213
+ # # # original_text = texts1[original_idx]
1214
+ # # # duplicate_text = texts2[duplicate_idx]
1215
+ # # # differences = display_word_differences(original_text, duplicate_text)
1216
+ # # # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
1217
+ # # # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
1218
+ # # # result_text += f"**Differences:**\n{differences}\n"
1219
+ # # # result_text += "-" * 50 + "\n\n"
1220
+ # # # else:
1221
+ # # # result_text += "No duplicates found."
1222
+
1223
+ # # # # Final status
1224
+ # # # status = "Deduplication completed."
+ # # # yield status, result_text
 
+ # # # finally:
1228
+ # # # # Restore original tqdm
1229
+ # # # tqdm.tqdm = original_tqdm
1230
+ # # # for mod_name in list(sys.modules.keys()):
1231
+ # # # if 'tqdm' in mod_name:
1232
+ # # # sys.modules[mod_name].tqdm = original_tqdm
1233
 
1234
+ # # # with gr.Blocks() as demo:
1235
+ # # # gr.Markdown("# Semantic Deduplication")
 
 
1236
 
1237
+ # # # deduplication_type = gr.Radio(
1238
+ # # # choices=["Single dataset", "Cross-dataset"],
1239
+ # # # label="Deduplication Type",
1240
+ # # # value="Single dataset"
1241
+ # # # )
 
 
1242
 
1243
+ # # # with gr.Row():
1244
+ # # # dataset1_name = gr.Textbox(value=default_dataset1_name, label="Dataset 1 Name")
1245
+ # # # dataset1_split = gr.Textbox(value=default_dataset1_split, label="Dataset 1 Split")
1246
+ # # # dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
 
 
1247
 
1248
+ # # # dataset2_inputs = gr.Column(visible=False)
1249
+ # # # with dataset2_inputs:
1250
+ # # # gr.Markdown("### Dataset 2")
1251
+ # # # with gr.Row():
1252
+ # # # dataset2_name = gr.Textbox(value=default_dataset2_name, label="Dataset 2 Name")
1253
+ # # # dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
1254
+ # # # dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
1255
 
1256
+ # # # threshold = gr.Slider(
1257
+ # # # minimum=0.0,
1258
+ # # # maximum=1.0,
1259
+ # # # value=default_threshold,
1260
+ # # # label="Similarity Threshold"
1261
+ # # # )
1262
 
1263
+ # # # compute_button = gr.Button("Compute")
 
 
 
 
 
1264
 
1265
+ # # # status_output = gr.Markdown()
1266
+ # # # result_output = gr.Markdown()
 
 
 
1267
 
1268
+ # # # # Function to update the visibility of dataset2_inputs
1269
+ # # # def update_visibility(deduplication_type_value):
1270
+ # # # if deduplication_type_value == "Cross-dataset":
1271
+ # # # return gr.update(visible=True)
1272
+ # # # else:
1273
+ # # # return gr.update(visible=False)
 
 
 
 
 
 
 
 
 
 
1274
 
1275
+ # # # deduplication_type.change(
1276
+ # # # update_visibility,
1277
+ # # # inputs=deduplication_type,
1278
+ # # # outputs=dataset2_inputs
1279
+ # # # )
1280
 
1281
+ # # # compute_button.click(
1282
+ # # # fn=perform_deduplication,
1283
+ # # # inputs=[
1284
+ # # # deduplication_type,
1285
+ # # # dataset1_name,
1286
+ # # # dataset1_split,
1287
+ # # # dataset1_text_column,
1288
+ # # # dataset2_name,
1289
+ # # # dataset2_split,
1290
+ # # # dataset2_text_column,
1291
+ # # # threshold
1292
+ # # # ],
1293
+ # # # outputs=[status_output, result_output]
1294
+ # # # )
1295
+
1296
+ # # # demo.launch()
1297
 
1298
 
1299
+ # # # import gradio as gr
1300
+ # # # from datasets import load_dataset
1301
+ # # # import numpy as np
1302
+ # # # from model2vec import StaticModel
1303
+ # # # from reach import Reach
1304
+ # # # from difflib import ndiff
1305
+ # # # import sys
1306
+ # # # import tqdm
1307
 
1308
+ # # # # Load the model at startup
1309
+ # # # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
1310
 
1311
+ # # # # Update default dataset to 'sst2' and set default threshold to 0.9
1312
+ # # # default_dataset1_name = "sst2"
1313
+ # # # default_dataset1_split = "train"
1314
+ # # # default_dataset2_name = "sst2"
1315
+ # # # default_dataset2_split = "validation"
1316
+ # # # default_text_column = "sentence"
1317
+ # # # default_threshold = 0.9
1318
 
1319
+ # # # # Load the default datasets at startup
1320
+ # # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
1321
+ # # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
1322
 
1323
+ # # # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
1324
+ # # # """
1325
+ # # # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
1326
+ # # # """
1327
+ # # # # Update progress to indicate building the index
1328
+ # # # progress(0, desc="Building search index...")
1329
+ # # # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
1330
 
1331
+ # # # deduplicated_indices = set(range(len(embedding_matrix)))
1332
+ # # # duplicate_to_original_mapping = {}
1333
 
1334
+ # # # # Finding nearest neighbors
1335
+ # # # progress(0, desc="Finding nearest neighbors...")
1336
+ # # # results = reach.nearest_neighbor_threshold(
1337
+ # # # embedding_matrix,
1338
+ # # # threshold=threshold,
1339
+ # # # batch_size=batch_size,
1340
+ # # # show_progressbar=True # Allow internal progress bar
1341
+ # # # )
1342
 
1343
+ # # # # Processing duplicates with a progress bar
1344
+ # # # total_items = len(embedding_matrix)
1345
+ # # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
1346
+ # # # if i not in deduplicated_indices:
1347
+ # # # continue
1348
 
1349
+ # # # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
1350
 
1351
+ # # # for sim_idx in similar_indices:
1352
+ # # # if sim_idx in deduplicated_indices:
1353
+ # # # deduplicated_indices.remove(sim_idx)
1354
+ # # # duplicate_to_original_mapping[sim_idx] = i
1355
 
1356
+ # # # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
1357
 
1358
+ # # # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
1359
+ # # # """
1360
+ # # # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
1361
+ # # # """
1362
+ # # # # Update progress to indicate building the index
1363
+ # # # progress(0, desc="Building search index from Dataset 1...")
1364
+ # # # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
1365
 
1366
+ # # # duplicate_indices_in_test = []
1367
+ # # # duplicate_to_original_mapping = {}
1368
 
1369
+ # # # # Finding nearest neighbors between datasets
1370
+ # # # progress(0, desc="Finding nearest neighbors between datasets...")
1371
+ # # # results = reach.nearest_neighbor_threshold(
1372
+ # # # embedding_matrix_2,
1373
+ # # # threshold=threshold,
1374
+ # # # batch_size=batch_size,
1375
+ # # # show_progressbar=True # Allow internal progress bar
1376
+ # # # )
1377
 
1378
+ # # # total_items = len(embedding_matrix_2)
1379
+ # # # # Processing duplicates with a progress bar
1380
+ # # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
1381
+ # # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
1382
 
1383
+ # # # if similar_indices:
1384
+ # # # duplicate_indices_in_test.append(i)
1385
+ # # # duplicate_to_original_mapping[i] = similar_indices[0]
1386
 
1387
+ # # # return duplicate_indices_in_test, duplicate_to_original_mapping
1388
 
1389
+ # # # def display_word_differences(x: str, y: str) -> str:
1390
+ # # # diff = ndiff(x.split(), y.split())
1391
+ # # # return " ".join([word for word in diff if word.startswith(('+', '-'))])
1392
 
1393
+ # # # def perform_deduplication(
1394
+ # # # deduplication_type,
1395
+ # # # dataset1_name,
1396
+ # # # dataset1_split,
1397
+ # # # dataset1_text_column,
1398
+ # # # dataset2_name="",
1399
+ # # # dataset2_split="",
1400
+ # # # dataset2_text_column="",
1401
+ # # # threshold=default_threshold,
1402
+ # # # progress=gr.Progress(track_tqdm=True)
1403
+ # # # ):
1404
+ # # # # Monkey-patch tqdm
1405
+ # # # original_tqdm = tqdm.tqdm
1406
+ # # # original_reach_tqdm = Reach.__dict__['tqdm'] if 'tqdm' in Reach.__dict__ else None
1407
+ # # # tqdm.tqdm = progress.tqdm
1408
+ # # # sys.modules['tqdm'].tqdm = progress.tqdm
1409
+ # # # sys.modules['tqdm.auto'].tqdm = progress.tqdm
1410
+ # # # Reach.tqdm = progress.tqdm # Monkey-patch reach's tqdm
1411
 
1412
+ # # # try:
1413
+ # # # # Convert threshold to float
1414
+ # # # threshold = float(threshold)
1415
 
1416
+ # # # if deduplication_type == "Single dataset":
1417
+ # # # # Load Dataset 1
1418
+ # # # progress(0, desc="Loading Dataset 1...")
1419
+ # # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1420
+ # # # ds = ds_default1
1421
+ # # # else:
1422
+ # # # ds = load_dataset(dataset1_name, split=dataset1_split)
 
 
 
 
 
 
 
 
1423
 
1424
+ # # # # Extract texts
1425
+ # # # progress(0, desc="Extracting texts from Dataset 1...")
1426
+ # # # texts = [example[dataset1_text_column] for example in ds]
 
1427
 
1428
+ # # # # Compute embeddings
1429
+ # # # progress(0, desc="Computing embeddings for Dataset 1...")
1430
+ # # # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
1431
 
1432
+ # # # # Deduplicate
1433
+ # # # result_text = deduplicate_and_prepare_results_single(
1434
+ # # # embedding_matrix, texts, threshold, progress
1435
+ # # # )
 
 
 
 
 
 
 
1436
 
1437
+ # # # return result_text
1438
 
1439
+ # # # elif deduplication_type == "Cross-dataset":
1440
+ # # # # Load Dataset 1
1441
+ # # # progress(0, desc="Loading Dataset 1...")
1442
+ # # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1443
+ # # # ds1 = ds_default1
1444
+ # # # else:
1445
+ # # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
 
 
 
 
 
1446
 
1447
+ # # # # Load Dataset 2
1448
+ # # # progress(0, desc="Loading Dataset 2...")
1449
+ # # # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
1450
+ # # # ds2 = ds_default2
1451
+ # # # else:
1452
+ # # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
1453
 
1454
+ # # # # Extract texts from Dataset 1
1455
+ # # # progress(0, desc="Extracting texts from Dataset 1...")
1456
+ # # # texts1 = [example[dataset1_text_column] for example in ds1]
1457
 
1458
+ # # # # Extract texts from Dataset 2
1459
+ # # # progress(0, desc="Extracting texts from Dataset 2...")
1460
+ # # # texts2 = [example[dataset2_text_column] for example in ds2]
1461
 
1462
+ # # # # Compute embeddings for Dataset 1
1463
+ # # # progress(0, desc="Computing embeddings for Dataset 1...")
1464
+ # # # embedding_matrix1 = model.encode(texts1, show_progressbar=True)
1465
 
1466
+ # # # # Compute embeddings for Dataset 2
1467
+ # # # progress(0, desc="Computing embeddings for Dataset 2...")
1468
+ # # # embedding_matrix2 = model.encode(texts2, show_progressbar=True)
1469
 
1470
+ # # # # Deduplicate across datasets
1471
+ # # # result_text = deduplicate_and_prepare_results_cross(
1472
+ # # # embedding_matrix1, embedding_matrix2, texts1, texts2, threshold, progress, dataset2_name, dataset2_split
1473
+ # # # )
 
 
 
 
 
 
 
 
1474
 
1475
+ # # # return result_text
1476
 
1477
+ # # # finally:
1478
+ # # # # Restore original tqdm
1479
+ # # # tqdm.tqdm = original_tqdm
1480
+ # # # sys.modules['tqdm'].tqdm = original_tqdm
1481
+ # # # sys.modules['tqdm.auto'].tqdm = original_tqdm
1482
 
1483
+ # # # # Restore reach's original tqdm
1484
+ # # # if original_reach_tqdm is not None:
1485
+ # # # Reach.tqdm = original_reach_tqdm
1486
+ # # # else:
1487
+ # # # del Reach.tqdm # If it wasn't originally in Reach's __dict__
1488
 
1489
+ # # # def deduplicate_and_prepare_results_single(embedding_matrix, texts, threshold, progress):
1490
+ # # # # Deduplicate
1491
+ # # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
1492
+ # # # embedding_matrix, threshold, progress=progress
1493
+ # # # )
 
 
 
1494
 
1495
+ # # # # Prepare the results
1496
+ # # # num_duplicates = len(duplicate_to_original_mapping)
1497
+ # # # num_total = len(texts)
1498
+ # # # num_deduplicated = len(deduplicated_indices)
1499
 
1500
+ # # # result_text = f"**Total documents:** {num_total}\n"
1501
+ # # # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
1502
+ # # # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
 
 
 
 
1503
 
1504
+ # # # # Show deduplicated examples
1505
+ # # # if num_duplicates > 0:
1506
+ # # # result_text += "**Examples of duplicates found:**\n\n"
1507
+ # # # num_examples = min(5, num_duplicates)
1508
+ # # # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
1509
+ # # # original_text = texts[original_idx]
1510
+ # # # duplicate_text = texts[duplicate_idx]
1511
+ # # # differences = display_word_differences(original_text, duplicate_text)
1512
+ # # # result_text += f"**Original text:**\n{original_text}\n\n"
1513
+ # # # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
1514
+ # # # result_text += f"**Differences:**\n{differences}\n"
1515
+ # # # result_text += "-" * 50 + "\n\n"
1516
+ # # # else:
1517
+ # # # result_text += "No duplicates found."
1518
 
1519
+ # # # return result_text
1520
+
1521
+ # # # def deduplicate_and_prepare_results_cross(embedding_matrix1, embedding_matrix2, texts1, texts2, threshold, progress, dataset2_name, dataset2_split):
1522
+ # # # # Deduplicate across datasets
1523
+ # # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
1524
+ # # # embedding_matrix1, embedding_matrix2, threshold, progress=progress
1525
+ # # # )
1526
 
1527
+ # # # num_duplicates = len(duplicate_indices_in_ds2)
1528
+ # # # num_total_ds2 = len(texts2)
1529
+ # # # num_unique_ds2 = num_total_ds2 - num_duplicates
1530
 
1531
+ # # # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
1532
+ # # # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
1533
+ # # # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
 
 
 
1534
 
1535
+ # # # # Show deduplicated examples
1536
+ # # # if num_duplicates > 0:
1537
+ # # # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
1538
+ # # # num_examples = min(5, num_duplicates)
1539
+ # # # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
1540
+ # # # original_idx = duplicate_to_original_mapping[duplicate_idx]
1541
+ # # # original_text = texts1[original_idx]
1542
+ # # # duplicate_text = texts2[duplicate_idx]
1543
+ # # # differences = display_word_differences(original_text, duplicate_text)
1544
+ # # # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
1545
+ # # # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
1546
+ # # # result_text += f"**Differences:**\n{differences}\n"
1547
+ # # # result_text += "-" * 50 + "\n\n"
1548
+ # # # else:
1549
+ # # # result_text += "No duplicates found."
1550
 
1551
+ # # # return result_text
1552
+
1553
+ # # # with gr.Blocks() as demo:
1554
+ # # # gr.Markdown("# Semantic Deduplication")
1555
+
1556
+ # # # deduplication_type = gr.Radio(
1557
+ # # # choices=["Single dataset", "Cross-dataset"],
1558
+ # # # label="Deduplication Type",
1559
+ # # # value="Single dataset"
1560
+ # # # )
1561
+
1562
+ # # # with gr.Row():
1563
+ # # # dataset1_name = gr.Textbox(value=default_dataset1_name, label="Dataset 1 Name")
1564
+ # # # dataset1_split = gr.Textbox(value=default_dataset1_split, label="Dataset 1 Split")
1565
+ # # # dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
1566
+
1567
+ # # # dataset2_inputs = gr.Column(visible=False)
1568
+ # # # with dataset2_inputs:
1569
+ # # # gr.Markdown("### Dataset 2")
1570
+ # # # with gr.Row():
1571
+ # # # dataset2_name = gr.Textbox(value=default_dataset2_name, label="Dataset 2 Name")
1572
+ # # # dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
1573
+ # # # dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
1574
+
1575
+ # # # threshold = gr.Slider(
1576
+ # # # minimum=0.0,
1577
+ # # # maximum=1.0,
1578
+ # # # value=default_threshold,
1579
+ # # # label="Similarity Threshold"
1580
+ # # # )
1581
+
1582
+ # # # compute_button = gr.Button("Compute")
1583
+
1584
+ # # # output = gr.Markdown()
1585
+
1586
+ # # # # Function to update the visibility of dataset2_inputs
1587
+ # # # def update_visibility(deduplication_type_value):
1588
+ # # # if deduplication_type_value == "Cross-dataset":
1589
+ # # # return gr.update(visible=True)
1590
+ # # # else:
1591
+ # # # return gr.update(visible=False)
1592
+
1593
+ # # # deduplication_type.change(
1594
+ # # # update_visibility,
1595
+ # # # inputs=deduplication_type,
1596
+ # # # outputs=dataset2_inputs
1597
+ # # # )
1598
+
1599
+ # # # compute_button.click(
1600
+ # # # fn=perform_deduplication,
1601
+ # # # inputs=[
1602
+ # # # deduplication_type,
1603
+ # # # dataset1_name,
1604
+ # # # dataset1_split,
1605
+ # # # dataset1_text_column,
1606
+ # # # dataset2_name,
1607
+ # # # dataset2_split,
1608
+ # # # dataset2_text_column,
1609
+ # # # threshold
1610
+ # # # ],
1611
+ # # # outputs=output
1612
+ # # # )
1613
 
1614
+ # # # demo.launch()
1615
+
1616
+
1617
 
1618
 
1619
  # # # import gradio as gr
 
1654
  # # # )
1655
 
1656
  # # # # Process duplicates
1657
+ # # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=len(embedding_matrix))):
1658
  # # # if i not in deduplicated_indices:
1659
  # # # continue
1660
 
 
1683
  # # # show_progressbar=True # Allow internal progress bar
1684
  # # # )
1685
 
1686
+ # # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=len(embedding_matrix_2))):
 
1687
  # # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
1688
 
1689
  # # # if similar_indices:
 
1709
  # # # ):
1710
  # # # # Monkey-patch tqdm
1711
  # # # original_tqdm = tqdm.tqdm
1712
+ # # # original_reach_tqdm = Reach.__dict__['tqdm'] if 'tqdm' in Reach.__dict__ else None
1713
  # # # tqdm.tqdm = progress.tqdm
1714
  # # # sys.modules['tqdm'].tqdm = progress.tqdm
1715
  # # # sys.modules['tqdm.auto'].tqdm = progress.tqdm
1716
+ # # # Reach.tqdm = progress.tqdm # Monkey-patch reach's tqdm
1717
 
1718
  # # # try:
1719
  # # # # Convert threshold to float
 
1780
  # # # embedding_matrix2 = model.encode(texts2, show_progressbar=True) # Enable internal progress bar
1781
 
1782
  # # # # Deduplicate across datasets
1783
+ # # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
1784
+ # # # embedding_matrix1, embedding_matrix2, threshold, progress=progress)
1785
 
1786
  # # # num_duplicates = len(duplicate_indices_in_ds2)
1787
  # # # num_total_ds2 = len(texts2)
 
1812
  # # # sys.modules['tqdm'].tqdm = original_tqdm
1813
  # # # sys.modules['tqdm.auto'].tqdm = original_tqdm
1814
 
1815
+ # # # # Restore reach's original tqdm
1816
+ # # # if original_reach_tqdm is not None:
1817
+ # # # Reach.tqdm = original_reach_tqdm
1818
+ # # # else:
1819
+ # # # del Reach.tqdm # If it wasn't originally in Reach's __dict__
1820
+
1821
  # # # with gr.Blocks() as demo:
1822
  # # # gr.Markdown("# Semantic Deduplication")
1823
 
 
1880
  # # # )
1881
 
1882
  # # # demo.launch()
1883
+
1884
+
1885
+ # # # # import gradio as gr
1886
+ # # # # from datasets import load_dataset
1887
+ # # # # import numpy as np
1888
+ # # # # from model2vec import StaticModel
1889
+ # # # # from reach import Reach
1890
+ # # # # from difflib import ndiff
1891
+ # # # # import sys
1892
+ # # # # import tqdm
1893
+
1894
+ # # # # # Load the model at startup
1895
+ # # # # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
1896
+
1897
+ # # # # # Load the default datasets at startup
1898
+ # # # # default_dataset1_name = "ag_news"
1899
+ # # # # default_dataset1_split = "train"
1900
+ # # # # default_dataset2_name = "ag_news"
1901
+ # # # # default_dataset2_split = "test"
1902
+
1903
+ # # # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
1904
+ # # # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
1905
+
1906
+ # # # # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
1907
+ # # # # """
1908
+ # # # # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
1909
+ # # # # """
1910
+ # # # # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
1911
+
1912
+ # # # # deduplicated_indices = set(range(len(embedding_matrix)))
1913
+ # # # # duplicate_to_original_mapping = {}
1914
+
1915
+ # # # # results = reach.nearest_neighbor_threshold(
1916
+ # # # # embedding_matrix,
1917
+ # # # # threshold=threshold,
1918
+ # # # # batch_size=batch_size,
1919
+ # # # # show_progressbar=True # Allow internal progress bar
1920
+ # # # # )
1921
+
1922
+ # # # # # Process duplicates
1923
+ # # # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates")):
1924
+ # # # # if i not in deduplicated_indices:
1925
+ # # # # continue
1926
+
1927
+ # # # # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
1928
+
1929
+ # # # # for sim_idx in similar_indices:
1930
+ # # # # if sim_idx in deduplicated_indices:
1931
+ # # # # deduplicated_indices.remove(sim_idx)
1932
+ # # # # duplicate_to_original_mapping[sim_idx] = i
1933
+
1934
+ # # # # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
1935
+
1936
+ # # # # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
1937
+ # # # # """
1938
+ # # # # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
1939
+ # # # # """
1940
+ # # # # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
1941
+
1942
+ # # # # duplicate_indices_in_test = []
1943
+ # # # # duplicate_to_original_mapping = {}
1944
+
1945
+ # # # # results = reach.nearest_neighbor_threshold(
1946
+ # # # # embedding_matrix_2,
1947
+ # # # # threshold=threshold,
1948
+ # # # # batch_size=batch_size,
1949
+ # # # # show_progressbar=True # Allow internal progress bar
1950
+ # # # # )
1951
+
1952
+ # # # # # Process duplicates
1953
+ # # # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets")):
1954
+ # # # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
1955
+
1956
+ # # # # if similar_indices:
1957
+ # # # # duplicate_indices_in_test.append(i)
1958
+ # # # # duplicate_to_original_mapping[i] = similar_indices[0]
1959
+
1960
+ # # # # return duplicate_indices_in_test, duplicate_to_original_mapping
1961
+
1962
+ # # # # def display_word_differences(x: str, y: str) -> str:
1963
+ # # # # diff = ndiff(x.split(), y.split())
1964
+ # # # # return " ".join([word for word in diff if word.startswith(('+', '-'))])
1965
+
1966
+ # # # # def perform_deduplication(
1967
+ # # # # deduplication_type,
1968
+ # # # # dataset1_name,
1969
+ # # # # dataset1_split,
1970
+ # # # # dataset1_text_column,
1971
+ # # # # dataset2_name="",
1972
+ # # # # dataset2_split="",
1973
+ # # # # dataset2_text_column="",
1974
+ # # # # threshold=0.8,
1975
+ # # # # progress=gr.Progress(track_tqdm=True)
1976
+ # # # # ):
1977
+ # # # # # Monkey-patch tqdm
1978
+ # # # # original_tqdm = tqdm.tqdm
1979
+ # # # # tqdm.tqdm = progress.tqdm
1980
+ # # # # sys.modules['tqdm'].tqdm = progress.tqdm
1981
+ # # # # sys.modules['tqdm.auto'].tqdm = progress.tqdm
1982
+
1983
+ # # # # try:
1984
+ # # # # # Convert threshold to float
1985
+ # # # # threshold = float(threshold)
1986
+
1987
+ # # # # if deduplication_type == "Single dataset":
1988
+ # # # # # Check if the dataset is the default one
1989
+ # # # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1990
+ # # # # ds = ds_default1
1991
+ # # # # else:
1992
+ # # # # ds = load_dataset(dataset1_name, split=dataset1_split)
1993
+
1994
+ # # # # # Extract texts
1995
+ # # # # texts = [example[dataset1_text_column] for example in ds]
1996
+
1997
+ # # # # # Compute embeddings
1998
+ # # # # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
1999
+
2000
+ # # # # # Deduplicate
2001
+ # # # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(embedding_matrix, threshold, progress=progress)
2002
+
2003
+ # # # # # Prepare the results
2004
+ # # # # num_duplicates = len(duplicate_to_original_mapping)
2005
+ # # # # num_total = len(texts)
2006
+ # # # # num_deduplicated = len(deduplicated_indices)
2007
+
2008
+ # # # # result_text = f"**Total documents:** {num_total}\n"
2009
+ # # # # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
2010
+ # # # # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
2011
+
2012
+ # # # # # Show deduplicated examples
2013
+ # # # # result_text += "**Examples of duplicates found:**\n\n"
2014
+ # # # # num_examples = min(5, num_duplicates)
2015
+ # # # # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
2016
+ # # # # original_text = texts[original_idx]
2017
+ # # # # duplicate_text = texts[duplicate_idx]
2018
+ # # # # differences = display_word_differences(original_text, duplicate_text)
2019
+ # # # # result_text += f"**Original text:**\n{original_text}\n\n"
2020
+ # # # # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
2021
+ # # # # result_text += f"**Differences:**\n{differences}\n"
2022
+ # # # # result_text += "-" * 50 + "\n\n"
2023
+
2024
+ # # # # return result_text
2025
+
2026
+ # # # # elif deduplication_type == "Cross-dataset":
2027
+ # # # # # Dataset 1
2028
+ # # # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
2029
+ # # # # ds1 = ds_default1
2030
+ # # # # else:
2031
+ # # # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
2032
+
2033
+ # # # # # Dataset 2
2034
+ # # # # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
2035
+ # # # # ds2 = ds_default2
2036
+ # # # # else:
2037
+ # # # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
2038
+
2039
+ # # # # # Extract texts
2040
+ # # # # texts1 = [example[dataset1_text_column] for example in ds1]
2041
+ # # # # texts2 = [example[dataset2_text_column] for example in ds2]
2042
+
2043
+ # # # # # Compute embeddings
2044
+ # # # # embedding_matrix1 = model.encode(texts1, show_progressbar=True) # Enable internal progress bar
2045
+ # # # # embedding_matrix2 = model.encode(texts2, show_progressbar=True) # Enable internal progress bar
2046
+
2047
+ # # # # # Deduplicate across datasets
2048
+ # # # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(embedding_matrix1, embedding_matrix2, threshold, progress=progress)
2049
+
2050
+ # # # # num_duplicates = len(duplicate_indices_in_ds2)
2051
+ # # # # num_total_ds2 = len(texts2)
2052
+ # # # # num_unique_ds2 = num_total_ds2 - num_duplicates
2053
+
2054
+ # # # # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
2055
+ # # # # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
2056
+ # # # # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
2057
+
2058
+ # # # # # Show deduplicated examples
2059
+ # # # # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
2060
+ # # # # num_examples = min(5, num_duplicates)
2061
+ # # # # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
2062
+ # # # # original_idx = duplicate_to_original_mapping[duplicate_idx]
2063
+ # # # # original_text = texts1[original_idx]
2064
+ # # # # duplicate_text = texts2[duplicate_idx]
2065
+ # # # # differences = display_word_differences(original_text, duplicate_text)
2066
+ # # # # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
2067
+ # # # # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
2068
+ # # # # result_text += f"**Differences:**\n{differences}\n"
2069
+ # # # # result_text += "-" * 50 + "\n\n"
2070
+
2071
+ # # # # return result_text
2072
+
2073
+ # # # # finally:
2074
+ # # # # # Restore original tqdm
2075
+ # # # # tqdm.tqdm = original_tqdm
2076
+ # # # # sys.modules['tqdm'].tqdm = original_tqdm
2077
+ # # # # sys.modules['tqdm.auto'].tqdm = original_tqdm
2078
+
2079
+ # # # # with gr.Blocks() as demo:
2080
+ # # # # gr.Markdown("# Semantic Deduplication")
2081
+
2082
+ # # # # deduplication_type = gr.Radio(
2083
+ # # # # choices=["Single dataset", "Cross-dataset"],
2084
+ # # # # label="Deduplication Type",
2085
+ # # # # value="Single dataset"
2086
+ # # # # )
2087
+
2088
+ # # # # with gr.Row():
2089
+ # # # # dataset1_name = gr.Textbox(value="ag_news", label="Dataset 1 Name")
2090
+ # # # # dataset1_split = gr.Textbox(value="train", label="Dataset 1 Split")
2091
+ # # # # dataset1_text_column = gr.Textbox(value="text", label="Text Column Name")
2092
+
2093
+ # # # # dataset2_inputs = gr.Column(visible=False)
2094
+ # # # # with dataset2_inputs:
2095
+ # # # # gr.Markdown("### Dataset 2")
2096
+ # # # # with gr.Row():
2097
+ # # # # dataset2_name = gr.Textbox(value="ag_news", label="Dataset 2 Name")
2098
+ # # # # dataset2_split = gr.Textbox(value="test", label="Dataset 2 Split")
2099
+ # # # # dataset2_text_column = gr.Textbox(value="text", label="Text Column Name")
2100
+
2101
+ # # # # threshold = gr.Slider(
2102
+ # # # # minimum=0.0,
2103
+ # # # # maximum=1.0,
2104
+ # # # # value=0.8,
2105
+ # # # # label="Similarity Threshold"
2106
+ # # # # )
2107
+
2108
+ # # # # compute_button = gr.Button("Compute")
2109
+
2110
+ # # # # output = gr.Markdown()
2111
+
2112
+ # # # # # Function to update the visibility of dataset2_inputs
2113
+ # # # # def update_visibility(deduplication_type_value):
2114
+ # # # # if deduplication_type_value == "Cross-dataset":
2115
+ # # # # return gr.update(visible=True)
2116
+ # # # # else:
2117
+ # # # # return gr.update(visible=False)
2118
+
2119
+ # # # # deduplication_type.change(
2120
+ # # # # update_visibility,
2121
+ # # # # inputs=deduplication_type,
2122
+ # # # # outputs=dataset2_inputs
2123
+ # # # # )
2124
+
2125
+ # # # # compute_button.click(
2126
+ # # # # fn=perform_deduplication,
2127
+ # # # # inputs=[
2128
+ # # # # deduplication_type,
2129
+ # # # # dataset1_name,
2130
+ # # # # dataset1_split,
2131
+ # # # # dataset1_text_column,
2132
+ # # # # dataset2_name,
2133
+ # # # # dataset2_split,
2134
+ # # # # dataset2_text_column,
2135
+ # # # # threshold
2136
+ # # # # ],
2137
+ # # # # outputs=output
2138
+ # # # # )
2139
+
2140
+ # # # # demo.launch()