Pringled committed on
Commit b4f232c · 1 Parent(s): ff4eb41
Files changed (1)
  1. app.py +33 -1837
app.py CHANGED
@@ -1,5 +1,3 @@
1
- # Try last 2 approaches, app was not rebuilding properly
2
-
3
  import gradio as gr
4
  from datasets import load_dataset
5
  import numpy as np
@@ -28,15 +26,6 @@ def batch_iterable(iterable, batch_size):
28
  for i in range(0, len(iterable), batch_size):
29
  yield iterable[i:i + batch_size]
30
 
31
- def compute_embeddings_with_progress(texts, batch_size, progress, desc="Computing embeddings"):
32
- embeddings = []
33
- total_batches = (len(texts) + batch_size - 1) // batch_size
34
- for batch_texts in progress.tqdm(batch_iterable(texts, batch_size), desc=desc, total=total_batches):
35
- batch_embeddings = model.encode(batch_texts, show_progressbar=False)
36
- embeddings.append(batch_embeddings)
37
- embedding_matrix = np.concatenate(embeddings, axis=0)
38
- return embedding_matrix
39
-
40
  def display_word_differences(x: str, y: str) -> str:
41
  diff = ndiff(x.split(), y.split())
42
  return " ".join([word for word in diff if word.startswith(('+', '-'))])
@@ -76,7 +65,17 @@ def perform_deduplication(
76
  # Compute embeddings
77
  status = "Computing embeddings for Dataset 1..."
78
  yield status, ""
79
- embedding_matrix = compute_embeddings_with_progress(texts, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
80
 
81
  # Deduplicate
82
  status = "Deduplicating embeddings..."
@@ -143,12 +142,31 @@ def perform_deduplication(
143
  # Compute embeddings for Dataset 1
144
  status = "Computing embeddings for Dataset 1..."
145
  yield status, ""
146
- embedding_matrix1 = compute_embeddings_with_progress(texts1, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
147
 
148
  # Compute embeddings for Dataset 2
149
  status = "Computing embeddings for Dataset 2..."
150
  yield status, ""
151
- embedding_matrix2 = compute_embeddings_with_progress(texts2, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 2")
152
 
153
  # Deduplicate across datasets
154
  status = "Deduplicating embeddings across datasets..."
@@ -284,7 +302,7 @@ with gr.Blocks() as demo:
284
  label="Similarity Threshold"
285
  )
286
 
287
- compute_button = gr.Button("Run")
288
 
289
  status_output = gr.Markdown()
290
  result_output = gr.Markdown()
@@ -318,1825 +336,3 @@ with gr.Blocks() as demo:
318
  )
319
 
320
  demo.launch()
321
-
322
-
323
- # import gradio as gr
324
- # from datasets import load_dataset
325
- # import numpy as np
326
- # from model2vec import StaticModel
327
- # from reach import Reach
328
- # from difflib import ndiff
329
- # import tqdm
330
-
331
- # # Load the model at startup
332
- # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
333
-
334
- # # Update default dataset to 'sst2' and set default threshold to 0.9
335
- # default_dataset1_name = "sst2"
336
- # default_dataset1_split = "train"
337
- # default_dataset2_name = "sst2"
338
- # default_dataset2_split = "validation"
339
- # default_text_column = "sentence"
340
- # default_threshold = 0.9
341
-
342
- # # Load the default datasets at startup
343
- # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
344
- # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
345
-
346
- # def batch_iterable(iterable, batch_size):
347
- # """Helper function to create batches from an iterable."""
348
- # for i in range(0, len(iterable), batch_size):
349
- # yield iterable[i:i + batch_size]
350
-
351
- # def display_word_differences(x: str, y: str) -> str:
352
- # diff = ndiff(x.split(), y.split())
353
- # return " ".join([word for word in diff if word.startswith(('+', '-'))])
354
-
355
- # def perform_deduplication(
356
- # deduplication_type,
357
- # dataset1_name,
358
- # dataset1_split,
359
- # dataset1_text_column,
360
- # dataset2_name="",
361
- # dataset2_split="",
362
- # dataset2_text_column="",
363
- # threshold=default_threshold,
364
- # progress=gr.Progress(track_tqdm=True)
365
- # ):
366
- # try:
367
- # # Convert threshold to float
368
- # threshold = float(threshold)
369
-
370
- # # Initialize status message
371
- # status = ""
372
-
373
- # if deduplication_type == "Single dataset":
374
- # # Load Dataset 1
375
- # status = "Loading Dataset 1..."
376
- # yield status, ""
377
- # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
378
- # ds = ds_default1
379
- # else:
380
- # ds = load_dataset(dataset1_name, split=dataset1_split)
381
-
382
- # # Extract texts
383
- # status = "Extracting texts from Dataset 1..."
384
- # yield status, ""
385
- # texts = [example[dataset1_text_column] for example in ds]
386
-
387
- # # Compute embeddings
388
- # status = "Computing embeddings for Dataset 1..."
389
- # yield status, ""
390
- # embeddings = []
391
- # batch_size = 64
392
- # total_batches = (len(texts) + batch_size - 1) // batch_size
393
- # for batch_texts in progress.tqdm(batch_iterable(texts, batch_size), desc="Computing embeddings", total=total_batches):
394
- # batch_embeddings = model.encode(batch_texts, show_progressbar=False)
395
- # embeddings.append(batch_embeddings)
396
- # embedding_matrix = np.concatenate(embeddings, axis=0)
397
-
398
- # # Deduplicate
399
- # status = "Deduplicating embeddings..."
400
- # yield status, ""
401
- # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
402
- # embedding_matrix, threshold, progress=progress
403
- # )
404
-
405
- # # Prepare the results
406
- # num_duplicates = len(duplicate_to_original_mapping)
407
- # num_total = len(texts)
408
- # num_deduplicated = len(deduplicated_indices)
409
-
410
- # result_text = f"**Total documents:** {num_total}\n"
411
- # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
412
- # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
413
-
414
- # # Show deduplicated examples
415
- # if num_duplicates > 0:
416
- # result_text += "**Examples of duplicates found:**\n\n"
417
- # num_examples = min(5, num_duplicates)
418
- # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
419
- # original_text = texts[original_idx]
420
- # duplicate_text = texts[duplicate_idx]
421
- # differences = display_word_differences(original_text, duplicate_text)
422
- # result_text += f"**Original text:**\n{original_text}\n\n"
423
- # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
424
- # result_text += f"**Differences:**\n{differences}\n"
425
- # result_text += "-" * 50 + "\n\n"
426
- # else:
427
- # result_text += "No duplicates found."
428
-
429
- # # Final status
430
- # status = "Deduplication completed."
431
- # yield status, result_text
432
-
433
- # elif deduplication_type == "Cross-dataset":
434
- # # Load Dataset 1
435
- # status = "Loading Dataset 1..."
436
- # yield status, ""
437
- # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
438
- # ds1 = ds_default1
439
- # else:
440
- # ds1 = load_dataset(dataset1_name, split=dataset1_split)
441
-
442
- # # Load Dataset 2
443
- # status = "Loading Dataset 2..."
444
- # yield status, ""
445
- # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
446
- # ds2 = ds_default2
447
- # else:
448
- # ds2 = load_dataset(dataset2_name, split=dataset2_split)
449
-
450
- # # Extract texts from Dataset 1
451
- # status = "Extracting texts from Dataset 1..."
452
- # yield status, ""
453
- # texts1 = [example[dataset1_text_column] for example in ds1]
454
-
455
- # # Extract texts from Dataset 2
456
- # status = "Extracting texts from Dataset 2..."
457
- # yield status, ""
458
- # texts2 = [example[dataset2_text_column] for example in ds2]
459
-
460
- # # Compute embeddings for Dataset 1
461
- # status = "Computing embeddings for Dataset 1..."
462
- # yield status, ""
463
- # embeddings1 = []
464
- # batch_size = 64
465
- # total_batches1 = (len(texts1) + batch_size - 1) // batch_size
466
- # for batch_texts in progress.tqdm(batch_iterable(texts1, batch_size), desc="Computing embeddings for Dataset 1", total=total_batches1):
467
- # batch_embeddings = model.encode(batch_texts, show_progressbar=False)
468
- # embeddings1.append(batch_embeddings)
469
- # embedding_matrix1 = np.concatenate(embeddings1, axis=0)
470
-
471
- # # Compute embeddings for Dataset 2
472
- # status = "Computing embeddings for Dataset 2..."
473
- # yield status, ""
474
- # embeddings2 = []
475
- # total_batches2 = (len(texts2) + batch_size - 1) // batch_size
476
- # for batch_texts in progress.tqdm(batch_iterable(texts2, batch_size), desc="Computing embeddings for Dataset 2", total=total_batches2):
477
- # batch_embeddings = model.encode(batch_texts, show_progressbar=False)
478
- # embeddings2.append(batch_embeddings)
479
- # embedding_matrix2 = np.concatenate(embeddings2, axis=0)
480
-
481
- # # Deduplicate across datasets
482
- # status = "Deduplicating embeddings across datasets..."
483
- # yield status, ""
484
- # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
485
- # embedding_matrix1, embedding_matrix2, threshold, progress=progress
486
- # )
487
-
488
- # num_duplicates = len(duplicate_indices_in_ds2)
489
- # num_total_ds2 = len(texts2)
490
- # num_unique_ds2 = num_total_ds2 - num_duplicates
491
-
492
- # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
493
- # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
494
- # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
495
-
496
- # # Show deduplicated examples
497
- # if num_duplicates > 0:
498
- # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
499
- # num_examples = min(5, num_duplicates)
500
- # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
501
- # original_idx = duplicate_to_original_mapping[duplicate_idx]
502
- # original_text = texts1[original_idx]
503
- # duplicate_text = texts2[duplicate_idx]
504
- # differences = display_word_differences(original_text, duplicate_text)
505
- # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
506
- # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
507
- # result_text += f"**Differences:**\n{differences}\n"
508
- # result_text += "-" * 50 + "\n\n"
509
- # else:
510
- # result_text += "No duplicates found."
511
-
512
- # # Final status
513
- # status = "Deduplication completed."
514
- # yield status, result_text
515
-
516
- # except Exception as e:
517
- # yield f"An error occurred: {e}", ""
518
- # raise e
519
-
520
- # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
521
- # """
522
- # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
523
- # """
524
- # # Building the index
525
- # progress(0, desc="Building search index...")
526
- # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
527
-
528
- # deduplicated_indices = set(range(len(embedding_matrix)))
529
- # duplicate_to_original_mapping = {}
530
-
531
- # # Finding nearest neighbors
532
- # progress(0, desc="Finding nearest neighbors...")
533
- # results = reach.nearest_neighbor_threshold(
534
- # embedding_matrix,
535
- # threshold=threshold,
536
- # batch_size=batch_size,
537
- # show_progressbar=False # Disable internal progress bar
538
- # )
539
-
540
- # # Processing duplicates with a progress bar
541
- # total_items = len(embedding_matrix)
542
- # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
543
- # if i not in deduplicated_indices:
544
- # continue
545
-
546
- # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
547
-
548
- # for sim_idx in similar_indices:
549
- # if sim_idx in deduplicated_indices:
550
- # deduplicated_indices.remove(sim_idx)
551
- # duplicate_to_original_mapping[sim_idx] = i
552
-
553
- # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
554
-
555
- # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
556
- # """
557
- # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
558
- # """
559
- # # Building the index from Dataset 1
560
- # progress(0, desc="Building search index from Dataset 1...")
561
- # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
562
-
563
- # duplicate_indices_in_test = []
564
- # duplicate_to_original_mapping = {}
565
-
566
- # # Finding nearest neighbors between datasets
567
- # progress(0, desc="Finding nearest neighbors between datasets...")
568
- # results = reach.nearest_neighbor_threshold(
569
- # embedding_matrix_2,
570
- # threshold=threshold,
571
- # batch_size=batch_size,
572
- # show_progressbar=False # Disable internal progress bar
573
- # )
574
-
575
- # total_items = len(embedding_matrix_2)
576
- # # Processing duplicates with a progress bar
577
- # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
578
- # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
579
-
580
- # if similar_indices:
581
- # duplicate_indices_in_test.append(i)
582
- # duplicate_to_original_mapping[i] = similar_indices[0]
583
-
584
- # return duplicate_indices_in_test, duplicate_to_original_mapping
585
-
586
- # with gr.Blocks() as demo:
587
- # gr.Markdown("# Semantic Deduplication")
588
-
589
- # deduplication_type = gr.Radio(
590
- # choices=["Single dataset", "Cross-dataset"],
591
- # label="Deduplication Type",
592
- # value="Single dataset"
593
- # )
594
-
595
- # with gr.Row():
596
- # dataset1_name = gr.Textbox(value=default_dataset1_name, label="Dataset 1 Name")
597
- # dataset1_split = gr.Textbox(value=default_dataset1_split, label="Dataset 1 Split")
598
- # dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
599
-
600
- # dataset2_inputs = gr.Column(visible=False)
601
- # with dataset2_inputs:
602
- # gr.Markdown("### Dataset 2")
603
- # with gr.Row():
604
- # dataset2_name = gr.Textbox(value=default_dataset2_name, label="Dataset 2 Name")
605
- # dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
606
- # dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
607
-
608
- # threshold = gr.Slider(
609
- # minimum=0.0,
610
- # maximum=1.0,
611
- # value=default_threshold,
612
- # label="Similarity Threshold"
613
- # )
614
-
615
- # compute_button = gr.Button("Compute")
616
-
617
- # status_output = gr.Markdown()
618
- # result_output = gr.Markdown()
619
-
620
- # # Function to update the visibility of dataset2_inputs
621
- # def update_visibility(deduplication_type_value):
622
- # if deduplication_type_value == "Cross-dataset":
623
- # return gr.update(visible=True)
624
- # else:
625
- # return gr.update(visible=False)
626
-
627
- # deduplication_type.change(
628
- # update_visibility,
629
- # inputs=deduplication_type,
630
- # outputs=dataset2_inputs
631
- # )
632
-
633
- # compute_button.click(
634
- # fn=perform_deduplication,
635
- # inputs=[
636
- # deduplication_type,
637
- # dataset1_name,
638
- # dataset1_split,
639
- # dataset1_text_column,
640
- # dataset2_name,
641
- # dataset2_split,
642
- # dataset2_text_column,
643
- # threshold
644
- # ],
645
- # outputs=[status_output, result_output]
646
- # )
647
-
648
- # demo.launch()
649
-
650
-
651
-
652
- # # import gradio as gr
653
- # # from datasets import load_dataset
654
- # # import numpy as np
655
- # # from model2vec import StaticModel
656
- # # from reach import Reach
657
- # # from difflib import ndiff
658
- # # import tqdm
659
-
660
- # # # Load the model at startup
661
- # # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
662
-
663
- # # # Update default dataset to 'sst2' and set default threshold to 0.9
664
- # # default_dataset1_name = "sst2"
665
- # # default_dataset1_split = "train"
666
- # # default_dataset2_name = "sst2"
667
- # # default_dataset2_split = "validation"
668
- # # default_text_column = "sentence"
669
- # # default_threshold = 0.9
670
-
671
- # # # Load the default datasets at startup
672
- # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
673
- # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
674
-
675
- # # def batch_iterable(iterable, batch_size):
676
- # # """Helper function to create batches from an iterable."""
677
- # # for i in range(0, len(iterable), batch_size):
678
- # # yield iterable[i:i + batch_size]
679
-
680
- # # def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
681
- # # embeddings = []
682
- # # for batch in progress.tqdm(batch_iterable(texts, batch_size), total=(len(texts) + batch_size - 1) // batch_size, desc=desc):
683
- # # batch_embeddings = model.encode(batch, show_progressbar=False)
684
- # # embeddings.append(batch_embeddings)
685
- # # return np.concatenate(embeddings, axis=0)
686
-
687
- # # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
688
- # # """
689
- # # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
690
- # # """
691
- # # # Building the index
692
- # # progress(0, desc="Building search index...")
693
- # # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
694
-
695
- # # deduplicated_indices = set(range(len(embedding_matrix)))
696
- # # duplicate_to_original_mapping = {}
697
-
698
- # # # Finding nearest neighbors
699
- # # progress(0, desc="Finding nearest neighbors...")
700
- # # results = reach.nearest_neighbor_threshold(
701
- # # embedding_matrix,
702
- # # threshold=threshold,
703
- # # batch_size=batch_size,
704
- # # show_progressbar=False # Disable internal progress bar
705
- # # )
706
-
707
- # # # Processing duplicates with a progress bar
708
- # # total_items = len(embedding_matrix)
709
- # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
710
- # # if i not in deduplicated_indices:
711
- # # continue
712
-
713
- # # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
714
-
715
- # # for sim_idx in similar_indices:
716
- # # if sim_idx in deduplicated_indices:
717
- # # deduplicated_indices.remove(sim_idx)
718
- # # duplicate_to_original_mapping[sim_idx] = i
719
-
720
- # # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
721
-
722
- # # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
723
- # # """
724
- # # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
725
- # # """
726
- # # # Building the index from Dataset 1
727
- # # progress(0, desc="Building search index from Dataset 1...")
728
- # # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
729
-
730
- # # duplicate_indices_in_test = []
731
- # # duplicate_to_original_mapping = {}
732
-
733
- # # # Finding nearest neighbors between datasets
734
- # # progress(0, desc="Finding nearest neighbors between datasets...")
735
- # # results = reach.nearest_neighbor_threshold(
736
- # # embedding_matrix_2,
737
- # # threshold=threshold,
738
- # # batch_size=batch_size,
739
- # # show_progressbar=False # Disable internal progress bar
740
- # # )
741
-
742
- # # total_items = len(embedding_matrix_2)
743
- # # # Processing duplicates with a progress bar
744
- # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
745
- # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
746
-
747
- # # if similar_indices:
748
- # # duplicate_indices_in_test.append(i)
749
- # # duplicate_to_original_mapping[i] = similar_indices[0]
750
-
751
- # # return duplicate_indices_in_test, duplicate_to_original_mapping
752
-
753
- # # def display_word_differences(x: str, y: str) -> str:
754
- # # diff = ndiff(x.split(), y.split())
755
- # # return " ".join([word for word in diff if word.startswith(('+', '-'))])
756
-
757
- # # def perform_deduplication(
758
- # # deduplication_type,
759
- # # dataset1_name,
760
- # # dataset1_split,
761
- # # dataset1_text_column,
762
- # # dataset2_name="",
763
- # # dataset2_split="",
764
- # # dataset2_text_column="",
765
- # # threshold=default_threshold,
766
- # # progress=gr.Progress(track_tqdm=True)
767
- # # ):
768
- # # try:
769
- # # # Convert threshold to float
770
- # # threshold = float(threshold)
771
-
772
- # # # Initialize status message
773
- # # status = ""
774
-
775
- # # if deduplication_type == "Single dataset":
776
- # # # Load Dataset 1
777
- # # status = "Loading Dataset 1..."
778
- # # yield status, ""
779
- # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
780
- # # ds = ds_default1
781
- # # else:
782
- # # ds = load_dataset(dataset1_name, split=dataset1_split)
783
-
784
- # # # Extract texts
785
- # # status = "Extracting texts from Dataset 1..."
786
- # # yield status, ""
787
- # # texts = [example[dataset1_text_column] for example in ds]
788
-
789
- # # # Compute embeddings
790
- # # status = "Computing embeddings for Dataset 1..."
791
- # # yield status, ""
792
- # # embedding_matrix = compute_embeddings(texts, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
793
-
794
- # # # Deduplicate
795
- # # status = "Deduplicating embeddings..."
796
- # # yield status, ""
797
- # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
798
- # # embedding_matrix, threshold, progress=progress
799
- # # )
800
-
801
- # # # Prepare the results
802
- # # num_duplicates = len(duplicate_to_original_mapping)
803
- # # num_total = len(texts)
804
- # # num_deduplicated = len(deduplicated_indices)
805
-
806
- # # result_text = f"**Total documents:** {num_total}\n"
807
- # # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
808
- # # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
809
-
810
- # # # Show deduplicated examples
811
- # # if num_duplicates > 0:
812
- # # result_text += "**Examples of duplicates found:**\n\n"
813
- # # num_examples = min(5, num_duplicates)
814
- # # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
815
- # # original_text = texts[original_idx]
816
- # # duplicate_text = texts[duplicate_idx]
817
- # # differences = display_word_differences(original_text, duplicate_text)
818
- # # result_text += f"**Original text:**\n{original_text}\n\n"
819
- # # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
820
- # # result_text += f"**Differences:**\n{differences}\n"
821
- # # result_text += "-" * 50 + "\n\n"
822
- # # else:
823
- # # result_text += "No duplicates found."
824
-
825
- # # # Final status
826
- # # status = "Deduplication completed."
827
- # # yield status, result_text
828
-
829
- # # elif deduplication_type == "Cross-dataset":
830
- # # # Load Dataset 1
831
- # # status = "Loading Dataset 1..."
832
- # # yield status, ""
833
- # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
834
- # # ds1 = ds_default1
835
- # # else:
836
- # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
837
-
838
- # # # Load Dataset 2
839
- # # status = "Loading Dataset 2..."
840
- # # yield status, ""
841
- # # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
842
- # # ds2 = ds_default2
843
- # # else:
844
- # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
845
-
846
- # # # Extract texts from Dataset 1
847
- # # status = "Extracting texts from Dataset 1..."
848
- # # yield status, ""
849
- # # texts1 = [example[dataset1_text_column] for example in ds1]
850
-
851
- # # # Extract texts from Dataset 2
852
- # # status = "Extracting texts from Dataset 2..."
853
- # # yield status, ""
854
- # # texts2 = [example[dataset2_text_column] for example in ds2]
855
-
856
- # # # Compute embeddings for Dataset 1
857
- # # status = "Computing embeddings for Dataset 1..."
858
- # # yield status, ""
859
- # # embedding_matrix1 = compute_embeddings(texts1, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 1")
860
-
861
- # # # Compute embeddings for Dataset 2
862
- # # status = "Computing embeddings for Dataset 2..."
863
- # # yield status, ""
864
- # # embedding_matrix2 = compute_embeddings(texts2, batch_size=64, progress=progress, desc="Computing embeddings for Dataset 2")
865
-
866
- # # # Deduplicate across datasets
867
- # # status = "Deduplicating embeddings across datasets..."
868
- # # yield status, ""
869
- # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
870
- # # embedding_matrix1, embedding_matrix2, threshold, progress=progress
871
- # # )
872
-
873
- # # num_duplicates = len(duplicate_indices_in_ds2)
874
- # # num_total_ds2 = len(texts2)
875
- # # num_unique_ds2 = num_total_ds2 - num_duplicates
876
-
877
- # # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n\n"
878
- # # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n\n"
879
- # # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
880
-
881
- # # # Show deduplicated examples
882
- # # if num_duplicates > 0:
883
- # # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
884
- # # num_examples = min(5, num_duplicates)
885
- # # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
886
- # # original_idx = duplicate_to_original_mapping[duplicate_idx]
887
- # # original_text = texts1[original_idx]
888
- # # duplicate_text = texts2[duplicate_idx]
889
- # # differences = display_word_differences(original_text, duplicate_text)
890
- # # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
891
- # # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
892
- # # result_text += f"**Differences:**\n{differences}\n"
893
- # # result_text += "-" * 50 + "\n\n"
894
- # # else:
895
- # # result_text += "No duplicates found."
896
-
897
- # # # Final status
898
- # # status = "Deduplication completed."
899
- # # yield status, result_text
900
-
901
- # # except Exception as e:
902
- # # yield f"An error occurred: {e}", ""
903
- # # raise e
904
-
905
- # # with gr.Blocks() as demo:
906
- # # gr.Markdown("# Semantic Deduplication")
907
-
908
- # # deduplication_type = gr.Radio(
909
- # # choices=["Single dataset", "Cross-dataset"],
910
- # # label="Deduplication Type",
911
- # # value="Single dataset"
912
- # # )
913
-
914
- # # with gr.Row():
915
- # # dataset1_name = gr.Textbox(value=default_dataset1_name, label="Dataset 1 Name")
916
- # # dataset1_split = gr.Textbox(value=default_dataset1_split, label="Dataset 1 Split")
917
- # # dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
918
-
919
- # # dataset2_inputs = gr.Column(visible=False)
920
- # # with dataset2_inputs:
921
- # # gr.Markdown("### Dataset 2")
922
- # # with gr.Row():
923
- # # dataset2_name = gr.Textbox(value=default_dataset2_name, label="Dataset 2 Name")
924
- # # dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
925
- # # dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
926
-
927
- # # threshold = gr.Slider(
928
- # # minimum=0.0,
929
- # # maximum=1.0,
930
- # # value=default_threshold,
931
- # # label="Similarity Threshold"
932
- # # )
933
-
934
- # # compute_button = gr.Button("Compute")
935
-
936
- # # status_output = gr.Markdown()
937
- # # result_output = gr.Markdown()
938
-
939
- # # # Function to update the visibility of dataset2_inputs
940
- # # def update_visibility(deduplication_type_value):
941
- # # if deduplication_type_value == "Cross-dataset":
942
- # # return gr.update(visible=True)
943
- # # else:
944
- # # return gr.update(visible=False)
945
-
946
- # # deduplication_type.change(
947
- # # update_visibility,
948
- # # inputs=deduplication_type,
949
- # # outputs=dataset2_inputs
950
- # # )
951
-
952
- # # compute_button.click(
953
- # # fn=perform_deduplication,
954
- # # inputs=[
955
- # # deduplication_type,
956
- # # dataset1_name,
957
- # # dataset1_split,
958
- # # dataset1_text_column,
959
- # # dataset2_name,
960
- # # dataset2_split,
961
- # # dataset2_text_column,
962
- # # threshold
963
- # # ],
964
- # # outputs=[status_output, result_output]
965
- # # )
966
-
967
- # # demo.launch()
968
-
969
-
970
-
971
-
972
-
973
-
974
-
975
-
976
-
977
-
978
-
979
-
980
-
981
-
982
-
983
-
984
-
985
-
986
-
987
-
988
-
989
-
990
- # # # import gradio as gr
991
- # # # from datasets import load_dataset
992
- # # # import numpy as np
993
- # # # from model2vec import StaticModel
994
- # # # from reach import Reach
995
- # # # from difflib import ndiff
996
- # # # import sys
997
- # # # import tqdm
998
-
999
- # # # # Load the model at startup
1000
- # # # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
1001
-
1002
- # # # # Update default dataset to 'sst2' and set default threshold to 0.9
1003
- # # # default_dataset1_name = "sst2"
1004
- # # # default_dataset1_split = "train"
1005
- # # # default_dataset2_name = "sst2"
1006
- # # # default_dataset2_split = "validation"
1007
- # # # default_text_column = "sentence"
1008
- # # # default_threshold = 0.9
1009
-
1010
- # # # # Load the default datasets at startup
1011
- # # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
1012
- # # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
1013
-
1014
- # # # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024) -> tuple[np.ndarray, dict[int, int]]:
1015
- # # # """
1016
- # # # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
1017
- # # # """
1018
- # # # # Building the index
1019
- # # # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
1020
-
1021
- # # # deduplicated_indices = set(range(len(embedding_matrix)))
1022
- # # # duplicate_to_original_mapping = {}
1023
-
1024
- # # # # Finding nearest neighbors
1025
- # # # results = reach.nearest_neighbor_threshold(
1026
- # # # embedding_matrix,
1027
- # # # threshold=threshold,
1028
- # # # batch_size=batch_size,
1029
- # # # show_progressbar=True # Allow internal progress bar
1030
- # # # )
1031
-
1032
- # # # # Processing duplicates
1033
- # # # for i, similar_items in enumerate(results):
1034
- # # # if i not in deduplicated_indices:
1035
- # # # continue
1036
-
1037
- # # # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
1038
-
1039
- # # # for sim_idx in similar_indices:
1040
- # # # if sim_idx in deduplicated_indices:
1041
- # # # deduplicated_indices.remove(sim_idx)
1042
- # # # duplicate_to_original_mapping[sim_idx] = i
1043
-
1044
- # # # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
1045
-
1046
- # # # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024) -> tuple[list[int], dict[int, int]]:
1047
- # # # """
1048
- # # # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
1049
- # # # """
1050
- # # # # Building the index from Dataset 1
1051
- # # # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
1052
-
1053
- # # # duplicate_indices_in_test = []
1054
- # # # duplicate_to_original_mapping = {}
1055
-
1056
- # # # # Finding nearest neighbors between datasets
1057
- # # # results = reach.nearest_neighbor_threshold(
1058
- # # # embedding_matrix_2,
1059
- # # # threshold=threshold,
1060
- # # # batch_size=batch_size,
1061
- # # # show_progressbar=True # Allow internal progress bar
1062
- # # # )
1063
-
1064
- # # # # Processing duplicates
1065
- # # # for i, similar_items in enumerate(results):
1066
- # # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
1067
-
1068
- # # # if similar_indices:
1069
- # # # duplicate_indices_in_test.append(i)
1070
- # # # duplicate_to_original_mapping[i] = similar_indices[0]
1071
-
1072
- # # # return duplicate_indices_in_test, duplicate_to_original_mapping
1073
-
1074
- # # # def display_word_differences(x: str, y: str) -> str:
1075
- # # # diff = ndiff(x.split(), y.split())
1076
- # # # return " ".join([word for word in diff if word.startswith(('+', '-'))])
1077
-
1078
- # # # def perform_deduplication(
1079
- # # # deduplication_type,
1080
- # # # dataset1_name,
1081
- # # # dataset1_split,
1082
- # # # dataset1_text_column,
1083
- # # # dataset2_name="",
1084
- # # # dataset2_split="",
1085
- # # # dataset2_text_column="",
1086
- # # # threshold=default_threshold,
1087
- # # # progress=gr.Progress(track_tqdm=True)
1088
- # # # ):
1089
- # # # # Deep Monkey-Patching of tqdm
1090
- # # # original_tqdm = tqdm.tqdm
1091
- # # # tqdm.tqdm = progress.tqdm
1092
- # # # for mod_name in list(sys.modules.keys()):
1093
- # # # if 'tqdm' in mod_name:
1094
- # # # sys.modules[mod_name].tqdm = progress.tqdm
1095
-
1096
- # # # try:
1097
- # # # # Convert threshold to float
1098
- # # # threshold = float(threshold)
1099
-
1100
- # # # # Initialize status message
1101
- # # # status = ""
1102
-
1103
- # # # if deduplication_type == "Single dataset":
1104
- # # # # Load Dataset 1
1105
- # # # status = "Loading Dataset 1..."
1106
- # # # yield status, ""
1107
- # # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1108
- # # # ds = ds_default1
1109
- # # # else:
1110
- # # # ds = load_dataset(dataset1_name, split=dataset1_split)
1111
-
1112
- # # # # Extract texts
1113
- # # # status = "Extracting texts from Dataset 1..."
1114
- # # # yield status, ""
1115
- # # # texts = [example[dataset1_text_column] for example in ds]
1116
-
1117
- # # # # Compute embeddings
1118
- # # # status = "Computing embeddings for Dataset 1..."
1119
- # # # yield status, ""
1120
- # # # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
1121
-
1122
- # # # # Deduplicate
1123
- # # # status = "Deduplicating embeddings..."
1124
- # # # yield status, ""
1125
- # # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
1126
- # # # embedding_matrix, threshold
1127
- # # # )
1128
-
1129
- # # # # Prepare the results
1130
- # # # num_duplicates = len(duplicate_to_original_mapping)
1131
- # # # num_total = len(texts)
1132
- # # # num_deduplicated = len(deduplicated_indices)
1133
-
1134
- # # # result_text = f"**Total documents:** {num_total}\n"
1135
- # # # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
1136
- # # # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
1137
-
1138
- # # # # Show deduplicated examples
1139
- # # # if num_duplicates > 0:
1140
- # # # result_text += "**Examples of duplicates found:**\n\n"
1141
- # # # num_examples = min(5, num_duplicates)
1142
- # # # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
1143
- # # # original_text = texts[original_idx]
1144
- # # # duplicate_text = texts[duplicate_idx]
1145
- # # # differences = display_word_differences(original_text, duplicate_text)
1146
- # # # result_text += f"**Original text:**\n{original_text}\n\n"
1147
- # # # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
1148
- # # # result_text += f"**Differences:**\n{differences}\n"
1149
- # # # result_text += "-" * 50 + "\n\n"
1150
- # # # else:
1151
- # # # result_text += "No duplicates found."
1152
-
1153
- # # # # Final status
1154
- # # # status = "Deduplication completed."
1155
- # # # yield status, result_text
1156
-
1157
- # # # elif deduplication_type == "Cross-dataset":
1158
- # # # # Load Dataset 1
1159
- # # # status = "Loading Dataset 1..."
1160
- # # # yield status, ""
1161
- # # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1162
- # # # ds1 = ds_default1
1163
- # # # else:
1164
- # # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
1165
-
1166
- # # # # Load Dataset 2
1167
- # # # status = "Loading Dataset 2..."
1168
- # # # yield status, ""
1169
- # # # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
1170
- # # # ds2 = ds_default2
1171
- # # # else:
1172
- # # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
1173
-
1174
- # # # # Extract texts from Dataset 1
1175
- # # # status = "Extracting texts from Dataset 1..."
1176
- # # # yield status, ""
1177
- # # # texts1 = [example[dataset1_text_column] for example in ds1]
1178
-
1179
- # # # # Extract texts from Dataset 2
1180
- # # # status = "Extracting texts from Dataset 2..."
1181
- # # # yield status, ""
1182
- # # # texts2 = [example[dataset2_text_column] for example in ds2]
1183
-
1184
- # # # # Compute embeddings for Dataset 1
1185
- # # # status = "Computing embeddings for Dataset 1..."
1186
- # # # yield status, ""
1187
- # # # embedding_matrix1 = model.encode(texts1, show_progressbar=True)
1188
-
1189
- # # # # Compute embeddings for Dataset 2
1190
- # # # status = "Computing embeddings for Dataset 2..."
1191
- # # # yield status, ""
1192
- # # # embedding_matrix2 = model.encode(texts2, show_progressbar=True)
1193
-
1194
- # # # # Deduplicate across datasets
1195
- # # # status = "Deduplicating embeddings across datasets..."
1196
- # # # yield status, ""
1197
- # # # duplicate_indices_in_ds2, duplicate_to_original_mapping = deduplicate_across_datasets(
1198
- # # # embedding_matrix1, embedding_matrix2, threshold
1199
- # # # )
1200
-
1201
- # # # num_duplicates = len(duplicate_indices_in_ds2)
1202
- # # # num_total_ds2 = len(texts2)
1203
- # # # num_unique_ds2 = num_total_ds2 - num_duplicates
1204
-
1205
- # # # result_text = f"**Total documents in {dataset2_name}/{dataset2_split}:** {num_total_ds2}\n"
1206
- # # # result_text += f"**Number of duplicates found in {dataset2_name}/{dataset2_split}:** {num_duplicates}\n"
1207
- # # # result_text += f"**Number of unique documents in {dataset2_name}/{dataset2_split} after deduplication:** {num_unique_ds2}\n\n"
1208
-
1209
- # # # # Show deduplicated examples
1210
- # # # if num_duplicates > 0:
1211
- # # # result_text += "**Examples of duplicates found in Dataset 2:**\n\n"
1212
- # # # num_examples = min(5, num_duplicates)
1213
- # # # for duplicate_idx in duplicate_indices_in_ds2[:num_examples]:
1214
- # # # original_idx = duplicate_to_original_mapping[duplicate_idx]
1215
- # # # original_text = texts1[original_idx]
1216
- # # # duplicate_text = texts2[duplicate_idx]
1217
- # # # differences = display_word_differences(original_text, duplicate_text)
1218
- # # # result_text += f"**Original text (Dataset 1):**\n{original_text}\n\n"
1219
- # # # result_text += f"**Duplicate text (Dataset 2):**\n{duplicate_text}\n\n"
1220
- # # # result_text += f"**Differences:**\n{differences}\n"
1221
- # # # result_text += "-" * 50 + "\n\n"
1222
- # # # else:
1223
- # # # result_text += "No duplicates found."
1224
-
1225
- # # # # Final status
1226
- # # # status = "Deduplication completed."
1227
- # # # yield status, result_text
1228
-
1229
- # # # finally:
1230
- # # # # Restore original tqdm
1231
- # # # tqdm.tqdm = original_tqdm
1232
- # # # for mod_name in list(sys.modules.keys()):
1233
- # # # if 'tqdm' in mod_name:
1234
- # # # sys.modules[mod_name].tqdm = original_tqdm
1235
-
1236
- # # # with gr.Blocks() as demo:
1237
- # # # gr.Markdown("# Semantic Deduplication")
1238
-
1239
- # # # deduplication_type = gr.Radio(
1240
- # # # choices=["Single dataset", "Cross-dataset"],
1241
- # # # label="Deduplication Type",
1242
- # # # value="Single dataset"
1243
- # # # )
1244
-
1245
- # # # with gr.Row():
1246
- # # # dataset1_name = gr.Textbox(value=default_dataset1_name, label="Dataset 1 Name")
1247
- # # # dataset1_split = gr.Textbox(value=default_dataset1_split, label="Dataset 1 Split")
1248
- # # # dataset1_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
1249
-
1250
- # # # dataset2_inputs = gr.Column(visible=False)
1251
- # # # with dataset2_inputs:
1252
- # # # gr.Markdown("### Dataset 2")
1253
- # # # with gr.Row():
1254
- # # # dataset2_name = gr.Textbox(value=default_dataset2_name, label="Dataset 2 Name")
1255
- # # # dataset2_split = gr.Textbox(value=default_dataset2_split, label="Dataset 2 Split")
1256
- # # # dataset2_text_column = gr.Textbox(value=default_text_column, label="Text Column Name")
1257
-
1258
- # # # threshold = gr.Slider(
1259
- # # # minimum=0.0,
1260
- # # # maximum=1.0,
1261
- # # # value=default_threshold,
1262
- # # # label="Similarity Threshold"
1263
- # # # )
1264
-
1265
- # # # compute_button = gr.Button("Compute")
1266
-
1267
- # # # status_output = gr.Markdown()
1268
- # # # result_output = gr.Markdown()
1269
-
1270
- # # # # Function to update the visibility of dataset2_inputs
1271
- # # # def update_visibility(deduplication_type_value):
1272
- # # # if deduplication_type_value == "Cross-dataset":
1273
- # # # return gr.update(visible=True)
1274
- # # # else:
1275
- # # # return gr.update(visible=False)
1276
-
1277
- # # # deduplication_type.change(
1278
- # # # update_visibility,
1279
- # # # inputs=deduplication_type,
1280
- # # # outputs=dataset2_inputs
1281
- # # # )
1282
-
1283
- # # # compute_button.click(
1284
- # # # fn=perform_deduplication,
1285
- # # # inputs=[
1286
- # # # deduplication_type,
1287
- # # # dataset1_name,
1288
- # # # dataset1_split,
1289
- # # # dataset1_text_column,
1290
- # # # dataset2_name,
1291
- # # # dataset2_split,
1292
- # # # dataset2_text_column,
1293
- # # # threshold
1294
- # # # ],
1295
- # # # outputs=[status_output, result_output]
1296
- # # # )
1297
-
1298
- # # # demo.launch()
1299
-
1300
-
1301
- # # # import gradio as gr
1302
- # # # from datasets import load_dataset
1303
- # # # import numpy as np
1304
- # # # from model2vec import StaticModel
1305
- # # # from reach import Reach
1306
- # # # from difflib import ndiff
1307
- # # # import sys
1308
- # # # import tqdm
1309
-
1310
- # # # # Load the model at startup
1311
- # # # model = StaticModel.from_pretrained("minishlab/M2V_base_output")
1312
-
1313
- # # # # Update default dataset to 'sst2' and set default threshold to 0.9
1314
- # # # default_dataset1_name = "sst2"
1315
- # # # default_dataset1_split = "train"
1316
- # # # default_dataset2_name = "sst2"
1317
- # # # default_dataset2_split = "validation"
1318
- # # # default_text_column = "sentence"
1319
- # # # default_threshold = 0.9
1320
-
1321
- # # # # Load the default datasets at startup
1322
- # # # ds_default1 = load_dataset(default_dataset1_name, split=default_dataset1_split)
1323
- # # # ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
1324
-
1325
- # # # def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[np.ndarray, dict[int, int]]:
1326
- # # # """
1327
- # # # Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
1328
- # # # """
1329
- # # # # Update progress to indicate building the index
1330
- # # # progress(0, desc="Building search index...")
1331
- # # # reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
1332
-
1333
- # # # deduplicated_indices = set(range(len(embedding_matrix)))
1334
- # # # duplicate_to_original_mapping = {}
1335
-
1336
- # # # # Finding nearest neighbors
1337
- # # # progress(0, desc="Finding nearest neighbors...")
1338
- # # # results = reach.nearest_neighbor_threshold(
1339
- # # # embedding_matrix,
1340
- # # # threshold=threshold,
1341
- # # # batch_size=batch_size,
1342
- # # # show_progressbar=True # Allow internal progress bar
1343
- # # # )
1344
-
1345
- # # # # Processing duplicates with a progress bar
1346
- # # # total_items = len(embedding_matrix)
1347
- # # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
1348
- # # # if i not in deduplicated_indices:
1349
- # # # continue
1350
-
1351
- # # # similar_indices = [int(item[0]) for item in similar_items if int(item[0]) != i]
1352
-
1353
- # # # for sim_idx in similar_indices:
1354
- # # # if sim_idx in deduplicated_indices:
1355
- # # # deduplicated_indices.remove(sim_idx)
1356
- # # # duplicate_to_original_mapping[sim_idx] = i
1357
-
1358
- # # # return np.array(list(deduplicated_indices)), duplicate_to_original_mapping
1359
-
1360
- # # # def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix_2: np.ndarray, threshold: float, batch_size: int = 1024, progress=None) -> tuple[list[int], dict[int, int]]:
1361
- # # # """
1362
- # # # Deduplicate embeddings across two datasets and return the indices of duplicates between them.
1363
- # # # """
1364
- # # # # Update progress to indicate building the index
1365
- # # # progress(0, desc="Building search index from Dataset 1...")
1366
- # # # reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
1367
-
1368
- # # # duplicate_indices_in_test = []
1369
- # # # duplicate_to_original_mapping = {}
1370
-
1371
- # # # # Finding nearest neighbors between datasets
1372
- # # # progress(0, desc="Finding nearest neighbors between datasets...")
1373
- # # # results = reach.nearest_neighbor_threshold(
1374
- # # # embedding_matrix_2,
1375
- # # # threshold=threshold,
1376
- # # # batch_size=batch_size,
1377
- # # # show_progressbar=True # Allow internal progress bar
1378
- # # # )
1379
-
1380
- # # # total_items = len(embedding_matrix_2)
1381
- # # # # Processing duplicates with a progress bar
1382
- # # # for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
1383
- # # # similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
1384
-
1385
- # # # if similar_indices:
1386
- # # # duplicate_indices_in_test.append(i)
1387
- # # # duplicate_to_original_mapping[i] = similar_indices[0]
1388
-
1389
- # # # return duplicate_indices_in_test, duplicate_to_original_mapping
1390
-
1391
- # # # def display_word_differences(x: str, y: str) -> str:
1392
- # # # diff = ndiff(x.split(), y.split())
1393
- # # # return " ".join([word for word in diff if word.startswith(('+', '-'))])
1394
-
1395
- # # # def perform_deduplication(
1396
- # # # deduplication_type,
1397
- # # # dataset1_name,
1398
- # # # dataset1_split,
1399
- # # # dataset1_text_column,
1400
- # # # dataset2_name="",
1401
- # # # dataset2_split="",
1402
- # # # dataset2_text_column="",
1403
- # # # threshold=default_threshold,
1404
- # # # progress=gr.Progress(track_tqdm=True)
1405
- # # # ):
1406
- # # # # Monkey-patch tqdm
1407
- # # # original_tqdm = tqdm.tqdm
1408
- # # # original_reach_tqdm = Reach.__dict__['tqdm'] if 'tqdm' in Reach.__dict__ else None
1409
- # # # tqdm.tqdm = progress.tqdm
1410
- # # # sys.modules['tqdm'].tqdm = progress.tqdm
1411
- # # # sys.modules['tqdm.auto'].tqdm = progress.tqdm
1412
- # # # Reach.tqdm = progress.tqdm # Monkey-patch reach's tqdm
1413
-
1414
- # # # try:
1415
- # # # # Convert threshold to float
1416
- # # # threshold = float(threshold)
1417
-
1418
- # # # if deduplication_type == "Single dataset":
1419
- # # # # Load Dataset 1
1420
- # # # progress(0, desc="Loading Dataset 1...")
1421
- # # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1422
- # # # ds = ds_default1
1423
- # # # else:
1424
- # # # ds = load_dataset(dataset1_name, split=dataset1_split)
1425
-
1426
- # # # # Extract texts
1427
- # # # progress(0, desc="Extracting texts from Dataset 1...")
1428
- # # # texts = [example[dataset1_text_column] for example in ds]
1429
-
1430
- # # # # Compute embeddings
1431
- # # # progress(0, desc="Computing embeddings for Dataset 1...")
1432
- # # # embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
1433
-
1434
- # # # # Deduplicate
1435
- # # # result_text = deduplicate_and_prepare_results_single(
1436
- # # # embedding_matrix, texts, threshold, progress
1437
- # # # )
1438
-
1439
- # # # return result_text
1440
-
1441
- # # # elif deduplication_type == "Cross-dataset":
1442
- # # # # Load Dataset 1
1443
- # # # progress(0, desc="Loading Dataset 1...")
1444
- # # # if dataset1_name == default_dataset1_name and dataset1_split == default_dataset1_split:
1445
- # # # ds1 = ds_default1
1446
- # # # else:
1447
- # # # ds1 = load_dataset(dataset1_name, split=dataset1_split)
1448
-
1449
- # # # # Load Dataset 2
1450
- # # # progress(0, desc="Loading Dataset 2...")
1451
- # # # if dataset2_name == default_dataset2_name and dataset2_split == default_dataset2_split:
1452
- # # # ds2 = ds_default2
1453
- # # # else:
1454
- # # # ds2 = load_dataset(dataset2_name, split=dataset2_split)
1455
-
1456
- # # # # Extract texts from Dataset 1
1457
- # # # progress(0, desc="Extracting texts from Dataset 1...")
1458
- # # # texts1 = [example[dataset1_text_column] for example in ds1]
1459
-
1460
- # # # # Extract texts from Dataset 2
1461
- # # # progress(0, desc="Extracting texts from Dataset 2...")
1462
- # # # texts2 = [example[dataset2_text_column] for example in ds2]
1463
-
1464
- # # # # Compute embeddings for Dataset 1
1465
- # # # progress(0, desc="Computing embeddings for Dataset 1...")
1466
- # # # embedding_matrix1 = model.encode(texts1, show_progressbar=True)
1467
-
1468
- # # # # Compute embeddings for Dataset 2
1469
- # # # progress(0, desc="Computing embeddings for Dataset 2...")
1470
- # # # embedding_matrix2 = model.encode(texts2, show_progressbar=True)
1471
-
1472
- # # # # Deduplicate across datasets
1473
- # # # result_text = deduplicate_and_prepare_results_cross(
1474
- # # # embedding_matrix1, embedding_matrix2, texts1, texts2, threshold, progress, dataset2_name, dataset2_split
1475
- # # # )
1476
-
1477
- # # # return result_text
1478
-
1479
- # # # finally:
1480
- # # # # Restore original tqdm
1481
- # # # tqdm.tqdm = original_tqdm
1482
- # # # sys.modules['tqdm'].tqdm = original_tqdm
1483
- # # # sys.modules['tqdm.auto'].tqdm = original_tqdm
1484
-
1485
- # # # # Restore reach's original tqdm
1486
- # # # if original_reach_tqdm is not None:
1487
- # # # Reach.tqdm = original_reach_tqdm
1488
- # # # else:
1489
- # # # del Reach.tqdm # If it wasn't originally in Reach's __dict__
1490
-
1491
- # # # def deduplicate_and_prepare_results_single(embedding_matrix, texts, threshold, progress):
1492
- # # # # Deduplicate
1493
- # # # deduplicated_indices, duplicate_to_original_mapping = deduplicate(
1494
- # # # embedding_matrix, threshold, progress=progress
1495
- # # # )
1496
-
1497
- # # # # Prepare the results
1498
- # # # num_duplicates = len(duplicate_to_original_mapping)
1499
- # # # num_total = len(texts)
1500
- # # # num_deduplicated = len(deduplicated_indices)
1501
-
1502
- # # # result_text = f"**Total documents:** {num_total}\n"
1503
- # # # result_text += f"**Number of duplicates found:** {num_duplicates}\n"
1504
- # # # result_text += f"**Number of unique documents after deduplication:** {num_deduplicated}\n\n"
1505
-
1506
- # # # # Show deduplicated examples
1507
- # # # if num_duplicates > 0:
1508
- # # # result_text += "**Examples of duplicates found:**\n\n"
1509
- # # # num_examples = min(5, num_duplicates)
1510
- # # # for duplicate_idx, original_idx in list(duplicate_to_original_mapping.items())[:num_examples]:
1511
- # # # original_text = texts[original_idx]
1512
- # # # duplicate_text = texts[duplicate_idx]
1513
- # # # differences = display_word_differences(original_text, duplicate_text)
1514
- # # # result_text += f"**Original text:**\n{original_text}\n\n"
1515
- # # # result_text += f"**Duplicate text:**\n{duplicate_text}\n\n"
1516
 
 
 
  import gradio as gr
  from datasets import load_dataset
  import numpy as np
 
  for i in range(0, len(iterable), batch_size):
  yield iterable[i:i + batch_size]

  def display_word_differences(x: str, y: str) -> str:
  diff = ndiff(x.split(), y.split())
  return " ".join([word for word in diff if word.startswith(('+', '-'))])
 
  # Compute embeddings
  status = "Computing embeddings for Dataset 1..."
  yield status, ""
+ embeddings = []
+ batch_size = 64
+ total_batches = (len(texts) + batch_size - 1) // batch_size
+ for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
+ batch_embeddings = model.encode(batch_texts, show_progressbar=False)
+ embeddings.append(batch_embeddings)
+ # Update progress
+ progress((i + 1) / total_batches, desc="Computing embeddings for Dataset 1")
+ # Yield control back to Gradio
+ yield status, ""
+ embedding_matrix = np.concatenate(embeddings, axis=0)

  # Deduplicate
  status = "Deduplicating embeddings..."
 
  # Compute embeddings for Dataset 1
  status = "Computing embeddings for Dataset 1..."
  yield status, ""
+ embeddings1 = []
+ batch_size = 64
+ total_batches1 = (len(texts1) + batch_size - 1) // batch_size
+ for i, batch_texts in enumerate(batch_iterable(texts1, batch_size)):
+ batch_embeddings = model.encode(batch_texts, show_progressbar=False)
+ embeddings1.append(batch_embeddings)
+ # Update progress
+ progress((i + 1) / total_batches1, desc="Computing embeddings for Dataset 1")
+ # Yield control back to Gradio
+ yield status, ""
+ embedding_matrix1 = np.concatenate(embeddings1, axis=0)

  # Compute embeddings for Dataset 2
  status = "Computing embeddings for Dataset 2..."
  yield status, ""
+ embeddings2 = []
+ total_batches2 = (len(texts2) + batch_size - 1) // batch_size
+ for i, batch_texts in enumerate(batch_iterable(texts2, batch_size)):
+ batch_embeddings = model.encode(batch_texts, show_progressbar=False)
+ embeddings2.append(batch_embeddings)
+ # Update progress
+ progress((i + 1) / total_batches2, desc="Computing embeddings for Dataset 2")
+ # Yield control back to Gradio
+ yield status, ""
+ embedding_matrix2 = np.concatenate(embeddings2, axis=0)

  # Deduplicate across datasets
  status = "Deduplicating embeddings across datasets..."
 
  label="Similarity Threshold"
  )

+ compute_button = gr.Button("Compute")

  status_output = gr.Markdown()
  result_output = gr.Markdown()
 
  )

  demo.launch()
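
Note: the following is a minimal, standalone sketch (not part of app.py) of the batched-encoding-with-progress pattern this commit introduces. The encoder `_encode_stub` is a hypothetical stand-in for the real `model.encode`; only the batching, the `gr.Progress` callback, and the generator-style `yield` of status updates mirror the added lines above.

import numpy as np
import gradio as gr

def _encode_stub(batch):
    # Hypothetical stand-in encoder: returns random vectors so the sketch runs
    # without model2vec or a dataset; swap in a real model's encode() in practice.
    return np.random.rand(len(batch), 8)

def embed_with_status(texts, progress=gr.Progress()):
    # Encode texts in fixed-size batches, reporting fractional progress and
    # yielding a status string after each batch so a Gradio UI can refresh.
    batch_size = 64
    total_batches = (len(texts) + batch_size - 1) // batch_size
    embeddings = []
    for batch_num, start in enumerate(range(0, len(texts), batch_size), start=1):
        embeddings.append(_encode_stub(texts[start:start + batch_size]))
        progress(batch_num / total_batches, desc="Computing embeddings")
        yield f"Computed {batch_num}/{total_batches} batches"
    matrix = np.concatenate(embeddings, axis=0)
    yield f"Done: {matrix.shape[0]} embeddings of dimension {matrix.shape[1]}"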