Spaces:

minishlab
/

semantic-deduplication

Running

App Files Files Community

Pringled committed on Oct 12, 2024

Commit

38ed48e

1 Parent(s): 75ff340

Updates

Browse files

Files changed (1) hide show

app.py +10 -7

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ import gradio as gr
 from datasets import load_dataset
 import numpy as np
 from model2vec import StaticModel
 from reach import Reach
 from difflib import ndiff
@@ -24,25 +25,26 @@ ds_default2 = load_dataset(default_dataset2_name, split=default_dataset2_split)
 # Patch tqdm to use Gradio's progress bar
 from tqdm import tqdm as original_tqdm
 # Patch tqdm to use Gradio's progress bar
 def patch_tqdm_for_gradio(progress):
     class GradioTqdm(original_tqdm):
         def __init__(self, *args, **kwargs):
             super().__init__(*args, **kwargs)
             self.progress = progress
-            # Set smaller step sizes or update more frequently based on total items
             self.total_batches = kwargs.get('total', len(args[0])) if len(args) > 0 else 1
-            self.update_interval = max(1, self.total_batches // 100)  # Update every 1% of progress
         def update(self, n=1):
             super().update(n)
-            # Only update Gradio's progress every `update_interval` steps
             if self.n % self.update_interval == 0 or self.n == self.total_batches:
                 self.progress(self.n / self.total_batches)
     return GradioTqdm
 # Function to patch the original encode function with our Gradio tqdm
 def original_encode_with_tqdm(original_encode_func, patched_tqdm):
@@ -153,8 +155,9 @@ def perform_deduplication(
             yield status, ""
             texts = [example[dataset1_text_column] for example in ds]
-            patched_tqdm = patch_tqdm_for_gradio(progress)
-            model.encode = original_encode_with_tqdm(model.encode, patched_tqdm)
             # Compute embeddings
             status = "Computing embeddings for Dataset 1..."
             yield status, ""

 from datasets import load_dataset
 import numpy as np
 from model2vec import StaticModel
+import model2vec
 from reach import Reach
 from difflib import ndiff
 # Patch tqdm to use Gradio's progress bar
 from tqdm import tqdm as original_tqdm
+# Patch tqdm to use Gradio's progress bar
 # Patch tqdm to use Gradio's progress bar
 def patch_tqdm_for_gradio(progress):
     class GradioTqdm(original_tqdm):
         def __init__(self, *args, **kwargs):
             super().__init__(*args, **kwargs)
             self.progress = progress
             self.total_batches = kwargs.get('total', len(args[0])) if len(args) > 0 else 1
+            self.update_interval = max(1, self.total_batches // 100)  # Update every 1%
         def update(self, n=1):
             super().update(n)
             if self.n % self.update_interval == 0 or self.n == self.total_batches:
                 self.progress(self.n / self.total_batches)
     return GradioTqdm
+def patch_model2vec_tqdm(progress):
+    patched_tqdm = patch_tqdm_for_gradio(progress)
+    model2vec.tqdm = patched_tqdm  # Replace tqdm in the StaticModel's module
 # Function to patch the original encode function with our Gradio tqdm
 def original_encode_with_tqdm(original_encode_func, patched_tqdm):
             yield status, ""
             texts = [example[dataset1_text_column] for example in ds]
+            #patched_tqdm = patch_tqdm_for_gradio(progress)
+            patch_model2vec_tqdm(progress)
+            #model.encode = original_encode_with_tqdm(model.encode, patched_tqdm)
             # Compute embeddings
             status = "Computing embeddings for Dataset 1..."
             yield status, ""