Pringled committed on
Commit
5963317
·
1 Parent(s): b4f232c
Files changed (1) hide show
  1. app.py +4 -15
app.py CHANGED
@@ -68,13 +68,10 @@ def perform_deduplication(
68
  embeddings = []
69
  batch_size = 64
70
  total_batches = (len(texts) + batch_size - 1) // batch_size
71
- for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
 
72
  batch_embeddings = model.encode(batch_texts, show_progressbar=False)
73
  embeddings.append(batch_embeddings)
74
- # Update progress
75
- progress((i + 1) / total_batches, desc="Computing embeddings for Dataset 1")
76
- # Yield control back to Gradio
77
- yield status, ""
78
  embedding_matrix = np.concatenate(embeddings, axis=0)
79
 
80
  # Deduplicate
@@ -145,13 +142,9 @@ def perform_deduplication(
145
  embeddings1 = []
146
  batch_size = 64
147
  total_batches1 = (len(texts1) + batch_size - 1) // batch_size
148
- for i, batch_texts in enumerate(batch_iterable(texts1, batch_size)):
149
  batch_embeddings = model.encode(batch_texts, show_progressbar=False)
150
  embeddings1.append(batch_embeddings)
151
- # Update progress
152
- progress((i + 1) / total_batches1, desc="Computing embeddings for Dataset 1")
153
- # Yield control back to Gradio
154
- yield status, ""
155
  embedding_matrix1 = np.concatenate(embeddings1, axis=0)
156
 
157
  # Compute embeddings for Dataset 2
@@ -159,13 +152,9 @@ def perform_deduplication(
159
  yield status, ""
160
  embeddings2 = []
161
  total_batches2 = (len(texts2) + batch_size - 1) // batch_size
162
- for i, batch_texts in enumerate(batch_iterable(texts2, batch_size)):
163
  batch_embeddings = model.encode(batch_texts, show_progressbar=False)
164
  embeddings2.append(batch_embeddings)
165
- # Update progress
166
- progress((i + 1) / total_batches2, desc="Computing embeddings for Dataset 2")
167
- # Yield control back to Gradio
168
- yield status, ""
169
  embedding_matrix2 = np.concatenate(embeddings2, axis=0)
170
 
171
  # Deduplicate across datasets
 
68
  embeddings = []
69
  batch_size = 64
70
  total_batches = (len(texts) + batch_size - 1) // batch_size
71
+ # Use progress.tqdm without yielding inside the loop
72
+ for batch_texts in progress.tqdm(batch_iterable(texts, batch_size), desc="Computing embeddings for Dataset 1", total=total_batches):
73
  batch_embeddings = model.encode(batch_texts, show_progressbar=False)
74
  embeddings.append(batch_embeddings)
 
 
 
 
75
  embedding_matrix = np.concatenate(embeddings, axis=0)
76
 
77
  # Deduplicate
 
142
  embeddings1 = []
143
  batch_size = 64
144
  total_batches1 = (len(texts1) + batch_size - 1) // batch_size
145
+ for batch_texts in progress.tqdm(batch_iterable(texts1, batch_size), desc="Computing embeddings for Dataset 1", total=total_batches1):
146
  batch_embeddings = model.encode(batch_texts, show_progressbar=False)
147
  embeddings1.append(batch_embeddings)
 
 
 
 
148
  embedding_matrix1 = np.concatenate(embeddings1, axis=0)
149
 
150
  # Compute embeddings for Dataset 2
 
152
  yield status, ""
153
  embeddings2 = []
154
  total_batches2 = (len(texts2) + batch_size - 1) // batch_size
155
+ for batch_texts in progress.tqdm(batch_iterable(texts2, batch_size), desc="Computing embeddings for Dataset 2", total=total_batches2):
156
  batch_embeddings = model.encode(batch_texts, show_progressbar=False)
157
  embeddings2.append(batch_embeddings)
 
 
 
 
158
  embedding_matrix2 = np.concatenate(embeddings2, axis=0)
159
 
160
  # Deduplicate across datasets