Pringled committed on
Commit
5963317
·
1 Parent(s): b4f232c
Files changed (1) hide show
  1. app.py +4 -15
app.py CHANGED
@@ -68,13 +68,10 @@ def perform_deduplication(
68
  embeddings = []
69
  batch_size = 64
70
  total_batches = (len(texts) + batch_size - 1) // batch_size
71
- for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
 
72
  batch_embeddings = model.encode(batch_texts, show_progressbar=False)
73
  embeddings.append(batch_embeddings)
74
- # Update progress
75
- progress((i + 1) / total_batches, desc="Computing embeddings for Dataset 1")
76
- # Yield control back to Gradio
77
- yield status, ""
78
  embedding_matrix = np.concatenate(embeddings, axis=0)
79
 
80
  # Deduplicate
@@ -145,13 +142,9 @@ def perform_deduplication(
145
  embeddings1 = []
146
  batch_size = 64
147
  total_batches1 = (len(texts1) + batch_size - 1) // batch_size
148
- for i, batch_texts in enumerate(batch_iterable(texts1, batch_size)):
149
  batch_embeddings = model.encode(batch_texts, show_progressbar=False)
150
  embeddings1.append(batch_embeddings)
151
- # Update progress
152
- progress((i + 1) / total_batches1, desc="Computing embeddings for Dataset 1")
153
- # Yield control back to Gradio
154
- yield status, ""
155
  embedding_matrix1 = np.concatenate(embeddings1, axis=0)
156
 
157
  # Compute embeddings for Dataset 2
@@ -159,13 +152,9 @@ def perform_deduplication(
159
  yield status, ""
160
  embeddings2 = []
161
  total_batches2 = (len(texts2) + batch_size - 1) // batch_size
162
- for i, batch_texts in enumerate(batch_iterable(texts2, batch_size)):
163
  batch_embeddings = model.encode(batch_texts, show_progressbar=False)
164
  embeddings2.append(batch_embeddings)
165
- # Update progress
166
- progress((i + 1) / total_batches2, desc="Computing embeddings for Dataset 2")
167
- # Yield control back to Gradio
168
- yield status, ""
169
  embedding_matrix2 = np.concatenate(embeddings2, axis=0)
170
 
171
  # Deduplicate across datasets
 
68
  embeddings = []
69
  batch_size = 64
70
  total_batches = (len(texts) + batch_size - 1) // batch_size
71
+ # Use progress.tqdm without yielding inside the loop
72
+ for batch_texts in progress.tqdm(batch_iterable(texts, batch_size), desc="Computing embeddings for Dataset 1", total=total_batches):
73
  batch_embeddings = model.encode(batch_texts, show_progressbar=False)
74
  embeddings.append(batch_embeddings)
 
 
 
 
75
  embedding_matrix = np.concatenate(embeddings, axis=0)
76
 
77
  # Deduplicate
 
142
  embeddings1 = []
143
  batch_size = 64
144
  total_batches1 = (len(texts1) + batch_size - 1) // batch_size
145
+ for batch_texts in progress.tqdm(batch_iterable(texts1, batch_size), desc="Computing embeddings for Dataset 1", total=total_batches1):
146
  batch_embeddings = model.encode(batch_texts, show_progressbar=False)
147
  embeddings1.append(batch_embeddings)
 
 
 
 
148
  embedding_matrix1 = np.concatenate(embeddings1, axis=0)
149
 
150
  # Compute embeddings for Dataset 2
 
152
  yield status, ""
153
  embeddings2 = []
154
  total_batches2 = (len(texts2) + batch_size - 1) // batch_size
155
+ for batch_texts in progress.tqdm(batch_iterable(texts2, batch_size), desc="Computing embeddings for Dataset 2", total=total_batches2):
156
  batch_embeddings = model.encode(batch_texts, show_progressbar=False)
157
  embeddings2.append(batch_embeddings)
 
 
 
 
158
  embedding_matrix2 = np.concatenate(embeddings2, axis=0)
159
 
160
  # Deduplicate across datasets