Updates
Browse files
app.py
CHANGED
@@ -4,8 +4,6 @@ import numpy as np
|
|
4 |
from model2vec import StaticModel
|
5 |
from reach import Reach
|
6 |
from difflib import ndiff
|
7 |
-
import tqdm
|
8 |
-
from contextlib import contextmanager
|
9 |
|
10 |
# Load the model at startup
|
11 |
model = StaticModel.from_pretrained("minishlab/M2V_base_output")
|
@@ -27,19 +25,14 @@ def batch_iterable(iterable, batch_size):
|
|
27 |
for i in range(0, len(iterable), batch_size):
|
28 |
yield iterable[i:i + batch_size]
|
29 |
|
30 |
-
@contextmanager
|
31 |
-
def tqdm_redirect(progress):
|
32 |
-
original_tqdm = tqdm.tqdm
|
33 |
-
try:
|
34 |
-
tqdm.tqdm = progress.tqdm
|
35 |
-
yield
|
36 |
-
finally:
|
37 |
-
tqdm.tqdm = original_tqdm
|
38 |
-
|
39 |
def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
|
40 |
-
|
41 |
-
|
42 |
-
|
|
|
|
|
|
|
|
|
43 |
|
44 |
def deduplicate(
|
45 |
embedding_matrix: np.ndarray,
|
@@ -299,7 +292,8 @@ def deduplicate_across_datasets(
|
|
299 |
|
300 |
return duplicate_indices_in_test, duplicate_to_original_mapping
|
301 |
|
302 |
-
|
|
|
303 |
gr.Markdown("# Semantic Deduplication")
|
304 |
|
305 |
deduplication_type = gr.Radio(
|
@@ -327,8 +321,8 @@ with gr.Blocks() as demo:
|
|
327 |
|
328 |
compute_button = gr.Button("Compute")
|
329 |
|
330 |
-
# Use '
|
331 |
-
status_output = gr.
|
332 |
result_output = gr.Markdown()
|
333 |
|
334 |
# Function to update the visibility of dataset2_inputs
|
|
|
4 |
from model2vec import StaticModel
|
5 |
from reach import Reach
|
6 |
from difflib import ndiff
|
|
|
|
|
7 |
|
8 |
# Load the model at startup
|
9 |
model = StaticModel.from_pretrained("minishlab/M2V_base_output")
|
|
|
25 |
for i in range(0, len(iterable), batch_size):
|
26 |
yield iterable[i:i + batch_size]
|
27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
    """Embed ``texts`` in batches with the module-level ``model`` and report progress.

    Args:
        texts: Sliceable sequence supporting ``len()``; each slice produced by
            ``batch_iterable`` is passed to ``model.encode``.
        batch_size: Number of items handed to ``model.encode`` per call.
        progress: Callable invoked as ``progress(fraction, desc=desc)`` after
            every batch, where ``fraction`` is the share of batches completed.
        desc: Label forwarded to ``progress`` on each update.

    Returns:
        The per-batch results of ``model.encode`` joined along axis 0 with
        ``np.concatenate``.

    Raises:
        ValueError: If ``texts`` is empty.
    """
    if len(texts) == 0:
        # Without this guard the loop below never runs and np.concatenate([])
        # fails with an opaque "need at least one array" message.
        raise ValueError("Cannot compute embeddings: `texts` is empty.")

    embeddings = []
    # Ceiling division: a trailing partial batch still counts as one batch.
    total_batches = (len(texts) + batch_size - 1) // batch_size
    for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
        # The model's own progress bar is disabled; progress is reported
        # through the `progress` callback instead.
        batch_embeddings = model.encode(batch_texts, show_progressbar=False)
        embeddings.append(batch_embeddings)
        progress((i + 1) / total_batches, desc=desc)
    return np.concatenate(embeddings, axis=0)
|
36 |
|
37 |
def deduplicate(
|
38 |
embedding_matrix: np.ndarray,
|
|
|
292 |
|
293 |
return duplicate_indices_in_test, duplicate_to_original_mapping
|
294 |
|
295 |
+
# Adjust the height of the status_output component using custom CSS
|
296 |
+
with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
|
297 |
gr.Markdown("# Semantic Deduplication")
|
298 |
|
299 |
deduplication_type = gr.Radio(
|
|
|
321 |
|
322 |
compute_button = gr.Button("Compute")
|
323 |
|
324 |
+
# Use 'gr.Markdown' with 'elem_id' and custom CSS to adjust height
|
325 |
+
status_output = gr.Markdown(elem_id="status_output")
|
326 |
result_output = gr.Markdown()
|
327 |
|
328 |
# Function to update the visibility of dataset2_inputs
|