Spaces:

minishlab
/

semantic-deduplication

Running

Pringled committed on Oct 12, 2024

Commit

d90d4c0

1 Parent(s): e49e0e9

Updates

Files changed (1) hide show

app.py CHANGED Viewed

@@ -4,8 +4,6 @@ import numpy as np
 from model2vec import StaticModel
 from reach import Reach
 from difflib import ndiff
-import tqdm
-from contextlib import contextmanager
 # Load the model at startup
 model = StaticModel.from_pretrained("minishlab/M2V_base_output")
@@ -27,19 +25,14 @@ def batch_iterable(iterable, batch_size):
     for i in range(0, len(iterable), batch_size):
         yield iterable[i:i + batch_size]
-@contextmanager
-def tqdm_redirect(progress):
-    original_tqdm = tqdm.tqdm
-    try:
-        tqdm.tqdm = progress.tqdm
-        yield
-    finally:
-        tqdm.tqdm = original_tqdm
 def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
-    with tqdm_redirect(progress):
-        embeddings = model.encode(texts, show_progressbar=True, batch_size=batch_size)
-    return embeddings
 def deduplicate(
     embedding_matrix: np.ndarray,
@@ -299,7 +292,8 @@ def deduplicate_across_datasets(
     return duplicate_indices_in_test, duplicate_to_original_mapping
-with gr.Blocks() as demo:
     gr.Markdown("# Semantic Deduplication")
     deduplication_type = gr.Radio(
@@ -327,8 +321,8 @@ with gr.Blocks() as demo:
     compute_button = gr.Button("Compute")
-    # Use 'lines' parameter to set the height
-    status_output = gr.Textbox(lines=10, label="Status")
     result_output = gr.Markdown()
     # Function to update the visibility of dataset2_inputs

 from model2vec import StaticModel
 from reach import Reach
 from difflib import ndiff
 # Load the model at startup
 model = StaticModel.from_pretrained("minishlab/M2V_base_output")
     for i in range(0, len(iterable), batch_size):
         yield iterable[i:i + batch_size]
 def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
+    embeddings = []
+    total_batches = (len(texts) + batch_size - 1) // batch_size
+    for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
+        batch_embeddings = model.encode(batch_texts, show_progressbar=False)
+        embeddings.append(batch_embeddings)
+        progress((i + 1) / total_batches, desc=desc)
+    return np.concatenate(embeddings, axis=0)
 def deduplicate(
     embedding_matrix: np.ndarray,
     return duplicate_indices_in_test, duplicate_to_original_mapping
+# Adjust the height of the status_output component using custom CSS
+with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
     gr.Markdown("# Semantic Deduplication")
     deduplication_type = gr.Radio(
     compute_button = gr.Button("Compute")
+    # Use 'gr.Markdown' with 'elem_id' and custom CSS to adjust height
+    status_output = gr.Markdown(elem_id="status_output")
     result_output = gr.Markdown()
     # Function to update the visibility of dataset2_inputs