Spaces:

IotaCluster
/

embedding-model

Running

IotaCluster committed on Jun 30

Commit

bea665a

verified ·

1 Parent(s): b974815

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -20,15 +20,16 @@ def embed_sparse(text: str):
     if not text.strip():
         return {"error": "Input text is empty."}
     tokens = text.split()
     bm25 = BM25Okapi([tokens])
-    scores = bm25.get_scores(tokens)
-    # Map each term to its BM25 weight
-    term_weights = {tok: float(score) for tok, score in zip(tokens, scores)}
-    # Build a consistent vocabulary (sorted for deterministic indices)
-    terms = sorted(term_weights.keys())
-    indices = list(range(len(terms)))
-    values = [term_weights[term] for term in terms]
-    return {"indices": indices, "values": values, "terms": terms}  # 'terms' is optional, for debugging
 # 3. Late-interaction embedding model (ColBERT)
 colbert_tokenizer = AutoTokenizer.from_pretrained('colbert-ir/colbertv2.0', use_fast=True)

     if not text.strip():
         return {"error": "Input text is empty."}
     tokens = text.split()
+    # Treat the input as a single document and also as the query
     bm25 = BM25Okapi([tokens])
+    unique_terms = sorted(set(tokens))
+    # BM25 expects a query, so we use the unique terms as the query
+    scores = bm25.get_scores(unique_terms)
+    term_weights = {term: float(score) for term, score in zip(unique_terms, scores)}
+    # Build Qdrant format
+    indices = list(range(len(unique_terms)))
+    values = [term_weights[term] for term in unique_terms]
+    return {"indices": indices, "values": values, "terms": unique_terms}
 # 3. Late-interaction embedding model (ColBERT)
 colbert_tokenizer = AutoTokenizer.from_pretrained('colbert-ir/colbertv2.0', use_fast=True)