Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -24,7 +24,11 @@ def embed_sparse(text: str):
|
|
24 |
scores = bm25.get_scores(tokens)
|
25 |
# Map each term to its BM25 weight
|
26 |
term_weights = {tok: float(score) for tok, score in zip(tokens, scores)}
|
27 |
-
|
|
|
|
|
|
|
|
|
28 |
|
29 |
# 3. Late-interaction embedding model (ColBERT)
|
30 |
colbert_tokenizer = AutoTokenizer.from_pretrained('colbert-ir/colbertv2.0', use_fast=True)
|
|
|
24 |
scores = bm25.get_scores(tokens)
|
25 |
# Map each term to its BM25 weight
|
26 |
term_weights = {tok: float(score) for tok, score in zip(tokens, scores)}
|
27 |
+
# Number this text's terms by sorted position (deterministic per call; NOTE: not a shared vocabulary — the same term can get a different index for another text)
|
28 |
+
terms = sorted(term_weights.keys())
|
29 |
+
indices = list(range(len(terms)))
|
30 |
+
values = [term_weights[term] for term in terms]
|
31 |
+
return {"indices": indices, "values": values, "terms": terms} # 'terms' is optional, for debugging
|
32 |
|
33 |
# 3. Late-interaction embedding model (ColBERT)
|
34 |
colbert_tokenizer = AutoTokenizer.from_pretrained('colbert-ir/colbertv2.0', use_fast=True)
|