Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -20,15 +20,16 @@ def embed_sparse(text: str):
|
|
20 |
if not text.strip():
|
21 |
return {"error": "Input text is empty."}
|
22 |
tokens = text.split()
|
|
|
23 |
bm25 = BM25Okapi([tokens])
|
24 |
-
|
25 |
-
#
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
indices = list(range(len(
|
30 |
-
values = [term_weights[term] for term in
|
31 |
-
return {"indices": indices, "values": values, "terms":
|
32 |
|
33 |
# 3. Late-interaction embedding model (ColBERT)
|
34 |
colbert_tokenizer = AutoTokenizer.from_pretrained('colbert-ir/colbertv2.0', use_fast=True)
|
|
|
20 |
if not text.strip():
|
21 |
return {"error": "Input text is empty."}
|
22 |
tokens = text.split()
|
23 |
+
# Treat the input as a single document and also as the query
|
24 |
bm25 = BM25Okapi([tokens])
|
25 |
+
unique_terms = sorted(set(tokens))
|
26 |
+
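# BM25 expects a query, so we use the unique terms as the query. NOTE(review): assuming BM25Okapi is rank_bm25's, get_scores returns one score per corpus document (a single value here), not one per query term — confirm the per-term zip below behaves as intended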
|
27 |
+
scores = bm25.get_scores(unique_terms)
|
28 |
+
term_weights = {term: float(score) for term, score in zip(unique_terms, scores)}
|
29 |
+
# Build Qdrant format
|
30 |
+
indices = list(range(len(unique_terms)))
|
31 |
+
values = [term_weights[term] for term in unique_terms]
|
32 |
+
return {"indices": indices, "values": values, "terms": unique_terms}
|
33 |
|
34 |
# 3. Late-interaction embedding model (ColBERT)
|
35 |
colbert_tokenizer = AutoTokenizer.from_pretrained('colbert-ir/colbertv2.0', use_fast=True)
|