IotaCluster committed on
Commit
bea665a
·
verified ·
1 Parent(s): b974815

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -8
app.py CHANGED
@@ -20,15 +20,16 @@ def embed_sparse(text: str):
20
  if not text.strip():
21
  return {"error": "Input text is empty."}
22
  tokens = text.split()
 
23
  bm25 = BM25Okapi([tokens])
24
- scores = bm25.get_scores(tokens)
25
- # Map each term to its BM25 weight
26
- term_weights = {tok: float(score) for tok, score in zip(tokens, scores)}
27
- # Build a consistent vocabulary (sorted for deterministic indices)
28
- terms = sorted(term_weights.keys())
29
- indices = list(range(len(terms)))
30
- values = [term_weights[term] for term in terms]
31
- return {"indices": indices, "values": values, "terms": terms} # 'terms' is optional, for debugging
32
 
33
  # 3. Late-interaction embedding model (ColBERT)
34
  colbert_tokenizer = AutoTokenizer.from_pretrained('colbert-ir/colbertv2.0', use_fast=True)
 
20
  if not text.strip():
21
  return {"error": "Input text is empty."}
22
  tokens = text.split()
23
+ # Treat the input as a single document and also as the query
24
  bm25 = BM25Okapi([tokens])
25
+ unique_terms = sorted(set(tokens))
26
+ # BM25 expects a query, so we use the unique terms as the query
27
+ scores = bm25.get_scores(unique_terms)
28
+ term_weights = {term: float(score) for term, score in zip(unique_terms, scores)}
29
+ # Build Qdrant format
30
+ indices = list(range(len(unique_terms)))
31
+ values = [term_weights[term] for term in unique_terms]
32
+ return {"indices": indices, "values": values, "terms": unique_terms}
33
 
34
  # 3. Late-interaction embedding model (ColBERT)
35
  colbert_tokenizer = AutoTokenizer.from_pretrained('colbert-ir/colbertv2.0', use_fast=True)