mwitiderrick committed · verified
Commit ec64e6e · Parent: 3d2faa5

Update index_miriad_to_qdrant.py

Files changed (1): index_miriad_to_qdrant.py (+20 -3)
index_miriad_to_qdrant.py CHANGED
@@ -16,7 +16,7 @@ client = QdrantClient(
 )
 
 # Load MIRIAD dataset (sample for demo)
-ds = load_dataset("miriad/miriad-5.8M", split="train").select(range(100000))
+ds = load_dataset("miriad/miriad-5.8M", split="train").select(range(1000))
 
 dense_documents = [
     models.Document(text=doc, model="BAAI/bge-small-en")
@@ -32,7 +32,7 @@ collection_name = "medical_chat_bot"
 
 # Create collection
 if not client.collection_exists(collection_name):
-    client.recreate_collection(
+    client.create_collection(
         collection_name=collection_name,
         vectors_config={
             "dense": models.VectorParams(size=384, distance=models.Distance.COSINE),
@@ -47,6 +47,19 @@ if not client.collection_exists(collection_name):
         }
     )
 
+# Create payload indexes
+client.create_payload_index(
+    collection_name=collection_name,
+    field_name="specialty",
+    field_schema="keyword",
+)
+
+client.create_payload_index(
+    collection_name=collection_name,
+    field_name="year",
+    field_schema="integer",
+)
+
 # Batch upload in chunks
 BATCH_SIZE = 3
 points_batch = []
@@ -58,7 +71,11 @@ for i in range(len(ds['passage_text'])):
             "dense": dense_documents[i],
             "colbert": colbert_documents[i]
         },
-        payload={"passage_text": ds['passage_text'][i], "paper_id": ds['paper_id'][i]}
+        payload={
+            "passage_text": ds['passage_text'][i],
+            "year": ds['year'][i],
+            "specialty": ds['specialty'][i],
+        }
     )
     points_batch.append(point)
 
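The keyword and integer payload indexes added above let Qdrant filter matches on specialty and year at query time. A minimal sketch of a filtered dense-vector query against the updated collection, assuming a locally running Qdrant instance; the query text and filter values ("Cardiology", 2015) are illustrative and not taken from this repo:

from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")  # assumed local instance

# Dense-vector search restricted to one specialty and a year range,
# embedding the query with the same model the indexing script uses.
results = client.query_points(
    collection_name="medical_chat_bot",
    query=models.Document(
        text="first-line treatment for atrial fibrillation",  # illustrative query
        model="BAAI/bge-small-en",
    ),
    using="dense",
    query_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="specialty",
                match=models.MatchValue(value="Cardiology"),  # hypothetical value
            ),
            models.FieldCondition(
                key="year",
                range=models.Range(gte=2015),  # hypothetical cutoff
            ),
        ]
    ),
    limit=5,
)

for point in results.points:
    print(point.score, point.payload["passage_text"][:80])

Qdrant can filter on unindexed payload fields too, but creating the indexes up front keeps filtered queries fast as the collection grows.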