Update index_miriad_to_qdrant.py
index_miriad_to_qdrant.py  CHANGED  (+20 -3)
@@ -16,7 +16,7 @@ client = QdrantClient(
 )
 
 # Load MIRIAD dataset (sample for demo)
-ds = load_dataset("miriad/miriad-5.8M", split="train").select(range(
+ds = load_dataset("miriad/miriad-5.8M", split="train").select(range(1000))
 
 dense_documents = [
     models.Document(text=doc, model="BAAI/bge-small-en")
@@ -32,7 +32,7 @@ collection_name = "medical_chat_bot"
 
 # Create collection
 if not client.collection_exists(collection_name):
-    client.
+    client.create_collection(
         collection_name=collection_name,
         vectors_config={
            "dense": models.VectorParams(size=384, distance=models.Distance.COSINE),
@@ -47,6 +47,19 @@ if not client.collection_exists(collection_name):
         }
     )
 
+# Create payload indexes
+client.create_payload_index(
+    collection_name=collection_name,
+    field_name="specialty",
+    field_schema="keyword",
+)
+
+client.create_payload_index(
+    collection_name=collection_name,
+    field_name="year",
+    field_schema="integer",
+)
+
 # Batch upload in chunks
 BATCH_SIZE = 3
 points_batch = []
@@ -58,7 +71,11 @@ for i in range(len(ds['passage_text'])):
             "dense": dense_documents[i],
             "colbert": colbert_documents[i]
         },
-        payload={
+        payload={
+            "passage_text": ds['passage_text'][i],
+            "year": ds['year'][i],
+            "specialty": ds['specialty'][i],
+        }
     )
     points_batch.append(point)
 
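The payload indexes added in this commit ("specialty" as keyword, "year" as integer) make filtered retrieval over those fields efficient. Below is a minimal query sketch against the updated collection, assuming a locally reachable Qdrant instance and a qdrant-client version with built-in FastEmbed inference; the query text and filter values are purely illustrative, not part of the Space's code.

from qdrant_client import QdrantClient, models

# Assumption: Qdrant runs locally; the actual Space configures its own client/credentials.
client = QdrantClient(url="http://localhost:6333")

results = client.query_points(
    collection_name="medical_chat_bot",
    # The Document object is embedded client-side with the same model used for indexing.
    query=models.Document(text="first-line treatment for hypertension", model="BAAI/bge-small-en"),
    using="dense",
    # Filter on the payload fields indexed in this commit (values are illustrative).
    query_filter=models.Filter(
        must=[
            models.FieldCondition(key="specialty", match=models.MatchValue(value="cardiology")),
            models.FieldCondition(key="year", range=models.Range(gte=2015)),
        ]
    ),
    limit=5,
)

for point in results.points:
    print(point.payload["year"], point.payload["specialty"], point.payload["passage_text"][:80])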