Spaces:
Runtime error
Runtime error
Commit
·
335e8a6
1
Parent(s):
cce1c58
add: predict functions
Browse files
medrag_multi_modal/retrieval/bm25s_retrieval.py
CHANGED
|
@@ -141,21 +141,6 @@ class BM25sRetriever(weave.Model):
|
|
| 141 |
The results are returned as a list of dictionaries, each containing a chunk and
|
| 142 |
its corresponding relevance score.
|
| 143 |
|
| 144 |
-
!!! example "Example Usage"
|
| 145 |
-
```python
|
| 146 |
-
import weave
|
| 147 |
-
from dotenv import load_dotenv
|
| 148 |
-
|
| 149 |
-
from medrag_multi_modal.retrieval import BM25sRetriever
|
| 150 |
-
|
| 151 |
-
load_dotenv()
|
| 152 |
-
weave.init(project_name="ml-colabs/medrag-multi-modal")
|
| 153 |
-
retriever = BM25sRetriever.from_wandb_artifact(
|
| 154 |
-
index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-bm25s:v2"
|
| 155 |
-
)
|
| 156 |
-
retrieved_chunks = retriever.retrieve(query="What are Ribosomes?")
|
| 157 |
-
```
|
| 158 |
-
|
| 159 |
Args:
|
| 160 |
query (str): The input query string to search for relevant chunks.
|
| 161 |
top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
|
|
@@ -177,3 +162,37 @@ class BM25sRetriever(weave.Model):
|
|
| 177 |
):
|
| 178 |
retrieved_chunks.append({"chunk": chunk, "score": score})
|
| 179 |
return retrieved_chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
The results are returned as a list of dictionaries, each containing a chunk and
|
| 142 |
its corresponding relevance score.
|
| 143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
Args:
|
| 145 |
query (str): The input query string to search for relevant chunks.
|
| 146 |
top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
|
|
|
|
| 162 |
):
|
| 163 |
retrieved_chunks.append({"chunk": chunk, "score": score})
|
| 164 |
return retrieved_chunks
|
| 165 |
+
|
| 166 |
+
@weave.op()
|
| 167 |
+
def predict(self, query: str, top_k: int = 2):
|
| 168 |
+
"""
|
| 169 |
+
Predicts the top-k most relevant chunks for a given query using the BM25 algorithm.
|
| 170 |
+
|
| 171 |
+
This function is a wrapper around the `retrieve` method. It takes an input query string,
|
| 172 |
+
tokenizes it using the BM25 tokenizer, and retrieves the top-k most relevant chunks from
|
| 173 |
+
the BM25 index. The results are returned as a list of dictionaries, each containing a chunk
|
| 174 |
+
and its corresponding relevance score.
|
| 175 |
+
|
| 176 |
+
!!! example "Example Usage"
|
| 177 |
+
```python
|
| 178 |
+
import weave
|
| 179 |
+
from dotenv import load_dotenv
|
| 180 |
+
|
| 181 |
+
from medrag_multi_modal.retrieval import BM25sRetriever
|
| 182 |
+
|
| 183 |
+
load_dotenv()
|
| 184 |
+
weave.init(project_name="ml-colabs/medrag-multi-modal")
|
| 185 |
+
retriever = BM25sRetriever.from_wandb_artifact(
|
| 186 |
+
index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-bm25s:v2"
|
| 187 |
+
)
|
| 188 |
+
retrieved_chunks = retriever.predict(query="What are Ribosomes?")
|
| 189 |
+
```
|
| 190 |
+
|
| 191 |
+
Args:
|
| 192 |
+
query (str): The input query string to search for relevant chunks.
|
| 193 |
+
top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
|
| 194 |
+
|
| 195 |
+
Returns:
|
| 196 |
+
list: A list of dictionaries, each containing a retrieved chunk and its relevance score.
|
| 197 |
+
"""
|
| 198 |
+
return self.retrieve(query, top_k)
|
medrag_multi_modal/retrieval/contriever_retrieval.py
CHANGED
|
@@ -170,22 +170,6 @@ class ContrieverRetriever(weave.Model):
|
|
| 170 |
cosine similarity or Euclidean distance. The top-k chunks with the highest similarity scores
|
| 171 |
are returned as a list of dictionaries, each containing a chunk and its corresponding score.
|
| 172 |
|
| 173 |
-
!!! example "Example Usage"
|
| 174 |
-
```python
|
| 175 |
-
import weave
|
| 176 |
-
from dotenv import load_dotenv
|
| 177 |
-
|
| 178 |
-
from medrag_multi_modal.retrieval import ContrieverRetriever, SimilarityMetric
|
| 179 |
-
|
| 180 |
-
load_dotenv()
|
| 181 |
-
weave.init(project_name="ml-colabs/medrag-multi-modal")
|
| 182 |
-
retriever = ContrieverRetriever.from_wandb_artifact(
|
| 183 |
-
chunk_dataset_name="grays-anatomy-chunks:v0",
|
| 184 |
-
index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-contriever:v1",
|
| 185 |
-
)
|
| 186 |
-
scores = retriever.retrieve(query="What are Ribosomes?", metric=SimilarityMetric.COSINE)
|
| 187 |
-
```
|
| 188 |
-
|
| 189 |
Args:
|
| 190 |
query (str): The input query string to search for relevant chunks.
|
| 191 |
top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
|
|
@@ -213,3 +197,44 @@ class ContrieverRetriever(weave.Model):
|
|
| 213 |
}
|
| 214 |
)
|
| 215 |
return retrieved_chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
cosine similarity or Euclidean distance. The top-k chunks with the highest similarity scores
|
| 171 |
are returned as a list of dictionaries, each containing a chunk and its corresponding score.
|
| 172 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
Args:
|
| 174 |
query (str): The input query string to search for relevant chunks.
|
| 175 |
top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
|
|
|
|
| 197 |
}
|
| 198 |
)
|
| 199 |
return retrieved_chunks
|
| 200 |
+
|
| 201 |
+
@weave.op()
|
| 202 |
+
def predict(
|
| 203 |
+
self,
|
| 204 |
+
query: str,
|
| 205 |
+
top_k: int = 2,
|
| 206 |
+
metric: SimilarityMetric = SimilarityMetric.COSINE,
|
| 207 |
+
):
|
| 208 |
+
"""
|
| 209 |
+
Predicts the top-k most relevant chunks for a given query using the specified similarity metric.
|
| 210 |
+
|
| 211 |
+
This function is a wrapper around the `retrieve` method. It takes an input query string,
|
| 212 |
+
retrieves the top-k most relevant chunks from the precomputed vector index based on the
|
| 213 |
+
specified similarity metric, and returns the results as a list of dictionaries, each containing
|
| 214 |
+
a chunk and its corresponding relevance score.
|
| 215 |
+
|
| 216 |
+
!!! example "Example Usage"
|
| 217 |
+
```python
|
| 218 |
+
import weave
|
| 219 |
+
from dotenv import load_dotenv
|
| 220 |
+
|
| 221 |
+
from medrag_multi_modal.retrieval import ContrieverRetriever, SimilarityMetric
|
| 222 |
+
|
| 223 |
+
load_dotenv()
|
| 224 |
+
weave.init(project_name="ml-colabs/medrag-multi-modal")
|
| 225 |
+
retriever = ContrieverRetriever.from_wandb_artifact(
|
| 226 |
+
chunk_dataset_name="grays-anatomy-chunks:v0",
|
| 227 |
+
index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-contriever:v1",
|
| 228 |
+
)
|
| 229 |
+
scores = retriever.predict(query="What are Ribosomes?", metric=SimilarityMetric.COSINE)
|
| 230 |
+
```
|
| 231 |
+
|
| 232 |
+
Args:
|
| 233 |
+
query (str): The input query string to search for relevant chunks.
|
| 234 |
+
top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
|
| 235 |
+
metric (SimilarityMetric, optional): The similarity metric to use for scoring. Defaults to cosine similarity.
|
| 236 |
+
|
| 237 |
+
Returns:
|
| 238 |
+
list: A list of dictionaries, each containing a retrieved chunk and its relevance score.
|
| 239 |
+
"""
|
| 240 |
+
return self.retrieve(query, top_k, metric)
|
medrag_multi_modal/retrieval/medcpt_retrieval.py
CHANGED
|
@@ -200,23 +200,6 @@ class MedCPTRetriever(weave.Model):
|
|
| 200 |
cosine similarity or Euclidean distance. The top-k chunks with the highest similarity scores
|
| 201 |
are returned as a list of dictionaries, each containing a chunk and its corresponding score.
|
| 202 |
|
| 203 |
-
!!! example "Example Usage"
|
| 204 |
-
```python
|
| 205 |
-
import weave
|
| 206 |
-
from dotenv import load_dotenv
|
| 207 |
-
|
| 208 |
-
import wandb
|
| 209 |
-
from medrag_multi_modal.retrieval import MedCPTRetriever
|
| 210 |
-
|
| 211 |
-
load_dotenv()
|
| 212 |
-
weave.init(project_name="ml-colabs/medrag-multi-modal")
|
| 213 |
-
retriever = MedCPTRetriever.from_wandb_artifact(
|
| 214 |
-
chunk_dataset_name="grays-anatomy-chunks:v0",
|
| 215 |
-
index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-medcpt:v0",
|
| 216 |
-
)
|
| 217 |
-
retriever.retrieve(query="What are Ribosomes?")
|
| 218 |
-
```
|
| 219 |
-
|
| 220 |
Args:
|
| 221 |
query (str): The input query string to search for relevant chunks.
|
| 222 |
top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
|
|
@@ -253,3 +236,44 @@ class MedCPTRetriever(weave.Model):
|
|
| 253 |
}
|
| 254 |
)
|
| 255 |
return retrieved_chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
cosine similarity or Euclidean distance. The top-k chunks with the highest similarity scores
|
| 201 |
are returned as a list of dictionaries, each containing a chunk and its corresponding score.
|
| 202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
Args:
|
| 204 |
query (str): The input query string to search for relevant chunks.
|
| 205 |
top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
|
|
|
|
| 236 |
}
|
| 237 |
)
|
| 238 |
return retrieved_chunks
|
| 239 |
+
|
| 240 |
+
@weave.op()
|
| 241 |
+
def predict(
|
| 242 |
+
self,
|
| 243 |
+
query: str,
|
| 244 |
+
top_k: int = 2,
|
| 245 |
+
metric: SimilarityMetric = SimilarityMetric.COSINE,
|
| 246 |
+
):
|
| 247 |
+
"""
|
| 248 |
+
Predicts the most relevant chunks for a given query.
|
| 249 |
+
|
| 250 |
+
This function uses the `retrieve` method to find the top-k relevant chunks
|
| 251 |
+
from the dataset based on the input query. It allows specifying the number
|
| 252 |
+
of top relevant chunks to retrieve and the similarity metric to use for scoring.
|
| 253 |
+
|
| 254 |
+
!!! example "Example Usage"
|
| 255 |
+
```python
|
| 256 |
+
import weave
|
| 257 |
+
from dotenv import load_dotenv
|
| 258 |
+
|
| 259 |
+
import wandb
|
| 260 |
+
from medrag_multi_modal.retrieval import MedCPTRetriever
|
| 261 |
+
|
| 262 |
+
load_dotenv()
|
| 263 |
+
weave.init(project_name="ml-colabs/medrag-multi-modal")
|
| 264 |
+
retriever = MedCPTRetriever.from_wandb_artifact(
|
| 265 |
+
chunk_dataset_name="grays-anatomy-chunks:v0",
|
| 266 |
+
index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-medcpt:v0",
|
| 267 |
+
)
|
| 268 |
+
retriever.predict(query="What are Ribosomes?")
|
| 269 |
+
```
|
| 270 |
+
|
| 271 |
+
Args:
|
| 272 |
+
query (str): The input query string to search for relevant chunks.
|
| 273 |
+
top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.
|
| 274 |
+
metric (SimilarityMetric, optional): The similarity metric to use for scoring. Defaults to cosine similarity.
|
| 275 |
+
|
| 276 |
+
Returns:
|
| 277 |
+
list: A list of dictionaries, each containing a retrieved chunk and its relevance score.
|
| 278 |
+
"""
|
| 279 |
+
return self.retrieve(query, top_k, metric)
|
medrag_multi_modal/retrieval/nv_embed_2.py
CHANGED
|
@@ -177,27 +177,6 @@ class NVEmbed2Retriever(weave.Model):
|
|
| 177 |
cosine similarity or Euclidean distance. The top-k chunks with the highest similarity scores
|
| 178 |
are returned as a list of dictionaries, each containing a chunk and its corresponding score.
|
| 179 |
|
| 180 |
-
!!! example "Example Usage"
|
| 181 |
-
```python
|
| 182 |
-
import weave
|
| 183 |
-
from dotenv import load_dotenv
|
| 184 |
-
|
| 185 |
-
import wandb
|
| 186 |
-
from medrag_multi_modal.retrieval import NVEmbed2Retriever
|
| 187 |
-
|
| 188 |
-
load_dotenv()
|
| 189 |
-
weave.init(project_name="ml-colabs/medrag-multi-modal")
|
| 190 |
-
retriever = NVEmbed2Retriever(model_name="nvidia/NV-Embed-v2")
|
| 191 |
-
retriever.index(
|
| 192 |
-
chunk_dataset_name="grays-anatomy-chunks:v0",
|
| 193 |
-
index_name="grays-anatomy-nvembed2",
|
| 194 |
-
)
|
| 195 |
-
retriever = NVEmbed2Retriever.from_wandb_artifact(
|
| 196 |
-
chunk_dataset_name="grays-anatomy-chunks:v0",
|
| 197 |
-
index_artifact_address="ml-colabs/medrag-multi-modal/grays-anatomy-nvembed2:v0",
|
| 198 |
-
)
|
| 199 |
-
```
|
| 200 |
-
|
| 201 |
Args:
|
| 202 |
query (list[str]): The input query strings to search for relevant chunks.
|
| 203 |
top_k (int, optional): The number of top relevant chunks to retrieve.
|
|
@@ -273,6 +252,7 @@ class NVEmbed2Retriever(weave.Model):
|
|
| 273 |
list: A list of dictionaries, each containing a retrieved chunk and its relevance score.
|
| 274 |
"""
|
| 275 |
query = [
|
| 276 |
-
f"Instruct: Given a question, retrieve passages that answer the question
|
|
|
|
| 277 |
]
|
| 278 |
return self.retrieve(query, top_k, metric)
|
|
|
|
| 177 |
cosine similarity or Euclidean distance. The top-k chunks with the highest similarity scores
|
| 178 |
are returned as a list of dictionaries, each containing a chunk and its corresponding score.
|
| 179 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
Args:
|
| 181 |
query (list[str]): The input query strings to search for relevant chunks.
|
| 182 |
top_k (int, optional): The number of top relevant chunks to retrieve.
|
|
|
|
| 252 |
list: A list of dictionaries, each containing a retrieved chunk and its relevance score.
|
| 253 |
"""
|
| 254 |
query = [
|
| 255 |
+
f"""Instruct: Given a question, retrieve passages that answer the question
|
| 256 |
+
Query: {query}"""
|
| 257 |
]
|
| 258 |
return self.retrieve(query, top_k, metric)
|