Spaces: Running on L4
Upload backend/colpali.py with huggingface_hub
backend/colpali.py  CHANGED  (+25 -8)
@@ -132,6 +132,7 @@ def gen_similarity_maps(
         query_embs (torch.Tensor): Query embeddings.
         token_idx_map (dict): Mapping from tokens to their indices.
         images (List[Union[Path, str]]): List of image paths or base64-encoded strings.
+        vespa_sim_maps (List[str]): List of Vespa similarity maps.
 
     Returns:
         List[Dict[str, str]]: A list where each item is a dictionary mapping tokens to base64-encoded blended images.
@@ -178,8 +179,13 @@ def gen_similarity_maps(
         # ... and so on.
         # Now turn these into a tensor of same shape as previous similarity map
         vespa_sim_map_tensor = torch.zeros(
-            …
+            (
+                len(vespa_sim_maps),
+                query_embs.size(dim=1),
+                vit_config.n_patch_per_dim,
+                vit_config.n_patch_per_dim,
             )
+        )
         for idx, vespa_sim_map in enumerate(vespa_sim_maps):
             for cell in vespa_sim_map["similarities"]["cells"]:
                 patch = int(cell["address"]["patch"])
@@ -187,10 +193,17 @@ def gen_similarity_maps(
                     continue
                 query_token = int(cell["address"]["querytoken"])
                 value = cell["value"]
-                vespa_sim_map_tensor[…
+                vespa_sim_map_tensor[
+                    idx,
+                    int(query_token),
+                    int(patch) // vit_config.n_patch_per_dim,
+                    int(patch) % vit_config.n_patch_per_dim,
+                ] = value
 
         # Normalize the similarity map per query token
-        similarity_map_normalized = normalize_similarity_map_per_query_token(vespa_sim_map_tensor)
+        similarity_map_normalized = normalize_similarity_map_per_query_token(
+            vespa_sim_map_tensor
+        )
     else:
         # Preprocess inputs
         print("Computing similarity maps")
@@ -227,7 +240,9 @@ def gen_similarity_maps(
         print(f"Similarity map computation took: {end2 - start2} s")
 
         # Normalize the similarity map per query token
-        similarity_map_normalized = normalize_similarity_map_per_query_token(similarity_map)
+        similarity_map_normalized = normalize_similarity_map_per_query_token(
+            similarity_map
+        )
 
     # Collect the blended images
     start3 = time.perf_counter()
@@ -242,8 +257,8 @@ def gen_similarity_maps(
         # Get the similarity map for this image and the selected token
         sim_map = similarity_map_normalized[idx, token_idx, :, :]  # Shape: (h, w)
 
-        # Move the similarity map to CPU and convert to NumPy array
-        sim_map_np = sim_map.cpu().numpy()
+        # Move the similarity map to CPU, convert to float (as BFloat16 not supported by Numpy) and convert to NumPy array
+        sim_map_np = sim_map.cpu().float().numpy()
 
         # Resize the similarity map to the original image size
         sim_map_img = Image.fromarray(sim_map_np)
@@ -344,7 +359,9 @@ async def query_vespa_default(
     )
     assert response.is_successful(), response.json
     stop = time.perf_counter()
-    print(…
+    print(
+        f"Query time + data transfer took: {stop - start} s, vespa said searchtime was {response.json.get('timing', {}).get('searchtime', -1)} s"
+    )
     open("response.json", "w").write(json.dumps(response.json))
     return format_query_results(query, response)
 
@@ -512,7 +529,7 @@ def add_sim_maps_to_result(
         query_embs=q_embs,
         token_idx_map=token_to_idx,
         images=imgs,
-        vespa_sim_maps=vespa_sim_maps
+        vespa_sim_maps=vespa_sim_maps,
     )
     for single_result, sim_map_dict in zip(result["root"]["children"], sim_map_imgs):
         for token, sim_mapb64 in sim_map_dict.items():
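For context on the tensor construction in the first hunks: Vespa returns mapped tensors as a list of cells, each carrying an address ({"patch": ..., "querytoken": ...}) and a value, and the commit scatters those cells into a dense (n_images, n_query_tokens, n_patch, n_patch) torch tensor. Below is a minimal standalone sketch of that conversion; the out-of-grid guard condition and the dummy payload are assumptions for illustration, and the repo's vit_config is not reproduced here.

import torch

def vespa_cells_to_tensor(vespa_sim_maps, n_query_tokens, n_patch_per_dim):
    # Dense target: one (n_patch, n_patch) map per image per query token.
    out = torch.zeros(len(vespa_sim_maps), n_query_tokens, n_patch_per_dim, n_patch_per_dim)
    for idx, sim_map in enumerate(vespa_sim_maps):
        for cell in sim_map["similarities"]["cells"]:
            patch = int(cell["address"]["patch"])
            # Assumed guard: skip cells that fall outside the image patch grid.
            if patch >= n_patch_per_dim * n_patch_per_dim:
                continue
            query_token = int(cell["address"]["querytoken"])
            # Flat patch index -> (row, col) in the ViT patch grid.
            out[idx, query_token, patch // n_patch_per_dim, patch % n_patch_per_dim] = cell["value"]
    return out

# Dummy payload in the cell format shown in the diff:
cells = {"similarities": {"cells": [
    {"address": {"patch": "0", "querytoken": "0"}, "value": 0.42},
    {"address": {"patch": "33", "querytoken": "1"}, "value": 0.17},
]}}
print(vespa_cells_to_tensor([cells], n_query_tokens=2, n_patch_per_dim=32).shape)
# torch.Size([1, 2, 32, 32])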
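The .cpu().float().numpy() change is the functional fix in this commit: NumPy has no bfloat16 dtype, so calling .numpy() directly on a BFloat16 tensor fails, and upcasting to float32 first avoids that. A quick standalone illustration (not repo code):

import torch

x = torch.rand(2, 2, dtype=torch.bfloat16)
# x.numpy() raises TypeError: Got unsupported ScalarType BFloat16
x_np = x.float().numpy()  # upcast to float32, then convert
print(x_np.dtype)  # float32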
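On the new timing print in query_vespa_default: Vespa query responses can include a timing block (with searchtime, querytime, and summaryfetchtime, in seconds) when presentation timing is enabled, and the .get(..., -1) chain keeps the log line safe when that block is absent. Comparing searchtime against the locally measured stop - start separates backend search latency from network and serialization overhead, which is what the added message surfaces.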