traopia committed on
Commit
e9b2c9e
·
1 Parent(s): c57012a

queries and other fixes

Browse files
Files changed (3) hide show
  1. app_fashionDB.py +116 -9
  2. example_queries.py +28 -0
  3. search_fashionDB.py +44 -12
app_fashionDB.py CHANGED
@@ -4,16 +4,18 @@ import numpy as np
4
  from search_fashionDB import search_images_by_text, get_similar_images, search_images_by_image
5
  import requests
6
  from io import BytesIO
 
7
 
8
  import requests
9
  from io import BytesIO
10
-
11
 
12
  #@st.cache_data(show_spinner="Loading FashionDB...")
13
  def load_data_hf():
14
  # Load the Parquet file directly from Hugging Face
15
  df_url = "https://huggingface.co/datasets/traopia/FashionDB/resolve/main/data_vogue_final.parquet"
16
  df = pd.read_parquet(df_url)
 
17
  df = df.explode("image_urls_sample")
18
  df = df.rename(columns={"image_urls_sample":"url", "URL":"collection"})
19
 
@@ -39,8 +41,10 @@ df, df_fh, df_designers, embeddings, embeddings_urls = load_data_hf()
39
  # Suppose embeddings is a numpy array (N, D) and embeddings_urls is a list of urls/keys
40
  embedding_map = {url: i for i, url in enumerate(embeddings_urls)}
41
 
 
42
  # Filter and search
43
- def filter_and_search(fashion_house, designer, category, season, start_year, end_year, query):
 
44
  filtered = df.copy()
45
 
46
  if fashion_house:
@@ -54,6 +58,30 @@ def filter_and_search(fashion_house, designer, category, season, start_year, end
54
  filtered = filtered[filtered['season'].isin(season)]
55
  filtered = filtered[(filtered['year'] >= start_year) & (filtered['year'] <= end_year)]
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  if query:
58
  image_urls, metadata = search_images_by_text(query, filtered, embeddings, embeddings_urls)
59
  else:
@@ -104,6 +132,20 @@ with gr.Blocks() as demo:
104
  start_year = gr.Slider(label="Start Year", minimum=min_year, maximum=max_year, value=2000, step=1)
105
  end_year = gr.Slider(label="End Year", minimum=min_year, maximum=max_year, value=2024, step=1)
106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  query = gr.Textbox(label="Search by text", placeholder="e.g., pink dress")
108
  search_button = gr.Button("Search")
109
 
@@ -115,13 +157,13 @@ with gr.Blocks() as demo:
115
  metadata_state = gr.State([])
116
  selected_idx = gr.Number(value=0, visible=False)
117
 
118
- def handle_search(fh, dis, cat, sea, sy, ey, q):
119
- imgs, meta = filter_and_search(fh, dis, cat, sea, sy, ey, q)
120
  return imgs, meta, "", [], None
121
 
122
  search_button.click(
123
  handle_search,
124
- inputs=[fashion_house, designer, category, season, start_year, end_year, query],
125
  outputs=[result_gallery, metadata_state, metadata_output, similar_gallery, reference_image]
126
  )
127
 
@@ -174,6 +216,14 @@ with gr.Blocks() as demo:
174
  start_year_img = gr.Slider(label="Start Year", minimum=min_year, maximum=max_year, value=2000, step=1)
175
  end_year_img = gr.Slider(label="End Year", minimum=min_year, maximum=max_year, value=2024, step=1)
176
 
 
 
 
 
 
 
 
 
177
  uploaded_image = gr.Image(label="Upload an image", type="pil")
178
  search_by_image_button = gr.Button("Search by Image")
179
 
@@ -182,23 +232,46 @@ with gr.Blocks() as demo:
182
  uploaded_metadata_output = gr.Markdown()
183
  uploaded_reference_image = gr.Image(label="Reference Image", interactive=False)
184
 
185
- def handle_search_by_image(image, fh, dis, cat, sea, sy, ey):
186
  if image is None:
187
  return [], "Please upload an image first.", None
188
  # Apply filters
189
  filtered_df = df.copy()
190
  if fh: filtered_df = filtered_df[filtered_df["fashion_house"].isin(fh)]
191
- if dis: filtered_df = filtered_df[filtered_df["designer_name"].isin(fh)]
192
  if cat: filtered_df = filtered_df[filtered_df["category"].isin(cat)]
193
  if sea: filtered_df = filtered_df[filtered_df["season"].isin(sea)]
194
  filtered_df = filtered_df[(filtered_df["year"] >= sy) & (filtered_df["year"] <= ey)]
195
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  images, metadata = search_images_by_image(image, filtered_df, embeddings, embeddings_urls)
197
  return images, metadata, ""
198
 
199
  search_by_image_button.click(
200
  handle_search_by_image,
201
- inputs=[uploaded_image, fashion_house_img, designer_img, category_img, season_img, start_year_img, end_year_img],
202
  outputs=[uploaded_result_gallery, uploaded_metadata_state, uploaded_metadata_output]
203
  )
204
 
@@ -257,12 +330,46 @@ with gr.Blocks() as demo:
257
  )
258
 
259
  with gr.Tab("Query on FashionDB"):
260
- with gr.Row():
 
 
261
  gr.Markdown(
262
  "### 🔗 Query FashionDB SPARQL Endpoint\n"
263
  "[Click here to open the SPARQL endpoint](https://fashionwiki.wikibase.cloud/query/)",
264
  elem_id="sparql-link"
265
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266
 
267
  back_button = gr.Button("Back to Home")
268
 
 
4
  from search_fashionDB import search_images_by_text, get_similar_images, search_images_by_image
5
  import requests
6
  from io import BytesIO
7
+ import urllib.parse
8
 
9
  import requests
10
  from io import BytesIO
11
+ from example_queries import EXAMPLE_QUERIES
12
 
13
  #@st.cache_data(show_spinner="Loading FashionDB...")
14
  def load_data_hf():
15
  # Load the Parquet file directly from Hugging Face
16
  df_url = "https://huggingface.co/datasets/traopia/FashionDB/resolve/main/data_vogue_final.parquet"
17
  df = pd.read_parquet(df_url)
18
+ df = df.drop_duplicates(subset=["URL"])
19
  df = df.explode("image_urls_sample")
20
  df = df.rename(columns={"image_urls_sample":"url", "URL":"collection"})
21
 
 
41
  # Suppose embeddings is a numpy array (N, D) and embeddings_urls is a list of urls/keys
42
  embedding_map = {url: i for i, url in enumerate(embeddings_urls)}
43
 
44
+
45
  # Filter and search
46
+ def filter_and_search(fashion_house, designer, category, season, start_year, end_year, query,
47
+ fh_country, fh_city, designer_nationality, designer_birth_year_start, designer_birth_year_end):
48
  filtered = df.copy()
49
 
50
  if fashion_house:
 
58
  filtered = filtered[filtered['season'].isin(season)]
59
  filtered = filtered[(filtered['year'] >= start_year) & (filtered['year'] <= end_year)]
60
 
61
+ # Fashion house filters via df_fh (country, city)
62
+ if (fh_country and len(fh_country) > 0) or (fh_city and len(fh_city) > 0):
63
+ fh_cols = [c for c in ['fashion_house', 'country', 'city'] if c in df_fh.columns]
64
+ if 'fashion_house' in fh_cols:
65
+ merged = filtered.merge(df_fh[fh_cols], on='fashion_house', how='left')
66
+ if fh_country and 'country' in merged.columns:
67
+ merged = merged[merged['country'].isin(fh_country)]
68
+ if fh_city and 'city' in merged.columns:
69
+ merged = merged[merged['city'].isin(fh_city)]
70
+ filtered = merged.drop_duplicates(subset=['url'])
71
+
72
+ # Designer filters via df_designers (nationality, year_birth)
73
+ if (designer_nationality and len(designer_nationality) > 0) or (designer_birth_year_start is not None or designer_birth_year_end is not None):
74
+ des_cols = [c for c in ['designer_name', 'nationality', 'year_birth'] if c in df_designers.columns]
75
+ if 'designer_name' in des_cols:
76
+ merged = filtered.merge(df_designers[des_cols], on='designer_name', how='left')
77
+ if designer_nationality and 'nationality' in merged.columns:
78
+ merged = merged[merged['nationality'].isin(designer_nationality)]
79
+ if (designer_birth_year_start is not None or designer_birth_year_end is not None) and 'year_birth' in merged.columns:
80
+ by_start = designer_birth_year_start if designer_birth_year_start is not None else merged['year_birth'].min()
81
+ by_end = designer_birth_year_end if designer_birth_year_end is not None else merged['year_birth'].max()
82
+ merged = merged[(merged['year_birth'] >= by_start) & (merged['year_birth'] <= by_end)]
83
+ filtered = merged.drop_duplicates(subset=['url'])
84
+
85
  if query:
86
  image_urls, metadata = search_images_by_text(query, filtered, embeddings, embeddings_urls)
87
  else:
 
132
  start_year = gr.Slider(label="Start Year", minimum=min_year, maximum=max_year, value=2000, step=1)
133
  end_year = gr.Slider(label="End Year", minimum=min_year, maximum=max_year, value=2024, step=1)
134
 
135
+ # Additional filters banner for Fashion House and Designer metadata
136
+ with gr.Row():
137
+ fh_countries = sorted(df_fh['country'].dropna().unique()) if 'country' in df_fh.columns else []
138
+ fh_cities = sorted(df_fh['city'].dropna().unique()) if 'city' in df_fh.columns else []
139
+ designer_places = sorted(df_designers['nationality'].dropna().unique()) if 'nationality' in df_designers.columns else []
140
+ birth_year_min = int(df_designers['year_birth'].min()) if 'year_birth' in df_designers.columns else 1900
141
+ birth_year_max = int(df_designers['year_birth'].max()) if 'year_birth' in df_designers.columns else 2024
142
+
143
+ fh_country = gr.Dropdown(label="Country of Fashion House", choices=fh_countries, multiselect=True)
144
+ fh_city = gr.Dropdown(label="HQ of Fashion House", choices=fh_cities, multiselect=True)
145
+ designer_nationality = gr.Dropdown(label="Designer Nationality", choices=designer_places, multiselect=True)
146
+ designer_birth_year_start = gr.Slider(minimum=birth_year_min, maximum=birth_year_max, value=birth_year_min, step=1, label="Designer Birth Year Start")
147
+ designer_birth_year_end = gr.Slider(minimum=birth_year_min, maximum=birth_year_max, value=birth_year_max, step=1, label="Designer Birth Year End")
148
+
149
  query = gr.Textbox(label="Search by text", placeholder="e.g., pink dress")
150
  search_button = gr.Button("Search")
151
 
 
157
  metadata_state = gr.State([])
158
  selected_idx = gr.Number(value=0, visible=False)
159
 
160
def handle_search(fh, dis, cat, sea, sy, ey, q, fh_co, fh_ci, d_pob, d_by_start, d_by_end):
    """Run the combined filter + text search and reset the detail widgets.

    Returns (gallery images, metadata records, cleared metadata markdown,
    cleared similar-images gallery, cleared reference image).
    """
    gallery_images, gallery_meta = filter_and_search(
        fh, dis, cat, sea, sy, ey, q,
        fh_co, fh_ci, d_pob, d_by_start, d_by_end,
    )
    # Blank out metadata text, the similar-images gallery and the reference
    # image so a new search starts from a clean detail pane.
    return gallery_images, gallery_meta, "", [], None
163
 
164
  search_button.click(
165
  handle_search,
166
+ inputs=[fashion_house, designer, category, season, start_year, end_year, query, fh_country, fh_city, designer_nationality, designer_birth_year_start, designer_birth_year_end],
167
  outputs=[result_gallery, metadata_state, metadata_output, similar_gallery, reference_image]
168
  )
169
 
 
216
  start_year_img = gr.Slider(label="Start Year", minimum=min_year, maximum=max_year, value=2000, step=1)
217
  end_year_img = gr.Slider(label="End Year", minimum=min_year, maximum=max_year, value=2024, step=1)
218
 
219
+ # Additional banner for FH/Designer filters in image search
220
+ with gr.Row():
221
+ fh_country_img = gr.Dropdown(label="Country of Fashion House", choices=fh_countries, multiselect=True)
222
+ fh_city_img = gr.Dropdown(label="HQ of Fashion House", choices=fh_cities, multiselect=True)
223
+ designer_nationality_img = gr.Dropdown(label="Designer Nationality", choices=designer_places, multiselect=True)
224
+ designer_birth_year_start_img = gr.Slider(minimum=birth_year_min, maximum=birth_year_max, value=birth_year_min, step=1, label="Designer Birth Year Start")
225
+ designer_birth_year_end_img = gr.Slider(minimum=birth_year_min, maximum=birth_year_max, value=birth_year_max, step=1, label="Designer Birth Year End")
226
+
227
  uploaded_image = gr.Image(label="Upload an image", type="pil")
228
  search_by_image_button = gr.Button("Search by Image")
229
 
 
232
  uploaded_metadata_output = gr.Markdown()
233
  uploaded_reference_image = gr.Image(label="Reference Image", interactive=False)
234
 
235
def handle_search_by_image(image, fh, dis, cat, sea, sy, ey, fh_co, fh_ci, d_pob, d_by_start, d_by_end):
    """Filter the catalogue by the selected facets, then rank it by visual similarity to `image`.

    Returns a 3-tuple (gallery_images, metadata_records, status_markdown)
    matching the click outputs [uploaded_result_gallery,
    uploaded_metadata_state, uploaded_metadata_output].
    """
    if image is None:
        # Bug fix: the prompt must land in the third output (the Markdown
        # widget). The previous `[], "Please upload an image first.", None`
        # put the message into the metadata-state slot and None into the
        # Markdown, so the user never saw it.
        return [], [], "Please upload an image first."

    # Basic catalogue facets.
    filtered_df = df.copy()
    if fh: filtered_df = filtered_df[filtered_df["fashion_house"].isin(fh)]
    if dis: filtered_df = filtered_df[filtered_df["designer_name"].isin(dis)]
    if cat: filtered_df = filtered_df[filtered_df["category"].isin(cat)]
    if sea: filtered_df = filtered_df[filtered_df["season"].isin(sea)]
    filtered_df = filtered_df[(filtered_df["year"] >= sy) & (filtered_df["year"] <= ey)]

    # Fashion-house metadata filters (country / HQ city) via a left join on df_fh.
    if fh_co or fh_ci:
        fh_cols = [c for c in ['fashion_house', 'country', 'city'] if c in df_fh.columns]
        if 'fashion_house' in fh_cols:
            merged = filtered_df.merge(df_fh[fh_cols], on='fashion_house', how='left')
            if fh_co and 'country' in merged.columns:
                merged = merged[merged['country'].isin(fh_co)]
            if fh_ci and 'city' in merged.columns:
                merged = merged[merged['city'].isin(fh_ci)]
            filtered_df = merged.drop_duplicates(subset=['url'])

    # Designer metadata filters (nationality / birth-year range) via df_designers.
    # NOTE(review): the birth-year sliders always supply values, so this
    # branch runs on every search and the range comparison silently drops
    # rows whose designer has no year_birth (NaN fails both >= and <=) —
    # confirm that is intended.
    if d_pob or (d_by_start is not None or d_by_end is not None):
        des_cols = [c for c in ['designer_name', 'nationality', 'year_birth'] if c in df_designers.columns]
        if 'designer_name' in des_cols:
            merged = filtered_df.merge(df_designers[des_cols], on='designer_name', how='left')
            if d_pob and 'nationality' in merged.columns:
                merged = merged[merged['nationality'].isin(d_pob)]
            if (d_by_start is not None or d_by_end is not None) and 'year_birth' in merged.columns:
                # An unset bound falls back to the extreme of the merged data.
                by_start = d_by_start if d_by_start is not None else merged['year_birth'].min()
                by_end = d_by_end if d_by_end is not None else merged['year_birth'].max()
                merged = merged[(merged['year_birth'] >= by_start) & (merged['year_birth'] <= by_end)]
            filtered_df = merged.drop_duplicates(subset=['url'])

    images, metadata = search_images_by_image(image, filtered_df, embeddings, embeddings_urls)
    return images, metadata, ""
271
 
272
  search_by_image_button.click(
273
  handle_search_by_image,
274
+ inputs=[uploaded_image, fashion_house_img, designer_img, category_img, season_img, start_year_img, end_year_img, fh_country_img, fh_city_img, designer_nationality_img, designer_birth_year_start_img, designer_birth_year_end_img],
275
  outputs=[uploaded_result_gallery, uploaded_metadata_state, uploaded_metadata_output]
276
  )
277
 
 
330
  )
331
 
332
  with gr.Tab("Query on FashionDB"):
333
+
334
+ # Front-page SPARQL query UI and examples
335
+ with gr.Accordion("Query FashionDB (SPARQL)", open=True):
336
  gr.Markdown(
337
  "### 🔗 Query FashionDB SPARQL Endpoint\n"
338
  "[Click here to open the SPARQL endpoint](https://fashionwiki.wikibase.cloud/query/)",
339
  elem_id="sparql-link"
340
  )
341
+ with gr.Row():
342
+ example_dropdown = gr.Dropdown(label="Example SPARQL Queries", choices=list(EXAMPLE_QUERIES.keys()))
343
+ query_text = gr.Textbox(label="SPARQL Query", lines=10)
344
+ open_link_md = gr.Markdown()
345
+
346
def on_example_change(example_key):
    """Load the selected example query into the text box and build an editor deep-link."""
    if not example_key or example_key not in EXAMPLE_QUERIES:
        # Nothing selected (or an unknown key): clear both outputs.
        return "", ""
    sparql = EXAMPLE_QUERIES[example_key].strip()
    editor_link = f"[Open in SPARQL Editor](https://fashionwiki.wikibase.cloud/query/#query={urllib.parse.quote(sparql)})"
    return sparql, editor_link
353
+
354
+ example_dropdown.change(
355
+ on_example_change,
356
+ inputs=[example_dropdown],
357
+ outputs=[query_text, open_link_md]
358
+ )
359
+
360
def on_query_change(q):
    """Refresh the SPARQL-editor deep-link whenever the query text changes."""
    text = (q or "").strip()
    if not text:
        # Empty query: hide the link.
        return ""
    return f"[Open in SPARQL Editor](https://fashionwiki.wikibase.cloud/query/#query={urllib.parse.quote(text)})"
366
+
367
+ query_text.change(
368
+ on_query_change,
369
+ inputs=[query_text],
370
+ outputs=[open_link_md]
371
+ )
372
+
373
 
374
  back_button = gr.Button("Back to Home")
375
 
example_queries.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Example SPARQL queries for FashionDB.

Maps a human-readable label (shown in the app's example dropdown) to a
ready-to-run SPARQL query string against the FashionDB Wikibase endpoint
(https://fashionwiki.wikibase.cloud/query/).
"""

# Removed a large commented-out "fashion houses with country and city"
# draft query that was dead code; recover it from version control if needed.
EXAMPLE_QUERIES = {
    # Designers with a birth date (wbt:P3) in the year 1969.
    "which designer were born in 1969": (
        "PREFIX wbt: <https://fashionwiki.wikibase.cloud/prop/direct/>\nPREFIX wb: <https://fashionwiki.wikibase.cloud/entity/>\nPREFIX ps: <https://fashionwiki.wikibase.cloud/prop/statement/> \nPREFIX pq: <https://fashionwiki.wikibase.cloud/prop/qualifier/> \nPREFIX p: <https://fashionwiki.wikibase.cloud/prop/> \nPREFIX prov: <http://www.w3.org/ns/prov#> \nPREFIX xsd: <http://www.w3.org/2001/XMLSchema#>\n\nSELECT ?designerLabel ?birthdate WHERE {\n ?designer wbt:P3 ?birthdate .\n FILTER (YEAR(?birthdate) = 1969)\n\n SERVICE wikibase:label { bd:serviceParam wikibase:language \"en\". }\n}"
    ),

    # Fashion designers (wbt:P2 wb:Q5) educated at (wbt:P9) an entity
    # labelled "Central Saint Martins", with an optional reference URL.
    "Which designers studied at Central Saint Martins?": (
        "PREFIX wbt: <https://fashionwiki.wikibase.cloud/prop/direct/>\nPREFIX wb: <https://fashionwiki.wikibase.cloud/entity/>\nPREFIX pq: <https://fashionwiki.wikibase.cloud/prop/qualifier/> \nPREFIX pr: <https://fashionwiki.wikibase.cloud/prop/reference/>\nPREFIX ps: <https://fashionwiki.wikibase.cloud/prop/statement/> \nPREFIX p: <https://fashionwiki.wikibase.cloud/prop/> \nPREFIX prov: <http://www.w3.org/ns/prov#> \n\nSELECT ?fashion_designerLabel (SAMPLE(?reference_URL) AS ?reference_URL) {\n # Restrict to designers who are instances of fashion designer (Q5)\n\n ?fashion_designer wbt:P2 wb:Q5.\n ?fashion_designer wbt:P9 ?educated_at.\n ?educated_at rdfs:label 'Central Saint Martins'@en . \n\n\n # Retrieve references from the statement\n OPTIONAL {\n ?statement prov:wasDerivedFrom ?reference.\n ?reference pr:P24 ?reference_URL.\n }\n\n # Retrieve labels for the fashion designer\n SERVICE wikibase:label { bd:serviceParam wikibase:language \"en\". } \n} \nGROUP BY ?fashion_designerLabel \nORDER BY ?fashion_designerLabel"
    ),
}
search_fashionDB.py CHANGED
@@ -80,10 +80,17 @@ def search_images_by_image(uploaded_image, df, embeddings,embeddings_urls, top_
80
  sims = cosine_similarity([image_emb], embeddings)[0]
81
  top_indices = np.argsort(sims)[::-1][:top_k]
82
  top_urls = [embeddings_urls[i] for i in top_indices]
83
- metadata = df[df["url"].isin(top_urls)].copy().to_dict(orient="records")
84
-
 
 
 
 
 
 
 
85
 
86
- return top_urls, metadata
87
 
88
 
89
 
@@ -97,16 +104,41 @@ def search_images_by_text(text, df, embeddings, embeddings_urls, top_k=30):
97
  with torch.no_grad():
98
  text_emb = model.get_text_features(**inputs).cpu().numpy()
99
 
100
- df_indices = df.index.to_numpy()
101
- # slice embeddings & urls to match the filtered df
102
- embeddings_filtered = embeddings[df_indices]
 
 
 
 
 
 
 
103
  sims = cosine_similarity(text_emb, embeddings_filtered)[0]
104
- sims = np.asarray(sims).flatten()
105
- top_indices = np.argsort(sims)[::-1][:top_k]
106
- top_urls = [embeddings_urls[i] for i in top_indices]
107
- metadata = df[df["url"].isin(top_urls)].copy().to_dict(orient="records")
108
-
109
- return top_urls, metadata
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  def get_similar_images(df, image_key, embeddings, embedding_map, embeddings_urls, top_k=5):
112
  if image_key not in embedding_map:
 
80
  sims = cosine_similarity([image_emb], embeddings)[0]
81
  top_indices = np.argsort(sims)[::-1][:top_k]
82
  top_urls = [embeddings_urls[i] for i in top_indices]
83
+ # Build metadata in the same order as top_urls
84
+ df_subset = df[df["url"].isin(top_urls)].copy()
85
+ records = df_subset.to_dict(orient="records")
86
+ by_url = {}
87
+ for r in records:
88
+ u = r.get("url")
89
+ if u is not None and u not in by_url:
90
+ by_url[u] = r
91
+ ordered_metadata = [by_url[u] for u in top_urls if u in by_url]
92
 
93
+ return top_urls, ordered_metadata
94
 
95
 
96
 
 
104
  with torch.no_grad():
105
  text_emb = model.get_text_features(**inputs).cpu().numpy()
106
 
107
+ # Build URL -> index map once per call
108
+ url_to_index = {str(url): idx for idx, url in enumerate(embeddings_urls)}
109
+ # Collect indices of embeddings corresponding to filtered df URLs
110
+ filtered_urls = df["url"].astype(str).tolist()
111
+ filtered_indices = [url_to_index[u] for u in filtered_urls if u in url_to_index]
112
+
113
+ if not filtered_indices:
114
+ return [], []
115
+
116
+ embeddings_filtered = embeddings[filtered_indices]
117
  sims = cosine_similarity(text_emb, embeddings_filtered)[0]
118
+ sims = np.asarray(sims).flatten()
119
+
120
+ # Rank within the filtered set
121
+ top_indices_local = np.argsort(sims)[::-1][:top_k]
122
+ # Map local ranks back to URLs in the same order, dedupe while preserving order
123
+ ranked_urls = [embeddings_urls[filtered_indices[i]] for i in top_indices_local]
124
+ seen = set()
125
+ top_urls = []
126
+ for u in ranked_urls:
127
+ if u not in seen:
128
+ seen.add(u)
129
+ top_urls.append(u)
130
+
131
+ # Build metadata in the same order as top_urls
132
+ df_subset = df[df["url"].isin(top_urls)].copy()
133
+ records = df_subset.to_dict(orient="records")
134
+ by_url = {}
135
+ for r in records:
136
+ u = r.get("url")
137
+ if u is not None and u not in by_url:
138
+ by_url[u] = r
139
+ ordered_metadata = [by_url[u] for u in top_urls if u in by_url]
140
+
141
+ return top_urls, ordered_metadata
142
 
143
  def get_similar_images(df, image_key, embeddings, embedding_map, embeddings_urls, top_k=5):
144
  if image_key not in embedding_map: