Geraldine committed on
Commit
ff06935
·
verified ·
1 Parent(s): 5bbaba7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -39
app.py CHANGED
@@ -15,6 +15,8 @@ from dotenv import load_dotenv
15
  import helpers
16
  from omeka_s_api_client import OmekaSClient, OmekaSClientError
17
  from lancedb_client import LanceDBManager
 
 
18
 
19
  # Load .env for credentials
20
  load_dotenv()
@@ -52,7 +54,7 @@ app.layout = html.Div([
52
  # Header
53
  dbc.NavbarSimple(
54
  children=[],
55
- brand="Omeka S Computer Vision Asistant",
56
  brand_href="/",
57
  color="light",
58
  dark=False,
@@ -70,8 +72,8 @@ app.layout = html.Div([
70
 
71
  # Tabs
72
  dcc.Tabs(id="data-tabs", value="api", children=[
73
- dcc.Tab(label="🔍 From Omeka S", value="omeka"),
74
- dcc.Tab(label="📁 From LanceDB", value="lance")
75
  ]),
76
 
77
  html.Div(id="data-tab-content"),
@@ -204,36 +206,6 @@ app.layout = html.Div([
204
  html.Div(id="status"),
205
  dcc.Store(id="omeka-client-config", storage_type="session"),
206
  ]),
207
-
208
- # Footer
209
- html.Footer([
210
- html.Hr(),
211
- dbc.Container([
212
- dbc.Row([
213
- dbc.Col([
214
- html.Img(src="SmartBibl.IA_Solutions.png", height="50"),
215
- html.Small([
216
- html.Br(),
217
- html.A("Géraldine Geoffroy", href="mailto:[email protected]", className="text-muted")
218
- ])
219
- ]),
220
- dbc.Col([
221
- html.H5("Code source"),
222
- html.Ul([
223
- html.Li(html.A("Github", href="https://github.com/gegedenice/openalex-explorer", className="text-muted", target="_blank"))
224
- ])
225
- ]),
226
- dbc.Col([
227
- html.H5("Ressources"),
228
- html.Ul([
229
- html.Li(html.A("Nomic Atlas", href="https://atlas.nomic.ai/", target="_blank", className="text-muted")),
230
- html.Li(html.A("Model nomic-embed-text-v1.5", href="https://huggingface.co/nomic-ai/nomic-embed-text-v1.5", target="_blank", className="text-muted")),
231
- html.Li(html.A("Model nomic-embed-vision-v1.5", href="https://huggingface.co/nomic-ai/nomic-embed-vision-v1.5", target="_blank", className="text-muted"))
232
- ])
233
- ])
234
- ])
235
- ])
236
- ], className="mt-5 p-3 bg-light border-top")
237
  ])
238
 
239
  # -------------------- UI Callbacks --------------------
@@ -248,7 +220,7 @@ def render_tab_content(tab):
248
  if tab == "omeka":
249
  return html.Div([
250
  html.Div([
251
- html.H5("🔍 From Omeka S", className="mb-3"),
252
  # API URL input with full width
253
  dbc.InputGroup([
254
  dbc.Input(
@@ -308,7 +280,7 @@ def render_tab_content(tab):
308
  ], className="border rounded bg-white shadow-sm")
309
  elif tab == "lance":
310
  return html.Div([
311
- html.H5("📁 From LanceDB"),
312
  dbc.Button("Load LanceDB tables", id="load-tables", color="link", size="sm", className="mt-2"),
313
  dcc.Dropdown(id="db-tables-dropdown", placeholder="Select an existing table"),
314
  dbc.Button("Display Table", id="load-data-db", color="success", size="sm", className="mt-2"),
@@ -409,14 +381,22 @@ def handle_omeka_data(n_clicks, item_set_id, client_config, table_name):
409
 
410
  text_embed = helpers.generate_text_embed(df['text'].tolist())
411
  img_embed = helpers.generate_img_embed(df['images_urls'].tolist())
412
- embeddings = (text_embed + img_embed) / 2 # Average the embeddings
 
 
 
 
 
 
 
 
413
  df["embeddings"] = embeddings.tolist()
414
 
415
- reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, metric='cosine', random_state=42)
416
  umap_embeddings = reducer.fit_transform(embeddings)
417
  df["umap_embeddings"] = umap_embeddings.tolist()
418
 
419
- clusterer = hdbscan.HDBSCAN(min_cluster_size=10)
420
  cluster_labels = clusterer.fit_predict(umap_embeddings)
421
  df["Cluster"] = cluster_labels
422
 
@@ -708,7 +688,7 @@ def create_umap_plot(df):
708
  paper_bgcolor='white',
709
  height=700,
710
  margin=dict(t=30, b=30, l=30, r=30),
711
- showlegend=False,
712
  legend=dict(
713
  yanchor="top",
714
  y=0.99,
 
15
  import helpers
16
  from omeka_s_api_client import OmekaSClient, OmekaSClientError
17
  from lancedb_client import LanceDBManager
18
+ import torch
19
+ import torch.nn.functional as F
20
 
21
  # Load .env for credentials
22
  load_dotenv()
 
54
  # Header
55
  dbc.NavbarSimple(
56
  children=[],
57
+ brand="Omeka S Computer Vision Assistant",
58
  brand_href="/",
59
  color="light",
60
  dark=False,
 
72
 
73
  # Tabs
74
  dcc.Tabs(id="data-tabs", value="api", children=[
75
+ dcc.Tab(label="From Omeka S", value="omeka"),
76
+ dcc.Tab(label="From LanceDB", value="lance")
77
  ]),
78
 
79
  html.Div(id="data-tab-content"),
 
206
  html.Div(id="status"),
207
  dcc.Store(id="omeka-client-config", storage_type="session"),
208
  ]),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  ])
210
 
211
  # -------------------- UI Callbacks --------------------
 
220
  if tab == "omeka":
221
  return html.Div([
222
  html.Div([
223
+ html.H5("From Omeka S", className="mb-3"),
224
  # API URL input with full width
225
  dbc.InputGroup([
226
  dbc.Input(
 
280
  ], className="border rounded bg-white shadow-sm")
281
  elif tab == "lance":
282
  return html.Div([
283
+ html.H5("From LanceDB"),
284
  dbc.Button("Load LanceDB tables", id="load-tables", color="link", size="sm", className="mt-2"),
285
  dcc.Dropdown(id="db-tables-dropdown", placeholder="Select an existing table"),
286
  dbc.Button("Display Table", id="load-data-db", color="success", size="sm", className="mt-2"),
 
381
 
382
  text_embed = helpers.generate_text_embed(df['text'].tolist())
383
  img_embed = helpers.generate_img_embed(df['images_urls'].tolist())
384
+ # Convert both embedding sets to torch tensors
385
+ text_tensor = torch.tensor(text_embed)
386
+ img_tensor = torch.tensor(img_embed)
387
+
388
+ # Weighted average (0.7 text, 0.3 image), then L2-normalize each row
389
+ combined = (0.7 * text_tensor + 0.3 * img_tensor)
390
+ normalized_embeddings = F.normalize(combined, p=2, dim=1)
391
+
392
+ embeddings = normalized_embeddings.numpy()
393
  df["embeddings"] = embeddings.tolist()
394
 
395
+ reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric="cosine")
396
  umap_embeddings = reducer.fit_transform(embeddings)
397
  df["umap_embeddings"] = umap_embeddings.tolist()
398
 
399
+ clusterer = hdbscan.HDBSCAN(min_cluster_size=10, metric="euclidean")
400
  cluster_labels = clusterer.fit_predict(umap_embeddings)
401
  df["Cluster"] = cluster_labels
402
 
 
688
  paper_bgcolor='white',
689
  height=700,
690
  margin=dict(t=30, b=30, l=30, r=30),
691
+ showlegend=True,
692
  legend=dict(
693
  yanchor="top",
694
  y=0.99,