CCockrum committed on
Commit
e004315
·
verified ·
1 Parent(s): fc8f7bb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +151 -62
app.py CHANGED
@@ -1,12 +1,63 @@
 
1
  import requests
2
  import pandas as pd
3
- import numpy as np
4
  import streamlit as st
 
5
  import matplotlib
6
  import plotly.express as px
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sklearn.metrics.pairwise import cosine_similarity
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  # Custom CSS
11
  st.markdown("""
12
  <style>
@@ -118,6 +169,11 @@ st.markdown("""
118
  </style>
119
  """, unsafe_allow_html=True)
120
 
 
 
 
 
 
121
  # Use an image from a URL for the banner
122
  st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
123
 
@@ -222,24 +278,50 @@ if fetch_data:
222
 
223
  metadata_df = pd.DataFrame(items)
224
 
 
 
 
 
 
 
 
 
 
225
  # Define custom completeness check
226
  def is_incomplete(value):
227
  return pd.isna(value) or value in ["", "N/A", "null", None]
228
 
229
 
230
  if not metadata_df.empty:
231
- incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
232
- incomplete_count = incomplete_mask.sum()
 
 
 
 
 
 
 
 
233
  total_fields = metadata_df.size
234
- filled_fields = (~metadata_df.map(is_incomplete)).sum().sum()
235
  overall_percent = (filled_fields / total_fields) * 100
 
 
 
 
 
 
 
 
 
236
 
237
  # Field-level completeness
238
  completeness = (~metadata_df.map(is_incomplete)).mean() * 100
239
  completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
240
  completeness_table = completeness_df.set_index("Field")
241
 
242
- # Sidebar Quick Stats (index hidden, orange theme)
243
  quick_stats = pd.DataFrame({
244
  "Metric": ["Total Records", "Incomplete Records", "Percent Complete"],
245
  "Value": [len(metadata_df), incomplete_count, round(overall_percent, 1)]
@@ -280,6 +362,7 @@ if fetch_data:
280
  hide_index=True, # <<< ADD THIS
281
  height=min(300, len(missing_df) * 35 + 38)
282
  )
 
283
  # Calculate Top 10 Subjects
284
  if 'subject' in metadata_df.columns:
285
  top_subjects = (
@@ -339,101 +422,107 @@ if fetch_data:
339
  st.dataframe(metadata_df.head())
340
 
341
 
342
- # Fill the placeholder created earlier
343
  st.subheader("Field Completeness Breakdown")
344
 
 
345
  st.markdown("""
346
  <div style='
347
  background-color: #2e2e2e;
348
- padding: 1.2rem;
349
  border-radius: 10px;
350
  margin-top: 1.5rem;
351
  color: lightgray;
352
  '>
353
  """, unsafe_allow_html=True)
354
 
 
355
  st.dataframe(
356
- completeness_table.style.background_gradient(cmap="Greens").format("{:.1f}%"),
 
 
 
357
  use_container_width=True,
358
  height=240
359
  )
360
 
361
  st.markdown("</div>", unsafe_allow_html=True)
362
 
363
-
364
  # Identify incomplete records
365
  incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
366
  incomplete_records = metadata_df[incomplete_mask]
367
 
368
- st.subheader("Suggested Metadata Enhancements")
 
 
369
 
370
- # Look for records with descriptions but missing subjects or other fields
371
- incomplete_with_desc = metadata_df[metadata_df['description'].notnull() &
372
- (metadata_df['subject'].isnull() |
373
- metadata_df['creator'].isnull())]
374
 
375
- # Reference data should be complete records with both subjects and descriptions
376
- reference_df = metadata_df[metadata_df['subject'].notnull() &
377
- metadata_df['description'].notnull() &
378
- metadata_df['creator'].notnull()]
 
379
 
380
- # Print debugging info
381
- st.write(f"Records with descriptions but missing fields: {len(incomplete_with_desc)}")
382
- st.write(f"Complete reference records: {len(reference_df)}")
 
383
 
384
- tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
385
 
386
- if len(incomplete_with_desc) > 0 and len(reference_df) > 0:
387
- try:
388
- suggestions = []
389
- # Fit TF-IDF on all complete descriptions
390
- tfidf_matrix = tfidf.fit_transform(reference_df['description'].fillna('').astype(str))
391
-
392
- # For each incomplete record
393
- for idx, row in incomplete_with_desc.iterrows():
394
- if pd.notna(row['description']):
395
- # Transform this record's description
396
- desc_vec = tfidf.transform([str(row['description'])])
397
-
398
- # Get similarity scores to all reference records
399
- sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
400
-
401
- # Find the top 3 most similar records
402
- top_indices = sims.argsort()[-3:][::-1]
403
-
404
- # Get the most frequent subject among top matches
405
- top_subjects = reference_df.iloc[top_indices]['subject'].value_counts().index
406
- if len(top_subjects) > 0:
407
- suggested_subject = top_subjects[0]
408
- suggestions.append((row['title'], suggested_subject))
409
-
410
  if suggestions:
411
  suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
412
-
413
- # Apply similar styling as your other tables
414
  styled_suggestions = (
415
  suggestions_df.style
416
  .background_gradient(cmap="Greens", subset=["Suggested Subject"])
417
  .hide(axis="index")
418
  )
419
-
420
- # Display as a dataframe with styling
 
 
 
 
 
 
 
 
 
 
421
  st.dataframe(
422
  styled_suggestions,
423
  use_container_width=True,
424
  hide_index=True,
425
- height=min(240, len(suggestions) * 35 + 38)
426
  )
 
 
427
  else:
428
- empty_df = pd.DataFrame([["No metadata enhancement suggestions available."]],
429
- columns=["Message"])
430
- styled_empty = empty_df.style.hide(axis="index")
431
- st.dataframe(styled_empty, use_container_width=True, hide_index=True)
432
- except Exception as e:
433
- st.error(f"Error generating metadata suggestions: {e}")
434
- st.error(f"Error details: {str(e)}")
435
  else:
436
- empty_df = pd.DataFrame([["Not enough descriptive data to generate metadata suggestions."]],
437
- columns=["Message"])
438
- styled_empty = empty_df.style.hide(axis="index")
439
- st.dataframe(styled_empty, use_container_width=True, hide_index=True)
 
1
+ import os
2
  import requests
3
  import pandas as pd
 
4
  import streamlit as st
5
+ import time
6
  import matplotlib
7
  import plotly.express as px
8
  from sklearn.feature_extraction.text import TfidfVectorizer
9
  from sklearn.metrics.pairwise import cosine_similarity
10
 
11
def is_missing(value):
    """Return True when a metadata cell holds no usable content.

    A value counts as missing when it is ``None``/NaN/``pd.NA``, a string
    that is empty or whitespace-only, or an empty container.

    Args:
        value: A single cell value; may be a scalar, a string, or a
            list-like (multi-valued fields are not passed to ``pd.isna``
            directly, see below).

    Returns:
        bool: True if the value should be treated as missing.
    """
    if isinstance(value, str):
        return not value.strip()

    # pd.isna() on a container returns an element-wise array, and using that
    # array in a boolean context raises "truth value of an array is
    # ambiguous". Containers are therefore judged by emptiness instead.
    if isinstance(value, (list, tuple, set, frozenset, dict)):
        return len(value) == 0

    flag = pd.isna(value)
    if getattr(flag, "ndim", 0):
        # Array-like input (e.g. a numpy array): missing only when empty.
        return getattr(flag, "size", 0) == 0

    return bool(flag) or not str(value).strip()
13
+
14
+ # Load the Hugging Face API key from environment
15
+ api_key = os.getenv('HF_API')
16
+
17
def get_huggingface_suggestions(title, description, *, threshold=0.3, timeout=30):
    """Suggest subject labels for a record via zero-shot classification.

    The record's title and description are sent to the hosted
    ``facebook/bart-large-mnli`` model on the Hugging Face Inference API,
    and every candidate label whose score exceeds ``threshold`` is kept.

    Args:
        title: Record title; ``None`` or blank text is treated as absent.
        description: Record description; ``None`` or blank text is treated
            as absent.
        threshold: Score a label must exceed to be included (default 0.3).
        timeout: Seconds to wait for the API before giving up (default 30).

    Returns:
        A comma-separated string of suggested labels, or ``None`` when
        there is no text to classify, the request fails, the API reports an
        error, or no label clears the threshold. Failures are reported to
        the user through ``st.error``.
    """
    # Build the input from the parts that actually carry text. Formatting
    # both parts into "<title>. <description>" unconditionally always leaves
    # at least ".", so an emptiness check on that string can never trigger.
    parts = [
        text
        for text in (str(part).strip() for part in (title, description) if part is not None)
        if text
    ]
    if not parts:
        return None
    full_text = ". ".join(parts)

    API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-mnli"
    headers = {"Authorization": f"Bearer {api_key}"}

    candidate_labels = [
        "History", "Politics", "Science", "Technology", "Art", "Literature",
        "Education", "Economics", "Military", "Geography", "Sociology",
        "Philosophy", "Religion", "Law", "Medicine", "Engineering",
        "Mathematics", "Computer Science", "Agriculture", "Environment",
        "Maps", "United States", "Civil War", "Revolution", "Posters",
        "Women's Rights", "World War I",
    ]

    payload = {
        "inputs": full_text,
        "parameters": {
            "candidate_labels": candidate_labels,
            "multi_label": True,
        },
    }

    try:
        # Without a timeout a stalled connection would block the app forever.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
        result = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a response body that is not valid JSON.
        st.error(f"API Error: {e}")
        return None

    if not isinstance(result, dict):
        st.error(f"API error: unexpected response format ({type(result).__name__})")
        return None

    if "error" in result:
        st.error(f"API error: {result['error']}")
        return None

    labels = [
        label
        for label, score in zip(result.get("labels") or [], result.get("scores") or [])
        if score > threshold
    ]

    return ", ".join(labels) if labels else None
60
+
61
  # Custom CSS
62
  st.markdown("""
63
  <style>
 
169
  </style>
170
  """, unsafe_allow_html=True)
171
 
172
# NOTE: get_huggingface_suggestions() is defined once, near the top of this
# file, right after the imports. It must not be redefined here: a later
# ``def`` rebinds the name, and the truncated copy that used to sit at this
# point (it only assigned API_URL and so implicitly returned None) replaced
# the working function, making every AI suggestion come back empty.
176
+
177
  # Use an image from a URL for the banner
178
  st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
179
 
 
278
 
279
  metadata_df = pd.DataFrame(items)
280
 
281
+ # Missing field detection
282
+ fields_to_check = ["subject", "creator", "date", "title", "description"]
283
+ missing_counts = {}
284
+
285
+ for field in fields_to_check:
286
+ if field in metadata_df.columns:
287
+ missing = metadata_df[field].apply(is_missing)
288
+ missing_counts[field] = missing.sum()
289
+
290
  # Define custom completeness check
291
  def is_incomplete(value):
292
  return pd.isna(value) or value in ["", "N/A", "null", None]
293
 
294
 
295
  if not metadata_df.empty:
296
+ # --- Unified Completeness and Missing Fields Analysis ---
297
+
298
+ #Define incompleteness at the cell level
299
+ is_incomplete = lambda value: pd.isna(value) or value in ["", "N/A", "null", None]
300
+
301
+ #Create a mask for missing values
302
+ missing_mask = metadata_df.map(is_incomplete)
303
+
304
+ #Compute overall record-level completeness
305
+ incomplete_count = missing_mask.any(axis=1).sum()
306
  total_fields = metadata_df.size
307
+ filled_fields = (~missing_mask).sum().sum()
308
  overall_percent = (filled_fields / total_fields) * 100
309
+
310
+ #Field-specific missing counts (for Missing Metadata Summary)
311
+ missing_counts = missing_mask.sum().sort_values(ascending=False)
312
+ missing_df = (
313
+ pd.DataFrame(missing_counts)
314
+ .reset_index()
315
+ .rename(columns={"index": "Field", 0: "Missing Count"})
316
+ )
317
+
318
 
319
  # Field-level completeness
320
  completeness = (~metadata_df.map(is_incomplete)).mean() * 100
321
  completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
322
  completeness_table = completeness_df.set_index("Field")
323
 
324
+ # Sidebar Quick Stats
325
  quick_stats = pd.DataFrame({
326
  "Metric": ["Total Records", "Incomplete Records", "Percent Complete"],
327
  "Value": [len(metadata_df), incomplete_count, round(overall_percent, 1)]
 
362
  hide_index=True, # <<< ADD THIS
363
  height=min(300, len(missing_df) * 35 + 38)
364
  )
365
+
366
  # Calculate Top 10 Subjects
367
  if 'subject' in metadata_df.columns:
368
  top_subjects = (
 
422
  st.dataframe(metadata_df.head())
423
 
424
 
 
425
  st.subheader("Field Completeness Breakdown")
426
 
427
+ #DARK box for the Field Completeness Breakdown (MATCH others!)
428
  st.markdown("""
429
  <div style='
430
  background-color: #2e2e2e;
431
+ padding: 1.5rem;
432
  border-radius: 10px;
433
  margin-top: 1.5rem;
434
  color: lightgray;
435
  '>
436
  """, unsafe_allow_html=True)
437
 
438
+ #Dataframe inside the dark box
439
  st.dataframe(
440
+ completeness_table.style
441
+ .background_gradient(cmap="Greens")
442
+ .format("{:.1f}%")
443
+ .hide(axis="index"),
444
  use_container_width=True,
445
  height=240
446
  )
447
 
448
  st.markdown("</div>", unsafe_allow_html=True)
449
 
 
450
  # Identify incomplete records
451
  incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
452
  incomplete_records = metadata_df[incomplete_mask]
453
 
454
+
455
+ # Suggested Metadata Enhancements Section
456
+ st.subheader("Suggested Metadata Enhancements")
457
 
458
+ # Always show the checkbox
459
+ use_ai = st.checkbox("Use AI Suggestions", value=True)
 
 
460
 
461
+ # Then check if records exist
462
+ incomplete_with_desc = metadata_df[
463
+ (metadata_df['description'].notnull() | metadata_df['title'].notnull()) &
464
+ (metadata_df['subject'].isnull())
465
+ ]
466
 
467
+ if not incomplete_with_desc.empty:
468
+ if use_ai:
469
+ suggestions = []
470
+ records_to_process = min(10, len(incomplete_with_desc))
471
 
472
+ progress = st.progress(0)
473
+ status = st.empty()
474
+
475
+ for i, (idx, row) in enumerate(incomplete_with_desc.iterrows()):
476
+ if i >= records_to_process:
477
+ break
478
+
479
+ title = row['title'] if pd.notna(row['title']) else ""
480
+ description = row['description'] if pd.notna(row['description']) else ""
481
+
482
+ status.text(f"Analyzing {i+1}/{records_to_process}: {title[:30]}...")
483
+
484
+ suggested_subject = get_huggingface_suggestions(title, description)
485
+
486
+ if suggested_subject:
487
+ suggestions.append((title, suggested_subject))
488
+
489
+ progress.progress((i + 1) / records_to_process)
490
+
491
+ status.empty()
492
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493
  if suggestions:
494
  suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
495
+
 
496
  styled_suggestions = (
497
  suggestions_df.style
498
  .background_gradient(cmap="Greens", subset=["Suggested Subject"])
499
  .hide(axis="index")
500
  )
501
+
502
+ #Custom dark card wrapper for the table
503
+ st.markdown("""
504
+ <div style='
505
+ background-color: #2e2e2e;
506
+ padding: 1.5rem;
507
+ border-radius: 10px;
508
+ margin-top: 1.5rem;
509
+ color: lightgray;
510
+ '>
511
+ """, unsafe_allow_html=True)
512
+
513
  st.dataframe(
514
  styled_suggestions,
515
  use_container_width=True,
516
  hide_index=True,
517
+ height=min(300, len(suggestions) * 35 + 38)
518
  )
519
+
520
+ st.markdown("</div>", unsafe_allow_html=True)
521
  else:
522
+ st.info("No metadata enhancement suggestions available.")
523
+
524
+
525
+ else:
526
+ st.info("Enable AI Suggestions to view recommendations.")
 
 
527
  else:
528
+ st.success("All records already have subjects or no usable text available.")