Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,12 +1,63 @@
|
|
|
|
1 |
import requests
|
2 |
import pandas as pd
|
3 |
-
import numpy as np
|
4 |
import streamlit as st
|
|
|
5 |
import matplotlib
|
6 |
import plotly.express as px
|
7 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
8 |
from sklearn.metrics.pairwise import cosine_similarity
|
9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
# Custom CSS
|
11 |
st.markdown("""
|
12 |
<style>
|
@@ -118,6 +169,11 @@ st.markdown("""
|
|
118 |
</style>
|
119 |
""", unsafe_allow_html=True)
|
120 |
|
|
|
|
|
|
|
|
|
|
|
121 |
# Use an image from a URL for the banner
|
122 |
st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
|
123 |
|
@@ -222,24 +278,50 @@ if fetch_data:
|
|
222 |
|
223 |
metadata_df = pd.DataFrame(items)
|
224 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
225 |
# Define custom completeness check
|
226 |
def is_incomplete(value):
    """Return True when a metadata cell holds no usable value.

    A scalar is incomplete when ``pd.isna`` reports it as null or when it is
    one of the placeholder strings ``""``, ``"N/A"`` or ``"null"``. A
    list-like cell (list, tuple, set, array, Series) is incomplete when it is
    empty or when every entry in it is incomplete.

    Args:
        value: A single cell value taken from the metadata DataFrame.

    Returns:
        bool: True if the cell should be counted as missing.
    """
    # pd.isna() returns an array for list-like input, and using that array in
    # a boolean expression raises ValueError, so list-likes are reduced entry
    # by entry before the scalar check runs. Dicts keep their scalar handling.
    if pd.api.types.is_list_like(value) and not isinstance(value, dict):
        return all(is_incomplete(item) for item in value)
    # None is already covered by pd.isna(), so it is not repeated here.
    return bool(pd.isna(value)) or value in ("", "N/A", "null")
|
228 |
|
229 |
|
230 |
if not metadata_df.empty:
|
231 |
-
|
232 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
233 |
total_fields = metadata_df.size
|
234 |
-
filled_fields = (~
|
235 |
overall_percent = (filled_fields / total_fields) * 100
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
236 |
|
237 |
# Field-level completeness
|
238 |
completeness = (~metadata_df.map(is_incomplete)).mean() * 100
|
239 |
completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
|
240 |
completeness_table = completeness_df.set_index("Field")
|
241 |
|
242 |
-
# Sidebar Quick Stats
|
243 |
quick_stats = pd.DataFrame({
|
244 |
"Metric": ["Total Records", "Incomplete Records", "Percent Complete"],
|
245 |
"Value": [len(metadata_df), incomplete_count, round(overall_percent, 1)]
|
@@ -280,6 +362,7 @@ if fetch_data:
|
|
280 |
hide_index=True, # <<< ADD THIS
|
281 |
height=min(300, len(missing_df) * 35 + 38)
|
282 |
)
|
|
|
283 |
# Calculate Top 10 Subjects
|
284 |
if 'subject' in metadata_df.columns:
|
285 |
top_subjects = (
|
@@ -339,101 +422,107 @@ if fetch_data:
|
|
339 |
st.dataframe(metadata_df.head())
|
340 |
|
341 |
|
342 |
-
# Fill the placeholder created earlier
|
343 |
st.subheader("Field Completeness Breakdown")
|
344 |
|
|
|
345 |
st.markdown("""
|
346 |
<div style='
|
347 |
background-color: #2e2e2e;
|
348 |
-
padding: 1.
|
349 |
border-radius: 10px;
|
350 |
margin-top: 1.5rem;
|
351 |
color: lightgray;
|
352 |
'>
|
353 |
""", unsafe_allow_html=True)
|
354 |
|
|
|
355 |
st.dataframe(
|
356 |
-
completeness_table.style
|
|
|
|
|
|
|
357 |
use_container_width=True,
|
358 |
height=240
|
359 |
)
|
360 |
|
361 |
st.markdown("</div>", unsafe_allow_html=True)
|
362 |
|
363 |
-
|
364 |
# Identify incomplete records
|
365 |
incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
|
366 |
incomplete_records = metadata_df[incomplete_mask]
|
367 |
|
368 |
-
|
|
|
|
|
369 |
|
370 |
-
#
|
371 |
-
|
372 |
-
(metadata_df['subject'].isnull() |
|
373 |
-
metadata_df['creator'].isnull())]
|
374 |
|
375 |
-
#
|
376 |
-
|
377 |
-
|
378 |
-
|
|
|
379 |
|
380 |
-
|
381 |
-
|
382 |
-
|
|
|
383 |
|
384 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
385 |
|
386 |
-
if len(incomplete_with_desc) > 0 and len(reference_df) > 0:
|
387 |
-
try:
|
388 |
-
suggestions = []
|
389 |
-
# Fit TF-IDF on all complete descriptions
|
390 |
-
tfidf_matrix = tfidf.fit_transform(reference_df['description'].fillna('').astype(str))
|
391 |
-
|
392 |
-
# For each incomplete record
|
393 |
-
for idx, row in incomplete_with_desc.iterrows():
|
394 |
-
if pd.notna(row['description']):
|
395 |
-
# Transform this record's description
|
396 |
-
desc_vec = tfidf.transform([str(row['description'])])
|
397 |
-
|
398 |
-
# Get similarity scores to all reference records
|
399 |
-
sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
|
400 |
-
|
401 |
-
# Find the top 3 most similar records
|
402 |
-
top_indices = sims.argsort()[-3:][::-1]
|
403 |
-
|
404 |
-
# Get the most frequent subject among top matches
|
405 |
-
top_subjects = reference_df.iloc[top_indices]['subject'].value_counts().index
|
406 |
-
if len(top_subjects) > 0:
|
407 |
-
suggested_subject = top_subjects[0]
|
408 |
-
suggestions.append((row['title'], suggested_subject))
|
409 |
-
|
410 |
if suggestions:
|
411 |
suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
|
412 |
-
|
413 |
-
# Apply similar styling as your other tables
|
414 |
styled_suggestions = (
|
415 |
suggestions_df.style
|
416 |
.background_gradient(cmap="Greens", subset=["Suggested Subject"])
|
417 |
.hide(axis="index")
|
418 |
)
|
419 |
-
|
420 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
421 |
st.dataframe(
|
422 |
styled_suggestions,
|
423 |
use_container_width=True,
|
424 |
hide_index=True,
|
425 |
-
height=min(
|
426 |
)
|
|
|
|
|
427 |
else:
|
428 |
-
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
|
433 |
-
st.error(f"Error generating metadata suggestions: {e}")
|
434 |
-
st.error(f"Error details: {str(e)}")
|
435 |
else:
|
436 |
-
|
437 |
-
columns=["Message"])
|
438 |
-
styled_empty = empty_df.style.hide(axis="index")
|
439 |
-
st.dataframe(styled_empty, use_container_width=True, hide_index=True)
|
|
|
1 |
+
import os
|
2 |
import requests
|
3 |
import pandas as pd
|
|
|
4 |
import streamlit as st
|
5 |
+
import time
|
6 |
import matplotlib
|
7 |
import plotly.express as px
|
8 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
9 |
from sklearn.metrics.pairwise import cosine_similarity
|
10 |
|
11 |
+
def is_missing(value):
    """Return True when a metadata value is null or blank.

    A scalar is missing when ``pd.isna`` reports it as null or when its string
    form is empty after stripping whitespace. A list-like value (list, tuple,
    set, array, Series) is missing when it is empty or when every entry in it
    is missing.

    Args:
        value: A single cell value taken from the metadata DataFrame.

    Returns:
        bool: True if the value should be counted as missing.
    """
    # pd.isna() returns an array for list-like input, and using that array in
    # a boolean expression raises ValueError, so list-likes are reduced entry
    # by entry before the scalar check runs. Dicts keep their scalar handling.
    if pd.api.types.is_list_like(value) and not isinstance(value, dict):
        return all(is_missing(item) for item in value)
    return bool(pd.isna(value)) or str(value).strip() == ""
|
13 |
+
|
14 |
+
# Load the Hugging Face API key from environment
|
15 |
+
api_key = os.getenv('HF_API')
|
16 |
+
|
17 |
+
def get_huggingface_suggestions(title, description):
    """Suggest subject labels for a record from its title and description.

    The combined text is posted to the hosted ``facebook/bart-large-mnli``
    model together with a fixed list of candidate labels, and every label
    whose score is above 0.3 is kept. Failures are reported to the page with
    ``st.error`` rather than raised.

    Args:
        title: Record title; ``None`` or blank is treated as absent.
        description: Record description; ``None`` or blank is treated as
            absent.

    Returns:
        A comma-separated string of suggested labels, or ``None`` when there
        is no text to classify, the request fails, the API reports an error,
        or no label clears the threshold.
    """
    # Join only the parts that actually contain text. Formatting both values
    # into "{title}. {description}" always leaves at least "." behind, which
    # made the empty-input guard below unreachable.
    parts = [
        str(part).strip() for part in (title, description) if part is not None
    ]
    full_text = ". ".join(part for part in parts if part)

    if not full_text:
        return None

    API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-mnli"
    headers = {"Authorization": f"Bearer {api_key}"}

    candidate_labels = [
        "History", "Politics", "Science", "Technology", "Art", "Literature",
        "Education", "Economics", "Military", "Geography", "Sociology",
        "Philosophy", "Religion", "Law", "Medicine", "Engineering",
        "Mathematics", "Computer Science", "Agriculture", "Environment",
        "Maps", "United States", "Civil War", "Revolution", "Posters", "Women's Rights", "World War I"
    ]

    payload = {
        "inputs": full_text,
        "parameters": {
            "candidate_labels": candidate_labels,
            "multi_label": True
        }
    }

    try:
        # Without a timeout a stalled connection would block the Streamlit
        # script run indefinitely.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
        result = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a response body that is not valid JSON.
        st.error(f"API Error: {e}")
        return None

    # Anything other than a JSON object cannot carry "labels"/"scores".
    if not isinstance(result, dict):
        st.error(f"API Error: unexpected response: {result!r}")
        return None

    if "error" in result:
        st.error(f"API error: {result['error']}")
        return None

    labels = [
        label for label, score in zip(result.get("labels", []), result.get("scores", []))
        if score > 0.3
    ]

    return ", ".join(labels) if labels else None
|
60 |
+
|
61 |
# Custom CSS
|
62 |
st.markdown("""
|
63 |
<style>
|
|
|
169 |
</style>
|
170 |
""", unsafe_allow_html=True)
|
171 |
|
172 |
+
# NOTE: get_huggingface_suggestions() is defined once, near the top of this
# file, right after the HF_API key is loaded. A placeholder re-definition used
# to sit here; its body only assigned API_URL, so it returned None and, being
# the later definition, silently replaced the working implementation. Do not
# re-declare the function at this point.
|
176 |
+
|
177 |
# Use an image from a URL for the banner
|
178 |
st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
|
179 |
|
|
|
278 |
|
279 |
metadata_df = pd.DataFrame(items)
|
280 |
|
281 |
+
# Missing field detection
|
282 |
+
fields_to_check = ["subject", "creator", "date", "title", "description"]
|
283 |
+
missing_counts = {}
|
284 |
+
|
285 |
+
for field in fields_to_check:
|
286 |
+
if field in metadata_df.columns:
|
287 |
+
missing = metadata_df[field].apply(is_missing)
|
288 |
+
missing_counts[field] = missing.sum()
|
289 |
+
|
290 |
# Define custom completeness check
|
291 |
def is_incomplete(value):
    """Return True when a metadata cell holds no usable value.

    A scalar is incomplete when ``pd.isna`` reports it as null or when it is
    one of the placeholder strings ``""``, ``"N/A"`` or ``"null"``. A
    list-like cell (list, tuple, set, array, Series) is incomplete when it is
    empty or when every entry in it is incomplete.

    Args:
        value: A single cell value taken from the metadata DataFrame.

    Returns:
        bool: True if the cell should be counted as missing.
    """
    # pd.isna() returns an array for list-like input, and using that array in
    # a boolean expression raises ValueError, so list-likes are reduced entry
    # by entry before the scalar check runs. Dicts keep their scalar handling.
    if pd.api.types.is_list_like(value) and not isinstance(value, dict):
        return all(is_incomplete(item) for item in value)
    # None is already covered by pd.isna(), so it is not repeated here.
    return bool(pd.isna(value)) or value in ("", "N/A", "null")
|
293 |
|
294 |
|
295 |
if not metadata_df.empty:
|
296 |
+
# --- Unified Completeness and Missing Fields Analysis ---
|
297 |
+
|
298 |
+
# Define what counts as an incomplete value at the cell level
|
299 |
+
def is_incomplete(value):
    """Return True when a metadata cell holds no usable value.

    A scalar is incomplete when ``pd.isna`` reports it as null or when it is
    one of the placeholder strings ``""``, ``"N/A"`` or ``"null"``. A
    list-like cell (list, tuple, set, array, Series) is incomplete when it is
    empty or when every entry in it is incomplete.
    """
    # pd.isna() returns an array for list-like input, and using that array in
    # a boolean expression raises ValueError, so list-likes are reduced entry
    # by entry before the scalar check runs. Dicts keep their scalar handling.
    if pd.api.types.is_list_like(value) and not isinstance(value, dict):
        return all(is_incomplete(item) for item in value)
    # None is already covered by pd.isna(), so it is not repeated here.
    return bool(pd.isna(value)) or value in ("", "N/A", "null")
|
300 |
+
|
301 |
+
#Create a mask for missing values
|
302 |
+
missing_mask = metadata_df.map(is_incomplete)
|
303 |
+
|
304 |
+
# Count records with at least one missing cell, then compute the overall share of filled cells
|
305 |
+
incomplete_count = missing_mask.any(axis=1).sum()
|
306 |
total_fields = metadata_df.size
|
307 |
+
filled_fields = (~missing_mask).sum().sum()
|
308 |
overall_percent = (filled_fields / total_fields) * 100
|
309 |
+
|
310 |
+
#Field-specific missing counts (for Missing Metadata Summary)
|
311 |
+
missing_counts = missing_mask.sum().sort_values(ascending=False)
|
312 |
+
missing_df = (
|
313 |
+
pd.DataFrame(missing_counts)
|
314 |
+
.reset_index()
|
315 |
+
.rename(columns={"index": "Field", 0: "Missing Count"})
|
316 |
+
)
|
317 |
+
|
318 |
|
319 |
# Field-level completeness
|
320 |
completeness = (~metadata_df.map(is_incomplete)).mean() * 100
|
321 |
completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
|
322 |
completeness_table = completeness_df.set_index("Field")
|
323 |
|
324 |
+
# Sidebar Quick Stats
|
325 |
quick_stats = pd.DataFrame({
|
326 |
"Metric": ["Total Records", "Incomplete Records", "Percent Complete"],
|
327 |
"Value": [len(metadata_df), incomplete_count, round(overall_percent, 1)]
|
|
|
362 |
hide_index=True, # <<< ADD THIS
|
363 |
height=min(300, len(missing_df) * 35 + 38)
|
364 |
)
|
365 |
+
|
366 |
# Calculate Top 10 Subjects
|
367 |
if 'subject' in metadata_df.columns:
|
368 |
top_subjects = (
|
|
|
422 |
st.dataframe(metadata_df.head())
|
423 |
|
424 |
|
|
|
425 |
st.subheader("Field Completeness Breakdown")
|
426 |
|
427 |
+
# Dark card wrapper for the Field Completeness Breakdown (same styling as the other sections)
|
428 |
st.markdown("""
|
429 |
<div style='
|
430 |
background-color: #2e2e2e;
|
431 |
+
padding: 1.5rem;
|
432 |
border-radius: 10px;
|
433 |
margin-top: 1.5rem;
|
434 |
color: lightgray;
|
435 |
'>
|
436 |
""", unsafe_allow_html=True)
|
437 |
|
438 |
+
#Dataframe inside the dark box
|
439 |
st.dataframe(
|
440 |
+
completeness_table.style
|
441 |
+
.background_gradient(cmap="Greens")
|
442 |
+
.format("{:.1f}%")
|
443 |
+
.hide(axis="index"),
|
444 |
use_container_width=True,
|
445 |
height=240
|
446 |
)
|
447 |
|
448 |
st.markdown("</div>", unsafe_allow_html=True)
|
449 |
|
|
|
450 |
# Identify incomplete records
|
451 |
incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
|
452 |
incomplete_records = metadata_df[incomplete_mask]
|
453 |
|
454 |
+
|
455 |
+
# Suggested Metadata Enhancements Section
|
456 |
+
st.subheader("Suggested Metadata Enhancements")
|
457 |
|
458 |
+
# Always show the checkbox
|
459 |
+
use_ai = st.checkbox("Use AI Suggestions", value=True)
|
|
|
|
|
460 |
|
461 |
+
# Then check if records exist
|
462 |
+
incomplete_with_desc = metadata_df[
|
463 |
+
(metadata_df['description'].notnull() | metadata_df['title'].notnull()) &
|
464 |
+
(metadata_df['subject'].isnull())
|
465 |
+
]
|
466 |
|
467 |
+
if not incomplete_with_desc.empty:
|
468 |
+
if use_ai:
|
469 |
+
suggestions = []
|
470 |
+
records_to_process = min(10, len(incomplete_with_desc))
|
471 |
|
472 |
+
progress = st.progress(0)
|
473 |
+
status = st.empty()
|
474 |
+
|
475 |
+
for i, (idx, row) in enumerate(incomplete_with_desc.iterrows()):
|
476 |
+
if i >= records_to_process:
|
477 |
+
break
|
478 |
+
|
479 |
+
title = row['title'] if pd.notna(row['title']) else ""
|
480 |
+
description = row['description'] if pd.notna(row['description']) else ""
|
481 |
+
|
482 |
+
status.text(f"Analyzing {i+1}/{records_to_process}: {title[:30]}...")
|
483 |
+
|
484 |
+
suggested_subject = get_huggingface_suggestions(title, description)
|
485 |
+
|
486 |
+
if suggested_subject:
|
487 |
+
suggestions.append((title, suggested_subject))
|
488 |
+
|
489 |
+
progress.progress((i + 1) / records_to_process)
|
490 |
+
|
491 |
+
status.empty()
|
492 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
493 |
if suggestions:
|
494 |
suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
|
495 |
+
|
|
|
496 |
styled_suggestions = (
|
497 |
suggestions_df.style
|
498 |
.background_gradient(cmap="Greens", subset=["Suggested Subject"])
|
499 |
.hide(axis="index")
|
500 |
)
|
501 |
+
|
502 |
+
#Custom dark card wrapper for the table
|
503 |
+
st.markdown("""
|
504 |
+
<div style='
|
505 |
+
background-color: #2e2e2e;
|
506 |
+
padding: 1.5rem;
|
507 |
+
border-radius: 10px;
|
508 |
+
margin-top: 1.5rem;
|
509 |
+
color: lightgray;
|
510 |
+
'>
|
511 |
+
""", unsafe_allow_html=True)
|
512 |
+
|
513 |
st.dataframe(
|
514 |
styled_suggestions,
|
515 |
use_container_width=True,
|
516 |
hide_index=True,
|
517 |
+
height=min(300, len(suggestions) * 35 + 38)
|
518 |
)
|
519 |
+
|
520 |
+
st.markdown("</div>", unsafe_allow_html=True)
|
521 |
else:
|
522 |
+
st.info("No metadata enhancement suggestions available.")
|
523 |
+
|
524 |
+
|
525 |
+
else:
|
526 |
+
st.info("Enable AI Suggestions to view recommendations.")
|
|
|
|
|
527 |
else:
|
528 |
+
st.success("All records already have subjects or no usable text available.")
|
|
|
|
|
|