import os
import requests
import pandas as pd
import streamlit as st
import time
import matplotlib
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def is_missing(value):
    """Return True when *value* counts as absent: NaN/None or a blank/whitespace-only string."""
    if pd.isna(value):
        return True
    return not str(value).strip()
# Hugging Face API key, read from the environment (set HF_API before launching).
api_key = os.environ.get("HF_API")
def get_huggingface_suggestions(title, description):
    """Suggest subject headings for a record via zero-shot classification.

    Sends the concatenated title/description to the Hugging Face inference
    API (facebook/bart-large-mnli) with a fixed candidate-label set and
    keeps every label whose score exceeds 0.3.

    Parameters
    ----------
    title, description : str
        Record metadata; either may be empty.

    Returns
    -------
    str or None
        Comma-separated suggested labels, or None when there is no usable
        text, the API reports an error, or no label clears the threshold.
    """
    # FIX: the old guard built f"{title}. {description}".strip(), which yields
    # "." for empty inputs — truthy, so the empty-text check never fired and
    # a meaningless query was sent to the API. Join only non-blank pieces.
    pieces = [str(p).strip() for p in (title, description)]
    full_text = ". ".join(p for p in pieces if p)
    if not full_text:
        return None
    API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-mnli"
    headers = {"Authorization": f"Bearer {api_key}"}
    candidate_labels = [
        "History", "Politics", "Science", "Technology", "Art", "Literature",
        "Education", "Economics", "Military", "Geography", "Sociology",
        "Philosophy", "Religion", "Law", "Medicine", "Engineering",
        "Mathematics", "Computer Science", "Agriculture", "Environment",
        "Maps", "United States", "Civil War", "Revolution", "Posters", "Women's Rights", "World War I"
    ]
    payload = {
        "inputs": full_text,
        "parameters": {
            "candidate_labels": candidate_labels,
            "multi_label": True
        }
    }
    try:
        # Bounded timeout so a hung inference endpoint cannot stall the app.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
        result = response.json()
        # The API returns a dict with "error" while the model is loading/failing.
        if isinstance(result, dict) and "error" in result:
            st.error(f"API error: {result['error']}")
            return None
        labels = [
            label
            for label, score in zip(result.get("labels", []), result.get("scores", []))
            if score > 0.3
        ]
        return ", ".join(labels) if labels else None
    except Exception as e:
        st.error(f"API Error: {e}")
        return None
# Custom CSS
# NOTE(review): the markdown body below is empty, so this currently injects
# nothing — presumably CSS rules were stripped or are still to be added.
st.markdown("""
""", unsafe_allow_html=True)
# FIX: a second, stub definition of get_huggingface_suggestions lived here
# (its body was only API_URL plus a "# Rest of the function code..." comment).
# Python binds the most recent `def`, so the stub silently shadowed the
# working implementation above and made every call fall through to an
# implicit `return None`. The stub has been removed so the real
# implementation defined earlier in this file is the one that runs.
# Use an image from a URL for the banner
st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
# Streamlit app header
st.title("MetaDiscovery Agent for Library of Congress Collections")
# Intro copy explaining what the tool does.
st.markdown("""
This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
""")
# Updated collection URLs using the correct LOC API.
# Maps the human-readable name shown in the sidebar to the URL-encoded
# query string appended to the LOC search endpoint.
collections = dict([
    ("American Revolutionary War Maps", "american+revolutionary+war+maps"),
    ("Civil War Maps", "civil+war+maps"),
    ("Women's Suffrage", "women+suffrage"),
    ("World War I Posters", "world+war+posters"),
])
# Sidebar for selecting collection
# Create empty metadata_df up front so later `.empty` checks are always safe.
metadata_df = pd.DataFrame()

with st.sidebar:
    # Empty styled block kept as a placeholder for sidebar CSS.
    st.markdown("""
    """, unsafe_allow_html=True)
    # The key ensures the widget refreshes properly across reruns.
    selected = st.radio("Select a Collection", list(collections.keys()), key="collection_selector")
    # FIX: this call previously split a plain "..." string literal across two
    # source lines, which is a SyntaxError in Python; collapsed to an empty
    # single-line string (it rendered nothing anyway).
    st.markdown("", unsafe_allow_html=True)

search_query = collections[selected]
# Define the collection URL (fo=json asks the LOC API for a JSON response).
collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"
# Create an empty placeholder for Quick Stats
stats_placeholder = st.sidebar.empty()
# Add a fetch button to make the action explicit
# NOTE(review): fetch_data is hard-coded True, so the guard below always
# runs — the "explicit fetch button" described above was never wired up.
fetch_data = True
if fetch_data:
    # Display a loading spinner while fetching data
    with st.spinner(f"Fetching data for {selected}..."):
        # Spoofed browser User-Agent: the LOC API can reject the default
        # python-requests client string.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/110.0.0.0 Safari/537.36"
        }
        records = []
        try:
            # FIX: added a bounded timeout so a slow/hung LOC endpoint
            # cannot stall the app indefinitely.
            response = requests.get(collection_url, headers=headers, timeout=30)
            response.raise_for_status()
            data = response.json()
            # The LOC API returns hits under "results" (search endpoint)
            # or "items" (collection endpoint), depending on the URL used.
            if "results" in data:
                records = data.get("results", [])
            elif "items" in data:
                records = data.get("items", [])
            else:
                st.error("Unexpected API response structure. No records found.")
            st.write(f"Retrieved {len(records)} records")
        except requests.exceptions.RequestException as e:
            st.error(f"API Connection Error: {e}")
        except ValueError:
            st.error("Failed to parse API response as JSON")
# Extract selected metadata fields into a flat list of dicts, then a DataFrame.
items = []
for record in records:
    if not isinstance(record, dict):
        continue
    # Descriptions sometimes arrive as a list of fragments; flatten to one string.
    raw_desc = record.get("description", "")
    if isinstance(raw_desc, list):
        raw_desc = " ".join(str(part) for part in raw_desc)
    # Subjects may be a list; serialize to a comma-separated string.
    subject_value = record.get("subject")
    if isinstance(subject_value, list):
        subject_text = ", ".join(subject_value)
    else:
        subject_text = record.get("subject", "")
    entry = {
        "id": record.get("id", ""),
        "title": record.get("title", ""),
        "date": record.get("date", ""),
        "subject": subject_text,
        "creator": record.get("creator", ""),
        "description": raw_desc,
    }
    # Fall back to the nested "item" payload when top-level fields are blank.
    if not entry["title"] and "item" in record:
        entry["title"] = record.get("item", {}).get("title", "")
    if not entry["date"] and "item" in record:
        entry["date"] = record.get("item", {}).get("date", "")
    items.append(entry)
metadata_df = pd.DataFrame(items)
# Missing field detection
fields_to_check = ["subject", "creator", "date", "title", "description"]
missing_counts = {}
for field in fields_to_check:
    if field in metadata_df.columns:
        # Count blank/NaN cells per field using the is_missing helper.
        missing = metadata_df[field].apply(is_missing)
        missing_counts[field] = missing.sum()
# Define custom completeness check
# NOTE(review): both missing_counts and is_incomplete are recomputed/redefined
# further down; this first pass is effectively dead code — candidate for cleanup.
def is_incomplete(value):
    return pd.isna(value) or value in ["", "N/A", "null", None]
if not metadata_df.empty:
    # --- Unified Completeness and Missing Fields Analysis ---
    # Define incompleteness at the cell level (re-binds the function of the
    # same name defined above with identical semantics).
    is_incomplete = lambda value: pd.isna(value) or value in ["", "N/A", "null", None]
    # Create a mask for missing values (True where a cell is blank/NaN/"N/A"/"null").
    missing_mask = metadata_df.map(is_incomplete)
    # Compute overall record-level completeness: a record is incomplete
    # when ANY of its fields is missing.
    incomplete_count = missing_mask.any(axis=1).sum()
    total_fields = metadata_df.size
    filled_fields = (~missing_mask).sum().sum()
    overall_percent = (filled_fields / total_fields) * 100
    # Field-specific missing counts (for Missing Metadata Summary).
    # NOTE(review): this overwrites the dict built earlier with is_missing.
    missing_counts = missing_mask.sum().sort_values(ascending=False)
    missing_df = (
        pd.DataFrame(missing_counts)
        .reset_index()
        .rename(columns={"index": "Field", 0: "Missing Count"})
    )
    # Field-level completeness: percent of non-missing cells per column.
    completeness = (~metadata_df.map(is_incomplete)).mean() * 100
    completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
    completeness_table = completeness_df.set_index("Field")
    # Sidebar Quick Stats
    quick_stats = pd.DataFrame({
        "Metric": ["Total Records", "Incomplete Records", "Percent Complete"],
        "Value": [len(metadata_df), incomplete_count, round(overall_percent, 1)]
    })
    # NOTE(review): the "{:.1f}" format also renders the integer counts with
    # a trailing ".0" — confirm that display is intended.
    styled_quick_stats = (
        quick_stats.style
        .hide(axis="index")
        .background_gradient(cmap="Oranges", subset=["Value"])
        .format({"Value": "{:.1f}"})
    )
    # Add an expander and put the dataframe inside it
    with st.sidebar.expander("Quick Stats", expanded=True):
        st.dataframe(
            styled_quick_stats,
            use_container_width=True,
            hide_index=True
        )
# Sidebar: Metadata Missing Stats
# Rebuild the per-field missing table from missing_counts (a Series at this
# point, so .items() yields (field, count) pairs), sorted worst-first.
missing_df = (
    pd.DataFrame(list(missing_counts.items()), columns=["Field", "Missing Count"])
    .sort_values(by="Missing Count", ascending=False)
    .reset_index(drop=True)
)
styled_missing_df = (
    missing_df.style
    .background_gradient(cmap="Blues", subset=["Missing Count"])
    .hide(axis="index")
)
with st.sidebar.expander("🧹 Missing Metadata Summary", expanded=True):
    st.dataframe(
        styled_missing_df,
        use_container_width=True,
        hide_index=True,
        # Grow with the row count but cap the widget at 300px tall.
        height=min(300, len(missing_df) * 35 + 38)
    )
# Calculate Top 10 Subjects
if 'subject' in metadata_df.columns:
    # Subjects are stored as one comma-separated string per record: split,
    # explode to one subject per row, trim whitespace, then count occurrences.
    top_subjects = (
        metadata_df['subject']
        .dropna()
        .str.split(',')
        .explode()
        .str.strip()
        .value_counts()
        .head(10)
        .to_frame(name="Count")
    )
    # Most Common Subjects in Sidebar
    with st.sidebar.expander("Top 10 Most Common Subjects", expanded=True):
        st.dataframe(
            top_subjects.style.background_gradient(cmap="Greens").format("{:.0f}"),
            use_container_width=True,
            height=240
        )
# NOTE(review): this expander body is currently an empty markdown block —
# presumably link content was removed or is still to be added.
with st.sidebar.expander("Helpful Resources", expanded=False):
    st.markdown("""
    """, unsafe_allow_html=True)
# Utility functions for deeper metadata quality analysis
def is_incomplete(value):
    """Return True when *value* is NaN/None or a known 'no data' placeholder."""
    placeholders = ["", "N/A", "null", None]
    return bool(pd.isna(value)) or value in placeholders
def is_valid_date(value):
    """Return True if *value* can be parsed as a date/time by pandas.

    FIX: the original used a bare ``except:``, which also swallows
    KeyboardInterrupt and SystemExit; narrowed to ``except Exception``
    (pd.to_datetime raises ValueError/TypeError/OverflowError and
    parser-specific subclasses for unparseable input).
    """
    try:
        pd.to_datetime(value)
        return True
    except Exception:
        return False
if not metadata_df.empty:
    st.subheader("Retrieved Metadata Sample")
    st.dataframe(metadata_df.head())
    st.subheader("Field Completeness Breakdown")
    # DARK box for the Field Completeness Breakdown (MATCH others!) —
    # currently an empty placeholder block.
    st.markdown("""
    """, unsafe_allow_html=True)
    # Dataframe inside the dark box
    st.dataframe(
        completeness_table.style
        .background_gradient(cmap="Greens")
        .format("{:.1f}%")
        .hide(axis="index"),
        use_container_width=True,
        height=240
    )
    # FIX: previously a plain "..." string literal was split across two source
    # lines here, which is a SyntaxError in Python; collapsed to an empty
    # single-line string (it rendered nothing anyway).
    st.markdown("", unsafe_allow_html=True)
# Identify incomplete records
incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
incomplete_records = metadata_df[incomplete_mask]
# Suggested Metadata Enhancements Section
st.subheader("Suggested Metadata Enhancements")
# Always show the checkbox
use_ai = st.checkbox("Use AI Suggestions", value=True)
# Records that have usable title/description text but no subject.
# FIX: the frame is built with "" (not NaN) for absent fields, so the old
# notnull()/isnull() filter never matched anything; use the text-aware
# is_missing helper so blank strings count as missing too.
has_text = ~(
    metadata_df['title'].apply(is_missing)
    & metadata_df['description'].apply(is_missing)
)
incomplete_with_desc = metadata_df[has_text & metadata_df['subject'].apply(is_missing)]
if not incomplete_with_desc.empty:
    if use_ai:
        suggestions = []
        # Cap the number of Hugging Face API calls per run.
        records_to_process = min(10, len(incomplete_with_desc))
        progress = st.progress(0)
        status = st.empty()
        for i, (idx, row) in enumerate(incomplete_with_desc.iterrows()):
            if i >= records_to_process:
                break
            title = row['title'] if pd.notna(row['title']) else ""
            description = row['description'] if pd.notna(row['description']) else ""
            status.text(f"Analyzing {i+1}/{records_to_process}: {title[:30]}...")
            suggested_subject = get_huggingface_suggestions(title, description)
            if suggested_subject:
                suggestions.append((title, suggested_subject))
            progress.progress((i + 1) / records_to_process)
        status.empty()
        if suggestions:
            suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
            styled_suggestions = (
                suggestions_df.style
                .background_gradient(cmap="Greens", subset=["Suggested Subject"])
                .hide(axis="index")
            )
            # Custom dark card wrapper for the table (currently an empty
            # placeholder block).
            st.markdown("""
            """, unsafe_allow_html=True)
            st.dataframe(
                styled_suggestions,
                use_container_width=True,
                hide_index=True,
                height=min(300, len(suggestions) * 35 + 38)
            )
            # FIX: previously a plain "..." string literal was split across
            # two source lines here (SyntaxError); collapsed to an empty
            # single-line string.
            st.markdown("", unsafe_allow_html=True)
        else:
            st.info("No metadata enhancement suggestions available.")
    else:
        st.info("Enable AI Suggestions to view recommendations.")
else:
    st.success("All records already have subjects or no usable text available.")