# CCockrum's picture
# Update app.py
# 8f294a5 verified
# raw
# history blame
# 20.4 kB
import html
import os
import time

import matplotlib
import pandas as pd
import plotly.express as px
import requests
import streamlit as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def is_missing(value):
    """Return True when *value* is NA or its string form is blank.

    A value is blank when ``str(value)`` contains nothing but whitespace.
    """
    if pd.isna(value):
        return True
    return not str(value).strip()
# Hugging Face API token, read once from the HF_API environment variable
# (None when the variable is not set).
api_key = os.getenv('HF_API')


def get_huggingface_suggestions(title, description):
    """Suggest subject labels for a record via zero-shot classification.

    Posts the record's title and description to the hosted
    ``facebook/bart-large-mnli`` model with a fixed list of candidate labels
    and keeps every label whose score is above 0.3.

    Args:
        title: Record title; ``None`` or blank text is treated as absent.
        description: Record description; ``None`` or blank text is treated
            as absent.

    Returns:
        A comma-separated string of the retained labels, or ``None`` when
        there is no text to classify, the request or JSON decoding fails,
        the API reports an error, or no label clears the threshold.
        Failures are also surfaced to the user through ``st.error``.
    """
    API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-mnli"

    # Join only the parts that actually contain text. Formatting both parts
    # into "{title}. {description}" produced "." for two empty inputs, so the
    # empty-text guard never fired and a pointless request was sent.
    text_parts = [
        str(part).strip() for part in (title, description) if part is not None
    ]
    text_parts = [part for part in text_parts if part]
    if not text_parts:
        return None
    full_text = ". ".join(text_parts)

    headers = {"Authorization": f"Bearer {api_key}"}
    candidate_labels = [
        "History", "Politics", "Science", "Technology", "Art", "Literature",
        "Education", "Economics", "Military", "Geography", "Sociology",
        "Philosophy", "Religion", "Law", "Medicine", "Engineering",
        "Mathematics", "Computer Science", "Agriculture", "Environment",
        "Maps", "United States", "Civil War", "Revolution", "Posters", "Women's Rights", "World War I"
    ]
    payload = {
        "inputs": full_text,
        "parameters": {
            "candidate_labels": candidate_labels,
            "multi_label": True
        }
    }

    try:
        # A timeout keeps a stalled endpoint from hanging the whole app run.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
        result = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        st.error(f"API Error: {e}")
        return None

    # Anything other than a JSON object cannot carry labels/scores.
    if not isinstance(result, dict):
        st.error(f"API error: unexpected response format: {result!r}")
        return None
    if "error" in result:
        st.error(f"API error: {result['error']}")
        return None

    labels = [
        label
        for label, score in zip(result.get("labels", []), result.get("scores", []))
        if score > 0.3
    ]
    return ", ".join(labels) if labels else None
# Custom CSS
# Injects one global <style> block (dark page background, headings, sidebar,
# alerts, data frames, helper classes) through st.markdown with
# unsafe_allow_html=True.
# NOTE(review): the stylesheet contains an unmatched "}" directly after the
# .main rule and a stray ";" before the "html, body, ..." rule. Browsers are
# likely to discard the rule that follows each of them (.block-container and
# html/body) - confirm in dev tools before relying on either rule.
# NOTE(review): section[data-testid="stSidebar"] > div:first-child is declared
# twice with different values.
st.markdown("""
<style>
.main {
background-color: #1A1A1A !important; /* dark */
color: #D3D3D3 !important;
}
}
.block-container {
background-color: #D3D3D3 !important;
color: #cccccc !important;
padding-left: 3rem !important;
padding-right: 3rem !important;
max-width: 900px; /* widen main feed */
margin: auto; /* center it */
}
/* Headings */
h1, h2, h3, h4 {
color: #eeeeee !important; /* brighter light gray for headings */
font-weight: 700 !important; /* bold */
margin-bottom: 1rem !important;
}
p, span, div {
color: #cccccc !important;
}
/* Subheaders (optional) */
.stSubheader {
color: #dddddd !important;
font-size: 1.4rem !important;
}
/* Dataframes (optional tweak) */
.stDataFrame {
background-color: #2e2e2e !important;
border-radius: 10px;
padding: 1rem;
}
section[data-testid="stSidebar"] > div:first-child {
background-color: #808080 !important;
padding: 1rem;
border-radius: 0.5rem;
color: #808080 !important;
}
.stMarkdown, .stTextInput, .stDataFrame {
color: #1A1A1A!important;
}
img.banner {
width: 100%;
border-radius: 12px;
margin-bottom: 1rem;
}
.stAlert {
background-color: #f0f0f5 !important;
color: #1A1A1A !important;
padding: 1.25rem !important;
font-size: 1rem !important;
border-radius: 0.5rem !important;
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05) !important;
}
header[data-testid="stHeader"] {
background-color: #1A1A1A !important;
}
section[data-testid="stSidebar"] > div:first-child {
background-color: #1A1A1A !important;
color: #FFFFFF !important;
padding: 2rem 1.5rem 1.5rem 1.5rem !important;
border-radius: 12px;
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
font-size: 0.95rem;
line-height: 1.5;
}
;
html, body, [data-testid="stApp"] {
background-color: #1A1A1A !important;
}
.custom-table {
background-color: #D3D3D3;
color: #1A1A1A;
font-family: monospace;
padding: 1rem;
border-radius: 8px;
overflow-x: auto;
white-space: pre;
border: 1px solid #ccc;
}
.sidebar-stats {
color: lightgray !important;
font-size: 1.1rem !important;
margin-top: 1.5rem;
font-weight: 600;
}
.sidebar-contrast-block {
background-color: #2b2b2b !important;
padding: 1.25rem;
border-radius: 10px;
margin-top: 1.5rem;
}
section.main > div { /* widen main container */
max-width: 95%;
padding-left: 3rem;
padding-right: 3rem;
}
</style>
""", unsafe_allow_html=True)
# Subject suggestions come from get_huggingface_suggestions(), defined earlier
# in this file. A placeholder re-definition used to sit here: its body only
# assigned API_URL, so it always returned None and silently replaced the
# working implementation for every later call. Do not re-declare the function
# at this point.
# Page header: remote banner image, app title, then a short introduction.
_BANNER_URL = "https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg"
_INTRO_TEXT = """
This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
"""

st.image(_BANNER_URL, use_container_width=True)
st.title("MetaDiscovery Agent for Library of Congress Collections")
st.markdown(_INTRO_TEXT)
# Display name -> search query string substituted into the LOC search URL.
collections = {
    "American Revolutionary War Maps": "american+revolutionary+war+maps",
    "Civil War Maps": "civil+war+maps",
    "Women's Suffrage": "women+suffrage",
    "World War I Posters": "world+war+posters",
}

# Start with an empty frame so later emptiness checks always have a value.
metadata_df = pd.DataFrame()

# Sidebar: collection picker. The explicit widget key gives the radio a
# stable identity between reruns.
# NOTE(review): the inline style below has no ";" after "#2b2b2b", so the
# background-color and padding declarations are probably both ignored by the
# browser - confirm before changing, since fixing it alters the layout.
_SELECTOR_BOX_OPEN = """
<div style='
background-color: #2b2b2b
padding: 1.5rem;
border-radius: 12px;
margin-bottom: 1.5rem;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
'>
"""
with st.sidebar:
    st.markdown(_SELECTOR_BOX_OPEN, unsafe_allow_html=True)
    collection_names = list(collections)
    selected = st.radio("Select a Collection", collection_names, key="collection_selector")
    st.markdown("</div>", unsafe_allow_html=True)

# Search endpoint for the chosen collection (JSON output requested via fo=json).
search_query = collections[selected]
collection_url = "https://www.loc.gov/search/?q=" + f"{search_query}" + "&fo=json"

# Reserved sidebar slot for Quick Stats.
stats_placeholder = st.sidebar.empty()

# Fetching is unconditional for now; there is no button behind this flag.
fetch_data = True
def _join_if_list(value, sep):
    """Collapse a list-valued field into one string; return other values unchanged."""
    if isinstance(value, list):
        return sep.join(str(part) for part in value)
    return value


def _record_to_item(record):
    """Flatten one raw result dict into the fields this app analyses."""
    item = {
        "id": record.get("id", ""),
        "title": record.get("title", ""),
        "date": record.get("date", ""),
        "subject": _join_if_list(record.get("subject", ""), ", "),
        # Lists are flattened here as well: a multi-element list reaching the
        # later `pd.isna(value) or ...` checks would raise ValueError.
        "creator": _join_if_list(record.get("creator", ""), ", "),
        "description": _join_if_list(record.get("description", ""), " "),
    }
    # Fall back to the nested "item" object for a missing title/date, but only
    # when it really is a dict.
    nested = record.get("item")
    if isinstance(nested, dict):
        for key in ("title", "date"):
            if not item[key]:
                item[key] = nested.get(key, "")
    return item


if fetch_data:
    records = []
    # Display a loading spinner while fetching data
    with st.spinner(f"Fetching data for {selected}..."):
        # The request is sent with a browser-like User-Agent header.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/110.0.0.0 Safari/537.36"
        }
        try:
            # The timeout keeps a stalled endpoint from freezing the app.
            response = requests.get(collection_url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            st.error(f"API Connection Error: {e}")
        else:
            # Decode separately so a malformed body is reported as a parse
            # failure rather than being caught as a connection error.
            try:
                data = response.json()
            except ValueError:
                st.error("Failed to parse API response as JSON")
            else:
                if isinstance(data, dict) and "results" in data:
                    records = data.get("results", [])
                elif isinstance(data, dict) and "items" in data:
                    records = data.get("items", [])
                else:
                    st.error("Unexpected API response structure. No records found.")
                # A null or scalar payload under the key is treated as empty.
                if not isinstance(records, list):
                    records = []
                st.write(f"Retrieved {len(records)} records")

    # Extract selected metadata fields; non-dict entries are skipped.
    items = [_record_to_item(record) for record in records if isinstance(record, dict)]
    metadata_df = pd.DataFrame(items)

# Missing field detection: per-field count of NA/blank values for the columns
# that are actually present.
fields_to_check = ["subject", "creator", "date", "title", "description"]
missing_counts = {
    field: metadata_df[field].apply(is_missing).sum()
    for field in fields_to_check
    if field in metadata_df.columns
}
# Define custom completeness check
def is_incomplete(value):
    """Return True for NA values and for the placeholders "", "N/A" and "null"."""
    if pd.isna(value):
        return True
    return value in ["", "N/A", "null", None]
if not metadata_df.empty:
    # --- Unified Completeness and Missing Fields Analysis ---
    # Cell-level mask: True wherever is_incomplete() flags the value. It is
    # computed once and reused for every statistic below.
    missing_mask = metadata_df.map(is_incomplete)

    # Record-level and overall completeness.
    incomplete_count = missing_mask.any(axis=1).sum()
    total_fields = metadata_df.size
    filled_fields = (~missing_mask).sum().sum()
    overall_percent = (filled_fields / total_fields) * 100

    # Per-field missing counts, most-missing first.
    missing_counts = missing_mask.sum().sort_values(ascending=False)

    # Field-level completeness as a percentage of records.
    completeness = (~missing_mask).mean() * 100
    completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
    completeness_table = completeness_df.set_index("Field")

    # Sidebar Quick Stats
    quick_stats = pd.DataFrame({
        "Metric": ["Total Records", "Incomplete Records", "Percent Complete"],
        "Value": [len(metadata_df), incomplete_count, round(overall_percent, 1)]
    })
    styled_quick_stats = (
        quick_stats.style
        .hide(axis="index")
        .background_gradient(cmap="Oranges", subset=["Value"])
        .format({"Value": "{:.1f}"})
    )
    with st.sidebar.expander("Quick Stats", expanded=True):
        st.dataframe(
            styled_quick_stats,
            use_container_width=True,
            hide_index=True
        )

    # Sidebar: Metadata Missing Stats
    missing_df = (
        pd.DataFrame(list(missing_counts.items()), columns=["Field", "Missing Count"])
        .sort_values(by="Missing Count", ascending=False)
        .reset_index(drop=True)
    )
    styled_missing_df = (
        missing_df.style
        .background_gradient(cmap="Blues", subset=["Missing Count"])
        .hide(axis="index")
    )
    with st.sidebar.expander("🧹 Missing Metadata Summary", expanded=True):
        st.dataframe(
            styled_missing_df,
            use_container_width=True,
            hide_index=True,
            # Grow with the number of rows, capped at 300 px.
            height=min(300, len(missing_df) * 35 + 38)
        )

    # Calculate Top 10 Subjects
    if 'subject' in metadata_df.columns:
        subject_terms = (
            metadata_df['subject']
            .dropna()
            .str.split(',')
            .explode()
            .str.strip()
        )
        # Records without a subject hold "", which would otherwise be counted
        # as if it were a real subject heading.
        subject_terms = subject_terms[subject_terms != ""]
        top_subjects = (
            subject_terms
            .value_counts()
            .head(10)
            .to_frame(name="Count")
        )
        # Most Common Subjects in Sidebar
        with st.sidebar.expander("Top 10 Most Common Subjects", expanded=True):
            if top_subjects.empty:
                st.write("No subjects available.")
            else:
                st.dataframe(
                    top_subjects.style.background_gradient(cmap="Greens").format("{:.0f}"),
                    use_container_width=True,
                    height=240
                )
# Static reference links, shown in a sidebar expander that starts collapsed.
_RESOURCE_LINKS_HTML = """
<style>
.sidebar-links a {
color: lightgray !important;
text-decoration: none !important;
}
.sidebar-links a:hover {
text-decoration: underline !important;
}
</style>
<div class="sidebar-links">
<ul style='padding-left: 1em'>
<li><a href="https://www.loc.gov/apis/" target="_blank">LOC API Info</a></li>
<li><a href="https://www.loc.gov/" target="_blank">Library of Congress Homepage</a></li>
<li><a href="https://www.loc.gov/collections/" target="_blank">LOC Digital Collections</a></li>
<li><a href="https://www.loc.gov/marc/" target="_blank">MARC Metadata Standards</a></li>
<li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank">LOC Digital Strategy</a></li>
</ul>
</div>
"""

with st.sidebar.expander("Helpful Resources", expanded=False):
    st.markdown(_RESOURCE_LINKS_HTML, unsafe_allow_html=True)
# Utility functions for deeper metadata quality analysis
def is_incomplete(value):
    """Tell whether *value* is NA or one of the placeholders "", "N/A", "null"."""
    placeholders = ["", "N/A", "null", None]
    return True if pd.isna(value) else value in placeholders
def is_valid_date(value):
    """Return True when ``pd.to_datetime`` accepts *value* without raising.

    Only a raised exception counts as invalid; the converted result itself is
    not inspected.

    Args:
        value: The candidate date value (typically a string).

    Returns:
        ``True`` if the conversion succeeds, ``False`` if it raises.
    """
    try:
        pd.to_datetime(value)
    except Exception:
        # Deliberately broad for a yes/no predicate, but unlike a bare
        # ``except:`` this lets KeyboardInterrupt and SystemExit propagate.
        return False
    return True
if not metadata_df.empty:
    # First rows of the retrieved metadata.
    st.subheader("Retrieved Metadata Sample")
    sample_rows = metadata_df.head()
    st.dataframe(sample_rows)

    st.subheader("Field Completeness Breakdown")
    # Dark box markup emitted before the table, matching the other sections.
    completeness_box_open = """
<div style='
background-color: #2e2e2e;
padding: 1.5rem;
border-radius: 10px;
margin-top: 1.5rem;
color: lightgray;
'>
"""
    st.markdown(completeness_box_open, unsafe_allow_html=True)

    # Per-field completeness as whole percentages on a green gradient.
    completeness_style = completeness_table.style.background_gradient(cmap="Greens")
    completeness_style = completeness_style.format("{:.0f}%")
    completeness_style = completeness_style.hide(axis="index")
    st.dataframe(completeness_style, use_container_width=True, height=240)
    st.markdown("</div>", unsafe_allow_html=True)

    # Identify incomplete records: rows with at least one incomplete cell.
    cell_flags = metadata_df.map(is_incomplete)
    incomplete_mask = cell_flags.any(axis=1)
    incomplete_records = metadata_df.loc[incomplete_mask]
# --- Suggested Metadata Enhancements Section ---
def _dark_notice(message):
    """Wrap a one-line message in the dark notice box used by this section."""
    return f"""
<div style="background-color: #1e1e1e; padding: 1.5rem; border-radius: 10px; margin-top: 1rem; color: #e0e0e0;">
{message}
</div>
"""


if not metadata_df.empty:
    st.subheader("Suggested Metadata Enhancements")
    # The checkbox's own label is hidden; the visible label is the markdown
    # line rendered right after it.
    use_ai = st.checkbox("Use AI Suggestions", value=True, label_visibility="hidden")
    st.markdown("🤖 Use AI Suggestions (Hugging Face)")

    # Records that have some text to classify but no subject. Absent fields
    # are stored as "" rather than NaN, so isnull()/notnull() never matched
    # and this selection was always empty; is_incomplete() catches both.
    has_title = ~metadata_df['title'].map(is_incomplete).astype(bool)
    has_description = ~metadata_df['description'].map(is_incomplete).astype(bool)
    lacks_subject = metadata_df['subject'].map(is_incomplete).astype(bool)
    incomplete_with_desc = metadata_df[(has_description | has_title) & lacks_subject]

    if incomplete_with_desc.empty:
        st.markdown(
            _dark_notice("All records already have subjects or no usable text available."),
            unsafe_allow_html=True,
        )
    elif not use_ai:
        st.markdown(
            _dark_notice("Enable AI Suggestions to view recommendations."),
            unsafe_allow_html=True,
        )
    else:
        # At most 10 records are sent to the API per run.
        batch = incomplete_with_desc.head(10)
        records_to_process = len(batch)
        suggestions = []
        progress = st.progress(0)
        status = st.empty()
        for position, (_, row) in enumerate(batch.iterrows(), start=1):
            # Coerce to str so slicing and len() below are always valid.
            title = str(row['title']) if pd.notna(row['title']) else ""
            description = str(row['description']) if pd.notna(row['description']) else ""
            status.text(f"Analyzing {position}/{records_to_process}: {title[:30]}...")
            suggested_subject = get_huggingface_suggestions(title, description)
            if suggested_subject:
                suggestions.append((title, suggested_subject))
            progress.progress(position / records_to_process)
        status.empty()
        progress.empty()

        if suggestions:
            suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
            # Fixed highlight colour for the suggestion cell.
            green_shade = "rgba(0, 100, 0, 0.3)"
            # Custom dark-styled HTML table, assembled from parts.
            table_parts = ["""
<div style="background-color: #1e1e1e; padding: 1.5rem; border-radius: 10px; margin-top: 1rem;">
<table style="width: 100%; border-collapse: collapse; color: #e0e0e0;">
<thead>
<tr style="border-bottom: 1px solid #444;">
<th style="padding: 12px; text-align: left; color: #e0e0e0;">Title</th>
<th style="padding: 12px; text-align: left; color: #e0e0e0;">Suggested Subject</th>
</tr>
</thead>
<tbody>
"""]
            for title, subject in suggestions:
                title_display = title[:50] + "..." if len(title) > 50 else title
                # Titles come from the remote API and are rendered with
                # unsafe_allow_html, so escape them (and the labels) first.
                safe_title = html.escape(title_display)
                safe_subject = html.escape(subject)
                table_parts.append(f"""
<tr style="border-bottom: 1px solid #444;">
<td style="padding: 12px; text-align: left;">{safe_title}</td>
<td style="padding: 12px; text-align: left; background-color: {green_shade};">{safe_subject}</td>
</tr>
""")
            table_parts.append("""
</tbody>
</table>
</div>
""")
            html_table = "".join(table_parts)
            st.markdown(html_table, unsafe_allow_html=True)
        else:
            st.markdown(
                _dark_notice("No metadata enhancement suggestions available."),
                unsafe_allow_html=True,
            )