CCockrum's picture
Update app.py
1b02b65 verified
raw
history blame
16 kB
import requests
import pandas as pd
import numpy as np
import streamlit as st
import matplotlib
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Custom CSS
# Injects a single <style> element that restyles the page: dark (#1A1A1A)
# backgrounds for .main, the header and the sidebar, light-gray text and
# headings, plus helper classes (.custom-table, .sidebar-stats,
# .sidebar-contrast-block). unsafe_allow_html=True is passed so the raw
# <style> markup is emitted instead of being escaped.
# NOTE(review): the stylesheet contains stray tokens -- a lone "}" right after
# the .main rule, a lone ";" just before the html/body rule, and an extra "}"
# before </style>. Browsers presumably discard the rule that follows each
# stray token (.block-container and html/body) -- confirm in devtools before
# cleaning up, because removing the tokens would change the rendered look.
# NOTE(review): the section[data-testid="stSidebar"] selector is declared
# twice; the later rule presumably overrides the earlier one -- verify.
st.markdown("""
<style>
.main {
background-color: #1A1A1A !important; /* dark */
color: #D3D3D3 !important;
}
}
.block-container {
background-color: #D3D3D3 !important;
color: #cccccc !important;
padding-left: 3rem !important;
padding-right: 3rem !important;
max-width: 900px; /* widen main feed */
margin: auto; /* center it */
}
/* Headings */
h1, h2, h3, h4 {
color: #eeeeee !important; /* brighter light gray for headings */
font-weight: 700 !important; /* bold */
margin-bottom: 1rem !important;
}
p, span, div {
color: #cccccc !important;
}
/* Subheaders (optional) */
.stSubheader {
color: #dddddd !important;
font-size: 1.4rem !important;
}
/* Dataframes (optional tweak) */
.stDataFrame {
background-color: #2e2e2e !important;
border-radius: 10px;
padding: 1rem;
}
section[data-testid="stSidebar"] > div:first-child {
background-color: #808080 !important;
padding: 1rem;
border-radius: 0.5rem;
color: #808080 !important;
}
.stMarkdown, .stTextInput, .stDataFrame {
color: #1A1A1A!important;
}
img.banner {
width: 100%;
border-radius: 12px;
margin-bottom: 1rem;
}
.stAlert {
background-color: #f0f0f5 !important;
color: #1A1A1A !important;
padding: 1.25rem !important;
font-size: 1rem !important;
border-radius: 0.5rem !important;
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05) !important;
}
header[data-testid="stHeader"] {
background-color: #1A1A1A !important;
}
section[data-testid="stSidebar"] > div:first-child {
background-color: #1A1A1A !important;
color: #FFFFFF !important;
padding: 2rem 1.5rem 1.5rem 1.5rem !important;
border-radius: 12px;
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
font-size: 0.95rem;
line-height: 1.5;
}
;
html, body, [data-testid="stApp"] {
background-color: #1A1A1A !important;
}
.custom-table {
background-color: #D3D3D3;
color: #1A1A1A;
font-family: monospace;
padding: 1rem;
border-radius: 8px;
overflow-x: auto;
white-space: pre;
border: 1px solid #ccc;
}
.sidebar-stats {
color: lightgray !important;
font-size: 1.1rem !important;
margin-top: 1.5rem;
font-weight: 600;
}
.sidebar-contrast-block {
background-color: #2b2b2b !important;
padding: 1.25rem;
border-radius: 10px;
margin-top: 1.5rem;
}
section.main > div { /* widen main container */
max-width: 95%;
padding-left: 3rem;
padding-right: 3rem;
}
}
</style>
""", unsafe_allow_html=True)
# Banner: remote image (Hugging Face upload CDN) stretched to the container width
banner_url = "https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg"
st.image(banner_url, use_container_width=True)

# Page title followed by a short description of what the tool does
st.title("MetaDiscovery Agent for Library of Congress Collections")
intro_text = """
This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
"""
st.markdown(intro_text)
# Display name -> pre-encoded LOC search query ('+' separates the words)
collections = {
    "American Revolutionary War Maps": "american+revolutionary+war+maps",
    "Civil War Maps": "civil+war+maps",
    "Women's Suffrage": "women+suffrage",
    "World War I Posters": "world+war+posters",
}

# Start from an empty frame so the later `.empty` checks always have a frame to inspect
metadata_df = pd.DataFrame()

# Opening tag of the decorative wrapper emitted just above the collection picker.
# NOTE(review): the background-color declaration has no trailing ';' -- kept
# exactly as written here; confirm the intended look before changing it.
sidebar_box_open = """
<div style='
background-color: #2b2b2b
padding: 1.5rem;
border-radius: 12px;
margin-bottom: 1.5rem;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
'>
"""
collection_names = list(collections)

# Sidebar: radio group for choosing the collection (keyed widget)
with st.sidebar:
    st.markdown(sidebar_box_open, unsafe_allow_html=True)
    selected = st.radio("Select a Collection", collection_names, key="collection_selector")
    st.markdown("</div>", unsafe_allow_html=True)

# Resolve the chosen collection to its search query and build the JSON search URL
search_query = collections[selected]
collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"

# Reserve an empty sidebar slot for Quick Stats
stats_placeholder = st.sidebar.empty()

# Fetching is unconditional: the flag is hard-wired to True
fetch_data = True
if fetch_data:
    # Display a loading spinner while fetching and flattening the records
    with st.spinner(f"Fetching data for {selected}..."):
        # Browser-like User-Agent header sent with the LOC request
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/110.0.0.0 Safari/537.36"
        }
        records = []
        try:
            # The timeout keeps a stalled connection from hanging the app indefinitely;
            # a timeout is reported through the RequestException handler below.
            response = requests.get(collection_url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            st.error(f"API Connection Error: {e}")
        else:
            # JSON decoding is handled separately from transport errors so a bad
            # payload is reported as a parse failure, not as a connection error.
            try:
                data = response.json()
            except ValueError:
                st.error("Failed to parse API response as JSON")
            else:
                # Records are read from "results" first, then from "items";
                # a missing or null list is treated as no records.
                if isinstance(data, dict) and "results" in data:
                    records = data.get("results") or []
                elif isinstance(data, dict) and "items" in data:
                    records = data.get("items") or []
                else:
                    st.error("Unexpected API response structure. No records found.")
                st.write(f"Retrieved {len(records)} records")

        # Extract selected metadata fields into flat rows
        items = []
        for record in records:
            if not isinstance(record, dict):
                continue

            # List-valued fields are flattened to a single string
            description = record.get("description", "")
            if isinstance(description, list):
                description = " ".join(str(d) for d in description)
            subject = record.get("subject", "")
            if isinstance(subject, list):
                subject = ", ".join(str(s) for s in subject)

            item = {
                "id": record.get("id", ""),
                "title": record.get("title", ""),
                "date": record.get("date", ""),
                "subject": subject,
                "creator": record.get("creator", ""),
                "description": description,
            }

            # Fall back to the nested "item" object when title/date are blank
            nested = record.get("item")
            if isinstance(nested, dict):
                if not item["title"]:
                    item["title"] = nested.get("title", "")
                if not item["date"]:
                    item["date"] = nested.get("date", "")

            items.append(item)

        metadata_df = pd.DataFrame(items)
# Define custom completeness check
def is_incomplete(value):
    """Return True when a metadata value should be counted as missing.

    A value is missing when it is ``None``/NaN, one of the placeholder
    strings ``""``, ``"N/A"`` or ``"null"``, or an empty container.

    Args:
        value: A single cell value from the metadata table.

    Returns:
        bool: True if the value is missing, False otherwise.
    """
    # Containers need their own branch: pd.isna() returns an array for them,
    # and an array cannot be used in a boolean `or` expression.
    if isinstance(value, (list, tuple, set, frozenset, dict)):
        return len(value) == 0
    return bool(pd.isna(value)) or value in ("", "N/A", "null")
if not metadata_df.empty:
    # Evaluate every cell once and reuse the boolean table for all statistics
    incomplete_cells = metadata_df.map(is_incomplete)
    filled_cells = ~incomplete_cells

    incomplete_mask = incomplete_cells.any(axis=1)
    incomplete_count = incomplete_mask.sum()
    total_fields = metadata_df.size
    filled_fields = filled_cells.sum().sum()
    overall_percent = (filled_fields / total_fields) * 100

    # Field-level completeness (percentage of filled cells per column)
    completeness = filled_cells.mean() * 100
    completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
    completeness_table = completeness_df.set_index("Field")

    # Sidebar Quick Stats (index hidden, orange theme)
    quick_stats = pd.DataFrame({
        "Metric": ["Total Records", "Incomplete Records", "Percent Complete"],
        "Value": [len(metadata_df), incomplete_count, round(overall_percent, 1)]
    })
    styled_quick_stats = (
        quick_stats.style
        .hide(axis="index")
        .background_gradient(cmap="Oranges", subset=["Value"])
        .format({"Value": "{:.1f}"})
    )
    with st.sidebar.expander("Quick Stats", expanded=True):
        st.dataframe(
            styled_quick_stats,
            use_container_width=True,
            hide_index=True
        )

    # Calculate Top 10 Subjects
    if 'subject' in metadata_df.columns:
        subject_terms = (
            metadata_df['subject']
            .dropna()
            .str.split(',')
            .explode()
            .str.strip()
        )
        # Blank subject strings would otherwise be counted as a subject term
        # of their own, so they are removed before counting.
        subject_terms = subject_terms[subject_terms != ""]
        top_subjects = (
            subject_terms
            .value_counts()
            .head(10)
            .to_frame(name="Count")
        )

        # Most Common Subjects in Sidebar
        with st.sidebar.expander("Top 10 Most Common Subjects", expanded=True):
            if top_subjects.empty:
                # Nothing to rank once blank subjects are excluded
                st.write("No subject terms found in the retrieved records.")
            else:
                st.dataframe(
                    top_subjects.style.background_gradient(cmap="Greens").format("{:.0f}"),
                    use_container_width=True,
                    height=240
                )
# Link list shown in a collapsed sidebar expander; the embedded <style> rules
# colour the links light gray and underline them only on hover.
resources_markup = """
<style>
.sidebar-links a {
color: lightgray !important;
text-decoration: none !important;
}
.sidebar-links a:hover {
text-decoration: underline !important;
}
</style>
<div class="sidebar-links">
<ul style='padding-left: 1em'>
<li><a href="https://www.loc.gov/apis/" target="_blank">LOC API Info</a></li>
<li><a href="https://www.loc.gov/" target="_blank">Library of Congress Homepage</a></li>
<li><a href="https://www.loc.gov/collections/" target="_blank">LOC Digital Collections</a></li>
<li><a href="https://www.loc.gov/marc/" target="_blank">MARC Metadata Standards</a></li>
<li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank">LOC Digital Strategy</a></li>
</ul>
</div>
"""
resources_panel = st.sidebar.expander("Helpful Resources", expanded=False)
with resources_panel:
    st.markdown(resources_markup, unsafe_allow_html=True)
# Utility functions for deeper metadata quality analysis
def is_incomplete(value):
    """Return True when a metadata value should be counted as missing.

    A value is missing when it is ``None``/NaN, one of the placeholder
    strings ``""``, ``"N/A"`` or ``"null"``, or an empty container.

    Args:
        value: A single cell value from the metadata table.

    Returns:
        bool: True if the value is missing, False otherwise.
    """
    # Containers need their own branch: pd.isna() returns an array for them,
    # and an array cannot be used in a boolean `or` expression.
    if isinstance(value, (list, tuple, set, frozenset, dict)):
        return len(value) == 0
    return bool(pd.isna(value)) or value in ("", "N/A", "null")
def is_valid_date(value):
    """Return True if ``pd.to_datetime`` accepts ``value`` without raising.

    Inputs that pandas accepts without an error -- including ``None`` --
    return True; only a parsing failure yields False.

    Args:
        value: Candidate date value to check.

    Returns:
        bool: True when parsing succeeds, False when pandas rejects the value.
    """
    try:
        pd.to_datetime(value)
    # Only parsing failures are caught, so KeyboardInterrupt and SystemExit
    # propagate instead of being reported as an invalid date.
    except (ValueError, TypeError, OverflowError):
        return False
    return True
if not metadata_df.empty:
    st.subheader("Retrieved Metadata Sample")
    st.dataframe(metadata_df.head())

    # Per-field completeness table
    st.subheader("Field Completeness Breakdown")
    st.markdown("""
<div style='
background-color: #2e2e2e;
padding: 1.2rem;
border-radius: 10px;
margin-top: 1.5rem;
color: lightgray;
'>
""", unsafe_allow_html=True)
    st.dataframe(
        completeness_table.style.background_gradient(cmap="Greens").format("{:.1f}%"),
        use_container_width=True,
        height=240
    )
    st.markdown("</div>", unsafe_allow_html=True)

    # Identify incomplete records. Missing fields are stored as "" rather than
    # NaN, so emptiness must be judged with is_incomplete(); isnull()/notnull()
    # would never flag them.
    missing_cells = metadata_df.map(is_incomplete)
    incomplete_mask = missing_cells.any(axis=1)
    incomplete_records = metadata_df[incomplete_mask]

    st.subheader("Suggested Metadata Enhancements")
    has_description = ~missing_cells['description']
    lacks_subject = missing_cells['subject']
    lacks_creator = missing_cells['creator']

    # Records with a description but a missing subject or creator
    incomplete_with_desc = metadata_df[has_description & (lacks_subject | lacks_creator)]
    # Reference data: records with description, subject and creator all present
    reference_df = metadata_df[has_description & ~lacks_subject & ~lacks_creator]

    # Print debugging info
    st.write(f"Records with descriptions but missing fields: {len(incomplete_with_desc)}")
    st.write(f"Complete reference records: {len(reference_df)}")

    tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
    if len(incomplete_with_desc) > 0 and len(reference_df) > 0:
        try:
            suggestions = []
            # Fit TF-IDF on all complete descriptions
            tfidf_matrix = tfidf.fit_transform(reference_df['description'].fillna('').astype(str))
            # Score every incomplete description against every reference in one call
            descriptions = incomplete_with_desc['description'].astype(str)
            similarity = cosine_similarity(tfidf.transform(descriptions), tfidf_matrix)

            for title, sims in zip(incomplete_with_desc['title'], similarity):
                # Positions of the (up to) three most similar reference records
                top_indices = sims.argsort()[-3:][::-1]
                # Most frequent subject among those matches; a separate name is
                # used so the sidebar's top_subjects frame is not overwritten.
                neighbor_subjects = reference_df.iloc[top_indices]['subject'].value_counts().index
                if len(neighbor_subjects) > 0:
                    suggestions.append((title, neighbor_subjects[0]))

            if suggestions:
                suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
                # No colour gradient here: both columns hold text, and a
                # gradient needs numeric data.
                styled_suggestions = suggestions_df.style.hide(axis="index")
                st.dataframe(
                    styled_suggestions,
                    use_container_width=True,
                    hide_index=True,
                    height=min(240, len(suggestions) * 35 + 38)
                )
            else:
                empty_df = pd.DataFrame([["No metadata enhancement suggestions available."]],
                                        columns=["Message"])
                styled_empty = empty_df.style.hide(axis="index")
                st.dataframe(styled_empty, use_container_width=True, hide_index=True)
        except Exception as e:
            # UI boundary: report the failure once instead of crashing the page
            st.error(f"Error generating metadata suggestions: {e}")
    else:
        empty_df = pd.DataFrame([["Not enough descriptive data to generate metadata suggestions."]],
                                columns=["Message"])
        styled_empty = empty_df.style.hide(axis="index")
        st.dataframe(styled_empty, use_container_width=True, hide_index=True)