Spaces:

CCockrum
/

LOC-Metadate-Analyzer

Running

File size: 9,541 Bytes

import requests
import pandas as pd
import streamlit as st
import matplotlib
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# ------------------- Custom CSS -------------------
# Stylesheet for the whole page: dark app background, gray content container,
# dark sidebar panel, and the helper classes (.custom-table, .sidebar-stats,
# .sidebar-contrast-block, .sidebar-section, .sidebar-links) that the HTML
# snippets emitted further down refer to.
CUSTOM_CSS = """
    <style>
        html, body, [data-testid="stApp"] {
            background-color: #1A1A1A !important;
        }
        .main {
            background-color: #D3D3D3 !important;
            color: #1A1A1A!important;
        }
        .block-container {
            background-color: gray !important;
            color: #1A1A1A !important;
            padding-left: 2rem !important;
            padding-right: 2rem !important;
        }
        section[data-testid="stSidebar"] > div:first-child {
            background-color: #1A1A1A !important;
            color: #FFFFFF !important;
            padding: 2rem 1.5rem 1.5rem 1.5rem !important;
            border-radius: 12px;
            box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
            font-size: 0.95rem;
        }
        .custom-table {
            background-color: #D3D3D3;
            color: #1A1A1A;
            font-family: monospace;
            padding: 1rem;
            border-radius: 8px;
            overflow-x: auto;
            white-space: pre;
            border: 1px solid #ccc;
        }
        .sidebar-stats {
            color: lightgray !important;
            font-size: 1.1rem !important;
            font-weight: 600;
        }
        .sidebar-contrast-block {
            background-color: #2b2b2b !important;
            padding: 1.25rem;
            border-radius: 10px;
            margin-top: 1.5rem;
        }
        .sidebar-section h3 {
            color: lightgray !important;
            font-size: 1.1rem !important;
            margin-top: 1.5rem;
        }
        .sidebar-links a {
            color: lightgray !important;
            text-decoration: none !important;
        }
        .sidebar-links a:hover {
            text-decoration: underline !important;
        }
    </style>
"""

# unsafe_allow_html is required so the <style> tag is emitted as markup
# instead of being escaped and shown as text.
st.markdown(CUSTOM_CSS, unsafe_allow_html=True)

# ------------------- Banner Image -------------------
# Remote banner rendered at the top of the page, stretched to container width.
BANNER_URL = "https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg"
st.image(BANNER_URL, use_container_width=True)

# ------------------- App Title & Description -------------------
APP_TITLE = "MetaDiscovery Agent for Library of Congress Collections"
APP_DESCRIPTION = """
This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
"""

st.title(APP_TITLE)
st.markdown(APP_DESCRIPTION)

# ------------------- Collection Selection -------------------
# Display label -> query string, already "+"-encoded for direct use in the URL.
collections = {
    "American Revolutionary War Maps": "american+revolutionary+war+maps",
    "Civil War Maps": "civil+war+maps",
    "Women's Suffrage": "women+suffrage",
    "World War I Posters": "world+war+posters"
}

# The sidebar dropdown offers the labels in dictionary (insertion) order.
collection_labels = list(collections)
selected = st.sidebar.selectbox(
    "Select a collection",
    collection_labels,
    key="collection_selector",
)

# Translate the chosen label into the JSON search URL used by the fetch step.
search_query = collections[selected]
collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"

# ------------------- Placeholders -------------------
# Reserve two sidebar slots now so the stats and the completeness table,
# which are only known after the data is fetched, still appear above the
# resource links created below.
stats_placeholder = st.sidebar.empty()
completeness_placeholder = st.sidebar.empty()

# ------------------- Helpful Resources -------------------
# Static list of external links, styled by the .sidebar-section and
# .sidebar-links CSS classes.
RESOURCES_HTML = """
<div class="sidebar-section">
  <h3>🔗 Helpful Resources</h3>
  <div class="sidebar-links">
    <ul style='padding-left: 1em'>
      <li><a href="https://www.loc.gov/apis/" target="_blank">LOC API Info</a></li>
      <li><a href="https://www.loc.gov/" target="_blank">Library of Congress Homepage</a></li>
      <li><a href="https://www.loc.gov/collections/" target="_blank">LOC Digital Collections</a></li>
      <li><a href="https://www.loc.gov/marc/" target="_blank">MARC Metadata Standards</a></li>
      <li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank">LOC Digital Strategy</a></li>
    </ul>
  </div>
</div>
"""
st.sidebar.markdown(RESOURCES_HTML, unsafe_allow_html=True)

# ------------------- Fetch Data -------------------
# Download the JSON search results for the selected collection.
# `records` always ends up as a list: on any network, HTTP or decoding
# failure an error is shown in the UI and the list stays empty, which the
# rest of the script reports as "no records".
records = []
with st.spinner(f"Fetching data for {selected}..."):
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # Without a timeout a stalled connection would block the app forever.
        response = requests.get(collection_url, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as exc:
        # RequestException covers connection/timeout/HTTP-status errors;
        # ValueError covers a body that is not valid JSON.
        st.error(f"Failed to load data from LOC API: {exc}")
    else:
        if isinstance(data, dict):
            # The payload may expose the hits under "results" or "items".
            found = data.get("results") or data.get("items") or []
            # Only a list of records can be iterated by the preparation step.
            records = found if isinstance(found, list) else []
        else:
            st.error("Failed to load data from LOC API: unexpected response format")

# ------------------- Data Preparation -------------------
def _flatten_field(value, separator=", "):
    """Collapse a list-valued metadata field into a single string.

    Lists and tuples are joined with *separator*; every entry is converted
    with ``str`` so non-string entries cannot break the join, and ``None``
    entries are skipped. Any other value is returned unchanged.
    """
    if isinstance(value, (list, tuple)):
        return separator.join(str(part) for part in value if part is not None)
    return value


# Build one flat row per record. Every field goes through _flatten_field so
# each DataFrame cell is a scalar, which the completeness check relies on.
# Entries that are not dicts cannot be read with .get() and are skipped.
items = [
    {
        "id": _flatten_field(record.get("id", "")),
        "title": _flatten_field(record.get("title", "")),
        "date": _flatten_field(record.get("date", "")),
        "subject": _flatten_field(record.get("subject", "")),
        "creator": _flatten_field(record.get("creator", "")),
        # Description fragments read as running text, so join with a space.
        "description": _flatten_field(record.get("description", ""), " "),
    }
    for record in records
    if isinstance(record, dict)
]

metadata_df = pd.DataFrame(items)

# ------------------- Completeness Logic -------------------
def is_incomplete(value):
    """Return True when *value* should count as a missing metadata field.

    A value is missing when it is ``None``, a pandas/NumPy null (NaN, NaT,
    ``pd.NA``), or one of the placeholder strings ``""``, ``"N/A"`` and
    ``"null"``. Containers (lists, tuples, dicts, arrays) are missing only
    when they are empty.
    """
    if value is None:
        return True
    if not pd.api.types.is_scalar(value):
        # pd.isna() on a container returns an element-wise array, and using
        # that array in a boolean expression raises for more than one
        # element, so containers are judged by emptiness instead.
        try:
            return len(value) == 0
        except TypeError:
            # Unsized objects (e.g. zero-dimensional arrays) hold a value.
            return False
    if pd.isna(value):
        return True
    return value in ("", "N/A", "null")

if not metadata_df.empty:
    # Evaluate the missing-value predicate once per cell and reuse the
    # resulting boolean frame for every statistic below.
    # NOTE: DataFrame.map requires pandas >= 2.1.
    missing = metadata_df.map(is_incomplete).astype(bool)
    present = ~missing

    incomplete_mask = missing.any(axis=1)  # rows with at least one gap
    incomplete_count = incomplete_mask.sum()
    total_fields = metadata_df.size
    filled_fields = present.sum().sum()
    overall_percent = (filled_fields / total_fields) * 100
    completeness = present.mean() * 100  # per-column share of filled cells
    completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
    completeness_table = completeness_df.set_index("Field")

    # ------------------- Quick Stats -------------------
    stats_html = f"""
    <div class="sidebar-stats">
        <h3 style="color: lightgray;">📊 Quick Stats</h3>
        <p style="color:lightgray;">Total Records: <b>{len(metadata_df)}</b></p>
        <p style="color:lightgray;">Incomplete Records: <b>{incomplete_count}</b></p>
        <p style="color:lightgray;">Overall Metadata Completeness: <b>{overall_percent:.1f}%</b></p>
    </div>
    """
    stats_placeholder.markdown(stats_html, unsafe_allow_html=True)

    # ------------------- Field Completeness Table -------------------
    # An st.empty() slot holds a single element: each element written into
    # it replaces the previous one. Opening a container inside the slot lets
    # the heading and the table be shown together.
    with completeness_placeholder.container():
        # Each st.markdown call is rendered on its own, so the heading box
        # is closed within the same call rather than in a later one.
        st.markdown("""
            <div style='
                background-color: #2e2e2e;
                padding: 1.2rem;
                border-radius: 10px;
                margin-top: 1.5rem;
                color: lightgray;
            '>
            <h4 style='margin-bottom: 1rem;'>Field Completeness Breakdown</h4>
            </div>
        """, unsafe_allow_html=True)
        st.dataframe(
            completeness_table.style.background_gradient(cmap="Greens").format("{:.1f}%"),
            use_container_width=True,
            height=240
        )

    # ------------------- Main Panel -------------------
    st.subheader("Retrieved Metadata Sample")
    st.dataframe(metadata_df.head())

    # Metadata completeness analysis (enhanced): bar chart of the same
    # per-field percentages shown in the sidebar table.
    st.subheader("📊 Metadata Completeness Analysis")

    fig = px.bar(
        completeness_df,
        x="Field",
        y="Completeness (%)",
        title="Metadata Completeness by Field",
        labels={"Field": "Metadata Field", "Completeness (%)": "Completeness (%)"}
    )
    st.plotly_chart(fig, use_container_width=True)

    # ------------------- Metadata Suggestions -------------------
    # For each record that has a description but no subject, propose the
    # subject of the record whose description is most similar (TF-IDF +
    # cosine similarity).
    st.subheader("✨ Suggested Metadata Enhancements")

    # Missing fields are stored as empty strings rather than NaN, so
    # presence must be decided with the completeness predicate; a plain
    # notnull()/isna() test would treat every row as filled.
    has_description = present["description"]
    has_subject = present["subject"]
    needs_subject = metadata_df[has_description & ~has_subject]
    reference_df = metadata_df[has_description & has_subject]

    if not needs_subject.empty and not reference_df.empty:
        try:
            tfidf = TfidfVectorizer(stop_words='english')
            tfidf_matrix = tfidf.fit_transform(reference_df['description'].astype(str))
            query_matrix = tfidf.transform(needs_subject['description'].astype(str))
            # One row of similarities per record that needs a subject.
            similarity = cosine_similarity(query_matrix, tfidf_matrix)

            suggestions = []
            for title, sims in zip(needs_subject['title'], similarity):
                top_idx = sims.argmax()
                # Zero similarity means no shared vocabulary; argmax would
                # then just pick the first reference row, so skip it.
                if sims[top_idx] <= 0:
                    continue
                suggestions.append((title, reference_df.iloc[top_idx]['subject']))

            if suggestions:
                suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
                st.markdown("<div class='custom-table'>" + suggestions_df.to_markdown(index=False) + "</div>", unsafe_allow_html=True)
            else:
                st.info("No metadata enhancement suggestions available.")
        except Exception as e:
            # UI boundary: report the failure (e.g. an empty TF-IDF
            # vocabulary) instead of aborting the whole page.
            st.error(f"Error generating suggestions: {e}")
    else:
        st.info("Not enough descriptive data to generate metadata suggestions.")
else:
    st.warning("⚠️ No metadata records found for this collection.")