# Hugging Face Space: CCockrum / LOC-Metadate-Analyzer
# Status: Running
# File size: 16,024 Bytes

import requests
import pandas as pd
import numpy as np
import streamlit as st
import matplotlib
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Custom CSS: inject a <style> block that applies a dark theme to the page,
# the sidebar, headings, dataframes and alerts. unsafe_allow_html=True is
# passed so the raw <style> markup is emitted instead of being escaped.
#
# NOTE(review): the stylesheet below contains a few tokens that look
# accidental and should be checked in the rendered app before any cleanup:
#   * an unmatched "}" right after the `.main` rule,
#   * a lone ";" right after the second sidebar rule,
#   * an unmatched "}" just before `</style>`,
#   * `section[data-testid="stSidebar"] > div:first-child` is declared twice
#     with different colours and padding.
# Browsers presumably discard the rule that directly follows a stray token
# (e.g. `.block-container`), so removing the tokens may visibly change the
# page -- confirm before editing.
st.markdown("""
    <style>
          
        .main {
            background-color: #1A1A1A !important;  /* dark */
            color: #D3D3D3 !important;
        }

        }
        .block-container {
            background-color: #D3D3D3 !important;
            color: #cccccc !important;
            padding-left: 3rem !important;
            padding-right: 3rem !important;
            max-width: 900px;  /* widen main feed */
            margin: auto;  /* center it */
        }
        /* Headings */
        h1, h2, h3, h4 {
            color: #eeeeee !important; /* brighter light gray for headings */
            font-weight: 700 !important; /* bold */
            margin-bottom: 1rem !important;
        }
        p, span, div {
            color: #cccccc !important;
        }
        /* Subheaders (optional) */
        .stSubheader {
            color: #dddddd !important;
            font-size: 1.4rem !important;
        }
        /* Dataframes (optional tweak) */
        .stDataFrame {
            background-color: #2e2e2e !important;
            border-radius: 10px;
            padding: 1rem;
        }
        section[data-testid="stSidebar"] > div:first-child {
            background-color: #808080 !important;
            padding: 1rem;
            border-radius: 0.5rem;
            color: #808080 !important;
        }
        .stMarkdown, .stTextInput, .stDataFrame {
            color: #1A1A1A!important;
        }
        img.banner {
            width: 100%;
            border-radius: 12px;
            margin-bottom: 1rem;
        }
                                 .stAlert {
            background-color: #f0f0f5 !important;
            color: #1A1A1A !important;
            padding: 1.25rem !important;
            font-size: 1rem !important;
            border-radius: 0.5rem !important;
            box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05) !important;
        }
        header[data-testid="stHeader"] {
            background-color: #1A1A1A !important;
        }
    
        section[data-testid="stSidebar"] > div:first-child {
            background-color: #1A1A1A !important;
            color: #FFFFFF !important;
            padding: 2rem 1.5rem 1.5rem 1.5rem !important;
            border-radius: 12px;
            box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
            font-size: 0.95rem;
            line-height: 1.5;
}
;
        html, body, [data-testid="stApp"] {
            background-color: #1A1A1A !important;
        }
        .custom-table {
            background-color: #D3D3D3;
            color: #1A1A1A;
            font-family: monospace;
            padding: 1rem;
            border-radius: 8px;
            overflow-x: auto;
            white-space: pre;
            border: 1px solid #ccc;

        }
        .sidebar-stats {
            color: lightgray !important;
            font-size: 1.1rem !important;
            margin-top: 1.5rem;
            font-weight: 600;
        }
        .sidebar-contrast-block {
            background-color: #2b2b2b !important;
            padding: 1.25rem;
            border-radius: 10px;
            margin-top: 1.5rem;
}
        section.main > div {  /* widen main container */
            max-width: 95%;
            padding-left: 3rem;
            padding-right: 3rem;
}

        }

</style>
""", unsafe_allow_html=True)

# Banner: a remotely hosted image stretched to the container width.
st.image(
    "https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg",
    use_container_width=True,
)

# Page title followed by a short description of what the tool does.
st.title("MetaDiscovery Agent for Library of Congress Collections")
st.markdown("""
This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
""")

# Display name -> search terms, already "+"-joined for use in the query string.
collections = {
    "American Revolutionary War Maps": "american+revolutionary+war+maps",
    "Civil War Maps": "civil+war+maps",
    "Women's Suffrage": "women+suffrage",
    "World War I Posters": "world+war+posters",
}

# Start with an empty frame so later `metadata_df.empty` checks are always valid.
metadata_df = pd.DataFrame()

# Sidebar: a radio group picks the collection. The fixed widget key keeps the
# selection tied to the same widget across reruns.
# NOTE(review): the inline style has no ";" after "#2b2b2b", and the <div> is
# opened and closed in separate markdown calls -- confirm in the rendered app
# whether this wrapper has any visible effect.
with st.sidebar:
    st.markdown("""
    <div style='
        background-color: #2b2b2b
        padding: 1.5rem;
        border-radius: 12px;
        margin-bottom: 1.5rem;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    '>
    """, unsafe_allow_html=True)

    collection_names = list(collections)
    selected = st.radio("Select a Collection", collection_names, key="collection_selector")

    st.markdown("</div>", unsafe_allow_html=True)

# Search terms for the chosen collection and the JSON search URL built from them.
search_query = collections[selected]
collection_url = "https://www.loc.gov/search/?q=" + search_query + "&fo=json"

# Empty sidebar slot reserved for Quick Stats.
stats_placeholder = st.sidebar.empty()

# Fetching is unconditional: the flag is always True, no button is involved.
fetch_data = True
    
if fetch_data:
    # Display a loading spinner while fetching data
    with st.spinner(f"Fetching data for {selected}..."):

        def _join_if_list(value, separator):
            """Collapse a list into one string; return any other value unchanged."""
            if isinstance(value, list):
                return separator.join(str(part) for part in value)
            return value

        # The request is sent with a browser-like User-Agent header.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/110.0.0.0 Safari/537.36"
        }

        # Defaults used whenever the request or the parsing fails.
        records = []
        data = None

        try:
            # Without a timeout a stalled connection would block the app
            # indefinitely; 30 seconds bounds the wait.
            response = requests.get(collection_url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            st.error(f"API Connection Error: {e}")
        else:
            # Parsing is handled separately so a malformed body is reported as
            # a JSON problem rather than as a connection problem.
            try:
                data = response.json()
            except ValueError:
                st.error("Failed to parse API response as JSON")
            else:
                # Only a JSON object can carry the "results"/"items" keys.
                payload = data if isinstance(data, dict) else {}
                if "results" in payload:
                    records = payload.get("results") or []
                elif "items" in payload:
                    records = payload.get("items") or []
                else:
                    st.error("Unexpected API response structure. No records found.")
                if not isinstance(records, list):
                    records = []
                st.write(f"Retrieved {len(records)} records")

        # Extract selected metadata fields
        items = []
        for record in records:
            if not isinstance(record, dict):
                continue

            item = {
                "id": record.get("id", ""),
                "title": record.get("title", ""),
                "date": record.get("date", ""),
                # List-valued subjects become one comma-separated string.
                "subject": _join_if_list(record.get("subject", ""), ", "),
                "creator": record.get("creator", ""),
                # List-valued descriptions become one space-separated string.
                "description": _join_if_list(record.get("description", ""), " "),
            }

            # Fall back to the nested "item" object for a missing title/date,
            # but only when that object really is a mapping.
            nested = record.get("item")
            if isinstance(nested, dict):
                if not item["title"]:
                    item["title"] = nested.get("title", "")
                if not item["date"]:
                    item["date"] = nested.get("date", "")

            items.append(item)

        metadata_df = pd.DataFrame(items)
        
        # Define custom completeness check
        def is_incomplete(value):
            """Return True when a metadata cell should count as missing.

            Missing means: None/NaN/NaT, one of the placeholder strings
            "", "N/A" or "null", or an empty container. Non-empty containers
            (for example a list of names) count as filled.
            """
            if isinstance(value, str):
                return value in ("", "N/A", "null")
            if pd.api.types.is_scalar(value):
                # Covers None, NaN, NaT and ordinary numbers.
                return bool(pd.isna(value))
            # pd.isna() on a container returns an array, which cannot be used
            # in a boolean context; judge containers by emptiness instead.
            try:
                return len(value) == 0
            except TypeError:
                return False
        
        
        if not metadata_df.empty:
            # Apply the completeness predicate once; every statistic below is
            # derived from this boolean frame (True = cell is missing).
            missing_cells = metadata_df.map(is_incomplete).astype(bool)
            present_cells = ~missing_cells

            incomplete_mask = missing_cells.any(axis=1)
            incomplete_count = incomplete_mask.sum()
            total_fields = metadata_df.size
            filled_fields = present_cells.sum().sum()
            overall_percent = (filled_fields / total_fields) * 100

            # Field-level completeness: share of filled cells per column.
            completeness = present_cells.mean() * 100
            completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
            completeness_table = completeness_df.set_index("Field")

            # Sidebar Quick Stats (index hidden, orange gradient on the values)
            quick_stats = pd.DataFrame({
                "Metric": ["Total Records", "Incomplete Records", "Percent Complete"],
                "Value": [len(metadata_df), incomplete_count, round(overall_percent, 1)]
            })

            styled_quick_stats = (
                quick_stats.style
                .hide(axis="index")
                .background_gradient(cmap="Oranges", subset=["Value"])
                .format({"Value": "{:.1f}"})
            )

            with st.sidebar.expander("Quick Stats", expanded=True):
                st.dataframe(
                    styled_quick_stats,
                    use_container_width=True,
                    hide_index=True
                )

            # Top 10 subjects. Start from an empty table so the name exists
            # even when the frame has no 'subject' column.
            top_subjects = pd.DataFrame(columns=["Count"])
            if 'subject' in metadata_df.columns:
                subject_terms = (
                    metadata_df['subject']
                    .dropna()
                    .astype(str)
                    .str.split(',')
                    .explode()
                    .str.strip()
                )
                top_subjects = (
                    # Blank entries come from records without any subject and
                    # must not be ranked as if they were a subject term.
                    subject_terms[subject_terms != ""]
                    .value_counts()
                    .head(10)
                    .to_frame(name="Count")
                )

            # Most Common Subjects in Sidebar
            with st.sidebar.expander("Top 10 Most Common Subjects", expanded=True):
                if top_subjects.empty:
                    st.write("No subject terms available.")
                else:
                    st.dataframe(
                        top_subjects.style.background_gradient(cmap="Greens").format("{:.0f}"),
                        use_container_width=True,
                        height=240
                    )

        # Collapsed sidebar panel listing external Library of Congress links.
        # The markup carries its own <style> rules for the link colours.
        resources_panel = st.sidebar.expander("Helpful Resources", expanded=False)
        resources_panel.markdown("""
                <style>
                    .sidebar-links a {
                        color: lightgray !important;
                        text-decoration: none !important;
                    }
                    .sidebar-links a:hover {
                        text-decoration: underline !important;
                    }
                </style>
                <div class="sidebar-links">
                  <ul style='padding-left: 1em'>
                    <li><a href="https://www.loc.gov/apis/" target="_blank">LOC API Info</a></li>
                    <li><a href="https://www.loc.gov/" target="_blank">Library of Congress Homepage</a></li>
                    <li><a href="https://www.loc.gov/collections/" target="_blank">LOC Digital Collections</a></li>
                    <li><a href="https://www.loc.gov/marc/" target="_blank">MARC Metadata Standards</a></li>
                    <li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank">LOC Digital Strategy</a></li>
                  </ul>
                </div>
            """, unsafe_allow_html=True)

        # Utility functions for deeper metadata quality analysis
        def is_incomplete(value):
            """Return True when a metadata cell should count as missing.

            Missing means: None/NaN/NaT, one of the placeholder strings
            "", "N/A" or "null", or an empty container. Non-empty containers
            (for example a list of names) count as filled.
            """
            if isinstance(value, str):
                return value in ("", "N/A", "null")
            if pd.api.types.is_scalar(value):
                # Covers None, NaN, NaT and ordinary numbers.
                return bool(pd.isna(value))
            # pd.isna() on a container returns an array, which cannot be used
            # in a boolean context; judge containers by emptiness instead.
            try:
                return len(value) == 0
            except TypeError:
                return False

        def is_valid_date(value):
            """Return True when pandas can parse `value` into an actual date.

            Values that parse to "no date" (None, NaT, e.g. from None or an
            empty string) are reported as invalid. Dates that pandas cannot
            represent as a timestamp are reported as invalid as well.
            """
            try:
                parsed = pd.to_datetime(value)
            except (ValueError, TypeError, OverflowError):
                # Unparseable text, unsupported input type or out-of-range value.
                return False
            if pd.api.types.is_scalar(parsed):
                # to_datetime does not raise for None/"" -- it yields None/NaT.
                return not pd.isna(parsed)
            # Container input parsed without error; keep treating it as valid.
            return True

        if not metadata_df.empty:
            # First rows of the retrieved metadata as a quick preview.
            st.subheader("Retrieved Metadata Sample")
            preview_rows = metadata_df.head()
            st.dataframe(preview_rows)

            # Per-field completeness table with a green gradient and values
            # rendered as percentages with one decimal.
            st.subheader("Field Completeness Breakdown")

            st.markdown("""
                <div style='
                    background-color: #2e2e2e;
                    padding: 1.2rem;
                    border-radius: 10px;
                    margin-top: 1.5rem;
                    color: lightgray;
                '>
            """, unsafe_allow_html=True)

            completeness_style = completeness_table.style.background_gradient(cmap="Greens")
            st.dataframe(completeness_style.format("{:.1f}%"), use_container_width=True, height=240)

            st.markdown("</div>", unsafe_allow_html=True)

            # Rows with at least one missing field.
            incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
            incomplete_records = metadata_df.loc[incomplete_mask]

            st.subheader("Suggested Metadata Enhancements")

# Suggest subjects for incomplete records by comparing their descriptions
# (TF-IDF + cosine similarity) with the descriptions of complete records.


def _is_blank(column):
    """Return a boolean Series marking cells that are NaN/None or a placeholder string.

    The extraction step stores absent fields as empty strings, which
    isnull()/notnull() alone do not detect.
    """
    return column.isna() | column.astype(str).isin(["", "N/A", "null"])


def _show_message(text):
    """Render a one-cell table carrying an informational message."""
    message_df = pd.DataFrame([[text]], columns=["Message"])
    st.dataframe(message_df.style.hide(axis="index"), use_container_width=True, hide_index=True)


_suggestion_columns = {"title", "subject", "creator", "description"}

if _suggestion_columns.issubset(metadata_df.columns):
    has_description = ~_is_blank(metadata_df['description'])
    has_subject = ~_is_blank(metadata_df['subject'])
    has_creator = ~_is_blank(metadata_df['creator'])

    # Records with a description but a missing subject or creator
    incomplete_with_desc = metadata_df[has_description & ~(has_subject & has_creator)]

    # Reference data: records with subject, description and creator all present
    reference_df = metadata_df[has_description & has_subject & has_creator]
else:
    # An empty frame (e.g. after a failed fetch) has no columns at all;
    # use empty selections instead of raising KeyError.
    incomplete_with_desc = metadata_df.iloc[0:0]
    reference_df = metadata_df.iloc[0:0]

# Print debugging info
st.write(f"Records with descriptions but missing fields: {len(incomplete_with_desc)}")
st.write(f"Complete reference records: {len(reference_df)}")

tfidf = TfidfVectorizer(stop_words='english', max_features=1000)

if len(incomplete_with_desc) > 0 and len(reference_df) > 0:
    try:
        suggestions = []
        # Fit TF-IDF on all complete descriptions
        tfidf_matrix = tfidf.fit_transform(reference_df['description'].fillna('').astype(str))

        for idx, row in incomplete_with_desc.iterrows():
            # Project this record's description into the fitted vocabulary
            desc_vec = tfidf.transform([str(row['description'])])

            # Similarity of this record to every reference record
            sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()

            # Positions of the three most similar reference records, best first
            top_indices = sims.argsort()[-3:][::-1]

            # Most frequent subject among those neighbours
            candidate_subjects = reference_df.iloc[top_indices]['subject'].value_counts().index
            if len(candidate_subjects) > 0:
                suggestions.append((row['title'], candidate_subjects[0]))

        if suggestions:
            suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])

            # No background gradient here: both columns hold text, and a
            # gradient needs numeric values to map onto colours.
            styled_suggestions = suggestions_df.style.hide(axis="index")

            st.dataframe(
                styled_suggestions,
                use_container_width=True,
                hide_index=True,
                height=min(240, len(suggestions) * 35 + 38)
            )
        else:
            _show_message("No metadata enhancement suggestions available.")
    except Exception as e:
        # UI boundary: report the failure once instead of crashing the page.
        st.error(f"Error generating metadata suggestions: {e}")
else:
    _show_message("Not enough descriptive data to generate metadata suggestions.")