Spaces:

CCockrum
/

LOC-Metadate-Analyzer

Running

File size: 20,436 Bytes

import html
import os
import time

import matplotlib
import pandas as pd
import plotly.express as px
import requests
import streamlit as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def is_missing(value):
    """Return True when a metadata cell carries no usable content.

    A cell counts as missing when it is ``None``/NaN (per ``pd.isna``), when
    its string form is empty or whitespace-only, or when it is an empty
    container.

    Args:
        value: A single cell value (scalar, string, or list-like).

    Returns:
        bool: True if the value is missing, False otherwise.
    """
    # pd.isna() on a list-like returns an element-wise array, and using that
    # array in a boolean ``or`` raises "truth value ... is ambiguous".
    # Judge containers as a unit instead: missing only when empty.
    if isinstance(value, (list, tuple, set, dict)):
        return len(value) == 0
    return bool(pd.isna(value)) or str(value).strip() == ""

# Hugging Face Inference API token, read from the HF_API environment variable
# (None when the variable is not set).
api_key = os.environ.get('HF_API')

def get_huggingface_suggestions(title, description, threshold=0.3, timeout=30):
    """Suggest subject labels for a record via zero-shot classification.

    Sends the record's title and description to the Hugging Face Inference
    API (facebook/bart-large-mnli) and keeps every candidate label whose
    score exceeds ``threshold``.

    Args:
        title: Record title; may be empty or None.
        description: Record description; may be empty or None.
        threshold: Score a label must exceed to be kept.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        A comma-separated string of labels, or None when there is no text to
        classify, the request fails, the API reports an error, or no label
        clears the threshold. Failures are reported through ``st.error``.
    """
    # Join only the parts that actually contain text. Formatting both into
    # "{title}. {description}" always leaves at least "." behind, so the
    # empty-input guard below could never fire.
    parts = [str(part).strip() for part in (title, description) if part is not None]
    full_text = ". ".join(part for part in parts if part)
    if not full_text:
        return None

    API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-mnli"
    # Only send an Authorization header when a token is configured; otherwise
    # the header would literally read "Bearer None".
    headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}

    candidate_labels = [
        "History", "Politics", "Science", "Technology", "Art", "Literature",
        "Education", "Economics", "Military", "Geography", "Sociology",
        "Philosophy", "Religion", "Law", "Medicine", "Engineering",
        "Mathematics", "Computer Science", "Agriculture", "Environment",
        "Maps", "United States", "Civil War", "Revolution", "Posters", "Women's Rights", "World War I"
    ]

    payload = {
        "inputs": full_text,
        "parameters": {
            "candidate_labels": candidate_labels,
            "multi_label": True
        }
    }

    # Suggestions are best-effort: any failure is shown to the user and
    # turned into "no suggestion" rather than crashing the app.
    try:
        # The timeout keeps a stalled connection from blocking the page.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
        result = response.json()

        if not isinstance(result, dict):
            st.error("API error: unexpected response format")
            return None

        if "error" in result:
            st.error(f"API error: {result['error']}")
            return None

        labels = [
            label for label, score in zip(result.get("labels", []), result.get("scores", []))
            if score > threshold
        ]

        return ", ".join(labels) if labels else None

    except Exception as e:
        st.error(f"API Error: {e}")
        return None

# Custom CSS: inject one global <style> block that sets the dark theme colours,
# container widths, sidebar look and alert/table styling for the whole app.
# unsafe_allow_html=True is passed so the raw <style> markup is emitted as HTML.
# NOTE(review): the stylesheet contains an unmatched "}" right after the .main
# rule and a stray ";" just before the "html, body" rule; browsers may discard
# the rule that follows each of them (.block-container and html/body) - verify
# in dev tools before relying on those rules.
# NOTE(review): the sidebar selector is declared twice with different colours;
# presumably the later declaration is the intended one - confirm.
st.markdown("""
    <style>
          
        .main {
            background-color: #1A1A1A !important;  /* dark */
            color: #D3D3D3 !important;
        }

        }
        .block-container {
            background-color: #D3D3D3 !important;
            color: #cccccc !important;
            padding-left: 3rem !important;
            padding-right: 3rem !important;
            max-width: 900px;  /* widen main feed */
            margin: auto;  /* center it */
        }
        /* Headings */
        h1, h2, h3, h4 {
            color: #eeeeee !important; /* brighter light gray for headings */
            font-weight: 700 !important; /* bold */
            margin-bottom: 1rem !important;
        }
        p, span, div {
            color: #cccccc !important;
        }
        /* Subheaders (optional) */
        .stSubheader {
            color: #dddddd !important;
            font-size: 1.4rem !important;
        }
        /* Dataframes (optional tweak) */
        .stDataFrame {
            background-color: #2e2e2e !important;
            border-radius: 10px;
            padding: 1rem;
        }
        section[data-testid="stSidebar"] > div:first-child {
            background-color: #808080 !important;
            padding: 1rem;
            border-radius: 0.5rem;
            color: #808080 !important;
        }
        .stMarkdown, .stTextInput, .stDataFrame {
            color: #1A1A1A!important;
        }
        img.banner {
            width: 100%;
            border-radius: 12px;
            margin-bottom: 1rem;
        }
                                 .stAlert {
            background-color: #f0f0f5 !important;
            color: #1A1A1A !important;
            padding: 1.25rem !important;
            font-size: 1rem !important;
            border-radius: 0.5rem !important;
            box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05) !important;
        }
        header[data-testid="stHeader"] {
            background-color: #1A1A1A !important;
        }
    
        section[data-testid="stSidebar"] > div:first-child {
            background-color: #1A1A1A !important;
            color: #FFFFFF !important;
            padding: 2rem 1.5rem 1.5rem 1.5rem !important;
            border-radius: 12px;
            box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
            font-size: 0.95rem;
            line-height: 1.5;
}
;
        html, body, [data-testid="stApp"] {
            background-color: #1A1A1A !important;
        }
        .custom-table {
            background-color: #D3D3D3;
            color: #1A1A1A;
            font-family: monospace;
            padding: 1rem;
            border-radius: 8px;
            overflow-x: auto;
            white-space: pre;
            border: 1px solid #ccc;

        }
        .sidebar-stats {
            color: lightgray !important;
            font-size: 1.1rem !important;
            margin-top: 1.5rem;
            font-weight: 600;
        }
        .sidebar-contrast-block {
            background-color: #2b2b2b !important;
            padding: 1.25rem;
            border-radius: 10px;
            margin-top: 1.5rem;
}
        section.main > div {  /* widen main container */
            max-width: 95%;
            padding-left: 3rem;
            padding-right: 3rem;

        }

</style>
""", unsafe_allow_html=True)

# NOTE: get_huggingface_suggestions() is defined once, near the top of this
# file. A truncated stub used to be redefined at this point; because a later
# "def" replaces an earlier one, it silently swapped the real implementation
# for a function that always returned None, so no AI subject suggestions were
# ever produced. Do not redefine the function here.

# Use an image from a URL for the banner
st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)

# Streamlit app header
st.title("MetaDiscovery Agent for Library of Congress Collections")
st.markdown("""
This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
""")

# Display name -> search terms ("+" separates words) placed in the q=
# parameter of the loc.gov search URL built below.
collections = {
    "American Revolutionary War Maps": "american+revolutionary+war+maps",
    "Civil War Maps": "civil+war+maps",
    "Women's Suffrage": "women+suffrage",
    "World War I Posters": "world+war+posters"
}

# Start from an empty frame so later sections can always reference
# metadata_df, even when no records end up being loaded.
metadata_df = pd.DataFrame()

# Collection picker; the explicit key keeps the widget state stable across reruns.
with st.sidebar:
    # Every declaration needs its terminating ";": without the one after the
    # background colour, that declaration and the padding after it are invalid.
    # NOTE(review): each st.markdown call is rendered as its own element, so
    # this opening <div> may not actually wrap the radio below - confirm in
    # the browser.
    st.markdown("""
    <div style='
        background-color: #2b2b2b;
        padding: 1.5rem;
        border-radius: 12px;
        margin-bottom: 1.5rem;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    '>
    """, unsafe_allow_html=True)
    
    selected = st.radio("Select a Collection", list(collections.keys()), key="collection_selector")
    
    st.markdown("</div>", unsafe_allow_html=True)

search_query = collections[selected]

# loc.gov search endpoint; fo=json asks for a JSON response
collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"

# Create an empty placeholder for Quick Stats
stats_placeholder = st.sidebar.empty()


# Data is fetched on every script run; the flag is kept so an explicit
# "fetch" button can drive it later.
fetch_data = True

if fetch_data:
    # Display a loading spinner while fetching data
    with st.spinner(f"Fetching data for {selected}..."):
        # Query loc.gov with a browser-like User-Agent header
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/110.0.0.0 Safari/537.36"
        }

        try:
            # The timeout stops a stalled connection from hanging the app.
            response = requests.get(collection_url, headers=headers, timeout=30)
            response.raise_for_status()
            data = response.json()

            # Records are accepted under either "results" or "items";
            # "or []" also covers a key that is present but null.
            if "results" in data:
                records = data.get("results") or []
            elif "items" in data:
                records = data.get("items") or []
            else:
                records = []
                st.error("Unexpected API response structure. No records found.")
            st.write(f"Retrieved {len(records)} records")

        except requests.exceptions.RequestException as e:
            st.error(f"API Connection Error: {e}")
            records = []
        except ValueError:
            st.error("Failed to parse API response as JSON")
            records = []

        # Extract selected metadata fields, flattening list values to strings
        items = []
        for record in records:
            if not isinstance(record, dict):
                continue

            description = record.get("description", "")
            if isinstance(description, list):
                description = " ".join(str(d) for d in description)

            subject = record.get("subject", "")
            if isinstance(subject, list):
                # str() guards against non-string entries, which would make
                # a plain ", ".join(...) raise TypeError.
                subject = ", ".join(str(s) for s in subject)

            item = {
                "id": record.get("id", ""),
                "title": record.get("title", ""),
                "date": record.get("date", ""),
                "subject": subject,
                "creator": record.get("creator", ""),
                "description": description
            }

            # Fall back to the nested "item" object for title/date when the
            # top-level values are empty.
            nested = record.get("item")
            if isinstance(nested, dict):
                if not item["title"]:
                    item["title"] = nested.get("title", "")
                if not item["date"]:
                    item["date"] = nested.get("date", "")
            items.append(item)

        metadata_df = pd.DataFrame(items)

        fields_to_check = ["subject", "creator", "date", "title", "description"]

        # Utility functions for metadata quality analysis (defined once)
        def is_incomplete(value):
            """Return True for NaN/None and the placeholder strings "", "N/A", "null"."""
            return pd.isna(value) or value in ["", "N/A", "null", None]

        def is_valid_date(value):
            """Return True when pd.to_datetime can parse the value."""
            try:
                pd.to_datetime(value)
                return True
            except Exception:
                # "except Exception" rather than a bare except, so that
                # KeyboardInterrupt/SystemExit are not swallowed.
                return False

        # Defaults so these names exist even when nothing was retrieved
        missing_counts = {}
        top_subjects = pd.DataFrame(columns=["Count"])

        if not metadata_df.empty:
            # --- Unified Completeness and Missing Fields Analysis ---

            # Cell-level mask: True where a value is incomplete
            missing_mask = metadata_df.map(is_incomplete)

            # Overall record-level completeness
            incomplete_count = missing_mask.any(axis=1).sum()
            total_fields = metadata_df.size
            filled_fields = (~missing_mask).sum().sum()
            overall_percent = (filled_fields / total_fields) * 100

            # Field-specific missing counts (for Missing Metadata Summary)
            missing_counts = missing_mask.sum().sort_values(ascending=False)

            # Field-level completeness, derived from the same mask
            completeness = (~missing_mask).mean() * 100
            completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
            completeness_table = completeness_df.set_index("Field")

            # Sidebar Quick Stats
            quick_stats = pd.DataFrame({
                "Metric": ["Total Records", "Incomplete Records", "Percent Complete"],
                "Value": [len(metadata_df), incomplete_count, round(overall_percent, 1)]
            })

            styled_quick_stats = (
                quick_stats.style
                .hide(axis="index")
                .background_gradient(cmap="Oranges", subset=["Value"])
                .format({"Value": "{:.1f}"})
            )

            with st.sidebar.expander("Quick Stats", expanded=True):
                st.dataframe(
                    styled_quick_stats,
                    use_container_width=True,
                    hide_index=True
                )

            # Sidebar: Metadata Missing Stats
            missing_df = (
                pd.DataFrame(list(missing_counts.items()), columns=["Field", "Missing Count"])
                .sort_values(by="Missing Count", ascending=False)
                .reset_index(drop=True)
            )

            styled_missing_df = (
                missing_df.style
                .background_gradient(cmap="Blues", subset=["Missing Count"])
                .hide(axis="index")
            )

            with st.sidebar.expander("🧹 Missing Metadata Summary", expanded=True):
                st.dataframe(
                    styled_missing_df,
                    use_container_width=True,
                    hide_index=True,
                    height=min(300, len(missing_df) * 35 + 38)
                )

            # Calculate Top 10 Subjects
            if 'subject' in metadata_df.columns:
                subject_terms = (
                    metadata_df['subject']
                    .dropna()
                    .astype(str)
                    .str.split(',')
                    .explode()
                    .str.strip()
                )
                # Records without a subject are stored as "", which would
                # otherwise be counted as if it were a real subject term.
                top_subjects = (
                    subject_terms[subject_terms != ""]
                    .value_counts()
                    .head(10)
                    .to_frame(name="Count")
                )

            # Most Common Subjects in Sidebar
            with st.sidebar.expander("Top 10 Most Common Subjects", expanded=True):
                if top_subjects.empty:
                    st.write("No subjects available.")
                else:
                    st.dataframe(
                        top_subjects.style.background_gradient(cmap="Greens").format("{:.0f}"),
                        use_container_width=True,
                        height=240
                    )

        with st.sidebar.expander("Helpful Resources", expanded=False):
            st.markdown("""
                <style>
                    .sidebar-links a {
                        color: lightgray !important;
                        text-decoration: none !important;
                    }
                    .sidebar-links a:hover {
                        text-decoration: underline !important;
                    }
                </style>
                <div class="sidebar-links">
                  <ul style='padding-left: 1em'>
                    <li><a href="https://www.loc.gov/apis/" target="_blank">LOC API Info</a></li>
                    <li><a href="https://www.loc.gov/" target="_blank">Library of Congress Homepage</a></li>
                    <li><a href="https://www.loc.gov/collections/" target="_blank">LOC Digital Collections</a></li>
                    <li><a href="https://www.loc.gov/marc/" target="_blank">MARC Metadata Standards</a></li>
                    <li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank">LOC Digital Strategy</a></li>
                  </ul>
                </div>
            """, unsafe_allow_html=True)

        if not metadata_df.empty:
            st.subheader("Retrieved Metadata Sample")
            st.dataframe(metadata_df.head())

            st.subheader("Field Completeness Breakdown")

            # Dark box for the Field Completeness Breakdown (matches the other panels)
            st.markdown("""
                <div style='
                    background-color: #2e2e2e;
                    padding: 1.5rem;
                    border-radius: 10px;
                    margin-top: 1.5rem;
                    color: lightgray;
                '>
            """, unsafe_allow_html=True)

            st.dataframe(
                completeness_table.style
                .background_gradient(cmap="Greens")
                .format("{:.0f}%")
                .hide(axis="index"),
                use_container_width=True,
                height=240
            )

            st.markdown("</div>", unsafe_allow_html=True)

            # Identify incomplete records (any incomplete cell in the row)
            incomplete_mask = missing_mask.any(axis=1)
            incomplete_records = metadata_df[incomplete_mask]

            
# --- Suggested Metadata Enhancements Section ---
st.subheader("Suggested Metadata Enhancements")

# The checkbox's own label is hidden; the visible caption is rendered below it.
use_ai = st.checkbox("Use AI Suggestions", value=True, label_visibility="hidden")
st.markdown("🤖 Use AI Suggestions (Hugging Face)")


def _render_dark_note(message):
    """Show a short message inside the dark rounded panel used by this section."""
    st.markdown(
        '<div style="background-color: #1e1e1e; padding: 1.5rem; border-radius: 10px; '
        f'margin-top: 1rem; color: #e0e0e0;">{message}</div>',
        unsafe_allow_html=True,
    )


# Select records that have some text to classify (title or description) but
# no subject. Missing values are stored as empty strings rather than nulls,
# so isnull()/notnull() cannot detect them; is_missing() is used instead.
# The column check avoids a KeyError when no records were loaded and
# metadata_df is an empty frame without columns.
if {"title", "description", "subject"}.issubset(metadata_df.columns):
    _title_missing = metadata_df["title"].apply(is_missing).astype(bool)
    _description_missing = metadata_df["description"].apply(is_missing).astype(bool)
    _subject_missing = metadata_df["subject"].apply(is_missing).astype(bool)
    incomplete_with_desc = metadata_df[
        ~(_title_missing & _description_missing) & _subject_missing
    ]
else:
    incomplete_with_desc = pd.DataFrame(columns=["title", "description", "subject"])

if incomplete_with_desc.empty:
    _render_dark_note("All records already have subjects or no usable text available.")
elif not use_ai:
    _render_dark_note("Enable AI Suggestions to view recommendations.")
else:
    suggestions = []
    # Cap the number of API calls per run
    records_to_process = min(10, len(incomplete_with_desc))
    progress = st.progress(0)
    status = st.empty()

    batch = incomplete_with_desc.head(records_to_process)
    for position, (_, row) in enumerate(batch.iterrows(), start=1):
        title = "" if is_missing(row['title']) else str(row['title'])
        description = "" if is_missing(row['description']) else str(row['description'])
        status.text(f"Analyzing {position}/{records_to_process}: {title[:30]}...")
        suggested_subject = get_huggingface_suggestions(title, description)
        if suggested_subject:
            suggestions.append((title, suggested_subject))
        progress.progress(position / records_to_process)

    status.empty()
    progress.empty()

    if suggestions:
        suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])

        # Fixed highlight colour for the suggestion column
        green_shade = "rgba(0, 100, 0, 0.3)"
        cell_style = "padding: 12px; text-align: left;"

        row_markup = []
        for _, row in suggestions_df.iterrows():
            title = row['Title']
            title_display = title[:50] + "..." if len(title) > 50 else title
            subject = row['Suggested Subject']
            # Titles come from an external API and are rendered with
            # unsafe_allow_html=True, so escape them (and the labels) before
            # interpolating into markup.
            row_markup.append(
                '<tr style="border-bottom: 1px solid #444;">'
                f'<td style="{cell_style}">{html.escape(title_display)}</td>'
                f'<td style="{cell_style} background-color: {green_shade};">{html.escape(subject)}</td>'
                '</tr>'
            )

        # The table is emitted as one unbroken string: in Markdown a blank
        # line ends an HTML block, and indented lines after it can be
        # rendered as a code block instead of as table rows.
        html_table = (
            '<div style="background-color: #1e1e1e; padding: 1.5rem; border-radius: 10px; margin-top: 1rem;">'
            '<table style="width: 100%; border-collapse: collapse; color: #e0e0e0;">'
            '<thead>'
            '<tr style="border-bottom: 1px solid #444;">'
            '<th style="padding: 12px; text-align: left; color: #e0e0e0;">Title</th>'
            '<th style="padding: 12px; text-align: left; color: #e0e0e0;">Suggested Subject</th>'
            '</tr>'
            '</thead>'
            '<tbody>'
            + "".join(row_markup)
            + '</tbody>'
            '</table>'
            '</div>'
        )

        st.markdown(html_table, unsafe_allow_html=True)
    else:
        _render_dark_note("No metadata enhancement suggestions available.")