Spaces:

CCockrum
/

LOC-Metadate-Analyzer

Running

File size: 14,279 Bytes

# MetaDiscovery Agent - LOC API with Enhanced Completeness and Quality Analysis
import html

import matplotlib
import numpy as np
import pandas as pd
import plotly.express as px
import requests
import streamlit as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Global CSS overrides injected as raw HTML (hence unsafe_allow_html=True).
# The rules set a dark (#1A1A1A) app background, a gray header and main container,
# a dark sidebar with white text, and styling for alerts. They also define the
# .custom-table and .sidebar-stats classes that HTML snippets emitted later in this
# script refer to.
# NOTE: the sidebar selector and .block-container are each declared twice below;
# in both cases the later rule re-declares every property set by the earlier one.
st.markdown("""
    <style>
          
        .main {
            background-color: #D3D3D3 !important;
            color: #1A1A1A!important;

        }
        .block-container {
            background-color: gray !important;
            color: #1A1A1A!important;
        }
        section[data-testid="stSidebar"] > div:first-child {
            background-color: #808080 !important;
            padding: 1rem;
            border-radius: 0.5rem;
            color: #808080 !important;
        }
        .stMarkdown, .stTextInput, .stDataFrame {
            color: #1A1A1A!important;
        }
        img.banner {
            width: 100%;
            border-radius: 12px;
            margin-bottom: 1rem;
        }
                                 .stAlert {
            background-color: #f0f0f5 !important;
            color: #333333 !important;
            padding: 1.25rem !important;
            font-size: 1rem !important;
            border-radius: 0.5rem !important;
            box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05) !important;
        }
        header[data-testid="stHeader"] {
    background-color: gray !important;
}
        section[data-testid="stSidebar"] > div:first-child {
    background-color: #1A1A1A !important;
    color: #FFFFFF !important;
    padding: 2rem 1.5rem 1.5rem 1.5rem !important;
    border-radius: 12px;
    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
    font-size: 0.95rem;
    line-height: 1.5;
}
            .block-container {
    background-color: gray !important;
    color: #1A1A1A !important;
    padding-left: 2rem !important;
    padding-right: 2rem !important;
    box-shadow: none !important;
}
        html, body, [data-testid="stApp"] {
            background-color: #1A1A1A !important;
        }
        .custom-table {
            background-color: #D3D3D3;
            color: #1A1A1A;
            font-family: monospace;
            padding: 1rem;
            border-radius: 8px;
            overflow-x: auto;
            white-space: pre;
            border: 1px solid #ccc;

        }
        .sidebar-stats {
            color: lightgray !important;
            font-size: 1.1rem !important;
            margin-top: 1.5rem;
            font-weight: 600;
        }
        .sidebar-contrast-block {
            background-color: #2b2b2b !important;  /* Slightly lighter than #1A1A1A */
            padding: 1.25rem;
            border-radius: 10px;
            margin-top: 1.5rem;
        }

</style>
""", unsafe_allow_html=True)

# Banner: a remote image stretched to the width of its container
BANNER_URL = "https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg"
st.image(BANNER_URL, use_container_width=True)

# Page title, followed by a short description of what the tool does
APP_TITLE = "MetaDiscovery Agent for Library of Congress Collections"
APP_INTRO = """
This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
"""
st.title(APP_TITLE)
st.markdown(APP_INTRO)

# Display name -> search phrase for each collection offered in the sidebar.
# The phrases are already "+"-joined and are dropped verbatim into the q= parameter
# of the loc.gov search URL built below.
# NOTE(review): the "World War I Posters" phrase carries no "i" term - confirm intended.
collections = {
    "American Revolutionary War Maps": "american+revolutionary+war+maps",
    "Civil War Maps": "civil+war+maps",
    "Women's Suffrage": "women+suffrage",
    "World War I Posters": "world+war+posters",
}

# Start with an empty frame so metadata_df is always defined before it is checked
metadata_df = pd.DataFrame()

# Sidebar collection picker; the widget is given an explicit key
selected = st.sidebar.selectbox(
    "Select a collection",
    list(collections),
    key="collection_selector",
)
search_query = collections[selected]

# JSON search endpoint for the chosen collection
collection_url = "https://www.loc.gov/search/?q=" + search_query + "&fo=json"

# Sidebar slots reserved now and filled after the data has been analysed:
# first the Quick Stats summary, then the Field Completeness Breakdown
stats_placeholder = st.sidebar.empty()
completeness_placeholder = st.sidebar.empty()

# Helpful Resources: a static block of reference links rendered in the sidebar,
# after the two placeholders reserved above.
# The heading is emitted exactly once, by the <h3> inside the HTML snippet; an extra
# plain-markdown "### Helpful Resources" in front of it made the title appear twice.
# The embedded <style> colours the heading and links light gray for the dark sidebar.
st.sidebar.markdown("""
    <style>
        .sidebar-section h3 {
            color: lightgray !important;
            font-size: 1.1rem !important;
            margin-top: 1.5rem;
        }
        .sidebar-links a {
            color: lightgray !important;
            text-decoration: none !important;
        }
        .sidebar-links a:hover {
            text-decoration: underline !important;
        }
    </style>
    <div class="sidebar-section">
      <h3>🔗 Helpful Resources</h3>
      <div class="sidebar-links">
        <ul style='padding-left: 1em'>
          <li><a href="https://www.loc.gov/apis/" target="_blank">LOC API Info</a></li>
          <li><a href="https://www.loc.gov/" target="_blank">Library of Congress Homepage</a></li>
          <li><a href="https://www.loc.gov/collections/" target="_blank">LOC Digital Collections</a></li>
          <li><a href="https://www.loc.gov/marc/" target="_blank">MARC Metadata Standards</a></li>
          <li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank">LOC Digital Strategy</a></li>
        </ul>
      </div>
    </div>
""", unsafe_allow_html=True)


# No fetch button is wired up: the flag is always True, so data is fetched on every script run
fetch_data = True
    
if fetch_data:
    # Display a loading spinner while fetching data
    with st.spinner(f"Fetching data for {selected}..."):
        # Fetch the search results for the selected collection from the LOC JSON API,
        # sending a browser-like User-Agent header. On any failure an error is shown
        # and `records` stays an empty list so the rest of the script can continue.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/110.0.0.0 Safari/537.36"
        }

        records = []
        try:
            # Without a timeout requests waits indefinitely on a stalled server and
            # the spinner would never clear. Timeout errors are RequestException
            # subclasses, so the handler below reports them.
            response = requests.get(collection_url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            st.error(f"API Connection Error: {e}")
        else:
            # The body is decoded in its own try block: newer requests releases raise
            # a JSON error that is both a RequestException and a ValueError, so in a
            # shared try block it was reported as a connection error instead.
            try:
                data = response.json()
            except ValueError:
                st.error("Failed to parse API response as JSON")
            else:
                # Only a JSON object can carry "results"/"items"; any other payload
                # (list, string, null) is treated as an unexpected structure.
                payload = data if isinstance(data, dict) else {}
                if "results" in payload:
                    records = payload.get("results") or []
                elif "items" in payload:
                    records = payload.get("items") or []
                else:
                    st.error("Unexpected API response structure. No records found.")
                st.write(f"Retrieved {len(records)} records")

        # Flatten every dict record from the API into one row holding the six fields
        # analysed below (id, title, date, subject, creator, description), then build
        # metadata_df from those rows. Non-dict entries in `records` are skipped.
        def _join_if_list(value, sep=", "):
            """Join a list/tuple into one string; return any other value unchanged.

            Elements are passed through str() first so that non-string entries
            (numbers, nested objects) cannot make the join fail.
            """
            if isinstance(value, (list, tuple)):
                return sep.join(str(part) for part in value)
            return value

        items = []
        for record in records:
            if not isinstance(record, dict):
                continue

            item = {
                "id": record.get("id", ""),
                "title": _join_if_list(record.get("title", "")),
                "date": _join_if_list(record.get("date", "")),
                "subject": _join_if_list(record.get("subject", "")),
                "creator": _join_if_list(record.get("creator", "")),
                # Description parts are joined with a space rather than a comma
                "description": _join_if_list(record.get("description", ""), sep=" "),
            }

            # Fall back to the nested "item" object for a missing title/date; a
            # non-dict "item" value is treated as having no fallback data.
            if "item" in record:
                nested = record["item"] if isinstance(record["item"], dict) else {}
                if not item["title"]:
                    item["title"] = nested.get("title", "")
                if not item["date"]:
                    item["date"] = nested.get("date", "")

            items.append(item)

        metadata_df = pd.DataFrame(items)
        
        # Define custom completeness check
        def is_incomplete(value):
            """Return True when a metadata cell should count as missing.

            Missing means: None, a NaN/NaT/NA scalar, an empty container, or a
            string that is empty, "N/A" or "null" once surrounding whitespace is
            stripped. Always returns a plain bool.

            Containers are checked by length because pd.isna() on a list returns
            an array, whose truth value is ambiguous for more than one element.
            """
            if value is None:
                return True
            if isinstance(value, str):
                return value.strip() in ("", "N/A", "null")
            if not pd.api.types.is_scalar(value):
                try:
                    return len(value) == 0
                except TypeError:
                    # Unsized non-scalar object: something is there, so not missing
                    return False
            return bool(pd.isna(value))
        
        # Sidebar "Quick Stats": record count, number of records with at least one
        # missing field, and the share of all cells that are filled.
        if not metadata_df.empty:
            # Evaluate the per-cell check once and derive every figure from it
            incomplete_cells = metadata_df.map(is_incomplete)
            incomplete_mask = incomplete_cells.any(axis=1)
            incomplete_count = incomplete_mask.sum()
            total_fields = metadata_df.size
            filled_fields = (~incomplete_cells).sum().sum()
            overall_percent = (filled_fields / total_fields) * 100

            stats_html = f"""
            <div class="sidebar-stats">
                <h3 style="color: lightgray;">📊 Quick Stats</h3>
                <p style="color:lightgray;">Total Records: <b>{len(metadata_df)}</b></p>
                <p style="color:lightgray;">Incomplete Records: <b>{incomplete_count}</b></p>
                <p style="color:lightgray;">Overall Metadata Completeness: <b>{overall_percent:.1f}%</b></p>
            </div>
            """
            stats_placeholder.markdown(stats_html, unsafe_allow_html=True)

            # The per-field breakdown is deliberately not rendered here.
            # `completeness_table` is only built further down, in the main
            # completeness analysis, which also fills the sidebar slot reserved
            # for it. Referencing the table at this point raised a NameError.
        

        # Utility functions for deeper metadata quality analysis.
        # `is_incomplete` is not redefined here: the helper of the same name defined
        # earlier in this script is still in scope, and a second definition only
        # shadowed it.

        def is_valid_date(value):
            """Return True when pandas can parse `value` into an actual date.

            Parse failures return False. Values that pd.to_datetime accepts
            without producing a date (empty string, None, NaN -> NaT/None) also
            return False, so "valid" never includes a missing date.
            """
            try:
                parsed = pd.to_datetime(value)
            except (ValueError, TypeError, OverflowError):
                # Covers unparseable text and out-of-range dates (both ValueError
                # subclasses) as well as unsupported input types.
                return False
            return parsed is not None and parsed is not pd.NaT

        # Main panel: sample of the retrieved rows, per-field completeness (table in
        # the sidebar slot, bar chart in the main area) and subject suggestions for
        # records that have a description but no subject.
        if not metadata_df.empty:
            st.subheader("Retrieved Metadata Sample")
            st.dataframe(metadata_df.head())

            # Metadata completeness analysis (enhanced)
            st.subheader("Metadata Completeness Analysis")
            # One boolean frame (True = cell is missing) drives every mask below
            missing_cells = metadata_df.map(is_incomplete).astype(bool)
            completeness = (~missing_cells).mean() * 100
            completeness_df = pd.DataFrame({
                "Field": completeness.index,
                "Completeness (%)": completeness.values
            })
            completeness_table = completeness_df.set_index("Field")

            # Fill the sidebar slot reserved earlier. st.empty() keeps only the most
            # recent element written to it, so heading and table are grouped in a
            # child container. The heading is one self-contained HTML element: a
            # <div> opened in one st.markdown call cannot wrap elements emitted by
            # later calls.
            with completeness_placeholder.container():
                st.markdown("""
                    <div style='
                        background-color: #2e2e2e;
                        padding: 1.2rem;
                        border-radius: 10px;
                        margin-top: 1.5rem;
                        color: lightgray;
                    '>
                    <h4 style='margin-bottom: 1rem;'>Field Completeness Breakdown</h4>
                    </div>
                """, unsafe_allow_html=True)

                st.dataframe(
                    completeness_table.style.background_gradient(cmap="Greens").format("{:.1f}%"),
                    use_container_width=True,
                    height=240
                )

            # Then continue plotting in main panel
            fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
            st.plotly_chart(fig)

            # Identify incomplete records
            incomplete_mask = missing_cells.any(axis=1)
            incomplete_records = metadata_df[incomplete_mask]

            st.subheader("✨ Suggested Metadata Enhancements")

            # Missing fields are stored as "" (not NaN), so presence is decided with
            # the same completeness check as above rather than notnull()/isna(),
            # which never flagged an empty subject.
            has_description = ~missing_cells["description"]
            has_subject = ~missing_cells["subject"]
            incomplete_with_desc = metadata_df[incomplete_mask & has_description]
            reference_df = metadata_df[has_subject & has_description]
            tfidf = TfidfVectorizer(stop_words='english')

            # A single candidate and a single reference record are enough to try
            if not incomplete_with_desc.empty and not reference_df.empty:
                try:
                    suggestions = []
                    tfidf_matrix = tfidf.fit_transform(reference_df['description'].astype(str))

                    for _, row in incomplete_with_desc.iterrows():
                        # Only records whose subject is missing need a suggestion
                        if not is_incomplete(row['subject']):
                            continue
                        desc_vec = tfidf.transform([str(row['description'])])
                        sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
                        top_idx = sims.argmax()
                        # Zero similarity means no shared vocabulary; argmax would
                        # then just pick the first reference record, so skip it.
                        if sims[top_idx] <= 0:
                            continue
                        suggested_subject = reference_df.iloc[top_idx]['subject']
                        suggestions.append((row['title'], suggested_subject))

                    if suggestions:
                        suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
                        # Titles and subjects come from the remote API and are placed
                        # inside raw HTML, so escape them before rendering.
                        table_text = html.escape(suggestions_df.to_markdown(index=False))
                        st.markdown("<div class='custom-table'>" + table_text + "</div>", unsafe_allow_html=True)
                    else:
                        st.markdown("""
                            <div class='custom-table'>
                            <b>No metadata enhancement suggestions available.</b>
                            </div>
                        """, unsafe_allow_html=True)

                except Exception as e:
                    # Best-effort feature at the UI boundary: report the failure
                    # (e.g. an empty TF-IDF vocabulary) instead of aborting the page.
                    st.error(f"Error generating metadata suggestions: {e}")
            else:
                st.markdown("""
                    <div class='custom-table'>
                    <b>Not enough descriptive data to generate metadata suggestions.</b>
                    </div>
                    """, unsafe_allow_html=True)
        else:
            st.warning("⚠️ No metadata records found for this collection. Try selecting another one.")