Spaces:

CCockrum
/

LOC-Metadate-Analyzer

Running

File size: 7,914 Bytes

e6e6524
d707455
 
61165d4
d707455
 
91c3d7f
 
d707455
ac76af4
08b2694
 
b081f8c
08b2694
049c3fd
6eb7050
049c3fd
08b2694
 
0bc9813
604282e
08b2694
 
604282e
08b2694
 
604282e
ac76af4
 
50c0feb
08b2694
 
 
 
 
 
996fbde
f29e5df
3c8882c
996fbde
 
 
 
d4d0c2a
03258fe
25dfdd7
fce73d4
89b0646
7f3ee80
25dfdd7
89b0646
 
 
 
 
 
3c8882c
fce73d4
4f626fc
fce73d4
 
 
d7069a9
3a9e51c
3c8882c
08b2694
 
 
4df0ff4
08b2694
d707455
4df0ff4
d707455
 
91c3d7f
d707455
 
a4af329
083533c
b948611
 
 
 
083533c
 
 
d707455
083533c
b948611
083533c
b948611
 
083533c
f195449
61165d4
7b55b9a
15c7e45
 
 
 
 
61165d4
15c7e45
e6e6524
 
0b93d55
61165d4
 
 
 
 
 
 
 
e6e6524
61165d4
 
 
 
 
 
 
e6e6524
d707455
 
61165d4
d8a2f22
 
 
61165d4
 
 
 
 
 
d8a2f22
61165d4
 
 
 
 
 
d707455
 
91c3d7f
e6e6524
 
 
 
 
 
 
 
 
 
 
91c3d7f
996fbde
91c3d7f
e6e6524
 
996fbde
405d73b
91c3d7f
 
 
e6e6524
 
405d73b
e6e6524
 
996fbde
61165d4
405d73b
61165d4
 
e6e6524
996fbde
21b5793
 
 
 
e6e6524
996fbde
91c3d7f
61165d4
 
 
 
 
 
 
 
 
 
 
e6e6524
61165d4
 
 
 
 
 
 
 
91c3d7f
61165d4
91c3d7f
e6e6524

# MetaDiscovery Agent - LOC API with Enhanced Completeness and Quality Analysis
import requests
import pandas as pd
import numpy as np
import streamlit as st
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Custom CSS: gray page background, dark sidebar, banner image and alert styling.
# Each selector is declared exactly once so the values written here are the ones
# that actually take effect.
st.markdown("""
    <style>
        .main {
            background-color: #D3D3D3 !important;
            color: #1A1A1A !important;
        }
        .stMarkdown, .stTextInput, .stDataFrame {
            color: #1A1A1A !important;
        }
        img.banner {
            width: 100%;
            border-radius: 12px;
            margin-bottom: 1rem;
        }
        .stAlert {
            background-color: #f0f0f5 !important;
            color: #333333 !important;
            padding: 1.25rem !important;
            font-size: 1rem !important;
            border-radius: 0.5rem !important;
            box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05) !important;
        }
        header[data-testid="stHeader"] {
            background-color: gray !important;
        }
        section[data-testid="stSidebar"] > div:first-child {
            background-color: #1A1A1A !important;
            color: #D3D3D3 !important;
            padding: 2rem 1.5rem 1.5rem 1.5rem !important;
            border-radius: 12px;
            box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
            font-size: 0.95rem;
            line-height: 1.5;
        }
        .block-container {
            background-color: gray !important;
            color: #1A1A1A !important;
            padding-left: 2rem !important;
            padding-right: 2rem !important;
            box-shadow: none !important;
        }
    </style>
""", unsafe_allow_html=True)

# Banner image rendered above the title; swap the src URL to use another image
banner_html = '<img src="https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg" class="banner">'
st.markdown(banner_html, unsafe_allow_html=True)

# Page title followed by a short description of what the app does
st.title("LOC MetaDiscovery Agent")
intro_text = """
This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
"""
st.markdown(intro_text)

# Display name of each collection mapped to the query string sent to the LOC search API
collections = {
    "American Revolutionary War Maps": "american+revolutionary+war+maps",
    "Civil War Maps": "civil+war+maps",
    "Women's Suffrage": "women+suffrage",
    "World War I Posters": "world+war+posters",
}

# Sidebar: the user picks which collection to analyze
st.sidebar.markdown("## Settings")
selected = st.sidebar.selectbox(
    "Select a collection",
    list(collections),
)
search_query = collections[selected]

# Build the request URL against the general search endpoint, asking for JSON (fo=json)
collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"

# Echo the current selection and the URL that will be requested
st.sidebar.write(f"Selected Collection: {selected}")
api_url_html = f"<span style='color: lightgray;'>API URL: {collection_url}</span>"
st.sidebar.markdown(api_url_html, unsafe_allow_html=True)



# Fetch data from the LOC API, sending a browser-like User-Agent header
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/110.0.0.0 Safari/537.36"
}

# requests applies no timeout by default, so without this an unresponsive
# server would block the script indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# Default to "no records" so every failure path below leaves a usable value
records = []

try:
    response = requests.get(collection_url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
except requests.exceptions.RequestException as e:
    st.error(f"API Connection Error: {e}")
else:
    # The body is parsed in its own try block: recent requests releases raise a
    # JSONDecodeError that is also a RequestException, so parsing inside the
    # block above would report a bad body as a connection error.
    try:
        data = response.json()
    except ValueError:
        st.error("Failed to parse API response as JSON")
    else:
        # "results" is checked before "items", and only on a JSON object
        record_key = None
        if isinstance(data, dict):
            if "results" in data:
                record_key = "results"
            elif "items" in data:
                record_key = "items"

        if record_key is None:
            st.error("Unexpected API response structure. No records found.")
        else:
            payload = data[record_key]
            # A null or non-list value is treated as "no records" instead of crashing
            records = payload if isinstance(payload, list) else []
        st.write(f"Retrieved {len(records)} records")

# Extract selected metadata fields
def _join_if_list(value, sep=", "):
    """Join a list/tuple into one string; return any other value unchanged.

    Elements are converted with str() so that non-string entries (numbers,
    None, nested values) cannot make the join raise TypeError.
    """
    if isinstance(value, (list, tuple)):
        return sep.join(str(part) for part in value)
    return value


items = []
for record in records:
    # Anything that is not a JSON object has no fields to read
    if not isinstance(record, dict):
        continue

    item = {
        "id": _join_if_list(record.get("id", "")),
        "title": _join_if_list(record.get("title", "")),
        "date": _join_if_list(record.get("date", "")),
        "subject": _join_if_list(record.get("subject", "")),
        "creator": _join_if_list(record.get("creator", "")),
        # Description parts are joined with a space, the other fields with ", "
        "description": _join_if_list(record.get("description", ""), sep=" "),
    }

    # Fall back to the nested "item" object for title/date when the top-level
    # value is empty; skip the fallback if "item" is not an object.
    nested = record.get("item")
    if isinstance(nested, dict):
        if not item["title"]:
            item["title"] = _join_if_list(nested.get("title", ""))
        if not item["date"]:
            item["date"] = _join_if_list(nested.get("date", ""))

    items.append(item)

metadata_df = pd.DataFrame(items)

# Utility functions for deeper metadata quality analysis
def is_incomplete(value):
    """Return True when a metadata value should be counted as missing.

    A value is missing when it is None/NaN/NaT, an empty container, or a
    string that is empty, whitespace-only, or one of the placeholders
    "N/A" / "null" (surrounding whitespace ignored).

    Args:
        value: A single cell value (string, number, list, None, ...).

    Returns:
        bool: True if the value is missing, False otherwise.
    """
    placeholders = ("", "N/A", "null")

    if value is None:
        return True
    # Containers are handled before pd.isna: on a list pd.isna returns an
    # array, whose truth value is ambiguous and raises ValueError.
    if isinstance(value, (list, tuple, set, dict)):
        return len(value) == 0
    if isinstance(value, str):
        return value.strip() in placeholders
    return bool(pd.isna(value))

def is_valid_date(value):
    """Return True when pandas can parse ``value`` into an actual date.

    Args:
        value: The candidate date (typically a string).

    Returns:
        bool: False when parsing raises, or when it yields no date (None or
        NaT, which is what pandas returns for None and for an empty string);
        True otherwise.
    """
    try:
        parsed = pd.to_datetime(value)
    except (ValueError, TypeError, OverflowError):
        return False

    # pd.to_datetime(None) returns None rather than raising
    if parsed is None:
        return False
    # np.any covers both a scalar result and an array-like result
    return not bool(np.any(pd.isna(parsed)))

if not metadata_df.empty:
    st.subheader("Retrieved Metadata Sample")
    st.dataframe(metadata_df.head())

    # True where a cell counts as missing; computed once and reused below
    missing_mask = metadata_df.map(is_incomplete).astype(bool)

    # Metadata completeness analysis: percentage of records with a value, per field
    st.subheader("Metadata Completeness Analysis")
    completeness = (~missing_mask).mean() * 100
    completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
    fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
    st.plotly_chart(fig)

    # Records with at least one missing field
    incomplete_mask = missing_mask.any(axis=1)
    incomplete_records = metadata_df[incomplete_mask]

    st.subheader("Records with Incomplete Metadata")
    if not incomplete_records.empty:
        # Cast to str so mixed-type columns display uniformly
        st.dataframe(incomplete_records.astype(str))
    else:
        st.success("All metadata fields are complete in this collection!")

    st.subheader("Identifiers of Items Needing Metadata Updates")
    if not incomplete_records.empty:
        st.write(incomplete_records[['id', 'title']])
    else:
        st.success("All records are complete!")

    # Suggest a subject for records that have a description but no subject,
    # borrowing it from the most textually similar record that has both.
    st.subheader("Suggested Metadata Enhancements")
    has_description = ~missing_mask["description"]
    has_subject = ~missing_mask["subject"]
    described = metadata_df.loc[has_description, "description"].astype(str)
    if len(described) > 1:
        try:
            # Records to copy a subject from, and records that need one. The two
            # sets are disjoint, so a record can never be matched with itself.
            subject_sources = metadata_df[has_description & has_subject]
            subject_targets = metadata_df[has_description & ~has_subject]

            suggestions = []
            if not subject_sources.empty and not subject_targets.empty:
                tfidf = TfidfVectorizer(stop_words='english')
                tfidf.fit(described)
                source_matrix = tfidf.transform(subject_sources["description"].astype(str))
                target_matrix = tfidf.transform(subject_targets["description"].astype(str))

                # Row i holds similarities of target i to every source. Column
                # positions index subject_sources itself, so iloc lookups align.
                sims = cosine_similarity(target_matrix, source_matrix)
                best_source = sims.argmax(axis=1)

                for pos, title in enumerate(subject_targets["title"]):
                    # Zero similarity means no shared vocabulary; argmax would
                    # then return an arbitrary source, so skip the record.
                    if sims[pos, best_source[pos]] > 0:
                        suggested_subject = subject_sources.iloc[best_source[pos]]["subject"]
                        suggestions.append((title, suggested_subject))

            if suggestions:
                suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
                st.dataframe(suggestions_df)
            else:
                st.info("No metadata enhancement suggestions available.")
        except Exception as e:
            # Top-level UI boundary: report the failure on the page rather than aborting
            st.error(f"Error generating metadata suggestions: {e}")
    else:
        st.info("Not enough descriptive data to generate metadata suggestions.")
else:
    st.warning("No metadata records found for this collection. Try selecting another one.")