File size: 6,940 Bytes
e6e6524
d707455
 
61165d4
d707455
 
91c3d7f
 
d707455
ac76af4
08b2694
 
 
 
ac76af4
08b2694
 
 
ac76af4
08b2694
 
 
 
 
ac76af4
 
 
 
08b2694
 
 
 
 
 
 
 
 
 
 
 
d707455
e6e6524
d707455
 
91c3d7f
d707455
 
a4af329
083533c
b948611
 
 
 
083533c
 
 
d707455
083533c
b948611
083533c
b948611
 
083533c
61165d4
 
15c7e45
 
 
 
 
61165d4
15c7e45
e6e6524
 
0b93d55
61165d4
 
 
 
 
 
 
 
e6e6524
61165d4
 
 
 
 
 
 
e6e6524
d707455
 
61165d4
d8a2f22
 
 
61165d4
 
 
 
 
 
d8a2f22
61165d4
 
 
 
 
 
d707455
 
91c3d7f
e6e6524
 
 
 
 
 
 
 
 
 
 
91c3d7f
 
 
e6e6524
 
91c3d7f
405d73b
91c3d7f
 
 
e6e6524
 
405d73b
e6e6524
 
91c3d7f
61165d4
405d73b
61165d4
 
e6e6524
21b5793
 
 
 
 
e6e6524
91c3d7f
 
61165d4
 
 
 
 
 
 
 
 
 
 
e6e6524
61165d4
 
 
 
 
 
 
 
91c3d7f
61165d4
91c3d7f
e6e6524
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# MetaDiscovery Agent - LOC API with Enhanced Completeness and Quality Analysis
import requests
import pandas as pd
import numpy as np
import streamlit as st
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Static page chrome: the stylesheet, banner markup and intro copy are kept as
# named constants so the rendering calls below stay short.
_PAGE_CSS = """
    <style>
        .main {
            background-color: white !important;
            color: #333333 !important;
        }
        .block-container {
            background-color: white !important;
            color: #333333 !important;
        }
        section[data-testid="stSidebar"] > div:first-child {
            background-color: #f8f9fa !important;
            padding: 1rem;
            border-radius: 0.5rem;
            color: #333333 !important;
        }
        .stMarkdown, .stTextInput, .stDataFrame {
            color: #333333 !important;
        }
        img.banner {
            width: 100%;
            border-radius: 12px;
            margin-bottom: 1rem;
        }
    </style>
"""

# Optional banner image (replace the src with your own image URL)
_BANNER_HTML = '<img src="https://www.loc.gov/static/images/home/home-header.jpg" class="banner">'

_INTRO_TEXT = """
This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
"""

# Raw HTML snippets need unsafe_allow_html; the stylesheet is emitted first,
# then the banner.
for _html_snippet in (_PAGE_CSS, _BANNER_HTML):
    st.markdown(_html_snippet, unsafe_allow_html=True)

# Streamlit app header
st.title("MetaDiscovery Agent for Library of Congress Collections")
st.markdown(_INTRO_TEXT)

# Display name -> pre-encoded search query ('+' separates words) for the LOC search API
collections = {
    "American Revolutionary War Maps": "american+revolutionary+war+maps",
    "Civil War Maps": "civil+war+maps",
    "Women's Suffrage": "women+suffrage",
    "World War I Posters": "world+war+posters"
}

# Sidebar: let the user pick one of the known collections
_sidebar = st.sidebar
_sidebar.markdown("## Settings")
selected = _sidebar.selectbox("Select a collection", list(collections))
search_query = collections[selected]

# Build the request against the main search endpoint, asking for JSON output
collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"

# Echo the current choice and the exact URL being queried
_sidebar.write(f"Selected Collection: {selected}")
_sidebar.write(f"API URL: {collection_url}")

# Fetch data from LOC API with spoofed User-Agent header
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/110.0.0.0 Safari/537.36"
}

# Without a timeout, requests waits indefinitely on a stalled connection,
# which would freeze the whole Streamlit script run.
REQUEST_TIMEOUT_SECONDS = 30

# `records` always ends up as a list so the extraction step below can iterate
# it regardless of which failure path (if any) was taken.
records = []
try:
    response = requests.get(collection_url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
except requests.exceptions.RequestException as e:
    st.error(f"API Connection Error: {e}")
else:
    # JSON decoding is handled separately from the transport errors above:
    # requests' JSONDecodeError is also a RequestException, so a shared
    # try-block would report a malformed body as a connection problem.
    try:
        data = response.json()
    except ValueError:
        st.error("Failed to parse API response as JSON")
    else:
        # Accept either a "results" or an "items" list at the top level.
        payload = data if isinstance(data, dict) else {}
        if "results" in payload:
            found = payload["results"]
        elif "items" in payload:
            found = payload["items"]
        else:
            found = None

        if isinstance(found, list):
            records = found
        else:
            st.error("Unexpected API response structure. No records found.")
        st.write(f"Retrieved {len(records)} records")

# Extract selected metadata fields
def _as_text(value, sep=", "):
    """Return *value* as a single display string.

    Lists and tuples are joined with *sep* after converting every entry with
    ``str`` (so non-string entries cannot break the join); ``None`` becomes an
    empty string; any other value is returned unchanged.
    """
    if isinstance(value, (list, tuple)):
        return sep.join(str(part) for part in value)
    return "" if value is None else value


items = []
for record in records:
    if not isinstance(record, dict):
        continue

    # Some records carry title/date only inside a nested "item" object; treat
    # anything that is not a dict as "no nested data" instead of crashing.
    nested = record.get("item")
    if not isinstance(nested, dict):
        nested = {}

    item = {
        "id": _as_text(record.get("id", "")),
        "title": _as_text(record.get("title", "")),
        "date": _as_text(record.get("date", "")),
        "subject": _as_text(record.get("subject", "")),
        "creator": _as_text(record.get("creator", "")),
        # Description fragments are joined with spaces so they read as prose.
        "description": _as_text(record.get("description", ""), sep=" "),
    }

    # Fall back to the nested object when the top-level value is empty.
    if not item["title"]:
        item["title"] = _as_text(nested.get("title", ""))
    if not item["date"]:
        item["date"] = _as_text(nested.get("date", ""))

    items.append(item)

metadata_df = pd.DataFrame(items)

# Utility functions for deeper metadata quality analysis
def is_incomplete(value):
    """Return True when *value* should be treated as missing metadata.

    A value counts as missing when it is ``None``/NaN/NaT, when it is a string
    that (ignoring surrounding whitespace) is empty or one of the placeholder
    texts ``"N/A"`` / ``"null"``, or when it is an empty container.

    Non-empty containers (e.g. a list of subjects) are complete. They are
    checked by length rather than with ``pd.isna``, because ``pd.isna``
    returns an element-wise array for list-likes and that array cannot be
    used as a single truth value.
    """
    if value is None:
        return True
    if isinstance(value, str):
        return value.strip() in ("", "N/A", "null")
    if not pd.api.types.is_scalar(value):
        try:
            return len(value) == 0
        except TypeError:
            # Unsized, non-scalar object: it holds something, so not missing.
            return False
    return bool(pd.isna(value))

def is_valid_date(value):
    """Return True when *value* can be parsed into an actual date/time.

    Parsing is delegated to ``pd.to_datetime``. A value is valid only if
    parsing succeeds *and* yields a real timestamp: missing inputs such as
    ``None`` or ``""`` do not raise but produce ``None``/``NaT``, so the
    parsed result is checked as well. For list-like input every entry must
    parse to a real timestamp.
    """
    try:
        parsed = pd.to_datetime(value)
    except (ValueError, TypeError, OverflowError):
        # Unparseable text, unsupported types and out-of-range dates.
        return False
    # np.all handles both the scalar case and element-wise results.
    return bool(np.all(pd.notna(parsed)))

# Render the analysis views: sample table, per-field completeness chart,
# incomplete records, and subject suggestions for records that lack one.
if not metadata_df.empty:
    st.subheader("📦 Retrieved Metadata Sample")
    st.dataframe(metadata_df.head())

    # Flag every cell once (True = missing) and reuse the result for the
    # chart, the incomplete-record listing and the suggestion step.
    incomplete_cells = metadata_df.map(is_incomplete).astype(bool)

    # Metadata completeness analysis (enhanced)
    st.subheader("🧠 Metadata Completeness Analysis")
    completeness = (~incomplete_cells).mean() * 100
    completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
    fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
    st.plotly_chart(fig)

    # Identify incomplete records: any field missing marks the whole row
    incomplete_mask = incomplete_cells.any(axis=1)
    incomplete_records = metadata_df[incomplete_mask]

    st.subheader("⚠️ Records with Incomplete Metadata")
    if not incomplete_records.empty:
        st.dataframe(incomplete_records.astype(str))
    else:
        st.success("All metadata fields are complete in this collection!")

    st.subheader("📌 Identifiers of Items Needing Metadata Updates")
    if not incomplete_records.empty:
        st.write(incomplete_records[['id', 'title']])
    else:
        st.success("All records are complete!")

    st.subheader("✨ Suggested Metadata Enhancements")
    # Missing values are stored as empty strings, not NaN, so presence is
    # decided with the same is_incomplete flags used above.
    has_description = ~incomplete_cells["description"]
    has_subject = ~incomplete_cells["subject"]
    filled_descriptions = metadata_df.loc[has_description, "description"].astype(str)
    if len(filled_descriptions) > 1:
        try:
            suggestions = []
            # Donors can lend a subject; targets have a description to match
            # on but no subject. Keeping the two sets disjoint prevents a
            # record from being matched against itself.
            donors = metadata_df[has_description & has_subject]
            targets = metadata_df[has_description & ~has_subject]
            if not donors.empty and not targets.empty:
                tfidf = TfidfVectorizer(stop_words='english')
                # Learn the vocabulary from every available description.
                tfidf.fit(filled_descriptions)
                donor_matrix = tfidf.transform(donors["description"].astype(str))
                target_matrix = tfidf.transform(targets["description"].astype(str))
                # sims[i, j]: similarity of target i to donor j
                sims = cosine_similarity(target_matrix, donor_matrix)
                best_donor = sims.argmax(axis=1)
                for pos, title in enumerate(targets["title"]):
                    donor_pos = best_donor[pos]
                    # A zero score means no shared terms: nothing to suggest.
                    if sims[pos, donor_pos] > 0:
                        suggestions.append((title, donors.iloc[donor_pos]["subject"]))
            if suggestions:
                suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
                st.dataframe(suggestions_df)
            else:
                st.info("No metadata enhancement suggestions available.")
        except Exception as e:
            # UI boundary: report the failure in the page instead of aborting the run.
            st.error(f"Error generating metadata suggestions: {e}")
    else:
        st.info("Not enough descriptive data to generate metadata suggestions.")
else:
    st.warning("No metadata records found for this collection. Try selecting another one.")