File size: 9,541 Bytes
d707455
 
 
c39747a
d707455
91c3d7f
 
d707455
c39747a
08b2694
c39747a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08b2694
 
c39747a
12da302
c39747a
 
c3039ab
c39747a
 
 
 
d707455
c39747a
083533c
b948611
 
 
 
083533c
c39747a
c3039ab
b948611
bc2c7d0
1ce0089
c39747a
 
 
4e04d7b
c39747a
8956cd9
c39747a
 
 
 
 
 
 
 
 
 
 
90247f9
8956cd9
 
c39747a
 
90247f9
 
 
 
 
c39747a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39d75ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c39747a
 
 
 
 
 
 
 
 
 
 
 
 
 
90247f9
 
 
c39747a
90247f9
c39747a
 
 
 
 
 
 
90247f9
c39747a
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
import html

import matplotlib
import pandas as pd
import plotly.express as px
import requests
import streamlit as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# ------------------- Custom CSS -------------------
# Stylesheet injected as raw HTML: page/container colours, sidebar appearance,
# and the helper classes (.custom-table, .sidebar-stats, .sidebar-section,
# .sidebar-links, ...) referenced by the HTML snippets rendered further down.
_CUSTOM_CSS = """
    <style>
        html, body, [data-testid="stApp"] {
            background-color: #1A1A1A !important;
        }
        .main {
            background-color: #D3D3D3 !important;
            color: #1A1A1A!important;
        }
        .block-container {
            background-color: gray !important;
            color: #1A1A1A !important;
            padding-left: 2rem !important;
            padding-right: 2rem !important;
        }
        section[data-testid="stSidebar"] > div:first-child {
            background-color: #1A1A1A !important;
            color: #FFFFFF !important;
            padding: 2rem 1.5rem 1.5rem 1.5rem !important;
            border-radius: 12px;
            box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
            font-size: 0.95rem;
        }
        .custom-table {
            background-color: #D3D3D3;
            color: #1A1A1A;
            font-family: monospace;
            padding: 1rem;
            border-radius: 8px;
            overflow-x: auto;
            white-space: pre;
            border: 1px solid #ccc;
        }
        .sidebar-stats {
            color: lightgray !important;
            font-size: 1.1rem !important;
            font-weight: 600;
        }
        .sidebar-contrast-block {
            background-color: #2b2b2b !important;
            padding: 1.25rem;
            border-radius: 10px;
            margin-top: 1.5rem;
        }
        .sidebar-section h3 {
            color: lightgray !important;
            font-size: 1.1rem !important;
            margin-top: 1.5rem;
        }
        .sidebar-links a {
            color: lightgray !important;
            text-decoration: none !important;
        }
        .sidebar-links a:hover {
            text-decoration: underline !important;
        }
    </style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

# ------------------- Banner Image -------------------
# Remote banner shown at the top of the main panel, stretched to its width.
_BANNER_URL = "https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg"
st.image(_BANNER_URL, use_container_width=True)

# ------------------- App Title & Description -------------------
_APP_TITLE = "MetaDiscovery Agent for Library of Congress Collections"
_APP_DESCRIPTION = """
This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
"""
st.title(_APP_TITLE)
st.markdown(_APP_DESCRIPTION)

# ------------------- Collection Selection -------------------
# Display name -> search terms already encoded for a URL query string
# ("+" separates the words), so they can be dropped straight into the URL.
collections = {
    "American Revolutionary War Maps": "american+revolutionary+war+maps",
    "Civil War Maps": "civil+war+maps",
    "Women's Suffrage": "women+suffrage",
    "World War I Posters": "world+war+posters"
}

selected = st.sidebar.selectbox("Select a collection", list(collections), key="collection_selector")
search_query = collections[selected]
# "fo=json" asks loc.gov for a JSON response instead of an HTML page.
collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"

# ------------------- Placeholders -------------------
# Sidebar slots reserved now and filled in after the metadata is analysed,
# so the stats appear above the static resource links added below.
stats_placeholder = st.sidebar.empty()
completeness_placeholder = st.sidebar.empty()

# ------------------- Helpful Resources -------------------
# The heading emoji was stored as mojibake (UTF-8 bytes decoded with the wrong
# codec); it is restored to the intended link symbol here.
st.sidebar.markdown("""
<div class="sidebar-section">
  <h3>🔗 Helpful Resources</h3>
  <div class="sidebar-links">
    <ul style='padding-left: 1em'>
      <li><a href="https://www.loc.gov/apis/" target="_blank">LOC API Info</a></li>
      <li><a href="https://www.loc.gov/" target="_blank">Library of Congress Homepage</a></li>
      <li><a href="https://www.loc.gov/collections/" target="_blank">LOC Digital Collections</a></li>
      <li><a href="https://www.loc.gov/marc/" target="_blank">MARC Metadata Standards</a></li>
      <li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank">LOC Digital Strategy</a></li>
    </ul>
  </div>
</div>
""", unsafe_allow_html=True)

# ------------------- Fetch Data -------------------
# Download the search results for the selected collection into `records`.
# On a network/HTTP error, an undecodable body, or an unexpected payload shape
# the app shows an error and carries on with an empty record list.
with st.spinner(f"Fetching data for {selected}..."):
    headers = {"User-Agent": "Mozilla/5.0"}
    records = []
    try:
        # Without a timeout a stalled connection would block the app forever.
        response = requests.get(collection_url, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError):
        # Only request/decoding failures are handled here; a bare `except:`
        # would also trap KeyboardInterrupt and SystemExit.
        st.error("Failed to load data from LOC API")
    else:
        if isinstance(data, dict):
            # The record list is read from "results", falling back to "items".
            records = data.get("results") or data.get("items") or []
            if not isinstance(records, list):
                records = []
        else:
            # A JSON body that is not an object has no record list to read.
            st.error("Failed to load data from LOC API")

# ------------------- Data Preparation -------------------
def _as_text(value, separator):
    """Collapse a list into one string; return any other value unchanged.

    List items are converted with ``str`` before joining, so lists holding
    non-string items do not raise.
    """
    if isinstance(value, list):
        return separator.join(str(part) for part in value)
    return value


# Build one flat row per record. Every field is normalised, not just
# subject/description: a list left in a cell makes element-wise checks based
# on pd.isna() fail, because pd.isna() returns an array for list input.
items = []
for record in records:
    if not isinstance(record, dict):
        # Skip entries that cannot be queried with .get().
        continue
    items.append({
        "id": _as_text(record.get("id", ""), ", "),
        "title": _as_text(record.get("title", ""), ", "),
        "date": _as_text(record.get("date", ""), ", "),
        "subject": _as_text(record.get("subject", ""), ", "),
        "creator": _as_text(record.get("creator", ""), ", "),
        # Description fragments are joined with a space, as running text.
        "description": _as_text(record.get("description", ""), " "),
    })

metadata_df = pd.DataFrame(items)

# ------------------- Completeness Logic -------------------
def is_incomplete(value):
    """Return True when *value* should count as a missing metadata field.

    Missing means: ``None``, a pandas/NumPy null (NaN, NaT, ``pd.NA``), one of
    the placeholder strings ``""``, ``"N/A"`` or ``"null"``, or an empty
    list-like container. A non-empty container counts as present.

    Args:
        value: A single cell value of any type.

    Returns:
        bool: True if the value is missing, False otherwise.
    """
    if value is None:
        return True
    if isinstance(value, str):
        return value in ("", "N/A", "null")
    # pd.isna() returns an array for list-like input, and using that array in
    # a boolean context raises ValueError, so containers are judged by length.
    if pd.api.types.is_list_like(value):
        # Length-less iterables cannot be inspected without consuming them;
        # they are treated as present.
        return hasattr(value, "__len__") and len(value) == 0
    return bool(pd.isna(value))

# Render the completeness statistics, the per-field chart and the subject
# suggestions when records were retrieved; otherwise show a warning.
if not metadata_df.empty:
    # Classify every cell once and reuse the boolean grid for all statistics.
    missing = metadata_df.map(is_incomplete).astype(bool)
    present = ~missing

    incomplete_mask = missing.any(axis=1)  # rows with at least one missing field
    incomplete_count = int(incomplete_mask.sum())
    total_fields = metadata_df.size
    filled_fields = int(present.sum().sum())
    overall_percent = (filled_fields / total_fields) * 100
    completeness = present.mean() * 100  # per-column share of filled cells
    completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
    completeness_table = completeness_df.set_index("Field")

    # ------------------- Quick Stats -------------------
    stats_html = f"""
    <div class="sidebar-stats">
        <h3 style="color: lightgray;">📊 Quick Stats</h3>
        <p style="color:lightgray;">Total Records: <b>{len(metadata_df)}</b></p>
        <p style="color:lightgray;">Incomplete Records: <b>{incomplete_count}</b></p>
        <p style="color:lightgray;">Overall Metadata Completeness: <b>{overall_percent:.1f}%</b></p>
    </div>
    """
    stats_placeholder.markdown(stats_html, unsafe_allow_html=True)

    # ------------------- Field Completeness Table -------------------
    # An st.empty() slot holds a single element: writing several elements
    # directly into it makes each one replace the previous. Grouping them in
    # .container() keeps both the heading and the table visible.
    with completeness_placeholder.container():
        # The heading box is closed within the same call; an element rendered
        # by a later call cannot be wrapped by a <div> opened here.
        st.markdown("""
            <div style='
                background-color: #2e2e2e;
                padding: 1.2rem;
                border-radius: 10px;
                margin-top: 1.5rem;
                color: lightgray;
            '>
            <h4 style='margin-bottom: 1rem;'>Field Completeness Breakdown</h4>
            </div>
        """, unsafe_allow_html=True)
        st.dataframe(
            completeness_table.style.background_gradient(cmap="Greens").format("{:.1f}%"),
            use_container_width=True,
            height=240
        )

    # ------------------- Main Panel -------------------
    st.subheader("Retrieved Metadata Sample")
    st.dataframe(metadata_df.head())

    # Metadata completeness analysis (enhanced)
    st.subheader("📊 Metadata Completeness Analysis")

    fig = px.bar(
        completeness_df,
        x="Field",
        y="Completeness (%)",
        title="Metadata Completeness by Field",
        labels={"Field": "Metadata Field", "Completeness (%)": "Completeness (%)"}
    )
    st.plotly_chart(fig, use_container_width=True)

    # ------------------- Metadata Suggestions -------------------
    # For each record lacking a subject, propose the subject of the record
    # whose description is most similar (TF-IDF + cosine similarity).
    st.subheader("✨ Suggested Metadata Enhancements")
    # Missing values are stored as empty strings, which isna()/notnull() do
    # not detect, so the is_incomplete() grid is used for the filters instead.
    has_description = present["description"]
    has_subject = present["subject"]
    incomplete_with_desc = metadata_df[incomplete_mask & has_description]
    reference_df = metadata_df[has_subject & has_description]

    if len(incomplete_with_desc) > 1 and len(reference_df) > 1:
        try:
            tfidf = TfidfVectorizer(stop_words='english')
            tfidf_matrix = tfidf.fit_transform(reference_df['description'].astype(str))
            suggestions = []
            for _, row in incomplete_with_desc.iterrows():
                if not is_incomplete(row['subject']):
                    # The row is incomplete in some other field; its subject is fine.
                    continue
                desc_vec = tfidf.transform([str(row['description'])])
                sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
                top_idx = sims.argmax()
                if sims[top_idx] <= 0:
                    # No shared vocabulary: argmax would just pick the first
                    # reference row, which is not a meaningful suggestion.
                    continue
                suggestions.append((row['title'], reference_df.iloc[top_idx]['subject']))
            if suggestions:
                suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
                # Titles and subjects come from the remote API; escape them
                # before embedding the table in raw HTML.
                table_text = html.escape(suggestions_df.to_markdown(index=False))
                st.markdown("<div class='custom-table'>" + table_text + "</div>", unsafe_allow_html=True)
            else:
                st.info("No metadata enhancement suggestions available.")
        except Exception as e:
            # UI boundary: report the failure instead of crashing the page.
            st.error(f"Error generating suggestions: {e}")
    else:
        st.info("Not enough descriptive data to generate metadata suggestions.")
else:
    st.warning("⚠️ No metadata records found for this collection.")