Spaces:
Running
Running
File size: 9,541 Bytes
d707455 c39747a d707455 91c3d7f d707455 c39747a 08b2694 c39747a 08b2694 c39747a 12da302 c39747a c3039ab c39747a d707455 c39747a 083533c b948611 083533c c39747a c3039ab b948611 bc2c7d0 1ce0089 c39747a 4e04d7b c39747a 8956cd9 c39747a 90247f9 8956cd9 c39747a 90247f9 c39747a 39d75ee c39747a 90247f9 c39747a 90247f9 c39747a 90247f9 c39747a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 |
import html

import matplotlib
import pandas as pd
import plotly.express as px
import requests
import streamlit as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# ------------------- Custom CSS -------------------
# Page-wide style overrides, injected once as a raw <style> element.  Besides
# the app shell and sidebar rules, it defines the helper classes used by the
# HTML snippets rendered further down (custom-table, sidebar-stats,
# sidebar-section, sidebar-links).
CUSTOM_CSS = """
<style>
html, body, [data-testid="stApp"] {
background-color: #1A1A1A !important;
}
.main {
background-color: #D3D3D3 !important;
color: #1A1A1A!important;
}
.block-container {
background-color: gray !important;
color: #1A1A1A !important;
padding-left: 2rem !important;
padding-right: 2rem !important;
}
section[data-testid="stSidebar"] > div:first-child {
background-color: #1A1A1A !important;
color: #FFFFFF !important;
padding: 2rem 1.5rem 1.5rem 1.5rem !important;
border-radius: 12px;
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
font-size: 0.95rem;
}
.custom-table {
background-color: #D3D3D3;
color: #1A1A1A;
font-family: monospace;
padding: 1rem;
border-radius: 8px;
overflow-x: auto;
white-space: pre;
border: 1px solid #ccc;
}
.sidebar-stats {
color: lightgray !important;
font-size: 1.1rem !important;
font-weight: 600;
}
.sidebar-contrast-block {
background-color: #2b2b2b !important;
padding: 1.25rem;
border-radius: 10px;
margin-top: 1.5rem;
}
.sidebar-section h3 {
color: lightgray !important;
font-size: 1.1rem !important;
margin-top: 1.5rem;
}
.sidebar-links a {
color: lightgray !important;
text-decoration: none !important;
}
.sidebar-links a:hover {
text-decoration: underline !important;
}
</style>
"""
# unsafe_allow_html is required so Streamlit emits the <style> tag verbatim.
st.markdown(CUSTOM_CSS, unsafe_allow_html=True)
# ------------------- Banner Image -------------------
# Remote banner shown at the top of the main panel, stretched to its width.
BANNER_URL = "https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg"
st.image(BANNER_URL, use_container_width=True)
# ------------------- App Title & Description -------------------
APP_TITLE = "MetaDiscovery Agent for Library of Congress Collections"
APP_DESCRIPTION = """
This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
"""
st.title(APP_TITLE)
st.markdown(APP_DESCRIPTION)
# ------------------- Collection Selection -------------------
# Maps the label shown in the sidebar to the already '+'-encoded search terms
# that are spliced into the loc.gov query string below.
collections = {
    "American Revolutionary War Maps": "american+revolutionary+war+maps",
    "Civil War Maps": "civil+war+maps",
    "Women's Suffrage": "women+suffrage",
    "World War I Posters": "world+war+posters"
}
collection_names = list(collections)
selected = st.sidebar.selectbox(
    "Select a collection",
    collection_names,
    key="collection_selector",
)
search_query = collections[selected]
# fo=json asks the search endpoint for a JSON response.
collection_url = "https://www.loc.gov/search/?q=" + search_query + "&fo=json"
# ------------------- Placeholders -------------------
# Reserved up front so the stats and the completeness table, which are only
# known after the data has been fetched, still appear above the resource links.
stats_placeholder = st.sidebar.empty()
completeness_placeholder = st.sidebar.empty()
# ------------------- Helpful Resources -------------------
# Static list of external links; styled by the sidebar-section / sidebar-links
# CSS classes.
RESOURCES_HTML = """
<div class="sidebar-section">
<h3>π Helpful Resources</h3>
<div class="sidebar-links">
<ul style='padding-left: 1em'>
<li><a href="https://www.loc.gov/apis/" target="_blank">LOC API Info</a></li>
<li><a href="https://www.loc.gov/" target="_blank">Library of Congress Homepage</a></li>
<li><a href="https://www.loc.gov/collections/" target="_blank">LOC Digital Collections</a></li>
<li><a href="https://www.loc.gov/marc/" target="_blank">MARC Metadata Standards</a></li>
<li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank">LOC Digital Strategy</a></li>
</ul>
</div>
</div>
"""
st.sidebar.markdown(RESOURCES_HTML, unsafe_allow_html=True)
# ------------------- Fetch Data -------------------
# Download the search results for the selected collection.  On any failure the
# user sees an error banner and `records` stays an empty list, so the rest of
# the script falls through to its "no records" branch.
REQUEST_TIMEOUT_SECONDS = 30  # without a timeout a stalled server would hang the app
with st.spinner(f"Fetching data for {selected}..."):
    headers = {"User-Agent": "Mozilla/5.0"}
    records = []
    try:
        response = requests.get(
            collection_url,
            headers=headers,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError):
        # RequestException covers connection errors, timeouts and HTTP error
        # statuses; ValueError covers a body that is not valid JSON on
        # requests versions whose JSON error is not a RequestException.
        st.error("Failed to load data from LOC API")
    else:
        if isinstance(data, dict):
            # The record list is looked up under "results" first, then "items".
            records = data.get("results") or data.get("items") or []
        else:
            # A JSON payload that is not an object cannot hold either key.
            st.error("Failed to load data from LOC API")
# ------------------- Data Preparation -------------------
def _flatten_field(value, sep):
    """Collapse a list-valued field into one string.

    Lists are joined with *sep* after converting every element to ``str`` (so
    non-string elements cannot make ``join`` fail); any other value is
    returned unchanged.
    """
    if isinstance(value, list):
        return sep.join(str(part) for part in value)
    return value


# One flat row per record, with a fixed set of columns.  Missing keys default
# to "" so every row has the same shape.  Every field is flattened, not just
# subject/description, so that no list ends up in a DataFrame cell.
items = [
    {
        "id": _flatten_field(record.get("id", ""), ", "),
        "title": _flatten_field(record.get("title", ""), ", "),
        "date": _flatten_field(record.get("date", ""), ", "),
        "subject": _flatten_field(record.get("subject", ""), ", "),
        "creator": _flatten_field(record.get("creator", ""), ", "),
        # Description parts are joined with spaces so they read as running text.
        "description": _flatten_field(record.get("description", ""), " "),
    }
    for record in records
]
metadata_df = pd.DataFrame(items)
# ------------------- Completeness Logic -------------------
def is_incomplete(value):
    """Return True when a metadata cell holds no usable value.

    A cell counts as incomplete when it is ``None``, a pandas/NumPy missing
    value (NaN, NaT, ``pd.NA``), one of the placeholder strings ``""``,
    ``"N/A"`` or ``"null"``, or an empty list/tuple/set/dict.

    Args:
        value: A single cell value of any type.

    Returns:
        bool: True if the value is missing or a placeholder, False otherwise.
    """
    # Containers are checked first: pd.isna() on a list returns an element-wise
    # array, and using that array in a boolean expression raises ValueError.
    if isinstance(value, (list, tuple, set, dict)):
        return not value
    if value is None:
        return True
    if isinstance(value, str):
        return value in ("", "N/A", "null")
    return bool(pd.isna(value))
# Everything below needs at least one record; the matching ``else`` at the very
# bottom reports an empty collection instead.
if not metadata_df.empty:
    # Classify every cell once and reuse the boolean frames for all statistics.
    incomplete_cells = metadata_df.map(is_incomplete)
    filled_cells = ~incomplete_cells
    incomplete_mask = incomplete_cells.any(axis=1)  # rows with at least one gap
    incomplete_count = incomplete_mask.sum()
    total_fields = metadata_df.size
    filled_fields = filled_cells.sum().sum()
    overall_percent = (filled_fields / total_fields) * 100
    # Per-column share of filled cells, in percent.
    completeness = filled_cells.mean() * 100
    completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
    completeness_table = completeness_df.set_index("Field")

    # ------------------- Quick Stats -------------------
    # NOTE(review): the leading "π" here and in the "Metadata Completeness
    # Analysis" heading looks like a mis-decoded emoji; the intended glyph
    # cannot be recovered from the text, so both are left unchanged.
    stats_html = f"""
<div class="sidebar-stats">
<h3 style="color: lightgray;">π Quick Stats</h3>
<p style="color:lightgray;">Total Records: <b>{len(metadata_df)}</b></p>
<p style="color:lightgray;">Incomplete Records: <b>{incomplete_count}</b></p>
<p style="color:lightgray;">Overall Metadata Completeness: <b>{overall_percent:.1f}%</b></p>
</div>
"""
    stats_placeholder.markdown(stats_html, unsafe_allow_html=True)

    # ------------------- Field Completeness Table -------------------
    # An st.empty() placeholder holds a single element, so writing the heading
    # and the table straight into it would leave only the last element visible.
    # .container() groups them inside the placeholder.
    with completeness_placeholder.container():
        # The heading is a self-contained element: an HTML tag opened in one
        # st.markdown call cannot wrap elements rendered by later calls.
        st.markdown("""
<div style='
background-color: #2e2e2e;
padding: 1.2rem;
border-radius: 10px;
margin-top: 1.5rem;
color: lightgray;
'>
<h4 style='margin-bottom: 1rem;'>Field Completeness Breakdown</h4>
</div>
""", unsafe_allow_html=True)
        st.dataframe(
            completeness_table.style.background_gradient(cmap="Greens").format("{:.1f}%"),
            use_container_width=True,
            height=240
        )

    # ------------------- Main Panel -------------------
    st.subheader("Retrieved Metadata Sample")
    st.dataframe(metadata_df.head())

    # Bar chart of the per-field completeness computed above.
    st.subheader("π Metadata Completeness Analysis")
    fig = px.bar(
        completeness_df,
        x="Field",
        y="Completeness (%)",
        title="Metadata Completeness by Field",
        labels={"Field": "Metadata Field", "Completeness (%)": "Completeness (%)"}
    )
    st.plotly_chart(fig, use_container_width=True)

    # ------------------- Metadata Suggestions -------------------
    # For records that lack a subject but have a description, propose the
    # subject of the reference record whose description is most similar
    # (TF-IDF vectors compared by cosine similarity).
    st.subheader("✨ Suggested Metadata Enhancements")
    # Missing values are stored as "" rather than NaN, so gaps are detected
    # with the is_incomplete() classification instead of notnull()/isna().
    has_description = filled_cells["description"]
    incomplete_with_desc = metadata_df[incomplete_mask & has_description]
    reference_df = metadata_df[filled_cells["subject"] & has_description]
    if len(incomplete_with_desc) > 1 and len(reference_df) > 1:
        try:
            tfidf = TfidfVectorizer(stop_words='english')
            tfidf_matrix = tfidf.fit_transform(reference_df['description'].astype(str))
            suggestions = []
            for _, row in incomplete_with_desc.iterrows():
                # Rows that are incomplete in some other field already have a subject.
                if not is_incomplete(row['subject']):
                    continue
                desc_vec = tfidf.transform([str(row['description'])])
                sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
                top_idx = sims.argmax()
                # A best score of zero means no shared vocabulary at all;
                # argmax would then just pick the first reference row.
                if sims[top_idx] <= 0:
                    continue
                suggestions.append((row['title'], reference_df.iloc[top_idx]['subject']))
            if suggestions:
                suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
                # The table text comes from the remote API and is placed inside
                # raw HTML, so markup characters are escaped first.
                table_text = html.escape(suggestions_df.to_markdown(index=False), quote=False)
                st.markdown("<div class='custom-table'>" + table_text + "</div>", unsafe_allow_html=True)
            else:
                st.info("No metadata enhancement suggestions available.")
        except Exception as e:
            # Top-level UI boundary: surface the problem instead of crashing the page.
            st.error(f"Error generating suggestions: {e}")
    else:
        st.info("Not enough descriptive data to generate metadata suggestions.")
else:
    st.warning("⚠️ No metadata records found for this collection.")
|