# MetaDiscovery Agent - LOC API with Enhanced Completeness and Quality Analysis
import requests
import pandas as pd
import numpy as np
import streamlit as st
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
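# Usage sketch (the filename is an assumption; it is not given in this view of the file):
#   pip install streamlit requests pandas numpy plotly scikit-learn
#   streamlit run app.py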
# Custom CSS for white background, styled sidebar, banner, and dark grey font
st.markdown("""
<style>
.main {
    background-color: white !important;
    color: #333333 !important;
}
.block-container {
    background-color: white !important;
    color: #333333 !important;
}
section[data-testid="stSidebar"] > div:first-child {
    background-color: #f8f9fa !important;
    padding: 1rem;
    border-radius: 0.5rem;
    color: #333333 !important;
}
.stMarkdown, .stTextInput, .stDataFrame {
    color: #333333 !important;
}
img.banner {
    width: 100%;
    border-radius: 12px;
    margin-bottom: 1rem;
}
</style>
""", unsafe_allow_html=True)
# Optional: Add a banner image (replace with your image URL)
st.markdown('<img src="https://www.loc.gov/static/images/home/home-header.jpg" class="banner">', unsafe_allow_html=True)
# Streamlit app header
st.title("MetaDiscovery Agent for Library of Congress Collections")
st.markdown("""
This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
""")
# Updated collection URLs using the correct LOC API format
collections = {
    "American Revolutionary War Maps": "american+revolutionary+war+maps",
    "Civil War Maps": "civil+war+maps",
    "Women's Suffrage": "women+suffrage",
    "World War I Posters": "world+war+posters"
}
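# Note: the "+" characters in the query values above act as spaces once placed in the URL's
# query string, so "civil+war+maps" is submitted as the search "civil war maps".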
# Sidebar for selecting collection
st.sidebar.markdown("## Settings")
selected = st.sidebar.selectbox("Select a collection", list(collections.keys()))
search_query = collections[selected]
# Use the main search endpoint (most reliable)
collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"
st.sidebar.write(f"Selected Collection: {selected}")
st.sidebar.write(f"API URL: {collection_url}")
# Fetch data from LOC API with spoofed User-Agent header
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/110.0.0.0 Safari/537.36"
}
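# Rough sketch of the JSON the code below expects (inferred from the field access that follows;
# the exact keys returned by loc.gov vary by endpoint and item type):
# {"results": [{"id": "...", "title": "...", "date": "...", "subject": [...], "description": [...]}, ...]}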
try:
    # A timeout keeps a hung request from stalling the app indefinitely.
    response = requests.get(collection_url, headers=headers, timeout=30)
    response.raise_for_status()
    data = response.json()
    if "results" in data:
        records = data.get("results", [])
    elif "items" in data:
        records = data.get("items", [])
    else:
        records = []
        st.error("Unexpected API response structure. No records found.")
    st.write(f"Retrieved {len(records)} records")
except requests.exceptions.RequestException as e:
    st.error(f"API Connection Error: {e}")
    records = []
except ValueError:
    st.error("Failed to parse API response as JSON")
    records = []
# Extract selected metadata fields
items = []
for record in records:
    if isinstance(record, dict):
        description = record.get("description", "")
        if isinstance(description, list):
            description = " ".join([str(d) for d in description])
        item = {
            "id": record.get("id", ""),
            "title": record.get("title", ""),
            "date": record.get("date", ""),
            "subject": ", ".join(record.get("subject", [])) if isinstance(record.get("subject"), list) else record.get("subject", ""),
            "creator": record.get("creator", ""),
            "description": description
        }
        if not item["title"] and "item" in record:
            item["title"] = record.get("item", {}).get("title", "")
        if not item["date"] and "item" in record:
            item["date"] = record.get("item", {}).get("date", "")
        items.append(item)
metadata_df = pd.DataFrame(items)
# Utility functions for deeper metadata quality analysis
def is_incomplete(value):
    return pd.isna(value) or value in ["", "N/A", "null", None]
def is_valid_date(value):
    try:
        pd.to_datetime(value)
        return True
    except Exception:
        return False
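# Illustrative behaviour (is_valid_date is defined for date-quality checks but is not called
# below): is_valid_date("1918-07-04") should return True, while is_valid_date("n.d.") should
# return False because pd.to_datetime cannot parse it.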
if not metadata_df.empty:
    st.subheader("📦 Retrieved Metadata Sample")
    st.dataframe(metadata_df.head())
    # Metadata completeness analysis (enhanced)
    st.subheader("🔧 Metadata Completeness Analysis")
    completeness = metadata_df.map(lambda x: not is_incomplete(x)).mean() * 100
    completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
    fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
    st.plotly_chart(fig)
    # Identify incomplete records
    incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
    incomplete_records = metadata_df[incomplete_mask]
    st.subheader("⚠️ Records with Incomplete Metadata")
    if not incomplete_records.empty:
        st.dataframe(incomplete_records.astype(str))
    else:
        st.success("All metadata fields are complete in this collection!")
    st.subheader("🆔 Identifiers of Items Needing Metadata Updates")
    if not incomplete_records.empty:
        st.write(incomplete_records[['id', 'title']])
    else:
        st.success("All records are complete!")
    st.subheader("✨ Suggested Metadata Enhancements")
    # Suggest a subject for records that lack one by comparing their description against all
    # filled descriptions with TF-IDF vectors and cosine similarity.
    filled_descriptions = metadata_df[metadata_df['description'].notnull()]['description'].astype(str)
    if len(filled_descriptions) > 1:
        try:
            tfidf = TfidfVectorizer(stop_words='english')
            tfidf_matrix = tfidf.fit_transform(filled_descriptions)
            suggestions = []
            for idx, row in incomplete_records.iterrows():
                if is_incomplete(row['subject']) and not is_incomplete(row['description']):
                    desc_vec = tfidf.transform([str(row['description'])])
                    sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
                    top_idx = sims.argmax()
                    # Map the best match back through filled_descriptions' index, since the
                    # TF-IDF corpus is a filtered subset whose positions differ from metadata_df.
                    suggested_subject = metadata_df.loc[filled_descriptions.index[top_idx], 'subject']
                    if pd.notna(suggested_subject) and suggested_subject:
                        suggestions.append((row['title'], suggested_subject))
            if suggestions:
                suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
                st.dataframe(suggestions_df)
            else:
                st.info("No metadata enhancement suggestions available.")
        except Exception as e:
            st.error(f"Error generating metadata suggestions: {e}")
    else:
        st.info("Not enough descriptive data to generate metadata suggestions.")
else:
    st.warning("No metadata records found for this collection. Try selecting another one.")