Spaces:
Running
Running
File size: 6,198 Bytes
91c3d7f d707455 61165d4 d707455 91c3d7f d707455 91c3d7f d707455 a4af329 083533c a4af329 083533c d707455 083533c a4af329 083533c a4af329 083533c 61165d4 15c7e45 61165d4 15c7e45 61165d4 15c7e45 61165d4 d707455 61165d4 d707455 91c3d7f 61165d4 91c3d7f 61165d4 91c3d7f 61165d4 91c3d7f 61165d4 21b5793 61165d4 91c3d7f 61165d4 91c3d7f 61165d4 91c3d7f 61165d4 91c3d7f 61165d4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
# MetaDiscovery Agent - LOC API with Collection Selector and Search Endpoint + Enhanced Features
import requests
import pandas as pd
import numpy as np
import streamlit as st
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Page header: the app title followed by a short introduction rendered as
# markdown directly beneath it.
page_title = "MetaDiscovery Agent for Library of Congress Collections"
page_intro = """
This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
"""
st.title(page_title)
st.markdown(page_intro)
# Collections offered in the sidebar, keyed by their display name. Each entry
# carries the site section used in the URL ("path") and an already
# plus-encoded search string ("query").
collections = {
    "American Revolutionary War Maps": dict(path="maps", query="american+revolutionary+war"),
    "Civil War Maps": dict(path="maps", query="civil+war"),
    "Women's Suffrage": dict(path="collection", query="women+suffrage"),
    "World War I Posters": dict(path="pictures", query="world+war+I+posters"),
}

# Sidebar: let the user pick one of the collections above.
st.sidebar.markdown("## Settings")
collection_names = list(collections)
selected = st.sidebar.selectbox("Select a collection", collection_names)
collection_info = collections[selected]

# Build the search URL for the chosen collection; the code that consumes the
# response parses it as JSON, which is what the fo=json parameter requests.
section_path = collection_info["path"]
search_query = collection_info["query"]
collection_url = f"https://www.loc.gov/{section_path}/search/?q={search_query}&fo=json"

# Echo the selection and the exact URL so the user can inspect the request.
st.sidebar.write(f"Selected Collection: {selected}")
st.sidebar.write(f"API URL: {collection_url}")
# Fetch data from LOC API with spoofed User-Agent header
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/110.0.0.0 Safari/537.36"
}

# Upper bound (seconds) on how long a single request may block. Without a
# timeout, requests waits indefinitely and the app would hang on a stalled
# connection.
REQUEST_TIMEOUT_SECONDS = 30

# `records` always ends up as a list so the extraction step that follows can
# iterate it unconditionally, whatever went wrong here.
records = []
try:
    response = requests.get(
        collection_url,
        headers=headers,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    response.raise_for_status()  # Raise exception for 4XX/5XX responses
except requests.exceptions.RequestException as e:
    st.error(f"API Connection Error: {e}")
else:
    # Parse in a separate try block: several requests exceptions also derive
    # from ValueError, so one combined try/except could report a network or
    # URL problem as a JSON problem (or the other way round).
    try:
        data = response.json()
    except ValueError:
        st.error("Failed to parse API response as JSON")
    else:
        # Handle both possible response structures. The payload is only
        # searched for keys when it is a JSON object.
        if isinstance(data, dict) and "results" in data:
            records = data["results"]
        elif isinstance(data, dict) and "items" in data:
            records = data["items"]
        else:
            st.error("Unexpected API response structure. No records found.")

        # A null or non-list value under the key would break len() and the
        # later iteration; treat it as "no records".
        if not isinstance(records, list):
            records = []
        st.write(f"Retrieved {len(records)} records")
# Extract selected metadata fields with proper path traversal

# Columns of the resulting table, in display order.
METADATA_FIELDS = ("id", "title", "date", "subject", "creator", "description")


def _clean_field(value):
    """Normalise one raw metadata value for the table.

    Returns None for anything that carries no information (None, a blank
    string, an empty list) so that null-based checks on the DataFrame treat
    the field as missing. Lists and tuples are flattened to a single
    ", "-joined string; entries are passed through str() so non-string
    entries cannot break the join. Every other value is returned unchanged.
    """
    if value is None:
        return None
    if isinstance(value, (list, tuple)):
        parts = [
            str(part)
            for part in value
            if part is not None and str(part).strip()
        ]
        return ", ".join(parts) if parts else None
    if isinstance(value, str) and not value.strip():
        return None
    return value


def _record_to_item(record):
    """Build one table row (a dict keyed by METADATA_FIELDS) from a record dict.

    Fields are read from the top level of the record. Title and date fall
    back to the nested "item" object when the top-level value is missing.
    """
    item = {field: _clean_field(record.get(field)) for field in METADATA_FIELDS}

    # For nested field access: only use "item" when it really is a dict, so a
    # null or string value there cannot raise AttributeError.
    nested = record.get("item")
    if isinstance(nested, dict):
        for field in ("title", "date"):
            if item[field] is None:
                item[field] = _clean_field(nested.get(field))
    return item


# Records that are not dicts are skipped, since there are no fields to read.
items = [_record_to_item(record) for record in records if isinstance(record, dict)]

# Create DataFrame. Passing the columns explicitly fixes their order.
metadata_df = pd.DataFrame(items, columns=list(METADATA_FIELDS))
# Analysis and reporting for the retrieved metadata table. Everything below
# treats null cells in `metadata_df` as "missing".
# NOTE(review): the symbols at the start of the subheader strings look like
# mis-encoded emoji; they are kept exactly as found - confirm the intended
# characters before changing them.
if not metadata_df.empty:
    st.subheader("π¦ Retrieved Metadata Sample")
    st.dataframe(metadata_df.head())

    # Metadata completeness analysis: percentage of non-null cells per column.
    st.subheader("π§ Metadata Completeness Analysis")
    completeness = metadata_df.notnull().mean() * 100
    completeness_df = pd.DataFrame(
        {"Field": completeness.index, "Completeness (%)": completeness.values}
    )

    # Plot completeness
    fig = px.bar(
        completeness_df,
        x="Field",
        y="Completeness (%)",
        title="Metadata Completeness by Field",
    )
    st.plotly_chart(fig)

    # List records with missing values (at least one null cell in the row).
    st.subheader("β οΈ Records with Incomplete Metadata")
    incomplete_records = metadata_df[metadata_df.isnull().any(axis=1)]
    if not incomplete_records.empty:
        st.dataframe(incomplete_records)
    else:
        st.success("All metadata fields are complete in this collection!")

    # Show exact items that need updates
    st.subheader("π Identifiers of Items Needing Metadata Updates")
    if not incomplete_records.empty:
        st.write(incomplete_records[["id", "title"]])
    else:
        st.success("All records are complete!")

    # Suggest metadata using text similarity with better error handling
    st.subheader("β¨ Suggested Metadata Enhancements")

    # Only process if we have descriptions and enough data
    described = metadata_df[metadata_df["description"].notnull()]
    if len(described) > 1:
        try:
            # Donors: described records that already have a usable subject.
            # Targets: described records whose subject is missing.
            # Keeping the two sets disjoint means a target can never be
            # matched with itself and have its own missing subject suggested.
            donors = described[described["subject"].notnull()]
            donors = donors[donors["subject"].astype(str).str.strip() != ""]
            targets = described[described["subject"].isnull()]

            suggestions = []
            if not donors.empty and not targets.empty:
                # Fit the vocabulary on every available description, then
                # project donors and targets into that same vector space.
                tfidf = TfidfVectorizer(stop_words="english")
                tfidf.fit(described["description"].astype(str))
                donor_vectors = tfidf.transform(donors["description"].astype(str))
                target_vectors = tfidf.transform(targets["description"].astype(str))

                # similarity[i, j] compares target i with donor j. Both axes
                # are positional within `targets` / `donors`, so the argmax
                # result is looked up in those same frames.
                similarity = cosine_similarity(target_vectors, donor_vectors)
                best_donor = similarity.argmax(axis=1)
                for target_pos, donor_pos in enumerate(best_donor):
                    # A zero score means no shared terms at all; argmax would
                    # then just return the first donor, which is not a real
                    # suggestion.
                    if similarity[target_pos, donor_pos] <= 0:
                        continue
                    suggestions.append(
                        (
                            targets["title"].iloc[target_pos],
                            donors["subject"].iloc[donor_pos],
                        )
                    )

            if suggestions:
                suggestions_df = pd.DataFrame(
                    suggestions, columns=["Title", "Suggested Subject"]
                )
                st.dataframe(suggestions_df)
            else:
                st.info("No metadata enhancement suggestions available.")
        except Exception as e:
            # Deliberately broad: this is a best-effort UI feature, so a
            # failure is shown on the page instead of aborting the script.
            st.error(f"Error generating metadata suggestions: {e}")
    else:
        st.info("Not enough descriptive data to generate metadata suggestions.")
else:
    st.warning("No metadata records found for this collection. Try selecting another one.")