# MetaDiscovery Agent - Phase 1: LOC API Integration and Metadata Gap Analysis
#
# Streamlit script: lets the user pick a Library of Congress collection,
# fetches its JSON listing from loc.gov, and reports how completely a fixed
# set of metadata fields is filled in across the returned records.
import pandas as pd
import plotly.express as px
import requests
import streamlit as st

# Seconds to wait on loc.gov before giving up; without a timeout a stalled
# connection would block the script run indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# Fields copied out of each record and scored for completeness. The same list
# fixes the DataFrame column order.
METADATA_FIELDS = ["title", "date", "subject", "creator", "description"]

# Streamlit app header
st.title("MetaDiscovery Agent for Library of Congress Collections")
st.markdown("""
This tool connects to the LOC API, retrieves metadata from a selected collection,
and performs an initial analysis of metadata completeness.
""")

# Predefined LOC collections: display name -> path segment used in the URL
collections = {
    "American Revolutionary War Maps": "american-revolutionary-war-maps",
    "Civil War Maps": "civil-war-maps",
    "Women’s Suffrage": "womens-suffrage",
    "World War I Posters": "world-war-i-posters",
}

# Sidebar for selecting collection
st.sidebar.markdown("## Settings")
selected = st.sidebar.selectbox("Select a collection", list(collections.keys()))
collection_path = collections[selected]
collection_url = f"https://www.loc.gov/collections/{collection_path}/?fo=json"

# Display selected collection
st.sidebar.write(f"Selected Collection: {selected}")

# Fetch data from LOC API. Network failures, HTTP error statuses and bodies
# that are not valid JSON are reported in the app instead of surfacing as a
# raw traceback.
try:
    response = requests.get(collection_url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    data = response.json()
except (requests.RequestException, ValueError) as exc:
    st.error(f"Could not retrieve metadata from the LOC API: {exc}")
    st.stop()

# Parse metadata records; tolerate a payload that is not a JSON object or
# that has a missing/null "results" entry.
records = data.get("results") if isinstance(data, dict) else None
if not records:
    st.warning("The LOC API returned no records for this collection.")
    st.stop()

# Extract selected metadata fields; a field absent from a record becomes None
# so it is counted as missing below.
items = [
    {field: record.get(field) for field in METADATA_FIELDS}
    for record in records
]

# Create DataFrame
metadata_df = pd.DataFrame(items, columns=METADATA_FIELDS)
st.subheader("📦 Retrieved Metadata Sample")
st.dataframe(metadata_df.head())

# Metadata completeness analysis: percentage of non-null values per field
st.subheader("🧠 Metadata Completeness Analysis")
completeness = metadata_df.notnull().mean() * 100
completeness_df = pd.DataFrame(
    {"Field": completeness.index, "Completeness (%)": completeness.values}
)

# Plot completeness
fig = px.bar(
    completeness_df,
    x="Field",
    y="Completeness (%)",
    title="Metadata Completeness by Field",
)
st.plotly_chart(fig)

# List records with missing values (any of the selected fields is null)
st.subheader("⚠️ Records with Incomplete Metadata")
incomplete_records = metadata_df[metadata_df.isnull().any(axis=1)]
st.dataframe(incomplete_records)