File size: 6,198 Bytes
91c3d7f
d707455
 
61165d4
d707455
 
91c3d7f
 
d707455
 
 
 
 
91c3d7f
d707455
 
a4af329
083533c
a4af329
 
 
 
083533c
 
 
d707455
083533c
a4af329
083533c
a4af329
 
083533c
61165d4
 
15c7e45
 
 
 
 
61165d4
15c7e45
61165d4
15c7e45
61165d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d707455
 
61165d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d707455
 
 
91c3d7f
 
 
 
61165d4
91c3d7f
 
 
 
61165d4
91c3d7f
 
 
61165d4
91c3d7f
 
 
61165d4
 
 
 
 
21b5793
 
 
 
 
 
61165d4
 
91c3d7f
61165d4
 
91c3d7f
61165d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91c3d7f
61165d4
91c3d7f
61165d4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# MetaDiscovery Agent - LOC API with Collection Selector and Search Endpoint + Enhanced Features
import requests
import pandas as pd
import numpy as np
import streamlit as st
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Streamlit app header: title plus a short Markdown description of what
# the tool does.
st.title("MetaDiscovery Agent for Library of Congress Collections")

_INTRO = """
This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
"""
st.markdown(_INTRO)

# Catalog of selectable collections: display name -> LOC API endpoint path
# and pre-encoded ('+'-separated) query string, in the format the LOC
# search API expects.
_COLLECTION_ENTRIES = [
    ("American Revolutionary War Maps", "maps", "american+revolutionary+war"),
    ("Civil War Maps", "maps", "civil+war"),
    ("Women's Suffrage", "collection", "women+suffrage"),
    ("World War I Posters", "pictures", "world+war+I+posters"),
]
collections = {
    name: {"path": path, "query": query}
    for name, path, query in _COLLECTION_ENTRIES
}

# Sidebar: let the user pick which collection to analyze.
st.sidebar.markdown("## Settings")
selected = st.sidebar.selectbox("Select a collection", list(collections.keys()))
collection_info = collections[selected]

# Assemble the LOC JSON search URL for the chosen collection and echo the
# selection back in the sidebar.
api_path = collection_info["path"]
api_query = collection_info["query"]
collection_url = f"https://www.loc.gov/{api_path}/search/?q={api_query}&fo=json"
st.sidebar.write(f"Selected Collection: {selected}")
st.sidebar.write(f"API URL: {collection_url}")

# Fetch data from LOC API with a browser-like User-Agent (some LOC
# endpoints reject requests without one).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/110.0.0.0 Safari/537.36"
}

# Fetch and parse the collection metadata. `records` is guaranteed to be
# defined after this block (empty list on any failure) so downstream code
# can proceed unconditionally.
try:
    # timeout avoids hanging the app forever on a stalled connection
    response = requests.get(collection_url, headers=headers, timeout=30)
    response.raise_for_status()  # Raise exception for 4XX/5XX responses

    # BUG FIX: the original never parsed the response body, leaving `data`
    # undefined (an uncaught NameError). json() raises a ValueError
    # subclass on non-JSON bodies, handled below.
    data = response.json()

    # Handle both possible response structures ("results" vs "items")
    if "results" in data:
        records = data.get("results", [])
    elif "items" in data:
        records = data.get("items", [])
    else:
        records = []
        st.error("Unexpected API response structure. No records found.")

    st.write(f"Retrieved {len(records)} records")

except requests.exceptions.RequestException as e:
    st.error(f"API Connection Error: {e}")
    records = []
except ValueError:
    st.error("Failed to parse API response as JSON")
    records = []

# Flatten each raw record (a dict) into a fixed set of metadata fields.
# Non-dict entries are skipped.
items = []
for record in records:
    if not isinstance(record, dict):
        continue

    # "subject" may arrive as a list; join it into one comma-separated
    # string. Otherwise keep whatever scalar the record carries.
    raw_subject = record.get("subject")
    row = {
        "id": record.get("id", ""),
        "title": record.get("title", ""),
        "date": record.get("date", ""),
        "subject": ", ".join(raw_subject) if isinstance(raw_subject, list) else record.get("subject", ""),
        "creator": record.get("creator", ""),
        "description": record.get("description", ""),
    }

    # LOC responses often nest descriptive fields under an "item" sub-dict;
    # fall back to it when the top-level title/date are empty.
    if "item" in record:
        nested = record.get("item", {})
        if not row["title"]:
            row["title"] = nested.get("title", "")
        if not row["date"]:
            row["date"] = nested.get("date", "")

    items.append(row)

# Assemble the flattened rows into a DataFrame for analysis.
metadata_df = pd.DataFrame(items)

if not metadata_df.empty:
    # BUG FIX: the extraction step fills absent fields with "" which
    # notnull() counts as present, so completeness always read 100% and no
    # record was ever flagged incomplete. Treat empty strings as missing.
    metadata_df = metadata_df.replace("", np.nan)

    st.subheader("πŸ“¦ Retrieved Metadata Sample")
    st.dataframe(metadata_df.head())

    # Metadata completeness analysis: per-field percentage of non-missing values
    st.subheader("🧠 Metadata Completeness Analysis")
    completeness = metadata_df.notnull().mean() * 100
    completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})

    # Plot completeness per field
    fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
    st.plotly_chart(fig)

    # List records with at least one missing field
    st.subheader("⚠️ Records with Incomplete Metadata")
    incomplete_records = metadata_df[metadata_df.isnull().any(axis=1)]
    if not incomplete_records.empty:
        st.dataframe(incomplete_records)
    else:
        st.success("All metadata fields are complete in this collection!")

    # Show exact items that need updates
    st.subheader("πŸ“Œ Identifiers of Items Needing Metadata Updates")
    if not incomplete_records.empty:
        st.write(incomplete_records[['id', 'title']])
    else:
        st.success("All records are complete!")

    # Suggest subjects for incomplete records via TF-IDF similarity of
    # their descriptions to records that already have a subject.
    st.subheader("✨ Suggested Metadata Enhancements")

    # BUG FIX: donor rows must have BOTH a description (to compare against)
    # and a subject (to donate). The original fit on all descriptions, so
    # an incomplete row's best match was itself (similarity 1.0), donating
    # its own missing subject — no suggestion was ever produced.
    donors = metadata_df[metadata_df['description'].notnull() & metadata_df['subject'].notnull()]
    donor_descriptions = donors['description'].astype(str)

    if len(donor_descriptions) > 1:
        try:
            tfidf = TfidfVectorizer(stop_words='english')
            tfidf_matrix = tfidf.fit_transform(donor_descriptions)

            suggestions = []
            for idx, row in incomplete_records.iterrows():
                if pd.isna(row['subject']) and pd.notna(row['description']):
                    desc_vec = tfidf.transform([str(row['description'])])
                    sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
                    # BUG FIX: argmax is a *position* within the donor
                    # subset; map it back to the donor row's label before
                    # looking up the subject (the original indexed the full
                    # DataFrame positionally, returning the wrong row).
                    donor_label = donors.index[sims.argmax()]
                    suggested_subject = donors.loc[donor_label, 'subject']
                    if pd.notna(suggested_subject) and suggested_subject:  # Only add valid suggestions
                        suggestions.append((row['title'], suggested_subject))

            if suggestions:
                suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
                st.dataframe(suggestions_df)
            else:
                st.info("No metadata enhancement suggestions available.")
        except Exception as e:
            st.error(f"Error generating metadata suggestions: {e}")
    else:
        st.info("Not enough descriptive data to generate metadata suggestions.")
else:
    st.warning("No metadata records found for this collection. Try selecting another one.")