Spaces:

CCockrum
/

LOC-Metadate-Analyzer

Running

App Files Files Community

CCockrum committed on Apr 25

Commit

3c67c1b

verified ·

1 Parent(s): 6080190

Update app.py

Browse files

Files changed (1) hide show

app.py +277 -158

app.py CHANGED Viewed

@@ -1,37 +1,69 @@
 import requests
 import pandas as pd
 import streamlit as st
 import matplotlib
 import plotly.express as px
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-# ------------------- Custom CSS -------------------
 st.markdown("""
     <style>
-        html, body, [data-testid="stApp"] {
-            background-color: #1A1A1A !important;
-        }
         .main {
             background-color: #D3D3D3 !important;
             color: #1A1A1A!important;
         }
         .block-container {
             background-color: gray !important;
-            color: #1A1A1A !important;
-            padding-left: 2rem !important;
-            padding-right: 2rem !important;
         }
-        header[data-testid="stHeader"] {
-            background-color: #1A1A1A !important;
         }
         section[data-testid="stSidebar"] > div:first-child {
             background-color: #1A1A1A !important;
-            color: #FFFFFF !important;
-            padding: 2rem 1.5rem 1.5rem 1.5rem !important;
-            border-radius: 12px;
-            box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
-            font-size: 0.95rem;
         }
         .custom-table {
             background-color: #D3D3D3;
@@ -42,44 +74,35 @@ st.markdown("""
             overflow-x: auto;
             white-space: pre;
             border: 1px solid #ccc;
         }
         .sidebar-stats {
             color: lightgray !important;
             font-size: 1.1rem !important;
             font-weight: 600;
         }
         .sidebar-contrast-block {
-            background-color: #2b2b2b !important;
             padding: 1.25rem;
             border-radius: 10px;
             margin-top: 1.5rem;
         }
-        .sidebar-section h3 {
-            color: lightgray !important;
-            font-size: 1.1rem !important;
-            margin-top: 1.5rem;
-        }
-        .sidebar-links a {
-            color: lightgray !important;
-            text-decoration: none !important;
-        }
-        .sidebar-links a:hover {
-            text-decoration: underline !important;
-        }
-    </style>
 """, unsafe_allow_html=True)
-# ------------------- Banner Image -------------------
 st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
-# ------------------- App Title & Description -------------------
 st.title("MetaDiscovery Agent for Library of Congress Collections")
 st.markdown("""
 This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
 an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
 """)
-# ------------------- Collection Selection -------------------
 collections = {
     "American Revolutionary War Maps": "american+revolutionary+war+maps",
     "Civil War Maps": "civil+war+maps",
@@ -87,144 +110,240 @@ collections = {
     "World War I Posters": "world+war+posters"
 }
 selected = st.sidebar.selectbox("Select a collection", list(collections.keys()), key="collection_selector")
 search_query = collections[selected]
 collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"
-# ------------------- Placeholders -------------------
 stats_placeholder = st.sidebar.empty()
 completeness_placeholder = st.sidebar.empty()
-# ------------------- Helpful Resources -------------------
 st.sidebar.markdown("""
-<div class="sidebar-section">
-  <h3>🔗 Helpful Resources</h3>
-  <div class="sidebar-links">
-    <ul style='padding-left: 1em'>
-      <li><a href="https://www.loc.gov/apis/" target="_blank">LOC API Info</a></li>
-      <li><a href="https://www.loc.gov/" target="_blank">Library of Congress Homepage</a></li>
-      <li><a href="https://www.loc.gov/collections/" target="_blank">LOC Digital Collections</a></li>
-      <li><a href="https://www.loc.gov/marc/" target="_blank">MARC Metadata Standards</a></li>
-      <li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank">LOC Digital Strategy</a></li>
-    </ul>
-  </div>
-</div>
 """, unsafe_allow_html=True)
-# ------------------- Fetch Data -------------------
-with st.spinner(f"Fetching data for {selected}..."):
-    headers = {"User-Agent": "Mozilla/5.0"}
-    try:
-        response = requests.get(collection_url, headers=headers)
-        response.raise_for_status()
-        data = response.json()
-        records = data.get("results") or data.get("items") or []
-    except:
-        records = []
-        st.error("Failed to load data from LOC API")
-# ------------------- Data Preparation -------------------
-items = []
-for record in records:
-    description = record.get("description", "")
-    if isinstance(description, list):
-        description = " ".join([str(d) for d in description])
-    item = {
-        "id": record.get("id", ""),
-        "title": record.get("title", ""),
-        "date": record.get("date", ""),
-        "subject": ", ".join(record.get("subject", [])) if isinstance(record.get("subject"), list) else record.get("subject", ""),
-        "creator": record.get("creator", ""),
-        "description": description
-    }
-    items.append(item)
-metadata_df = pd.DataFrame(items)
-# ------------------- Completeness Logic -------------------
-def is_incomplete(value):
-    return pd.isna(value) or value in ["", "N/A", "null", None]
-if not metadata_df.empty:
-    incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
-    incomplete_count = incomplete_mask.sum()
-    total_fields = metadata_df.size
-    filled_fields = (~metadata_df.map(is_incomplete)).sum().sum()
-    overall_percent = (filled_fields / total_fields) * 100
-    completeness = (~metadata_df.map(is_incomplete)).mean() * 100
-    completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
-    completeness_table = completeness_df.set_index("Field")
-    # ------------------- Quick Stats -------------------
-    stats_html = f"""
-    <div class="sidebar-stats">
-        <h3 style="color: lightgray;">📊 Quick Stats</h3>
-        <p style="color:lightgray;">Total Records: <b>{len(metadata_df)}</b></p>
-        <p style="color:lightgray;">Incomplete Records: <b>{incomplete_count}</b></p>
-        <p style="color:lightgray;">Overall Metadata Completeness: <b>{overall_percent:.1f}%</b></p>
-    </div>
-    """
-    stats_placeholder.markdown(stats_html, unsafe_allow_html=True)
-    # ------------------- Field Completeness Table -------------------
-    with completeness_placeholder:
-        st.markdown("""
-            <div style='
-                background-color: #2e2e2e;
-                padding: 1.2rem;
-                border-radius: 10px;
-                margin-top: 1.5rem;
-                color: lightgray;
-            '>
-            <h4 style='margin-bottom: 1rem;'>Field Completeness Breakdown</h4>
-        """, unsafe_allow_html=True)
-        st.dataframe(
-            completeness_table.style.background_gradient(cmap="Greens").format("{:.1f}%"),
-            use_container_width=True,
-            height=240
-        )
-        st.markdown("</div>", unsafe_allow_html=True)
-    # ------------------- Main Panel -------------------
-        # Metadata completeness analysis (enhanced)
-        st.subheader("📊 Metadata Completeness Analysis")
-        fig = px.bar(
-            completeness_df.reset_index(),
-            x="Field",
-            y="Completeness (%)",
-            title="Metadata Completeness by Field",
-            labels={"Field": "Metadata Field", "Completeness (%)": "Completeness (%)"}
-        )
-        st.plotly_chart(fig, use_container_width=True)
-    # ------------------- Metadata Suggestions -------------------
-    st.subheader("✨ Suggested Metadata Enhancements")
-    incomplete_with_desc = metadata_df[incomplete_mask & metadata_df['description'].notnull()]
-    reference_df = metadata_df[metadata_df['subject'].notnull() & metadata_df['description'].notnull()]
-    if len(incomplete_with_desc) > 1 and len(reference_df) > 1:
         try:
             tfidf = TfidfVectorizer(stop_words='english')
-            tfidf_matrix = tfidf.fit_transform(reference_df['description'])
-            suggestions = []
-            for _, row in incomplete_with_desc.iterrows():
-                if pd.isna(row['subject']) and pd.notna(row['description']):
-                    desc_vec = tfidf.transform([str(row['description'])])
-                    sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
-                    top_idx = sims.argmax()
-                    suggested_subject = reference_df.iloc[top_idx]['subject']
-                    if pd.notna(suggested_subject):
-                        suggestions.append((row['title'], suggested_subject))
-            if suggestions:
-                suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
-                st.markdown("<div class='custom-table'>" + suggestions_df.to_markdown(index=False) + "</div>", unsafe_allow_html=True)
             else:
-                st.info("No metadata enhancement suggestions available.")
-        except Exception as e:
-            st.error(f"Error generating suggestions: {e}")
-    else:
-        st.info("Not enough descriptive data to generate metadata suggestions.")
-else:
-    st.warning("⚠️ No metadata records found for this collection.")

+# MetaDiscovery Agent - LOC API with Enhanced Completeness and Quality Analysis
 import requests
 import pandas as pd
+import numpy as np
 import streamlit as st
 import matplotlib
 import plotly.express as px
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+# Custom CSS: dark app background, gray content container, dark styled sidebar, alert boxes, and banner image
 st.markdown("""
     <style>
         .main {
             background-color: #D3D3D3 !important;
             color: #1A1A1A!important;
         }
         .block-container {
             background-color: gray !important;
+            color: #808080!important;
         }
+        section[data-testid="stSidebar"] > div:first-child {
+            background-color: #808080 !important;
+            padding: 1rem;
+            border-radius: 0.5rem;
+            color: #808080 !important;
+        }
+        .stMarkdown, .stTextInput, .stDataFrame {
+            color: #1A1A1A!important;
         }
+        img.banner {
+            width: 100%;
+            border-radius: 12px;
+            margin-bottom: 1rem;
+        }
+                                 .stAlert {
+            background-color: #f0f0f5 !important;
+            color: #333333 !important;
+            padding: 1.25rem !important;
+            font-size: 1rem !important;
+            border-radius: 0.5rem !important;
+            box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05) !important;
+        }
+        header[data-testid="stHeader"] {
+    background-color: gray !important;
+}
         section[data-testid="stSidebar"] > div:first-child {
+    background-color: #1A1A1A !important;
+    color: #FFFFFF !important;
+    padding: 2rem 1.5rem 1.5rem 1.5rem !important;
+    border-radius: 12px;
+    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
+    font-size: 0.95rem;
+    line-height: 1.5;
+}
+            .block-container {
+    background-color: gray !important;
+    color: #1A1A1A !important;
+    padding-left: 2rem !important;
+    padding-right: 2rem !important;
+    box-shadow: none !important;
+}
+        html, body, [data-testid="stApp"] {
             background-color: #1A1A1A !important;
         }
         .custom-table {
             background-color: #D3D3D3;
             overflow-x: auto;
             white-space: pre;
             border: 1px solid #ccc;
         }
         .sidebar-stats {
             color: lightgray !important;
             font-size: 1.1rem !important;
+            margin-top: 1.5rem;
             font-weight: 600;
         }
         .sidebar-contrast-block {
+            background-color: #2b2b2b !important;  /* Slightly lighter than #1A1A1A */
             padding: 1.25rem;
             border-radius: 10px;
             margin-top: 1.5rem;
         }
+</style>
 """, unsafe_allow_html=True)
+# OPTION 1: Use an image from a URL for the banner
 st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
+# Streamlit app header
 st.title("MetaDiscovery Agent for Library of Congress Collections")
 st.markdown("""
 This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
 an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
 """)
+# Collection display names mapped to LOC search query strings (the full URL is built after selection)
 collections = {
     "American Revolutionary War Maps": "american+revolutionary+war+maps",
     "Civil War Maps": "civil+war+maps",
     "World War I Posters": "world+war+posters"
 }
+# Sidebar for selecting collection
+#st.sidebar.markdown("## Settings")
+# Create empty metadata_df variable to ensure it exists before checking
+metadata_df = pd.DataFrame()
+# Add a key to the selectbox to ensure it refreshes properly
 selected = st.sidebar.selectbox("Select a collection", list(collections.keys()), key="collection_selector")
 search_query = collections[selected]
+# Define the collection URL
 collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"
+# Create an empty placeholder for Quick Stats
 stats_placeholder = st.sidebar.empty()
+# Create placeholder for Field Completeness Breakdown
 completeness_placeholder = st.sidebar.empty()
+# Helpful Resources (styled and moved below dropdown)
+st.sidebar.markdown("### Helpful Resources", unsafe_allow_html=True)
+# Helpful Resources styled section
+# 3. Helpful Resources Section (Fixed, under Completeness)
 st.sidebar.markdown("""
+    <style>
+        .sidebar-section h3 {
+            color: lightgray !important;
+            font-size: 1.1rem !important;
+            margin-top: 1.5rem;
+        }
+        .sidebar-links a {
+            color: lightgray !important;
+            text-decoration: none !important;
+        }
+        .sidebar-links a:hover {
+            text-decoration: underline !important;
+        }
+    </style>
+    <div class="sidebar-section">
+      <h3>🔗 Helpful Resources</h3>
+      <div class="sidebar-links">
+        <ul style='padding-left: 1em'>
+          <li><a href="https://www.loc.gov/apis/" target="_blank">LOC API Info</a></li>
+          <li><a href="https://www.loc.gov/" target="_blank">Library of Congress Homepage</a></li>
+          <li><a href="https://www.loc.gov/collections/" target="_blank">LOC Digital Collections</a></li>
+          <li><a href="https://www.loc.gov/marc/" target="_blank">MARC Metadata Standards</a></li>
+          <li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank">LOC Digital Strategy</a></li>
+        </ul>
+      </div>
+    </div>
 """, unsafe_allow_html=True)
+# Always fetch on every rerun; no fetch button is wired up, so this flag is a constant
+fetch_data = True
+if fetch_data:
+    # Display a loading spinner while fetching data
+    with st.spinner(f"Fetching data for {selected}..."):
+        # Fetch data from LOC API with spoofed User-Agent header
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/110.0.0.0 Safari/537.36"
+        }
         try:
+            response = requests.get(collection_url, headers=headers)
+            response.raise_for_status()
+            data = response.json()
+            if "results" in data:
+                records = data.get("results", [])
+            elif "items" in data:
+                records = data.get("items", [])
+            else:
+                records = []
+                st.error("Unexpected API response structure. No records found.")
+            st.write(f"Retrieved {len(records)} records")
+        except requests.exceptions.RequestException as e:
+            st.error(f"API Connection Error: {e}")
+            records = []
+        except ValueError:
+            st.error("Failed to parse API response as JSON")
+            records = []
+        # Extract selected metadata fields
+        items = []
+        for record in records:
+            if isinstance(record, dict):
+                description = record.get("description", "")
+                if isinstance(description, list):
+                    description = " ".join([str(d) for d in description])
+                item = {
+                    "id": record.get("id", ""),
+                    "title": record.get("title", ""),
+                    "date": record.get("date", ""),
+                    "subject": ", ".join(record.get("subject", [])) if isinstance(record.get("subject"), list) else record.get("subject", ""),
+                    "creator": record.get("creator", ""),
+                    "description": description
+                }
+                if not item["title"] and "item" in record:
+                    item["title"] = record.get("item", {}).get("title", "")
+                if not item["date"] and "item" in record:
+                    item["date"] = record.get("item", {}).get("date", "")
+                items.append(item)
+        metadata_df = pd.DataFrame(items)
+        # Define custom completeness check
+        def is_incomplete(value):
+            return pd.isna(value) or value in ["", "N/A", "null", None]
+        if not metadata_df.empty:
+            # Incomplete record detection
+            incomplete_mask = metadata_df.apply(lambda row: row.map(is_incomplete), axis=1).any(axis=1)
+            incomplete_count = incomplete_mask.sum()
+            # Overall completeness
+            total_fields = metadata_df.size
+            filled_fields = metadata_df.apply(lambda row: row.map(lambda x: not is_incomplete(x)), axis=1).sum().sum()
+            overall_percent = (filled_fields / total_fields) * 100
+            # Field-by-field completeness
+            completeness = metadata_df.map(lambda x: not is_incomplete(x)).mean() * 100
+            completeness_table = completeness.round(1).to_frame(name="Completeness (%)")
+            # Render stats summary in sidebar
+            stats_html = f"""
+            <div class="sidebar-stats">
+                <h3 style="color: lightgray;">Quick Stats</h3>
+                <p style="color:lightgray;">Total Records: <b>{len(metadata_df)}</b></p>
+                <p style="color:lightgray;">Incomplete Records: <b>{incomplete_count}</b></p>
+                <p style="color:lightgray;">Overall Metadata Completeness: <b>{overall_percent:.1f}%</b></p>
+            </div>
+            """
+            stats_placeholder.markdown(stats_html, unsafe_allow_html=True)
+        # Utility functions for deeper metadata quality analysis
+        def is_incomplete(value):
+            return pd.isna(value) or value in ["", "N/A", "null", None]
+        def is_valid_date(value):
+            try:
+                pd.to_datetime(value)
+                return True
+            except:
+                return False
+        if not metadata_df.empty:
+            st.subheader("Retrieved Metadata Sample")
+            st.dataframe(metadata_df.head())
+            # Metadata completeness analysis (enhanced)
+            st.subheader("Metadata Completeness Analysis")
+            # Create the completeness table
+            completeness = metadata_df.map(lambda x: not is_incomplete(x)).mean() * 100
+            completeness_df = pd.DataFrame({
+                "Field": completeness.index,
+                "Completeness (%)": completeness.values
+            })
+            completeness_table = completeness_df.set_index("Field")
+            # FILL THE PLACEHOLDER created earlier
+            with completeness_placeholder:
+                st.markdown("""
+                    <div style='
+                        background-color: #2e2e2e;
+                        padding: 1.2rem;
+                        border-radius: 10px;
+                        margin-top: 1.5rem;
+                        color: lightgray;
+                    '>
+                    <h4 style='margin-bottom: 1rem;'>Field Completeness Breakdown</h4>
+                """, unsafe_allow_html=True)
+                st.dataframe(
+                    completeness_table.style.background_gradient(cmap="Greens").format("{:.1f}%"),
+                    use_container_width=True,
+                    height=240
+                )
+                st.markdown("</div>", unsafe_allow_html=True)
+            # Then continue plotting in main panel
+            fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
+            st.plotly_chart(fig)
+            # Identify incomplete records
+            incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
+            incomplete_records = metadata_df[incomplete_mask]
+            st.subheader("✨ Suggested Metadata Enhancements")
+            incomplete_with_desc = incomplete_records[incomplete_records['description'].notnull()]
+            reference_df = metadata_df[metadata_df['subject'].notnull() & metadata_df['description'].notnull()]
             tfidf = TfidfVectorizer(stop_words='english')
+            if len(incomplete_with_desc) > 1 and len(reference_df) > 1:
+                try:
+                    suggestions = []
+                    tfidf_matrix = tfidf.fit_transform(reference_df['description'])
+                    for idx, row in incomplete_with_desc.iterrows():
+                        if pd.isna(row['subject']) and pd.notna(row['description']):
+                            desc_vec = tfidf.transform([str(row['description'])])
+                            sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
+                            top_idx = sims.argmax()
+                            suggested_subject = reference_df.iloc[top_idx]['subject']
+                            if pd.notna(suggested_subject) and suggested_subject:
+                                suggestions.append((row['title'], suggested_subject))
+                    if suggestions:
+                        suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
+                        st.markdown("<div class='custom-table'>" + suggestions_df.to_markdown(index=False) + "</div>", unsafe_allow_html=True)
+                    else:
+                        st.markdown("""
+                            <div class='custom-table'>
+                            <b>No metadata enhancement suggestions available.</b>
+                            </div>
+                        """, unsafe_allow_html=True)
+                except Exception as e:
+                    st.error(f"Error generating metadata suggestions: {e}")
             else:
+                st.markdown("""
+                    <div class='custom-table'>
+                    <b>Not enough descriptive data to generate metadata suggestions.</b>
+                    </div>
+                    """, unsafe_allow_html=True)
+        else:
+            st.warning("⚠️ No metadata records found for this collection. Try selecting another one.")