Spaces:

CCockrum
/

LOC-Metadate-Analyzer

Running

App Files Files Community

CCockrum committed on Apr 25

Commit

90247f9

verified ·

1 Parent(s): 4e04d7b

Update app.py

Browse files

Files changed (1) hide show

app.py +182 -319

app.py CHANGED Viewed

@@ -1,350 +1,213 @@
 # MetaDiscovery Agent - LOC API with Enhanced Completeness and Quality Analysis
 import requests
 import pandas as pd
-import numpy as np
 import streamlit as st
-import matplotlib
 import plotly.express as px
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-# Custom CSS for white background, styled sidebar, banner, and dark grey font
 st.markdown("""
-    <style>
-        .main {
-            background-color: #D3D3D3 !important;
-            color: #1A1A1A!important;
-        }
-        .block-container {
-            background-color: gray !important;
-            color: #808080!important;
-        }
-        section[data-testid="stSidebar"] > div:first-child {
-            background-color: #808080 !important;
-            padding: 1rem;
-            border-radius: 0.5rem;
-            color: #808080 !important;
-        }
-        .stMarkdown, .stTextInput, .stDataFrame {
-            color: #1A1A1A!important;
-        }
-        img.banner {
-            width: 100%;
-            border-radius: 12px;
-            margin-bottom: 1rem;
-        }
-                                 .stAlert {
-            background-color: #f0f0f5 !important;
-            color: #333333 !important;
-            padding: 1.25rem !important;
-            font-size: 1rem !important;
-            border-radius: 0.5rem !important;
-            box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05) !important;
-        }
-        header[data-testid="stHeader"] {
-    background-color: gray !important;
-}
-        section[data-testid="stSidebar"] > div:first-child {
-    background-color: #1A1A1A !important;
-    color: #FFFFFF !important;
-    padding: 2rem 1.5rem 1.5rem 1.5rem !important;
-    border-radius: 12px;
-    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
-    font-size: 0.95rem;
-    line-height: 1.5;
-}
-            .block-container {
-    background-color: gray !important;
-    color: #1A1A1A !important;
-    padding-left: 2rem !important;
-    padding-right: 2rem !important;
-    box-shadow: none !important;
-}
-        html, body, [data-testid="stApp"] {
-            background-color: #1A1A1A !important;
-        }
-        .custom-table {
-            background-color: #D3D3D3;
-            color: #1A1A1A;
-            font-family: monospace;
-            padding: 1rem;
-            border-radius: 8px;
-            overflow-x: auto;
-            white-space: pre;
-            border: 1px solid #ccc;
-        }
-        .sidebar-stats {
-            color: lightgray !important;
-            font-size: 1.1rem !important;
-            margin-top: 1.5rem;
-            font-weight: 600;
-        }
-        .sidebar-contrast-block {
-            background-color: #2b2b2b !important;  /* Slightly lighter than #1A1A1A */
-            padding: 1.25rem;
-            border-radius: 10px;
-            margin-top: 1.5rem;
-        }
 </style>
 """, unsafe_allow_html=True)
-# OPTION 1: Use an image from a URL for the banner
 st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
-# Streamlit app header
 st.title("MetaDiscovery Agent for Library of Congress Collections")
-st.markdown("""
-This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
-an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
-""")
-# Updated collection URLs using the correct LOC API format
 collections = {
     "American Revolutionary War Maps": "american+revolutionary+war+maps",
     "Civil War Maps": "civil+war+maps",
     "Women's Suffrage": "women+suffrage",
     "World War I Posters": "world+war+posters"
 }
-# Sidebar for selecting collection
-#st.sidebar.markdown("## Settings")
-# Create empty metadata_df variable to ensure it exists before checking
-metadata_df = pd.DataFrame()
-# Add a key to the selectbox to ensure it refreshes properly
 selected = st.sidebar.selectbox("Select a collection", list(collections.keys()), key="collection_selector")
 search_query = collections[selected]
-# Define the collection URL
 collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"
-# Create an empty placeholder for Quick Stats
-stats_placeholder = st.sidebar.empty()
-# Create placeholder for Field Completeness Breakdown
-completeness_placeholder = st.sidebar.empty()
-# Helpful Resources (styled and moved below dropdown)
-st.sidebar.markdown("### Helpful Resources", unsafe_allow_html=True)
-# Helpful Resources styled section
-# 3. Helpful Resources Section (Fixed, under Completeness)
 st.sidebar.markdown("""
-    <style>
-        .sidebar-section h3 {
-            color: lightgray !important;
-            font-size: 1.1rem !important;
-            margin-top: 1.5rem;
-        }
-        .sidebar-links a {
-            color: lightgray !important;
-            text-decoration: none !important;
-        }
-        .sidebar-links a:hover {
-            text-decoration: underline !important;
-        }
-    </style>
-    <div class="sidebar-section">
-      <h3>🔗 Helpful Resources</h3>
-      <div class="sidebar-links">
-        <ul style='padding-left: 1em'>
-          <li><a href="https://www.loc.gov/apis/" target="_blank">LOC API Info</a></li>
-          <li><a href="https://www.loc.gov/" target="_blank">Library of Congress Homepage</a></li>
-          <li><a href="https://www.loc.gov/collections/" target="_blank">LOC Digital Collections</a></li>
-          <li><a href="https://www.loc.gov/marc/" target="_blank">MARC Metadata Standards</a></li>
-          <li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank">LOC Digital Strategy</a></li>
-        </ul>
-      </div>
-    </div>
 """, unsafe_allow_html=True)
-# Add a fetch button to make the action explicit
-fetch_data = True
-if fetch_data:
-    # Display a loading spinner while fetching data
-    with st.spinner(f"Fetching data for {selected}..."):
-        # Fetch data from LOC API with spoofed User-Agent header
-        headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/110.0.0.0 Safari/537.36"
-        }
-        try:
-            response = requests.get(collection_url, headers=headers)
-            response.raise_for_status()
-            data = response.json()
-            if "results" in data:
-                records = data.get("results", [])
-            elif "items" in data:
-                records = data.get("items", [])
-            else:
-                records = []
-                st.error("Unexpected API response structure. No records found.")
-            st.write(f"Retrieved {len(records)} records")
-        except requests.exceptions.RequestException as e:
-            st.error(f"API Connection Error: {e}")
-            records = []
-        except ValueError:
-            st.error("Failed to parse API response as JSON")
-            records = []
-        # Extract selected metadata fields
-        items = []
-        for record in records:
-            if isinstance(record, dict):
-                description = record.get("description", "")
-                if isinstance(description, list):
-                    description = " ".join([str(d) for d in description])
-                item = {
-                    "id": record.get("id", ""),
-                    "title": record.get("title", ""),
-                    "date": record.get("date", ""),
-                    "subject": ", ".join(record.get("subject", [])) if isinstance(record.get("subject"), list) else record.get("subject", ""),
-                    "creator": record.get("creator", ""),
-                    "description": description
-                }
-                if not item["title"] and "item" in record:
-                    item["title"] = record.get("item", {}).get("title", "")
-                if not item["date"] and "item" in record:
-                    item["date"] = record.get("item", {}).get("date", "")
-                items.append(item)
-        metadata_df = pd.DataFrame(items)
-        # Define custom completeness check
-        def is_incomplete(value):
-            return pd.isna(value) or value in ["", "N/A", "null", None]
-        if not metadata_df.empty:
-            # Incomplete record detection
-            incomplete_mask = metadata_df.apply(lambda row: row.map(is_incomplete), axis=1).any(axis=1)
-            incomplete_count = incomplete_mask.sum()
-            # Overall completeness
-            total_fields = metadata_df.size
-            filled_fields = metadata_df.apply(lambda row: row.map(lambda x: not is_incomplete(x)), axis=1).sum().sum()
-            overall_percent = (filled_fields / total_fields) * 100
-            # Field-by-field completeness
-            completeness = metadata_df.map(lambda x: not is_incomplete(x)).mean() * 100
-            completeness_table = completeness.round(1).to_frame(name="Completeness (%)")
-            # Render stats summary in sidebar
-            stats_html = f"""
-            <div class="sidebar-stats">
-                <h3 style="color: lightgray;">Quick Stats</h3>
-                <p style="color:lightgray;">Total Records: <b>{len(metadata_df)}</b></p>
-                <p style="color:lightgray;">Incomplete Records: <b>{incomplete_count}</b></p>
-                <p style="color:lightgray;">Overall Metadata Completeness: <b>{overall_percent:.1f}%</b></p>
-            </div>
-            """
-            stats_placeholder.markdown(stats_html, unsafe_allow_html=True)
-        # Utility functions for deeper metadata quality analysis
-        def is_incomplete(value):
-            return pd.isna(value) or value in ["", "N/A", "null", None]
-        def is_valid_date(value):
             try:
-                pd.to_datetime(value)
-                return True
-            except:
-                return False
-        if not metadata_df.empty:
-            st.subheader("Retrieved Metadata Sample")
-            st.dataframe(metadata_df.head())
-            # Metadata completeness analysis (enhanced)
-            st.subheader("Metadata Completeness Analysis")
-            # Create the completeness table
-            completeness = metadata_df.map(lambda x: not is_incomplete(x)).mean() * 100
-            completeness_df = pd.DataFrame({
-                "Field": completeness.index,
-                "Completeness (%)": completeness.values
-            })
-            completeness_table = completeness_df.set_index("Field")
-            # FILL THE PLACEHOLDER created earlier
-            # FILL THE PLACEHOLDER created earlier
-            with completeness_placeholder:
-                st.markdown("""
-                    <div style='
-                        background-color: #2e2e2e;
-                        padding: 1.2rem;
-                        border-radius: 10px;
-                        margin-top: 1.5rem;
-                        color: lightgray;
-                    '>
-                    <h4 style='margin-bottom: 1rem;'>📊 Field Completeness Breakdown</h4>
-                """, unsafe_allow_html=True)
-                st.dataframe(
-                    completeness_table.style.background_gradient(cmap="Greens").format("{:.1f}%"),
-                    use_container_width=True,
-                    height=240
-                )
-                st.markdown("</div>", unsafe_allow_html=True)
-                completeness_table = completeness.round(1).to_frame(name="Completeness (%)")
-            # Then continue plotting in main panel
-            fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
-            st.plotly_chart(fig)
-            # Identify incomplete records
-            incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
-            incomplete_records = metadata_df[incomplete_mask]
-            st.subheader("✨ Suggested Metadata Enhancements")
-            incomplete_with_desc = incomplete_records[incomplete_records['description'].notnull()]
-            reference_df = metadata_df[metadata_df['subject'].notnull() & metadata_df['description'].notnull()]
-            tfidf = TfidfVectorizer(stop_words='english')
-            if len(incomplete_with_desc) > 1 and len(reference_df) > 1:
-                try:
-                    suggestions = []
-                    tfidf_matrix = tfidf.fit_transform(reference_df['description'])
-                    for idx, row in incomplete_with_desc.iterrows():
-                        if pd.isna(row['subject']) and pd.notna(row['description']):
-                            desc_vec = tfidf.transform([str(row['description'])])
-                            sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
-                            top_idx = sims.argmax()
-                            suggested_subject = reference_df.iloc[top_idx]['subject']
-                            if pd.notna(suggested_subject) and suggested_subject:
-                                suggestions.append((row['title'], suggested_subject))
-                    if suggestions:
-                        suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
-                        st.markdown("<div class='custom-table'>" + suggestions_df.to_markdown(index=False) + "</div>", unsafe_allow_html=True)
-                    else:
-                        st.markdown("""
-                            <div class='custom-table'>
-                            <b>No metadata enhancement suggestions available.</b>
-                            </div>
-                        """, unsafe_allow_html=True)
-                except Exception as e:
-                    st.error(f"Error generating metadata suggestions: {e}")
-            else:
-                st.markdown("""
-                    <div class='custom-table'>
-                    <b>Not enough descriptive data to generate metadata suggestions.</b>
-                    </div>
-                    """, unsafe_allow_html=True)
         else:
-            st.warning("⚠️ No metadata records found for this collection. Try selecting another one.")

 # MetaDiscovery Agent - LOC API with Enhanced Completeness and Quality Analysis
 import requests
 import pandas as pd
 import streamlit as st
 import plotly.express as px
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+import matplotlib
+# --- CUSTOM CSS ---
 st.markdown("""
+<style>
+    html, body, [data-testid="stApp"] {
+        background-color: #1A1A1A !important;
+    }
+    .block-container {
+        background-color: gray !important;
+        color: #1A1A1A !important;
+        padding-left: 2rem !important;
+        padding-right: 2rem !important;
+    }
+    section[data-testid="stSidebar"] > div:first-child {
+        background-color: #1A1A1A !important;
+        color: #FFFFFF !important;
+        padding: 2rem 1.5rem;
+        border-radius: 12px;
+    }
+    .sidebar-contrast-block {
+        background-color: #2b2b2b !important;
+        padding: 1rem;
+        border-radius: 10px;
+        margin-top: 1.5rem;
+        color: lightgray;
+    }
+    .custom-table {
+        background-color: #D3D3D3;
+        color: #1A1A1A;
+        font-family: monospace;
+        padding: 1rem;
+        border-radius: 8px;
+        overflow-x: auto;
+        white-space: pre;
+        border: 1px solid #ccc;
+    }
+    .sidebar-links a {
+        color: lightgray !important;
+        text-decoration: none !important;
+    }
+    .sidebar-links a:hover {
+        text-decoration: underline !important;
+    }
 </style>
 """, unsafe_allow_html=True)
+# --- HEADER ---
 st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
 st.title("MetaDiscovery Agent for Library of Congress Collections")
+st.markdown("This tool connects to the LOC API, retrieves metadata from a selected collection, and performs an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.")
+# --- COLLECTION SETUP ---
 collections = {
     "American Revolutionary War Maps": "american+revolutionary+war+maps",
     "Civil War Maps": "civil+war+maps",
     "Women's Suffrage": "women+suffrage",
     "World War I Posters": "world+war+posters"
 }
 selected = st.sidebar.selectbox("Select a collection", list(collections.keys()), key="collection_selector")
 search_query = collections[selected]
 collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"
+# --- PLACEHOLDERS FOR SIDEBAR BLOCKS ---
+stats_placeholder = st.sidebar.container()
+completeness_placeholder = st.sidebar.container()
+# --- HELPFUL RESOURCES ---
 st.sidebar.markdown("""
+<div class="sidebar-contrast-block">
+<h4>🔗 Helpful Resources</h4>
+<ul class="sidebar-links">
+  <li><a href="https://www.loc.gov/apis/" target="_blank">LOC API Info</a></li>
+  <li><a href="https://www.loc.gov/" target="_blank">Library of Congress Homepage</a></li>
+  <li><a href="https://www.loc.gov/collections/" target="_blank">LOC Digital Collections</a></li>
+  <li><a href="https://www.loc.gov/marc/" target="_blank">MARC Metadata Standards</a></li>
+  <li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank">LOC Digital Strategy</a></li>
+</ul>
+</div>
 """, unsafe_allow_html=True)
+# Define Utility Functions
def is_incomplete(value):
    """Return True when a single metadata cell should count as missing.

    A value is treated as incomplete when it is ``None``, NaN/NaT, a blank
    or whitespace-only string, one of the placeholder strings ``"N/A"`` or
    ``"null"`` (ignoring surrounding whitespace), or an empty container.

    Args:
        value: One cell value from the metadata table.

    Returns:
        bool: True if the value carries no usable metadata, else False.
    """
    if value is None:
        return True
    if isinstance(value, str):
        return value.strip() in ("", "N/A", "null")
    if isinstance(value, (list, tuple, set, frozenset, dict)):
        # pd.isna() on a container returns an element-wise array, and using
        # that array in a boolean context raises ValueError for more than
        # one element. Decide on emptiness instead.
        return len(value) == 0
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # Array-like values yield an element-wise mask with no single truth
        # value; something is present, so count the cell as filled.
        return False
def is_valid_date(value):
    """Return True when ``value`` can be parsed into an actual date.

    Missing inputs (``None``, blank strings) and inputs that parse to a
    null timestamp (NaN -> NaT) are reported as invalid rather than valid.

    Args:
        value: The raw date value to check.

    Returns:
        bool: True if ``pd.to_datetime`` parses the value to a non-null
        result, False otherwise. This function never raises for bad input.
    """
    if value is None or (isinstance(value, str) and not value.strip()):
        return False
    try:
        parsed = pd.to_datetime(value)
    except Exception:
        # Any parsing failure means "not a valid date". Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        return False
    missing = pd.isna(parsed)
    if hasattr(missing, "any"):
        # Array-like input parses to an index; every entry must be non-null.
        missing = missing.any()
    return not missing
+# Fetch data from LOC API
def fetch_loc_data(collection_url, timeout=30):
    """Fetch the record list from a LOC JSON search URL.

    Args:
        collection_url: Full URL to request; the response is parsed as JSON.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        list: The list stored under ``"results"`` (preferred) or ``"items"``
        in the JSON payload. An empty list is returned when the request
        fails, the body is not valid JSON, or the payload does not have the
        expected shape. Request and parse failures are also reported through
        ``st.error``.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(collection_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # RequestException covers connection, timeout and HTTP status errors;
        # ValueError covers a body that is not valid JSON.
        st.error(f"API Error: {e}")
        return []

    if not isinstance(data, dict):
        return []
    for key in ("results", "items"):
        if key in data:
            records = data[key]
            # Callers iterate the result, so never hand back a non-list.
            return records if isinstance(records, list) else []
    return []
+# Transform Records
def _flatten_field(value, separator=", "):
    """Join a list/tuple into one string; return any other value unchanged."""
    if isinstance(value, (list, tuple)):
        # str() each element so non-string entries cannot break the join.
        return separator.join(map(str, value))
    return value


def transform_records(records):
    """Convert raw API records into a flat metadata DataFrame.

    Entries that are not dicts are skipped. For each remaining record the
    fields ``id``, ``title``, ``date``, ``subject``, ``creator`` and
    ``description`` are extracted, with a missing key becoming ``""``.
    List-valued fields are collapsed to a single string: description parts
    are joined with spaces, every other field with ``", "``.

    Args:
        records: Iterable of raw record objects.

    Returns:
        pandas.DataFrame: One row per dict record with the six columns
        above, in that order. The columns are present even when there are
        no rows.
    """
    columns = ["id", "title", "date", "subject", "creator", "description"]
    items = []
    for record in records:
        if not isinstance(record, dict):
            continue
        item = {
            field: _flatten_field(record.get(field, ""))
            for field in columns
        }
        # Description fragments are joined with a space rather than a comma.
        item["description"] = _flatten_field(record.get("description", ""), " ")
        items.append(item)
    return pd.DataFrame(items, columns=columns)
+# Render Main Application Sections
def render_main_sections(metadata_df, stats_placeholder, completeness_placeholder):
    """Render the sidebar statistics and the main analysis sections.

    Shows a warning and stops when ``metadata_df`` is empty. Otherwise it
    writes a "Quick Stats" summary into ``stats_placeholder``, a per-field
    completeness table into ``completeness_placeholder``, and in the main
    body a sample of the data, a completeness bar chart, and subject
    suggestions for records that lack a subject.

    Suggestions come from TF-IDF cosine similarity between the description
    of each record missing a subject and the descriptions of records that
    have one; the subject of the most similar reference record is proposed.

    Args:
        metadata_df: DataFrame of metadata; the suggestion step reads its
            ``title``, ``subject`` and ``description`` columns.
        stats_placeholder: Streamlit container that receives the stats HTML.
        completeness_placeholder: Streamlit container used as a context
            manager for the completeness table.

    Returns:
        None.
    """
    if metadata_df.empty:
        st.warning("No metadata found for the selected collection.")
        return

    # Evaluate every cell once; astype(bool) guarantees that ``~`` below is
    # a logical NOT rather than a bitwise NOT on object values.
    incomplete_cells = metadata_df.map(is_incomplete).astype(bool)
    filled_cells = ~incomplete_cells

    # Sidebar Quick Stats
    incomplete_mask = incomplete_cells.any(axis=1)
    incomplete_count = incomplete_mask.sum()
    total_fields = metadata_df.size
    filled_fields = filled_cells.sum().sum()
    overall_percent = (filled_fields / total_fields) * 100

    # Completeness Table
    completeness = filled_cells.mean() * 100
    completeness_table = completeness.round(1).to_frame(name="Completeness (%)")

    # Update sidebar placeholders
    stats_html = f"""
        <div class="sidebar-stats">
            <h3 style="color: lightgray;">Quick Stats</h3>
            <p style="color:lightgray;">Total Records: <b>{len(metadata_df)}</b></p>
            <p style="color:lightgray;">Incomplete Records: <b>{incomplete_count}</b></p>
            <p style="color:lightgray;">Overall Completeness: <b>{overall_percent:.1f}%</b></p>
        </div>
        """
    stats_placeholder.markdown(stats_html, unsafe_allow_html=True)

    # Field Completeness Breakdown (inside contrast block)
    with completeness_placeholder:
        st.markdown("""
                <div style='background-color:#2e2e2e; padding:1.25rem; border-radius:8px; margin-top:1.5rem;'>
                <h4 style='color: lightgray;'>Field Completeness Breakdown</h4>
            """, unsafe_allow_html=True)
        st.dataframe(
            completeness_table.style.background_gradient(cmap="Greens").format("{:.1f}%"),
            use_container_width=True
        )
        st.markdown("</div>", unsafe_allow_html=True)

    # Main Body
    st.subheader("📂 Retrieved Metadata Sample")
    st.dataframe(metadata_df.head())
    st.subheader("📊 Metadata Completeness Analysis")
    completeness_df = completeness.reset_index()
    completeness_df.columns = ["Field", "Completeness (%)"]
    fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
    st.plotly_chart(fig)

    # Suggested Metadata Enhancements
    st.subheader("✨ Suggested Metadata Enhancements")
    # Use the same completeness test as the statistics above. A plain
    # notna() check would count empty strings and placeholders as present,
    # so subject-less records would end up in the reference set.
    missing_subject = incomplete_cells["subject"]
    has_description = filled_cells["description"]
    needs_subject = metadata_df[missing_subject & has_description]
    reference_df = metadata_df[~missing_subject & has_description]
    if needs_subject.empty or reference_df.empty:
        st.info("Not enough data for metadata enhancement suggestions.")
        return

    # Broad catch is deliberate here: any failure in the suggestion step is
    # shown in the UI instead of aborting the rest of the page.
    try:
        tfidf = TfidfVectorizer(stop_words='english')
        tfidf_matrix = tfidf.fit_transform(reference_df['description'].astype(str))
        target_matrix = tfidf.transform(needs_subject['description'].astype(str))
        similarity = cosine_similarity(target_matrix, tfidf_matrix)
        suggestions = []
        for title, sims in zip(needs_subject['title'], similarity):
            top_idx = sims.argmax()
            if sims[top_idx] <= 0:
                # No shared vocabulary with any reference description, so
                # argmax would only pick an arbitrary record.
                continue
            suggestions.append((title, reference_df.iloc[top_idx]['subject']))
        if suggestions:
            suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
            # NOTE(review): titles and subjects are inserted into raw HTML
            # unescaped; confirm the upstream data is trusted.
            st.markdown("<div class='custom-table'>" + suggestions_df.to_markdown(index=False) + "</div>", unsafe_allow_html=True)
        else:
            st.info("No suggestions could be generated.")
    except Exception as e:
        st.error(f"Error generating suggestions: {e}")