Spaces:

CCockrum
/

LOC-Metadate-Analyzer

Running

App Files Community

CCockrum committed on Apr 25

Commit

21b2b3f

verified ·

1 Parent(s): abfa7bf

Update app.py

Browse files

Files changed (1) hide show

app.py +159 -145

app.py CHANGED Viewed

@@ -1,108 +1,126 @@
-# MetaDiscovery Agent - LOC API with Enhanced Completeness and Quality Analysis
 import requests
 import pandas as pd
 import numpy as np
 import streamlit as st
-import matplotlib
 import plotly.express as px
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-# Custom CSS for white background, styled sidebar, banner, and dark grey font
 st.markdown("""
     <style>
         .main {
-            background-color: #D3D3D3 !important;
-            color: #1A1A1A!important;
         }
         .block-container {
-            background-color: gray !important;
-            color: #808080!important;
-        }
-        section[data-testid="stSidebar"] > div:first-child {
-            background-color: #808080 !important;
-            padding: 1rem;
-            border-radius: 0.5rem;
-            color: #808080 !important;
         }
-        .stMarkdown, .stTextInput, .stDataFrame {
-            color: #1A1A1A!important;
         }
-        img.banner {
-            width: 100%;
             border-radius: 12px;
-            margin-bottom: 1rem;
-        }
-                                 .stAlert {
-            background-color: #f0f0f5 !important;
-            color: #333333 !important;
-            padding: 1.25rem !important;
-            font-size: 1rem !important;
-            border-radius: 0.5rem !important;
-            box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05) !important;
         }
-        header[data-testid="stHeader"] {
-    background-color: gray !important;
-}
-        section[data-testid="stSidebar"] > div:first-child {
-    background-color: #1A1A1A !important;
-    color: #FFFFFF !important;
-    padding: 2rem 1.5rem 1.5rem 1.5rem !important;
-    border-radius: 12px;
-    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
-    font-size: 0.95rem;
-    line-height: 1.5;
-}
-            .block-container {
-    background-color: gray !important;
-    color: #1A1A1A !important;
-    padding-left: 2rem !important;
-    padding-right: 2rem !important;
-    box-shadow: none !important;
-}
         html, body, [data-testid="stApp"] {
             background-color: #1A1A1A !important;
         }
         .custom-table {
-            background-color: #D3D3D3;
-            color: #1A1A1A;
             font-family: monospace;
             padding: 1rem;
             border-radius: 8px;
             overflow-x: auto;
             white-space: pre;
-            border: 1px solid #ccc;
         }
         .sidebar-stats {
             color: lightgray !important;
             font-size: 1.1rem !important;
             margin-top: 1.5rem;
             font-weight: 600;
         }
         .sidebar-contrast-block {
-            background-color: #2b2b2b !important;  /* Slightly lighter than #1A1A1A */
             padding: 1.25rem;
             border-radius: 10px;
             margin-top: 1.5rem;
         }
-</style>
 """, unsafe_allow_html=True)
-# OPTION 1: Use an image from a URL for the banner
 st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
-# Streamlit app header
 st.title("MetaDiscovery Agent for Library of Congress Collections")
 st.markdown("""
-This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
-an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
 """)
-# Updated collection URLs using the correct LOC API format
 collections = {
     "American Revolutionary War Maps": "american+revolutionary+war+maps",
     "Civil War Maps": "civil+war+maps",
@@ -110,62 +128,40 @@ collections = {
     "World War I Posters": "world+war+posters"
 }
-# Sidebar for selecting collection
-#st.sidebar.markdown("## Settings")
-# Create empty metadata_df variable to ensure it exists before checking
 metadata_df = pd.DataFrame()
-# Add a key to the selectbox to ensure it refreshes properly
 selected = st.sidebar.selectbox("Select a collection", list(collections.keys()), key="collection_selector")
 search_query = collections[selected]
 # Define the collection URL
 collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"
-# Create an empty placeholder for Quick Stats
 stats_placeholder = st.sidebar.empty()
-# Create placeholder for Field Completeness Breakdown
 completeness_placeholder = st.sidebar.empty()
-# Helpful Resources (styled and moved below dropdown)
-st.sidebar.markdown("### Helpful Resources", unsafe_allow_html=True)
-# Helpful Resources styled section
-# 3. Helpful Resources Section (Fixed, under Completeness)
 st.sidebar.markdown("""
-    <style>
-        .sidebar-section h3 {
-            color: lightgray !important;
-            font-size: 1.1rem !important;
-            margin-top: 1.5rem;
-        }
-        .sidebar-links a {
-            color: lightgray !important;
-            text-decoration: none !important;
-        }
-        .sidebar-links a:hover {
-            text-decoration: underline !important;
-        }
-    </style>
-    <div class="sidebar-section">
-      <h3>🔗 Helpful Resources</h3>
-      <div class="sidebar-links">
-        <ul style='padding-left: 1em'>
-          <li><a href="https://www.loc.gov/apis/" target="_blank">LOC API Info</a></li>
-          <li><a href="https://www.loc.gov/" target="_blank">Library of Congress Homepage</a></li>
-          <li><a href="https://www.loc.gov/collections/" target="_blank">LOC Digital Collections</a></li>
-          <li><a href="https://www.loc.gov/marc/" target="_blank">MARC Metadata Standards</a></li>
-          <li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank">LOC Digital Strategy</a></li>
         </ul>
-      </div>
     </div>
 """, unsafe_allow_html=True)
-# Add a fetch button to make the action explicit
 fetch_data = True
 if fetch_data:
     # Display a loading spinner while fetching data
     with st.spinner(f"Fetching data for {selected}..."):
@@ -232,6 +228,23 @@ if fetch_data:
             filled_fields = metadata_df.apply(lambda row: row.map(lambda x: not is_incomplete(x)), axis=1).sum().sum()
             overall_percent = (filled_fields / total_fields) * 100
             # Field-by-field completeness
             completeness = metadata_df.map(lambda x: not is_incomplete(x)).mean() * 100
             completeness_table = completeness.round(1).to_frame(name="Completeness (%)")
@@ -239,77 +252,78 @@ if fetch_data:
             # Render stats summary in sidebar
             stats_html = f"""
             <div class="sidebar-stats">
-                <h3 style="color: lightgray;">Quick Stats</h3>
                 <p style="color:lightgray;">Total Records: <b>{len(metadata_df)}</b></p>
                 <p style="color:lightgray;">Incomplete Records: <b>{incomplete_count}</b></p>
-                <p style="color:lightgray;">Overall Metadata Completeness: <b>{overall_percent:.1f}%</b></p>
             </div>
             """
             stats_placeholder.markdown(stats_html, unsafe_allow_html=True)
-        # Utility functions for deeper metadata quality analysis
-        def is_incomplete(value):
-            return pd.isna(value) or value in ["", "N/A", "null", None]
-        def is_valid_date(value):
-            try:
-                pd.to_datetime(value)
-                return True
-            except:
-                return False
-        if not metadata_df.empty:
-            st.subheader("Retrieved Metadata Sample")
-            st.dataframe(metadata_df.head())
-            # Metadata completeness analysis (enhanced)
-            st.subheader("Metadata Completeness Analysis")
-            # Create the completeness table
-            completeness = metadata_df.map(lambda x: not is_incomplete(x)).mean() * 100
-            completeness_df = pd.DataFrame({
-                "Field": completeness.index,
-                "Completeness (%)": completeness.values
-            })
-            completeness_table = completeness_df.set_index("Field")
-            # FILL THE PLACEHOLDER created earlier
             with completeness_placeholder:
                 st.markdown("""
-                    <div style='
-                        background-color: #2e2e2e;
-                        padding: 1.2rem;
-                        border-radius: 10px;
-                        margin-top: 1.5rem;
-                        color: lightgray;
-                    '>
-                    <h4 style='margin-bottom: 1rem;'>Field Completeness Breakdown</h4>
                 """, unsafe_allow_html=True)
                 st.dataframe(
-                    completeness_table.style.background_gradient(cmap="Greens").format("{:.1f}%"),
                     use_container_width=True,
                     height=240
                 )
                 st.markdown("</div>", unsafe_allow_html=True)
-            # Then continue plotting in main panel
-            fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
-            st.plotly_chart(fig)
-            # Identify incomplete records
             incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
             incomplete_records = metadata_df[incomplete_mask]
-            st.subheader("✨ Suggested Metadata Enhancements")
             incomplete_with_desc = incomplete_records[incomplete_records['description'].notnull()]
             reference_df = metadata_df[metadata_df['subject'].notnull() & metadata_df['description'].notnull()]
             tfidf = TfidfVectorizer(stop_words='english')
             if len(incomplete_with_desc) > 1 and len(reference_df) > 1:

 import requests
 import pandas as pd
 import numpy as np
 import streamlit as st
 import plotly.express as px
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+# Custom CSS for styling to match the screenshot
 st.markdown("""
     <style>
+        /* Main background and text colors */
         .main {
+            background-color: #1A1A1A !important;
+            color: white !important;
         }
+        /* Container styling */
         .block-container {
+            background-color: #1A1A1A !important;
+            color: white !important;
+            padding-left: 2rem !important;
+            padding-right: 2rem !important;
         }
+        /* Header styling */
+        header[data-testid="stHeader"] {
+            background-color: #1A1A1A !important;
         }
+        /* Sidebar styling */
+        section[data-testid="stSidebar"] > div:first-child {
+            background-color: #1A1A1A !important;
+            color: #FFFFFF !important;
+            padding: 2rem 1.5rem 1.5rem 1.5rem !important;
             border-radius: 12px;
+            box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
         }
+        /* Overall app background */
         html, body, [data-testid="stApp"] {
             background-color: #1A1A1A !important;
         }
+        /* Custom table styling */
         .custom-table {
+            background-color: #2e2e2e;
+            color: white;
             font-family: monospace;
             padding: 1rem;
             border-radius: 8px;
             overflow-x: auto;
             white-space: pre;
+            border: 1px solid #444;
         }
+        /* Sidebar stats styling */
         .sidebar-stats {
             color: lightgray !important;
             font-size: 1.1rem !important;
             margin-top: 1.5rem;
             font-weight: 600;
         }
+        /* Sidebar contrast block */
         .sidebar-contrast-block {
+            background-color: #2e2e2e !important;
             padding: 1.25rem;
             border-radius: 10px;
             margin-top: 1.5rem;
         }
+        /* DataFrame styling */
+        .stDataFrame {
+            color: white !important;
+        }
+        /* Markdown text color */
+        .stMarkdown {
+            color: white !important;
+        }
+        /* Title styling */
+        h1, h2, h3 {
+            color: white !important;
+        }
+        /* Alert styling */
+        .stAlert {
+            background-color: #2e2e2e !important;
+            color: white !important;
+            padding: 1.25rem !important;
+            font-size: 1rem !important;
+            border-radius: 0.5rem !important;
+        }
+        /* Chart background */
+        .js-plotly-plot .plotly .main-svg {
+            background-color: #1A1A1A !important;
+        }
+        /* Completeness breakdown section */
+        .field-completeness {
+            background-color: #2e2e2e;
+            padding: 1.2rem;
+            border-radius: 10px;
+            margin-top: 1.5rem;
+            color: lightgray;
+        }
+    </style>
 """, unsafe_allow_html=True)
+# Banner image
 st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
+# App header
 st.title("MetaDiscovery Agent for Library of Congress Collections")
 st.markdown("""
+This tool connects to the LOC API, retrieves metadata from a selected collection, and performs an
+analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
 """)
+# Collection URLs using the correct LOC API format
 collections = {
     "American Revolutionary War Maps": "american+revolutionary+war+maps",
     "Civil War Maps": "civil+war+maps",
     "World War I Posters": "world+war+posters"
 }
+# Initialize metadata_df variable
 metadata_df = pd.DataFrame()
+# Add collection selector to sidebar
 selected = st.sidebar.selectbox("Select a collection", list(collections.keys()), key="collection_selector")
 search_query = collections[selected]
 # Define the collection URL
 collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"
+# Create placeholders for sidebar elements
 stats_placeholder = st.sidebar.empty()
 completeness_placeholder = st.sidebar.empty()
+# Helpful Resources (styled section in sidebar)
 st.sidebar.markdown("""
+    <div style='
+        margin-top: 1.5rem;
+        color: lightgray;
+    '>
+        <h3 style='font-size: 1.1rem; font-weight: 600;'>🔗 Helpful Resources</h3>
+        <ul style='padding-left: 1em; list-style-type: none;'>
+            <li><a href="https://www.loc.gov/apis/" target="_blank" style="color: lightgray; text-decoration: none;">LOC API Info</a></li>
+            <li><a href="https://www.loc.gov/" target="_blank" style="color: lightgray; text-decoration: none;">Library of Congress Homepage</a></li>
+            <li><a href="https://www.loc.gov/collections/" target="_blank" style="color: lightgray; text-decoration: none;">LOC Digital Collections</a></li>
+            <li><a href="https://www.loc.gov/marc/" target="_blank" style="color: lightgray; text-decoration: none;">MARC Metadata Standards</a></li>
+            <li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank" style="color: lightgray; text-decoration: none;">LOC Digital Strategy</a></li>
         </ul>
     </div>
 """, unsafe_allow_html=True)
+# Set fetch_data to True to automatically fetch data
 fetch_data = True
 if fetch_data:
     # Display a loading spinner while fetching data
     with st.spinner(f"Fetching data for {selected}..."):
             filled_fields = metadata_df.apply(lambda row: row.map(lambda x: not is_incomplete(x)), axis=1).sum().sum()
             overall_percent = (filled_fields / total_fields) * 100
+            # Add "Overall Metadata Completeness" indicator to sidebar
+            st.sidebar.markdown(
+                f"""
+                <div style='
+                    background-color: #2e2e2e;
+                    padding: 1rem;
+                    border-radius: 10px;
+                    margin-top: 1.5rem;
+                    text-align: center;
+                '>
+                    <h3 style='color: lightgray; font-size: 1rem; margin-bottom: 0.5rem;'>Overall Metadata Completeness:</h3>
+                    <p style='color: white; font-size: 1.8rem; font-weight: bold; margin: 0;'>{overall_percent:.1f}%</p>
+                </div>
+                """,
+                unsafe_allow_html=True
+            )
             # Field-by-field completeness
             completeness = metadata_df.map(lambda x: not is_incomplete(x)).mean() * 100
             completeness_table = completeness.round(1).to_frame(name="Completeness (%)")
             # Render stats summary in sidebar
             stats_html = f"""
             <div class="sidebar-stats">
+                <h3 style="color: lightgray; font-size: 1.1rem;">Quick Stats</h3>
                 <p style="color:lightgray;">Total Records: <b>{len(metadata_df)}</b></p>
                 <p style="color:lightgray;">Incomplete Records: <b>{incomplete_count}</b></p>
             </div>
             """
             stats_placeholder.markdown(stats_html, unsafe_allow_html=True)
+            # Fill the Field Completeness Breakdown placeholder
             with completeness_placeholder:
                 st.markdown("""
+                    <div class='field-completeness'>
+                        <h4 style='margin-bottom: 1rem; color: lightgray;'>Field Completeness Breakdown</h4>
                 """, unsafe_allow_html=True)
+                # Create a styled dataframe showing completeness percentages
+                completeness_df = pd.DataFrame({
+                    "Field": completeness.index,
+                    "Completeness (%)": completeness.values
+                })
+                # Display the dataframe directly in the sidebar
                 st.dataframe(
+                    completeness_df.style.background_gradient(cmap="Greens").format("{:.1f}%"),
                     use_container_width=True,
                     height=240
                 )
                 st.markdown("</div>", unsafe_allow_html=True)
+            # Display retrieved metadata sample in main panel
+            st.subheader("Retrieved Metadata Sample")
+            st.dataframe(metadata_df.head())
+            # Metadata completeness analysis (bar chart)
+            st.subheader("Metadata Completeness Analysis")
+            # Create a bar chart with a dark theme to match the screenshot
+            fig = px.bar(
+                completeness_df,
+                x="Field",
+                y="Completeness (%)",
+                title="Metadata Completeness by Field",
+                color="Completeness (%)",
+                color_continuous_scale="Greens"
+            )
+            # Update the chart layout to match dark theme
+            fig.update_layout(
+                plot_bgcolor="#1A1A1A",
+                paper_bgcolor="#1A1A1A",
+                font_color="white",
+                title_font_color="white",
+                margin=dict(l=10, r=10, t=40, b=10),
+                coloraxis_showscale=False
+            )
+            # Update axes
+            fig.update_xaxes(title_font_color="white", tickfont_color="white", gridcolor="#333333")
+            fig.update_yaxes(title_font_color="white", tickfont_color="white", gridcolor="#333333")
+            st.plotly_chart(fig, use_container_width=True)
+            # Enhanced Metadata section
+            st.subheader("✨ Suggested Metadata Enhancements")
+            # Identify incomplete records with descriptions
             incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
             incomplete_records = metadata_df[incomplete_mask]
             incomplete_with_desc = incomplete_records[incomplete_records['description'].notnull()]
             reference_df = metadata_df[metadata_df['subject'].notnull() & metadata_df['description'].notnull()]
+            # Create TF-IDF vectorizer
             tfidf = TfidfVectorizer(stop_words='english')
             if len(incomplete_with_desc) > 1 and len(reference_df) > 1: