Spaces:

CCockrum
/

LOC-Metadate-Analyzer

Running

App Files Files Community

CCockrum committed on Apr 25

Commit

4e04d7b

verified ·

1 Parent(s): e1cc37a

Update app.py

Browse files

Files changed (1) hide show

app.py +147 -166

app.py CHANGED Viewed

@@ -1,126 +1,108 @@
 import requests
 import pandas as pd
 import numpy as np
 import streamlit as st
 import plotly.express as px
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-# Custom CSS for styling to match the screenshot
 st.markdown("""
     <style>
-        /* Main background and text colors */
         .main {
-            background-color: #1A1A1A !important;
-            color: white !important;
         }
-        /* Container styling */
         .block-container {
-            background-color: #1A1A1A !important;
-            color: white !important;
-            padding-left: 2rem !important;
-            padding-right: 2rem !important;
         }
-        /* Header styling */
-        header[data-testid="stHeader"] {
-            background-color: #1A1A1A !important;
-        }
-        /* Sidebar styling */
         section[data-testid="stSidebar"] > div:first-child {
-            background-color: #1A1A1A !important;
-            color: #FFFFFF !important;
-            padding: 2rem 1.5rem 1.5rem 1.5rem !important;
             border-radius: 12px;
-            box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
         }
-        /* Overall app background */
         html, body, [data-testid="stApp"] {
             background-color: #1A1A1A !important;
         }
-        /* Custom table styling */
         .custom-table {
-            background-color: #2e2e2e;
-            color: white;
             font-family: monospace;
             padding: 1rem;
             border-radius: 8px;
             overflow-x: auto;
             white-space: pre;
-            border: 1px solid #444;
         }
-        /* Sidebar stats styling */
         .sidebar-stats {
             color: lightgray !important;
             font-size: 1.1rem !important;
             margin-top: 1.5rem;
             font-weight: 600;
         }
-        /* Sidebar contrast block */
         .sidebar-contrast-block {
-            background-color: #2e2e2e !important;
             padding: 1.25rem;
             border-radius: 10px;
             margin-top: 1.5rem;
         }
-        /* DataFrame styling */
-        .stDataFrame {
-            color: white !important;
-        }
-        /* Markdown text color */
-        .stMarkdown {
-            color: white !important;
-        }
-        /* Title styling */
-        h1, h2, h3 {
-            color: white !important;
-        }
-        /* Alert styling */
-        .stAlert {
-            background-color: #2e2e2e !important;
-            color: white !important;
-            padding: 1.25rem !important;
-            font-size: 1rem !important;
-            border-radius: 0.5rem !important;
-        }
-        /* Chart background */
-        .js-plotly-plot .plotly .main-svg {
-            background-color: #1A1A1A !important;
-        }
-        /* Completeness breakdown section */
-        .field-completeness {
-            background-color: #2e2e2e;
-            padding: 1.2rem;
-            border-radius: 10px;
-            margin-top: 1.5rem;
-            color: lightgray;
-        }
-    </style>
 """, unsafe_allow_html=True)
-# Banner image
 st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
-# App header
 st.title("MetaDiscovery Agent for Library of Congress Collections")
 st.markdown("""
-This tool connects to the LOC API, retrieves metadata from a selected collection, and performs an
-analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
 """)
-# Collection URLs using the correct LOC API format
 collections = {
     "American Revolutionary War Maps": "american+revolutionary+war+maps",
     "Civil War Maps": "civil+war+maps",
@@ -128,40 +110,62 @@ collections = {
     "World War I Posters": "world+war+posters"
 }
-# Initialize metadata_df variable
 metadata_df = pd.DataFrame()
-# Add collection selector to sidebar
 selected = st.sidebar.selectbox("Select a collection", list(collections.keys()), key="collection_selector")
 search_query = collections[selected]
 # Define the collection URL
 collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"
-# Create placeholders for sidebar elements
 stats_placeholder = st.sidebar.empty()
 completeness_placeholder = st.sidebar.empty()
-# Helpful Resources (styled section in sidebar)
 st.sidebar.markdown("""
-    <div style='
-        margin-top: 1.5rem;
-        color: lightgray;
-    '>
-        <h3 style='font-size: 1.1rem; font-weight: 600;'>🔗 Helpful Resources</h3>
-        <ul style='padding-left: 1em; list-style-type: none;'>
-            <li><a href="https://www.loc.gov/apis/" target="_blank" style="color: lightgray; text-decoration: none;">LOC API Info</a></li>
-            <li><a href="https://www.loc.gov/" target="_blank" style="color: lightgray; text-decoration: none;">Library of Congress Homepage</a></li>
-            <li><a href="https://www.loc.gov/collections/" target="_blank" style="color: lightgray; text-decoration: none;">LOC Digital Collections</a></li>
-            <li><a href="https://www.loc.gov/marc/" target="_blank" style="color: lightgray; text-decoration: none;">MARC Metadata Standards</a></li>
-            <li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank" style="color: lightgray; text-decoration: none;">LOC Digital Strategy</a></li>
         </ul>
     </div>
 """, unsafe_allow_html=True)
-# Set fetch_data to True to automatically fetch data
-fetch_data = True
 if fetch_data:
     # Display a loading spinner while fetching data
     with st.spinner(f"Fetching data for {selected}..."):
@@ -228,23 +232,6 @@ if fetch_data:
             filled_fields = metadata_df.apply(lambda row: row.map(lambda x: not is_incomplete(x)), axis=1).sum().sum()
             overall_percent = (filled_fields / total_fields) * 100
-            # Add "Overall Metadata Completeness" indicator to sidebar
-            st.sidebar.markdown(
-                f"""
-                <div style='
-                    background-color: #2e2e2e;
-                    padding: 1rem;
-                    border-radius: 10px;
-                    margin-top: 1.5rem;
-                    text-align: center;
-                '>
-                    <h3 style='color: lightgray; font-size: 1rem; margin-bottom: 0.5rem;'>Overall Metadata Completeness:</h3>
-                    <p style='color: white; font-size: 1.8rem; font-weight: bold; margin: 0;'>{overall_percent:.1f}%</p>
-                </div>
-                """,
-                unsafe_allow_html=True
-            )
             # Field-by-field completeness
             completeness = metadata_df.map(lambda x: not is_incomplete(x)).mean() * 100
             completeness_table = completeness.round(1).to_frame(name="Completeness (%)")
@@ -252,85 +239,79 @@ if fetch_data:
             # Render stats summary in sidebar
             stats_html = f"""
             <div class="sidebar-stats">
-                <h3 style="color: lightgray; font-size: 1.1rem;">Quick Stats</h3>
                 <p style="color:lightgray;">Total Records: <b>{len(metadata_df)}</b></p>
                 <p style="color:lightgray;">Incomplete Records: <b>{incomplete_count}</b></p>
             </div>
             """
             stats_placeholder.markdown(stats_html, unsafe_allow_html=True)
-            # Fill the Field Completeness Breakdown placeholder
             with completeness_placeholder:
                 st.markdown("""
-                    <div class='field-completeness'>
-                        <h4 style='margin-bottom: 1rem; color: lightgray;'>Field Completeness Breakdown</h4>
                 """, unsafe_allow_html=True)
-                # Create a dataframe showing completeness percentages
-                completeness_df = pd.DataFrame({
-                    "Field": completeness.index,
-                    "Completeness (%)": completeness.values
-                })
-                # FIX: Format the values before styling to avoid the ValueError
-                # Convert percentages to strings with format applied
-                completeness_df["Completeness (%)"] = completeness_df["Completeness (%)"].apply(lambda x: f"{x:.1f}")
-                # Display the dataframe directly in the sidebar
                 st.dataframe(
-                    completeness_df,  # No styling applied here to avoid format errors
                     use_container_width=True,
                     height=240
                 )
                 st.markdown("</div>", unsafe_allow_html=True)
-            # Display retrieved metadata sample in main panel
-            st.subheader("Retrieved Metadata Sample")
-            st.dataframe(metadata_df.head())
-            # Metadata completeness analysis (bar chart)
-            st.subheader("Metadata Completeness Analysis")
-            # FIX: Convert percentages to numeric for plotting
-            completeness_df["Completeness (%)"] = pd.to_numeric(completeness_df["Completeness (%)"])
-            # Create a bar chart with a dark theme to match the screenshot
-            fig = px.bar(
-                completeness_df,
-                x="Field",
-                y="Completeness (%)",
-                title="Metadata Completeness by Field",
-                color="Completeness (%)",
-                color_continuous_scale="Greens"
-            )
-            # Update the chart layout to match dark theme
-            fig.update_layout(
-                plot_bgcolor="#1A1A1A",
-                paper_bgcolor="#1A1A1A",
-                font_color="white",
-                title_font_color="white",
-                margin=dict(l=10, r=10, t=40, b=10),
-                coloraxis_showscale=False
-            )
-            # Update axes
-            fig.update_xaxes(title_font_color="white", tickfont_color="white", gridcolor="#333333")
-            fig.update_yaxes(title_font_color="white", tickfont_color="white", gridcolor="#333333")
-            st.plotly_chart(fig, use_container_width=True)
-            # Enhanced Metadata section
-            st.subheader("✨ Suggested Metadata Enhancements")
-            # Identify incomplete records with descriptions
             incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
             incomplete_records = metadata_df[incomplete_mask]
             incomplete_with_desc = incomplete_records[incomplete_records['description'].notnull()]
             reference_df = metadata_df[metadata_df['subject'].notnull() & metadata_df['description'].notnull()]
-            # Create TF-IDF vectorizer
             tfidf = TfidfVectorizer(stop_words='english')
             if len(incomplete_with_desc) > 1 and len(reference_df) > 1:

+# MetaDiscovery Agent - LOC API with Enhanced Completeness and Quality Analysis
 import requests
 import pandas as pd
 import numpy as np
 import streamlit as st
+import matplotlib
 import plotly.express as px
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+# Custom CSS: gray/dark backgrounds, styled sidebar, banner image, alerts, and table blocks
 st.markdown("""
     <style>
         .main {
+            background-color: #D3D3D3 !important;
+            color: #1A1A1A!important;
         }
         .block-container {
+            background-color: gray !important;
+            color: #808080!important;
         }
         section[data-testid="stSidebar"] > div:first-child {
+            background-color: #808080 !important;
+            padding: 1rem;
+            border-radius: 0.5rem;
+            color: #808080 !important;
+        }
+        .stMarkdown, .stTextInput, .stDataFrame {
+            color: #1A1A1A!important;
+        }
+        img.banner {
+            width: 100%;
             border-radius: 12px;
+            margin-bottom: 1rem;
         }
+                                 .stAlert {
+            background-color: #f0f0f5 !important;
+            color: #333333 !important;
+            padding: 1.25rem !important;
+            font-size: 1rem !important;
+            border-radius: 0.5rem !important;
+            box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05) !important;
+        }
+        header[data-testid="stHeader"] {
+    background-color: gray !important;
+}
+        section[data-testid="stSidebar"] > div:first-child {
+    background-color: #1A1A1A !important;
+    color: #FFFFFF !important;
+    padding: 2rem 1.5rem 1.5rem 1.5rem !important;
+    border-radius: 12px;
+    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
+    font-size: 0.95rem;
+    line-height: 1.5;
+}
+            .block-container {
+    background-color: gray !important;
+    color: #1A1A1A !important;
+    padding-left: 2rem !important;
+    padding-right: 2rem !important;
+    box-shadow: none !important;
+}
         html, body, [data-testid="stApp"] {
             background-color: #1A1A1A !important;
         }
         .custom-table {
+            background-color: #D3D3D3;
+            color: #1A1A1A;
             font-family: monospace;
             padding: 1rem;
             border-radius: 8px;
             overflow-x: auto;
             white-space: pre;
+            border: 1px solid #ccc;
         }
         .sidebar-stats {
             color: lightgray !important;
             font-size: 1.1rem !important;
             margin-top: 1.5rem;
             font-weight: 600;
         }
         .sidebar-contrast-block {
+            background-color: #2b2b2b !important;  /* Slightly lighter than #1A1A1A */
             padding: 1.25rem;
             border-radius: 10px;
             margin-top: 1.5rem;
         }
+</style>
 """, unsafe_allow_html=True)
+# Banner image, loaded from a URL
 st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
+# Streamlit app header
 st.title("MetaDiscovery Agent for Library of Congress Collections")
 st.markdown("""
+This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
+an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
 """)
+# Updated collection URLs using the correct LOC API format
 collections = {
     "American Revolutionary War Maps": "american+revolutionary+war+maps",
     "Civil War Maps": "civil+war+maps",
     "World War I Posters": "world+war+posters"
 }
+# Sidebar for selecting collection
+#st.sidebar.markdown("## Settings")
+# Create empty metadata_df variable to ensure it exists before checking
 metadata_df = pd.DataFrame()
+# Add a key to the selectbox to ensure it refreshes properly
 selected = st.sidebar.selectbox("Select a collection", list(collections.keys()), key="collection_selector")
 search_query = collections[selected]
 # Define the collection URL
 collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"
+# Create an empty placeholder for Quick Stats
 stats_placeholder = st.sidebar.empty()
+# Create placeholder for Field Completeness Breakdown
 completeness_placeholder = st.sidebar.empty()
+# Helpful Resources (styled and moved below dropdown)
+st.sidebar.markdown("### Helpful Resources", unsafe_allow_html=True)
+# Helpful Resources: styled link list rendered in the sidebar
 st.sidebar.markdown("""
+    <style>
+        .sidebar-section h3 {
+            color: lightgray !important;
+            font-size: 1.1rem !important;
+            margin-top: 1.5rem;
+        }
+        .sidebar-links a {
+            color: lightgray !important;
+            text-decoration: none !important;
+        }
+        .sidebar-links a:hover {
+            text-decoration: underline !important;
+        }
+    </style>
+    <div class="sidebar-section">
+      <h3>🔗 Helpful Resources</h3>
+      <div class="sidebar-links">
+        <ul style='padding-left: 1em'>
+          <li><a href="https://www.loc.gov/apis/" target="_blank">LOC API Info</a></li>
+          <li><a href="https://www.loc.gov/" target="_blank">Library of Congress Homepage</a></li>
+          <li><a href="https://www.loc.gov/collections/" target="_blank">LOC Digital Collections</a></li>
+          <li><a href="https://www.loc.gov/marc/" target="_blank">MARC Metadata Standards</a></li>
+          <li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank">LOC Digital Strategy</a></li>
         </ul>
+      </div>
     </div>
 """, unsafe_allow_html=True)
+# Fetch automatically on every run (no button is rendered; the flag is hard-coded to True)
+fetch_data = True
 if fetch_data:
     # Display a loading spinner while fetching data
     with st.spinner(f"Fetching data for {selected}..."):
             filled_fields = metadata_df.apply(lambda row: row.map(lambda x: not is_incomplete(x)), axis=1).sum().sum()
             overall_percent = (filled_fields / total_fields) * 100
             # Field-by-field completeness
             completeness = metadata_df.map(lambda x: not is_incomplete(x)).mean() * 100
             completeness_table = completeness.round(1).to_frame(name="Completeness (%)")
             # Render stats summary in sidebar
             stats_html = f"""
             <div class="sidebar-stats">
+                <h3 style="color: lightgray;">Quick Stats</h3>
                 <p style="color:lightgray;">Total Records: <b>{len(metadata_df)}</b></p>
                 <p style="color:lightgray;">Incomplete Records: <b>{incomplete_count}</b></p>
+                <p style="color:lightgray;">Overall Metadata Completeness: <b>{overall_percent:.1f}%</b></p>
             </div>
             """
             stats_placeholder.markdown(stats_html, unsafe_allow_html=True)
+        # Utility functions for deeper metadata quality analysis
+        def is_incomplete(value):
+            """Return True when a metadata value should be counted as missing.
+
+            A value is missing when it is None, NaN/NaT/pd.NA, or one of the
+            placeholder strings "", "N/A" or "null". A list, tuple or set is
+            missing when it is empty or every element is itself missing.
+
+            Args:
+                value: A single cell value from the metadata DataFrame.
+
+            Returns:
+                bool: True if the value is missing, False otherwise.
+            """
+            # pd.isna() returns an array for list-likes, and `array or ...`
+            # raises "truth value of an array is ambiguous", so containers
+            # are handled element by element before the scalar checks.
+            if isinstance(value, (list, tuple, set)):
+                return all(is_incomplete(item) for item in value)
+            if value is None:
+                return True
+            if isinstance(value, str):
+                return value in ("", "N/A", "null")
+            return bool(pd.isna(value))
+        def is_valid_date(value):
+            """Return True when pandas can convert ``value`` to a datetime.
+
+            Args:
+                value: The candidate date value (typically a string).
+
+            Returns:
+                bool: True if ``pd.to_datetime(value)`` succeeds, False if it
+                raises a conversion error.
+            """
+            # NOTE(review): missing values (None, "") appear to convert to
+            # None/NaT without raising, so they are reported as valid here;
+            # confirm whether callers filter them out first.
+            try:
+                pd.to_datetime(value)
+            except (ValueError, TypeError, OverflowError):
+                # Only conversion failures mean "not a date"; a bare except
+                # would also swallow KeyboardInterrupt and SystemExit.
+                return False
+            return True
+        if not metadata_df.empty:
+            st.subheader("Retrieved Metadata Sample")
+            st.dataframe(metadata_df.head())
+            # Metadata completeness analysis (enhanced)
+            st.subheader("Metadata Completeness Analysis")
+            # Create the completeness table
+            completeness = metadata_df.map(lambda x: not is_incomplete(x)).mean() * 100
+            completeness_df = pd.DataFrame({
+                "Field": completeness.index,
+                "Completeness (%)": completeness.values
+            })
+            completeness_table = completeness_df.set_index("Field")
+            # Fill the sidebar placeholder created earlier with the completeness table
             with completeness_placeholder:
                 st.markdown("""
+                    <div style='
+                        background-color: #2e2e2e;
+                        padding: 1.2rem;
+                        border-radius: 10px;
+                        margin-top: 1.5rem;
+                        color: lightgray;
+                    '>
+                    <h4 style='margin-bottom: 1rem;'>📊 Field Completeness Breakdown</h4>
                 """, unsafe_allow_html=True)
                 st.dataframe(
+                    completeness_table.style.background_gradient(cmap="Greens").format("{:.1f}%"),
                     use_container_width=True,
                     height=240
                 )
                 st.markdown("</div>", unsafe_allow_html=True)
+                completeness_table = completeness.round(1).to_frame(name="Completeness (%)")
+            # Then continue plotting in main panel
+            fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
+            st.plotly_chart(fig)
+            # Identify incomplete records
             incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
             incomplete_records = metadata_df[incomplete_mask]
+            st.subheader("✨ Suggested Metadata Enhancements")
             incomplete_with_desc = incomplete_records[incomplete_records['description'].notnull()]
             reference_df = metadata_df[metadata_df['subject'].notnull() & metadata_df['description'].notnull()]
             tfidf = TfidfVectorizer(stop_words='english')
             if len(incomplete_with_desc) > 1 and len(reference_df) > 1: