Spaces:

CCockrum
/

LOC-Metadate-Analyzer

Running

App Files Files Community

CCockrum committed on Apr 25

Commit

c39747a

verified ·

1 Parent(s): 90247f9

Update app.py

Browse files

Files changed (1) hide show

app.py +183 -175

app.py CHANGED Viewed

@@ -1,213 +1,221 @@
-# MetaDiscovery Agent - LOC API with Enhanced Completeness and Quality Analysis
 import requests
 import pandas as pd
 import streamlit as st
 import plotly.express as px
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-import matplotlib
-# --- CUSTOM CSS ---
 st.markdown("""
-<style>
-    html, body, [data-testid="stApp"] {
-        background-color: #1A1A1A !important;
-    }
-    .block-container {
-        background-color: gray !important;
-        color: #1A1A1A !important;
-        padding-left: 2rem !important;
-        padding-right: 2rem !important;
-    }
-    section[data-testid="stSidebar"] > div:first-child {
-        background-color: #1A1A1A !important;
-        color: #FFFFFF !important;
-        padding: 2rem 1.5rem;
-        border-radius: 12px;
-    }
-    .sidebar-contrast-block {
-        background-color: #2b2b2b !important;
-        padding: 1rem;
-        border-radius: 10px;
-        margin-top: 1.5rem;
-        color: lightgray;
-    }
-    .custom-table {
-        background-color: #D3D3D3;
-        color: #1A1A1A;
-        font-family: monospace;
-        padding: 1rem;
-        border-radius: 8px;
-        overflow-x: auto;
-        white-space: pre;
-        border: 1px solid #ccc;
-    }
-    .sidebar-links a {
-        color: lightgray !important;
-        text-decoration: none !important;
-    }
-    .sidebar-links a:hover {
-        text-decoration: underline !important;
-    }
-</style>
 """, unsafe_allow_html=True)
-# --- HEADER ---
 st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
 st.title("MetaDiscovery Agent for Library of Congress Collections")
-st.markdown("This tool connects to the LOC API, retrieves metadata from a selected collection, and performs an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.")
-# --- COLLECTION SETUP ---
 collections = {
     "American Revolutionary War Maps": "american+revolutionary+war+maps",
     "Civil War Maps": "civil+war+maps",
     "Women's Suffrage": "women+suffrage",
     "World War I Posters": "world+war+posters"
 }
 selected = st.sidebar.selectbox("Select a collection", list(collections.keys()), key="collection_selector")
 search_query = collections[selected]
 collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"
-# --- PLACEHOLDERS FOR SIDEBAR BLOCKS ---
-stats_placeholder = st.sidebar.container()
-completeness_placeholder = st.sidebar.container()
-# --- HELPFUL RESOURCES ---
 st.sidebar.markdown("""
-<div class="sidebar-contrast-block">
-<h4>🔗 Helpful Resources</h4>
-<ul class="sidebar-links">
-  <li><a href="https://www.loc.gov/apis/" target="_blank">LOC API Info</a></li>
-  <li><a href="https://www.loc.gov/" target="_blank">Library of Congress Homepage</a></li>
-  <li><a href="https://www.loc.gov/collections/" target="_blank">LOC Digital Collections</a></li>
-  <li><a href="https://www.loc.gov/marc/" target="_blank">MARC Metadata Standards</a></li>
-  <li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank">LOC Digital Strategy</a></li>
-</ul>
 </div>
 """, unsafe_allow_html=True)
-# Define Utility Functions
-def is_incomplete(value):
-    return pd.isna(value) or value in ["", "N/A", "null", None]
-def is_valid_date(value):
-    try:
-        pd.to_datetime(value)
-        return True
-    except:
-        return False
-# Fetch data from LOC API
-def fetch_loc_data(collection_url):
     headers = {"User-Agent": "Mozilla/5.0"}
     try:
         response = requests.get(collection_url, headers=headers)
         response.raise_for_status()
         data = response.json()
-        if "results" in data:
-            return data["results"]
-        elif "items" in data:
-            return data["items"]
-        else:
-            return []
-    except Exception as e:
-        st.error(f"API Error: {e}")
-        return []
-# Transform Records
-def transform_records(records):
-    items = []
-    for record in records:
-        if isinstance(record, dict):
-            description = record.get("description", "")
-            if isinstance(description, list):
-                description = " ".join(map(str, description))
-            item = {
-                "id": record.get("id", ""),
-                "title": record.get("title", ""),
-                "date": record.get("date", ""),
-                "subject": ", ".join(record.get("subject", [])) if isinstance(record.get("subject"), list) else record.get("subject", ""),
-                "creator": record.get("creator", ""),
-                "description": description
-            }
-            items.append(item)
-    return pd.DataFrame(items)
-# Render Main Application Sections
-def render_main_sections(metadata_df, stats_placeholder, completeness_placeholder):
-    if not metadata_df.empty:
-        # Sidebar Quick Stats
-        incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
-        incomplete_count = incomplete_mask.sum()
-        total_fields = metadata_df.size
-        filled_fields = metadata_df.map(lambda x: not is_incomplete(x)).sum().sum()
-        overall_percent = (filled_fields / total_fields) * 100
-        # Completeness Table
-        completeness = metadata_df.map(lambda x: not is_incomplete(x)).mean() * 100
-        completeness_table = completeness.round(1).to_frame(name="Completeness (%)")
-        # Update sidebar placeholders
-        stats_html = f"""
-        <div class="sidebar-stats">
-            <h3 style="color: lightgray;">Quick Stats</h3>
-            <p style="color:lightgray;">Total Records: <b>{len(metadata_df)}</b></p>
-            <p style="color:lightgray;">Incomplete Records: <b>{incomplete_count}</b></p>
-            <p style="color:lightgray;">Overall Completeness: <b>{overall_percent:.1f}%</b></p>
-        </div>
-        """
-        stats_placeholder.markdown(stats_html, unsafe_allow_html=True)
-        # Field Completeness Breakdown (inside contrast block)
-        with completeness_placeholder:
-            st.markdown("""
-                <div style='background-color:#2e2e2e; padding:1.25rem; border-radius:8px; margin-top:1.5rem;'>
-                <h4 style='color: lightgray;'>Field Completeness Breakdown</h4>
-            """, unsafe_allow_html=True)
-            st.dataframe(
-                completeness_table.style.background_gradient(cmap="Greens").format("{:.1f}%"),
-                use_container_width=True
-            )
-            st.markdown("</div>", unsafe_allow_html=True)
-        # Main Body
-        st.subheader("📂 Retrieved Metadata Sample")
-        st.dataframe(metadata_df.head())
-        st.subheader("📊 Metadata Completeness Analysis")
-        completeness_df = completeness.reset_index()
-        completeness_df.columns = ["Field", "Completeness (%)"]
-        fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
-        st.plotly_chart(fig)
-        # Suggested Metadata Enhancements
-        st.subheader("✨ Suggested Metadata Enhancements")
-        incomplete_with_desc = metadata_df[incomplete_mask & metadata_df['description'].notna()]
-        reference_df = metadata_df[metadata_df['subject'].notna() & metadata_df['description'].notna()]
-        if not incomplete_with_desc.empty and not reference_df.empty:
-            try:
-                tfidf = TfidfVectorizer(stop_words='english')
-                tfidf_matrix = tfidf.fit_transform(reference_df['description'])
-                suggestions = []
-                for idx, row in incomplete_with_desc.iterrows():
-                    desc_vec = tfidf.transform([row['description']])
                     sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
                     top_idx = sims.argmax()
                     suggested_subject = reference_df.iloc[top_idx]['subject']
-                    if suggested_subject:
                         suggestions.append((row['title'], suggested_subject))
-                if suggestions:
-                    suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
-                    st.markdown("<div class='custom-table'>" + suggestions_df.to_markdown(index=False) + "</div>", unsafe_allow_html=True)
-                else:
-                    st.info("No suggestions could be generated.")
-            except Exception as e:
-                st.error(f"Error generating suggestions: {e}")
-        else:
-            st.info("Not enough data for metadata enhancement suggestions.")
     else:
-        st.warning("No metadata found for the selected collection.")

 import requests
 import pandas as pd
 import streamlit as st
+import matplotlib
 import plotly.express as px
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+# ------------------- Custom CSS -------------------
 st.markdown("""
+    <style>
+        html, body, [data-testid="stApp"] {
+            background-color: #1A1A1A !important;
+        }
+        .main {
+            background-color: #D3D3D3 !important;
+            color: #1A1A1A!important;
+        }
+        .block-container {
+            background-color: gray !important;
+            color: #1A1A1A !important;
+            padding-left: 2rem !important;
+            padding-right: 2rem !important;
+        }
+        section[data-testid="stSidebar"] > div:first-child {
+            background-color: #1A1A1A !important;
+            color: #FFFFFF !important;
+            padding: 2rem 1.5rem 1.5rem 1.5rem !important;
+            border-radius: 12px;
+            box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
+            font-size: 0.95rem;
+        }
+        .custom-table {
+            background-color: #D3D3D3;
+            color: #1A1A1A;
+            font-family: monospace;
+            padding: 1rem;
+            border-radius: 8px;
+            overflow-x: auto;
+            white-space: pre;
+            border: 1px solid #ccc;
+        }
+        .sidebar-stats {
+            color: lightgray !important;
+            font-size: 1.1rem !important;
+            font-weight: 600;
+        }
+        .sidebar-contrast-block {
+            background-color: #2b2b2b !important;
+            padding: 1.25rem;
+            border-radius: 10px;
+            margin-top: 1.5rem;
+        }
+        .sidebar-section h3 {
+            color: lightgray !important;
+            font-size: 1.1rem !important;
+            margin-top: 1.5rem;
+        }
+        .sidebar-links a {
+            color: lightgray !important;
+            text-decoration: none !important;
+        }
+        .sidebar-links a:hover {
+            text-decoration: underline !important;
+        }
+    </style>
 """, unsafe_allow_html=True)
+# ------------------- Banner Image -------------------
 st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
+# ------------------- App Title & Description -------------------
 st.title("MetaDiscovery Agent for Library of Congress Collections")
+st.markdown("""
+This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
+an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
+""")
+# ------------------- Collection Selection -------------------
 collections = {
     "American Revolutionary War Maps": "american+revolutionary+war+maps",
     "Civil War Maps": "civil+war+maps",
     "Women's Suffrage": "women+suffrage",
     "World War I Posters": "world+war+posters"
 }
 selected = st.sidebar.selectbox("Select a collection", list(collections.keys()), key="collection_selector")
 search_query = collections[selected]
 collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"
+# ------------------- Placeholders -------------------
+stats_placeholder = st.sidebar.empty()
+completeness_placeholder = st.sidebar.empty()
+# ------------------- Helpful Resources -------------------
 st.sidebar.markdown("""
+<div class="sidebar-section">
+  <h3>🔗 Helpful Resources</h3>
+  <div class="sidebar-links">
+    <ul style='padding-left: 1em'>
+      <li><a href="https://www.loc.gov/apis/" target="_blank">LOC API Info</a></li>
+      <li><a href="https://www.loc.gov/" target="_blank">Library of Congress Homepage</a></li>
+      <li><a href="https://www.loc.gov/collections/" target="_blank">LOC Digital Collections</a></li>
+      <li><a href="https://www.loc.gov/marc/" target="_blank">MARC Metadata Standards</a></li>
+      <li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank">LOC Digital Strategy</a></li>
+    </ul>
+  </div>
 </div>
 """, unsafe_allow_html=True)
+# ------------------- Fetch Data -------------------
+with st.spinner(f"Fetching data for {selected}..."):
     headers = {"User-Agent": "Mozilla/5.0"}
     try:
         response = requests.get(collection_url, headers=headers)
         response.raise_for_status()
         data = response.json()
+        records = data.get("results") or data.get("items") or []
+    except:
+        records = []
+        st.error("Failed to load data from LOC API")
+# ------------------- Data Preparation -------------------
+items = []
+for record in records:
+    description = record.get("description", "")
+    if isinstance(description, list):
+        description = " ".join([str(d) for d in description])
+    item = {
+        "id": record.get("id", ""),
+        "title": record.get("title", ""),
+        "date": record.get("date", ""),
+        "subject": ", ".join(record.get("subject", [])) if isinstance(record.get("subject"), list) else record.get("subject", ""),
+        "creator": record.get("creator", ""),
+        "description": description
+    }
+    items.append(item)
+metadata_df = pd.DataFrame(items)
+# ------------------- Completeness Logic -------------------
+def is_incomplete(value):
+    return pd.isna(value) or value in ["", "N/A", "null", None]
+if not metadata_df.empty:
+    incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
+    incomplete_count = incomplete_mask.sum()
+    total_fields = metadata_df.size
+    filled_fields = (~metadata_df.map(is_incomplete)).sum().sum()
+    overall_percent = (filled_fields / total_fields) * 100
+    completeness = (~metadata_df.map(is_incomplete)).mean() * 100
+    completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
+    completeness_table = completeness_df.set_index("Field")
+    # ------------------- Quick Stats -------------------
+    stats_html = f"""
+    <div class="sidebar-stats">
+        <h3 style="color: lightgray;">📊 Quick Stats</h3>
+        <p style="color:lightgray;">Total Records: <b>{len(metadata_df)}</b></p>
+        <p style="color:lightgray;">Incomplete Records: <b>{incomplete_count}</b></p>
+        <p style="color:lightgray;">Overall Metadata Completeness: <b>{overall_percent:.1f}%</b></p>
+    </div>
+    """
+    stats_placeholder.markdown(stats_html, unsafe_allow_html=True)
+    # ------------------- Field Completeness Table -------------------
+    with completeness_placeholder:
+        st.markdown("""
+            <div style='
+                background-color: #2e2e2e;
+                padding: 1.2rem;
+                border-radius: 10px;
+                margin-top: 1.5rem;
+                color: lightgray;
+            '>
+            <h4 style='margin-bottom: 1rem;'>Field Completeness Breakdown</h4>
+        """, unsafe_allow_html=True)
+        st.dataframe(
+            completeness_table.style.background_gradient(cmap="Greens").format("{:.1f}%"),
+            use_container_width=True,
+            height=240
+        )
+        st.markdown("</div>", unsafe_allow_html=True)
+    # ------------------- Main Panel -------------------
+    st.subheader("Retrieved Metadata Sample")
+    st.dataframe(metadata_df.head())
+    st.subheader("Metadata Completeness Analysis")
+    fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
+    st.plotly_chart(fig)
+    # ------------------- Metadata Suggestions -------------------
+    st.subheader("✨ Suggested Metadata Enhancements")
+    incomplete_with_desc = metadata_df[incomplete_mask & metadata_df['description'].notnull()]
+    reference_df = metadata_df[metadata_df['subject'].notnull() & metadata_df['description'].notnull()]
+    if len(incomplete_with_desc) > 1 and len(reference_df) > 1:
+        try:
+            tfidf = TfidfVectorizer(stop_words='english')
+            tfidf_matrix = tfidf.fit_transform(reference_df['description'])
+            suggestions = []
+            for _, row in incomplete_with_desc.iterrows():
+                if pd.isna(row['subject']) and pd.notna(row['description']):
+                    desc_vec = tfidf.transform([str(row['description'])])
                     sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
                     top_idx = sims.argmax()
                     suggested_subject = reference_df.iloc[top_idx]['subject']
+                    if pd.notna(suggested_subject):
                         suggestions.append((row['title'], suggested_subject))
+            if suggestions:
+                suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
+                st.markdown("<div class='custom-table'>" + suggestions_df.to_markdown(index=False) + "</div>", unsafe_allow_html=True)
+            else:
+                st.info("No metadata enhancement suggestions available.")
+        except Exception as e:
+            st.error(f"Error generating suggestions: {e}")
     else:
+        st.info("Not enough descriptive data to generate metadata suggestions.")
+else:
+    st.warning("⚠️ No metadata records found for this collection.")