Spaces:

CCockrum
/

LOC-Metadate-Analyzer

Running

App Files Community

CCockrum committed on Apr 27

Commit

e004315

verified ·

1 Parent(s): fc8f7bb

Update app.py

Browse files

Files changed (1) hide show

app.py +151 -62

app.py CHANGED Viewed

@@ -1,12 +1,63 @@
 import requests
 import pandas as pd
-import numpy as np
 import streamlit as st
 import matplotlib
 import plotly.express as px
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 # Custom CSS
 st.markdown("""
     <style>
@@ -118,6 +169,11 @@ st.markdown("""
 </style>
 """, unsafe_allow_html=True)
 # Use an image from a URL for the banner
 st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
@@ -222,24 +278,50 @@ if fetch_data:
         metadata_df = pd.DataFrame(items)
         # Define custom completeness check
         def is_incomplete(value):
             return pd.isna(value) or value in ["", "N/A", "null", None]
         if not metadata_df.empty:
-            incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
-            incomplete_count = incomplete_mask.sum()
             total_fields = metadata_df.size
-            filled_fields = (~metadata_df.map(is_incomplete)).sum().sum()
             overall_percent = (filled_fields / total_fields) * 100
             # Field-level completeness
             completeness = (~metadata_df.map(is_incomplete)).mean() * 100
             completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
             completeness_table = completeness_df.set_index("Field")
-            # Sidebar Quick Stats (index hidden, orange theme)
             quick_stats = pd.DataFrame({
                 "Metric": ["Total Records", "Incomplete Records", "Percent Complete"],
                 "Value": [len(metadata_df), incomplete_count, round(overall_percent, 1)]
@@ -280,6 +362,7 @@ if fetch_data:
                     hide_index=True,  # <<< ADD THIS
                     height=min(300, len(missing_df) * 35 + 38)
             )
             # Calculate Top 10 Subjects
             if 'subject' in metadata_df.columns:
                 top_subjects = (
@@ -339,101 +422,107 @@ if fetch_data:
             st.dataframe(metadata_df.head())
-            # Fill the placeholder created earlier
             st.subheader("Field Completeness Breakdown")
             st.markdown("""
                 <div style='
                     background-color: #2e2e2e;
-                    padding: 1.2rem;
                     border-radius: 10px;
                     margin-top: 1.5rem;
                     color: lightgray;
                 '>
             """, unsafe_allow_html=True)
             st.dataframe(
-                completeness_table.style.background_gradient(cmap="Greens").format("{:.1f}%"),
                 use_container_width=True,
                 height=240
             )
             st.markdown("</div>", unsafe_allow_html=True)
             # Identify incomplete records
             incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
             incomplete_records = metadata_df[incomplete_mask]
-            st.subheader("Suggested Metadata Enhancements")
-# Look for records with descriptions but missing subjects or other fields
-incomplete_with_desc = metadata_df[metadata_df['description'].notnull() &
-                                  (metadata_df['subject'].isnull() |
-                                   metadata_df['creator'].isnull())]
-# Reference data should be complete records with both subjects and descriptions
-reference_df = metadata_df[metadata_df['subject'].notnull() &
-                          metadata_df['description'].notnull() &
-                          metadata_df['creator'].notnull()]
-# Print debugging info
-st.write(f"Records with descriptions but missing fields: {len(incomplete_with_desc)}")
-st.write(f"Complete reference records: {len(reference_df)}")
-tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
-if len(incomplete_with_desc) > 0 and len(reference_df) > 0:
-    try:
-        suggestions = []
-        # Fit TF-IDF on all complete descriptions
-        tfidf_matrix = tfidf.fit_transform(reference_df['description'].fillna('').astype(str))
-        # For each incomplete record
-        for idx, row in incomplete_with_desc.iterrows():
-            if pd.notna(row['description']):
-                # Transform this record's description
-                desc_vec = tfidf.transform([str(row['description'])])
-                # Get similarity scores to all reference records
-                sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
-                # Find the top 3 most similar records
-                top_indices = sims.argsort()[-3:][::-1]
-                # Get the most frequent subject among top matches
-                top_subjects = reference_df.iloc[top_indices]['subject'].value_counts().index
-                if len(top_subjects) > 0:
-                    suggested_subject = top_subjects[0]
-                    suggestions.append((row['title'], suggested_subject))
         if suggestions:
             suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
-            # Apply similar styling as your other tables
             styled_suggestions = (
                 suggestions_df.style
                 .background_gradient(cmap="Greens", subset=["Suggested Subject"])
                 .hide(axis="index")
             )
-            # Display as a dataframe with styling
             st.dataframe(
                 styled_suggestions,
                 use_container_width=True,
                 hide_index=True,
-                height=min(240, len(suggestions) * 35 + 38)
             )
         else:
-            empty_df = pd.DataFrame([["No metadata enhancement suggestions available."]],
-                                   columns=["Message"])
-            styled_empty = empty_df.style.hide(axis="index")
-            st.dataframe(styled_empty, use_container_width=True, hide_index=True)
-    except Exception as e:
-        st.error(f"Error generating metadata suggestions: {e}")
-        st.error(f"Error details: {str(e)}")
 else:
-    empty_df = pd.DataFrame([["Not enough descriptive data to generate metadata suggestions."]],
-                           columns=["Message"])
-    styled_empty = empty_df.style.hide(axis="index")
-    st.dataframe(styled_empty, use_container_width=True, hide_index=True)

+import os
 import requests
 import pandas as pd
 import streamlit as st
+import time
 import matplotlib
 import plotly.express as px
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+def is_missing(value):
+    return pd.isna(value) or str(value).strip() == ""
+# Load the Hugging Face API key from environment
+api_key = os.getenv('HF_API')
+def get_huggingface_suggestions(title, description):
+    API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-mnli"
+    headers = {"Authorization": f"Bearer {api_key}"}
+    full_text = f"{title}. {description}".strip()
+    if not full_text:
+        return None
+    candidate_labels = [
+        "History", "Politics", "Science", "Technology", "Art", "Literature",
+        "Education", "Economics", "Military", "Geography", "Sociology",
+        "Philosophy", "Religion", "Law", "Medicine", "Engineering",
+        "Mathematics", "Computer Science", "Agriculture", "Environment",
+        "Maps", "United States", "Civil War", "Revolution", "Posters", "Women's Rights", "World War I"
+    ]
+    payload = {
+        "inputs": full_text,
+        "parameters": {
+            "candidate_labels": candidate_labels,
+            "multi_label": True
+        }
+    }
+    try:
+        response = requests.post(API_URL, headers=headers, json=payload)
+        result = response.json()
+        if "error" in result:
+            st.error(f"API error: {result['error']}")
+            return None
+        labels = [
+            label for label, score in zip(result.get("labels", []), result.get("scores", []))
+            if score > 0.3
+        ]
+        return ", ".join(labels) if labels else None
+    except Exception as e:
+        st.error(f"API Error: {e}")
+        return None
 # Custom CSS
 st.markdown("""
     <style>
 </style>
 """, unsafe_allow_html=True)
+# Function to get subject suggestions using Hugging Face API
+def get_huggingface_suggestions(title, description):
+    API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-mnli"
+    # Rest of the function code...
 # Use an image from a URL for the banner
 st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
         metadata_df = pd.DataFrame(items)
+        # Missing field detection
+        fields_to_check = ["subject", "creator", "date", "title", "description"]
+        missing_counts = {}
+        for field in fields_to_check:
+            if field in metadata_df.columns:
+                missing = metadata_df[field].apply(is_missing)
+                missing_counts[field] = missing.sum()
         # Define custom completeness check
         def is_incomplete(value):
             return pd.isna(value) or value in ["", "N/A", "null", None]
         if not metadata_df.empty:
+            # --- Unified Completeness and Missing Fields Analysis ---
+            #Define incompleteness at the cell level
+            is_incomplete = lambda value: pd.isna(value) or value in ["", "N/A", "null", None]
+            #Create a mask for missing values
+            missing_mask = metadata_df.map(is_incomplete)
+            #Compute overall record-level completeness
+            incomplete_count = missing_mask.any(axis=1).sum()
             total_fields = metadata_df.size
+            filled_fields = (~missing_mask).sum().sum()
             overall_percent = (filled_fields / total_fields) * 100
+            #Field-specific missing counts (for Missing Metadata Summary)
+            missing_counts = missing_mask.sum().sort_values(ascending=False)
+            missing_df = (
+                pd.DataFrame(missing_counts)
+                .reset_index()
+                .rename(columns={"index": "Field", 0: "Missing Count"})
+            )
             # Field-level completeness
             completeness = (~metadata_df.map(is_incomplete)).mean() * 100
             completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
             completeness_table = completeness_df.set_index("Field")
+            # Sidebar Quick Stats
             quick_stats = pd.DataFrame({
                 "Metric": ["Total Records", "Incomplete Records", "Percent Complete"],
                 "Value": [len(metadata_df), incomplete_count, round(overall_percent, 1)]
                     hide_index=True,  # <<< ADD THIS
                     height=min(300, len(missing_df) * 35 + 38)
             )
             # Calculate Top 10 Subjects
             if 'subject' in metadata_df.columns:
                 top_subjects = (
             st.dataframe(metadata_df.head())
             st.subheader("Field Completeness Breakdown")
+            # Dark box for the Field Completeness Breakdown (match the styling of the other boxes)
             st.markdown("""
                 <div style='
                     background-color: #2e2e2e;
+                    padding: 1.5rem;
                     border-radius: 10px;
                     margin-top: 1.5rem;
                     color: lightgray;
                 '>
             """, unsafe_allow_html=True)
+            #Dataframe inside the dark box
             st.dataframe(
+                completeness_table.style
+                .background_gradient(cmap="Greens")
+                .format("{:.1f}%")
+                .hide(axis="index"),
                 use_container_width=True,
                 height=240
             )
             st.markdown("</div>", unsafe_allow_html=True)
             # Identify incomplete records
             incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
             incomplete_records = metadata_df[incomplete_mask]
+# Suggested Metadata Enhancements Section
+st.subheader("Suggested Metadata Enhancements")
+# Always show the checkbox
+use_ai = st.checkbox("Use AI Suggestions", value=True)
+# Then check if records exist
+incomplete_with_desc = metadata_df[
+    (metadata_df['description'].notnull() | metadata_df['title'].notnull()) &
+    (metadata_df['subject'].isnull())
+]
+if not incomplete_with_desc.empty:
+    if use_ai:
+        suggestions = []
+        records_to_process = min(10, len(incomplete_with_desc))
+        progress = st.progress(0)
+        status = st.empty()
+        for i, (idx, row) in enumerate(incomplete_with_desc.iterrows()):
+            if i >= records_to_process:
+                break
+            title = row['title'] if pd.notna(row['title']) else ""
+            description = row['description'] if pd.notna(row['description']) else ""
+            status.text(f"Analyzing {i+1}/{records_to_process}: {title[:30]}...")
+            suggested_subject = get_huggingface_suggestions(title, description)
+            if suggested_subject:
+                suggestions.append((title, suggested_subject))
+            progress.progress((i + 1) / records_to_process)
+        status.empty()
         if suggestions:
             suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
             styled_suggestions = (
                 suggestions_df.style
                 .background_gradient(cmap="Greens", subset=["Suggested Subject"])
                 .hide(axis="index")
             )
+            #Custom dark card wrapper for the table
+            st.markdown("""
+                <div style='
+                    background-color: #2e2e2e;
+                    padding: 1.5rem;
+                    border-radius: 10px;
+                    margin-top: 1.5rem;
+                    color: lightgray;
+                '>
+            """, unsafe_allow_html=True)
             st.dataframe(
                 styled_suggestions,
                 use_container_width=True,
                 hide_index=True,
+                height=min(300, len(suggestions) * 35 + 38)
             )
+            st.markdown("</div>", unsafe_allow_html=True)
         else:
+            st.info("No metadata enhancement suggestions available.")
+    else:
+        st.info("Enable AI Suggestions to view recommendations.")
 else:
+    st.success("All records already have subjects or no usable text available.")