import os
import time

import matplotlib
import pandas as pd
import plotly.express as px
import requests
import streamlit as st
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def is_missing(value):
    """Return True when *value* is NaN/None or an empty/whitespace-only string."""
    return pd.isna(value) or str(value).strip() == ""


# Hugging Face inference API key, supplied via the HF_API environment variable.
api_key = os.getenv('HF_API')


def get_huggingface_suggestions(title, description):
    """Suggest subject headings for a record via zero-shot classification.

    Sends the concatenated title/description to the hosted
    facebook/bart-large-mnli model and returns a comma-separated string of
    every candidate label scoring above 0.3. Returns None when there is no
    usable text, no label clears the threshold, or the API call fails
    (errors are surfaced in the Streamlit UI rather than raised).
    """
    API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-mnli"
    headers = {"Authorization": f"Bearer {api_key}"}

    full_text = f"{title}. {description}".strip()
    if not full_text:
        return None

    candidate_labels = [
        "History", "Politics", "Science", "Technology", "Art", "Literature",
        "Education", "Economics", "Military", "Geography", "Sociology",
        "Philosophy", "Religion", "Law", "Medicine", "Engineering",
        "Mathematics", "Computer Science", "Agriculture", "Environment",
        "Maps", "United States", "Civil War", "Revolution", "Posters",
        "Women's Rights", "World War I"
    ]

    payload = {
        "inputs": full_text,
        "parameters": {
            "candidate_labels": candidate_labels,
            "multi_label": True
        }
    }

    try:
        # Time out rather than hang the Streamlit script if the API stalls.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
        result = response.json()
        if "error" in result:
            st.error(f"API error: {result['error']}")
            return None
        labels = [
            label
            for label, score in zip(result.get("labels", []), result.get("scores", []))
            if score > 0.3
        ]
        return ", ".join(labels) if labels else None
    except Exception as e:
        st.error(f"API Error: {e}")
        return None


# Custom CSS
st.markdown("""
""", unsafe_allow_html=True)

# NOTE(review): the original file re-declared get_huggingface_suggestions at this
# point with a one-line stub body ("# Rest of the function code..."). Because it
# appeared after the real implementation, the stub shadowed it and every
# suggestion call silently returned None. The duplicate has been removed.
# Use an image from a URL for the banner
st.image(
    "https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg",
    use_container_width=True,
)

# Streamlit app header
st.title("MetaDiscovery Agent for Library of Congress Collections")
st.markdown("""
This tool connects to the LOC API, retrieves metadata from a selected collection, and performs an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
""")

# Updated collection URLs using the correct LOC API.
# Keys are display names; values are pre-encoded LOC search query strings.
collections = {
    "American Revolutionary War Maps": "american+revolutionary+war+maps",
    "Civil War Maps": "civil+war+maps",
    "Women's Suffrage": "women+suffrage",
    "World War I Posters": "world+war+posters"
}

# Sidebar for selecting collection
#st.sidebar.markdown("## Settings")

# Create empty metadata_df variable to ensure it exists before checking
metadata_df = pd.DataFrame()

# Add a key to the selectbox to ensure it refreshes properly
with st.sidebar:
    # NOTE(review): the original HTML/CSS payload of this markdown block was
    # lost to formatting damage; restore the styling content if recoverable.
    st.markdown("""
    """, unsafe_allow_html=True)
    selected = st.radio("Select a Collection", list(collections.keys()), key="collection_selector")
    st.markdown("", unsafe_allow_html=True)

search_query = collections[selected]

# Define the collection URL
collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"

# Create an empty placeholder for Quick Stats
stats_placeholder = st.sidebar.empty()

# Add a fetch button to make the action explicit
# (currently always True — fetch happens on every rerun)
fetch_data = True

if fetch_data:
    # Display a loading spinner while fetching data
    with st.spinner(f"Fetching data for {selected}..."):
        # Fetch data from LOC API with spoofed User-Agent header
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/110.0.0.0 Safari/537.36"
        }
        try:
            # Time out rather than hang the app if loc.gov is unresponsive.
            response = requests.get(collection_url, headers=headers, timeout=30)
            response.raise_for_status()
            data = response.json()
            # The LOC API surfaces records under either "results" or "items"
            # depending on the endpoint.
            if "results" in data:
                records = data.get("results", [])
            elif "items" in data:
                records = data.get("items", [])
            else:
                records = []
                st.error("Unexpected API response structure. No records found.")
            st.write(f"Retrieved {len(records)} records")
        except requests.exceptions.RequestException as e:
            st.error(f"API Connection Error: {e}")
            records = []
        except ValueError:
            st.error("Failed to parse API response as JSON")
            records = []

        # Extract selected metadata fields into flat per-record dicts.
        items = []
        for record in records:
            if isinstance(record, dict):
                # "description" may be a list of strings; flatten to one string.
                description = record.get("description", "")
                if isinstance(description, list):
                    description = " ".join([str(d) for d in description])
                item = {
                    "id": record.get("id", ""),
                    "title": record.get("title", ""),
                    "date": record.get("date", ""),
                    "subject": ", ".join(record.get("subject", [])) if isinstance(record.get("subject"), list) else record.get("subject", ""),
                    "creator": record.get("creator", ""),
                    "description": description,
                }
                # Fall back to the nested "item" object for title/date.
                if not item["title"] and "item" in record:
                    item["title"] = record.get("item", {}).get("title", "")
                if not item["date"] and "item" in record:
                    item["date"] = record.get("item", {}).get("date", "")
                items.append(item)

        metadata_df = pd.DataFrame(items)

# Missing field detection (per-field counts using the simple is_missing check)
fields_to_check = ["subject", "creator", "date", "title", "description"]
missing_counts = {}
for field in fields_to_check:
    if field in metadata_df.columns:
        missing = metadata_df[field].apply(is_missing)
        missing_counts[field] = missing.sum()


def is_incomplete(value):
    """Return True for NaN/None or common null-like placeholder values.

    NOTE(review): the original script defined this predicate three times
    (a def, a lambda, and a second identical def); consolidated to one.
    """
    return pd.isna(value) or value in ["", "N/A", "null", None]


def is_valid_date(value):
    """Return True when pandas can parse *value* as a datetime."""
    try:
        pd.to_datetime(value)
        return True
    except Exception:
        # Broad on purpose: pd.to_datetime raises several exception types
        # (ValueError, TypeError, parser errors). Was a bare except.
        return False


if not metadata_df.empty:
    # --- Unified Completeness and Missing Fields Analysis ---
    # Cell-level mask of missing/placeholder values.
    missing_mask = metadata_df.map(is_incomplete)

    # Record-level completeness: a record is incomplete if any field is missing.
    incomplete_count = missing_mask.any(axis=1).sum()
    total_fields = metadata_df.size
    filled_fields = (~missing_mask).sum().sum()
    overall_percent = (filled_fields / total_fields) * 100

    # Field-specific missing counts (for Missing Metadata Summary).
    # NOTE(review): originally this table was built twice with the first
    # result discarded; it is now built once.
    missing_counts = missing_mask.sum().sort_values(ascending=False)
    missing_df = (
        pd.DataFrame(list(missing_counts.items()), columns=["Field", "Missing Count"])
        .sort_values(by="Missing Count", ascending=False)
        .reset_index(drop=True)
    )

    # Field-level completeness (% of non-missing cells per column).
    completeness = (~metadata_df.map(is_incomplete)).mean() * 100
    completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
    completeness_table = completeness_df.set_index("Field")

    # Sidebar Quick Stats
    quick_stats = pd.DataFrame({
        "Metric": ["Total Records", "Incomplete Records", "Percent Complete"],
        "Value": [len(metadata_df), incomplete_count, round(overall_percent, 1)],
    })
    styled_quick_stats = (
        quick_stats.style
        .hide(axis="index")
        .background_gradient(cmap="Oranges", subset=["Value"])
        .format({"Value": "{:.1f}"})
    )

    # Add an expander and put the dataframe inside it
    with st.sidebar.expander("Quick Stats", expanded=True):
        st.dataframe(
            styled_quick_stats,
            use_container_width=True,
            hide_index=True,
        )

    # Sidebar: Metadata Missing Stats
    styled_missing_df = (
        missing_df.style
        .background_gradient(cmap="Blues", subset=["Missing Count"])
        .hide(axis="index")
    )
    with st.sidebar.expander("🧹 Missing Metadata Summary", expanded=True):
        st.dataframe(
            styled_missing_df,
            use_container_width=True,
            hide_index=True,
            height=min(300, len(missing_df) * 35 + 38),
        )

# Calculate Top 10 Subjects (subject cells hold comma-separated lists).
if 'subject' in metadata_df.columns:
    top_subjects = (
        metadata_df['subject']
        .dropna()
        .str.split(',')
        .explode()
        .str.strip()
        .value_counts()
        .head(10)
        .to_frame(name="Count")
    )

    # Most Common Subjects in Sidebar
    with st.sidebar.expander("Top 10 Most Common Subjects", expanded=True):
        st.dataframe(
            top_subjects.style.background_gradient(cmap="Greens").format("{:.0f}"),
            use_container_width=True,
            height=240,
        )

with st.sidebar.expander("Helpful Resources", expanded=False):
    # NOTE(review): the original link list in this markdown block was lost to
    # formatting damage; restore the resource links if recoverable.
    st.markdown("""
    """, unsafe_allow_html=True)

if not metadata_df.empty:
    st.subheader("Retrieved Metadata Sample")
    st.dataframe(metadata_df.head())

    st.subheader("Field Completeness Breakdown")
    # Dark box for the Field Completeness Breakdown (match the other cards).
    # NOTE(review): wrapper HTML lost to formatting damage.
    st.markdown("""
    """, unsafe_allow_html=True)
    st.dataframe(
        completeness_table.style
        .background_gradient(cmap="Greens")
        .format("{:.1f}%")
        .hide(axis="index"),
        use_container_width=True,
        height=240,
    )
    st.markdown("", unsafe_allow_html=True)

    # Identify incomplete records
    incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
    incomplete_records = metadata_df[incomplete_mask]

    # Suggested Metadata Enhancements Section
    st.subheader("Suggested Metadata Enhancements")

    # Always show the checkbox
    use_ai = st.checkbox("Use AI Suggestions", value=True)

    # Candidates: records with some text (title or description) but no subject.
    incomplete_with_desc = metadata_df[
        (metadata_df['description'].notnull() | metadata_df['title'].notnull())
        & (metadata_df['subject'].isnull())
    ]

    if not incomplete_with_desc.empty:
        if use_ai:
            suggestions = []
            # Cap API usage at 10 records per run.
            records_to_process = min(10, len(incomplete_with_desc))
            progress = st.progress(0)
            status = st.empty()
            for i, (idx, row) in enumerate(incomplete_with_desc.iterrows()):
                if i >= records_to_process:
                    break
                title = row['title'] if pd.notna(row['title']) else ""
                description = row['description'] if pd.notna(row['description']) else ""
                status.text(f"Analyzing {i+1}/{records_to_process}: {title[:30]}...")
                suggested_subject = get_huggingface_suggestions(title, description)
                if suggested_subject:
                    suggestions.append((title, suggested_subject))
                progress.progress((i + 1) / records_to_process)
            status.empty()

            if suggestions:
                suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
                styled_suggestions = (
                    suggestions_df.style
                    .background_gradient(cmap="Greens", subset=["Suggested Subject"])
                    .hide(axis="index")
                )
                # Custom dark card wrapper for the table
                # NOTE(review): wrapper HTML lost to formatting damage.
                st.markdown("""
                """, unsafe_allow_html=True)
                st.dataframe(
                    styled_suggestions,
                    use_container_width=True,
                    hide_index=True,
                    height=min(300, len(suggestions) * 35 + 38),
                )
                st.markdown("", unsafe_allow_html=True)
            else:
                st.info("No metadata enhancement suggestions available.")
        else:
            st.info("Enable AI Suggestions to view recommendations.")
    else:
        st.success("All records already have subjects or no usable text available.")