Spaces:

CCockrum
/

LOC-Metadate-Analyzer

Running

App Files Files Community

LOC-Metadate-Analyzer / app.py

CCockrum

Update app.py

8f294a5 verified 4 months ago

raw

history blame

20.4 kB

	import html
	import os
	import time

	import matplotlib
	import pandas as pd
	import plotly.express as px
	import requests
	import streamlit as st
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity

	def is_missing(value):
	    """Return True when a metadata cell holds no usable value.

	    A value counts as missing when it is ``None``/NaN/``pd.NA``, an empty
	    container, or a string that is empty after stripping whitespace.

	    Args:
	        value: A single cell value taken from a metadata record.

	    Returns:
	        bool: ``True`` if the value should be treated as missing.
	    """
	    # pd.isna() answers element-wise for containers, and using that array in
	    # a boolean context raises ValueError, so containers are handled first.
	    if isinstance(value, (list, tuple, set, frozenset, dict)):
	        return len(value) == 0
	    if pd.isna(value):
	        return True
	    return str(value).strip() == ""

	# Hugging Face API token, read from the HF_API environment variable.
	# os.getenv returns None when the variable is not set.
	api_key = os.getenv('HF_API')

	def get_huggingface_suggestions(title, description, threshold=0.3, timeout=30):
	    """Suggest subject labels for a record from its title and description.

	    Posts the combined text to the hosted ``facebook/bart-large-mnli`` model
	    together with a fixed list of candidate labels and keeps every label
	    whose returned score is above ``threshold``.

	    Args:
	        title: Record title; ``None`` or blank text is ignored.
	        description: Record description; ``None`` or blank text is ignored.
	        threshold: Minimum score (exclusive) a label needs to be kept.
	        timeout: Seconds to wait for the API before giving up.

	    Returns:
	        A comma-separated string of labels, or ``None`` when there is no
	        text to classify, the request fails, the response is not usable,
	        or no label passes the threshold. Failures are shown to the user
	        through ``st.error``.
	    """
	    API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-mnli"

	    # Join only the non-blank parts. Formatting both values as
	    # "<title>. <description>" always left at least "." behind, so the
	    # empty-input guard below could never trigger.
	    text_parts = [str(part).strip() for part in (title, description) if part is not None]
	    full_text = ". ".join(part for part in text_parts if part)
	    if not full_text:
	        return None

	    # Send the Authorization header only when a token is configured;
	    # otherwise the header value would literally be "Bearer None".
	    headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}

	    candidate_labels = [
	        "History", "Politics", "Science", "Technology", "Art", "Literature",
	        "Education", "Economics", "Military", "Geography", "Sociology",
	        "Philosophy", "Religion", "Law", "Medicine", "Engineering",
	        "Mathematics", "Computer Science", "Agriculture", "Environment",
	        "Maps", "United States", "Civil War", "Revolution", "Posters", "Women's Rights", "World War I"
	    ]

	    payload = {
	        "inputs": full_text,
	        "parameters": {
	            "candidate_labels": candidate_labels,
	            "multi_label": True
	        }
	    }

	    try:
	        # Without a timeout a stalled connection would block the app run.
	        response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
	        result = response.json()
	    except (requests.exceptions.RequestException, ValueError) as e:
	        # Network failure, timeout, or a body that is not valid JSON.
	        st.error(f"API Error: {e}")
	        return None

	    # The lookups below need a mapping; report anything else explicitly.
	    if not isinstance(result, dict):
	        st.error(f"API Error: unexpected response format ({type(result).__name__})")
	        return None

	    if "error" in result:
	        st.error(f"API error: {result['error']}")
	        return None

	    labels = [
	        label
	        for label, score in zip(result.get("labels", []), result.get("scores", []))
	        if score > threshold
	    ]

	    return ", ".join(labels) if labels else None

	# Custom CSS
	# Injects a <style> block that themes the app: dark page, header and sidebar
	# backgrounds, light-gray text, and styling for alerts, dataframes and the
	# .custom-table / .sidebar-* helper classes. unsafe_allow_html=True is
	# required for the raw <style> tag to be passed through.
	# NOTE(review): the stylesheet contains a stray "}" right after the .main
	# rule and a stray ";" right before the "html, body, ..." rule. They look
	# like leftovers and may make browsers discard the rule that follows each
	# of them (.block-container and the html/body background) - confirm and
	# clean up.
	# NOTE(review): section[data-testid="stSidebar"] > div:first-child is
	# styled twice; the later rule presumably overrides the earlier one.
	st.markdown("""
	<style>

	.main {
	background-color: #1A1A1A !important; /* dark */
	color: #D3D3D3 !important;
	}

	}
	.block-container {
	background-color: #D3D3D3 !important;
	color: #cccccc !important;
	padding-left: 3rem !important;
	padding-right: 3rem !important;
	max-width: 900px; /* widen main feed */
	margin: auto; /* center it */
	}
	/* Headings */
	h1, h2, h3, h4 {
	color: #eeeeee !important; /* brighter light gray for headings */
	font-weight: 700 !important; /* bold */
	margin-bottom: 1rem !important;
	}
	p, span, div {
	color: #cccccc !important;
	}
	/* Subheaders (optional) */
	.stSubheader {
	color: #dddddd !important;
	font-size: 1.4rem !important;
	}
	/* Dataframes (optional tweak) */
	.stDataFrame {
	background-color: #2e2e2e !important;
	border-radius: 10px;
	padding: 1rem;
	}
	section[data-testid="stSidebar"] > div:first-child {
	background-color: #808080 !important;
	padding: 1rem;
	border-radius: 0.5rem;
	color: #808080 !important;
	}
	.stMarkdown, .stTextInput, .stDataFrame {
	color: #1A1A1A!important;
	}
	img.banner {
	width: 100%;
	border-radius: 12px;
	margin-bottom: 1rem;
	}
	.stAlert {
	background-color: #f0f0f5 !important;
	color: #1A1A1A !important;
	padding: 1.25rem !important;
	font-size: 1rem !important;
	border-radius: 0.5rem !important;
	box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05) !important;
	}
	header[data-testid="stHeader"] {
	background-color: #1A1A1A !important;
	}

	section[data-testid="stSidebar"] > div:first-child {
	background-color: #1A1A1A !important;
	color: #FFFFFF !important;
	padding: 2rem 1.5rem 1.5rem 1.5rem !important;
	border-radius: 12px;
	box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
	font-size: 0.95rem;
	line-height: 1.5;
	}
	;
	html, body, [data-testid="stApp"] {
	background-color: #1A1A1A !important;
	}
	.custom-table {
	background-color: #D3D3D3;
	color: #1A1A1A;
	font-family: monospace;
	padding: 1rem;
	border-radius: 8px;
	overflow-x: auto;
	white-space: pre;
	border: 1px solid #ccc;

	}
	.sidebar-stats {
	color: lightgray !important;
	font-size: 1.1rem !important;
	margin-top: 1.5rem;
	font-weight: 600;
	}
	.sidebar-contrast-block {
	background-color: #2b2b2b !important;
	padding: 1.25rem;
	border-radius: 10px;
	margin-top: 1.5rem;
	}
	section.main > div { /* widen main container */
	max-width: 95%;
	padding-left: 3rem;
	padding-right: 3rem;

	}

	</style>
	""", unsafe_allow_html=True)

	# NOTE: get_huggingface_suggestions() is defined once, near the top of this
	# file. A second, body-less definition of the same name used to sit here;
	# because a later ``def`` rebinds the name, that stub replaced the real
	# implementation and made every call return None. It was removed so the
	# full implementation is the one the app actually calls.

	# Banner image, loaded from a remote URL and stretched to the container width
	banner_url = "https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg"
	st.image(banner_url, use_container_width=True)

	# Page title followed by a short description of what the tool does
	st.title("MetaDiscovery Agent for Library of Congress Collections")
	intro_text = """
	This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
	an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
	"""
	st.markdown(intro_text)

	# Display name -> query string substituted into the LOC search URL
	collections = {
	    "American Revolutionary War Maps": "american+revolutionary+war+maps",
	    "Civil War Maps": "civil+war+maps",
	    "Women's Suffrage": "women+suffrage",
	    "World War I Posters": "world+war+posters",
	}

	# Start with an empty frame so later sections can test ``metadata_df.empty``
	# even when the fetch below yields nothing.
	metadata_df = pd.DataFrame()

	# Sidebar: collection picker. The widget key keeps its state stable across reruns.
	with st.sidebar:
	    # Opening tag of a dark "card". Every declaration must end with ";":
	    # without it after background-color, that declaration and the padding
	    # on the next line merge into one invalid value and both are ignored.
	    # NOTE(review): Streamlit appears to render each st.markdown call as its
	    # own element, so this <div> may not actually wrap the radio widget -
	    # confirm in the browser.
	    st.markdown("""
	<div style='
	background-color: #2b2b2b;
	padding: 1.5rem;
	border-radius: 12px;
	margin-bottom: 1.5rem;
	box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
	'>
	""", unsafe_allow_html=True)

	    selected = st.radio("Select a Collection", list(collections.keys()), key="collection_selector")

	    st.markdown("</div>", unsafe_allow_html=True)

	# Query string for the chosen collection
	search_query = collections[selected]

	# LOC search endpoint; fo=json asks for a JSON response
	collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"

	# Empty sidebar slot reserved for Quick Stats
	stats_placeholder = st.sidebar.empty()


	# Data is fetched on every run; the flag is kept so an explicit fetch
	# button can be wired in later without restructuring the code below.
	fetch_data = True

	if fetch_data:
	    # Show a spinner while the request and the extraction are running
	    with st.spinner(f"Fetching data for {selected}..."):
	        # Browser-like User-Agent sent with the LOC request
	        headers = {
	            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/110.0.0.0 Safari/537.36"
	        }

	        records = []
	        try:
	            # The timeout keeps a stalled connection from hanging the app;
	            # a timeout is reported through the RequestException handler.
	            response = requests.get(collection_url, headers=headers, timeout=30)
	            response.raise_for_status()
	            data = response.json()

	            # Records are read from "results" or, failing that, "items".
	            # ``or []`` covers a key that is present but null.
	            if "results" in data:
	                records = data.get("results", []) or []
	            elif "items" in data:
	                records = data.get("items", []) or []
	            else:
	                st.error("Unexpected API response structure. No records found.")
	            st.write(f"Retrieved {len(records)} records")

	        except requests.exceptions.RequestException as e:
	            st.error(f"API Connection Error: {e}")
	            records = []
	        except ValueError:
	            st.error("Failed to parse API response as JSON")
	            records = []

	        # Flatten each record into the fields analysed below
	        items = []
	        for record in records:
	            if not isinstance(record, dict):
	                continue

	            description = record.get("description", "")
	            if isinstance(description, list):
	                description = " ".join(str(d) for d in description)

	            subject = record.get("subject", "")
	            if isinstance(subject, list):
	                # str() guards against non-string entries, which would
	                # otherwise make join() raise TypeError
	                subject = ", ".join(str(s) for s in subject)

	            item = {
	                "id": record.get("id", ""),
	                "title": record.get("title", ""),
	                "date": record.get("date", ""),
	                "subject": subject,
	                "creator": record.get("creator", ""),
	                "description": description
	            }

	            # Fall back to the nested "item" object for title/date, but
	            # only when it really is a mapping
	            nested = record.get("item")
	            if isinstance(nested, dict):
	                if not item["title"]:
	                    item["title"] = nested.get("title", "")
	                if not item["date"]:
	                    item["date"] = nested.get("date", "")

	            items.append(item)

	        metadata_df = pd.DataFrame(items)

	# Per-field tally of missing values, limited to columns that are present
	fields_to_check = ["subject", "creator", "date", "title", "description"]
	missing_counts = {
	    field: metadata_df[field].apply(is_missing).sum()
	    for field in fields_to_check
	    if field in metadata_df.columns
	}


	def is_incomplete(value):
	    """Return a truthy result for NaN/None or one of "", "N/A", "null"."""
	    missing = pd.isna(value)
	    if missing:
	        return missing
	    return value in ["", "N/A", "null", None]


	if not metadata_df.empty:
	    # --- Unified Completeness and Missing Fields Analysis ---

	    def is_incomplete(value):
	        """Cell-level check: NaN/None or one of "", "N/A", "null"."""
	        return pd.isna(value) or value in ["", "N/A", "null", None]

	    # Boolean frame: True where a cell is incomplete. It is computed once
	    # and reused for every statistic below.
	    missing_mask = metadata_df.map(is_incomplete)

	    # Overall figures: records with any gap, and share of filled cells
	    incomplete_count = missing_mask.any(axis=1).sum()
	    total_fields = metadata_df.size
	    filled_fields = (~missing_mask).sum().sum()
	    overall_percent = (filled_fields / total_fields) * 100

	    # Missing count per field, largest first (Missing Metadata Summary)
	    missing_counts = missing_mask.sum().sort_values(ascending=False)
	    missing_df = missing_counts.rename_axis("Field").reset_index(name="Missing Count")

	    # Share of complete cells per field, in percent
	    completeness = (~missing_mask).mean() * 100
	    completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
	    completeness_table = completeness_df.set_index("Field")

	    # Sidebar Quick Stats
	    quick_stats = pd.DataFrame({
	        "Metric": ["Total Records", "Incomplete Records", "Percent Complete"],
	        "Value": [len(metadata_df), incomplete_count, round(overall_percent, 1)]
	    })

	    styled_quick_stats = (
	        quick_stats.style
	        .hide(axis="index")
	        .background_gradient(cmap="Oranges", subset=["Value"])
	        .format({"Value": "{:.1f}"})
	    )

	    with st.sidebar.expander("Quick Stats", expanded=True):
	        st.dataframe(
	            styled_quick_stats,
	            use_container_width=True,
	            hide_index=True
	        )

	    # Sidebar: Metadata Missing Stats
	    styled_missing_df = (
	        missing_df.style
	        .background_gradient(cmap="Blues", subset=["Missing Count"])
	        .hide(axis="index")
	    )

	    with st.sidebar.expander("🧹 Missing Metadata Summary", expanded=True):
	        st.dataframe(
	            styled_missing_df,
	            use_container_width=True,
	            hide_index=True,
	            # Grow with the number of rows, capped at 300px
	            height=min(300, len(missing_df) * 35 + 38)
	        )

	    # Top 10 subjects. The expander lives inside the guard so that
	    # top_subjects is never referenced when the column is absent.
	    if 'subject' in metadata_df.columns:
	        subject_terms = (
	            metadata_df['subject']
	            .dropna()
	            .astype(str)
	            .str.split(',')
	            .explode()
	            .str.strip()
	        )
	        # Records without a subject are stored as "", which would otherwise
	        # be counted as a subject term of its own.
	        top_subjects = (
	            subject_terms[subject_terms != ""]
	            .value_counts()
	            .head(10)
	            .to_frame(name="Count")
	        )

	        with st.sidebar.expander("Top 10 Most Common Subjects", expanded=True):
	            st.dataframe(
	                top_subjects.style.background_gradient(cmap="Greens").format("{:.0f}"),
	                use_container_width=True,
	                height=240
	            )

	# Collapsed sidebar section with reference links; the embedded <style>
	# block colors the links and underlines them on hover.
	resources_html = """
	<style>
	.sidebar-links a {
	color: lightgray !important;
	text-decoration: none !important;
	}
	.sidebar-links a:hover {
	text-decoration: underline !important;
	}
	</style>
	<div class="sidebar-links">
	<ul style='padding-left: 1em'>
	<li><a href="https://www.loc.gov/apis/" target="_blank">LOC API Info</a></li>
	<li><a href="https://www.loc.gov/" target="_blank">Library of Congress Homepage</a></li>
	<li><a href="https://www.loc.gov/collections/" target="_blank">LOC Digital Collections</a></li>
	<li><a href="https://www.loc.gov/marc/" target="_blank">MARC Metadata Standards</a></li>
	<li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank">LOC Digital Strategy</a></li>
	</ul>
	</div>
	"""
	resources_box = st.sidebar.expander("Helpful Resources", expanded=False)
	with resources_box:
	    st.markdown(resources_html, unsafe_allow_html=True)

	# Helpers for deeper metadata quality checks
	def is_incomplete(value):
	    """Tell whether a cell is a gap in the metadata.

	    Truthy for NaN/None and for the placeholder values "", "N/A" and
	    "null"; False for anything else (whitespace-only text included).
	    """
	    missing = pd.isna(value)
	    if missing:
	        return missing
	    return value in ["", "N/A", "null", None]

	def is_valid_date(value):
	    """Return True when ``value`` can be parsed into a real date.

	    Args:
	        value: A date-like cell value (typically a string).

	    Returns:
	        bool: ``True`` if ``pd.to_datetime`` parses the value into a
	        non-null timestamp; ``False`` if parsing fails or yields a null
	        result (as it does for ``None`` and blank input).
	    """
	    try:
	        parsed = pd.to_datetime(value)
	    except (ValueError, TypeError, OverflowError):
	        # Unparseable text, unsupported types, or values pandas cannot
	        # represent as a timestamp. NOTE(review): very old dates may fall
	        # outside the range pandas supports and land here - confirm this
	        # is acceptable for the collections analysed.
	        return False

	    # A successful call can still return a null marker instead of a date;
	    # that is not a valid date.
	    missing = pd.isna(parsed)
	    if isinstance(missing, bool):
	        return not missing
	    # List-like input: valid only if every element parsed
	    return not bool(missing.any())

	if not metadata_df.empty:
	    st.subheader("Retrieved Metadata Sample")
	    st.dataframe(metadata_df.head())

	    st.subheader("Field Completeness Breakdown")

	    # Dark box around the completeness table, matching the other panels.
	    # NOTE(review): the <div> is opened and closed in separate st.markdown
	    # calls, so it may not actually enclose the dataframe - confirm.
	    st.markdown("""
	<div style='
	background-color: #2e2e2e;
	padding: 1.5rem;
	border-radius: 10px;
	margin-top: 1.5rem;
	color: lightgray;
	'>
	""", unsafe_allow_html=True)

	    st.dataframe(
	        completeness_table.style
	        .background_gradient(cmap="Greens")
	        .format("{:.0f}%")
	        .hide(axis="index"),
	        use_container_width=True,
	        height=240
	    )

	    st.markdown("</div>", unsafe_allow_html=True)

	    # Records with at least one incomplete field
	    incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
	    incomplete_records = metadata_df[incomplete_mask]

	    # --- Suggested Metadata Enhancements Section ---
	    st.subheader("Suggested Metadata Enhancements")

	    # The checkbox label is hidden; the visible label is the markdown line below
	    use_ai = st.checkbox("Use AI Suggestions", value=True, label_visibility="hidden")
	    st.markdown("🤖 Use AI Suggestions (Hugging Face)")

	    # Candidates: no subject, but a title or description to work from.
	    # Missing fields are stored as "" rather than NaN, so isnull()/notnull()
	    # never matched any row; is_missing() covers both representations.
	    lacks_subject = metadata_df['subject'].apply(is_missing)
	    has_text = ~(metadata_df['description'].apply(is_missing) & metadata_df['title'].apply(is_missing))
	    incomplete_with_desc = metadata_df[has_text & lacks_subject]

	    if incomplete_with_desc.empty:
	        st.markdown("""
	<div style="background-color: #1e1e1e; padding: 1.5rem; border-radius: 10px; margin-top: 1rem; color: #e0e0e0;">
	All records already have subjects or no usable text available.
	</div>
	""", unsafe_allow_html=True)
	    elif not use_ai:
	        st.markdown("""
	<div style="background-color: #1e1e1e; padding: 1.5rem; border-radius: 10px; margin-top: 1rem; color: #e0e0e0;">
	Enable AI Suggestions to view recommendations.
	</div>
	""", unsafe_allow_html=True)
	    else:
	        suggestions = []
	        # At most 10 records are sent to the API per run
	        candidates = incomplete_with_desc.head(10)
	        records_to_process = len(candidates)
	        progress = st.progress(0)
	        status = st.empty()

	        for i, (_, row) in enumerate(candidates.iterrows(), start=1):
	            title = "" if is_missing(row['title']) else str(row['title'])
	            description = "" if is_missing(row['description']) else str(row['description'])
	            status.text(f"Analyzing {i}/{records_to_process}: {title[:30]}...")
	            suggested_subject = get_huggingface_suggestions(title, description)
	            if suggested_subject:
	                suggestions.append((title, suggested_subject))
	            progress.progress(i / records_to_process)

	        # Remove the transient progress widgets once the loop is done
	        status.empty()
	        progress.empty()

	        if suggestions:
	            # Custom dark-styled HTML table, assembled from parts and joined once
	            table_parts = ["""
	<div style="background-color: #1e1e1e; padding: 1.5rem; border-radius: 10px; margin-top: 1rem;">
	<table style="width: 100%; border-collapse: collapse; color: #e0e0e0;">
	<thead>
	<tr style="border-bottom: 1px solid #444;">
	<th style="padding: 12px; text-align: left; color: #e0e0e0;">Title</th>
	<th style="padding: 12px; text-align: left; color: #e0e0e0;">Suggested Subject</th>
	</tr>
	</thead>
	<tbody>
	"""]

	            # Fixed highlight color for the suggestion cell
	            green_shade = "rgba(0, 100, 0, 0.3)"

	            for title, subject in suggestions:
	                title_display = title[:50] + "..." if len(title) > 50 else title
	                # Titles come from the LOC API and labels from the model
	                # API; escape both before placing them into raw HTML.
	                safe_title = html.escape(title_display)
	                safe_subject = html.escape(subject)

	                table_parts.append(f"""
	<tr style="border-bottom: 1px solid #444;">
	<td style="padding: 12px; text-align: left;">{safe_title}</td>
	<td style="padding: 12px; text-align: left; background-color: {green_shade};">{safe_subject}</td>
	</tr>
	""")

	            table_parts.append("""
	</tbody>
	</table>
	</div>
	""")

	            st.markdown("".join(table_parts), unsafe_allow_html=True)
	        else:
	            st.markdown("""
	<div style="background-color: #1e1e1e; padding: 1.5rem; border-radius: 10px; margin-top: 1rem; color: #e0e0e0;">
	No metadata enhancement suggestions available.
	</div>
	""", unsafe_allow_html=True)