Update app.py
app.py (CHANGED)
@@ -1,4 +1,4 @@
-# MetaDiscovery Agent - LOC API with
+# MetaDiscovery Agent - LOC API with Enhanced Completeness and Quality Analysis
 import requests
 import pandas as pd
 import numpy as np
@@ -8,13 +8,12 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity

 # Streamlit app header
-st.title("
+st.title("MetaDiscovery Agent for Library of Congress Collections")
 st.markdown("""
 This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
 an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
 """)

-# Updated collection URLs using the correct LOC API format
 # Updated collection URLs using the correct LOC API format
 collections = {
     "American Revolutionary War Maps": "american+revolutionary+war+maps",
@@ -40,12 +39,9 @@ headers = {

 try:
     response = requests.get(collection_url, headers=headers)
-    response.raise_for_status()
-
-    data = response.json() # This line is missing
+    response.raise_for_status()
+    data = response.json()

-
-    # Handle both possible response structures
     if "results" in data:
         records = data.get("results", [])
     elif "items" in data:
@@ -53,9 +49,8 @@ try:
     else:
         records = []
         st.error("Unexpected API response structure. No records found.")
-
     st.write(f"Retrieved {len(records)} records")

 except requests.exceptions.RequestException as e:
     st.error(f"API Connection Error: {e}")
     records = []
@@ -63,12 +58,10 @@ except ValueError:
     st.error("Failed to parse API response as JSON")
     records = []

 # Extract selected metadata fields
 items = []
 for record in records:
-    # Handle different possible data structures
     if isinstance(record, dict):
-        # For direct field access
         item = {
             "id": record.get("id", ""),
             "title": record.get("title", ""),
@@ -77,58 +70,58 @@ for record in records:
             "creator": record.get("creator", ""),
             "description": record.get("description", "")
         }
-
-        # For nested field access (common in LOC API)
         if not item["title"] and "item" in record:
             item["title"] = record.get("item", {}).get("title", "")
         if not item["date"] and "item" in record:
             item["date"] = record.get("item", {}).get("date", "")
-
         items.append(item)

-# Create DataFrame
 metadata_df = pd.DataFrame(items)

+# Utility functions for deeper metadata quality analysis
+def is_incomplete(value):
+    return pd.isna(value) or value in ["", "N/A", "null", None]
+
+def is_valid_date(value):
+    try:
+        pd.to_datetime(value)
+        return True
+    except:
+        return False
+
 if not metadata_df.empty:
     st.subheader("📦 Retrieved Metadata Sample")
     st.dataframe(metadata_df.head())
-
-    # Metadata completeness analysis
+
+    # Metadata completeness analysis (enhanced)
     st.subheader("🧠 Metadata Completeness Analysis")
-    completeness = metadata_df.
+    completeness = metadata_df.applymap(lambda x: not is_incomplete(x)).mean() * 100
     completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
-
-    # Plot completeness
     fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
     st.plotly_chart(fig)
-
-    #
+
+    # Identify incomplete records
+    incomplete_mask = metadata_df.applymap(is_incomplete).any(axis=1)
+    incomplete_records = metadata_df[incomplete_mask]
+
     st.subheader("⚠️ Records with Incomplete Metadata")
-    incomplete_records = metadata_df[metadata_df.isnull().any(axis=1)]
     if not incomplete_records.empty:
         st.dataframe(incomplete_records)
     else:
         st.success("All metadata fields are complete in this collection!")
-
-    # Show exact items that need updates
+
     st.subheader("📌 Identifiers of Items Needing Metadata Updates")
     if not incomplete_records.empty:
         st.write(incomplete_records[['id', 'title']])
     else:
         st.success("All records are complete!")
-
-    # Suggest metadata using text similarity with better error handling
+
     st.subheader("✨ Suggested Metadata Enhancements")
-
-    # Only process if we have descriptions and enough data
    filled_descriptions = metadata_df[metadata_df['description'].notnull()]['description'].astype(str)
-
     if len(filled_descriptions) > 1:
         try:
             tfidf = TfidfVectorizer(stop_words='english')
             tfidf_matrix = tfidf.fit_transform(filled_descriptions)
-            sim_matrix = cosine_similarity(tfidf_matrix)
-
             suggestions = []
             for idx, row in incomplete_records.iterrows():
                 if pd.isna(row['subject']) and pd.notna(row['description']):
@@ -136,9 +129,8 @@ if not metadata_df.empty:
                     sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
                     top_idx = sims.argmax()
                     suggested_subject = metadata_df.iloc[top_idx]['subject']
-                    if pd.notna(suggested_subject) and suggested_subject:
+                    if pd.notna(suggested_subject) and suggested_subject:
                         suggestions.append((row['title'], suggested_subject))
-
             if suggestions:
                 suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
                 st.dataframe(suggestions_df)
@@ -149,4 +141,4 @@ if not metadata_df.empty:
     else:
         st.info("Not enough descriptive data to generate metadata suggestions.")
 else:
-    st.warning("No metadata records found for this collection. Try selecting another one.")
+    st.warning("No metadata records found for this collection. Try selecting another one.")
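
For reviewers who want to poke at the retrieval step outside Streamlit, the request/parse pattern from the try block can be exercised on its own. This is a minimal sketch, not part of the commit: the exact collection_url construction lives in lines of app.py that this diff does not show, so the loc.gov search endpoint, the fo=json parameter, and the User-Agent string below are assumptions.

# Hedged sketch of the fetch-and-parse pattern in app.py; URL details are assumed.
import requests

def fetch_records(query="american+revolutionary+war+maps"):
    # Assumption: the app builds a loc.gov search URL and requests JSON via fo=json.
    url = f"https://www.loc.gov/search/?q={query}&fo=json"
    headers = {"User-Agent": "MetaDiscoveryAgent/0.1 (testing)"}
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()        # surface HTTP errors before parsing
    data = response.json()             # raises ValueError if the body is not JSON
    # Mirror the app's handling of both possible top-level keys.
    return data.get("results") or data.get("items") or []

if __name__ == "__main__":
    records = fetch_records()
    print(f"Retrieved {len(records)} records")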
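
The headline change is the completeness calculation: the old file broke off at `completeness = metadata_df.`, and the new version counts placeholder strings ("", "N/A", "null") as missing rather than relying on isnull() alone. Below is a small self-contained example of the same applymap/mean pattern on toy data; the field names and values are illustrative only, and on pandas 2.1+ DataFrame.map is the preferred spelling of applymap.

import pandas as pd

def is_incomplete(value):
    # Same rule the commit adds: NaN or common placeholder values count as missing.
    return pd.isna(value) or value in ["", "N/A", "null", None]

toy = pd.DataFrame({
    "title":   ["Map of Boston", "Map of Concord", ""],
    "subject": ["Maps", None, "N/A"],
    "date":    ["1775", "null", "1776"],
})

# True where a cell holds a usable value; the column mean times 100 is the
# completeness percentage that the app plots per field.
completeness = toy.applymap(lambda x: not is_incomplete(x)).mean() * 100
print(completeness)        # title ~66.7, subject ~33.3, date ~66.7

# Rows with at least one incomplete field, mirroring incomplete_mask in app.py.
print(toy[toy.applymap(is_incomplete).any(axis=1)])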
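
The commit also adds is_valid_date, although none of the hunks shown here call it yet. Two quirks are worth noting when it does get wired in: the bare except hides unrelated errors, and pd.to_datetime(None) returns NaT instead of raising, so empty values would be reported as valid. A tightened variant (an editorial suggestion, not code from the commit):

import pandas as pd

def is_valid_date(value):
    # Check the parsed result as well, because pd.to_datetime(None) yields NaT
    # rather than raising; catch only the parsing-related exceptions.
    try:
        return pd.notna(pd.to_datetime(value))
    except (ValueError, TypeError):
        return False

print([is_valid_date(v) for v in ["1776-07-04", "July 4, 1776", "unknown", None]])
# Expected: [True, True, False, False]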
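
Finally, the suggestion loop vectorizes the filled-in descriptions with TF-IDF and borrows the subject of the most similar record. The diff omits the line that builds desc_vec, so the sketch below reconstructs the idea end to end on toy data. One deliberate deviation, flagged rather than silently applied: the sketch masks out the record's own row before taking argmax, since a record whose description was vectorized would otherwise match itself first and return its own missing subject.

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

df = pd.DataFrame({
    "title": ["Siege map", "Harbor chart", "Battle plan"],
    "subject": ["Military maps", "Nautical charts", None],   # last subject missing
    "description": [
        "Hand drawn map of siege lines and artillery positions",
        "Coastal chart showing soundings and anchorages",
        "Plan of troop positions and artillery before the battle",
    ],
})

tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df["description"])    # rows align with df's RangeIndex

for idx, row in df[df["subject"].isna()].iterrows():
    desc_vec = tfidf.transform([row["description"]])      # assumed shape of the omitted line
    sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
    sims[idx] = -1.0                                       # do not match the record to itself
    top_idx = sims.argmax()
    suggested_subject = df.iloc[top_idx]["subject"]
    if pd.notna(suggested_subject) and suggested_subject:
        print(f"{row['title']!r} -> suggested subject: {suggested_subject}")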