CCockrum committed on
Commit
e6e6524
·
verified ·
1 Parent(s): 01e6c66

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -37
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # MetaDiscovery Agent - LOC API with Collection Selector and Search Endpoint + Enhanced Features
2
  import requests
3
  import pandas as pd
4
  import numpy as np
@@ -8,13 +8,12 @@ from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sklearn.metrics.pairwise import cosine_similarity
9
 
10
  # Streamlit app header
11
- st.title("LOC MetaDiscovery Agent")
12
  st.markdown("""
13
  This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
14
  an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
15
  """)
16
 
17
- # Updated collection URLs using the correct LOC API format
18
  # Updated collection URLs using the correct LOC API format
19
  collections = {
20
  "American Revolutionary War Maps": "american+revolutionary+war+maps",
@@ -40,12 +39,9 @@ headers = {
40
 
41
  try:
42
  response = requests.get(collection_url, headers=headers)
43
- response.raise_for_status() # Raise exception for 4XX/5XX responses
44
-
45
- data = response.json() # This line is missing
46
 
47
-
48
- # Handle both possible response structures
49
  if "results" in data:
50
  records = data.get("results", [])
51
  elif "items" in data:
@@ -53,9 +49,8 @@ try:
53
  else:
54
  records = []
55
  st.error("Unexpected API response structure. No records found.")
56
-
57
  st.write(f"Retrieved {len(records)} records")
58
-
59
  except requests.exceptions.RequestException as e:
60
  st.error(f"API Connection Error: {e}")
61
  records = []
@@ -63,12 +58,10 @@ except ValueError:
63
  st.error("Failed to parse API response as JSON")
64
  records = []
65
 
66
- # Extract selected metadata fields with proper path traversal
67
  items = []
68
  for record in records:
69
- # Handle different possible data structures
70
  if isinstance(record, dict):
71
- # For direct field access
72
  item = {
73
  "id": record.get("id", ""),
74
  "title": record.get("title", ""),
@@ -77,58 +70,58 @@ for record in records:
77
  "creator": record.get("creator", ""),
78
  "description": record.get("description", "")
79
  }
80
-
81
- # For nested field access (common in LOC API)
82
  if not item["title"] and "item" in record:
83
  item["title"] = record.get("item", {}).get("title", "")
84
  if not item["date"] and "item" in record:
85
  item["date"] = record.get("item", {}).get("date", "")
86
-
87
  items.append(item)
88
 
89
- # Create DataFrame
90
  metadata_df = pd.DataFrame(items)
91
 
 
 
 
 
 
 
 
 
 
 
 
92
  if not metadata_df.empty:
93
  st.subheader("📦 Retrieved Metadata Sample")
94
  st.dataframe(metadata_df.head())
95
-
96
- # Metadata completeness analysis
97
  st.subheader("🧠 Metadata Completeness Analysis")
98
- completeness = metadata_df.notnull().mean() * 100
99
  completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
100
-
101
- # Plot completeness
102
  fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
103
  st.plotly_chart(fig)
104
-
105
- # List records with missing values
 
 
 
106
  st.subheader("⚠️ Records with Incomplete Metadata")
107
- incomplete_records = metadata_df[metadata_df.isnull().any(axis=1)]
108
  if not incomplete_records.empty:
109
  st.dataframe(incomplete_records)
110
  else:
111
  st.success("All metadata fields are complete in this collection!")
112
-
113
- # Show exact items that need updates
114
  st.subheader("📌 Identifiers of Items Needing Metadata Updates")
115
  if not incomplete_records.empty:
116
  st.write(incomplete_records[['id', 'title']])
117
  else:
118
  st.success("All records are complete!")
119
-
120
- # Suggest metadata using text similarity with better error handling
121
  st.subheader("✨ Suggested Metadata Enhancements")
122
-
123
- # Only process if we have descriptions and enough data
124
  filled_descriptions = metadata_df[metadata_df['description'].notnull()]['description'].astype(str)
125
-
126
  if len(filled_descriptions) > 1:
127
  try:
128
  tfidf = TfidfVectorizer(stop_words='english')
129
  tfidf_matrix = tfidf.fit_transform(filled_descriptions)
130
- sim_matrix = cosine_similarity(tfidf_matrix)
131
-
132
  suggestions = []
133
  for idx, row in incomplete_records.iterrows():
134
  if pd.isna(row['subject']) and pd.notna(row['description']):
@@ -136,9 +129,8 @@ if not metadata_df.empty:
136
  sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
137
  top_idx = sims.argmax()
138
  suggested_subject = metadata_df.iloc[top_idx]['subject']
139
- if pd.notna(suggested_subject) and suggested_subject: # Only add valid suggestions
140
  suggestions.append((row['title'], suggested_subject))
141
-
142
  if suggestions:
143
  suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
144
  st.dataframe(suggestions_df)
@@ -149,4 +141,4 @@ if not metadata_df.empty:
149
  else:
150
  st.info("Not enough descriptive data to generate metadata suggestions.")
151
  else:
152
- st.warning("No metadata records found for this collection. Try selecting another one.")
 
1
+ # MetaDiscovery Agent - LOC API with Enhanced Completeness and Quality Analysis
2
  import requests
3
  import pandas as pd
4
  import numpy as np
 
8
  from sklearn.metrics.pairwise import cosine_similarity
9
 
10
  # Streamlit app header
11
+ st.title("MetaDiscovery Agent for Library of Congress Collections")
12
  st.markdown("""
13
  This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
14
  an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
15
  """)
16
 
 
17
  # Updated collection URLs using the correct LOC API format
18
  collections = {
19
  "American Revolutionary War Maps": "american+revolutionary+war+maps",
 
39
 
40
  try:
41
  response = requests.get(collection_url, headers=headers)
42
+ response.raise_for_status()
43
+ data = response.json()
 
44
 
 
 
45
  if "results" in data:
46
  records = data.get("results", [])
47
  elif "items" in data:
 
49
  else:
50
  records = []
51
  st.error("Unexpected API response structure. No records found.")
 
52
  st.write(f"Retrieved {len(records)} records")
53
+
54
  except requests.exceptions.RequestException as e:
55
  st.error(f"API Connection Error: {e}")
56
  records = []
 
58
  st.error("Failed to parse API response as JSON")
59
  records = []
60
 
61
+ # Extract selected metadata fields
62
  items = []
63
  for record in records:
 
64
  if isinstance(record, dict):
 
65
  item = {
66
  "id": record.get("id", ""),
67
  "title": record.get("title", ""),
 
70
  "creator": record.get("creator", ""),
71
  "description": record.get("description", "")
72
  }
 
 
73
  if not item["title"] and "item" in record:
74
  item["title"] = record.get("item", {}).get("title", "")
75
  if not item["date"] and "item" in record:
76
  item["date"] = record.get("item", {}).get("date", "")
 
77
  items.append(item)
78
 
 
79
  metadata_df = pd.DataFrame(items)
80
 
81
+ # Utility functions for deeper metadata quality analysis
82
+ def is_incomplete(value):
83
+ return pd.isna(value) or value in ["", "N/A", "null", None]
84
+
85
+ def is_valid_date(value):
86
+ try:
87
+ pd.to_datetime(value)
88
+ return True
89
+ except:
90
+ return False
91
+
92
  if not metadata_df.empty:
93
  st.subheader("📦 Retrieved Metadata Sample")
94
  st.dataframe(metadata_df.head())
95
+
96
+ # Metadata completeness analysis (enhanced)
97
  st.subheader("🧠 Metadata Completeness Analysis")
98
+ completeness = metadata_df.applymap(lambda x: not is_incomplete(x)).mean() * 100
99
  completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
 
 
100
  fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
101
  st.plotly_chart(fig)
102
+
103
+ # Identify incomplete records
104
+ incomplete_mask = metadata_df.applymap(is_incomplete).any(axis=1)
105
+ incomplete_records = metadata_df[incomplete_mask]
106
+
107
  st.subheader("⚠️ Records with Incomplete Metadata")
 
108
  if not incomplete_records.empty:
109
  st.dataframe(incomplete_records)
110
  else:
111
  st.success("All metadata fields are complete in this collection!")
112
+
 
113
  st.subheader("📌 Identifiers of Items Needing Metadata Updates")
114
  if not incomplete_records.empty:
115
  st.write(incomplete_records[['id', 'title']])
116
  else:
117
  st.success("All records are complete!")
118
+
 
119
  st.subheader("✨ Suggested Metadata Enhancements")
 
 
120
  filled_descriptions = metadata_df[metadata_df['description'].notnull()]['description'].astype(str)
 
121
  if len(filled_descriptions) > 1:
122
  try:
123
  tfidf = TfidfVectorizer(stop_words='english')
124
  tfidf_matrix = tfidf.fit_transform(filled_descriptions)
 
 
125
  suggestions = []
126
  for idx, row in incomplete_records.iterrows():
127
  if pd.isna(row['subject']) and pd.notna(row['description']):
 
129
  sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
130
  top_idx = sims.argmax()
131
  suggested_subject = metadata_df.iloc[top_idx]['subject']
132
+ if pd.notna(suggested_subject) and suggested_subject:
133
  suggestions.append((row['title'], suggested_subject))
 
134
  if suggestions:
135
  suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
136
  st.dataframe(suggestions_df)
 
141
  else:
142
  st.info("Not enough descriptive data to generate metadata suggestions.")
143
  else:
144
+ st.warning("No metadata records found for this collection. Try selecting another one.")