CCockrum committed on
Commit
90247f9
·
verified ·
1 Parent(s): 4e04d7b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +182 -319
app.py CHANGED
@@ -1,350 +1,213 @@
1
  # MetaDiscovery Agent - LOC API with Enhanced Completeness and Quality Analysis
2
  import requests
3
  import pandas as pd
4
- import numpy as np
5
  import streamlit as st
6
- import matplotlib
7
  import plotly.express as px
8
  from sklearn.feature_extraction.text import TfidfVectorizer
9
  from sklearn.metrics.pairwise import cosine_similarity
 
10
 
11
- # Custom CSS for white background, styled sidebar, banner, and dark grey font
12
  st.markdown("""
13
- <style>
14
-
15
- .main {
16
- background-color: #D3D3D3 !important;
17
- color: #1A1A1A!important;
18
-
19
- }
20
- .block-container {
21
- background-color: gray !important;
22
- color: #808080!important;
23
- }
24
- section[data-testid="stSidebar"] > div:first-child {
25
- background-color: #808080 !important;
26
- padding: 1rem;
27
- border-radius: 0.5rem;
28
- color: #808080 !important;
29
- }
30
- .stMarkdown, .stTextInput, .stDataFrame {
31
- color: #1A1A1A!important;
32
- }
33
- img.banner {
34
- width: 100%;
35
- border-radius: 12px;
36
- margin-bottom: 1rem;
37
- }
38
- .stAlert {
39
- background-color: #f0f0f5 !important;
40
- color: #333333 !important;
41
- padding: 1.25rem !important;
42
- font-size: 1rem !important;
43
- border-radius: 0.5rem !important;
44
- box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05) !important;
45
- }
46
- header[data-testid="stHeader"] {
47
- background-color: gray !important;
48
- }
49
- section[data-testid="stSidebar"] > div:first-child {
50
- background-color: #1A1A1A !important;
51
- color: #FFFFFF !important;
52
- padding: 2rem 1.5rem 1.5rem 1.5rem !important;
53
- border-radius: 12px;
54
- box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
55
- font-size: 0.95rem;
56
- line-height: 1.5;
57
- }
58
- .block-container {
59
- background-color: gray !important;
60
- color: #1A1A1A !important;
61
- padding-left: 2rem !important;
62
- padding-right: 2rem !important;
63
- box-shadow: none !important;
64
- }
65
- html, body, [data-testid="stApp"] {
66
- background-color: #1A1A1A !important;
67
- }
68
- .custom-table {
69
- background-color: #D3D3D3;
70
- color: #1A1A1A;
71
- font-family: monospace;
72
- padding: 1rem;
73
- border-radius: 8px;
74
- overflow-x: auto;
75
- white-space: pre;
76
- border: 1px solid #ccc;
77
-
78
- }
79
- .sidebar-stats {
80
- color: lightgray !important;
81
- font-size: 1.1rem !important;
82
- margin-top: 1.5rem;
83
- font-weight: 600;
84
- }
85
- .sidebar-contrast-block {
86
- background-color: #2b2b2b !important; /* Slightly lighter than #1A1A1A */
87
- padding: 1.25rem;
88
- border-radius: 10px;
89
- margin-top: 1.5rem;
90
- }
91
-
92
  </style>
93
  """, unsafe_allow_html=True)
94
 
95
- # OPTION 1: Use an image from a URL for the banner
96
  st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
97
-
98
- # Streamlit app header
99
  st.title("MetaDiscovery Agent for Library of Congress Collections")
100
- st.markdown("""
101
- This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
102
- an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
103
- """)
104
 
105
- # Updated collection URLs using the correct LOC API format
106
  collections = {
107
  "American Revolutionary War Maps": "american+revolutionary+war+maps",
108
  "Civil War Maps": "civil+war+maps",
109
  "Women's Suffrage": "women+suffrage",
110
  "World War I Posters": "world+war+posters"
111
  }
112
-
113
- # Sidebar for selecting collection
114
- #st.sidebar.markdown("## Settings")
115
-
116
- # Create empty metadata_df variable to ensure it exists before checking
117
- metadata_df = pd.DataFrame()
118
-
119
- # Add a key to the selectbox to ensure it refreshes properly
120
  selected = st.sidebar.selectbox("Select a collection", list(collections.keys()), key="collection_selector")
121
  search_query = collections[selected]
122
-
123
- # Define the collection URL
124
  collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"
125
 
126
- # Create an empty placeholder for Quick Stats
127
- stats_placeholder = st.sidebar.empty()
 
128
 
129
- # Create placeholder for Field Completeness Breakdown
130
- completeness_placeholder = st.sidebar.empty()
131
-
132
- # Helpful Resources (styled and moved below dropdown)
133
- st.sidebar.markdown("### Helpful Resources", unsafe_allow_html=True)
134
- # Helpful Resources styled section
135
- # 3. Helpful Resources Section (Fixed, under Completeness)
136
  st.sidebar.markdown("""
137
- <style>
138
- .sidebar-section h3 {
139
- color: lightgray !important;
140
- font-size: 1.1rem !important;
141
- margin-top: 1.5rem;
142
- }
143
- .sidebar-links a {
144
- color: lightgray !important;
145
- text-decoration: none !important;
146
- }
147
- .sidebar-links a:hover {
148
- text-decoration: underline !important;
149
- }
150
- </style>
151
- <div class="sidebar-section">
152
- <h3>🔗 Helpful Resources</h3>
153
- <div class="sidebar-links">
154
- <ul style='padding-left: 1em'>
155
- <li><a href="https://www.loc.gov/apis/" target="_blank">LOC API Info</a></li>
156
- <li><a href="https://www.loc.gov/" target="_blank">Library of Congress Homepage</a></li>
157
- <li><a href="https://www.loc.gov/collections/" target="_blank">LOC Digital Collections</a></li>
158
- <li><a href="https://www.loc.gov/marc/" target="_blank">MARC Metadata Standards</a></li>
159
- <li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank">LOC Digital Strategy</a></li>
160
- </ul>
161
- </div>
162
- </div>
163
  """, unsafe_allow_html=True)
164
 
165
-
166
- # Add a fetch button to make the action explicit
167
- fetch_data = True
168
-
169
- if fetch_data:
170
- # Display a loading spinner while fetching data
171
- with st.spinner(f"Fetching data for {selected}..."):
172
- # Fetch data from LOC API with spoofed User-Agent header
173
- headers = {
174
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/110.0.0.0 Safari/537.36"
175
- }
176
-
177
- try:
178
- response = requests.get(collection_url, headers=headers)
179
- response.raise_for_status()
180
- data = response.json()
181
-
182
- if "results" in data:
183
- records = data.get("results", [])
184
- elif "items" in data:
185
- records = data.get("items", [])
186
- else:
187
- records = []
188
- st.error("Unexpected API response structure. No records found.")
189
- st.write(f"Retrieved {len(records)} records")
190
-
191
- except requests.exceptions.RequestException as e:
192
- st.error(f"API Connection Error: {e}")
193
- records = []
194
- except ValueError:
195
- st.error("Failed to parse API response as JSON")
196
- records = []
197
-
198
- # Extract selected metadata fields
199
- items = []
200
- for record in records:
201
- if isinstance(record, dict):
202
- description = record.get("description", "")
203
- if isinstance(description, list):
204
- description = " ".join([str(d) for d in description])
205
- item = {
206
- "id": record.get("id", ""),
207
- "title": record.get("title", ""),
208
- "date": record.get("date", ""),
209
- "subject": ", ".join(record.get("subject", [])) if isinstance(record.get("subject"), list) else record.get("subject", ""),
210
- "creator": record.get("creator", ""),
211
- "description": description
212
- }
213
- if not item["title"] and "item" in record:
214
- item["title"] = record.get("item", {}).get("title", "")
215
- if not item["date"] and "item" in record:
216
- item["date"] = record.get("item", {}).get("date", "")
217
- items.append(item)
218
-
219
- metadata_df = pd.DataFrame(items)
220
-
221
- # Define custom completeness check
222
- def is_incomplete(value):
223
- return pd.isna(value) or value in ["", "N/A", "null", None]
224
-
225
- if not metadata_df.empty:
226
- # Incomplete record detection
227
- incomplete_mask = metadata_df.apply(lambda row: row.map(is_incomplete), axis=1).any(axis=1)
228
- incomplete_count = incomplete_mask.sum()
229
-
230
- # Overall completeness
231
- total_fields = metadata_df.size
232
- filled_fields = metadata_df.apply(lambda row: row.map(lambda x: not is_incomplete(x)), axis=1).sum().sum()
233
- overall_percent = (filled_fields / total_fields) * 100
234
-
235
- # Field-by-field completeness
236
- completeness = metadata_df.map(lambda x: not is_incomplete(x)).mean() * 100
237
- completeness_table = completeness.round(1).to_frame(name="Completeness (%)")
238
-
239
- # Render stats summary in sidebar
240
- stats_html = f"""
241
- <div class="sidebar-stats">
242
- <h3 style="color: lightgray;">Quick Stats</h3>
243
- <p style="color:lightgray;">Total Records: <b>{len(metadata_df)}</b></p>
244
- <p style="color:lightgray;">Incomplete Records: <b>{incomplete_count}</b></p>
245
- <p style="color:lightgray;">Overall Metadata Completeness: <b>{overall_percent:.1f}%</b></p>
246
- </div>
247
- """
248
- stats_placeholder.markdown(stats_html, unsafe_allow_html=True)
249
-
250
-
251
- # Utility functions for deeper metadata quality analysis
252
- def is_incomplete(value):
253
- return pd.isna(value) or value in ["", "N/A", "null", None]
254
-
255
- def is_valid_date(value):
 
 
 
 
 
 
 
 
 
 
 
256
  try:
257
- pd.to_datetime(value)
258
- return True
259
- except:
260
- return False
261
-
262
- if not metadata_df.empty:
263
- st.subheader("Retrieved Metadata Sample")
264
- st.dataframe(metadata_df.head())
265
-
266
- # Metadata completeness analysis (enhanced)
267
- st.subheader("Metadata Completeness Analysis")
268
- # Create the completeness table
269
- completeness = metadata_df.map(lambda x: not is_incomplete(x)).mean() * 100
270
- completeness_df = pd.DataFrame({
271
- "Field": completeness.index,
272
- "Completeness (%)": completeness.values
273
- })
274
- completeness_table = completeness_df.set_index("Field")
275
-
276
- # FILL THE PLACEHOLDER created earlier
277
-
278
- # FILL THE PLACEHOLDER created earlier
279
- with completeness_placeholder:
280
- st.markdown("""
281
- <div style='
282
- background-color: #2e2e2e;
283
- padding: 1.2rem;
284
- border-radius: 10px;
285
- margin-top: 1.5rem;
286
- color: lightgray;
287
- '>
288
- <h4 style='margin-bottom: 1rem;'>📊 Field Completeness Breakdown</h4>
289
- """, unsafe_allow_html=True)
290
-
291
- st.dataframe(
292
- completeness_table.style.background_gradient(cmap="Greens").format("{:.1f}%"),
293
- use_container_width=True,
294
- height=240
295
- )
296
-
297
- st.markdown("</div>", unsafe_allow_html=True)
298
-
299
- completeness_table = completeness.round(1).to_frame(name="Completeness (%)")
300
-
301
- # Then continue plotting in main panel
302
- fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
303
- st.plotly_chart(fig)
304
-
305
-
306
-
307
- # Identify incomplete records
308
- incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
309
- incomplete_records = metadata_df[incomplete_mask]
310
-
311
- st.subheader("✨ Suggested Metadata Enhancements")
312
-
313
- incomplete_with_desc = incomplete_records[incomplete_records['description'].notnull()]
314
- reference_df = metadata_df[metadata_df['subject'].notnull() & metadata_df['description'].notnull()]
315
- tfidf = TfidfVectorizer(stop_words='english')
316
-
317
- if len(incomplete_with_desc) > 1 and len(reference_df) > 1:
318
- try:
319
- suggestions = []
320
- tfidf_matrix = tfidf.fit_transform(reference_df['description'])
321
-
322
- for idx, row in incomplete_with_desc.iterrows():
323
- if pd.isna(row['subject']) and pd.notna(row['description']):
324
- desc_vec = tfidf.transform([str(row['description'])])
325
- sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
326
- top_idx = sims.argmax()
327
- suggested_subject = reference_df.iloc[top_idx]['subject']
328
- if pd.notna(suggested_subject) and suggested_subject:
329
- suggestions.append((row['title'], suggested_subject))
330
-
331
- if suggestions:
332
- suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
333
- st.markdown("<div class='custom-table'>" + suggestions_df.to_markdown(index=False) + "</div>", unsafe_allow_html=True)
334
- else:
335
- st.markdown("""
336
- <div class='custom-table'>
337
- <b>No metadata enhancement suggestions available.</b>
338
- </div>
339
- """, unsafe_allow_html=True)
340
-
341
- except Exception as e:
342
- st.error(f"Error generating metadata suggestions: {e}")
343
- else:
344
- st.markdown("""
345
- <div class='custom-table'>
346
- <b>Not enough descriptive data to generate metadata suggestions.</b>
347
- </div>
348
- """, unsafe_allow_html=True)
349
  else:
350
- st.warning("⚠️ No metadata records found for this collection. Try selecting another one.")
 
 
 
 
1
  # MetaDiscovery Agent - LOC API with Enhanced Completeness and Quality Analysis
2
  import requests
3
  import pandas as pd
 
4
  import streamlit as st
 
5
  import plotly.express as px
6
  from sklearn.feature_extraction.text import TfidfVectorizer
7
  from sklearn.metrics.pairwise import cosine_similarity
8
+ import matplotlib
9
 
10
+ # --- CUSTOM CSS ---
11
  st.markdown("""
12
+ <style>
13
+ html, body, [data-testid="stApp"] {
14
+ background-color: #1A1A1A !important;
15
+ }
16
+ .block-container {
17
+ background-color: gray !important;
18
+ color: #1A1A1A !important;
19
+ padding-left: 2rem !important;
20
+ padding-right: 2rem !important;
21
+ }
22
+ section[data-testid="stSidebar"] > div:first-child {
23
+ background-color: #1A1A1A !important;
24
+ color: #FFFFFF !important;
25
+ padding: 2rem 1.5rem;
26
+ border-radius: 12px;
27
+ }
28
+ .sidebar-contrast-block {
29
+ background-color: #2b2b2b !important;
30
+ padding: 1rem;
31
+ border-radius: 10px;
32
+ margin-top: 1.5rem;
33
+ color: lightgray;
34
+ }
35
+ .custom-table {
36
+ background-color: #D3D3D3;
37
+ color: #1A1A1A;
38
+ font-family: monospace;
39
+ padding: 1rem;
40
+ border-radius: 8px;
41
+ overflow-x: auto;
42
+ white-space: pre;
43
+ border: 1px solid #ccc;
44
+ }
45
+ .sidebar-links a {
46
+ color: lightgray !important;
47
+ text-decoration: none !important;
48
+ }
49
+ .sidebar-links a:hover {
50
+ text-decoration: underline !important;
51
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  </style>
53
  """, unsafe_allow_html=True)
54
 
55
+ # --- HEADER ---
56
  st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
 
 
57
  st.title("MetaDiscovery Agent for Library of Congress Collections")
58
+ st.markdown("This tool connects to the LOC API, retrieves metadata from a selected collection, and performs an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.")
 
 
 
59
 
60
+ # --- COLLECTION SETUP ---
61
  collections = {
62
  "American Revolutionary War Maps": "american+revolutionary+war+maps",
63
  "Civil War Maps": "civil+war+maps",
64
  "Women's Suffrage": "women+suffrage",
65
  "World War I Posters": "world+war+posters"
66
  }
 
 
 
 
 
 
 
 
67
  selected = st.sidebar.selectbox("Select a collection", list(collections.keys()), key="collection_selector")
68
  search_query = collections[selected]
 
 
69
  collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"
70
 
71
+ # --- PLACEHOLDERS FOR SIDEBAR BLOCKS ---
72
+ stats_placeholder = st.sidebar.container()
73
+ completeness_placeholder = st.sidebar.container()
74
 
75
+ # --- HELPFUL RESOURCES ---
 
 
 
 
 
 
76
  st.sidebar.markdown("""
77
+ <div class="sidebar-contrast-block">
78
+ <h4>🔗 Helpful Resources</h4>
79
+ <ul class="sidebar-links">
80
+ <li><a href="https://www.loc.gov/apis/" target="_blank">LOC API Info</a></li>
81
+ <li><a href="https://www.loc.gov/" target="_blank">Library of Congress Homepage</a></li>
82
+ <li><a href="https://www.loc.gov/collections/" target="_blank">LOC Digital Collections</a></li>
83
+ <li><a href="https://www.loc.gov/marc/" target="_blank">MARC Metadata Standards</a></li>
84
+ <li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank">LOC Digital Strategy</a></li>
85
+ </ul>
86
+ </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  """, unsafe_allow_html=True)
88
 
89
+ # Define Utility Functions
90
+ def is_incomplete(value):
91
+ return pd.isna(value) or value in ["", "N/A", "null", None]
92
+
93
+ def is_valid_date(value):
94
+ try:
95
+ pd.to_datetime(value)
96
+ return True
97
+ except:
98
+ return False
99
+
100
+ # Fetch data from LOC API
101
+ def fetch_loc_data(collection_url):
102
+ headers = {"User-Agent": "Mozilla/5.0"}
103
+ try:
104
+ response = requests.get(collection_url, headers=headers)
105
+ response.raise_for_status()
106
+ data = response.json()
107
+ if "results" in data:
108
+ return data["results"]
109
+ elif "items" in data:
110
+ return data["items"]
111
+ else:
112
+ return []
113
+ except Exception as e:
114
+ st.error(f"API Error: {e}")
115
+ return []
116
+
117
+ # Transform Records
118
+ def transform_records(records):
119
+ items = []
120
+ for record in records:
121
+ if isinstance(record, dict):
122
+ description = record.get("description", "")
123
+ if isinstance(description, list):
124
+ description = " ".join(map(str, description))
125
+ item = {
126
+ "id": record.get("id", ""),
127
+ "title": record.get("title", ""),
128
+ "date": record.get("date", ""),
129
+ "subject": ", ".join(record.get("subject", [])) if isinstance(record.get("subject"), list) else record.get("subject", ""),
130
+ "creator": record.get("creator", ""),
131
+ "description": description
132
+ }
133
+ items.append(item)
134
+ return pd.DataFrame(items)
135
+
136
+ # Render Main Application Sections
137
+ def render_main_sections(metadata_df, stats_placeholder, completeness_placeholder):
138
+ if not metadata_df.empty:
139
+ # Sidebar Quick Stats
140
+ incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
141
+ incomplete_count = incomplete_mask.sum()
142
+
143
+ total_fields = metadata_df.size
144
+ filled_fields = metadata_df.map(lambda x: not is_incomplete(x)).sum().sum()
145
+ overall_percent = (filled_fields / total_fields) * 100
146
+
147
+ # Completeness Table
148
+ completeness = metadata_df.map(lambda x: not is_incomplete(x)).mean() * 100
149
+ completeness_table = completeness.round(1).to_frame(name="Completeness (%)")
150
+
151
+ # Update sidebar placeholders
152
+ stats_html = f"""
153
+ <div class="sidebar-stats">
154
+ <h3 style="color: lightgray;">Quick Stats</h3>
155
+ <p style="color:lightgray;">Total Records: <b>{len(metadata_df)}</b></p>
156
+ <p style="color:lightgray;">Incomplete Records: <b>{incomplete_count}</b></p>
157
+ <p style="color:lightgray;">Overall Completeness: <b>{overall_percent:.1f}%</b></p>
158
+ </div>
159
+ """
160
+ stats_placeholder.markdown(stats_html, unsafe_allow_html=True)
161
+
162
+ # Field Completeness Breakdown (inside contrast block)
163
+ with completeness_placeholder:
164
+ st.markdown("""
165
+ <div style='background-color:#2e2e2e; padding:1.25rem; border-radius:8px; margin-top:1.5rem;'>
166
+ <h4 style='color: lightgray;'>Field Completeness Breakdown</h4>
167
+ """, unsafe_allow_html=True)
168
+
169
+ st.dataframe(
170
+ completeness_table.style.background_gradient(cmap="Greens").format("{:.1f}%"),
171
+ use_container_width=True
172
+ )
173
+
174
+ st.markdown("</div>", unsafe_allow_html=True)
175
+
176
+ # Main Body
177
+ st.subheader("📂 Retrieved Metadata Sample")
178
+ st.dataframe(metadata_df.head())
179
+
180
+ st.subheader("📊 Metadata Completeness Analysis")
181
+ completeness_df = completeness.reset_index()
182
+ completeness_df.columns = ["Field", "Completeness (%)"]
183
+ fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
184
+ st.plotly_chart(fig)
185
+
186
+ # Suggested Metadata Enhancements
187
+ st.subheader("✨ Suggested Metadata Enhancements")
188
+ incomplete_with_desc = metadata_df[incomplete_mask & metadata_df['description'].notna()]
189
+ reference_df = metadata_df[metadata_df['subject'].notna() & metadata_df['description'].notna()]
190
+ if not incomplete_with_desc.empty and not reference_df.empty:
191
  try:
192
+ tfidf = TfidfVectorizer(stop_words='english')
193
+ tfidf_matrix = tfidf.fit_transform(reference_df['description'])
194
+ suggestions = []
195
+ for idx, row in incomplete_with_desc.iterrows():
196
+ desc_vec = tfidf.transform([row['description']])
197
+ sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
198
+ top_idx = sims.argmax()
199
+ suggested_subject = reference_df.iloc[top_idx]['subject']
200
+ if suggested_subject:
201
+ suggestions.append((row['title'], suggested_subject))
202
+ if suggestions:
203
+ suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
204
+ st.markdown("<div class='custom-table'>" + suggestions_df.to_markdown(index=False) + "</div>", unsafe_allow_html=True)
205
+ else:
206
+ st.info("No suggestions could be generated.")
207
+ except Exception as e:
208
+ st.error(f"Error generating suggestions: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  else:
210
+ st.info("Not enough data for metadata enhancement suggestions.")
211
+ else:
212
+ st.warning("No metadata found for the selected collection.")
213
+