CCockrum committed on
Commit
4e04d7b
·
verified ·
1 Parent(s): e1cc37a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +147 -166
app.py CHANGED
@@ -1,126 +1,108 @@
 
1
  import requests
2
  import pandas as pd
3
  import numpy as np
4
  import streamlit as st
 
5
  import plotly.express as px
6
  from sklearn.feature_extraction.text import TfidfVectorizer
7
  from sklearn.metrics.pairwise import cosine_similarity
8
 
9
- # Custom CSS for styling to match the screenshot
10
  st.markdown("""
11
  <style>
12
- /* Main background and text colors */
13
  .main {
14
- background-color: #1A1A1A !important;
15
- color: white !important;
 
16
  }
17
-
18
- /* Container styling */
19
  .block-container {
20
- background-color: #1A1A1A !important;
21
- color: white !important;
22
- padding-left: 2rem !important;
23
- padding-right: 2rem !important;
24
  }
25
-
26
- /* Header styling */
27
- header[data-testid="stHeader"] {
28
- background-color: #1A1A1A !important;
29
- }
30
-
31
- /* Sidebar styling */
32
  section[data-testid="stSidebar"] > div:first-child {
33
- background-color: #1A1A1A !important;
34
- color: #FFFFFF !important;
35
- padding: 2rem 1.5rem 1.5rem 1.5rem !important;
 
 
 
 
 
 
 
36
  border-radius: 12px;
37
- box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
38
  }
39
-
40
- /* Overall app background */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  html, body, [data-testid="stApp"] {
42
  background-color: #1A1A1A !important;
43
  }
44
-
45
- /* Custom table styling */
46
  .custom-table {
47
- background-color: #2e2e2e;
48
- color: white;
49
  font-family: monospace;
50
  padding: 1rem;
51
  border-radius: 8px;
52
  overflow-x: auto;
53
  white-space: pre;
54
- border: 1px solid #444;
 
55
  }
56
-
57
- /* Sidebar stats styling */
58
  .sidebar-stats {
59
  color: lightgray !important;
60
  font-size: 1.1rem !important;
61
  margin-top: 1.5rem;
62
  font-weight: 600;
63
  }
64
-
65
- /* Sidebar contrast block */
66
  .sidebar-contrast-block {
67
- background-color: #2e2e2e !important;
68
  padding: 1.25rem;
69
  border-radius: 10px;
70
  margin-top: 1.5rem;
71
  }
72
-
73
- /* DataFrame styling */
74
- .stDataFrame {
75
- color: white !important;
76
- }
77
-
78
- /* Markdown text color */
79
- .stMarkdown {
80
- color: white !important;
81
- }
82
-
83
- /* Title styling */
84
- h1, h2, h3 {
85
- color: white !important;
86
- }
87
-
88
- /* Alert styling */
89
- .stAlert {
90
- background-color: #2e2e2e !important;
91
- color: white !important;
92
- padding: 1.25rem !important;
93
- font-size: 1rem !important;
94
- border-radius: 0.5rem !important;
95
- }
96
-
97
- /* Chart background */
98
- .js-plotly-plot .plotly .main-svg {
99
- background-color: #1A1A1A !important;
100
- }
101
-
102
- /* Completeness breakdown section */
103
- .field-completeness {
104
- background-color: #2e2e2e;
105
- padding: 1.2rem;
106
- border-radius: 10px;
107
- margin-top: 1.5rem;
108
- color: lightgray;
109
- }
110
- </style>
111
  """, unsafe_allow_html=True)
112
 
113
- # Banner image
114
  st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
115
 
116
- # App header
117
  st.title("MetaDiscovery Agent for Library of Congress Collections")
118
  st.markdown("""
119
- This tool connects to the LOC API, retrieves metadata from a selected collection, and performs an
120
- analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
121
  """)
122
 
123
- # Collection URLs using the correct LOC API format
124
  collections = {
125
  "American Revolutionary War Maps": "american+revolutionary+war+maps",
126
  "Civil War Maps": "civil+war+maps",
@@ -128,40 +110,62 @@ collections = {
128
  "World War I Posters": "world+war+posters"
129
  }
130
 
131
- # Initialize metadata_df variable
 
 
 
132
  metadata_df = pd.DataFrame()
133
 
134
- # Add collection selector to sidebar
135
  selected = st.sidebar.selectbox("Select a collection", list(collections.keys()), key="collection_selector")
136
  search_query = collections[selected]
137
 
138
  # Define the collection URL
139
  collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"
140
 
141
- # Create placeholders for sidebar elements
142
  stats_placeholder = st.sidebar.empty()
 
 
143
  completeness_placeholder = st.sidebar.empty()
144
 
145
- # Helpful Resources (styled section in sidebar)
 
 
 
146
  st.sidebar.markdown("""
147
- <div style='
148
- margin-top: 1.5rem;
149
- color: lightgray;
150
- '>
151
- <h3 style='font-size: 1.1rem; font-weight: 600;'>🔗 Helpful Resources</h3>
152
- <ul style='padding-left: 1em; list-style-type: none;'>
153
- <li><a href="https://www.loc.gov/apis/" target="_blank" style="color: lightgray; text-decoration: none;">LOC API Info</a></li>
154
- <li><a href="https://www.loc.gov/" target="_blank" style="color: lightgray; text-decoration: none;">Library of Congress Homepage</a></li>
155
- <li><a href="https://www.loc.gov/collections/" target="_blank" style="color: lightgray; text-decoration: none;">LOC Digital Collections</a></li>
156
- <li><a href="https://www.loc.gov/marc/" target="_blank" style="color: lightgray; text-decoration: none;">MARC Metadata Standards</a></li>
157
- <li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank" style="color: lightgray; text-decoration: none;">LOC Digital Strategy</a></li>
 
 
 
 
 
 
 
 
 
 
 
 
158
  </ul>
 
159
  </div>
160
  """, unsafe_allow_html=True)
161
 
162
- # Set fetch_data to True to automatically fetch data
163
- fetch_data = True
164
 
 
 
 
165
  if fetch_data:
166
  # Display a loading spinner while fetching data
167
  with st.spinner(f"Fetching data for {selected}..."):
@@ -228,23 +232,6 @@ if fetch_data:
228
  filled_fields = metadata_df.apply(lambda row: row.map(lambda x: not is_incomplete(x)), axis=1).sum().sum()
229
  overall_percent = (filled_fields / total_fields) * 100
230
 
231
- # Add "Overall Metadata Completeness" indicator to sidebar
232
- st.sidebar.markdown(
233
- f"""
234
- <div style='
235
- background-color: #2e2e2e;
236
- padding: 1rem;
237
- border-radius: 10px;
238
- margin-top: 1.5rem;
239
- text-align: center;
240
- '>
241
- <h3 style='color: lightgray; font-size: 1rem; margin-bottom: 0.5rem;'>Overall Metadata Completeness:</h3>
242
- <p style='color: white; font-size: 1.8rem; font-weight: bold; margin: 0;'>{overall_percent:.1f}%</p>
243
- </div>
244
- """,
245
- unsafe_allow_html=True
246
- )
247
-
248
  # Field-by-field completeness
249
  completeness = metadata_df.map(lambda x: not is_incomplete(x)).mean() * 100
250
  completeness_table = completeness.round(1).to_frame(name="Completeness (%)")
@@ -252,85 +239,79 @@ if fetch_data:
252
  # Render stats summary in sidebar
253
  stats_html = f"""
254
  <div class="sidebar-stats">
255
- <h3 style="color: lightgray; font-size: 1.1rem;">Quick Stats</h3>
256
  <p style="color:lightgray;">Total Records: <b>{len(metadata_df)}</b></p>
257
  <p style="color:lightgray;">Incomplete Records: <b>{incomplete_count}</b></p>
 
258
  </div>
259
  """
260
  stats_placeholder.markdown(stats_html, unsafe_allow_html=True)
261
 
262
- # Fill the Field Completeness Breakdown placeholder
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  with completeness_placeholder:
264
  st.markdown("""
265
- <div class='field-completeness'>
266
- <h4 style='margin-bottom: 1rem; color: lightgray;'>Field Completeness Breakdown</h4>
 
 
 
 
 
 
267
  """, unsafe_allow_html=True)
268
 
269
- # Create a dataframe showing completeness percentages
270
- completeness_df = pd.DataFrame({
271
- "Field": completeness.index,
272
- "Completeness (%)": completeness.values
273
- })
274
-
275
- # FIX: Format the values before styling to avoid the ValueError
276
- # Convert percentages to strings with format applied
277
- completeness_df["Completeness (%)"] = completeness_df["Completeness (%)"].apply(lambda x: f"{x:.1f}")
278
-
279
- # Display the dataframe directly in the sidebar
280
  st.dataframe(
281
- completeness_df, # No styling applied here to avoid format errors
282
  use_container_width=True,
283
  height=240
284
  )
285
 
286
  st.markdown("</div>", unsafe_allow_html=True)
287
 
288
- # Display retrieved metadata sample in main panel
289
- st.subheader("Retrieved Metadata Sample")
290
- st.dataframe(metadata_df.head())
291
 
292
- # Metadata completeness analysis (bar chart)
293
- st.subheader("Metadata Completeness Analysis")
294
-
295
- # FIX: Convert percentages to numeric for plotting
296
- completeness_df["Completeness (%)"] = pd.to_numeric(completeness_df["Completeness (%)"])
297
-
298
- # Create a bar chart with a dark theme to match the screenshot
299
- fig = px.bar(
300
- completeness_df,
301
- x="Field",
302
- y="Completeness (%)",
303
- title="Metadata Completeness by Field",
304
- color="Completeness (%)",
305
- color_continuous_scale="Greens"
306
- )
307
-
308
- # Update the chart layout to match dark theme
309
- fig.update_layout(
310
- plot_bgcolor="#1A1A1A",
311
- paper_bgcolor="#1A1A1A",
312
- font_color="white",
313
- title_font_color="white",
314
- margin=dict(l=10, r=10, t=40, b=10),
315
- coloraxis_showscale=False
316
- )
317
-
318
- # Update axes
319
- fig.update_xaxes(title_font_color="white", tickfont_color="white", gridcolor="#333333")
320
- fig.update_yaxes(title_font_color="white", tickfont_color="white", gridcolor="#333333")
321
-
322
- st.plotly_chart(fig, use_container_width=True)
323
 
324
- # Enhanced Metadata section
325
- st.subheader("✨ Suggested Metadata Enhancements")
326
 
327
- # Identify incomplete records with descriptions
 
328
  incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
329
  incomplete_records = metadata_df[incomplete_mask]
 
 
 
330
  incomplete_with_desc = incomplete_records[incomplete_records['description'].notnull()]
331
  reference_df = metadata_df[metadata_df['subject'].notnull() & metadata_df['description'].notnull()]
332
-
333
- # Create TF-IDF vectorizer
334
  tfidf = TfidfVectorizer(stop_words='english')
335
 
336
  if len(incomplete_with_desc) > 1 and len(reference_df) > 1:
 
1
+ # MetaDiscovery Agent - LOC API with Enhanced Completeness and Quality Analysis
2
  import requests
3
  import pandas as pd
4
  import numpy as np
5
  import streamlit as st
6
+ import matplotlib
7
  import plotly.express as px
8
  from sklearn.feature_extraction.text import TfidfVectorizer
9
  from sklearn.metrics.pairwise import cosine_similarity
10
 
11
+ # Custom CSS for gray/dark backgrounds, styled sidebar, banner, and dark grey font
12
  st.markdown("""
13
  <style>
14
+
15
  .main {
16
+ background-color: #D3D3D3 !important;
17
+ color: #1A1A1A!important;
18
+
19
  }
 
 
20
  .block-container {
21
+ background-color: gray !important;
22
+ color: #808080!important;
 
 
23
  }
 
 
 
 
 
 
 
24
  section[data-testid="stSidebar"] > div:first-child {
25
+ background-color: #808080 !important;
26
+ padding: 1rem;
27
+ border-radius: 0.5rem;
28
+ color: #808080 !important;
29
+ }
30
+ .stMarkdown, .stTextInput, .stDataFrame {
31
+ color: #1A1A1A!important;
32
+ }
33
+ img.banner {
34
+ width: 100%;
35
  border-radius: 12px;
36
+ margin-bottom: 1rem;
37
  }
38
+ .stAlert {
39
+ background-color: #f0f0f5 !important;
40
+ color: #333333 !important;
41
+ padding: 1.25rem !important;
42
+ font-size: 1rem !important;
43
+ border-radius: 0.5rem !important;
44
+ box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05) !important;
45
+ }
46
+ header[data-testid="stHeader"] {
47
+ background-color: gray !important;
48
+ }
49
+ section[data-testid="stSidebar"] > div:first-child {
50
+ background-color: #1A1A1A !important;
51
+ color: #FFFFFF !important;
52
+ padding: 2rem 1.5rem 1.5rem 1.5rem !important;
53
+ border-radius: 12px;
54
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
55
+ font-size: 0.95rem;
56
+ line-height: 1.5;
57
+ }
58
+ .block-container {
59
+ background-color: gray !important;
60
+ color: #1A1A1A !important;
61
+ padding-left: 2rem !important;
62
+ padding-right: 2rem !important;
63
+ box-shadow: none !important;
64
+ }
65
  html, body, [data-testid="stApp"] {
66
  background-color: #1A1A1A !important;
67
  }
 
 
68
  .custom-table {
69
+ background-color: #D3D3D3;
70
+ color: #1A1A1A;
71
  font-family: monospace;
72
  padding: 1rem;
73
  border-radius: 8px;
74
  overflow-x: auto;
75
  white-space: pre;
76
+ border: 1px solid #ccc;
77
+
78
  }
 
 
79
  .sidebar-stats {
80
  color: lightgray !important;
81
  font-size: 1.1rem !important;
82
  margin-top: 1.5rem;
83
  font-weight: 600;
84
  }
 
 
85
  .sidebar-contrast-block {
86
+ background-color: #2b2b2b !important; /* Slightly lighter than #1A1A1A */
87
  padding: 1.25rem;
88
  border-radius: 10px;
89
  margin-top: 1.5rem;
90
  }
91
+
92
+ </style>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  """, unsafe_allow_html=True)
94
 
95
+ # OPTION 1: Use an image from a URL for the banner
96
  st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
97
 
98
+ # Streamlit app header
99
  st.title("MetaDiscovery Agent for Library of Congress Collections")
100
  st.markdown("""
101
+ This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
102
+ an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
103
  """)
104
 
105
+ # Updated collection URLs using the correct LOC API format
106
  collections = {
107
  "American Revolutionary War Maps": "american+revolutionary+war+maps",
108
  "Civil War Maps": "civil+war+maps",
 
110
  "World War I Posters": "world+war+posters"
111
  }
112
 
113
+ # Sidebar for selecting collection
114
+ #st.sidebar.markdown("## Settings")
115
+
116
+ # Create empty metadata_df variable to ensure it exists before checking
117
  metadata_df = pd.DataFrame()
118
 
119
+ # Add a key to the selectbox to ensure it refreshes properly
120
  selected = st.sidebar.selectbox("Select a collection", list(collections.keys()), key="collection_selector")
121
  search_query = collections[selected]
122
 
123
  # Define the collection URL
124
  collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"
125
 
126
+ # Create an empty placeholder for Quick Stats
127
  stats_placeholder = st.sidebar.empty()
128
+
129
+ # Create placeholder for Field Completeness Breakdown
130
  completeness_placeholder = st.sidebar.empty()
131
 
132
+ # Helpful Resources (styled and moved below dropdown)
133
+ st.sidebar.markdown("### Helpful Resources", unsafe_allow_html=True)
134
+ # Helpful Resources styled section
135
+ # 3. Helpful Resources Section (Fixed, under Completeness)
136
  st.sidebar.markdown("""
137
+ <style>
138
+ .sidebar-section h3 {
139
+ color: lightgray !important;
140
+ font-size: 1.1rem !important;
141
+ margin-top: 1.5rem;
142
+ }
143
+ .sidebar-links a {
144
+ color: lightgray !important;
145
+ text-decoration: none !important;
146
+ }
147
+ .sidebar-links a:hover {
148
+ text-decoration: underline !important;
149
+ }
150
+ </style>
151
+ <div class="sidebar-section">
152
+ <h3>🔗 Helpful Resources</h3>
153
+ <div class="sidebar-links">
154
+ <ul style='padding-left: 1em'>
155
+ <li><a href="https://www.loc.gov/apis/" target="_blank">LOC API Info</a></li>
156
+ <li><a href="https://www.loc.gov/" target="_blank">Library of Congress Homepage</a></li>
157
+ <li><a href="https://www.loc.gov/collections/" target="_blank">LOC Digital Collections</a></li>
158
+ <li><a href="https://www.loc.gov/marc/" target="_blank">MARC Metadata Standards</a></li>
159
+ <li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank">LOC Digital Strategy</a></li>
160
  </ul>
161
+ </div>
162
  </div>
163
  """, unsafe_allow_html=True)
164
 
 
 
165
 
166
+ # Always fetch on each run: fetch_data is hard-coded to True rather than driven by a button
167
+ fetch_data = True
168
+
169
  if fetch_data:
170
  # Display a loading spinner while fetching data
171
  with st.spinner(f"Fetching data for {selected}..."):
 
232
  filled_fields = metadata_df.apply(lambda row: row.map(lambda x: not is_incomplete(x)), axis=1).sum().sum()
233
  overall_percent = (filled_fields / total_fields) * 100
234
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  # Field-by-field completeness
236
  completeness = metadata_df.map(lambda x: not is_incomplete(x)).mean() * 100
237
  completeness_table = completeness.round(1).to_frame(name="Completeness (%)")
 
239
  # Render stats summary in sidebar
240
  stats_html = f"""
241
  <div class="sidebar-stats">
242
+ <h3 style="color: lightgray;">Quick Stats</h3>
243
  <p style="color:lightgray;">Total Records: <b>{len(metadata_df)}</b></p>
244
  <p style="color:lightgray;">Incomplete Records: <b>{incomplete_count}</b></p>
245
+ <p style="color:lightgray;">Overall Metadata Completeness: <b>{overall_percent:.1f}%</b></p>
246
  </div>
247
  """
248
  stats_placeholder.markdown(stats_html, unsafe_allow_html=True)
249
 
250
+
251
+ # Utility functions for deeper metadata quality analysis
252
def is_incomplete(value):
    """Return True when a metadata value should be counted as missing.

    A value is considered incomplete when it is ``None``, a pandas/NumPy
    missing marker (NaN, NaT, ``pd.NA``), or one of the placeholder strings
    ``""``, ``"N/A"`` or ``"null"``.

    List-like cells (list, tuple, set, dict, array) are incomplete when they
    are empty or when every member is itself incomplete. The previous
    ``pd.isna(value) or ...`` form raised ``ValueError`` for such cells
    because ``pd.isna`` returns an array for list-like input and an array
    with more than one element has no single truth value.

    Args:
        value: A single cell taken from the metadata DataFrame.

    Returns:
        bool: True if the value carries no usable information.
    """
    if value is None:
        return True
    if isinstance(value, str):
        # Exact match only: surrounding whitespace is deliberately not stripped
        # so existing completeness figures for string fields stay the same.
        return value in ("", "N/A", "null")
    if pd.api.types.is_list_like(value):
        # For mappings the values, not the keys, carry the metadata.
        members = value.values() if isinstance(value, dict) else value
        # all() over an empty iterable is True, so empty containers are incomplete.
        return all(is_incomplete(member) for member in members)
    # Scalar path: pd.isna returns a single (NumPy) boolean here.
    return bool(pd.isna(value))
254
+
255
def is_valid_date(value):
    """Return True when ``pd.to_datetime`` can convert ``value`` without error.

    Only conversion failures are treated as "invalid". The previous bare
    ``except:`` also swallowed ``KeyboardInterrupt`` and ``SystemExit``,
    which should propagate.

    Args:
        value: The candidate date (typically a string cell from the metadata).

    Returns:
        bool: True if the conversion succeeded, False if it raised a
        conversion error.

    Note:
        The check is only "did the conversion raise?". Missing inputs such as
        ``None`` are expected to convert without raising and are therefore
        reported as valid; combine with a missing-value check when that
        distinction matters.
    """
    try:
        pd.to_datetime(value)
    except (ValueError, TypeError, OverflowError):
        # ValueError covers unparseable and out-of-bounds dates (pandas' own
        # parsing errors derive from it); TypeError covers unsupported objects.
        return False
    return True
261
+
262
+ if not metadata_df.empty:
263
+ st.subheader("Retrieved Metadata Sample")
264
+ st.dataframe(metadata_df.head())
265
+
266
+ # Metadata completeness analysis (enhanced)
267
+ st.subheader("Metadata Completeness Analysis")
268
+ # Create the completeness table
269
+ completeness = metadata_df.map(lambda x: not is_incomplete(x)).mean() * 100
270
+ completeness_df = pd.DataFrame({
271
+ "Field": completeness.index,
272
+ "Completeness (%)": completeness.values
273
+ })
274
+ completeness_table = completeness_df.set_index("Field")
275
+
276
+ # FILL THE PLACEHOLDER created earlier
277
+
278
+ # FILL THE PLACEHOLDER created earlier
279
  with completeness_placeholder:
280
  st.markdown("""
281
+ <div style='
282
+ background-color: #2e2e2e;
283
+ padding: 1.2rem;
284
+ border-radius: 10px;
285
+ margin-top: 1.5rem;
286
+ color: lightgray;
287
+ '>
288
+ <h4 style='margin-bottom: 1rem;'>📊 Field Completeness Breakdown</h4>
289
  """, unsafe_allow_html=True)
290
 
 
 
 
 
 
 
 
 
 
 
 
291
  st.dataframe(
292
+ completeness_table.style.background_gradient(cmap="Greens").format("{:.1f}%"),
293
  use_container_width=True,
294
  height=240
295
  )
296
 
297
  st.markdown("</div>", unsafe_allow_html=True)
298
 
299
+ completeness_table = completeness.round(1).to_frame(name="Completeness (%)")
 
 
300
 
301
+ # Then continue plotting in main panel
302
+ fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
303
+ st.plotly_chart(fig)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
 
 
 
305
 
306
+
307
+ # Identify incomplete records
308
  incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
309
  incomplete_records = metadata_df[incomplete_mask]
310
+
311
+ st.subheader("✨ Suggested Metadata Enhancements")
312
+
313
  incomplete_with_desc = incomplete_records[incomplete_records['description'].notnull()]
314
  reference_df = metadata_df[metadata_df['subject'].notnull() & metadata_df['description'].notnull()]
 
 
315
  tfidf = TfidfVectorizer(stop_words='english')
316
 
317
  if len(incomplete_with_desc) > 1 and len(reference_df) > 1: