CCockrum committed on
Commit
21b2b3f
·
verified ·
1 Parent(s): abfa7bf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +159 -145
app.py CHANGED
@@ -1,108 +1,126 @@
1
- # MetaDiscovery Agent - LOC API with Enhanced Completeness and Quality Analysis
2
  import requests
3
  import pandas as pd
4
  import numpy as np
5
  import streamlit as st
6
- import matplotlib
7
  import plotly.express as px
8
  from sklearn.feature_extraction.text import TfidfVectorizer
9
  from sklearn.metrics.pairwise import cosine_similarity
10
 
11
- # Custom CSS for white background, styled sidebar, banner, and dark grey font
12
  st.markdown("""
13
  <style>
14
-
15
  .main {
16
- background-color: #D3D3D3 !important;
17
- color: #1A1A1A!important;
18
-
19
  }
 
 
20
  .block-container {
21
- background-color: gray !important;
22
- color: #808080!important;
23
- }
24
- section[data-testid="stSidebar"] > div:first-child {
25
- background-color: #808080 !important;
26
- padding: 1rem;
27
- border-radius: 0.5rem;
28
- color: #808080 !important;
29
  }
30
- .stMarkdown, .stTextInput, .stDataFrame {
31
- color: #1A1A1A!important;
 
 
32
  }
33
- img.banner {
34
- width: 100%;
 
 
 
 
35
  border-radius: 12px;
36
- margin-bottom: 1rem;
37
- }
38
- .stAlert {
39
- background-color: #f0f0f5 !important;
40
- color: #333333 !important;
41
- padding: 1.25rem !important;
42
- font-size: 1rem !important;
43
- border-radius: 0.5rem !important;
44
- box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05) !important;
45
  }
46
- header[data-testid="stHeader"] {
47
- background-color: gray !important;
48
- }
49
- section[data-testid="stSidebar"] > div:first-child {
50
- background-color: #1A1A1A !important;
51
- color: #FFFFFF !important;
52
- padding: 2rem 1.5rem 1.5rem 1.5rem !important;
53
- border-radius: 12px;
54
- box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
55
- font-size: 0.95rem;
56
- line-height: 1.5;
57
- }
58
- .block-container {
59
- background-color: gray !important;
60
- color: #1A1A1A !important;
61
- padding-left: 2rem !important;
62
- padding-right: 2rem !important;
63
- box-shadow: none !important;
64
- }
65
  html, body, [data-testid="stApp"] {
66
  background-color: #1A1A1A !important;
67
  }
 
 
68
  .custom-table {
69
- background-color: #D3D3D3;
70
- color: #1A1A1A;
71
  font-family: monospace;
72
  padding: 1rem;
73
  border-radius: 8px;
74
  overflow-x: auto;
75
  white-space: pre;
76
- border: 1px solid #ccc;
77
-
78
  }
 
 
79
  .sidebar-stats {
80
  color: lightgray !important;
81
  font-size: 1.1rem !important;
82
  margin-top: 1.5rem;
83
  font-weight: 600;
84
  }
 
 
85
  .sidebar-contrast-block {
86
- background-color: #2b2b2b !important; /* Slightly lighter than #1A1A1A */
87
  padding: 1.25rem;
88
  border-radius: 10px;
89
  margin-top: 1.5rem;
90
  }
91
-
92
- </style>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  """, unsafe_allow_html=True)
94
 
95
- # OPTION 1: Use an image from a URL for the banner
96
  st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
97
 
98
- # Streamlit app header
99
  st.title("MetaDiscovery Agent for Library of Congress Collections")
100
  st.markdown("""
101
- This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
102
- an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
103
  """)
104
 
105
- # Updated collection URLs using the correct LOC API format
106
  collections = {
107
  "American Revolutionary War Maps": "american+revolutionary+war+maps",
108
  "Civil War Maps": "civil+war+maps",
@@ -110,62 +128,40 @@ collections = {
110
  "World War I Posters": "world+war+posters"
111
  }
112
 
113
- # Sidebar for selecting collection
114
- #st.sidebar.markdown("## Settings")
115
-
116
- # Create empty metadata_df variable to ensure it exists before checking
117
  metadata_df = pd.DataFrame()
118
 
119
- # Add a key to the selectbox to ensure it refreshes properly
120
  selected = st.sidebar.selectbox("Select a collection", list(collections.keys()), key="collection_selector")
121
  search_query = collections[selected]
122
 
123
  # Define the collection URL
124
  collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"
125
 
126
- # Create an empty placeholder for Quick Stats
127
  stats_placeholder = st.sidebar.empty()
128
-
129
- # Create placeholder for Field Completeness Breakdown
130
  completeness_placeholder = st.sidebar.empty()
131
 
132
- # Helpful Resources (styled and moved below dropdown)
133
- st.sidebar.markdown("### Helpful Resources", unsafe_allow_html=True)
134
- # Helpful Resources styled section
135
- # 3. Helpful Resources Section (Fixed, under Completeness)
136
  st.sidebar.markdown("""
137
- <style>
138
- .sidebar-section h3 {
139
- color: lightgray !important;
140
- font-size: 1.1rem !important;
141
- margin-top: 1.5rem;
142
- }
143
- .sidebar-links a {
144
- color: lightgray !important;
145
- text-decoration: none !important;
146
- }
147
- .sidebar-links a:hover {
148
- text-decoration: underline !important;
149
- }
150
- </style>
151
- <div class="sidebar-section">
152
- <h3>🔗 Helpful Resources</h3>
153
- <div class="sidebar-links">
154
- <ul style='padding-left: 1em'>
155
- <li><a href="https://www.loc.gov/apis/" target="_blank">LOC API Info</a></li>
156
- <li><a href="https://www.loc.gov/" target="_blank">Library of Congress Homepage</a></li>
157
- <li><a href="https://www.loc.gov/collections/" target="_blank">LOC Digital Collections</a></li>
158
- <li><a href="https://www.loc.gov/marc/" target="_blank">MARC Metadata Standards</a></li>
159
- <li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank">LOC Digital Strategy</a></li>
160
  </ul>
161
- </div>
162
  </div>
163
  """, unsafe_allow_html=True)
164
 
165
-
166
- # Add a fetch button to make the action explicit
167
  fetch_data = True
168
-
169
  if fetch_data:
170
  # Display a loading spinner while fetching data
171
  with st.spinner(f"Fetching data for {selected}..."):
@@ -232,6 +228,23 @@ if fetch_data:
232
  filled_fields = metadata_df.apply(lambda row: row.map(lambda x: not is_incomplete(x)), axis=1).sum().sum()
233
  overall_percent = (filled_fields / total_fields) * 100
234
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  # Field-by-field completeness
236
  completeness = metadata_df.map(lambda x: not is_incomplete(x)).mean() * 100
237
  completeness_table = completeness.round(1).to_frame(name="Completeness (%)")
@@ -239,77 +252,78 @@ if fetch_data:
239
  # Render stats summary in sidebar
240
  stats_html = f"""
241
  <div class="sidebar-stats">
242
- <h3 style="color: lightgray;">Quick Stats</h3>
243
  <p style="color:lightgray;">Total Records: <b>{len(metadata_df)}</b></p>
244
  <p style="color:lightgray;">Incomplete Records: <b>{incomplete_count}</b></p>
245
- <p style="color:lightgray;">Overall Metadata Completeness: <b>{overall_percent:.1f}%</b></p>
246
  </div>
247
  """
248
  stats_placeholder.markdown(stats_html, unsafe_allow_html=True)
249
 
250
-
251
- # Utility functions for deeper metadata quality analysis
252
- def is_incomplete(value):
253
- return pd.isna(value) or value in ["", "N/A", "null", None]
254
-
255
- def is_valid_date(value):
256
- try:
257
- pd.to_datetime(value)
258
- return True
259
- except:
260
- return False
261
-
262
- if not metadata_df.empty:
263
- st.subheader("Retrieved Metadata Sample")
264
- st.dataframe(metadata_df.head())
265
-
266
- # Metadata completeness analysis (enhanced)
267
- st.subheader("Metadata Completeness Analysis")
268
- # Create the completeness table
269
- completeness = metadata_df.map(lambda x: not is_incomplete(x)).mean() * 100
270
- completeness_df = pd.DataFrame({
271
- "Field": completeness.index,
272
- "Completeness (%)": completeness.values
273
- })
274
- completeness_table = completeness_df.set_index("Field")
275
-
276
- # FILL THE PLACEHOLDER created earlier
277
-
278
  with completeness_placeholder:
279
  st.markdown("""
280
- <div style='
281
- background-color: #2e2e2e;
282
- padding: 1.2rem;
283
- border-radius: 10px;
284
- margin-top: 1.5rem;
285
- color: lightgray;
286
- '>
287
- <h4 style='margin-bottom: 1rem;'>Field Completeness Breakdown</h4>
288
  """, unsafe_allow_html=True)
289
 
 
 
 
 
 
 
 
290
  st.dataframe(
291
- completeness_table.style.background_gradient(cmap="Greens").format("{:.1f}%"),
292
  use_container_width=True,
293
  height=240
294
  )
295
 
296
  st.markdown("</div>", unsafe_allow_html=True)
297
 
 
 
 
298
 
299
- # Then continue plotting in main panel
300
- fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
301
- st.plotly_chart(fig)
302
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
 
 
 
304
 
305
- # Identify incomplete records
306
  incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
307
  incomplete_records = metadata_df[incomplete_mask]
308
-
309
- st.subheader("✨ Suggested Metadata Enhancements")
310
-
311
  incomplete_with_desc = incomplete_records[incomplete_records['description'].notnull()]
312
  reference_df = metadata_df[metadata_df['subject'].notnull() & metadata_df['description'].notnull()]
 
 
313
  tfidf = TfidfVectorizer(stop_words='english')
314
 
315
  if len(incomplete_with_desc) > 1 and len(reference_df) > 1:
 
 
1
  import requests
2
  import pandas as pd
3
  import numpy as np
4
  import streamlit as st
 
5
  import plotly.express as px
6
  from sklearn.feature_extraction.text import TfidfVectorizer
7
  from sklearn.metrics.pairwise import cosine_similarity
8
 
9
+ # Custom CSS for styling to match the screenshot
10
  st.markdown("""
11
  <style>
12
+ /* Main background and text colors */
13
  .main {
14
+ background-color: #1A1A1A !important;
15
+ color: white !important;
 
16
  }
17
+
18
+ /* Container styling */
19
  .block-container {
20
+ background-color: #1A1A1A !important;
21
+ color: white !important;
22
+ padding-left: 2rem !important;
23
+ padding-right: 2rem !important;
 
 
 
 
24
  }
25
+
26
+ /* Header styling */
27
+ header[data-testid="stHeader"] {
28
+ background-color: #1A1A1A !important;
29
  }
30
+
31
+ /* Sidebar styling */
32
+ section[data-testid="stSidebar"] > div:first-child {
33
+ background-color: #1A1A1A !important;
34
+ color: #FFFFFF !important;
35
+ padding: 2rem 1.5rem 1.5rem 1.5rem !important;
36
  border-radius: 12px;
37
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
 
 
 
 
 
 
 
 
38
  }
39
+
40
+ /* Overall app background */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  html, body, [data-testid="stApp"] {
42
  background-color: #1A1A1A !important;
43
  }
44
+
45
+ /* Custom table styling */
46
  .custom-table {
47
+ background-color: #2e2e2e;
48
+ color: white;
49
  font-family: monospace;
50
  padding: 1rem;
51
  border-radius: 8px;
52
  overflow-x: auto;
53
  white-space: pre;
54
+ border: 1px solid #444;
 
55
  }
56
+
57
+ /* Sidebar stats styling */
58
  .sidebar-stats {
59
  color: lightgray !important;
60
  font-size: 1.1rem !important;
61
  margin-top: 1.5rem;
62
  font-weight: 600;
63
  }
64
+
65
+ /* Sidebar contrast block */
66
  .sidebar-contrast-block {
67
+ background-color: #2e2e2e !important;
68
  padding: 1.25rem;
69
  border-radius: 10px;
70
  margin-top: 1.5rem;
71
  }
72
+
73
+ /* DataFrame styling */
74
+ .stDataFrame {
75
+ color: white !important;
76
+ }
77
+
78
+ /* Markdown text color */
79
+ .stMarkdown {
80
+ color: white !important;
81
+ }
82
+
83
+ /* Title styling */
84
+ h1, h2, h3 {
85
+ color: white !important;
86
+ }
87
+
88
+ /* Alert styling */
89
+ .stAlert {
90
+ background-color: #2e2e2e !important;
91
+ color: white !important;
92
+ padding: 1.25rem !important;
93
+ font-size: 1rem !important;
94
+ border-radius: 0.5rem !important;
95
+ }
96
+
97
+ /* Chart background */
98
+ .js-plotly-plot .plotly .main-svg {
99
+ background-color: #1A1A1A !important;
100
+ }
101
+
102
+ /* Completeness breakdown section */
103
+ .field-completeness {
104
+ background-color: #2e2e2e;
105
+ padding: 1.2rem;
106
+ border-radius: 10px;
107
+ margin-top: 1.5rem;
108
+ color: lightgray;
109
+ }
110
+ </style>
111
  """, unsafe_allow_html=True)
112
 
113
+ # Banner image
114
  st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
115
 
116
+ # App header
117
  st.title("MetaDiscovery Agent for Library of Congress Collections")
118
  st.markdown("""
119
+ This tool connects to the LOC API, retrieves metadata from a selected collection, and performs an
120
+ analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
121
  """)
122
 
123
+ # Collection URLs using the correct LOC API format
124
  collections = {
125
  "American Revolutionary War Maps": "american+revolutionary+war+maps",
126
  "Civil War Maps": "civil+war+maps",
 
128
  "World War I Posters": "world+war+posters"
129
  }
130
 
131
+ # Initialize metadata_df variable
 
 
 
132
  metadata_df = pd.DataFrame()
133
 
134
+ # Add collection selector to sidebar
135
  selected = st.sidebar.selectbox("Select a collection", list(collections.keys()), key="collection_selector")
136
  search_query = collections[selected]
137
 
138
  # Define the collection URL
139
  collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"
140
 
141
+ # Create placeholders for sidebar elements
142
  stats_placeholder = st.sidebar.empty()
 
 
143
  completeness_placeholder = st.sidebar.empty()
144
 
145
+ # Helpful Resources (styled section in sidebar)
 
 
 
146
  st.sidebar.markdown("""
147
+ <div style='
148
+ margin-top: 1.5rem;
149
+ color: lightgray;
150
+ '>
151
+ <h3 style='font-size: 1.1rem; font-weight: 600;'>🔗 Helpful Resources</h3>
152
+ <ul style='padding-left: 1em; list-style-type: none;'>
153
+ <li><a href="https://www.loc.gov/apis/" target="_blank" style="color: lightgray; text-decoration: none;">LOC API Info</a></li>
154
+ <li><a href="https://www.loc.gov/" target="_blank" style="color: lightgray; text-decoration: none;">Library of Congress Homepage</a></li>
155
+ <li><a href="https://www.loc.gov/collections/" target="_blank" style="color: lightgray; text-decoration: none;">LOC Digital Collections</a></li>
156
+ <li><a href="https://www.loc.gov/marc/" target="_blank" style="color: lightgray; text-decoration: none;">MARC Metadata Standards</a></li>
157
+ <li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank" style="color: lightgray; text-decoration: none;">LOC Digital Strategy</a></li>
 
 
 
 
 
 
 
 
 
 
 
 
158
  </ul>
 
159
  </div>
160
  """, unsafe_allow_html=True)
161
 
162
+ # Set fetch_data to True to automatically fetch data
 
163
  fetch_data = True
164
+
165
  if fetch_data:
166
  # Display a loading spinner while fetching data
167
  with st.spinner(f"Fetching data for {selected}..."):
 
228
  filled_fields = metadata_df.apply(lambda row: row.map(lambda x: not is_incomplete(x)), axis=1).sum().sum()
229
  overall_percent = (filled_fields / total_fields) * 100
230
 
231
+ # Add "Overall Metadata Completeness" indicator to sidebar
232
+ st.sidebar.markdown(
233
+ f"""
234
+ <div style='
235
+ background-color: #2e2e2e;
236
+ padding: 1rem;
237
+ border-radius: 10px;
238
+ margin-top: 1.5rem;
239
+ text-align: center;
240
+ '>
241
+ <h3 style='color: lightgray; font-size: 1rem; margin-bottom: 0.5rem;'>Overall Metadata Completeness:</h3>
242
+ <p style='color: white; font-size: 1.8rem; font-weight: bold; margin: 0;'>{overall_percent:.1f}%</p>
243
+ </div>
244
+ """,
245
+ unsafe_allow_html=True
246
+ )
247
+
248
  # Field-by-field completeness
249
  completeness = metadata_df.map(lambda x: not is_incomplete(x)).mean() * 100
250
  completeness_table = completeness.round(1).to_frame(name="Completeness (%)")
 
252
  # Render stats summary in sidebar
253
  stats_html = f"""
254
  <div class="sidebar-stats">
255
+ <h3 style="color: lightgray; font-size: 1.1rem;">Quick Stats</h3>
256
  <p style="color:lightgray;">Total Records: <b>{len(metadata_df)}</b></p>
257
  <p style="color:lightgray;">Incomplete Records: <b>{incomplete_count}</b></p>
 
258
  </div>
259
  """
260
  stats_placeholder.markdown(stats_html, unsafe_allow_html=True)
261
 
262
+ # Fill the Field Completeness Breakdown placeholder
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  with completeness_placeholder:
264
  st.markdown("""
265
+ <div class='field-completeness'>
266
+ <h4 style='margin-bottom: 1rem; color: lightgray;'>Field Completeness Breakdown</h4>
 
 
 
 
 
 
267
  """, unsafe_allow_html=True)
268
 
269
+ # Create a styled dataframe showing completeness percentages
270
+ completeness_df = pd.DataFrame({
271
+ "Field": completeness.index,
272
+ "Completeness (%)": completeness.values
273
+ })
274
+
275
+ # Display the dataframe directly in the sidebar
276
  st.dataframe(
277
+ completeness_df.style.background_gradient(cmap="Greens").format("{:.1f}%"),
278
  use_container_width=True,
279
  height=240
280
  )
281
 
282
  st.markdown("</div>", unsafe_allow_html=True)
283
 
284
+ # Display retrieved metadata sample in main panel
285
+ st.subheader("Retrieved Metadata Sample")
286
+ st.dataframe(metadata_df.head())
287
 
288
+ # Metadata completeness analysis (bar chart)
289
+ st.subheader("Metadata Completeness Analysis")
290
+
291
+ # Create a bar chart with a dark theme to match the screenshot
292
+ fig = px.bar(
293
+ completeness_df,
294
+ x="Field",
295
+ y="Completeness (%)",
296
+ title="Metadata Completeness by Field",
297
+ color="Completeness (%)",
298
+ color_continuous_scale="Greens"
299
+ )
300
+
301
+ # Update the chart layout to match dark theme
302
+ fig.update_layout(
303
+ plot_bgcolor="#1A1A1A",
304
+ paper_bgcolor="#1A1A1A",
305
+ font_color="white",
306
+ title_font_color="white",
307
+ margin=dict(l=10, r=10, t=40, b=10),
308
+ coloraxis_showscale=False
309
+ )
310
+
311
+ # Update axes
312
+ fig.update_xaxes(title_font_color="white", tickfont_color="white", gridcolor="#333333")
313
+ fig.update_yaxes(title_font_color="white", tickfont_color="white", gridcolor="#333333")
314
+
315
+ st.plotly_chart(fig, use_container_width=True)
316
 
317
+ # Enhanced Metadata section
318
+ st.subheader("✨ Suggested Metadata Enhancements")
319
 
320
+ # Identify incomplete records with descriptions
321
  incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
322
  incomplete_records = metadata_df[incomplete_mask]
 
 
 
323
  incomplete_with_desc = incomplete_records[incomplete_records['description'].notnull()]
324
  reference_df = metadata_df[metadata_df['subject'].notnull() & metadata_df['description'].notnull()]
325
+
326
+ # Create TF-IDF vectorizer
327
  tfidf = TfidfVectorizer(stop_words='english')
328
 
329
  if len(incomplete_with_desc) > 1 and len(reference_df) > 1: