CCockrum committed on
Commit
c3039ab
·
verified ·
1 Parent(s): e4f3827

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -118
app.py CHANGED
@@ -6,7 +6,6 @@ import streamlit as st
6
  import plotly.express as px
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sklearn.metrics.pairwise import cosine_similarity
9
- import os
10
 
11
  # Custom CSS for white background, styled sidebar, banner, and dark grey font
12
  st.markdown("""
@@ -37,18 +36,18 @@ st.markdown("""
37
  }
38
  .stAlert {
39
  background-color: #f0f0f5 !important;
40
- color: #D3D3D3 !important;
41
  padding: 1.25rem !important;
42
  font-size: 1rem !important;
43
  border-radius: 0.5rem !important;
44
  box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05) !important;
45
  }
46
  header[data-testid="stHeader"] {
47
- background-color: gray !important;
48
  }
49
  section[data-testid="stSidebar"] > div:first-child {
50
  background-color: #1A1A1A !important;
51
- color: #D3D3D3 !important;
52
  padding: 2rem 1.5rem 1.5rem 1.5rem !important;
53
  border-radius: 12px;
54
  box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
@@ -68,12 +67,11 @@ st.markdown("""
68
  </style>
69
  """, unsafe_allow_html=True)
70
 
71
- # Near the top of your app, after the CSS styling
72
- st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
73
-
74
 
75
  # Streamlit app header
76
- st.title("Library of Congress Collections Analysis Tool")
77
  st.markdown("""
78
  This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
79
  an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
@@ -89,7 +87,9 @@ collections = {
89
 
90
  # Sidebar for selecting collection
91
  st.sidebar.markdown("## Settings")
92
- selected = st.sidebar.selectbox("Select a collection", list(collections.keys()))
 
 
93
  search_query = collections[selected]
94
 
95
  # Define the collection URL
@@ -99,116 +99,122 @@ collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"
99
  st.sidebar.write(f"Selected Collection: {selected}")
100
  st.sidebar.markdown(f"<span style='color: lightgray;'>API URL: {collection_url}</span>", unsafe_allow_html=True)
101
 
102
- # Fetch data from LOC API with spoofed User-Agent header
103
- headers = {
104
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/110.0.0.0 Safari/537.36"
105
- }
106
 
107
- try:
108
- response = requests.get(collection_url, headers=headers)
109
- response.raise_for_status()
110
- data = response.json()
111
-
112
- if "results" in data:
113
- records = data.get("results", [])
114
- elif "items" in data:
115
- records = data.get("items", [])
116
- else:
117
- records = []
118
- st.error("Unexpected API response structure. No records found.")
119
- st.write(f"Retrieved {len(records)} records")
120
-
121
- except requests.exceptions.RequestException as e:
122
- st.error(f"API Connection Error: {e}")
123
- records = []
124
- except ValueError:
125
- st.error("Failed to parse API response as JSON")
126
- records = []
127
-
128
- # Extract selected metadata fields
129
- items = []
130
- for record in records:
131
- if isinstance(record, dict):
132
- description = record.get("description", "")
133
- if isinstance(description, list):
134
- description = " ".join([str(d) for d in description])
135
- item = {
136
- "id": record.get("id", ""),
137
- "title": record.get("title", ""),
138
- "date": record.get("date", ""),
139
- "subject": ", ".join(record.get("subject", [])) if isinstance(record.get("subject"), list) else record.get("subject", ""),
140
- "creator": record.get("creator", ""),
141
- "description": description
142
  }
143
- if not item["title"] and "item" in record:
144
- item["title"] = record.get("item", {}).get("title", "")
145
- if not item["date"] and "item" in record:
146
- item["date"] = record.get("item", {}).get("date", "")
147
- items.append(item)
148
-
149
- metadata_df = pd.DataFrame(items)
150
-
151
- # Utility functions for deeper metadata quality analysis
152
- def is_incomplete(value):
153
- return pd.isna(value) or value in ["", "N/A", "null", None]
154
-
155
- def is_valid_date(value):
156
- try:
157
- pd.to_datetime(value)
158
- return True
159
- except:
160
- return False
161
-
162
- if not metadata_df.empty:
163
- st.subheader("Retrieved Metadata Sample")
164
- st.dataframe(metadata_df.head())
165
-
166
- # Metadata completeness analysis (enhanced)
167
- st.subheader("Metadata Completeness Analysis")
168
- completeness = metadata_df.map(lambda x: not is_incomplete(x)).mean() * 100
169
- completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
170
- fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
171
- st.plotly_chart(fig)
172
-
173
- # Identify incomplete records
174
- incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
175
- incomplete_records = metadata_df[incomplete_mask]
176
-
177
- st.subheader("Records with Incomplete Metadata")
178
- if not incomplete_records.empty:
179
- st.dataframe(incomplete_records.astype(str))
180
- else:
181
- st.success("All metadata fields are complete in this collection!")
182
-
183
- st.subheader("Identifiers of Items Needing Metadata Updates")
184
- if not incomplete_records.empty:
185
- st.write(incomplete_records[['id', 'title']])
186
- else:
187
- st.success("All records are complete!")
188
-
189
- st.subheader("Suggested Metadata Enhancements")
190
- filled_descriptions = metadata_df[metadata_df['description'].notnull()]['description'].astype(str)
191
- if len(filled_descriptions) > 1:
192
  try:
193
- tfidf = TfidfVectorizer(stop_words='english')
194
- tfidf_matrix = tfidf.fit_transform(filled_descriptions)
195
- suggestions = []
196
- for idx, row in incomplete_records.iterrows():
197
- if pd.isna(row['subject']) and pd.notna(row['description']):
198
- desc_vec = tfidf.transform([str(row['description'])])
199
- sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
200
- top_idx = sims.argmax()
201
- suggested_subject = metadata_df.iloc[top_idx]['subject']
202
- if pd.notna(suggested_subject) and suggested_subject:
203
- suggestions.append((row['title'], suggested_subject))
204
- if suggestions:
205
- suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
206
- st.dataframe(suggestions_df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  else:
208
- st.info("No metadata enhancement suggestions available.")
209
- except Exception as e:
210
- st.error(f"Error generating metadata suggestions: {e}")
211
- else:
212
- st.info("Not enough descriptive data to generate metadata suggestions.")
213
- else:
214
- st.warning("No metadata records found for this collection. Try selecting another one.")
 
6
  import plotly.express as px
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sklearn.metrics.pairwise import cosine_similarity
 
9
 
10
  # Custom CSS for white background, styled sidebar, banner, and dark grey font
11
  st.markdown("""
 
36
  }
37
  .stAlert {
38
  background-color: #f0f0f5 !important;
39
+ color: #333333 !important;
40
  padding: 1.25rem !important;
41
  font-size: 1rem !important;
42
  border-radius: 0.5rem !important;
43
  box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05) !important;
44
  }
45
  header[data-testid="stHeader"] {
46
+ background-color: #D3D3D3 !important;
47
  }
48
  section[data-testid="stSidebar"] > div:first-child {
49
  background-color: #1A1A1A !important;
50
+ color: #FFFFFF !important;
51
  padding: 2rem 1.5rem 1.5rem 1.5rem !important;
52
  border-radius: 12px;
53
  box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
 
67
  </style>
68
  """, unsafe_allow_html=True)
69
 
70
+ # OPTION 1: Use an image from a URL for the banner
71
+ st.image("https://www.loc.gov/static/images/logo-loc-new-branding.svg", use_container_width=True)
 
72
 
73
  # Streamlit app header
74
+ st.title("MetaDiscovery Agent for Library of Congress Collections")
75
  st.markdown("""
76
  This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
77
  an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
 
87
 
88
  # Sidebar for selecting collection
89
  st.sidebar.markdown("## Settings")
90
+
91
+ # Add a key to the selectbox to ensure it refreshes properly
92
+ selected = st.sidebar.selectbox("Select a collection", list(collections.keys()), key="collection_selector")
93
  search_query = collections[selected]
94
 
95
  # Define the collection URL
 
99
  st.sidebar.write(f"Selected Collection: {selected}")
100
  st.sidebar.markdown(f"<span style='color: lightgray;'>API URL: {collection_url}</span>", unsafe_allow_html=True)
101
 
102
+ # Always fetch on every rerun; this flag is a placeholder where an explicit fetch button could be wired in
103
+ fetch_data = True
 
 
104
 
105
+ if fetch_data:
106
+ # Display a loading spinner while fetching data
107
+ with st.spinner(f"Fetching data for {selected}..."):
108
+ # Fetch data from LOC API with spoofed User-Agent header
109
+ headers = {
110
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/110.0.0.0 Safari/537.36"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  }
112
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  try:
114
+ response = requests.get(collection_url, headers=headers)
115
+ response.raise_for_status()
116
+ data = response.json()
117
+
118
+ if "results" in data:
119
+ records = data.get("results", [])
120
+ elif "items" in data:
121
+ records = data.get("items", [])
122
+ else:
123
+ records = []
124
+ st.error("Unexpected API response structure. No records found.")
125
+ st.write(f"Retrieved {len(records)} records")
126
+
127
+ except requests.exceptions.RequestException as e:
128
+ st.error(f"API Connection Error: {e}")
129
+ records = []
130
+ except ValueError:
131
+ st.error("Failed to parse API response as JSON")
132
+ records = []
133
+
134
+ # Extract selected metadata fields
135
+ items = []
136
+ for record in records:
137
+ if isinstance(record, dict):
138
+ description = record.get("description", "")
139
+ if isinstance(description, list):
140
+ description = " ".join([str(d) for d in description])
141
+ item = {
142
+ "id": record.get("id", ""),
143
+ "title": record.get("title", ""),
144
+ "date": record.get("date", ""),
145
+ "subject": ", ".join(record.get("subject", [])) if isinstance(record.get("subject"), list) else record.get("subject", ""),
146
+ "creator": record.get("creator", ""),
147
+ "description": description
148
+ }
149
+ if not item["title"] and "item" in record:
150
+ item["title"] = record.get("item", {}).get("title", "")
151
+ if not item["date"] and "item" in record:
152
+ item["date"] = record.get("item", {}).get("date", "")
153
+ items.append(item)
154
+
155
+ metadata_df = pd.DataFrame(items)
156
+
157
+ # Utility functions for deeper metadata quality analysis
158
+ def is_incomplete(value):
159
+ return pd.isna(value) or value in ["", "N/A", "null", None]
160
+
161
+ def is_valid_date(value):
162
+ try:
163
+ pd.to_datetime(value)
164
+ return True
165
+ except:
166
+ return False
167
+
168
+ if not metadata_df.empty:
169
+ st.subheader("📦 Retrieved Metadata Sample")
170
+ st.dataframe(metadata_df.head())
171
+
172
+ # Metadata completeness analysis (enhanced)
173
+ st.subheader("🧠 Metadata Completeness Analysis")
174
+ completeness = metadata_df.map(lambda x: not is_incomplete(x)).mean() * 100
175
+ completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
176
+ fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
177
+ st.plotly_chart(fig)
178
+
179
+ # Identify incomplete records
180
+ incomplete_mask = metadata_df.map(is_incomplete).any(axis=1)
181
+ incomplete_records = metadata_df[incomplete_mask]
182
+
183
+ st.subheader("⚠️ Records with Incomplete Metadata")
184
+ if not incomplete_records.empty:
185
+ st.dataframe(incomplete_records.astype(str))
186
+ else:
187
+ st.success("All metadata fields are complete in this collection!")
188
+
189
+ st.subheader("📌 Identifiers of Items Needing Metadata Updates")
190
+ if not incomplete_records.empty:
191
+ st.write(incomplete_records[['id', 'title']])
192
+ else:
193
+ st.success("All records are complete!")
194
+
195
+ st.subheader("✨ Suggested Metadata Enhancements")
196
+ filled_descriptions = metadata_df[metadata_df['description'].notnull()]['description'].astype(str)
197
+ if len(filled_descriptions) > 1:
198
+ try:
199
+ tfidf = TfidfVectorizer(stop_words='english')
200
+ tfidf_matrix = tfidf.fit_transform(filled_descriptions)
201
+ suggestions = []
202
+ for idx, row in incomplete_records.iterrows():
203
+ if pd.isna(row['subject']) and pd.notna(row['description']):
204
+ desc_vec = tfidf.transform([str(row['description'])])
205
+ sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
206
+ top_idx = sims.argmax()
207
+ suggested_subject = metadata_df.iloc[top_idx]['subject']
208
+ if pd.notna(suggested_subject) and suggested_subject:
209
+ suggestions.append((row['title'], suggested_subject))
210
+ if suggestions:
211
+ suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
212
+ st.dataframe(suggestions_df)
213
+ else:
214
+ st.info("No metadata enhancement suggestions available.")
215
+ except Exception as e:
216
+ st.error(f"Error generating metadata suggestions: {e}")
217
  else:
218
+ st.info("Not enough descriptive data to generate metadata suggestions.")
219
+ else:
220
+ st.warning("No metadata records found for this collection. Try selecting another one.")