CCockrum committed on
Commit
c39747a
·
verified ·
1 Parent(s): 90247f9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +183 -175
app.py CHANGED
@@ -1,213 +1,221 @@
1
- # MetaDiscovery Agent - LOC API with Enhanced Completeness and Quality Analysis
2
  import requests
3
  import pandas as pd
4
  import streamlit as st
 
5
  import plotly.express as px
6
  from sklearn.feature_extraction.text import TfidfVectorizer
7
  from sklearn.metrics.pairwise import cosine_similarity
8
- import matplotlib
9
 
10
- # --- CUSTOM CSS ---
11
  st.markdown("""
12
- <style>
13
- html, body, [data-testid="stApp"] {
14
- background-color: #1A1A1A !important;
15
- }
16
- .block-container {
17
- background-color: gray !important;
18
- color: #1A1A1A !important;
19
- padding-left: 2rem !important;
20
- padding-right: 2rem !important;
21
- }
22
- section[data-testid="stSidebar"] > div:first-child {
23
- background-color: #1A1A1A !important;
24
- color: #FFFFFF !important;
25
- padding: 2rem 1.5rem;
26
- border-radius: 12px;
27
- }
28
- .sidebar-contrast-block {
29
- background-color: #2b2b2b !important;
30
- padding: 1rem;
31
- border-radius: 10px;
32
- margin-top: 1.5rem;
33
- color: lightgray;
34
- }
35
- .custom-table {
36
- background-color: #D3D3D3;
37
- color: #1A1A1A;
38
- font-family: monospace;
39
- padding: 1rem;
40
- border-radius: 8px;
41
- overflow-x: auto;
42
- white-space: pre;
43
- border: 1px solid #ccc;
44
- }
45
- .sidebar-links a {
46
- color: lightgray !important;
47
- text-decoration: none !important;
48
- }
49
- .sidebar-links a:hover {
50
- text-decoration: underline !important;
51
- }
52
- </style>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  """, unsafe_allow_html=True)
54
 
55
- # --- HEADER ---
56
  st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
 
 
57
  st.title("MetaDiscovery Agent for Library of Congress Collections")
58
- st.markdown("This tool connects to the LOC API, retrieves metadata from a selected collection, and performs an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.")
 
 
 
59
 
60
- # --- COLLECTION SETUP ---
61
  collections = {
62
  "American Revolutionary War Maps": "american+revolutionary+war+maps",
63
  "Civil War Maps": "civil+war+maps",
64
  "Women's Suffrage": "women+suffrage",
65
  "World War I Posters": "world+war+posters"
66
  }
 
67
  selected = st.sidebar.selectbox("Select a collection", list(collections.keys()), key="collection_selector")
68
  search_query = collections[selected]
69
  collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"
70
 
71
- # --- PLACEHOLDERS FOR SIDEBAR BLOCKS ---
72
- stats_placeholder = st.sidebar.container()
73
- completeness_placeholder = st.sidebar.container()
74
 
75
- # --- HELPFUL RESOURCES ---
76
  st.sidebar.markdown("""
77
- <div class="sidebar-contrast-block">
78
- <h4>🔗 Helpful Resources</h4>
79
- <ul class="sidebar-links">
80
- <li><a href="https://www.loc.gov/apis/" target="_blank">LOC API Info</a></li>
81
- <li><a href="https://www.loc.gov/" target="_blank">Library of Congress Homepage</a></li>
82
- <li><a href="https://www.loc.gov/collections/" target="_blank">LOC Digital Collections</a></li>
83
- <li><a href="https://www.loc.gov/marc/" target="_blank">MARC Metadata Standards</a></li>
84
- <li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank">LOC Digital Strategy</a></li>
85
- </ul>
 
 
86
  </div>
87
  """, unsafe_allow_html=True)
88
 
89
- # Define Utility Functions
90
def is_incomplete(value):
    """Return True when a metadata cell should be counted as missing.

    A cell is missing when it is None, a pandas/NumPy null (NaN, NaT,
    pd.NA), an empty or whitespace-only string, one of the placeholder
    strings "N/A" / "null", or an empty container.

    Args:
        value: A single cell value of any type.

    Returns:
        bool: True if the value carries no usable metadata.
    """
    if value is None:
        return True
    if isinstance(value, str):
        return value.strip() in ("", "N/A", "null")
    if isinstance(value, (list, tuple, set, dict)):
        # pd.isna() on a container returns an array, whose truth value is
        # ambiguous; a container is simply "missing" when it is empty.
        return len(value) == 0
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # Array-likes with several elements cannot collapse to one bool;
        # they clearly hold data, so treat them as filled.
        return False
92
-
93
def is_valid_date(value):
    """Return True when *value* parses to an actual timestamp.

    None, blank strings and null values are rejected explicitly:
    ``pd.to_datetime`` does not raise for them, it yields a null result,
    so "no exception" alone does not mean "valid date".

    Args:
        value: A scalar (typically a string) or a list-like of scalars.

    Returns:
        bool: True if the value (or every element of a non-empty
        list-like) was parsed to a non-null timestamp.
    """
    # NOTE(review): pandas' default nanosecond timestamps may not be able to
    # represent years before 1677; such dates would be reported as invalid
    # here - confirm that is acceptable for historical collections.
    if value is None:
        return False
    if isinstance(value, str) and not value.strip():
        return False
    try:
        parsed = pd.to_datetime(value)
    except (ValueError, TypeError, OverflowError):
        return False
    present = pd.notna(parsed)
    if isinstance(present, bool):
        return present
    # List-like input: every element must have parsed to a real timestamp.
    return present.size > 0 and bool(present.all())
99
-
100
- # Fetch data from LOC API
101
def fetch_loc_data(collection_url):
    """Fetch one JSON response from the LOC API and return its record list.

    Args:
        collection_url: Fully formed URL that returns JSON.

    Returns:
        list: The value stored under "results" (preferred) or "items" in
        the response, or an empty list when neither key is present, the
        payload is not a JSON object, or the request fails. Failures are
        reported to the user through ``st.error`` instead of being raised.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # Without a timeout a stalled connection would block the app forever.
        response = requests.get(collection_url, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers an undecodable JSON body.
        st.error(f"API Error: {e}")
        return []

    if not isinstance(data, dict):
        return []
    for key in ("results", "items"):
        if key in data:
            # A present-but-null key is normalised to an empty list.
            return data[key] or []
    return []
116
-
117
- # Transform Records
118
def _join_list_values(value, sep):
    """Collapse a list/tuple into one string; return other values unchanged."""
    if isinstance(value, (list, tuple)):
        return sep.join(str(part) for part in value if part is not None)
    return value


def transform_records(records):
    """Flatten raw API records into a DataFrame with a fixed set of columns.

    Entries that are not dicts are skipped. List-valued fields are joined
    into a single string (descriptions with a space, everything else with
    ", ") so every cell is a scalar; list items are converted with
    ``str`` so non-string items no longer break the join.

    Args:
        records: Iterable of record dicts.

    Returns:
        pandas.DataFrame: One row per dict record with the columns
        id, title, date, subject, creator, description. The columns are
        present even when no rows were produced.
    """
    columns = ["id", "title", "date", "subject", "creator", "description"]
    rows = []
    for record in records:
        if not isinstance(record, dict):
            continue
        row = {
            name: _join_list_values(record.get(name, ""), ", ")
            for name in columns
            if name != "description"
        }
        row["description"] = _join_list_values(record.get("description", ""), " ")
        rows.append(row)
    return pd.DataFrame(rows, columns=columns)
135
-
136
- # Render Main Application Sections
137
def render_main_sections(metadata_df, stats_placeholder, completeness_placeholder):
    """Render the sidebar statistics and the main analysis panels.

    Args:
        metadata_df: DataFrame of records. Every column is scored for
            completeness; the "title", "subject" and "description" columns
            are read by name for the subject suggestions.
        stats_placeholder: Streamlit element whose ``markdown`` method
            receives the quick-stats HTML.
        completeness_placeholder: Streamlit container used as a context
            manager for the per-field completeness table. Three elements
            are written into it, so it must accept more than one.

    Returns:
        None. All output goes to the Streamlit page.
    """
    import html  # stdlib; local so this block carries its own dependency

    if metadata_df.empty:
        st.warning("No metadata found for the selected collection.")
        return

    # One boolean frame drives every statistic below (True = missing cell).
    # astype(bool) guards against an object-dtype result, on which `~`
    # would do integer bit-inversion instead of logical negation.
    missing = metadata_df.map(is_incomplete).astype(bool)
    filled = ~missing

    # Sidebar quick stats
    incomplete_mask = missing.any(axis=1)
    incomplete_count = incomplete_mask.sum()
    total_fields = metadata_df.size
    filled_fields = filled.sum().sum()
    overall_percent = (filled_fields / total_fields) * 100

    # Per-field completeness
    completeness = filled.mean() * 100
    completeness_table = completeness.round(1).to_frame(name="Completeness (%)")

    stats_html = f"""
<div class="sidebar-stats">
<h3 style="color: lightgray;">Quick Stats</h3>
<p style="color:lightgray;">Total Records: <b>{len(metadata_df)}</b></p>
<p style="color:lightgray;">Incomplete Records: <b>{incomplete_count}</b></p>
<p style="color:lightgray;">Overall Completeness: <b>{overall_percent:.1f}%</b></p>
</div>
"""
    stats_placeholder.markdown(stats_html, unsafe_allow_html=True)

    # Field completeness breakdown (sidebar)
    with completeness_placeholder:
        st.markdown("""
<div style='background-color:#2e2e2e; padding:1.25rem; border-radius:8px; margin-top:1.5rem;'>
<h4 style='color: lightgray;'>Field Completeness Breakdown</h4>
""", unsafe_allow_html=True)

        st.dataframe(
            completeness_table.style.background_gradient(cmap="Greens").format("{:.1f}%"),
            use_container_width=True
        )

        st.markdown("</div>", unsafe_allow_html=True)

    # Main body
    st.subheader("📂 Retrieved Metadata Sample")
    st.dataframe(metadata_df.head())

    st.subheader("📊 Metadata Completeness Analysis")
    completeness_df = completeness.reset_index()
    completeness_df.columns = ["Field", "Completeness (%)"]
    fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
    st.plotly_chart(fig)

    # Suggested metadata enhancements
    st.subheader("✨ Suggested Metadata Enhancements")
    # Missing fields are usually "" rather than NaN, so .notna() would keep
    # every row; use the same completeness rule as the statistics above.
    has_subject = ~metadata_df['subject'].map(is_incomplete).astype(bool)
    has_description = ~metadata_df['description'].map(is_incomplete).astype(bool)
    # Only records lacking a subject need one suggested; only records that
    # have a subject can donate one.
    incomplete_with_desc = metadata_df[~has_subject & has_description]
    reference_df = metadata_df[has_subject & has_description]

    if incomplete_with_desc.empty or reference_df.empty:
        st.info("Not enough data for metadata enhancement suggestions.")
        return

    try:
        tfidf = TfidfVectorizer(stop_words='english')
        tfidf_matrix = tfidf.fit_transform(reference_df['description'].astype(str))
        query_matrix = tfidf.transform(incomplete_with_desc['description'].astype(str))
        similarity = cosine_similarity(query_matrix, tfidf_matrix)

        suggestions = []
        for title, sims in zip(incomplete_with_desc['title'], similarity):
            top_idx = sims.argmax()
            if sims[top_idx] <= 0:
                # No shared vocabulary: argmax would just return row 0,
                # which is an arbitrary (meaningless) suggestion.
                continue
            suggestions.append((title, reference_df.iloc[top_idx]['subject']))

        if suggestions:
            suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
            # Titles and subjects come from the API; escape them before
            # embedding in raw HTML.
            table_text = html.escape(suggestions_df.to_markdown(index=False))
            st.markdown("<div class='custom-table'>" + table_text + "</div>", unsafe_allow_html=True)
        else:
            st.info("No suggestions could be generated.")
    except (ValueError, ImportError) as e:
        # ValueError: e.g. descriptions contain only stop words.
        # ImportError: to_markdown() needs the optional tabulate package.
        st.error(f"Error generating suggestions: {e}")
213
-
 
 
 
1
  import requests
2
  import pandas as pd
3
  import streamlit as st
4
+ import matplotlib
5
  import plotly.express as px
6
  from sklearn.feature_extraction.text import TfidfVectorizer
7
  from sklearn.metrics.pairwise import cosine_similarity
 
8
 
9
+ # ------------------- Custom CSS -------------------
10
  st.markdown("""
11
+ <style>
12
+ html, body, [data-testid="stApp"] {
13
+ background-color: #1A1A1A !important;
14
+ }
15
+ .main {
16
+ background-color: #D3D3D3 !important;
17
+ color: #1A1A1A!important;
18
+ }
19
+ .block-container {
20
+ background-color: gray !important;
21
+ color: #1A1A1A !important;
22
+ padding-left: 2rem !important;
23
+ padding-right: 2rem !important;
24
+ }
25
+ section[data-testid="stSidebar"] > div:first-child {
26
+ background-color: #1A1A1A !important;
27
+ color: #FFFFFF !important;
28
+ padding: 2rem 1.5rem 1.5rem 1.5rem !important;
29
+ border-radius: 12px;
30
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
31
+ font-size: 0.95rem;
32
+ }
33
+ .custom-table {
34
+ background-color: #D3D3D3;
35
+ color: #1A1A1A;
36
+ font-family: monospace;
37
+ padding: 1rem;
38
+ border-radius: 8px;
39
+ overflow-x: auto;
40
+ white-space: pre;
41
+ border: 1px solid #ccc;
42
+ }
43
+ .sidebar-stats {
44
+ color: lightgray !important;
45
+ font-size: 1.1rem !important;
46
+ font-weight: 600;
47
+ }
48
+ .sidebar-contrast-block {
49
+ background-color: #2b2b2b !important;
50
+ padding: 1.25rem;
51
+ border-radius: 10px;
52
+ margin-top: 1.5rem;
53
+ }
54
+ .sidebar-section h3 {
55
+ color: lightgray !important;
56
+ font-size: 1.1rem !important;
57
+ margin-top: 1.5rem;
58
+ }
59
+ .sidebar-links a {
60
+ color: lightgray !important;
61
+ text-decoration: none !important;
62
+ }
63
+ .sidebar-links a:hover {
64
+ text-decoration: underline !important;
65
+ }
66
+ </style>
67
  """, unsafe_allow_html=True)
68
 
69
+ # ------------------- Banner Image -------------------
70
  st.image("https://cdn-uploads.huggingface.co/production/uploads/67351c643fe51cb1aa28f2e5/7ThcAOjbuM8ajrP85bGs4.jpeg", use_container_width=True)
71
+
72
+ # ------------------- App Title & Description -------------------
73
  st.title("MetaDiscovery Agent for Library of Congress Collections")
74
+ st.markdown("""
75
+ This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
76
+ an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
77
+ """)
78
 
79
+ # ------------------- Collection Selection -------------------
80
  collections = {
81
  "American Revolutionary War Maps": "american+revolutionary+war+maps",
82
  "Civil War Maps": "civil+war+maps",
83
  "Women's Suffrage": "women+suffrage",
84
  "World War I Posters": "world+war+posters"
85
  }
86
+
87
  selected = st.sidebar.selectbox("Select a collection", list(collections.keys()), key="collection_selector")
88
  search_query = collections[selected]
89
  collection_url = f"https://www.loc.gov/search/?q={search_query}&fo=json"
90
 
91
+ # ------------------- Placeholders -------------------
92
+ stats_placeholder = st.sidebar.empty()
93
+ completeness_placeholder = st.sidebar.empty()
94
 
95
+ # ------------------- Helpful Resources -------------------
96
  st.sidebar.markdown("""
97
+ <div class="sidebar-section">
98
+ <h3>🔗 Helpful Resources</h3>
99
+ <div class="sidebar-links">
100
+ <ul style='padding-left: 1em'>
101
+ <li><a href="https://www.loc.gov/apis/" target="_blank">LOC API Info</a></li>
102
+ <li><a href="https://www.loc.gov/" target="_blank">Library of Congress Homepage</a></li>
103
+ <li><a href="https://www.loc.gov/collections/" target="_blank">LOC Digital Collections</a></li>
104
+ <li><a href="https://www.loc.gov/marc/" target="_blank">MARC Metadata Standards</a></li>
105
+ <li><a href="https://labs.loc.gov/about-labs/digital-strategy/" target="_blank">LOC Digital Strategy</a></li>
106
+ </ul>
107
+ </div>
108
  </div>
109
  """, unsafe_allow_html=True)
110
 
111
+ # ------------------- Fetch Data -------------------
112
# Download the selected collection's search results. `records` is always a
# list afterwards, so the rest of the script can run even when the request
# fails.
with st.spinner(f"Fetching data for {selected}..."):
    headers = {"User-Agent": "Mozilla/5.0"}
    records = []
    try:
        # Without a timeout a stalled connection would hang the app.
        response = requests.get(collection_url, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers an undecodable JSON body. Show the cause rather
        # than hiding it behind a generic message.
        st.error(f"Failed to load data from LOC API: {exc}")
    else:
        if isinstance(data, dict):
            records = data.get("results") or data.get("items") or []
122
+
123
+ # ------------------- Data Preparation -------------------
124
def _join_list_values(value, sep):
    """Collapse a list/tuple into one string; return other values unchanged."""
    if isinstance(value, (list, tuple)):
        return sep.join(str(part) for part in value if part is not None)
    return value


# Flatten each record into one row of scalar cells. List-valued fields are
# joined (descriptions with a space, everything else with ", ") so that the
# per-cell completeness check further down never receives a list.
items = []
for record in records:
    if not isinstance(record, dict):
        # Skip anything that is not a record object instead of crashing
        # on .get().
        continue
    items.append({
        "id": _join_list_values(record.get("id", ""), ", "),
        "title": _join_list_values(record.get("title", ""), ", "),
        "date": _join_list_values(record.get("date", ""), ", "),
        "subject": _join_list_values(record.get("subject", ""), ", "),
        "creator": _join_list_values(record.get("creator", ""), ", "),
        "description": _join_list_values(record.get("description", ""), " "),
    })

metadata_df = pd.DataFrame(items)
140
+
141
+ # ------------------- Completeness Logic -------------------
142
def is_incomplete(value):
    """Return True when a metadata cell should be counted as missing.

    A cell is missing when it is None, a pandas/NumPy null (NaN, NaT,
    pd.NA), an empty or whitespace-only string, one of the placeholder
    strings "N/A" / "null", or an empty container.

    Args:
        value: A single cell value of any type.

    Returns:
        bool: True if the value carries no usable metadata.
    """
    if value is None:
        return True
    if isinstance(value, str):
        return value.strip() in ("", "N/A", "null")
    if isinstance(value, (list, tuple, set, dict)):
        # pd.isna() on a container returns an array, whose truth value is
        # ambiguous; a container is simply "missing" when it is empty.
        return len(value) == 0
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # Array-likes with several elements cannot collapse to one bool;
        # they clearly hold data, so treat them as filled.
        return False
144
+
145
# Score metadata completeness, publish the sidebar widgets, and suggest
# subjects for records that lack one.
import html  # stdlib; used to escape API text before embedding it in HTML

if not metadata_df.empty:
    # One boolean frame drives every statistic below (True = missing cell).
    # astype(bool) guards against an object-dtype result, on which `~`
    # would do integer bit-inversion instead of logical negation.
    missing_cells = metadata_df.map(is_incomplete).astype(bool)
    filled_cells = ~missing_cells

    incomplete_mask = missing_cells.any(axis=1)
    incomplete_count = incomplete_mask.sum()
    total_fields = metadata_df.size
    filled_fields = filled_cells.sum().sum()
    overall_percent = (filled_fields / total_fields) * 100
    completeness = filled_cells.mean() * 100
    completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
    completeness_table = completeness_df.set_index("Field")

    # ------------------- Quick Stats -------------------
    stats_html = f"""
<div class="sidebar-stats">
<h3 style="color: lightgray;">📊 Quick Stats</h3>
<p style="color:lightgray;">Total Records: <b>{len(metadata_df)}</b></p>
<p style="color:lightgray;">Incomplete Records: <b>{incomplete_count}</b></p>
<p style="color:lightgray;">Overall Metadata Completeness: <b>{overall_percent:.1f}%</b></p>
</div>
"""
    stats_placeholder.markdown(stats_html, unsafe_allow_html=True)

    # ------------------- Field Completeness Table -------------------
    # A single-element placeholder (st.empty) keeps only the last element
    # written to it, which would leave just the closing </div> visible.
    # Opening a container inside it lets all three elements render.
    with completeness_placeholder.container():
        st.markdown("""
<div style='
background-color: #2e2e2e;
padding: 1.2rem;
border-radius: 10px;
margin-top: 1.5rem;
color: lightgray;
'>
<h4 style='margin-bottom: 1rem;'>Field Completeness Breakdown</h4>
""", unsafe_allow_html=True)
        st.dataframe(
            completeness_table.style.background_gradient(cmap="Greens").format("{:.1f}%"),
            use_container_width=True,
            height=240
        )
        st.markdown("</div>", unsafe_allow_html=True)

    # ------------------- Main Panel -------------------
    st.subheader("Retrieved Metadata Sample")
    st.dataframe(metadata_df.head())

    st.subheader("Metadata Completeness Analysis")
    fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
    st.plotly_chart(fig)

    # ------------------- Metadata Suggestions -------------------
    st.subheader(" Suggested Metadata Enhancements")
    # Missing fields are usually "" rather than NaN, so notnull()/isna()
    # never match them; reuse the completeness rule instead.
    has_subject = ~metadata_df['subject'].map(is_incomplete).astype(bool)
    has_description = ~metadata_df['description'].map(is_incomplete).astype(bool)
    # Only records lacking a subject need one suggested; only records that
    # have a subject can donate one.
    incomplete_with_desc = metadata_df[~has_subject & has_description]
    reference_df = metadata_df[has_subject & has_description]

    # One record on each side is enough to compute a similarity.
    if not incomplete_with_desc.empty and not reference_df.empty:
        try:
            tfidf = TfidfVectorizer(stop_words='english')
            tfidf_matrix = tfidf.fit_transform(reference_df['description'].astype(str))
            query_matrix = tfidf.transform(incomplete_with_desc['description'].astype(str))
            similarity = cosine_similarity(query_matrix, tfidf_matrix)

            suggestions = []
            for title, sims in zip(incomplete_with_desc['title'], similarity):
                top_idx = sims.argmax()
                if sims[top_idx] <= 0:
                    # No shared vocabulary: argmax would just return row 0,
                    # which is an arbitrary (meaningless) suggestion.
                    continue
                suggestions.append((title, reference_df.iloc[top_idx]['subject']))

            if suggestions:
                suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
                # Titles and subjects come from the API; escape them before
                # embedding in raw HTML.
                table_text = html.escape(suggestions_df.to_markdown(index=False))
                st.markdown("<div class='custom-table'>" + table_text + "</div>", unsafe_allow_html=True)
            else:
                st.info("No metadata enhancement suggestions available.")
        except Exception as e:
            # UI boundary: report the problem in the page instead of
            # aborting the whole script run.
            st.error(f"Error generating suggestions: {e}")
    else:
        st.info("Not enough descriptive data to generate metadata suggestions.")
else:
    st.warning("⚠️ No metadata records found for this collection.")