CCockrum committed on
Commit 61165d4 · verified · 1 Parent(s): 21b5793

Update app.py

Files changed (1)
app.py +88 -47
app.py CHANGED
@@ -1,7 +1,7 @@
 # MetaDiscovery Agent - LOC API with Collection Selector and Search Endpoint + Enhanced Features
-
 import requests
 import pandas as pd
+import numpy as np
 import streamlit as st
 import plotly.express as px
 from sklearn.feature_extraction.text import TfidfVectorizer
@@ -18,7 +18,7 @@ an analysis of metadata completeness, suggests enhancements, and identifies auth
 collections = {
     "American Revolutionary War Maps": "american-revolutionary-war-maps",
     "Civil War Maps": "civil-war-maps",
-    "Women’s Suffrage": "womens-suffrage",
+    "Women's Suffrage": "womens-suffrage",
     "World War I Posters": "world-war-i-posters"
 }
 
@@ -27,28 +27,57 @@ st.sidebar.markdown("## Settings")
 selected = st.sidebar.selectbox("Select a collection", list(collections.keys()))
 collection_path = collections[selected]
 
-# Updated: Use LOC Search API with partof filter (URL encoding for colon)
-collection_url = f"https://www.loc.gov/search/?q=&fa=partof%3A{collection_path}&fo=json"
+# Corrected LOC API URL format
+collection_url = f"https://www.loc.gov/{collection_path}/?fo=json"
 st.sidebar.write(f"Selected Collection: {selected}")
-
-# Fetch data from LOC API
-response = requests.get(collection_url)
-data = response.json()
-
-# Parse metadata records
-records = data.get("results", [])
-
-# Extract selected metadata fields
+st.sidebar.write(f"API URL: {collection_url}")
+
+# Fetch data from LOC API with error handling
+try:
+    response = requests.get(collection_url)
+    response.raise_for_status()  # Raise exception for 4XX/5XX responses
+    data = response.json()
+
+    # Handle both possible response structures
+    if "results" in data:
+        records = data.get("results", [])
+    elif "items" in data:
+        records = data.get("items", [])
+    else:
+        records = []
+        st.error("Unexpected API response structure. No records found.")
+
+    st.write(f"Retrieved {len(records)} records")
+
+except requests.exceptions.RequestException as e:
+    st.error(f"API Connection Error: {e}")
+    records = []
+except ValueError:
+    st.error("Failed to parse API response as JSON")
+    records = []
+
+# Extract selected metadata fields with proper path traversal
 items = []
 for record in records:
-    items.append({
-        "id": record.get("id"),
-        "title": record.get("title"),
-        "date": record.get("date"),
-        "subject": record.get("subject"),
-        "creator": record.get("creator"),
-        "description": record.get("description")
-    })
+    # Handle different possible data structures
+    if isinstance(record, dict):
+        # For direct field access
+        item = {
+            "id": record.get("id", ""),
+            "title": record.get("title", ""),
+            "date": record.get("date", ""),
+            "subject": ", ".join(record.get("subject", [])) if isinstance(record.get("subject"), list) else record.get("subject", ""),
+            "creator": record.get("creator", ""),
+            "description": record.get("description", "")
+        }
+
+        # For nested field access (common in LOC API)
+        if not item["title"] and "item" in record:
+            item["title"] = record.get("item", {}).get("title", "")
+        if not item["date"] and "item" in record:
+            item["date"] = record.get("item", {}).get("date", "")
+
+        items.append(item)
 
 # Create DataFrame
 metadata_df = pd.DataFrame(items)
@@ -56,49 +85,61 @@ metadata_df = pd.DataFrame(items)
 if not metadata_df.empty:
     st.subheader("📦 Retrieved Metadata Sample")
     st.dataframe(metadata_df.head())
-
+
     # Metadata completeness analysis
     st.subheader("🧠 Metadata Completeness Analysis")
     completeness = metadata_df.notnull().mean() * 100
     completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
-
+
     # Plot completeness
     fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
     st.plotly_chart(fig)
-
+
     # List records with missing values
     st.subheader("⚠️ Records with Incomplete Metadata")
     incomplete_records = metadata_df[metadata_df.isnull().any(axis=1)]
-    st.dataframe(incomplete_records)
-
+    if not incomplete_records.empty:
+        st.dataframe(incomplete_records)
+    else:
+        st.success("All metadata fields are complete in this collection!")
+
     # Show exact items that need updates
     st.subheader("📌 Identifiers of Items Needing Metadata Updates")
     if not incomplete_records.empty:
         st.write(incomplete_records[['id', 'title']])
     else:
         st.success("All records are complete!")
-
-    # Suggest metadata using text similarity (basic example)
+
+    # Suggest metadata using text similarity with better error handling
    st.subheader("✨ Suggested Metadata Enhancements")
+
+    # Only process if we have descriptions and enough data
     filled_descriptions = metadata_df[metadata_df['description'].notnull()]['description'].astype(str)
-    tfidf = TfidfVectorizer(stop_words='english')
-    tfidf_matrix = tfidf.fit_transform(filled_descriptions)
-    sim_matrix = cosine_similarity(tfidf_matrix)
-
-    suggestions = []
-    for idx, row in incomplete_records.iterrows():
-        if pd.isna(row['subject']) and pd.notna(row['description']):
-            desc_vec = tfidf.transform([str(row['description'])])
-            sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
-            top_idx = sims.argmax()
-            suggested_subject = metadata_df.iloc[top_idx]['subject']
-            suggestions.append((row['title'], suggested_subject))
-
-    if suggestions:
-        suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
-        st.dataframe(suggestions_df)
+
+    if len(filled_descriptions) > 1:
+        try:
+            tfidf = TfidfVectorizer(stop_words='english')
+            tfidf_matrix = tfidf.fit_transform(filled_descriptions)
+            sim_matrix = cosine_similarity(tfidf_matrix)
+
+            suggestions = []
+            for idx, row in incomplete_records.iterrows():
+                if pd.isna(row['subject']) and pd.notna(row['description']):
+                    desc_vec = tfidf.transform([str(row['description'])])
+                    sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
+                    top_idx = sims.argmax()
+                    suggested_subject = metadata_df.iloc[top_idx]['subject']
+                    if pd.notna(suggested_subject) and suggested_subject:  # Only add valid suggestions
+                        suggestions.append((row['title'], suggested_subject))
+
+            if suggestions:
+                suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
+                st.dataframe(suggestions_df)
+            else:
+                st.info("No metadata enhancement suggestions available.")
+        except Exception as e:
+            st.error(f"Error generating metadata suggestions: {e}")
     else:
-        st.info("No metadata enhancement suggestions available.")
-
+        st.info("Not enough descriptive data to generate metadata suggestions.")
 else:
-    st.warning("No metadata records found for this collection. Try selecting another one.")
+    st.warning("No metadata records found for this collection. Try selecting another one.")
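
The main change in this commit is the collection URL and the "results"/"items" fallback when parsing the response. A quick way to sanity-check what the endpoint actually returns is a standalone probe outside Streamlit. This is a minimal sketch, assuming the loc.gov JSON conventions (?fo=json, a top-level "results" list); the /collections/{slug}/ path used here is the documented collection route and may differ from the URL app.py builds from collection_path alone, so treat it as an assumption to verify.

# Standalone probe of a LOC collection endpoint (sketch, not part of app.py).
# Assumes the ?fo=json convention and a top-level "results" list; the
# /collections/{slug}/ path is an assumption and may differ from the commit's URL.
import requests

def fetch_loc_records(slug: str, limit: int = 5):
    url = f"https://www.loc.gov/collections/{slug}/?fo=json"
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    data = resp.json()
    # Fall back to "items" the same way app.py does, in case the shape differs.
    records = data.get("results") or data.get("items") or []
    return records[:limit]

if __name__ == "__main__":
    for rec in fetch_loc_records("civil-war-maps"):
        print(rec.get("id"), "-", rec.get("title"))

Printing a handful of ids and titles this way makes it easy to confirm which top-level key the selected collection actually uses before relying on the Streamlit page.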
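The suggestion step pairs each record that has a description but no subject with the most similar described record (TF-IDF vectors, cosine similarity) and borrows that record's subject. A toy, self-contained version of that nearest-neighbour lookup is easier to follow than the Streamlit-wrapped block above; the records here are invented examples, not LOC data, and the sketch indexes back into the described subset rather than into the full DataFrame, a small variation on the commit's indexing.

# Toy illustration of the TF-IDF / cosine-similarity subject suggestion
# used in app.py; the rows below are made-up examples, not LOC records.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

df = pd.DataFrame([
    {"title": "Map A", "subject": "Battles",   "description": "Battle lines and troop positions"},
    {"title": "Map B", "subject": "Railroads", "description": "Railroad routes and stations"},
    {"title": "Map C", "subject": None,        "description": "Troop positions near the river"},
])

# Fit only on records that already have both a description and a subject.
described = df[df["description"].notnull() & df["subject"].notnull()]
tfidf = TfidfVectorizer(stop_words="english")
matrix = tfidf.fit_transform(described["description"])

# For each record missing a subject, borrow the subject of the most similar
# described record.
for _, row in df[df["subject"].isnull()].iterrows():
    vec = tfidf.transform([row["description"]])
    best = cosine_similarity(vec, matrix).flatten().argmax()
    print(row["title"], "->", described.iloc[best]["subject"])
    # Expected output: Map C -> Battles

Restricting the fitted matrix to records that already carry a subject avoids suggesting a blank value, which is the same failure the commit guards against with the pd.notna(suggested_subject) check.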