CCockrum committed on
Commit
91c3d7f
·
verified ·
1 Parent(s): 083533c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -20
app.py CHANGED
@@ -1,15 +1,17 @@
1
- # MetaDiscovery Agent - Phase 1: LOC API Integration and Metadata Gap Analysis
2
 
3
  import requests
4
  import pandas as pd
5
  import streamlit as st
6
  import plotly.express as px
 
 
7
 
8
  # Streamlit app header
9
  st.title("MetaDiscovery Agent for Library of Congress Collections")
10
  st.markdown("""
11
  This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
12
- an initial analysis of metadata completeness.
13
  """)
14
 
15
  # Predefined LOC collections
@@ -24,9 +26,9 @@ collections = {
24
  st.sidebar.markdown("## Settings")
25
  selected = st.sidebar.selectbox("Select a collection", list(collections.keys()))
26
  collection_path = collections[selected]
27
- collection_url = f"https://www.loc.gov/collections/{collection_path}/?fo=json"
28
 
29
- # Display selected collection
 
30
  st.sidebar.write(f"Selected Collection: {selected}")
31
 
32
  # Fetch data from LOC API
@@ -49,19 +51,47 @@ for record in records:
49
 
50
  # Create DataFrame
51
  metadata_df = pd.DataFrame(items)
52
- st.subheader("📦 Retrieved Metadata Sample")
53
- st.dataframe(metadata_df.head())
54
-
55
- # Metadata completeness analysis
56
- st.subheader("🧠 Metadata Completeness Analysis")
57
- completeness = metadata_df.notnull().mean() * 100
58
- completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
59
-
60
- # Plot completeness
61
- fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
62
- st.plotly_chart(fig)
63
-
64
- # List records with missing values
65
- st.subheader("⚠️ Records with Incomplete Metadata")
66
- incomplete_records = metadata_df[metadata_df.isnull().any(axis=1)]
67
- st.dataframe(incomplete_records)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MetaDiscovery Agent - LOC API with Collection Selector and Search Endpoint + Enhanced Features
2
 
3
  import requests
4
  import pandas as pd
5
  import streamlit as st
6
  import plotly.express as px
7
+ from sklearn.feature_extraction.text import TfidfVectorizer
8
+ from sklearn.metrics.pairwise import cosine_similarity
9
 
10
  # Streamlit app header
11
  st.title("MetaDiscovery Agent for Library of Congress Collections")
12
  st.markdown("""
13
  This tool connects to the LOC API, retrieves metadata from a selected collection, and performs
14
+ an analysis of metadata completeness, suggests enhancements, and identifies authority gaps.
15
  """)
16
 
17
  # Predefined LOC collections
 
26
  st.sidebar.markdown("## Settings")
27
  selected = st.sidebar.selectbox("Select a collection", list(collections.keys()))
28
  collection_path = collections[selected]
 
29
 
30
+ # Updated: Use LOC Search API with partof filter
31
+ collection_url = f"https://www.loc.gov/search/?q=&fa=partof:{collection_path}&fo=json"
32
  st.sidebar.write(f"Selected Collection: {selected}")
33
 
34
  # Fetch data from LOC API
 
51
 
52
  # Create DataFrame
53
  metadata_df = pd.DataFrame(items)
54
+
55
+ if not metadata_df.empty:
56
+ st.subheader("📦 Retrieved Metadata Sample")
57
+ st.dataframe(metadata_df.head())
58
+
59
+ # Metadata completeness analysis
60
+ st.subheader("🧠 Metadata Completeness Analysis")
61
+ completeness = metadata_df.notnull().mean() * 100
62
+ completeness_df = pd.DataFrame({"Field": completeness.index, "Completeness (%)": completeness.values})
63
+
64
+ # Plot completeness
65
+ fig = px.bar(completeness_df, x="Field", y="Completeness (%)", title="Metadata Completeness by Field")
66
+ st.plotly_chart(fig)
67
+
68
+ # List records with missing values
69
+ st.subheader("⚠️ Records with Incomplete Metadata")
70
+ incomplete_records = metadata_df[metadata_df.isnull().any(axis=1)]
71
+ st.dataframe(incomplete_records)
72
+
73
+ # Suggest metadata using text similarity (basic example)
74
+ st.subheader("✨ Suggested Metadata Enhancements")
75
+ filled_descriptions = metadata_df[metadata_df['description'].notnull()]['description'].astype(str)
76
+ tfidf = TfidfVectorizer(stop_words='english')
77
+ tfidf_matrix = tfidf.fit_transform(filled_descriptions)
78
+ sim_matrix = cosine_similarity(tfidf_matrix)
79
+
80
+ suggestions = []
81
+ for idx, row in incomplete_records.iterrows():
82
+ if pd.isna(row['subject']) and pd.notna(row['description']):
83
+ # Find most similar description
84
+ desc_vec = tfidf.transform([str(row['description'])])
85
+ sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
86
+ top_idx = sims.argmax()
87
+ suggested_subject = metadata_df.iloc[top_idx]['subject']
88
+ suggestions.append((row['title'], suggested_subject))
89
+
90
+ if suggestions:
91
+ suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
92
+ st.dataframe(suggestions_df)
93
+ else:
94
+ st.info("No metadata enhancement suggestions available.")
95
+
96
+ else:
97
+ st.warning("No metadata records found for this collection. Try selecting another one.")