Spaces:

CCockrum
/

LOC-Metadate-Analyzer

Running

App Files Community

CCockrum committed on Apr 26

Commit

1b02b65

verified ·

1 Parent(s): ce53185

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -36

app.py CHANGED Viewed

@@ -347,42 +347,73 @@ if fetch_data:
             st.subheader("Suggested Metadata Enhancements")
-            incomplete_with_desc = incomplete_records[incomplete_records['description'].notnull()]
-            reference_df = metadata_df[metadata_df['subject'].notnull() & metadata_df['description'].notnull()]
-            tfidf = TfidfVectorizer(stop_words='english')
-            if len(incomplete_with_desc) > 1 and len(reference_df) > 1:
-                try:
-                    suggestions = []
-                    tfidf_matrix = tfidf.fit_transform(reference_df['description'])
-                    for idx, row in incomplete_with_desc.iterrows():
-                        if pd.isna(row['subject']) and pd.notna(row['description']):
-                            desc_vec = tfidf.transform([str(row['description'])])
-                            sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
-                            top_idx = sims.argmax()
-                            suggested_subject = reference_df.iloc[top_idx]['subject']
-                            if pd.notna(suggested_subject) and suggested_subject:
-                                suggestions.append((row['title'], suggested_subject))
-                    if suggestions:
-                        suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
-                        st.markdown("<div class='custom-table'>" + suggestions_df.to_markdown(index=False) + "</div>", unsafe_allow_html=True)
-                    else:
-                        st.markdown("""
-                            <div class='custom-table'>
-                            <b>No metadata enhancement suggestions available.</b>
-                            </div>
-                        """, unsafe_allow_html=True)
-                except Exception as e:
-                    st.error(f"Error generating metadata suggestions: {e}")
-            else:
-                st.markdown("""
-                    <div class='custom-table'>
-                    <b>Not enough descriptive data to generate metadata suggestions.</b>
-                    </div>
-                    """, unsafe_allow_html=True)
         else:
-            st.warning("No metadata records found for this collection. Try selecting another one.")

             st.subheader("Suggested Metadata Enhancements")
+# Look for records with descriptions but missing subjects or other fields
+incomplete_with_desc = metadata_df[metadata_df['description'].notnull() &
+                                  (metadata_df['subject'].isnull() |
+                                   metadata_df['creator'].isnull())]
+# Reference data should be complete records with both subjects and descriptions
+reference_df = metadata_df[metadata_df['subject'].notnull() &
+                          metadata_df['description'].notnull() &
+                          metadata_df['creator'].notnull()]
+# Print debugging info
+st.write(f"Records with descriptions but missing fields: {len(incomplete_with_desc)}")
+st.write(f"Complete reference records: {len(reference_df)}")
+tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
+if len(incomplete_with_desc) > 0 and len(reference_df) > 0:
+    try:
+        suggestions = []
+        # Fit TF-IDF on all complete descriptions
+        tfidf_matrix = tfidf.fit_transform(reference_df['description'].fillna('').astype(str))
+        # For each incomplete record
+        for idx, row in incomplete_with_desc.iterrows():
+            if pd.notna(row['description']):
+                # Transform this record's description
+                desc_vec = tfidf.transform([str(row['description'])])
+                # Get similarity scores to all reference records
+                sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
+                # Find the top 3 most similar records
+                top_indices = sims.argsort()[-3:][::-1]
+                # Get the most frequent subject among top matches
+                top_subjects = reference_df.iloc[top_indices]['subject'].value_counts().index
+                if len(top_subjects) > 0:
+                    suggested_subject = top_subjects[0]
+                    suggestions.append((row['title'], suggested_subject))
+        if suggestions:
+            suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
+            # Apply similar styling as your other tables
+            styled_suggestions = (
+                suggestions_df.style
+                .background_gradient(cmap="Greens", subset=["Suggested Subject"])
+                .hide(axis="index")
+            )
+            # Display as a dataframe with styling
+            st.dataframe(
+                styled_suggestions,
+                use_container_width=True,
+                hide_index=True,
+                height=min(240, len(suggestions) * 35 + 38)
+            )
         else:
+            empty_df = pd.DataFrame([["No metadata enhancement suggestions available."]],
+                                   columns=["Message"])
+            styled_empty = empty_df.style.hide(axis="index")
+            st.dataframe(styled_empty, use_container_width=True, hide_index=True)
+    except Exception as e:
+        st.error(f"Error generating metadata suggestions: {e}")
+        st.error(f"Error details: {str(e)}")
+else:
+    empty_df = pd.DataFrame([["Not enough descriptive data to generate metadata suggestions."]],
+                           columns=["Message"])
+    styled_empty = empty_df.style.hide(axis="index")
+    st.dataframe(styled_empty, use_container_width=True, hide_index=True)