Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -347,42 +347,73 @@ if fetch_data:
|
|
347 |
|
348 |
st.subheader("Suggested Metadata Enhancements")
|
349 |
|
350 |
-
|
351 |
-
|
352 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
353 |
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
358 |
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
""", unsafe_allow_html=True)
|
377 |
-
|
378 |
-
except Exception as e:
|
379 |
-
st.error(f"Error generating metadata suggestions: {e}")
|
380 |
-
else:
|
381 |
-
st.markdown("""
|
382 |
-
<div class='custom-table'>
|
383 |
-
<b>Not enough descriptive data to generate metadata suggestions.</b>
|
384 |
-
</div>
|
385 |
-
""", unsafe_allow_html=True)
|
386 |
else:
|
387 |
-
|
388 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
347 |
|
348 |
st.subheader("Suggested Metadata Enhancements")
|
349 |
|
350 |
+
# Suggest a subject for records that have a description but no subject, by
# borrowing the most common subject among the most textually similar complete
# records (TF-IDF vectors of the descriptions, compared by cosine similarity).
# Relies on `metadata_df`, `st`, `pd`, `TfidfVectorizer` and
# `cosine_similarity` being in scope from earlier in the file.

# Records with a description but missing a subject or a creator.
incomplete_with_desc = metadata_df[
    metadata_df['description'].notnull()
    & (metadata_df['subject'].isnull() | metadata_df['creator'].isnull())
]

# Reference data: complete records with subject, description and creator.
reference_df = metadata_df[
    metadata_df['subject'].notnull()
    & metadata_df['description'].notnull()
    & metadata_df['creator'].notnull()
]

# NOTE(review): debugging output shown to every user; consider removing it
# once the suggestion logic has been verified.
st.write(f"Records with descriptions but missing fields: {len(incomplete_with_desc)}")
st.write(f"Complete reference records: {len(reference_df)}")

tfidf = TfidfVectorizer(stop_words='english', max_features=1000)

if len(incomplete_with_desc) > 0 and len(reference_df) > 0:
    try:
        suggestions = []

        # Fit TF-IDF on the descriptions of all complete (reference) records.
        tfidf_matrix = tfidf.fit_transform(
            reference_df['description'].fillna('').astype(str)
        )

        # Only a subject is ever suggested, so skip records that already have
        # one (they are in `incomplete_with_desc` because `creator` is missing).
        needs_subject = incomplete_with_desc[incomplete_with_desc['subject'].isnull()]

        for _, row in needs_subject.iterrows():
            # Project this record's description into the fitted TF-IDF space.
            desc_vec = tfidf.transform([str(row['description'])])

            # Similarity of this record to every reference record.
            sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()

            # Up to 3 most similar reference records, most similar first.
            top_indices = sims.argsort()[-3:][::-1]

            # Drop zero-similarity "matches": with no shared vocabulary the
            # argsort order is arbitrary and any suggestion would be noise.
            top_indices = top_indices[sims[top_indices] > 0]
            if len(top_indices) == 0:
                continue

            # Most frequent subject among the top matches.
            top_subjects = reference_df.iloc[top_indices]['subject'].value_counts().index
            if len(top_subjects) > 0:
                suggestions.append((row['title'], top_subjects[0]))

        if suggestions:
            suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])

            # `background_gradient` only works on numeric columns and raises
            # on the string-valued "Suggested Subject" column, so highlight it
            # with a fixed light-green background instead.
            styled_suggestions = (
                suggestions_df.style
                .set_properties(
                    subset=["Suggested Subject"],
                    **{"background-color": "#e5f5e0"},
                )
                .hide(axis="index")
            )

            # Cap the table height; 35 px per row plus 38 px for the header.
            st.dataframe(
                styled_suggestions,
                use_container_width=True,
                hide_index=True,
                height=min(240, len(suggestions) * 35 + 38),
            )
        else:
            empty_df = pd.DataFrame(
                [["No metadata enhancement suggestions available."]],
                columns=["Message"],
            )
            styled_empty = empty_df.style.hide(axis="index")
            st.dataframe(styled_empty, use_container_width=True, hide_index=True)
    except Exception as e:
        # UI boundary: report the failure once instead of crashing the page.
        st.error(f"Error generating metadata suggestions: {e}")
else:
    empty_df = pd.DataFrame(
        [["Not enough descriptive data to generate metadata suggestions."]],
        columns=["Message"],
    )
    styled_empty = empty_df.style.hide(axis="index")
    st.dataframe(styled_empty, use_container_width=True, hide_index=True)
|