CCockrum committed on
Commit
1b02b65
·
verified ·
1 Parent(s): ce53185

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -36
app.py CHANGED
@@ -347,42 +347,73 @@ if fetch_data:
347
 
348
  st.subheader("Suggested Metadata Enhancements")
349
 
350
- incomplete_with_desc = incomplete_records[incomplete_records['description'].notnull()]
351
- reference_df = metadata_df[metadata_df['subject'].notnull() & metadata_df['description'].notnull()]
352
- tfidf = TfidfVectorizer(stop_words='english')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
353
 
354
- if len(incomplete_with_desc) > 1 and len(reference_df) > 1:
355
- try:
356
- suggestions = []
357
- tfidf_matrix = tfidf.fit_transform(reference_df['description'])
 
 
 
 
 
 
 
 
 
 
 
 
 
358
 
359
- for idx, row in incomplete_with_desc.iterrows():
360
- if pd.isna(row['subject']) and pd.notna(row['description']):
361
- desc_vec = tfidf.transform([str(row['description'])])
362
- sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
363
- top_idx = sims.argmax()
364
- suggested_subject = reference_df.iloc[top_idx]['subject']
365
- if pd.notna(suggested_subject) and suggested_subject:
366
- suggestions.append((row['title'], suggested_subject))
367
-
368
- if suggestions:
369
- suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
370
- st.markdown("<div class='custom-table'>" + suggestions_df.to_markdown(index=False) + "</div>", unsafe_allow_html=True)
371
- else:
372
- st.markdown("""
373
- <div class='custom-table'>
374
- <b>No metadata enhancement suggestions available.</b>
375
- </div>
376
- """, unsafe_allow_html=True)
377
-
378
- except Exception as e:
379
- st.error(f"Error generating metadata suggestions: {e}")
380
- else:
381
- st.markdown("""
382
- <div class='custom-table'>
383
- <b>Not enough descriptive data to generate metadata suggestions.</b>
384
- </div>
385
- """, unsafe_allow_html=True)
386
  else:
387
- st.warning("No metadata records found for this collection. Try selecting another one.")
388
-
 
 
 
 
 
 
 
 
 
 
 
347
 
348
  st.subheader("Suggested Metadata Enhancements")
349
 
350
+ # Look for records that have a description but are missing a subject or creator
351
+ incomplete_with_desc = metadata_df[metadata_df['description'].notnull() &
352
+ (metadata_df['subject'].isnull() |
353
+ metadata_df['creator'].isnull())]
354
+
355
+ # Reference data should be complete records with subjects, descriptions, and creators
356
+ reference_df = metadata_df[metadata_df['subject'].notnull() &
357
+ metadata_df['description'].notnull() &
358
+ metadata_df['creator'].notnull()]
359
+
360
+ # Print debugging info
361
+ st.write(f"Records with descriptions but missing fields: {len(incomplete_with_desc)}")
362
+ st.write(f"Complete reference records: {len(reference_df)}")
363
+
364
+ tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
365
+
366
+ if len(incomplete_with_desc) > 0 and len(reference_df) > 0:
367
+ try:
368
+ suggestions = []
369
+ # Fit TF-IDF on all complete descriptions
370
+ tfidf_matrix = tfidf.fit_transform(reference_df['description'].fillna('').astype(str))
371
 
372
+ # For each incomplete record
373
+ for idx, row in incomplete_with_desc.iterrows():
374
+ if pd.notna(row['description']):
375
+ # Transform this record's description
376
+ desc_vec = tfidf.transform([str(row['description'])])
377
+
378
+ # Get similarity scores to all reference records
379
+ sims = cosine_similarity(desc_vec, tfidf_matrix).flatten()
380
+
381
+ # Find the top 3 most similar records
382
+ top_indices = sims.argsort()[-3:][::-1]
383
+
384
+ # Get the most frequent subject among top matches
385
+ top_subjects = reference_df.iloc[top_indices]['subject'].value_counts().index
386
+ if len(top_subjects) > 0:
387
+ suggested_subject = top_subjects[0]
388
+ suggestions.append((row['title'], suggested_subject))
389
 
390
+ if suggestions:
391
+ suggestions_df = pd.DataFrame(suggestions, columns=["Title", "Suggested Subject"])
392
+
393
+ # Apply similar styling as your other tables
394
+ styled_suggestions = (
395
+ suggestions_df.style
396
+ .background_gradient(cmap="Greens", subset=["Suggested Subject"])
397
+ .hide(axis="index")
398
+ )
399
+
400
+ # Display as a dataframe with styling
401
+ st.dataframe(
402
+ styled_suggestions,
403
+ use_container_width=True,
404
+ hide_index=True,
405
+ height=min(240, len(suggestions) * 35 + 38)
406
+ )
 
 
 
 
 
 
 
 
 
 
407
  else:
408
+ empty_df = pd.DataFrame([["No metadata enhancement suggestions available."]],
409
+ columns=["Message"])
410
+ styled_empty = empty_df.style.hide(axis="index")
411
+ st.dataframe(styled_empty, use_container_width=True, hide_index=True)
412
+ except Exception as e:
413
+ st.error(f"Error generating metadata suggestions: {e}")
414
+ st.error(f"Error details: {str(e)}")
415
+ else:
416
+ empty_df = pd.DataFrame([["Not enough descriptive data to generate metadata suggestions."]],
417
+ columns=["Message"])
418
+ styled_empty = empty_df.style.hide(axis="index")
419
+ st.dataframe(styled_empty, use_container_width=True, hide_index=True)