Update streamlit_app.py
streamlit_app.py CHANGED (+311 -212)
@@ -212,8 +212,8 @@ def main():
     st.title("📊 Complexity Metrics Explorer")
     st.markdown("Interactive visualization of conversation complexity metrics across different dataset types.")

-    # Dataset selection
-    st.header("🗂️ Dataset Selection")
+    # Dataset selection
+    st.sidebar.header("🗂️ Dataset Selection")

     # Available datasets
     available_datasets = [
@@ -223,36 +223,31 @@ def main():
         "Custom..."
     ]

-    …
-        index=0, # Default to reduced dataset
-        help="Choose which dataset to analyze",
-        format_func=lambda x: x.split('/')[-1] if x != "Custom..." else x # Show only the dataset name part
-    )
-
-    with col2:
-        # Add refresh button
-        if st.button("🔄 Refresh Data", help="Clear cache and reload dataset"):
-            st.cache_data.clear()
-            st.rerun()
+    selected_option = st.sidebar.selectbox(
+        "Select Dataset",
+        options=available_datasets,
+        index=0, # Default to reduced dataset
+        help="Choose which dataset to analyze"
+    )

     # Handle custom dataset input
     if selected_option == "Custom...":
-        selected_dataset = st.text_input(
+        selected_dataset = st.sidebar.text_input(
             "Custom Dataset Name",
             value="risky-conversations/jailbreaks_dataset_with_results_reduced",
             help="Enter the full dataset name (e.g., 'risky-conversations/jailbreaks_dataset_with_results_reduced')"
         )
         if not selected_dataset.strip():
-            st.warning("Please enter a dataset name")
+            st.sidebar.warning("Please enter a dataset name")
             st.stop()
     else:
         selected_dataset = selected_option

+    # Add refresh button
+    if st.sidebar.button("🔄 Refresh Data", help="Clear cache and reload dataset"):
+        st.cache_data.clear()
+        st.rerun()
+
     # Load data
     with st.spinner(f"Loading dataset: {selected_dataset}..."):
         try:
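A note on the refresh control above: st.cache_data memoizes the loader, so st.cache_data.clear() drops every cached entry and st.rerun() restarts the script from the top, forcing a fresh load. A minimal, self-contained sketch of that pattern (load_data and its body are illustrative assumptions, not the app's real loader):

import pandas as pd
import streamlit as st

@st.cache_data(show_spinner=False)
def load_data(dataset_name: str) -> pd.DataFrame:
    # Hypothetical stand-in for the app's dataset loader; the decorator
    # caches the result keyed on dataset_name.
    return pd.DataFrame({"type": ["jailbreak"], "conversation": [[]]})

if st.sidebar.button("🔄 Refresh Data"):
    st.cache_data.clear()  # invalidate all @st.cache_data entries
    st.rerun()             # rerun the script so load_data executes again

df = load_data("risky-conversations/jailbreaks_dataset_with_results_reduced")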
@@ -280,52 +275,48 @@ def main():
     if not data_loaded:
         st.stop()

-    # …
-    st.header("🎛️ …
+    # Sidebar controls
+    st.sidebar.header("🎛️ Controls")

     # Dataset type filter
     dataset_types = df['type'].unique()
-    …
-        default=dataset_types,
-        help="Filter by conversation type"
-    )
+    selected_types = st.sidebar.multiselect(
+        "Select Dataset Types",
+        options=dataset_types,
+        default=dataset_types,
+        help="Filter by conversation type"
+    )

     # Role filter
-    …
-        selected_roles.append('user')
-    if include_assistant and 'assistant' in roles:
-        selected_roles.append('assistant')
-
-    # Show selection info
-    if selected_roles:
-        st.success(f"Including: {', '.join(selected_roles)}")
-    else:
-        st.warning("No roles selected")
-    …
+    if 'turn.role' in df_exploded.columns:
+        roles = df_exploded['turn.role'].dropna().unique()
+        # Assert only user and assistant roles exist
+        expected_roles = {'user', 'assistant'}
+        actual_roles = set(roles)
+        assert actual_roles.issubset(expected_roles), f"Unexpected roles found: {actual_roles - expected_roles}. Expected only 'user' and 'assistant'"
+
+        st.sidebar.subheader("👥 Role Filter")
+        col1, col2 = st.sidebar.columns(2)
+
+        with col1:
+            include_user = st.checkbox("User", value=True, help="Include user turns")
+        with col2:
+            include_assistant = st.checkbox("Assistant", value=True, help="Include assistant turns")
+
+        # Build selected roles list
+        selected_roles = []
+        if include_user and 'user' in roles:
+            selected_roles.append('user')
+        if include_assistant and 'assistant' in roles:
+            selected_roles.append('assistant')

+        # Show selection info
+        if selected_roles:
+            st.sidebar.success(f"Including: {', '.join(selected_roles)}")
+        else:
+            st.sidebar.warning("No roles selected")
+    else:
+        selected_roles = None

     # Filter data based on selections
     filtered_df = df[df['type'].isin(selected_types)] if selected_types else df
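df_exploded itself is built elsewhere in the file; the 'turn.role' and 'turn.turn_metrics.*' column names suggest each conversation list is exploded to one row per turn, with the turn dicts flattened using a dot separator. A plausible sketch under that assumption:

import pandas as pd

df = pd.DataFrame({
    "type": ["jailbreak"],
    "conversation": [[
        {"role": "user", "turn_metrics": {"word_count": 7}},
        {"role": "assistant", "turn_metrics": {"word_count": 42}},
    ]],
})

# One row per turn, then flatten each turn dict into 'turn.*' columns.
exploded = df.explode("conversation").reset_index(drop=True)
turns = pd.json_normalize(exploded["conversation"]).add_prefix("turn.")
df_exploded = pd.concat([exploded.drop(columns="conversation"), turns], axis=1)

print(df_exploded.columns.tolist())
# ['type', 'turn.role', 'turn.turn_metrics.word_count']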
@@ -343,7 +334,7 @@ def main():
         st.stop()

     # Metric selection
-    st.header("📊 Metrics…
+    st.sidebar.header("📊 Metrics")

     # Dynamic metric categorization based on common patterns
     def categorize_metrics(metrics):
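The body of categorize_metrics sits outside this hunk. One common shape for such a helper, grouping metric names by their leading token, is sketched below; this is purely illustrative, not the app's actual grouping logic:

from collections import defaultdict

def categorize_metrics(metrics):
    """Group metric names by a leading token, e.g. 'lexical_diversity' -> 'Lexical'."""
    categories = defaultdict(list)
    for name in metrics:
        key = name.split("_")[0] if "_" in name else "other"
        categories[key.title()].append(name)
    return dict(categories)

print(categorize_metrics(["lexical_diversity", "lexical_density", "syntax_depth"]))
# {'Lexical': ['lexical_diversity', 'lexical_density'], 'Syntax': ['syntax_depth']}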
@@ -386,28 +377,24 @@ def main():
     metric_categories = categorize_metrics(available_metrics)

     # Metric selection interface
-    selection_mode = st.radio(
+    selection_mode = st.sidebar.radio(
         "Selection Mode",
         ["By Category", "Search/Filter", "Select All"],
-        help="Choose how to select metrics",
-        horizontal=True
+        help="Choose how to select metrics"
     )

     if selection_mode == "By Category":
-        …
-            options=list(metric_categories.keys()),
-            help=f"Found {len(metric_categories)} categories"
-        )
+        selected_category = st.sidebar.selectbox(
+            "Metric Category",
+            options=list(metric_categories.keys()),
+            help=f"Found {len(metric_categories)} categories"
+        )

         available_in_category = metric_categories[selected_category]
         default_selection = available_in_category[:5] if len(available_in_category) > 5 else available_in_category

         # Add select all button for category
-        col1, col2 = st.columns(2)
+        col1, col2 = st.sidebar.columns(2)
         with col1:
             if st.button("Select All", key="select_all_category"):
                 st.session_state.selected_metrics_category = available_in_category
@@ -419,7 +406,7 @@ def main():
         if "selected_metrics_category" not in st.session_state:
             st.session_state.selected_metrics_category = default_selection

-        selected_metrics = st.multiselect(
+        selected_metrics = st.sidebar.multiselect(
             f"Select Metrics ({len(available_in_category)} available)",
             options=available_in_category,
             default=st.session_state.selected_metrics_category,
@@ -428,7 +415,7 @@ def main():
         )

     elif selection_mode == "Search/Filter":
-        search_term = st.text_input(
+        search_term = st.sidebar.text_input(
             "Search Metrics",
             placeholder="Enter keywords to filter metrics...",
             help="Search for metrics containing specific terms"
@@ -439,10 +426,10 @@ def main():
         else:
             filtered_metrics = available_metrics

-        st.write(f"Found {len(filtered_metrics)} metrics")
+        st.sidebar.write(f"Found {len(filtered_metrics)} metrics")

         # Add select all button for search results
-        col1, col2 = st.columns(2)
+        col1, col2 = st.sidebar.columns(2)
         with col1:
             if st.button("Select All", key="select_all_search"):
                 st.session_state.selected_metrics_search = filtered_metrics
@@ -454,7 +441,7 @@ def main():
         if "selected_metrics_search" not in st.session_state:
             st.session_state.selected_metrics_search = filtered_metrics[:5] if len(filtered_metrics) > 5 else filtered_metrics[:3]

-        selected_metrics = st.multiselect(
+        selected_metrics = st.sidebar.multiselect(
             "Select Metrics",
             options=filtered_metrics,
             default=st.session_state.selected_metrics_search,
@@ -464,7 +451,7 @@ def main():

     else:  # Select All
         # Add select all button for all metrics
-        col1, col2 = st.columns(2)
+        col1, col2 = st.sidebar.columns(2)
         with col1:
             if st.button("Select All", key="select_all_all"):
                 st.session_state.selected_metrics_all = available_metrics
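All three selection modes repeat one idiom: a "Select All" button writes the full option list into a st.session_state key, and the multiselect reads that key as its default on the next script run. The idiom in isolation (the widget labels and the "picked" key are illustrative, not the app's names):

import streamlit as st

options = ["metric_a", "metric_b", "metric_c"]

# Seed the default once per session.
if "picked" not in st.session_state:
    st.session_state.picked = options[:2]

col1, col2 = st.sidebar.columns(2)
with col1:
    if st.button("Select All"):
        st.session_state.picked = options   # button click mutates state...
with col2:
    if st.button("Clear All"):
        st.session_state.picked = []

# ...and the widget picks the new default up on the rerun triggered by the click.
picked = st.sidebar.multiselect("Metrics", options=options, default=st.session_state.picked)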
@@ -476,7 +463,7 @@ def main():
         if "selected_metrics_all" not in st.session_state:
             st.session_state.selected_metrics_all = available_metrics[:10]  # Limit default to first 10 for performance

-        selected_metrics = st.multiselect(
+        selected_metrics = st.sidebar.multiselect(
             f"All Metrics ({len(available_metrics)} total)",
             options=available_metrics,
             default=st.session_state.selected_metrics_all,
@@ -486,18 +473,18 @@ def main():

     # Show selection summary
     if selected_metrics:
-        st.success(f"Selected {len(selected_metrics)} metrics")
+        st.sidebar.success(f"Selected {len(selected_metrics)} metrics")

         # Performance warning for large selections
         # (check the larger threshold first; otherwise the > 50 branch can never run)
         if len(selected_metrics) > 50:
-            st.error(f"🚨 Very large selection ({len(selected_metrics)} metrics) - consider reducing for better performance")
+            st.sidebar.error(f"🚨 Very large selection ({len(selected_metrics)} metrics) - consider reducing for better performance")
         elif len(selected_metrics) > 20:
-            st.warning(f"⚠️ Large selection ({len(selected_metrics)} metrics) may impact performance")
+            st.sidebar.warning(f"⚠️ Large selection ({len(selected_metrics)} metrics) may impact performance")
     else:
-        st.warning("No metrics selected")
+        st.sidebar.warning("No metrics selected")

     # Metric info expander
-    with st.expander("ℹ️ Metric Information", expanded=False):
+    with st.sidebar.expander("ℹ️ Metric Information", expanded=False):
         st.write(f"**Total Available Metrics:** {len(available_metrics)}")
         st.write(f"**Categories Found:** {len(metric_categories)}")
@@ -506,8 +493,6 @@ def main():
         for i, metric in enumerate(available_metrics, 1):
             st.write(f"{i}. `{metric}`")

-    st.divider()  # Visual separator before main content
-
     # Main content tabs
     tab1, tab2, tab3, tab4, tab5 = st.tabs(["📊 Distributions", "🔗 Correlations", "📈 Comparisons", "📝 Conversation", "🎯 Details"])
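For reference, st.tabs returns one container per label, and content is attached with "with" blocks; a minimal sketch:

import streamlit as st

tab1, tab2 = st.tabs(["📊 Distributions", "🔗 Correlations"])
with tab1:
    st.write("Distribution plots render here.")
with tab2:
    st.write("Correlation heatmap renders here.")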
@@ -704,6 +689,7 @@ def main():
     # Display conversation metadata
     st.subheader("📋 Conversation Overview")

+    # First row - basic info
     col1, col2, col3, col4 = st.columns(4)
     with col1:
         st.metric("Type", selected_conversation['type'])
@@ -718,6 +704,68 @@ def main():
     assistant_turns = roles.count('assistant')
     st.metric("User/Assistant", f"{user_turns}/{assistant_turns}")

+    # Second row - additional metadata
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        provenance = selected_conversation.get('provenance_dataset', 'Unknown')
+        st.metric("Dataset Source", provenance)
+    with col2:
+        language = selected_conversation.get('language', 'Unknown')
+        st.metric("Language", language.upper() if language else 'Unknown')
+    with col3:
+        timestamp = selected_conversation.get('timestamp', None)
+        if timestamp:
+            # Handle different timestamp formats
+            if isinstance(timestamp, str):
+                st.metric("Timestamp", timestamp)
+            else:
+                st.metric("Timestamp", str(timestamp))
+        else:
+            st.metric("Timestamp", "Not Available")
+
+    # Add toxicity summary
+    conversation_turns_temp = selected_conversation.get('conversation', [])
+    if hasattr(conversation_turns_temp, 'tolist'):
+        conversation_turns_temp = conversation_turns_temp.tolist()
+    elif conversation_turns_temp is None:
+        conversation_turns_temp = []
+
+    if len(conversation_turns_temp) > 0:
+        # Calculate overall toxicity statistics
+        all_toxicities = []
+        for turn in conversation_turns_temp:
+            toxicities = turn.get('toxicities', {})
+            if toxicities and 'toxicity' in toxicities:
+                all_toxicities.append(toxicities['toxicity'])
+
+        if all_toxicities:
+            avg_toxicity = sum(all_toxicities) / len(all_toxicities)
+            max_toxicity = max(all_toxicities)
+
+            st.markdown("**🔍 Toxicity Summary:**")
+            col1, col2, col3 = st.columns(3)
+            with col1:
+                # Color code average toxicity
+                if avg_toxicity > 0.5:
+                    st.metric("Average Toxicity", f"{avg_toxicity:.4f}", delta="HIGH", delta_color="inverse")
+                elif avg_toxicity > 0.1:
+                    st.metric("Average Toxicity", f"{avg_toxicity:.4f}", delta="MED", delta_color="off")
+                else:
+                    st.metric("Average Toxicity", f"{avg_toxicity:.4f}", delta="LOW", delta_color="normal")
+
+            with col2:
+                # Color code max toxicity
+                if max_toxicity > 0.5:
+                    st.metric("Max Toxicity", f"{max_toxicity:.4f}", delta="HIGH", delta_color="inverse")
+                elif max_toxicity > 0.1:
+                    st.metric("Max Toxicity", f"{max_toxicity:.4f}", delta="MED", delta_color="off")
+                else:
+                    st.metric("Max Toxicity", f"{max_toxicity:.4f}", delta="LOW", delta_color="normal")
+
+            with col3:
+                high_tox_turns = sum(1 for t in all_toxicities if t > 0.5)
+                st.metric("High Toxicity Turns", high_tox_turns)
+
     # Get conversation turns with metrics
     conv_turns_data = filtered_df_exploded[filtered_df_exploded.index.isin(
         filtered_df_exploded[filtered_df_exploded.index // len(filtered_df_exploded) * len(filtered_df) +
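The HIGH/MED/LOW st.metric block above is repeated verbatim for the average, the maximum, and again for every per-turn score further down. A small helper would remove the duplication; the 0.5 and 0.1 thresholds are taken from the diff, while the helper name is ours:

import streamlit as st

def toxicity_metric(label: str, value: float) -> None:
    """st.metric with a HIGH/MED/LOW badge based on the diff's fixed thresholds."""
    if value > 0.5:
        st.metric(label, f"{value:.4f}", delta="HIGH", delta_color="inverse")
    elif value > 0.1:
        st.metric(label, f"{value:.4f}", delta="MED", delta_color="off")
    else:
        st.metric(label, f"{value:.4f}", delta="LOW", delta_color="normal")

toxicity_metric("Average Toxicity", 0.3772)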
@@ -739,146 +787,197 @@ def main():
     # Simpler approach: get all turns from the conversation directly
     conversation_turns = selected_conversation.get('conversation', [])

-    …
-            content = turn.get('content', 'No content')
-
-            # Style based on role
-            if role == 'user':
-                st.markdown(f"**👤 User (Turn {i+1}):**")
-                st.info(content)
-            elif role == 'assistant':
-                st.markdown(f"**🤖 Assistant (Turn {i+1}):**")
-                st.success(content)
-            else:
-                st.markdown(f"**❓ {role.title()} (Turn {i+1}):**")
-                st.warning(content)
-    …
-            # Get values for this metric
-            y_values = []
-            for _, turn_row in sample_turns.iterrows():
-                value = turn_row.get(col, None)
-                if pd.notna(value) and isinstance(value, (int, float)):
-                    y_values.append(value)
-    …
-        else:
-            st.info("No turn-level data available for this conversation type.")
-    else:
-        st.warning("No turn-level metrics available in the dataset for the selected metrics.")
-    …
-        st.warning("Select some metrics to see turn-level analysis.")
+    # Ensure conversation_turns is a list and handle different data types
+    if hasattr(conversation_turns, 'tolist'):
+        conversation_turns = conversation_turns.tolist()
+    elif conversation_turns is None:
+        conversation_turns = []
+
+    if len(conversation_turns) > 0:
+        # Display conversation content with metrics
+        st.subheader("💬 Conversation with Metrics")
+
+        # Get actual turn-level data for this conversation
+        turn_metric_columns = [f"turn.turn_metrics.{m}" for m in selected_metrics]
+        available_columns = [col for col in turn_metric_columns if col in filtered_df_exploded.columns]
+
+        # Get sample metrics for this conversation type (since exact matching is complex)
+        sample_metrics = None
+        if available_columns:
+            type_turns = filtered_df_exploded[filtered_df_exploded['type'] == selected_conversation['type']]
+            sample_size = min(len(conversation_turns), len(type_turns))
+            if sample_size > 0:
+                sample_metrics = type_turns.head(sample_size)
+
+        # Display each turn with its metrics
+        for i, turn in enumerate(conversation_turns):
+            role = turn.get('role', 'unknown')
+            content = turn.get('content', 'No content')
+
+            # Display turn content with role styling
+            if role == 'user':
+                st.markdown(f"**👤 User (Turn {i+1}):**")
+                st.info(content)
+            elif role == 'assistant':
+                st.markdown(f"**🤖 Assistant (Turn {i+1}):**")
+                st.success(content)
+            else:
+                st.markdown(f"**❓ {role.title()} (Turn {i+1}):**")
+                st.warning(content)
+
+            # Display metrics for this turn
+            if sample_metrics is not None and i < len(sample_metrics):
+                turn_row = sample_metrics.iloc[i]
+
+                # Create metrics display
+                metrics_for_turn = {}
+                for col in available_columns:
+                    metric_name = col.replace('turn.turn_metrics.', '')
+                    friendly_name = get_human_friendly_metric_name(metric_name)
+                    value = turn_row.get(col, 'N/A')
+                    if pd.notna(value) and isinstance(value, (int, float)):
+                        metrics_for_turn[friendly_name] = round(value, 3)
+                    else:
+                        metrics_for_turn[friendly_name] = 'N/A'
+
+                # Add toxicity metrics if available
+                toxicities = turn.get('toxicities', {})
+                if toxicities:
+                    st.markdown("**🔍 Toxicity Scores:**")
+                    tox_cols = st.columns(4)
+                    tox_metrics = [
+                        ('toxicity', 'Overall Toxicity'),
+                        ('severe_toxicity', 'Severe Toxicity'),
+                        ('identity_attack', 'Identity Attack'),
+                        ('insult', 'Insult'),
+                        ('obscene', 'Obscene'),
+                        ('sexual_explicit', 'Sexual Explicit'),
+                        ('threat', 'Threat')
+                    ]
+
+                    for idx, (tox_key, tox_name) in enumerate(tox_metrics):
+                        if tox_key in toxicities:
+                            col_idx = idx % 4
+                            with tox_cols[col_idx]:
+                                tox_value = toxicities[tox_key]
+                                if isinstance(tox_value, (int, float)):
+                                    # Color code based on toxicity level
+                                    if tox_value > 0.5:
+                                        st.metric(tox_name, f"{tox_value:.4f}", delta="HIGH", delta_color="inverse")
+                                    elif tox_value > 0.1:
+                                        st.metric(tox_name, f"{tox_value:.4f}", delta="MED", delta_color="off")
+                                    else:
+                                        st.metric(tox_name, f"{tox_value:.4f}", delta="LOW", delta_color="normal")
+                                else:
+                                    st.metric(tox_name, str(tox_value))
+
+                # Display complexity metrics
+                if metrics_for_turn:
+                    st.markdown("**📊 Complexity Metrics:**")
+                    # Display metrics in columns
+                    num_cols = min(4, len(metrics_for_turn))
+                    if num_cols > 0:
+                        cols = st.columns(num_cols)
+                        for idx, (metric_name, value) in enumerate(metrics_for_turn.items()):
+                            col_idx = idx % num_cols
+                            with cols[col_idx]:
+                                if isinstance(value, (int, float)) and value != 'N/A':
+                                    st.metric(metric_name, value)
+                                else:
+                                    st.metric(metric_name, str(value))
+            else:
+                # Show toxicity even when no complexity metrics available
+                toxicities = turn.get('toxicities', {})
+                if toxicities:
+                    st.markdown("**🔍 Toxicity Scores:**")
+                    tox_cols = st.columns(4)
+                    tox_metrics = [
+                        ('toxicity', 'Overall Toxicity'),
+                        ('severe_toxicity', 'Severe Toxicity'),
+                        ('identity_attack', 'Identity Attack'),
+                        ('insult', 'Insult'),
+                        ('obscene', 'Obscene'),
+                        ('sexual_explicit', 'Sexual Explicit'),
+                        ('threat', 'Threat')
+                    ]
+
+                    for idx, (tox_key, tox_name) in enumerate(tox_metrics):
+                        if tox_key in toxicities:
+                            col_idx = idx % 4
+                            with tox_cols[col_idx]:
+                                tox_value = toxicities[tox_key]
+                                if isinstance(tox_value, (int, float)):
+                                    # Color code based on toxicity level
+                                    if tox_value > 0.5:
+                                        st.metric(tox_name, f"{tox_value:.4f}", delta="HIGH", delta_color="inverse")
+                                    elif tox_value > 0.1:
+                                        st.metric(tox_name, f"{tox_value:.4f}", delta="MED", delta_color="off")
+                                    else:
+                                        st.metric(tox_name, f"{tox_value:.4f}", delta="LOW", delta_color="normal")
+                                else:
+                                    st.metric(tox_name, str(tox_value))
+
+                # Show basic turn statistics when no complexity metrics available
+                st.markdown("**📊 Basic Statistics:**")
+                col1, col2, col3 = st.columns(3)
+                with col1:
+                    st.metric("Characters", len(content))
+                with col2:
+                    st.metric("Words", len(content.split()))
+                with col3:
+                    st.metric("Role", role.title())
+
+            # Add separator between turns
+            st.divider()
+
+        # Plot metrics over turns with real data if available
+        if available_columns and sample_metrics is not None:
+            st.subheader("📈 Metrics Over Turns")
+
+            fig = go.Figure()
+
+            # Add traces for each selected metric (real data)
+            for col in available_columns[:5]:  # Limit to first 5 for readability
+                metric_name = col.replace('turn.turn_metrics.', '')
+                friendly_name = get_human_friendly_metric_name(metric_name)
+
+                # Get values for this metric
+                y_values = []
+                for _, turn_row in sample_metrics.iterrows():
+                    value = turn_row.get(col, None)
+                    if pd.notna(value) and isinstance(value, (int, float)):
+                        y_values.append(value)
+                    else:
+                        y_values.append(None)
+
+                if any(v is not None for v in y_values):
+                    fig.add_trace(go.Scatter(
+                        x=list(range(1, len(y_values) + 1)),
+                        y=y_values,
+                        mode='lines+markers',
+                        name=friendly_name,
+                        line=dict(width=2),
+                        marker=dict(size=8),
+                        connectgaps=False
+                    ))
+
+            if fig.data:  # Only show if we have data
+                fig.update_layout(
+                    title="Complexity Metrics Across Conversation Turns",
+                    xaxis_title="Turn Number",
+                    yaxis_title="Metric Value",
+                    height=400,
+                    hovermode='x unified'
+                )
+
+                st.plotly_chart(fig, use_container_width=True)
+            else:
+                st.info("No numeric metric data available to plot for this conversation type.")
+
+        elif selected_metrics:
+            st.info("Select metrics that are available in the dataset to see turn-level analysis.")
+        else:
+            st.warning("Select some metrics to see detailed turn-level analysis.")

     else:
         st.warning("No conversation data available for the selected conversation.")
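get_human_friendly_metric_name is called throughout this hunk but defined elsewhere in the file. A minimal stand-in consistent with how it is used (raw metric key in, display label out) might look like:

def get_human_friendly_metric_name(metric_name: str) -> str:
    """Hypothetical stand-in: 'lexical_diversity' -> 'Lexical Diversity'."""
    return metric_name.replace("_", " ").title()

print(get_human_friendly_metric_name("dependency_tree_depth"))  # Dependency Tree Depth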