# GradioApp/visualization/bow_visualizer.py
# Author: Ryan — update (commit 2c58f4e)
import gradio as gr
import json
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import pandas as pd
from difflib import SequenceMatcher
from visualization.ngram_visualizer import create_ngram_visualization
from visualization.topic_visualizer import process_and_visualize_topic_analysis # Added import
def create_bow_visualization(analysis_results):
    """
    Create visualizations for bag-of-words analysis results.

    Args:
        analysis_results (dict | str): Analysis results from the bow analysis,
            either as a dict or a JSON-encoded string.

    Returns:
        list: List of gradio components with visualizations.
    """
    # Parse analysis results if it's a string
    if isinstance(analysis_results, str):
        try:
            results = json.loads(analysis_results)
        except json.JSONDecodeError:
            return [gr.Markdown("Error parsing analysis results.")]
    else:
        results = analysis_results

    output_components = []

    # Check if we have valid results
    if not results or "analyses" not in results:
        return [gr.Markdown("No analysis results found.")]

    # Process each prompt
    for prompt, analyses in results["analyses"].items():
        output_components.append(gr.Markdown(f"## Analysis of Prompt: \"{prompt}\""))

        # Process Bag of Words analysis if available
        if "bag_of_words" not in analyses:
            continue
        bow_results = analyses["bag_of_words"]

        # Show models being compared
        models = bow_results.get("models", [])
        if len(models) >= 2:
            output_components.append(
                gr.Markdown(f"### Comparing responses from {models[0]} and {models[1]}"))

        # Bar chart of the top words for each model
        important_words = bow_results.get("important_words", {})
        for model_name, words in important_words.items():
            if not words:
                # An empty word list would yield a DataFrame without the
                # 'word'/'count' columns and crash px.bar — skip it.
                continue
            df = pd.DataFrame(words)
            fig = px.bar(df, x='word', y='count',
                         title=f"Top Words Used by {model_name}",
                         labels={'word': 'Word', 'count': 'Frequency'},
                         height=400)
            # Improve layout: order bars by total frequency, descending
            fig.update_layout(
                xaxis_title="Word",
                yaxis_title="Frequency",
                xaxis={'categoryorder': 'total descending'}
            )
            output_components.append(gr.Plot(value=fig))

        # Visualize differential words (words with biggest frequency difference).
        # BUGFIX: the original indexed models[0]/models[1] here without checking
        # that two models exist (the len(models) >= 2 guard above only gated a
        # markdown line), raising IndexError for single-model results.
        diff_words = bow_results.get("differential_words", [])
        word_matrix = bow_results.get("word_count_matrix", {})
        if diff_words and word_matrix and len(models) >= 2:
            output_components.append(gr.Markdown("### Words with Biggest Frequency Differences"))
            # Create dataframe for plotting
            model1, model2 = models[0], models[1]
            diff_data = []
            for word in diff_words[:15]:  # Limit to top 15 for readability
                if word in word_matrix:
                    counts = word_matrix[word]
                    diff_data.append({
                        "word": word,
                        model1: counts.get(model1, 0),
                        model2: counts.get(model2, 0)
                    })
            if diff_data:
                diff_df = pd.DataFrame(diff_data)
                # Create grouped bar chart comparing both models per word
                fig = go.Figure()
                fig.add_trace(go.Bar(
                    x=diff_df['word'],
                    y=diff_df[model1],
                    name=model1,
                    marker_color='indianred'
                ))
                fig.add_trace(go.Bar(
                    x=diff_df['word'],
                    y=diff_df[model2],
                    name=model2,
                    marker_color='lightsalmon'
                ))
                fig.update_layout(
                    title="Word Frequency Comparison",
                    xaxis_title="Word",
                    yaxis_title="Frequency",
                    barmode='group',
                    height=500
                )
                output_components.append(gr.Plot(value=fig))

    # If nothing beyond the first prompt header was added, show a message
    if len(output_components) <= 1:
        output_components.append(gr.Markdown("No detailed Bag of Words analysis found in results."))

    return output_components
# Top-level entry point: walks the analysis results and dispatches each
# analysis type (bag-of-words, n-gram, topic modeling) to its visualizer.
def process_and_visualize_analysis(analysis_results):
    """
    Process the analysis results and create visualization components.

    Args:
        analysis_results (dict): The analysis results; expected to contain an
            "analyses" mapping of prompt -> per-analysis-type results.

    Returns:
        list: List of gradio components for visualization. On any unexpected
            error, a single Markdown component describing the failure.
    """
    try:
        print(f"Starting visualization of analysis results: {type(analysis_results)}")
        components = []

        if not analysis_results or "analyses" not in analysis_results:
            print("Warning: Empty or invalid analysis results")
            components.append(gr.Markdown("No analysis results to visualize."))
            return components

        # For each prompt in the analysis results
        for prompt, analyses in analysis_results.get("analyses", {}).items():
            print(f"Visualizing results for prompt: {prompt[:30]}...")
            components.append(gr.Markdown(f"## Analysis for Prompt:\n\"{prompt}\""))

            # Check for Bag of Words analysis
            if "bag_of_words" in analyses:
                print("Processing Bag of Words visualization")
                components.append(gr.Markdown("### Bag of Words Analysis"))
                bow_results = analyses["bag_of_words"]

                # BUGFIX: read "models" defensively once; the original indexed
                # bow_results["models"] inside a condition and raised KeyError
                # (aborting the whole visualization) when the key was absent.
                models = bow_results.get("models", [])

                # Display models compared
                if models:
                    components.append(gr.Markdown(f"**Models compared**: {', '.join(models)}"))

                # Display important words for each model
                if "important_words" in bow_results:
                    components.append(gr.Markdown("#### Most Common Words by Model"))
                    for model, words in bow_results["important_words"].items():
                        print(f"Creating word list for model {model}")
                        word_list = [f"{item['word']} ({item['count']})" for item in words[:10]]
                        components.append(gr.Markdown(f"**{model}**: {', '.join(word_list)}"))

                # Add visualizations for word frequency differences
                if ("differential_words" in bow_results
                        and "word_count_matrix" in bow_results
                        and len(models) >= 2):
                    diff_words = bow_results["differential_words"]
                    word_matrix = bow_results["word_count_matrix"]
                    if diff_words and word_matrix:
                        components.append(gr.Markdown("### Words with Biggest Frequency Differences"))
                        model1, model2 = models[0], models[1]
                        for word in diff_words[:10]:  # Limit to top 10 for readability
                            if word in word_matrix:
                                counts = word_matrix[word]
                                model1_count = counts.get(model1, 0)
                                model2_count = counts.get(model2, 0)
                                # Only include if there's a meaningful difference
                                if abs(model1_count - model2_count) > 0:
                                    components.append(gr.Markdown(
                                        f"- **{word}**: {model1}: {model1_count}, {model2}: {model2_count}"
                                    ))

            # Check for N-gram analysis — delegate to the dedicated visualizer
            if "ngram_analysis" in analyses:
                print("Processing N-gram visualization")
                ngram_components = create_ngram_visualization(
                    {"analyses": {prompt: {"ngram_analysis": analyses["ngram_analysis"]}}})
                components.extend(ngram_components)

            # Check for Topic Modeling analysis — delegate likewise
            if "topic_modeling" in analyses:
                print("Processing Topic Modeling visualization")
                topic_components = process_and_visualize_topic_analysis(
                    {"analyses": {prompt: {"topic_modeling": analyses["topic_modeling"]}}})
                components.extend(topic_components)

        if not components:
            components.append(gr.Markdown("No visualization components could be created from the analysis results."))

        print(f"Visualization complete: generated {len(components)} components")
        return components
    except Exception as e:
        # Broad catch is deliberate at this UI boundary: surface the traceback
        # to the user instead of crashing the Gradio app.
        import traceback
        error_msg = f"Visualization error: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return [gr.Markdown(f"**Error during visualization:**\n\n```\n{error_msg}\n```")]