"""
Enhanced visualization for topic modeling analysis results
"""
import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
def create_topic_visualization(analysis_results):
"""
Create enhanced visualizations for topic modeling analysis results
Args:
analysis_results (dict): Analysis results from the topic modeling analysis
Returns:
list: List of gradio components with visualizations
"""
# Initialize output components list
output_components = []
# Check if we have valid results
if not analysis_results or "analyses" not in analysis_results:
return [gr.Markdown("No analysis results found.")]
# Process each prompt
for prompt, analyses in analysis_results["analyses"].items():
# Process Topic Modeling analysis if available
if "topic_modeling" in analyses:
topic_results = analyses["topic_modeling"]
# Enhanced error checking and messaging
if "error" in topic_results:
output_components.append(gr.Markdown(f"## ⚠️ Topic Modeling Error"))
output_components.append(gr.Markdown(f"Error: {topic_results['error']}"))
output_components.append(gr.Markdown("Suggestions:"))
output_components.append(gr.Markdown("1. Try with longer text samples - topic modeling typically needs 100+ words per document"))
output_components.append(gr.Markdown("2. Reduce the number of topics (2-3 for short texts)"))
output_components.append(gr.Markdown("3. Try the Bag of Words or N-gram analysis for shorter texts"))
continue
# Show method and number of topics
method = topic_results.get("method", "lda").upper()
n_topics = topic_results.get("n_topics", 3)
# Check if n_topics was adjusted
if "adjusted_n_topics" in topic_results and topic_results["adjusted_n_topics"] != topic_results.get("original_n_topics", n_topics):
                output_components.append(gr.Markdown(
                    f"## Topic Modeling Analysis ({method}, {topic_results['adjusted_n_topics']} topics) "
                    f"*Adjusted from {topic_results.get('original_n_topics', n_topics)} due to limited text content*"
                ))
n_topics = topic_results["adjusted_n_topics"]
else:
output_components.append(gr.Markdown(f"## Topic Modeling Analysis ({method}, {n_topics} topics)"))
# Check for warnings
if "warnings" in topic_results:
if isinstance(topic_results["warnings"], list):
for warning in topic_results["warnings"]:
output_components.append(gr.Markdown(f"⚠️ **Warning**: {warning}"))
else:
output_components.append(gr.Markdown(f"⚠️ **Warning**: {topic_results['warnings']}"))
if "warning" in topic_results:
output_components.append(gr.Markdown(f"⚠️ **Warning**: {topic_results['warning']}"))
# Show models being compared
models = topic_results.get("models", [])
if len(models) >= 2:
output_components.append(gr.Markdown(f"### Comparing responses from {models[0]} and {models[1]}"))
# Show topic quality metrics if available
if "coherence_scores" in topic_results:
coherence_html = f"""
Topic Quality Metrics
Metric |
{models[0]} |
{models[1]} |
Combined |
Topic Coherence |
{topic_results["coherence_scores"].get(models[0], 0):.2f}
|
{topic_results["coherence_scores"].get(models[1], 0):.2f}
|
{topic_results["coherence_scores"].get("combined", 0):.2f}
|
Topic Diversity |
{topic_results["diversity_scores"].get(models[0], 0):.2f}
|
{topic_results["diversity_scores"].get(models[1], 0):.2f}
|
{topic_results["diversity_scores"].get("combined", 0):.2f}
|
Higher coherence scores indicate more semantically coherent topics.
Higher diversity scores indicate less overlap between topics.
"""
output_components.append(gr.HTML(coherence_html))
# Visualize topics
topics = topic_results.get("topics", [])
if topics:
output_components.append(gr.Markdown("### Discovered Topics"))
# Create a topic word cloud using HTML/CSS for better visibility
for topic in topics:
topic_id = topic.get("id", 0)
words = topic.get("words", [])
weights = topic.get("weights", [])
if words and weights and len(words) == len(weights):
# Generate a word cloud-like div using HTML/CSS
                        word_cloud_html = f"""
                        <div style="margin: 10px 0; padding: 10px; border: 1px solid #ddd; border-radius: 5px;">
                            <h4>Topic {topic_id+1}</h4>
                            <div style="line-height: 2.4;">
                        """
                        # Sort words by weight for better visualization
                        word_weight_pairs = sorted(zip(words, weights), key=lambda x: x[1], reverse=True)
                        # Add each word with size based on weight
                        for word, weight in word_weight_pairs:
                            # Scale weight to a reasonable font size (min 14px, max 28px)
                            font_size = 14 + min(14, round(weight * 30))
                            # Color based on weight (darker = higher weight)
                            color_intensity = max(0, min(90, int(100 - weight * 100)))
                            color = f"hsl(210, 70%, {color_intensity}%)"
                            word_cloud_html += f"""
                            <span style="font-size: {font_size}px; color: {color}; margin-right: 12px;">{word}</span>
                            """
                        word_cloud_html += """
                            </div>
                        </div>
                        """
output_components.append(gr.HTML(word_cloud_html))
# Add a proper bar chart visualization for topic words
                for topic in topics[:3]:  # Show charts for at most 3 topics to avoid clutter
topic_id = topic.get("id", 0)
words = topic.get("words", [])
weights = topic.get("weights", [])
if words and weights and len(words) == len(weights):
# Create dataframe for plotting
df = pd.DataFrame({
'word': words,
'weight': weights
})
# Sort by weight
df = df.sort_values('weight', ascending=False)
# Limit to top N words for clarity
df = df.head(10)
# Create bar chart
fig = px.bar(
df, x='weight', y='word',
title=f"Topic {topic_id+1} Top Words",
labels={'word': 'Word', 'weight': 'Weight'},
height=300,
orientation='h' # Horizontal bars
)
# Improve layout
fig.update_layout(
margin=dict(l=10, r=10, t=40, b=10),
yaxis={'categoryorder': 'total ascending'}
)
output_components.append(gr.Plot(value=fig))
# Visualize topic distributions for each model
model_topics = topic_results.get("model_topics", {})
if model_topics and all(model in model_topics for model in models):
output_components.append(gr.Markdown("### Topic Distribution by Model"))
# Create multi-model topic distribution comparison
distribution_data = []
for model in models:
if model in model_topics:
distribution = model_topics[model]
for i, weight in enumerate(distribution):
if i < 10: # Limit to 10 topics max
distribution_data.append({
'Model': model,
'Topic': f"Topic {i+1}",
'Weight': weight
})
if distribution_data:
df = pd.DataFrame(distribution_data)
# Create grouped bar chart
fig = px.bar(
df, x='Topic', y='Weight', color='Model',
barmode='group',
title="Topic Distribution Comparison",
height=400
)
output_components.append(gr.Plot(value=fig))
# Visualize topic differences as a heatmap
comparisons = topic_results.get("comparisons", {})
if comparisons:
comparison_key = f"{models[0]} vs {models[1]}"
if comparison_key in comparisons:
output_components.append(gr.Markdown("### Topic Similarity Analysis"))
# Get JS divergence
js_divergence = comparisons[comparison_key].get("js_divergence", 0)
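                    # Reference (assumed): Jensen-Shannon divergence of distributions P and Q,
                    # JSD(P, Q) = 0.5 * KL(P || M) + 0.5 * KL(Q || M) with M = 0.5 * (P + Q);
                    # with base-2 logarithms it is bounded in [0, 1], which is why the meter
                    # below can use a fixed 0.0-1.0 scale.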
# Create a divergence meter
                    divergence_html = f"""
                    <h4>Topic Distribution Divergence</h4>
                    <div style="position: relative; height: 20px; border-radius: 10px;
                                background: linear-gradient(to right, #2ecc71, #e74c3c);">
                        <div style="position: absolute; top: -5px; left: {max(0.0, min(1.0, js_divergence)) * 100:.1f}%;
                                    width: 4px; height: 30px; background: #333;"></div>
                    </div>
                    <div style="display: flex; justify-content: space-between;">
                        <span>Similar (0.0)</span>
                        <span>Different (1.0)</span>
                    </div>
                    <p><b>Score: {js_divergence:.3f}</b></p>
                    <p>Jensen-Shannon divergence measures the similarity between topic distributions.
                    Lower values indicate more similar topic distributions between models.</p>
                    """
output_components.append(gr.HTML(divergence_html))
# Create similarity matrix heatmap if available
similarity_matrix = topic_results.get("similarity_matrix", [])
                    if similarity_matrix:
# Convert to format for heatmap
z_data = similarity_matrix
# Create heatmap
fig = go.Figure(data=go.Heatmap(
z=z_data,
x=[f"{models[1]} Topic {i+1}" for i in range(len(similarity_matrix[0]))],
y=[f"{models[0]} Topic {i+1}" for i in range(len(similarity_matrix))],
colorscale='Viridis',
showscale=True,
colorbar=dict(title="Similarity")
))
fig.update_layout(
title="Topic Similarity Matrix",
height=400,
margin=dict(l=50, r=50, t=50, b=50)
)
output_components.append(gr.Plot(value=fig))
# Show best matching topics
matched_topics = topic_results.get("matched_topics", [])
if matched_topics:
output_components.append(gr.Markdown("### Most Similar Topic Pairs"))
                # Create HTML table for matched topics
                matched_topics_html = """
                <table style="width: 100%; border-collapse: collapse;">
                    <tr>
                        <th>Topic Pair</th>
                        <th>Top Words in Model 1</th>
                        <th>Top Words in Model 2</th>
                        <th>Similarity</th>
                    </tr>
                """
                # Sort by similarity, highest first
                sorted_matches = sorted(matched_topics, key=lambda x: x["similarity"], reverse=True)
                for match in sorted_matches:
                    # Show the top 5 words of each topic, comma-separated
                    words1 = ", ".join(match["set1_topic_words"][:5])
                    words2 = ", ".join(match["set2_topic_words"][:5])
                    # Color based on similarity: HSL hue 120 is green, 0 is red,
                    # so the score maps linearly onto a red-to-green ramp
                    similarity = match["similarity"]
                    color = f"hsl({int(120 * similarity)}, 70%, 50%)"
                    matched_topics_html += f"""
                    <tr>
                        <td>{models[0]} Topic {match['set1_topic_id']+1} ↔ {models[1]} Topic {match['set2_topic_id']+1}</td>
                        <td>{words1}</td>
                        <td>{words2}</td>
                        <td style="color: {color}; font-weight: bold;">{similarity:.2f}</td>
                    </tr>
                    """
                matched_topics_html += """
                </table>
                """
output_components.append(gr.HTML(matched_topics_html))
    # If at most one component was added, fall back to an explanatory message
    if len(output_components) <= 1:
output_components.append(gr.Markdown("No detailed Topic Modeling analysis found in results."))
return output_components
def process_and_visualize_topic_analysis(analysis_results):
"""
Process the topic modeling analysis results and create visualization components
Args:
analysis_results (dict): The analysis results
Returns:
list: List of gradio components for visualization
"""
try:
print(f"Starting visualization of topic modeling analysis results")
components = create_topic_visualization(analysis_results)
print(f"Completed topic modeling visualization with {len(components)} components")
return components
except Exception as e:
import traceback
error_msg = f"Topic modeling visualization error: {str(e)}\n{traceback.format_exc()}"
print(error_msg)
return [
gr.Markdown(f"**Error during topic modeling visualization:**"),
gr.Markdown(f"```\n{str(e)}\n```"),
gr.Markdown("Try adjusting the number of topics or using longer text inputs.")
]
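

# Minimal smoke test with a hypothetical payload; the keys below mirror what
# create_topic_visualization reads above, but real inputs come from the
# analysis pipeline, so treat this purely as an illustrative sketch.
if __name__ == "__main__":
    sample_results = {
        "analyses": {
            "Explain quantum computing": {
                "topic_modeling": {
                    "method": "lda",
                    "n_topics": 2,
                    "models": ["model_a", "model_b"],
                    "topics": [
                        {"id": 0, "words": ["qubit", "state", "superposition"],
                         "weights": [0.42, 0.31, 0.27]},
                        {"id": 1, "words": ["algorithm", "speedup", "error"],
                         "weights": [0.45, 0.30, 0.25]},
                    ],
                }
            }
        }
    }
    components = process_and_visualize_topic_analysis(sample_results)
    print(f"Built {len(components)} components")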