""" Enhanced visualization for topic modeling analysis results """ import gradio as gr import pandas as pd import plotly.express as px import plotly.graph_objects as go from plotly.subplots import make_subplots import numpy as np def create_topic_visualization(analysis_results): """ Create enhanced visualizations for topic modeling analysis results Args: analysis_results (dict): Analysis results from the topic modeling analysis Returns: list: List of gradio components with visualizations """ # Initialize output components list output_components = [] # Check if we have valid results if not analysis_results or "analyses" not in analysis_results: return [gr.Markdown("No analysis results found.")] # Process each prompt for prompt, analyses in analysis_results["analyses"].items(): # Process Topic Modeling analysis if available if "topic_modeling" in analyses: topic_results = analyses["topic_modeling"] # Enhanced error checking and messaging if "error" in topic_results: output_components.append(gr.Markdown(f"## ⚠️ Topic Modeling Error")) output_components.append(gr.Markdown(f"Error: {topic_results['error']}")) output_components.append(gr.Markdown("Suggestions:")) output_components.append(gr.Markdown("1. Try with longer text samples - topic modeling typically needs 100+ words per document")) output_components.append(gr.Markdown("2. Reduce the number of topics (2-3 for short texts)")) output_components.append(gr.Markdown("3. Try the Bag of Words or N-gram analysis for shorter texts")) continue # Show method and number of topics method = topic_results.get("method", "lda").upper() n_topics = topic_results.get("n_topics", 3) # Check if n_topics was adjusted if "adjusted_n_topics" in topic_results and topic_results["adjusted_n_topics"] != topic_results.get("original_n_topics", n_topics): output_components.append(gr.Markdown( f"## Topic Modeling Analysis ({method}, {topic_results['adjusted_n_topics']} topics) " + f"*Adjusted from {topic_results['original_n_topics']} due to limited text content*" )) n_topics = topic_results["adjusted_n_topics"] else: output_components.append(gr.Markdown(f"## Topic Modeling Analysis ({method}, {n_topics} topics)")) # Check for warnings if "warnings" in topic_results: if isinstance(topic_results["warnings"], list): for warning in topic_results["warnings"]: output_components.append(gr.Markdown(f"⚠️ **Warning**: {warning}")) else: output_components.append(gr.Markdown(f"⚠️ **Warning**: {topic_results['warnings']}")) if "warning" in topic_results: output_components.append(gr.Markdown(f"⚠️ **Warning**: {topic_results['warning']}")) # Show models being compared models = topic_results.get("models", []) if len(models) >= 2: output_components.append(gr.Markdown(f"### Comparing responses from {models[0]} and {models[1]}")) # Show topic quality metrics if available if "coherence_scores" in topic_results: coherence_html = f"""

            # Show topic quality metrics if available. Guard on the model count,
            # since the table indexes models[0] and models[1] directly.
            if "coherence_scores" in topic_results and len(models) >= 2:
                coherence = topic_results["coherence_scores"]
                diversity = topic_results.get("diversity_scores", {})
                coherence_html = f"""
                <div style="margin: 10px 0;">
                    <h4>Topic Quality Metrics</h4>
                    <table style="border-collapse: collapse; width: 100%;">
                        <tr>
                            <th>Metric</th>
                            <th>{models[0]}</th>
                            <th>{models[1]}</th>
                            <th>Combined</th>
                        </tr>
                        <tr>
                            <td>Topic Coherence</td>
                            <td>{coherence.get(models[0], 0):.2f}</td>
                            <td>{coherence.get(models[1], 0):.2f}</td>
                            <td>{coherence.get("combined", 0):.2f}</td>
                        </tr>
                        <tr>
                            <td>Topic Diversity</td>
                            <td>{diversity.get(models[0], 0):.2f}</td>
                            <td>{diversity.get(models[1], 0):.2f}</td>
                            <td>{diversity.get("combined", 0):.2f}</td>
                        </tr>
                    </table>
                    <p>
                        Higher coherence scores indicate more semantically coherent topics.<br>
                        Higher diversity scores indicate less overlap between topics.
                    </p>
                </div>
                """
                output_components.append(gr.HTML(coherence_html))

            # Visualize topics
            topics = topic_results.get("topics", [])
            if topics:
                output_components.append(gr.Markdown("### Discovered Topics"))

                # Create a topic word cloud using HTML/CSS for better visibility
                for topic in topics:
                    topic_id = topic.get("id", 0)
                    words = topic.get("words", [])
                    weights = topic.get("weights", [])

                    if words and weights and len(words) == len(weights):
                        # Generate a word-cloud-like div using HTML/CSS
                        word_cloud_html = f"""
                        <div style="margin: 10px 0; padding: 10px; border: 1px solid #ddd; border-radius: 5px;">
                            <h4>Topic {topic_id + 1}</h4>
                            <div>
                        """

                        # Sort words by weight for better visualization
                        word_weight_pairs = sorted(zip(words, weights), key=lambda x: x[1], reverse=True)

                        # Add each word with size based on weight
                        for word, weight in word_weight_pairs:
                            # Scale weight to a reasonable font size (min 14px, max 28px)
                            font_size = 14 + min(14, round(weight * 30))
                            # Color based on weight (lower lightness = darker = higher weight)
                            color_intensity = max(0, min(90, int(100 - weight * 100)))
                            color = f"hsl(210, 70%, {color_intensity}%)"
                            word_cloud_html += (
                                f'<span style="font-size: {font_size}px; color: {color}; '
                                f'margin: 4px; display: inline-block;">{word}</span>'
                            )

                        word_cloud_html += """
                            </div>
                        </div>
                        """
                        output_components.append(gr.HTML(word_cloud_html))

                # Add a proper bar chart visualization for topic words
                for topic in topics[:3]:  # Show charts for at most 3 topics to avoid clutter
                    topic_id = topic.get("id", 0)
                    words = topic.get("words", [])
                    weights = topic.get("weights", [])

                    if words and weights and len(words) == len(weights):
                        # Create dataframe for plotting, sorted by weight,
                        # limited to the top 10 words for clarity
                        df = pd.DataFrame({"word": words, "weight": weights})
                        df = df.sort_values("weight", ascending=False).head(10)

                        # Create horizontal bar chart
                        fig = px.bar(
                            df,
                            x="weight",
                            y="word",
                            title=f"Topic {topic_id + 1} Top Words",
                            labels={"word": "Word", "weight": "Weight"},
                            height=300,
                            orientation="h",
                        )
                        # Improve layout
                        fig.update_layout(
                            margin=dict(l=10, r=10, t=40, b=10),
                            yaxis={"categoryorder": "total ascending"},
                        )
                        output_components.append(gr.Plot(value=fig))

            # Visualize topic distributions for each model
            model_topics = topic_results.get("model_topics", {})
            if model_topics and all(model in model_topics for model in models):
                output_components.append(gr.Markdown("### Topic Distribution by Model"))

                # Create multi-model topic distribution comparison
                distribution_data = []
                for model in models:
                    for i, weight in enumerate(model_topics[model]):
                        if i < 10:  # Limit to 10 topics max
                            distribution_data.append({
                                "Model": model,
                                "Topic": f"Topic {i + 1}",
                                "Weight": weight,
                            })

                if distribution_data:
                    df = pd.DataFrame(distribution_data)
                    # Create grouped bar chart
                    fig = px.bar(
                        df,
                        x="Topic",
                        y="Weight",
                        color="Model",
                        barmode="group",
                        title="Topic Distribution Comparison",
                        height=400,
                    )
                    output_components.append(gr.Plot(value=fig))

            # Visualize topic differences as a divergence meter. Guard on the
            # model count, since comparison_key indexes models[0] and models[1].
            comparisons = topic_results.get("comparisons", {})
            if comparisons and len(models) >= 2:
                comparison_key = f"{models[0]} vs {models[1]}"
                if comparison_key in comparisons:
                    output_components.append(gr.Markdown("### Topic Similarity Analysis"))

                    # Get JS divergence
                    js_divergence = comparisons[comparison_key].get("js_divergence", 0)

                    # Create a divergence meter: a gradient bar with a marker
                    # placed at the divergence score
                    divergence_html = f"""
                    <div style="margin: 10px 0; padding: 10px; border: 1px solid #ddd; border-radius: 5px;">
                        <h4>Topic Distribution Divergence</h4>
                        <div style="display: flex; justify-content: space-between;">
                            <span>Similar (0.0)</span>
                            <span>Different (1.0)</span>
                        </div>
                        <div style="background: linear-gradient(to right, #2ecc71, #e74c3c); height: 10px; border-radius: 5px; position: relative;">
                            <div style="position: absolute; left: {js_divergence * 100:.1f}%; top: -5px; width: 4px; height: 20px; background: #333;"></div>
                        </div>
                        <p><b>Score: {js_divergence:.3f}</b></p>
                        <p>
                            Jensen-Shannon Divergence measures the similarity between topic distributions.<br>
                            Lower values indicate more similar topic distributions between models.
                        </p>
                    </div>
                    """
                    output_components.append(gr.HTML(divergence_html))

                    # Create similarity matrix heatmap if available. Rows are
                    # models[0]'s topics, columns are models[1]'s topics.
                    similarity_matrix = topic_results.get("similarity_matrix", [])
                    if similarity_matrix:
                        fig = go.Figure(data=go.Heatmap(
                            z=similarity_matrix,
                            x=[f"{models[1]} Topic {i + 1}" for i in range(len(similarity_matrix[0]))],
                            y=[f"{models[0]} Topic {i + 1}" for i in range(len(similarity_matrix))],
                            colorscale="Viridis",
                            showscale=True,
                            colorbar=dict(title="Similarity"),
                        ))
                        fig.update_layout(
                            title="Topic Similarity Matrix",
                            height=400,
                            margin=dict(l=50, r=50, t=50, b=50),
                        )
                        output_components.append(gr.Plot(value=fig))
""" # Sort by similarity, highest first sorted_matches = sorted(matched_topics, key=lambda x: x['similarity'], reverse=True) for match in sorted_matches: # Format words with commas words1 = ", ".join(match["set1_topic_words"][:5]) # Show top 5 words words2 = ", ".join(match["set2_topic_words"][:5]) # Show top 5 words # Calculate color based on similarity (green for high, red for low) similarity = match["similarity"] color = f"hsl({int(120 * similarity)}, 70%, 50%)" matched_topics_html += f""" """ matched_topics_html += """
Topic Pair Top Words in Model 1 Top Words in Model 2 Similarity
{models[0]} Topic {match['set1_topic_id']+1} ↔ {models[1]} Topic {match['set2_topic_id']+1} {words1} {words2} {similarity:.2f}
""" output_components.append(gr.HTML(matched_topics_html)) # If no components were added, show a message if len(output_components) <= 1: output_components.append(gr.Markdown("No detailed Topic Modeling analysis found in results.")) return output_components def process_and_visualize_topic_analysis(analysis_results): """ Process the topic modeling analysis results and create visualization components Args: analysis_results (dict): The analysis results Returns: list: List of gradio components for visualization """ try: print(f"Starting visualization of topic modeling analysis results") components = create_topic_visualization(analysis_results) print(f"Completed topic modeling visualization with {len(components)} components") return components except Exception as e: import traceback error_msg = f"Topic modeling visualization error: {str(e)}\n{traceback.format_exc()}" print(error_msg) return [ gr.Markdown(f"**Error during topic modeling visualization:**"), gr.Markdown(f"```\n{str(e)}\n```"), gr.Markdown("Try adjusting the number of topics or using longer text inputs.") ]