Ryan committed on
Commit
14bac19
·
1 Parent(s): 7f40410
processors/topic_modeling.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Topic modeling processor for comparing text responses
3
+ """
4
+ from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
5
+ from sklearn.decomposition import LatentDirichletAllocation, NMF
6
+ import numpy as np
7
+ import nltk
8
+ from nltk.corpus import stopwords
9
+ import re
10
+
11
def preprocess_text(text):
    """
    Preprocess text for topic modeling

    Lowercases the text, replaces every character that is not an ASCII
    letter or whitespace with a space, tokenizes with NLTK, and drops
    English stopwords together with tokens of three characters or fewer.

    Args:
        text (str): Text to preprocess

    Returns:
        str: Preprocessed text as space-separated tokens; an empty string
            when the input is empty or None
    """
    # Nothing to tokenize; also keeps a missing (None) response from crashing
    if not text:
        return ""

    # Convert to lowercase
    text = text.lower()

    # Replace (rather than delete) special characters and digits with a
    # space. Deleting them fuses neighbours together: "don't" -> "dont" and
    # "well-known" -> "wellknown", junk tokens that are not in the stopword
    # list and would otherwise leak into the topics.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Tokenize
    tokens = nltk.word_tokenize(text)

    # Remove stopwords and very short tokens
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and len(token) > 3]

    return ' '.join(tokens)
35
+
36
def get_top_words_per_topic(model, feature_names, n_top_words=10):
    """
    Summarize each topic of a fitted model by its highest-weighted words

    Args:
        model: Topic model (LDA or NMF) exposing a ``components_`` matrix
        feature_names (list): Feature names (words), indexed like the
            columns of ``components_``
        n_top_words (int): Number of top words to include per topic

    Returns:
        list: One dict per topic with "id", "words" and "weights", ordered
            from the highest weight to the lowest
    """
    def describe(topic_id, weights):
        # Indices ordered by descending weight, cut to the requested count
        ranked = weights.argsort()[::-1][:n_top_words]
        return {
            "id": topic_id,
            "words": [feature_names[idx] for idx in ranked],
            "weights": weights[ranked].tolist()
        }

    return [
        describe(topic_id, weights)
        for topic_id, weights in enumerate(model.components_)
    ]
59
+
60
def extract_topics(texts, n_topics=3, n_top_words=10, method="lda"):
    """
    Extract topics from a list of texts

    Documents are preprocessed, vectorized (TF-IDF for NMF, raw counts for
    LDA) and factorized into ``n_topics`` topics.

    The vectorizer first tries to prune rare and ubiquitous terms
    (min_df=2, max_df=0.85). Very small corpora cannot satisfy those
    thresholds: with two documents 0.85 * 2 < 2, so scikit-learn raises
    ValueError before any term is kept. When that happens the vocabulary
    is rebuilt without document-frequency pruning.

    Args:
        texts (list): List of text documents
        n_topics (int): Number of topics to extract
        n_top_words (int): Number of top words per topic
        method (str): Topic modeling method ('lda' or 'nmf'); any value
            other than 'nmf' selects LDA

    Returns:
        dict: Topic modeling results with "method", "n_topics", "topics"
            and "document_topics". The last two are empty lists when
            ``texts`` is empty or no vocabulary survives preprocessing.
    """
    result = {
        "method": method,
        "n_topics": n_topics,
        "topics": [],
        "document_topics": []
    }

    # Preprocess texts
    preprocessed_texts = [preprocess_text(text) for text in texts]
    if not preprocessed_texts:
        return result

    # Create document-term matrix: TF-IDF for NMF, raw counts for LDA
    vectorizer_cls = TfidfVectorizer if method == "nmf" else CountVectorizer
    try:
        vectorizer = vectorizer_cls(max_features=1000, min_df=2, max_df=0.85)
        X = vectorizer.fit_transform(preprocessed_texts)
    except ValueError:
        # Pruning thresholds are infeasible or removed every term (typical
        # when comparing just two responses): keep the whole vocabulary.
        try:
            vectorizer = vectorizer_cls(max_features=1000, min_df=1, max_df=1.0)
            X = vectorizer.fit_transform(preprocessed_texts)
        except ValueError:
            # Empty vocabulary (e.g. only stopwords): nothing to model
            return result

    feature_names = vectorizer.get_feature_names_out()

    # Apply topic modeling
    if method == "nmf":
        # Non-negative Matrix Factorization
        model = NMF(n_components=n_topics, random_state=42, max_iter=1000)
    else:
        # Latent Dirichlet Allocation
        model = LatentDirichletAllocation(n_components=n_topics, random_state=42, max_iter=20)

    topic_distribution = model.fit_transform(X)

    # Get top words for each topic
    result["topics"] = get_top_words_per_topic(model, feature_names, n_top_words)

    # Get topic distribution for each document
    for i, dist in enumerate(topic_distribution):
        # Normalize to sum to 1 for easier comparison; an all-zero row is
        # left untouched to avoid dividing by zero
        total = np.sum(dist)
        normalized_dist = dist / total if total > 0 else dist
        result["document_topics"].append({
            "document_id": i,
            "distribution": normalized_dist.tolist()
        })

    return result
117
+
118
def compare_topics(response_texts, model_names, n_topics=3, n_top_words=10, method="lda"):
    """
    Compare topic distributions between different model responses

    Fits one topic model over all responses, assigns each document's topic
    distribution to the model name at the same position, and scores every
    pair of models with the Jensen-Shannon divergence (natural log) plus a
    per-topic weight difference.

    Args:
        response_texts (list): List of response texts to compare
        model_names (list): Names of models corresponding to responses
        n_topics (int): Number of topics to extract
        n_top_words (int): Number of top words per topic
        method (str): Topic modeling method ('lda' or 'nmf')

    Returns:
        dict: Comparative topic analysis with "models", "method",
            "n_topics", "topics", "model_topics" and "comparisons"
    """
    topic_model = extract_topics(response_texts, n_topics, n_top_words, method)

    # zip stops at the shorter sequence, so a model name without a matching
    # document simply gets no distribution
    distributions = {
        name: document["distribution"]
        for name, document in zip(model_names, topic_model["document_topics"])
    }

    # Small epsilon keeps log() and the division away from zero
    epsilon = 1e-10

    def smoothed(distribution):
        shifted = np.array(distribution) + epsilon
        return shifted / np.sum(shifted)

    comparisons = {}
    for position, model1 in enumerate(model_names):
        for model2 in model_names[position + 1:]:
            raw1 = distributions.get(model1, [])
            raw2 = distributions.get(model2, [])

            # Only comparable when both exist and have the same topic count
            if not (raw1 and raw2 and len(raw1) == len(raw2)):
                continue

            p = smoothed(raw1)
            q = smoothed(raw2)
            midpoint = (p + q) / 2

            # Jensen-Shannon divergence: mean KL divergence to the midpoint
            kl_p = np.sum(p * np.log(p / midpoint))
            kl_q = np.sum(q * np.log(q / midpoint))
            js_div = (kl_p + kl_q) / 2

            # Per-topic differences, largest gap first
            topic_diffs = sorted(
                (
                    {
                        "topic_id": topic_id,
                        "model1_weight": float(weight1),
                        "model2_weight": float(weight2),
                        "diff": float(abs(weight1 - weight2))
                    }
                    for topic_id, (weight1, weight2) in enumerate(zip(p, q))
                ),
                key=lambda entry: entry["diff"],
                reverse=True
            )

            comparisons[f"{model1} vs {model2}"] = {
                "js_divergence": float(js_div),
                "topic_differences": topic_diffs
            }

    return {
        "models": model_names,
        "method": method,
        "n_topics": n_topics,
        "topics": topic_model["topics"],
        "model_topics": distributions,
        "comparisons": comparisons
    }
visualization/__init__.py CHANGED
@@ -3,5 +3,11 @@ Visualization components for LLM Response Comparator
3
  """
4
 
5
  from .bow_visualizer import process_and_visualize_analysis
 
 
6
 
7
- __all__ = ['process_and_visualize_analysis']
 
 
 
 
 
3
  """
4
 
5
  from .bow_visualizer import process_and_visualize_analysis
6
+ from .topic_visualizer import process_and_visualize_topic_analysis
7
+ from .ngram_visualizer import process_and_visualize_ngram_analysis
8
 
9
+ __all__ = [
10
+ 'process_and_visualize_analysis',
11
+ 'process_and_visualize_topic_analysis',
12
+ 'process_and_visualize_ngram_analysis'
13
+ ]
visualization/topic_visualizer.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Visualization for topic modeling analysis results
3
+ """
4
+ from visualization.ngram_visualizer import create_ngram_visualization
5
+ import gradio as gr
6
+ import json
7
+ import numpy as np
8
+ import pandas as pd
9
+ import plotly.express as px
10
+ import plotly.graph_objects as go
11
+ from plotly.subplots import make_subplots
12
+
13
def _topic_words_figure(topic):
    """Return a bar chart of one topic's top words, or None if its data is unusable."""
    words = topic.get("words", [])
    weights = topic.get("weights", [])
    if not words or not weights or len(words) != len(weights):
        return None

    topic_id = topic.get("id", 0)
    df = pd.DataFrame({'word': words, 'weight': weights})
    df = df.sort_values('weight', ascending=False)
    return px.bar(
        df, x='word', y='weight',
        title=f"Topic {topic_id+1} Top Words",
        labels={'word': 'Word', 'weight': 'Weight'},
        height=300
    )


def _grouped_topic_bars(series, title):
    """Return a grouped bar chart with one trace per (name, x, y) entry in series."""
    fig = go.Figure()
    for name, x_labels, y_values in series:
        fig.add_trace(go.Bar(x=x_labels, y=y_values, name=name))
    fig.update_layout(
        title=title,
        xaxis_title="Topic",
        yaxis_title="Weight",
        barmode='group',
        height=400
    )
    return fig


def create_topic_visualization(analysis_results):
    """
    Create visualizations for topic modeling analysis results

    For every prompt that carries a "topic_modeling" entry this emits a
    header, one bar chart per discovered topic, a grouped chart of the
    per-model topic distributions, and one grouped chart per pairwise
    comparison.

    Args:
        analysis_results (dict): Analysis results from the topic modeling
            analysis; the per-prompt results are read from its "analyses" key

    Returns:
        list: List of gradio components with visualizations
    """
    # Check if we have valid results
    if not analysis_results or "analyses" not in analysis_results:
        return [gr.Markdown("No analysis results found.")]

    output_components = []

    # Process each prompt
    for analyses in analysis_results["analyses"].values():
        if "topic_modeling" not in analyses:
            continue
        topic_results = analyses["topic_modeling"]

        # Show method and number of topics
        method = topic_results.get("method", "lda").upper()
        n_topics = topic_results.get("n_topics", 3)
        output_components.append(gr.Markdown(f"## Topic Modeling Analysis ({method}, {n_topics} topics)"))

        # Show models being compared
        models = topic_results.get("models", [])
        if len(models) >= 2:
            output_components.append(gr.Markdown(f"### Comparing responses from {models[0]} and {models[1]}"))

        # Visualize topics
        topics = topic_results.get("topics", [])
        if topics:
            output_components.append(gr.Markdown("### Discovered Topics"))
            for topic in topics:
                fig = _topic_words_figure(topic)
                if fig is not None:
                    output_components.append(gr.Plot(value=fig))

        # Visualize topic distributions for each model
        model_topics = topic_results.get("model_topics", {})
        if model_topics and all(model in model_topics for model in models):
            output_components.append(gr.Markdown("### Topic Distribution by Model"))
            series = [
                (
                    model,
                    [f"Topic {i+1}" for i in range(len(model_topics[model]))],
                    model_topics[model]
                )
                for model in models
            ]
            output_components.append(
                gr.Plot(value=_grouped_topic_bars(series, "Topic Distributions Comparison"))
            )

        # Visualize topic differences
        comparisons = topic_results.get("comparisons", {})
        if comparisons:
            output_components.append(gr.Markdown("### Topic Distribution Differences"))

            for comparison_key, comparison_data in comparisons.items():
                js_divergence = comparison_data.get("js_divergence", 0)
                topic_differences = comparison_data.get("topic_differences", [])

                output_components.append(gr.Markdown(
                    f"**{comparison_key}** - Jensen-Shannon Divergence: {js_divergence:.4f}"
                ))

                if not topic_differences:
                    continue

                # partition() never raises, unlike unpacking split(" vs "),
                # which failed whenever a model name itself contained " vs "
                model1, _, model2 = comparison_key.partition(" vs ")
                topic_labels = [f"Topic {d['topic_id']+1}" for d in topic_differences]
                series = [
                    (model1, topic_labels, [d["model1_weight"] for d in topic_differences]),
                    (model2, topic_labels, [d["model2_weight"] for d in topic_differences])
                ]
                output_components.append(
                    gr.Plot(value=_grouped_topic_bars(series, "Topic Weight Comparison"))
                )

    # If no components were added, show a message
    if len(output_components) <= 1:
        output_components.append(gr.Markdown("No detailed Topic Modeling analysis found in results."))

    return output_components
149
+
150
+
151
def process_and_visualize_topic_analysis(analysis_results):
    """
    Build the topic modeling visualization components, reporting failures inline

    Any exception raised while building the visualization is printed with
    its traceback and returned as a single Markdown error component instead
    of propagating to the caller.

    Args:
        analysis_results (dict): The analysis results

    Returns:
        list: List of gradio components for visualization
    """
    try:
        print("Starting visualization of topic modeling analysis results")
        components = create_topic_visualization(analysis_results)
    except Exception as exc:
        import traceback
        error_msg = f"Topic modeling visualization error: {str(exc)}\n{traceback.format_exc()}"
        print(error_msg)
        return [gr.Markdown(f"**Error during topic modeling visualization:**\n\n```\n{error_msg}\n```")]
    return components