Ryan committed · 14bac19
Parent(s): 7f40410

update

Files changed:
- processors/topic_modeling.py +208 -0
- visualization/__init__.py +7 -1
- visualization/topic_visualizer.py +168 -0
processors/topic_modeling.py
ADDED
@@ -0,0 +1,208 @@
+"""
+Topic modeling processor for comparing text responses
+"""
+from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
+from sklearn.decomposition import LatentDirichletAllocation, NMF
+import numpy as np
+import nltk
+from nltk.corpus import stopwords
+import re
+
+def preprocess_text(text):
+    """
+    Preprocess text for topic modeling
+
+    Args:
+        text (str): Text to preprocess
+
+    Returns:
+        str: Preprocessed text
+    """
+    # Convert to lowercase
+    text = text.lower()
+
+    # Remove special characters and digits
+    text = re.sub(r'[^a-zA-Z\s]', '', text)
+
+    # Tokenize
+    tokens = nltk.word_tokenize(text)
+
+    # Remove stopwords
+    stop_words = set(stopwords.words('english'))
+    tokens = [token for token in tokens if token not in stop_words and len(token) > 3]
+
+    return ' '.join(tokens)
+
+def get_top_words_per_topic(model, feature_names, n_top_words=10):
+    """
+    Get the top words for each topic in the model
+
+    Args:
+        model: Topic model (LDA or NMF)
+        feature_names (list): Feature names (words)
+        n_top_words (int): Number of top words to include per topic
+
+    Returns:
+        list: List of topics with their top words
+    """
+    topics = []
+    for topic_idx, topic in enumerate(model.components_):
+        top_words_idx = topic.argsort()[:-n_top_words - 1:-1]
+        top_words = [feature_names[i] for i in top_words_idx]
+        topic_dict = {
+            "id": topic_idx,
+            "words": top_words,
+            "weights": topic[top_words_idx].tolist()
+        }
+        topics.append(topic_dict)
+    return topics
+
+def extract_topics(texts, n_topics=3, n_top_words=10, method="lda"):
+    """
+    Extract topics from a list of texts
+
+    Args:
+        texts (list): List of text documents
+        n_topics (int): Number of topics to extract
+        n_top_words (int): Number of top words per topic
+        method (str): Topic modeling method ('lda' or 'nmf')
+
+    Returns:
+        dict: Topic modeling results with topics and document-topic distributions
+    """
+    result = {
+        "method": method,
+        "n_topics": n_topics,
+        "topics": [],
+        "document_topics": []
+    }
+
+    # Preprocess texts
+    preprocessed_texts = [preprocess_text(text) for text in texts]
+
+    # Create document-term matrix
+    if method == "nmf":
+        # For NMF, use TF-IDF vectorization
+        vectorizer = TfidfVectorizer(max_features=1000, min_df=2, max_df=0.85)
+    else:
+        # For LDA, use CountVectorizer
+        vectorizer = CountVectorizer(max_features=1000, min_df=2, max_df=0.85)
+
+    X = vectorizer.fit_transform(preprocessed_texts)
+    feature_names = vectorizer.get_feature_names_out()
+
+    # Apply topic modeling
+    if method == "nmf":
+        # Non-negative Matrix Factorization
+        model = NMF(n_components=n_topics, random_state=42, max_iter=1000)
+    else:
+        # Latent Dirichlet Allocation
+        model = LatentDirichletAllocation(n_components=n_topics, random_state=42, max_iter=20)
+
+    topic_distribution = model.fit_transform(X)
+
+    # Get top words for each topic
+    result["topics"] = get_top_words_per_topic(model, feature_names, n_top_words)
+
+    # Get topic distribution for each document
+    for i, dist in enumerate(topic_distribution):
+        # Normalize for easier comparison
+        normalized_dist = dist / np.sum(dist) if np.sum(dist) > 0 else dist
+        result["document_topics"].append({
+            "document_id": i,
+            "distribution": normalized_dist.tolist()
+        })
+
+    return result
+
+def compare_topics(response_texts, model_names, n_topics=3, n_top_words=10, method="lda"):
+    """
+    Compare topic distributions between different model responses
+
+    Args:
+        response_texts (list): List of response texts to compare
+        model_names (list): Names of models corresponding to responses
+        n_topics (int): Number of topics to extract
+        n_top_words (int): Number of top words per topic
+        method (str): Topic modeling method ('lda' or 'nmf')
+
+    Returns:
+        dict: Comparative topic analysis
+    """
+    # Initialize results
+    result = {
+        "models": model_names,
+        "method": method,
+        "n_topics": n_topics,
+        "topics": [],
+        "model_topics": {},
+        "comparisons": {}
+    }
+
+    # Extract topics
+    topic_model = extract_topics(response_texts, n_topics, n_top_words, method)
+    result["topics"] = topic_model["topics"]
+
+    # Map topic distributions to models
+    for i, model_name in enumerate(model_names):
+        if i < len(topic_model["document_topics"]):
+            result["model_topics"][model_name] = topic_model["document_topics"][i]["distribution"]
+
+    # Calculate topic distribution differences for pairs of models
+    if len(model_names) >= 2:
+        for i in range(len(model_names)):
+            for j in range(i+1, len(model_names)):
+                model1, model2 = model_names[i], model_names[j]
+
+                # Get topic distributions
+                dist1 = result["model_topics"].get(model1, [])
+                dist2 = result["model_topics"].get(model2, [])
+
+                # Skip if distributions are not available
+                if not dist1 or not dist2 or len(dist1) != len(dist2):
+                    continue
+
+                # Calculate Jensen-Shannon divergence (average of the KL divergences of each distribution to their mixture)
+                dist1 = np.array(dist1)
+                dist2 = np.array(dist2)
+
+                # Add small epsilon to avoid division by zero
+                epsilon = 1e-10
+                dist1 = dist1 + epsilon
+                dist2 = dist2 + epsilon
+
+                # Normalize
+                dist1 = dist1 / np.sum(dist1)
+                dist2 = dist2 / np.sum(dist2)
+
+                # Calculate average distribution
+                avg_dist = (dist1 + dist2) / 2
+
+                # Calculate KL divergences
+                kl_div1 = np.sum(dist1 * np.log(dist1 / avg_dist))
+                kl_div2 = np.sum(dist2 * np.log(dist2 / avg_dist))
+
+                # Jensen-Shannon divergence
+                js_div = (kl_div1 + kl_div2) / 2
+
+                # Topic-wise differences
+                topic_diffs = []
+                for t in range(len(dist1)):
+                    topic_diffs.append({
+                        "topic_id": t,
+                        "model1_weight": float(dist1[t]),
+                        "model2_weight": float(dist2[t]),
+                        "diff": float(abs(dist1[t] - dist2[t]))
+                    })
+
+                # Sort by difference
+                topic_diffs.sort(key=lambda x: x["diff"], reverse=True)
+
+                # Store comparison
+                comparison_key = f"{model1} vs {model2}"
+                result["comparisons"][comparison_key] = {
+                    "js_divergence": float(js_div),
+                    "topic_differences": topic_diffs
+                }
+
+    return result
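The `js_divergence` stored above is the exact Jensen-Shannon divergence: the mean of the KL divergences from each distribution to their 50/50 mixture, which is symmetric and bounded by ln 2 in nats. A minimal usage sketch of `compare_topics` follows; the response strings and model names are hypothetical, NLTK's `punkt` and `stopwords` data must be available, and because the vectorizers use `min_df=2`, only words shared by at least two responses enter the vocabulary:

```python
import nltk

# One-time downloads for the tokenizer and stopword list used by preprocess_text.
nltk.download("punkt")
nltk.download("stopwords")

from processors.topic_modeling import compare_topics

# Hypothetical responses from two models to the same prompt.
responses = [
    "Solar panels convert sunlight into electricity using photovoltaic cells ...",
    "Solar energy and wind power are renewable sources of electricity ...",
]

result = compare_topics(responses, ["model-a", "model-b"], n_topics=2, method="nmf")

for key, comparison in result["comparisons"].items():
    print(key, "JS divergence:", comparison["js_divergence"])
```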
visualization/__init__.py
CHANGED
@@ -3,5 +3,11 @@ Visualization components for LLM Response Comparator
 """
 
 from .bow_visualizer import process_and_visualize_analysis
+from .topic_visualizer import process_and_visualize_topic_analysis
+from .ngram_visualizer import process_and_visualize_ngram_analysis
 
-__all__ = [
+__all__ = [
+    'process_and_visualize_analysis',
+    'process_and_visualize_topic_analysis',
+    'process_and_visualize_ngram_analysis'
+]
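Since `__init__.py` now re-exports the new visualizers, callers can import them from the package root; a one-line sketch:

```python
# Package-level import enabled by the re-export above.
from visualization import process_and_visualize_topic_analysis
```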
visualization/topic_visualizer.py
ADDED
@@ -0,0 +1,168 @@
+"""
+Visualization for topic modeling analysis results
+"""
+from visualization.ngram_visualizer import create_ngram_visualization
+import gradio as gr
+import json
+import numpy as np
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+
+def create_topic_visualization(analysis_results):
+    """
+    Create visualizations for topic modeling analysis results
+
+    Args:
+        analysis_results (dict): Analysis results from the topic modeling analysis
+
+    Returns:
+        list: List of gradio components with visualizations
+    """
+    # Initialize output components list
+    output_components = []
+
+    # Check if we have valid results
+    if not analysis_results or "analyses" not in analysis_results:
+        return [gr.Markdown("No analysis results found.")]
+
+    # Process each prompt
+    for prompt, analyses in analysis_results["analyses"].items():
+        # Process Topic Modeling analysis if available
+        if "topic_modeling" in analyses:
+            topic_results = analyses["topic_modeling"]
+
+            # Show method and number of topics
+            method = topic_results.get("method", "lda").upper()
+            n_topics = topic_results.get("n_topics", 3)
+            output_components.append(gr.Markdown(f"## Topic Modeling Analysis ({method}, {n_topics} topics)"))
+
+            # Show models being compared
+            models = topic_results.get("models", [])
+            if len(models) >= 2:
+                output_components.append(gr.Markdown(f"### Comparing responses from {models[0]} and {models[1]}"))
+
+            # Visualize topics
+            topics = topic_results.get("topics", [])
+            if topics:
+                output_components.append(gr.Markdown("### Discovered Topics"))
+
+                for topic in topics:
+                    topic_id = topic.get("id", 0)
+                    words = topic.get("words", [])
+                    weights = topic.get("weights", [])
+
+                    # Create topic word bar chart
+                    if words and weights and len(words) == len(weights):
+                        # Create dataframe for plotting
+                        df = pd.DataFrame({
+                            'word': words,
+                            'weight': weights
+                        })
+
+                        # Sort by weight
+                        df = df.sort_values('weight', ascending=False)
+
+                        # Create bar chart
+                        fig = px.bar(
+                            df, x='word', y='weight',
+                            title=f"Topic {topic_id+1} Top Words",
+                            labels={'word': 'Word', 'weight': 'Weight'},
+                            height=300
+                        )
+
+                        output_components.append(gr.Plot(value=fig))
+
+            # Visualize topic distributions for each model
+            model_topics = topic_results.get("model_topics", {})
+            if model_topics and all(model in model_topics for model in models):
+                output_components.append(gr.Markdown("### Topic Distribution by Model"))
+
+                # Create multi-model topic distribution comparison
+                fig = go.Figure()
+                for model in models:
+                    if model in model_topics:
+                        distribution = model_topics[model]
+                        fig.add_trace(go.Bar(
+                            x=[f"Topic {i+1}" for i in range(len(distribution))],
+                            y=distribution,
+                            name=model
+                        ))
+
+                fig.update_layout(
+                    title="Topic Distributions Comparison",
+                    xaxis_title="Topic",
+                    yaxis_title="Weight",
+                    barmode='group',
+                    height=400
+                )
+
+                output_components.append(gr.Plot(value=fig))
+
+            # Visualize topic differences
+            comparisons = topic_results.get("comparisons", {})
+            if comparisons:
+                output_components.append(gr.Markdown("### Topic Distribution Differences"))
+
+                for comparison_key, comparison_data in comparisons.items():
+                    js_divergence = comparison_data.get("js_divergence", 0)
+                    topic_differences = comparison_data.get("topic_differences", [])
+
+                    output_components.append(gr.Markdown(
+                        f"**{comparison_key}** - Jensen-Shannon Divergence: {js_divergence:.4f}"
+                    ))
+
+                    if topic_differences:
+                        # Create DataFrame for plotting
+                        model1, model2 = comparison_key.split(" vs ")
+                        df_diff = pd.DataFrame(topic_differences)
+
+                        # Create bar chart for topic differences
+                        fig = go.Figure()
+                        fig.add_trace(go.Bar(
+                            x=[f"Topic {d['topic_id']+1}" for d in topic_differences],
+                            y=[d["model1_weight"] for d in topic_differences],
+                            name=model1
+                        ))
+                        fig.add_trace(go.Bar(
+                            x=[f"Topic {d['topic_id']+1}" for d in topic_differences],
+                            y=[d["model2_weight"] for d in topic_differences],
+                            name=model2
+                        ))
+
+                        fig.update_layout(
+                            title="Topic Weight Comparison",
+                            xaxis_title="Topic",
+                            yaxis_title="Weight",
+                            barmode='group',
+                            height=400
+                        )
+
+                        output_components.append(gr.Plot(value=fig))
+
+    # If no components were added, show a message
+    if len(output_components) <= 1:
+        output_components.append(gr.Markdown("No detailed Topic Modeling analysis found in results."))
+
+    return output_components
+
+
+def process_and_visualize_topic_analysis(analysis_results):
+    """
+    Process the topic modeling analysis results and create visualization components
+
+    Args:
+        analysis_results (dict): The analysis results
+
+    Returns:
+        list: List of gradio components for visualization
+    """
+    try:
+        print("Starting visualization of topic modeling analysis results")
+        return create_topic_visualization(analysis_results)
+    except Exception as e:
+        import traceback
+        error_msg = f"Topic modeling visualization error: {str(e)}\n{traceback.format_exc()}"
+        print(error_msg)
+        return [gr.Markdown(f"**Error during topic modeling visualization:**\n\n```\n{error_msg}\n```")]