Ryan committed
Commit 30bc4e7 · 1 Parent(s): fc52d23
.idea/workspace.xml CHANGED
@@ -4,7 +4,9 @@
     <option name="autoReloadType" value="SELECTIVE" />
   </component>
   <component name="ChangeListManager">
-    <list default="true" id="8e67814c-7f04-433c-ab7a-2b65a1106d4c" name="Changes" comment="" />
+    <list default="true" id="8e67814c-7f04-433c-ab7a-2b65a1106d4c" name="Changes" comment="">
+      <change beforePath="$PROJECT_DIR$/processors/topic_modeling.py" beforeDir="false" afterPath="$PROJECT_DIR$/processors/topic_modeling.py" afterDir="false" />
+    </list>
     <option name="SHOW_DIALOG" value="false" />
     <option name="HIGHLIGHT_CONFLICTS" value="true" />
     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
@@ -53,7 +55,7 @@
       <option name="presentableId" value="Default" />
       <updated>1745170754325</updated>
       <workItem from="1745170755404" duration="245000" />
-      <workItem from="1745172030020" duration="11341000" />
+      <workItem from="1745172030020" duration="11940000" />
     </task>
     <servers />
   </component>
processors/topic_modeling.py CHANGED
@@ -83,10 +83,12 @@ def extract_topics(texts, n_topics=3, n_top_words=10, method="lda"):
     # Create document-term matrix
     if method == "nmf":
         # For NMF, use TF-IDF vectorization
-        vectorizer = TfidfVectorizer(max_features=1000, min_df=2, max_df=0.85)
+        # Adjust min_df and max_df for small document sets
+        vectorizer = TfidfVectorizer(max_features=1000, min_df=1, max_df=1.0)
     else:
         # For LDA, use CountVectorizer
-        vectorizer = CountVectorizer(max_features=1000, min_df=2, max_df=0.85)
+        # Adjust min_df and max_df for small document sets
+        vectorizer = CountVectorizer(max_features=1000, min_df=1, max_df=1.0)
 
     X = vectorizer.fit_transform(preprocessed_texts)
     feature_names = vectorizer.get_feature_names_out()
@@ -113,96 +115,4 @@ def extract_topics(texts, n_topics=3, n_top_words=10, method="lda"):
             "distribution": normalized_dist.tolist()
         })
 
-    return result
-
-def compare_topics(response_texts, model_names, n_topics=3, n_top_words=10, method="lda"):
-    """
-    Compare topic distributions between different model responses
-
-    Args:
-        response_texts (list): List of response texts to compare
-        model_names (list): Names of models corresponding to responses
-        n_topics (int): Number of topics to extract
-        n_top_words (int): Number of top words per topic
-        method (str): Topic modeling method ('lda' or 'nmf')
-
-    Returns:
-        dict: Comparative topic analysis
-    """
-    # Initialize results
-    result = {
-        "models": model_names,
-        "method": method,
-        "n_topics": n_topics,
-        "topics": [],
-        "model_topics": {},
-        "comparisons": {}
-    }
-
-    # Extract topics
-    topic_model = extract_topics(response_texts, n_topics, n_top_words, method)
-    result["topics"] = topic_model["topics"]
-
-    # Map topic distributions to models
-    for i, model_name in enumerate(model_names):
-        if i < len(topic_model["document_topics"]):
-            result["model_topics"][model_name] = topic_model["document_topics"][i]["distribution"]
-
-    # Calculate topic distribution differences for pairs of models
-    if len(model_names) >= 2:
-        for i in range(len(model_names)):
-            for j in range(i+1, len(model_names)):
-                model1, model2 = model_names[i], model_names[j]
-
-                # Get topic distributions
-                dist1 = result["model_topics"].get(model1, [])
-                dist2 = result["model_topics"].get(model2, [])
-
-                # Skip if distributions are not available
-                if not dist1 or not dist2 or len(dist1) != len(dist2):
-                    continue
-
-                # Calculate Jensen-Shannon divergence (approximation using average of KL divergences)
-                dist1 = np.array(dist1)
-                dist2 = np.array(dist2)
-
-                # Add small epsilon to avoid division by zero
-                epsilon = 1e-10
-                dist1 = dist1 + epsilon
-                dist2 = dist2 + epsilon
-
-                # Normalize
-                dist1 = dist1 / np.sum(dist1)
-                dist2 = dist2 / np.sum(dist2)
-
-                # Calculate average distribution
-                avg_dist = (dist1 + dist2) / 2
-
-                # Calculate KL divergences
-                kl_div1 = np.sum(dist1 * np.log(dist1 / avg_dist))
-                kl_div2 = np.sum(dist2 * np.log(dist2 / avg_dist))
-
-                # Jensen-Shannon divergence
-                js_div = (kl_div1 + kl_div2) / 2
-
-                # Topic-wise differences
-                topic_diffs = []
-                for t in range(len(dist1)):
-                    topic_diffs.append({
-                        "topic_id": t,
-                        "model1_weight": float(dist1[t]),
-                        "model2_weight": float(dist2[t]),
-                        "diff": float(abs(dist1[t] - dist2[t]))
-                    })
-
-                # Sort by difference
-                topic_diffs.sort(key=lambda x: x["diff"], reverse=True)
-
-                # Store comparison
-                comparison_key = f"{model1} vs {model2}"
-                result["comparisons"][comparison_key] = {
-                    "js_divergence": float(js_div),
-                    "topic_differences": topic_diffs
-                }
-
-    return result
+    return result
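Note on the vectorizer change above: with only a handful of documents to model (for example, one response per model), min_df=2 combined with max_df=0.85 can prune every term, and scikit-learn refuses to fit at all, which is presumably what this commit works around. A minimal sketch of the failure and of the relaxed settings, using a made-up two-document corpus:

# With 2 documents, max_df=0.85 caps document frequency at 0.85 * 2 = 1.7
# documents, which is below min_df=2, so scikit-learn raises
# "max_df corresponds to < documents than min_df" before fitting anything.
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["first model response about topics", "second model response about topics"]

try:
    TfidfVectorizer(max_features=1000, min_df=2, max_df=0.85).fit(docs)
except ValueError as err:
    print("old settings:", err)

# The relaxed settings from this commit keep every term, so tiny corpora fit.
X = TfidfVectorizer(max_features=1000, min_df=1, max_df=1.0).fit_transform(docs)
print("new settings, matrix shape:", X.shape)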
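For reference, the compare_topics helper removed in this commit scored each pair of models with a Jensen-Shannon divergence computed as the average of the two KL divergences against the mixture of the distributions. A standalone sketch of that calculation with made-up example distributions; SciPy's jensenshannon is shown only as a cross-check and returns the corresponding distance (the square root of the divergence, natural-log base by default):

import numpy as np
from scipy.spatial.distance import jensenshannon

def js_divergence(p, q, epsilon=1e-10):
    # Smooth, renormalize, and average the two KL divergences to the mixture,
    # mirroring the removed compare_topics code.
    p = np.asarray(p, dtype=float) + epsilon
    q = np.asarray(q, dtype=float) + epsilon
    p, q = p / p.sum(), q / q.sum()
    m = (p + q) / 2
    return float(0.5 * np.sum(p * np.log(p / m)) + 0.5 * np.sum(q * np.log(q / m)))

dist1, dist2 = [0.6, 0.3, 0.1], [0.2, 0.5, 0.3]
print(js_divergence(dist1, dist2))       # average-of-KL form used by the removed code
print(jensenshannon(dist1, dist2) ** 2)  # SciPy's JS distance squared agrees closely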