Spaces:
Sleeping
Sleeping
Ryan
committed on
Commit
·
30bc4e7
1
Parent(s):
fc52d23
update
Browse files
- .idea/workspace.xml +4 -2
- processors/topic_modeling.py +5 -95
.idea/workspace.xml
CHANGED
@@ -4,7 +4,9 @@
|
|
4 |
<option name="autoReloadType" value="SELECTIVE" />
|
5 |
</component>
|
6 |
<component name="ChangeListManager">
|
7 |
-
<list default="true" id="8e67814c-7f04-433c-ab7a-2b65a1106d4c" name="Changes" comment="" />
|
|
|
|
|
8 |
<option name="SHOW_DIALOG" value="false" />
|
9 |
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
10 |
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
|
@@ -53,7 +55,7 @@
|
|
53 |
<option name="presentableId" value="Default" />
|
54 |
<updated>1745170754325</updated>
|
55 |
<workItem from="1745170755404" duration="245000" />
|
56 |
-
<workItem from="1745172030020" duration="
|
57 |
</task>
|
58 |
<servers />
|
59 |
</component>
|
|
|
4 |
<option name="autoReloadType" value="SELECTIVE" />
|
5 |
</component>
|
6 |
<component name="ChangeListManager">
|
7 |
+
<list default="true" id="8e67814c-7f04-433c-ab7a-2b65a1106d4c" name="Changes" comment="">
|
8 |
+
<change beforePath="$PROJECT_DIR$/processors/topic_modeling.py" beforeDir="false" afterPath="$PROJECT_DIR$/processors/topic_modeling.py" afterDir="false" />
|
9 |
+
</list>
|
10 |
<option name="SHOW_DIALOG" value="false" />
|
11 |
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
12 |
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
|
|
|
55 |
<option name="presentableId" value="Default" />
|
56 |
<updated>1745170754325</updated>
|
57 |
<workItem from="1745170755404" duration="245000" />
|
58 |
+
<workItem from="1745172030020" duration="11940000" />
|
59 |
</task>
|
60 |
<servers />
|
61 |
</component>
|
processors/topic_modeling.py
CHANGED
@@ -83,10 +83,12 @@ def extract_topics(texts, n_topics=3, n_top_words=10, method="lda"):
|
|
83 |
# Create document-term matrix
|
84 |
if method == "nmf":
|
85 |
# For NMF, use TF-IDF vectorization
|
86 |
-
|
|
|
87 |
else:
|
88 |
# For LDA, use CountVectorizer
|
89 |
-
|
|
|
90 |
|
91 |
X = vectorizer.fit_transform(preprocessed_texts)
|
92 |
feature_names = vectorizer.get_feature_names_out()
|
@@ -113,96 +115,4 @@ def extract_topics(texts, n_topics=3, n_top_words=10, method="lda"):
|
|
113 |
"distribution": normalized_dist.tolist()
|
114 |
})
|
115 |
|
116 |
-
return result
|
117 |
-
|
118 |
-
def compare_topics(response_texts, model_names, n_topics=3, n_top_words=10, method="lda"):
|
119 |
-
"""
|
120 |
-
Compare topic distributions between different model responses
|
121 |
-
|
122 |
-
Args:
|
123 |
-
response_texts (list): List of response texts to compare
|
124 |
-
model_names (list): Names of models corresponding to responses
|
125 |
-
n_topics (int): Number of topics to extract
|
126 |
-
n_top_words (int): Number of top words per topic
|
127 |
-
method (str): Topic modeling method ('lda' or 'nmf')
|
128 |
-
|
129 |
-
Returns:
|
130 |
-
dict: Comparative topic analysis
|
131 |
-
"""
|
132 |
-
# Initialize results
|
133 |
-
result = {
|
134 |
-
"models": model_names,
|
135 |
-
"method": method,
|
136 |
-
"n_topics": n_topics,
|
137 |
-
"topics": [],
|
138 |
-
"model_topics": {},
|
139 |
-
"comparisons": {}
|
140 |
-
}
|
141 |
-
|
142 |
-
# Extract topics
|
143 |
-
topic_model = extract_topics(response_texts, n_topics, n_top_words, method)
|
144 |
-
result["topics"] = topic_model["topics"]
|
145 |
-
|
146 |
-
# Map topic distributions to models
|
147 |
-
for i, model_name in enumerate(model_names):
|
148 |
-
if i < len(topic_model["document_topics"]):
|
149 |
-
result["model_topics"][model_name] = topic_model["document_topics"][i]["distribution"]
|
150 |
-
|
151 |
-
# Calculate topic distribution differences for pairs of models
|
152 |
-
if len(model_names) >= 2:
|
153 |
-
for i in range(len(model_names)):
|
154 |
-
for j in range(i+1, len(model_names)):
|
155 |
-
model1, model2 = model_names[i], model_names[j]
|
156 |
-
|
157 |
-
# Get topic distributions
|
158 |
-
dist1 = result["model_topics"].get(model1, [])
|
159 |
-
dist2 = result["model_topics"].get(model2, [])
|
160 |
-
|
161 |
-
# Skip if distributions are not available
|
162 |
-
if not dist1 or not dist2 or len(dist1) != len(dist2):
|
163 |
-
continue
|
164 |
-
|
165 |
-
# Calculate Jensen-Shannon divergence (approximation using average of KL divergences)
|
166 |
-
dist1 = np.array(dist1)
|
167 |
-
dist2 = np.array(dist2)
|
168 |
-
|
169 |
-
# Add small epsilon to avoid division by zero
|
170 |
-
epsilon = 1e-10
|
171 |
-
dist1 = dist1 + epsilon
|
172 |
-
dist2 = dist2 + epsilon
|
173 |
-
|
174 |
-
# Normalize
|
175 |
-
dist1 = dist1 / np.sum(dist1)
|
176 |
-
dist2 = dist2 / np.sum(dist2)
|
177 |
-
|
178 |
-
# Calculate average distribution
|
179 |
-
avg_dist = (dist1 + dist2) / 2
|
180 |
-
|
181 |
-
# Calculate KL divergences
|
182 |
-
kl_div1 = np.sum(dist1 * np.log(dist1 / avg_dist))
|
183 |
-
kl_div2 = np.sum(dist2 * np.log(dist2 / avg_dist))
|
184 |
-
|
185 |
-
# Jensen-Shannon divergence
|
186 |
-
js_div = (kl_div1 + kl_div2) / 2
|
187 |
-
|
188 |
-
# Topic-wise differences
|
189 |
-
topic_diffs = []
|
190 |
-
for t in range(len(dist1)):
|
191 |
-
topic_diffs.append({
|
192 |
-
"topic_id": t,
|
193 |
-
"model1_weight": float(dist1[t]),
|
194 |
-
"model2_weight": float(dist2[t]),
|
195 |
-
"diff": float(abs(dist1[t] - dist2[t]))
|
196 |
-
})
|
197 |
-
|
198 |
-
# Sort by difference
|
199 |
-
topic_diffs.sort(key=lambda x: x["diff"], reverse=True)
|
200 |
-
|
201 |
-
# Store comparison
|
202 |
-
comparison_key = f"{model1} vs {model2}"
|
203 |
-
result["comparisons"][comparison_key] = {
|
204 |
-
"js_divergence": float(js_div),
|
205 |
-
"topic_differences": topic_diffs
|
206 |
-
}
|
207 |
-
|
208 |
-
return result
|
|
|
83 |
# Create document-term matrix
|
84 |
if method == "nmf":
|
85 |
# For NMF, use TF-IDF vectorization
|
86 |
+
# Adjust min_df and max_df for small document sets
|
87 |
+
vectorizer = TfidfVectorizer(max_features=1000, min_df=1, max_df=1.0)
|
88 |
else:
|
89 |
# For LDA, use CountVectorizer
|
90 |
+
# Adjust min_df and max_df for small document sets
|
91 |
+
vectorizer = CountVectorizer(max_features=1000, min_df=1, max_df=1.0)
|
92 |
|
93 |
X = vectorizer.fit_transform(preprocessed_texts)
|
94 |
feature_names = vectorizer.get_feature_names_out()
|
|
|
115 |
"distribution": normalized_dist.tolist()
|
116 |
})
|
117 |
|
118 |
+
return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|