Ryan committed on
Commit
1b72959
·
1 Parent(s): 4b11d86
Files changed (1) hide show
  1. processors/topic_modeling.py +57 -0
processors/topic_modeling.py CHANGED
@@ -115,4 +115,61 @@ def extract_topics(texts, n_topics=3, n_top_words=10, method="lda"):
115
  "distribution": normalized_dist.tolist()
116
  })
117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  return result
 
115
  "distribution": normalized_dist.tolist()
116
  })
117
 
118
+ return result
119
+
120
def compare_topics(texts_set_1, texts_set_2, n_topics=3, n_top_words=10, method="lda"):
    """
    Compare topics between two sets of texts.

    Topics are extracted independently for each set via ``extract_topics``
    and compared pairwise with the Jaccard similarity of their top-word
    sets. Every topic of the first set is then paired with its most similar
    topic of the second set. Matching is done per set-1 topic, so the same
    set-2 topic may be selected more than once.

    Args:
        texts_set_1 (list): First list of text documents
        texts_set_2 (list): Second list of text documents
        n_topics (int): Number of topics to extract
        n_top_words (int): Number of top words per topic
        method (str): Topic modeling method ('lda' or 'nmf')

    Returns:
        dict: Comparison results with the keys ``method``, ``n_topics``,
            ``set1_topics``, ``set2_topics``, ``similarity_matrix`` (one row
            per set-1 topic, one column per set-2 topic), ``matched_topics``
            (best set-2 match for each set-1 topic) and
            ``average_similarity`` (mean similarity of the matched pairs,
            ``0.0`` when there are no matches). All indices and scores are
            native Python ``int``/``float`` values.
    """
    # Extract topics for each set
    topics_1 = extract_topics(texts_set_1, n_topics, n_top_words, method)["topics"]
    topics_2 = extract_topics(texts_set_2, n_topics, n_top_words, method)["topics"]

    # The set-2 word sets do not depend on the set-1 topic, so build them once.
    word_sets_2 = [set(topic["words"]) for topic in topics_2]

    # Jaccard similarity (intersection over union) for every topic pair
    similarity_matrix = []
    for topic in topics_1:
        words_1 = set(topic["words"])
        row = []
        for words_2 in word_sets_2:
            union_size = len(words_1 | words_2)
            # Two empty word sets have no defined ratio; treat as dissimilar.
            row.append(len(words_1 & words_2) / union_size if union_size else 0.0)
        similarity_matrix.append(row)

    # Find the best matching set-2 topic for each set-1 topic
    matched_topics = []
    for set1_id, row in enumerate(similarity_matrix):
        if not row:
            # Set 2 produced no topics, so there is nothing to match against.
            continue
        # max() returns the first index on ties and yields a plain int, which
        # keeps the result serializable (a NumPy integer index would not be).
        best_id = max(range(len(row)), key=row.__getitem__)
        matched_topics.append({
            "set1_topic_id": set1_id,
            "set1_topic_words": topics_1[set1_id]["words"],
            "set2_topic_id": best_id,
            "set2_topic_words": topics_2[best_id]["words"],
            "similarity": row[best_id],
        })

    # Avoid a NaN average when no pairs could be matched.
    if matched_topics:
        average_similarity = (
            sum(match["similarity"] for match in matched_topics) / len(matched_topics)
        )
    else:
        average_similarity = 0.0

    # Construct result
    return {
        "method": method,
        "n_topics": n_topics,
        "set1_topics": topics_1,
        "set2_topics": topics_2,
        "similarity_matrix": similarity_matrix,
        "matched_topics": matched_topics,
        "average_similarity": average_similarity,
    }