Spaces:
Sleeping
Sleeping
Ryan
committed on
Commit
·
1b72959
1
Parent(s):
4b11d86
update
Browse files- processors/topic_modeling.py +57 -0
processors/topic_modeling.py
CHANGED
@@ -115,4 +115,61 @@ def extract_topics(texts, n_topics=3, n_top_words=10, method="lda"):
|
|
115 |
"distribution": normalized_dist.tolist()
|
116 |
})
|
117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
return result
|
|
|
115 |
"distribution": normalized_dist.tolist()
|
116 |
})
|
117 |
|
118 |
+
return result
|
119 |
+
|
120 |
+
def compare_topics(texts_set_1, texts_set_2, n_topics=3, n_top_words=10, method="lda"):
    """
    Compare topics between two sets of texts.

    Topics are extracted independently from each set via extract_topics, then
    every topic pair is scored with Jaccard similarity over its top-word sets,
    and each topic in set 1 is matched to its most similar topic in set 2.

    Args:
        texts_set_1 (list): First list of text documents
        texts_set_2 (list): Second list of text documents
        n_topics (int): Number of topics to extract
        n_top_words (int): Number of top words per topic
        method (str): Topic modeling method ('lda' or 'nmf')

    Returns:
        dict: Comparison results with topics from both sets, the full
            pairwise similarity matrix, best-match pairs, and the average
            best-match similarity (0.0 when no pairs could be matched).
    """
    # Extract topics for each set
    topics_set_1 = extract_topics(texts_set_1, n_topics, n_top_words, method)
    topics_set_2 = extract_topics(texts_set_2, n_topics, n_top_words, method)

    # Pairwise Jaccard similarity between every set-1 / set-2 topic pair
    similarity_matrix = []
    for topic1 in topics_set_1["topics"]:
        topic_similarities = []
        words1 = set(topic1["words"])
        for topic2 in topics_set_2["topics"]:
            words2 = set(topic2["words"])
            # Jaccard similarity: intersection over union
            intersection = len(words1.intersection(words2))
            union = len(words1.union(words2))
            similarity = intersection / union if union > 0 else 0
            topic_similarities.append(similarity)
        similarity_matrix.append(topic_similarities)

    # Find the best matching topic pairs
    matched_topics = []
    for i, similarities in enumerate(similarity_matrix):
        if not similarities:
            # Second set produced no topics; np.argmax([]) would raise.
            continue
        # Cast to a plain int: np.argmax returns a NumPy integer, which is
        # not JSON-serializable and breaks json.dumps on the result.
        best_match_idx = int(np.argmax(similarities))
        matched_topics.append({
            "set1_topic_id": i,
            "set1_topic_words": topics_set_1["topics"][i]["words"],
            "set2_topic_id": best_match_idx,
            "set2_topic_words": topics_set_2["topics"][best_match_idx]["words"],
            "similarity": similarities[best_match_idx]
        })

    # Guard the average: np.mean([]) warns and yields nan.
    if matched_topics:
        average_similarity = float(
            np.mean([match["similarity"] for match in matched_topics])
        )
    else:
        average_similarity = 0.0

    # Construct result
    result = {
        "method": method,
        "n_topics": n_topics,
        "set1_topics": topics_set_1["topics"],
        "set2_topics": topics_set_2["topics"],
        "similarity_matrix": similarity_matrix,
        "matched_topics": matched_topics,
        "average_similarity": average_similarity
    }

    return result