""" | |
Topic modeling processor for comparing text responses | |
""" | |
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer | |
from sklearn.decomposition import LatentDirichletAllocation, NMF | |
import numpy as np | |
import nltk | |
from nltk.corpus import stopwords | |
import re | |
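
# nltk.word_tokenize and stopwords.words('english') depend on NLTK corpora
# that may be absent at runtime. The guard below is a convenience sketch,
# assuming the data may need to be fetched on first use; newer NLTK releases
# may additionally require "punkt_tab", so adjust the list per environment.
def _ensure_nltk_resources():
    for resource, path in [("punkt", "tokenizers/punkt"),
                           ("stopwords", "corpora/stopwords")]:
        try:
            nltk.data.find(path)
        except LookupError:
            nltk.download(resource, quiet=True)


_ensure_nltk_resources()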


def preprocess_text(text):
    """
    Preprocess text for topic modeling

    Args:
        text (str): Text to preprocess

    Returns:
        str: Preprocessed text
    """
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove stopwords and tokens of three characters or fewer
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and len(token) > 3]
    return ' '.join(tokens)


def get_top_words_per_topic(model, feature_names, n_top_words=10):
    """
    Get the top words for each topic in the model

    Args:
        model: Topic model (LDA or NMF)
        feature_names (list): Feature names (words)
        n_top_words (int): Number of top words to include per topic

    Returns:
        list: List of topics with their top words
    """
    topics = []
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending; the reversed slice yields the indices of the
        # n_top_words highest-weighted words in descending order of weight
        top_words_idx = topic.argsort()[:-n_top_words - 1:-1]
        top_words = [feature_names[i] for i in top_words_idx]
        topic_dict = {
            "id": topic_idx,
            "words": top_words,
            "weights": topic[top_words_idx].tolist()
        }
        topics.append(topic_dict)
    return topics


def extract_topics(texts, n_topics=3, n_top_words=10, method="lda"):
    """
    Extract topics from a list of texts

    Args:
        texts (list): List of text documents
        n_topics (int): Number of topics to extract
        n_top_words (int): Number of top words per topic
        method (str): Topic modeling method ('lda' or 'nmf')

    Returns:
        dict: Topic modeling results with topics and document-topic distributions
    """
    result = {
        "method": method,
        "n_topics": n_topics,
        "topics": [],
        "document_topics": []
    }
    # Preprocess texts
    preprocessed_texts = [preprocess_text(text) for text in texts]
    # Create the document-term matrix; min_df=1 and max_df=1.0 keep every
    # term, which suits the small document sets this module compares
    if method == "nmf":
        # NMF works best on TF-IDF weights
        vectorizer = TfidfVectorizer(max_features=1000, min_df=1, max_df=1.0)
    else:
        # LDA expects raw term counts
        vectorizer = CountVectorizer(max_features=1000, min_df=1, max_df=1.0)
    X = vectorizer.fit_transform(preprocessed_texts)
    feature_names = vectorizer.get_feature_names_out()
    # Apply topic modeling
    if method == "nmf":
        # Non-negative Matrix Factorization
        model = NMF(n_components=n_topics, random_state=42, max_iter=1000)
    else:
        # Latent Dirichlet Allocation
        model = LatentDirichletAllocation(n_components=n_topics, random_state=42, max_iter=20)
    topic_distribution = model.fit_transform(X)
    # Get top words for each topic
    result["topics"] = get_top_words_per_topic(model, feature_names, n_top_words)
    # Get the topic distribution for each document; LDA rows already sum to 1,
    # but NMF rows are unnormalized, so convert them to proportions
    for i, dist in enumerate(topic_distribution):
        normalized_dist = dist / np.sum(dist) if np.sum(dist) > 0 else dist
        result["document_topics"].append({
            "document_id": i,
            "distribution": normalized_dist.tolist()
        })
    return result


def compare_topics(texts_set_1, texts_set_2, n_topics=3, n_top_words=10, method="lda", model_names=None):
    """
    Compare topics between two sets of texts

    Args:
        texts_set_1 (list): First list of text documents
        texts_set_2 (list): Second list of text documents
        n_topics (int): Number of topics to extract
        n_top_words (int): Number of top words per topic
        method (str): Topic modeling method ('lda' or 'nmf')
        model_names (list, optional): Names of the models being compared

    Returns:
        dict: Comparison results with topics from both sets and similarity metrics
    """
    # Set default model names if not provided
    if model_names is None:
        model_names = ["Model 1", "Model 2"]
    # Extract topics for each set
    topics_set_1 = extract_topics(texts_set_1, n_topics, n_top_words, method)
    topics_set_2 = extract_topics(texts_set_2, n_topics, n_top_words, method)
    # Calculate pairwise similarity between topics from the two sets
    similarity_matrix = []
    for topic1 in topics_set_1["topics"]:
        topic_similarities = []
        words1 = set(topic1["words"])
        for topic2 in topics_set_2["topics"]:
            words2 = set(topic2["words"])
            # Jaccard similarity: intersection over union of the top-word sets
            intersection = len(words1.intersection(words2))
            union = len(words1.union(words2))
            similarity = intersection / union if union > 0 else 0
            topic_similarities.append(similarity)
        similarity_matrix.append(topic_similarities)
    # Greedily pair each set-1 topic with its most similar set-2 topic
    matched_topics = []
    for i, similarities in enumerate(similarity_matrix):
        # Cast the NumPy index to a built-in int so the result stays JSON-serializable
        best_match_idx = int(np.argmax(similarities))
        matched_topics.append({
            "set1_topic_id": i,
            "set1_topic_words": topics_set_1["topics"][i]["words"],
            "set2_topic_id": best_match_idx,
            "set2_topic_words": topics_set_2["topics"][best_match_idx]["words"],
            "similarity": similarities[best_match_idx]
        })
    # Construct result
    result = {
        "method": method,
        "n_topics": n_topics,
        "set1_topics": topics_set_1["topics"],
        "set2_topics": topics_set_2["topics"],
        "similarity_matrix": similarity_matrix,
        "matched_topics": matched_topics,
        "average_similarity": float(np.mean([match["similarity"] for match in matched_topics])),
        "models": model_names  # Names of the models being compared
    }
    return result
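

if __name__ == "__main__":
    # Minimal usage sketch when run as a script. The sample texts below are
    # made up for illustration; any two lists of plain-text documents work
    # the same way.
    sample_a = [
        "Cats and dogs are popular household pets around the world.",
        "Dogs require regular walks and plenty of outdoor exercise.",
        "Many cats enjoy climbing furniture and chasing small toys.",
    ]
    sample_b = [
        "Stock markets fluctuate with interest rates and investor sentiment.",
        "Central banks adjust interest rates to manage inflation pressure.",
        "Investors diversify portfolios across stocks, bonds, and cash.",
    ]
    comparison = compare_topics(sample_a, sample_b, n_topics=2, n_top_words=5)
    print(f"Average topic similarity: {comparison['average_similarity']:.3f}")
    for match in comparison["matched_topics"]:
        print(match["set1_topic_words"], "<->", match["set2_topic_words"])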