"""
Topic modeling processor for comparing text responses
"""
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
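# The tokenizer and stopword list used below rely on NLTK corpora. This quiet,
# defensive download is an assumption about the deployment environment (newer
# NLTK releases may additionally look for the 'punkt_tab' resource).
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
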
def preprocess_text(text):
    """
    Preprocess text for topic modeling

    Args:
        text (str): Text to preprocess

    Returns:
        str: Preprocessed text
    """
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove stopwords and very short tokens
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and len(token) > 3]
    return ' '.join(tokens)
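
# Illustrative sketch of the behaviour (assumes the NLTK data downloaded above):
#   preprocess_text("The 3 quick brown foxes!")  ->  "quick brown foxes"
# Digits and punctuation are stripped; stopwords and tokens of length <= 3 are dropped.
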
def get_top_words_per_topic(model, feature_names, n_top_words=10):
    """
    Get the top words for each topic in the model

    Args:
        model: Topic model (LDA or NMF)
        feature_names (list): Feature names (words)
        n_top_words (int): Number of top words to include per topic

    Returns:
        list: List of topics with their top words
    """
    topics = []
    for topic_idx, topic in enumerate(model.components_):
        top_words_idx = topic.argsort()[:-n_top_words - 1:-1]
        top_words = [feature_names[i] for i in top_words_idx]
        topic_dict = {
            "id": topic_idx,
            "words": top_words,
            "weights": topic[top_words_idx].tolist()
        }
        topics.append(topic_dict)
    return topics
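
# For illustration, each entry in the returned list has roughly this shape
# (hypothetical values, not output from a real run):
#   {"id": 0, "words": ["market", "price", ...], "weights": [5.2, 3.1, ...]}
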
def extract_topics(texts, n_topics=3, n_top_words=10, method="lda"):
    """
    Extract topics from a list of texts

    Args:
        texts (list): List of text documents
        n_topics (int): Number of topics to extract
        n_top_words (int): Number of top words per topic
        method (str): Topic modeling method ('lda' or 'nmf')

    Returns:
        dict: Topic modeling results with topics and document-topic distributions
    """
    result = {
        "method": method,
        "n_topics": n_topics,
        "topics": [],
        "document_topics": []
    }

    # Preprocess texts
    preprocessed_texts = [preprocess_text(text) for text in texts]

    # Create document-term matrix
    if method == "nmf":
        # For NMF, use TF-IDF vectorization
        # Adjust min_df and max_df for small document sets
        vectorizer = TfidfVectorizer(max_features=1000, min_df=1, max_df=1.0)
    else:
        # For LDA, use CountVectorizer
        # Adjust min_df and max_df for small document sets
        vectorizer = CountVectorizer(max_features=1000, min_df=1, max_df=1.0)
    X = vectorizer.fit_transform(preprocessed_texts)
    feature_names = vectorizer.get_feature_names_out()

    # Apply topic modeling
    if method == "nmf":
        # Non-negative Matrix Factorization
        model = NMF(n_components=n_topics, random_state=42, max_iter=1000)
    else:
        # Latent Dirichlet Allocation
        model = LatentDirichletAllocation(n_components=n_topics, random_state=42, max_iter=20)
    topic_distribution = model.fit_transform(X)

    # Get top words for each topic
    result["topics"] = get_top_words_per_topic(model, feature_names, n_top_words)

    # Get topic distribution for each document
    for i, dist in enumerate(topic_distribution):
        # Normalize for easier comparison
        normalized_dist = dist / np.sum(dist) if np.sum(dist) > 0 else dist
        result["document_topics"].append({
            "document_id": i,
            "distribution": normalized_dist.tolist()
        })

    return result
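
# Sketch of the result shape (hypothetical values, for orientation only):
#   {"method": "lda", "n_topics": 3,
#    "topics": [{"id": 0, "words": [...], "weights": [...]}, ...],
#    "document_topics": [{"document_id": 0, "distribution": [0.1, 0.7, 0.2]}, ...]}
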
def compare_topics(texts_set_1, texts_set_2, n_topics=3, n_top_words=10, method="lda", model_names=None):
    """
    Compare topics between two sets of texts

    Args:
        texts_set_1 (list): First list of text documents
        texts_set_2 (list): Second list of text documents
        n_topics (int): Number of topics to extract
        n_top_words (int): Number of top words per topic
        method (str): Topic modeling method ('lda' or 'nmf')
        model_names (list, optional): Names of the models being compared

    Returns:
        dict: Comparison results with topics from both sets and similarity metrics
    """
    # Set default model names if not provided
    if model_names is None:
        model_names = ["Model 1", "Model 2"]

    # Extract topics for each set
    topics_set_1 = extract_topics(texts_set_1, n_topics, n_top_words, method)
    topics_set_2 = extract_topics(texts_set_2, n_topics, n_top_words, method)

    # Calculate similarity between topics
    similarity_matrix = []
    for topic1 in topics_set_1["topics"]:
        topic_similarities = []
        words1 = set(topic1["words"])
        for topic2 in topics_set_2["topics"]:
            words2 = set(topic2["words"])
            # Jaccard similarity: intersection over union
            intersection = len(words1.intersection(words2))
            union = len(words1.union(words2))
            similarity = intersection / union if union > 0 else 0
            topic_similarities.append(similarity)
        similarity_matrix.append(topic_similarities)

    # Find the best matching topic pairs
    matched_topics = []
    for i, similarities in enumerate(similarity_matrix):
        # Cast to a plain int so the result stays JSON-serializable
        best_match_idx = int(np.argmax(similarities))
        matched_topics.append({
            "set1_topic_id": i,
            "set1_topic_words": topics_set_1["topics"][i]["words"],
            "set2_topic_id": best_match_idx,
            "set2_topic_words": topics_set_2["topics"][best_match_idx]["words"],
            "similarity": similarities[best_match_idx]
        })

    # Construct result
    result = {
        "method": method,
        "n_topics": n_topics,
        "set1_topics": topics_set_1["topics"],
        "set2_topics": topics_set_2["topics"],
        "similarity_matrix": similarity_matrix,
        "matched_topics": matched_topics,
        "average_similarity": float(np.mean([match["similarity"] for match in matched_topics])),
        "models": model_names  # Add model names to result
    }
    return result
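

if __name__ == "__main__":
    # Minimal smoke test, not part of the processing pipeline above: compares two
    # tiny, made-up sets of responses. Real inputs would be longer model outputs,
    # and the topic/word counts here are chosen only to keep the demo fast.
    set_1 = [
        "The economy is growing and markets are strong this quarter.",
        "Inflation and interest rates affect household spending decisions.",
        "Stock prices rallied after the central bank announcement.",
    ]
    set_2 = [
        "Market growth continued while inflation pressures eased slightly.",
        "Central banks adjusted interest rates to stabilize prices.",
        "Consumer spending rose despite higher borrowing costs.",
    ]
    comparison = compare_topics(set_1, set_2, n_topics=2, n_top_words=5, method="lda",
                                model_names=["Model A", "Model B"])
    for match in comparison["matched_topics"]:
        print(match["set1_topic_words"], "<->", match["set2_topic_words"],
              f"(Jaccard similarity: {match['similarity']:.2f})")
    print("Average similarity:", comparison["average_similarity"])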