525GradioApp/processors/topic_modeling.py
"""
Topic modeling processor for comparing text responses
"""
import re

import nltk
import numpy as np
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Fetch the NLTK data this module needs (no-op if already downloaded)
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

def preprocess_text(text):
"""
Preprocess text for topic modeling
Args:
text (str): Text to preprocess
Returns:
str: Preprocessed text
"""
# Convert to lowercase
text = text.lower()
    # Keep only letters and whitespace (drops digits and punctuation)
    text = re.sub(r'[^a-z\s]', '', text)
# Tokenize
tokens = nltk.word_tokenize(text)
    # Remove stopwords and very short tokens (3 characters or fewer)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and len(token) > 3]
return ' '.join(tokens)
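

# Illustrative example (not part of the app): preprocess_text("The 3 cats sat
# on the mat!") lowercases the text, strips the digit and punctuation, then
# drops stopwords and tokens of 3 characters or fewer, returning "cats".
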
def get_top_words_per_topic(model, feature_names, n_top_words=10):
"""
Get the top words for each topic in the model
Args:
model: Topic model (LDA or NMF)
feature_names (list): Feature names (words)
n_top_words (int): Number of top words to include per topic
Returns:
list: List of topics with their top words
"""
topics = []
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so take the last n_top_words indices in reverse
        top_words_idx = topic.argsort()[:-n_top_words - 1:-1]
        top_words = [feature_names[i] for i in top_words_idx]
topic_dict = {
"id": topic_idx,
"words": top_words,
"weights": topic[top_words_idx].tolist()
}
topics.append(topic_dict)
return topics
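

# With n_top_words=3, each returned entry looks roughly like
# {"id": 0, "words": ["climate", "energy", "carbon"], "weights": [5.2, 3.1, 2.8]}
# (the words and weights here are made up for illustration).
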
def extract_topics(texts, n_topics=3, n_top_words=10, method="lda"):
"""
Extract topics from a list of texts
Args:
texts (list): List of text documents
n_topics (int): Number of topics to extract
n_top_words (int): Number of top words per topic
method (str): Topic modeling method ('lda' or 'nmf')
Returns:
dict: Topic modeling results with topics and document-topic distributions
"""
result = {
"method": method,
"n_topics": n_topics,
"topics": [],
"document_topics": []
}
# Preprocess texts
preprocessed_texts = [preprocess_text(text) for text in texts]
# Create document-term matrix
    if method == "nmf":
        # NMF pairs naturally with TF-IDF weighting;
        # min_df=1 and max_df=1.0 keep every term, which suits small document sets
        vectorizer = TfidfVectorizer(max_features=1000, min_df=1, max_df=1.0)
    else:
        # LDA expects raw term counts, so use CountVectorizer
        # with the same relaxed min_df/max_df for small document sets
        vectorizer = CountVectorizer(max_features=1000, min_df=1, max_df=1.0)
X = vectorizer.fit_transform(preprocessed_texts)
feature_names = vectorizer.get_feature_names_out()
# Apply topic modeling
if method == "nmf":
# Non-negative Matrix Factorization
model = NMF(n_components=n_topics, random_state=42, max_iter=1000)
else:
# Latent Dirichlet Allocation
model = LatentDirichletAllocation(n_components=n_topics, random_state=42, max_iter=20)
topic_distribution = model.fit_transform(X)
# Get top words for each topic
result["topics"] = get_top_words_per_topic(model, feature_names, n_top_words)
# Get topic distribution for each document
    for i, dist in enumerate(topic_distribution):
        # Normalize so each row sums to 1; NMF weights are unnormalized,
        # while LDA rows already sum to 1
        total = np.sum(dist)
        normalized_dist = dist / total if total > 0 else dist
result["document_topics"].append({
"document_id": i,
"distribution": normalized_dist.tolist()
})
return result
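

# Minimal usage sketch (the sample texts are hypothetical):
#   result = extract_topics(["Cats chase mice around the house.",
#                            "Dogs chase balls in the park."], n_topics=2)
#   result["topics"][0]["words"]                  -> top words for topic 0
#   result["document_topics"][0]["distribution"]  -> topic mix of document 0
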
def compare_topics(texts_set_1, texts_set_2, n_topics=3, n_top_words=10, method="lda", model_names=None):
"""
Compare topics between two sets of texts
Args:
texts_set_1 (list): First list of text documents
texts_set_2 (list): Second list of text documents
n_topics (int): Number of topics to extract
n_top_words (int): Number of top words per topic
method (str): Topic modeling method ('lda' or 'nmf')
model_names (list, optional): Names of the models being compared
Returns:
dict: Comparison results with topics from both sets and similarity metrics
"""
# Set default model names if not provided
if model_names is None:
model_names = ["Model 1", "Model 2"]
# Extract topics for each set
topics_set_1 = extract_topics(texts_set_1, n_topics, n_top_words, method)
topics_set_2 = extract_topics(texts_set_2, n_topics, n_top_words, method)
# Calculate similarity between topics
similarity_matrix = []
for topic1 in topics_set_1["topics"]:
topic_similarities = []
words1 = set(topic1["words"])
for topic2 in topics_set_2["topics"]:
words2 = set(topic2["words"])
# Jaccard similarity: intersection over union
intersection = len(words1.intersection(words2))
union = len(words1.union(words2))
similarity = intersection / union if union > 0 else 0
topic_similarities.append(similarity)
similarity_matrix.append(topic_similarities)
    # Greedily pair each set-1 topic with its most similar set-2 topic
    # (matches are not forced to be one-to-one)
    matched_topics = []
    for i, similarities in enumerate(similarity_matrix):
        # Cast to a plain int so the result stays JSON-serializable
        best_match_idx = int(np.argmax(similarities))
        matched_topics.append({
            "set1_topic_id": i,
            "set1_topic_words": topics_set_1["topics"][i]["words"],
            "set2_topic_id": best_match_idx,
            "set2_topic_words": topics_set_2["topics"][best_match_idx]["words"],
            "similarity": similarities[best_match_idx]
        })
# Construct result
result = {
"method": method,
"n_topics": n_topics,
"set1_topics": topics_set_1["topics"],
"set2_topics": topics_set_2["topics"],
"similarity_matrix": similarity_matrix,
"matched_topics": matched_topics,
"average_similarity": np.mean([match["similarity"] for match in matched_topics]),
"models": model_names # Add model names to result
}
return result
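

if __name__ == "__main__":
    # Quick smoke test with hypothetical responses from two models; this is a
    # sketch of how compare_topics is called, not part of the Gradio app itself.
    responses_a = [
        "The climate is warming because of greenhouse gas emissions.",
        "Renewable energy sources reduce carbon emissions significantly.",
    ]
    responses_b = [
        "Solar and wind power are growing renewable energy industries.",
        "Greenhouse gases trap heat and warm the global climate.",
    ]
    comparison = compare_topics(responses_a, responses_b, n_topics=2,
                                method="lda", model_names=["Model A", "Model B"])
    for match in comparison["matched_topics"]:
        print(match["set1_topic_words"], "<->", match["set2_topic_words"],
              "(Jaccard %.2f)" % match["similarity"])
    print("Average similarity:", comparison["average_similarity"])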