""" | |
Topic modeling processor for comparing text responses | |
""" | |
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer | |
from sklearn.decomposition import LatentDirichletAllocation, NMF | |
import numpy as np | |
import nltk | |
from nltk.corpus import stopwords | |
import re | |
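
# nltk.word_tokenize and stopwords.words('english') depend on NLTK corpora
# that may be absent at runtime. The guard below is a convenience sketch,
# assuming the data may need to be fetched on first use; newer NLTK releases
# may additionally require "punkt_tab", so adjust the list per environment.
def _ensure_nltk_resources():
    for resource, path in [("punkt", "tokenizers/punkt"),
                           ("stopwords", "corpora/stopwords")]:
        try:
            nltk.data.find(path)
        except LookupError:
            nltk.download(resource, quiet=True)


_ensure_nltk_resources()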


def preprocess_text(text):
    """
    Preprocess text for topic modeling

    Args:
        text (str): Text to preprocess

    Returns:
        str: Preprocessed text
    """
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove stopwords and tokens of three characters or fewer
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and len(token) > 3]
    return ' '.join(tokens)


def get_top_words_per_topic(model, feature_names, n_top_words=10):
    """
    Get the top words for each topic in the model

    Args:
        model: Topic model (LDA or NMF)
        feature_names (list): Feature names (words)
        n_top_words (int): Number of top words to include per topic

    Returns:
        list: List of topics with their top words
    """
    topics = []
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending; the reversed slice yields the indices of the
        # n_top_words highest-weighted words in descending order of weight
        top_words_idx = topic.argsort()[:-n_top_words - 1:-1]
        top_words = [feature_names[i] for i in top_words_idx]
        topic_dict = {
            "id": topic_idx,
            "words": top_words,
            "weights": topic[top_words_idx].tolist()
        }
        topics.append(topic_dict)
    return topics


def extract_topics(texts, n_topics=3, n_top_words=10, method="lda"):
    """
    Extract topics from a list of texts

    Args:
        texts (list): List of text documents
        n_topics (int): Number of topics to extract
        n_top_words (int): Number of top words per topic
        method (str): Topic modeling method ('lda' or 'nmf')

    Returns:
        dict: Topic modeling results with topics and document-topic distributions
    """
    result = {
        "method": method,
        "n_topics": n_topics,
        "topics": [],
        "document_topics": []
    }
    # Preprocess texts
    preprocessed_texts = [preprocess_text(text) for text in texts]
    # Create the document-term matrix; min_df=1 and max_df=1.0 keep every
    # term, which suits the small document sets this module compares
    if method == "nmf":
        # NMF works best on TF-IDF weights
        vectorizer = TfidfVectorizer(max_features=1000, min_df=1, max_df=1.0)
    else:
        # LDA expects raw term counts
        vectorizer = CountVectorizer(max_features=1000, min_df=1, max_df=1.0)
    X = vectorizer.fit_transform(preprocessed_texts)
    feature_names = vectorizer.get_feature_names_out()
    # Apply topic modeling
    if method == "nmf":
        # Non-negative Matrix Factorization
        model = NMF(n_components=n_topics, random_state=42, max_iter=1000)
    else:
        # Latent Dirichlet Allocation
        model = LatentDirichletAllocation(n_components=n_topics, random_state=42, max_iter=20)
    topic_distribution = model.fit_transform(X)
    # Get top words for each topic
    result["topics"] = get_top_words_per_topic(model, feature_names, n_top_words)
    # Get the topic distribution for each document; LDA rows already sum to 1,
    # but NMF rows are unnormalized, so convert them to proportions
    for i, dist in enumerate(topic_distribution):
        normalized_dist = dist / np.sum(dist) if np.sum(dist) > 0 else dist
        result["document_topics"].append({
            "document_id": i,
            "distribution": normalized_dist.tolist()
        })
    return result


def compare_topics(texts_set_1, texts_set_2, n_topics=3, n_top_words=10, method="lda", model_names=None):
    """
    Compare topics between two sets of texts

    Args:
        texts_set_1 (list): First list of text documents
        texts_set_2 (list): Second list of text documents
        n_topics (int): Number of topics to extract
        n_top_words (int): Number of top words per topic
        method (str): Topic modeling method ('lda' or 'nmf')
        model_names (list, optional): Names of the models being compared

    Returns:
        dict: Comparison results with topics from both sets and similarity metrics
    """
    # Set default model names if not provided
    if model_names is None:
        model_names = ["Model 1", "Model 2"]
    # Extract topics for each set
    topics_set_1 = extract_topics(texts_set_1, n_topics, n_top_words, method)
    topics_set_2 = extract_topics(texts_set_2, n_topics, n_top_words, method)
    # Calculate pairwise similarity between topics from the two sets
    similarity_matrix = []
    for topic1 in topics_set_1["topics"]:
        topic_similarities = []
        words1 = set(topic1["words"])
        for topic2 in topics_set_2["topics"]:
            words2 = set(topic2["words"])
            # Jaccard similarity: intersection over union of the top-word sets
            intersection = len(words1.intersection(words2))
            union = len(words1.union(words2))
            similarity = intersection / union if union > 0 else 0
            topic_similarities.append(similarity)
        similarity_matrix.append(topic_similarities)
    # Greedily pair each set-1 topic with its most similar set-2 topic
    matched_topics = []
    for i, similarities in enumerate(similarity_matrix):
        # Cast the NumPy index to a built-in int so the result stays JSON-serializable
        best_match_idx = int(np.argmax(similarities))
        matched_topics.append({
            "set1_topic_id": i,
            "set1_topic_words": topics_set_1["topics"][i]["words"],
            "set2_topic_id": best_match_idx,
            "set2_topic_words": topics_set_2["topics"][best_match_idx]["words"],
            "similarity": similarities[best_match_idx]
        })
    # Construct result
    result = {
        "method": method,
        "n_topics": n_topics,
        "set1_topics": topics_set_1["topics"],
        "set2_topics": topics_set_2["topics"],
        "similarity_matrix": similarity_matrix,
        "matched_topics": matched_topics,
        "average_similarity": float(np.mean([match["similarity"] for match in matched_topics])),
        "models": model_names  # Names of the models being compared
    }
    return result
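

if __name__ == "__main__":
    # Minimal usage sketch when run as a script. The sample texts below are
    # made up for illustration; any two lists of plain-text documents work
    # the same way.
    sample_a = [
        "Cats and dogs are popular household pets around the world.",
        "Dogs require regular walks and plenty of outdoor exercise.",
        "Many cats enjoy climbing furniture and chasing small toys.",
    ]
    sample_b = [
        "Stock markets fluctuate with interest rates and investor sentiment.",
        "Central banks adjust interest rates to manage inflation pressure.",
        "Investors diversify portfolios across stocks, bonds, and cash.",
    ]
    comparison = compare_topics(sample_a, sample_b, n_topics=2, n_top_words=5)
    print(f"Average topic similarity: {comparison['average_similarity']:.3f}")
    for match in comparison["matched_topics"]:
        print(match["set1_topic_words"], "<->", match["set2_topic_words"])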