"""
Topic modeling processor for comparing text responses
"""
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
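
# NOTE: tokenization and stopword removal below depend on the NLTK 'punkt'
# and 'stopwords' corpora. This guard downloads them on first use if they are
# missing (a minimal sketch; assumes network access or pre-installed NLTK data).
for _resource, _path in (("punkt", "tokenizers/punkt"), ("stopwords", "corpora/stopwords")):
    try:
        nltk.data.find(_path)
    except LookupError:
        nltk.download(_resource, quiet=True)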

def preprocess_text(text):
    """
    Preprocess text for topic modeling
    
    Args:
        text (str): Text to preprocess
        
    Returns:
        str: Preprocessed text
    """
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize
    tokens = nltk.word_tokenize(text)
    
    # Remove stopwords and very short tokens (length <= 3)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and len(token) > 3]
    
    return ' '.join(tokens)

def get_top_words_per_topic(model, feature_names, n_top_words=10):
    """
    Get the top words for each topic in the model
    
    Args:
        model: Topic model (LDA or NMF)
        feature_names (list): Feature names (words)
        n_top_words (int): Number of top words to include per topic
        
    Returns:
        list: List of topics with their top words
    """
    topics = []
    for topic_idx, topic in enumerate(model.components_):
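        # argsort returns indices in ascending weight order; the reversed slice
        # below picks the n_top_words highest-weighted word indices, descending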
        top_words_idx = topic.argsort()[:-n_top_words - 1:-1]
        top_words = [feature_names[i] for i in top_words_idx]
        topic_dict = {
            "id": topic_idx,
            "words": top_words,
            "weights": topic[top_words_idx].tolist()
        }
        topics.append(topic_dict)
    return topics

def extract_topics(texts, n_topics=3, n_top_words=10, method="lda"):
    """
    Extract topics from a list of texts
    
    Args:
        texts (list): List of text documents
        n_topics (int): Number of topics to extract
        n_top_words (int): Number of top words per topic
        method (str): Topic modeling method ('lda' or 'nmf')
        
    Returns:
        dict: Topic modeling results with topics and document-topic distributions
    """
    result = {
        "method": method,
        "n_topics": n_topics,
        "topics": [],
        "document_topics": []
    }
    
    # Preprocess texts
    preprocessed_texts = [preprocess_text(text) for text in texts]
    
    # Create document-term matrix
    if method == "nmf":
        # For NMF, use TF-IDF vectorization
        # Adjust min_df and max_df for small document sets
        vectorizer = TfidfVectorizer(max_features=1000, min_df=1, max_df=1.0)
    else:
        # For LDA, use CountVectorizer
        # Adjust min_df and max_df for small document sets
        vectorizer = CountVectorizer(max_features=1000, min_df=1, max_df=1.0)
    
    X = vectorizer.fit_transform(preprocessed_texts)
    feature_names = vectorizer.get_feature_names_out()
    
    # Apply topic modeling
    if method == "nmf":
        # Non-negative Matrix Factorization
        model = NMF(n_components=n_topics, random_state=42, max_iter=1000)
    else:
        # Latent Dirichlet Allocation
        model = LatentDirichletAllocation(n_components=n_topics, random_state=42, max_iter=20)
    
    topic_distribution = model.fit_transform(X)
    
    # Get top words for each topic
    result["topics"] = get_top_words_per_topic(model, feature_names, n_top_words)
    
    # Get topic distribution for each document
    for i, dist in enumerate(topic_distribution):
        # Normalize for easier comparison
        normalized_dist = dist / np.sum(dist) if np.sum(dist) > 0 else dist
        result["document_topics"].append({
            "document_id": i,
            "distribution": normalized_dist.tolist()
        })
    
    return result

def compare_topics(texts_set_1, texts_set_2, n_topics=3, n_top_words=10, method="lda", model_names=None):
    """
    Compare topics between two sets of texts
    
    Args:
        texts_set_1 (list): First list of text documents
        texts_set_2 (list): Second list of text documents
        n_topics (int): Number of topics to extract
        n_top_words (int): Number of top words per topic
        method (str): Topic modeling method ('lda' or 'nmf')
        model_names (list, optional): Names of the models being compared
        
    Returns:
        dict: Comparison results with topics from both sets and similarity metrics
    """
    # Set default model names if not provided
    if model_names is None:
        model_names = ["Model 1", "Model 2"]
    
    # Extract topics for each set
    topics_set_1 = extract_topics(texts_set_1, n_topics, n_top_words, method)
    topics_set_2 = extract_topics(texts_set_2, n_topics, n_top_words, method)
    
    # Calculate similarity between topics
    similarity_matrix = []
    for topic1 in topics_set_1["topics"]:
        topic_similarities = []
        words1 = set(topic1["words"])
        for topic2 in topics_set_2["topics"]:
            words2 = set(topic2["words"])
            # Jaccard similarity: intersection over union
            intersection = len(words1.intersection(words2))
            union = len(words1.union(words2))
            similarity = intersection / union if union > 0 else 0
            topic_similarities.append(similarity)
        similarity_matrix.append(topic_similarities)
    
    # Find the best matching topic pairs
    matched_topics = []
    for i, similarities in enumerate(similarity_matrix):
        # Cast to a plain int so the result stays JSON-serializable
        best_match_idx = int(np.argmax(similarities))
        matched_topics.append({
            "set1_topic_id": i,
            "set1_topic_words": topics_set_1["topics"][i]["words"],
            "set2_topic_id": best_match_idx,
            "set2_topic_words": topics_set_2["topics"][best_match_idx]["words"],
            "similarity": similarities[best_match_idx]
        })
    
    # Construct result
    result = {
        "method": method,
        "n_topics": n_topics,
        "set1_topics": topics_set_1["topics"],
        "set2_topics": topics_set_2["topics"],
        "similarity_matrix": similarity_matrix,
        "matched_topics": matched_topics,
        "average_similarity": np.mean([match["similarity"] for match in matched_topics]),
        "models": model_names  # Add model names to result
    }
    
    return result
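

if __name__ == "__main__":
    # Minimal usage sketch (the sample texts below are hypothetical, not part of
    # the module's API): compares topics extracted from two small sets of
    # responses using LDA and prints the matched topic pairs.
    responses_a = [
        "The cat sat quietly on the warm windowsill watching birds outside",
        "Dogs and cats often compete for attention from their human companions",
        "Feeding schedules keep pets healthy and their behavior predictable",
    ]
    responses_b = [
        "Neural networks learn representations from large amounts of training data",
        "Gradient descent updates model parameters to minimize the loss function",
        "Regularization techniques help machine learning models avoid overfitting",
    ]
    comparison = compare_topics(responses_a, responses_b, n_topics=2, n_top_words=5, method="lda")
    print("Average topic similarity:", comparison["average_similarity"])
    for match in comparison["matched_topics"]:
        print(match["set1_topic_words"], "<->", match["set2_topic_words"])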