"""
Topic modeling processor for comparing text responses
"""
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
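
# NOTE: tokenization and stopword removal below depend on the NLTK 'punkt'
# and 'stopwords' corpora. This guard downloads them on first use if they are
# missing (a minimal sketch; assumes network access or pre-installed NLTK data).
for _resource, _path in (("punkt", "tokenizers/punkt"), ("stopwords", "corpora/stopwords")):
    try:
        nltk.data.find(_path)
    except LookupError:
        nltk.download(_resource, quiet=True)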

def preprocess_text(text):
    """
    Preprocess text for topic modeling
    
    Args:
        text (str): Text to preprocess
        
    Returns:
        str: Preprocessed text
    """
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize
    tokens = nltk.word_tokenize(text)
    
    # Remove stopwords and very short tokens (length <= 3)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and len(token) > 3]
    
    return ' '.join(tokens)

def get_top_words_per_topic(model, feature_names, n_top_words=10):
    """
    Get the top words for each topic in the model
    
    Args:
        model: Topic model (LDA or NMF)
        feature_names (list): Feature names (words)
        n_top_words (int): Number of top words to include per topic
        
    Returns:
        list: List of topics with their top words
    """
    topics = []
    for topic_idx, topic in enumerate(model.components_):
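        # argsort returns indices in ascending weight order; the reversed slice
        # below picks the n_top_words highest-weighted word indices, descending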
        top_words_idx = topic.argsort()[:-n_top_words - 1:-1]
        top_words = [feature_names[i] for i in top_words_idx]
        topic_dict = {
            "id": topic_idx,
            "words": top_words,
            "weights": topic[top_words_idx].tolist()
        }
        topics.append(topic_dict)
    return topics

def extract_topics(texts, n_topics=3, n_top_words=10, method="lda"):
    """
    Extract topics from a list of texts
    
    Args:
        texts (list): List of text documents
        n_topics (int): Number of topics to extract
        n_top_words (int): Number of top words per topic
        method (str): Topic modeling method ('lda' or 'nmf')
        
    Returns:
        dict: Topic modeling results with topics and document-topic distributions
    """
    result = {
        "method": method,
        "n_topics": n_topics,
        "topics": [],
        "document_topics": []
    }
    
    # Preprocess texts
    preprocessed_texts = [preprocess_text(text) for text in texts]
    
    # Create document-term matrix
    if method == "nmf":
        # For NMF, use TF-IDF vectorization
        # Adjust min_df and max_df for small document sets
        vectorizer = TfidfVectorizer(max_features=1000, min_df=1, max_df=1.0)
    else:
        # For LDA, use CountVectorizer
        # Adjust min_df and max_df for small document sets
        vectorizer = CountVectorizer(max_features=1000, min_df=1, max_df=1.0)
    
    X = vectorizer.fit_transform(preprocessed_texts)
    feature_names = vectorizer.get_feature_names_out()
    
    # Apply topic modeling
    if method == "nmf":
        # Non-negative Matrix Factorization
        model = NMF(n_components=n_topics, random_state=42, max_iter=1000)
    else:
        # Latent Dirichlet Allocation
        model = LatentDirichletAllocation(n_components=n_topics, random_state=42, max_iter=20)
    
    topic_distribution = model.fit_transform(X)
    
    # Get top words for each topic
    result["topics"] = get_top_words_per_topic(model, feature_names, n_top_words)
    
    # Get topic distribution for each document
    for i, dist in enumerate(topic_distribution):
        # Normalize for easier comparison
        normalized_dist = dist / np.sum(dist) if np.sum(dist) > 0 else dist
        result["document_topics"].append({
            "document_id": i,
            "distribution": normalized_dist.tolist()
        })
    
    return result

def compare_topics(texts_set_1, texts_set_2, n_topics=3, n_top_words=10, method="lda", model_names=None):
    """
    Compare topics between two sets of texts
    
    Args:
        texts_set_1 (list): First list of text documents
        texts_set_2 (list): Second list of text documents
        n_topics (int): Number of topics to extract
        n_top_words (int): Number of top words per topic
        method (str): Topic modeling method ('lda' or 'nmf')
        model_names (list, optional): Names of the models being compared
        
    Returns:
        dict: Comparison results with topics from both sets and similarity metrics
    """
    # Set default model names if not provided
    if model_names is None:
        model_names = ["Model 1", "Model 2"]
    
    # Extract topics for each set
    topics_set_1 = extract_topics(texts_set_1, n_topics, n_top_words, method)
    topics_set_2 = extract_topics(texts_set_2, n_topics, n_top_words, method)
    
    # Calculate similarity between topics
    similarity_matrix = []
    for topic1 in topics_set_1["topics"]:
        topic_similarities = []
        words1 = set(topic1["words"])
        for topic2 in topics_set_2["topics"]:
            words2 = set(topic2["words"])
            # Jaccard similarity: intersection over union
            intersection = len(words1.intersection(words2))
            union = len(words1.union(words2))
            similarity = intersection / union if union > 0 else 0
            topic_similarities.append(similarity)
        similarity_matrix.append(topic_similarities)
    
    # Find the best matching topic pairs
    matched_topics = []
    for i, similarities in enumerate(similarity_matrix):
        # Cast to a plain int so the result stays JSON-serializable
        best_match_idx = int(np.argmax(similarities))
        matched_topics.append({
            "set1_topic_id": i,
            "set1_topic_words": topics_set_1["topics"][i]["words"],
            "set2_topic_id": best_match_idx,
            "set2_topic_words": topics_set_2["topics"][best_match_idx]["words"],
            "similarity": similarities[best_match_idx]
        })
    
    # Construct result
    result = {
        "method": method,
        "n_topics": n_topics,
        "set1_topics": topics_set_1["topics"],
        "set2_topics": topics_set_2["topics"],
        "similarity_matrix": similarity_matrix,
        "matched_topics": matched_topics,
        "average_similarity": np.mean([match["similarity"] for match in matched_topics]),
        "models": model_names  # Add model names to result
    }
    
    return result
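

if __name__ == "__main__":
    # Minimal usage sketch (the sample texts below are hypothetical, not part of
    # the module's API): compares topics extracted from two small sets of
    # responses using LDA and prints the matched topic pairs.
    responses_a = [
        "The cat sat quietly on the warm windowsill watching birds outside",
        "Dogs and cats often compete for attention from their human companions",
        "Feeding schedules keep pets healthy and their behavior predictable",
    ]
    responses_b = [
        "Neural networks learn representations from large amounts of training data",
        "Gradient descent updates model parameters to minimize the loss function",
        "Regularization techniques help machine learning models avoid overfitting",
    ]
    comparison = compare_topics(responses_a, responses_b, n_topics=2, n_top_words=5, method="lda")
    print("Average topic similarity:", comparison["average_similarity"])
    for match in comparison["matched_topics"]:
        print(match["set1_topic_words"], "<->", match["set2_topic_words"])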