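"""Bag-of-words utilities for comparing model text responses.

Covers text preprocessing (lowercasing, stopword removal, lemmatization),
bag-of-words construction, and pairwise / multi-model comparison metrics
(Jaccard similarity, cosine similarity, and differential-word analysis).
"""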
import numpy as np
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download necessary NLTK data (each download is skipped if already present)
for resource, path in [
    ('punkt', 'tokenizers/punkt'),
    ('stopwords', 'corpora/stopwords'),
    ('wordnet', 'corpora/wordnet'),
]:
    try:
        nltk.data.find(path)
    except LookupError:
        nltk.download(resource)

def preprocess_text(text):
    """
    Preprocess text for bag of words analysis
    
    Args:
        text (str): Input text
        
    Returns:
        str: Preprocessed text
    """
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Filter out short words (likely not meaningful)
    tokens = [token for token in tokens if len(token) > 2]
    
    # Join back to string
    return ' '.join(tokens)
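# Example (illustrative): with the default WordNet noun lemmatization above,
# preprocess_text("The cats are running!") yields "cat running".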

def create_bow(text):
    """
    Create bag of words representation
    
    Args:
        text (str): Input text
        
    Returns:
        dict: Bag of words representation with word counts
    """
    # Preprocess text
    preprocessed_text = preprocess_text(text)
    
    # Tokenize
    tokens = preprocessed_text.split()
    
    # Count occurrences
    word_counts = Counter(tokens)
    
    return dict(word_counts)
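# Example (illustrative): create_bow("Dogs chase dogs.") gives
# {"dog": 2, "chase": 1} after preprocessing and lemmatization.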

def compare_bow_pair(bow1, bow2):
    """
    Compare two bag of words representations
    
    Args:
        bow1 (dict): First bag of words
        bow2 (dict): Second bag of words
        
    Returns:
        dict: Comparison metrics
    """
    # Get all unique words
    all_words = set(bow1.keys()).union(set(bow2.keys()))
    
    # Words in both
    common_words = set(bow1.keys()).intersection(set(bow2.keys()))
    
    # Words unique to each
    unique_to_1 = set(bow1.keys()) - set(bow2.keys())
    unique_to_2 = set(bow2.keys()) - set(bow1.keys())
    
    # Calculate Jaccard similarity
    jaccard = len(common_words) / len(all_words) if len(all_words) > 0 else 0
    
    # Calculate cosine similarity
    vec1 = np.zeros(len(all_words))
    vec2 = np.zeros(len(all_words))
    
    for i, word in enumerate(all_words):
        vec1[i] = bow1.get(word, 0)
        vec2[i] = bow2.get(word, 0)
    
    # Normalize vectors
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    
    if norm1 == 0 or norm2 == 0:
        cosine = 0.0
    else:
        # Cast to a plain float so the result serializes cleanly (e.g. to JSON)
        cosine = float(np.dot(vec1, vec2) / (norm1 * norm2))
    
    return {
        "jaccard_similarity": jaccard,
        "cosine_similarity": cosine,
        "common_word_count": len(common_words),
        "unique_to_first": list(unique_to_1)[:20],  # Limit for readability
        "unique_to_second": list(unique_to_2)[:20]  # Limit for readability
    }
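# Example (illustrative): for bow1 = {"cat": 2, "dog": 1} and
# bow2 = {"cat": 1, "bird": 3}, "cat" is the only common word among three
# total words, so jaccard_similarity = 1/3 and cosine_similarity ≈ 0.28.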

def important_words(bow, top_n=10):
    """
    Extract most important/distinctive words
    
    Args:
        bow (dict): Bag of words representation
        top_n (int): Number of top words to return
        
    Returns:
        list: Top words with counts
    """
    # Sort by count
    sorted_words = sorted(bow.items(), key=lambda x: x[1], reverse=True)
    
    # Return top N
    return [{"word": word, "count": count} for word, count in sorted_words[:top_n]]
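# Example (illustrative): important_words({"cat": 3, "dog": 1}, top_n=1)
# returns [{"word": "cat", "count": 3}].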

def compare_bow_across_texts(texts, model_names, top_n=25):
    """
    Compare bag of words across multiple texts
    
    Args:
        texts (list): List of text responses
        model_names (list): List of model names corresponding to responses
        top_n (int): Number of top words to include
        
    Returns:
        dict: Comparative bag of words analysis
    """
    # Create bag of words for each text
    bows = [create_bow(text) for text in texts]
    
    # Map to models
    model_bows = {model: bow for model, bow in zip(model_names, bows)}
    
    # Get important words for each model
    model_important_words = {model: important_words(bow, top_n) for model, bow in model_bows.items()}
    
    # Compare pairwise
    comparisons = {}
    for i, model1 in enumerate(model_names):
        for j, model2 in enumerate(model_names):
            if j <= i:  # Avoid duplicate comparisons
                continue
            
            comparison_key = f"{model1} vs {model2}"
            comparisons[comparison_key] = compare_bow_pair(model_bows[model1], model_bows[model2])
    
    # Create combined word list across all models
    all_words = set()
    for bow in bows:
        all_words.update(bow.keys())
    
    # Create a matrix of word counts across models; every word in all_words
    # already appears in at least one bow by construction
    word_count_matrix = {}
    for word in sorted(all_words):
        word_count_matrix[word] = {model: bow.get(word, 0) for model, bow in zip(model_names, bows)}
    
    # Sort matrix by most differential words (words with biggest variance across models)
    word_variances = {}
    for word, counts in word_count_matrix.items():
        count_values = list(counts.values())
        if len(count_values) > 1:
            word_variances[word] = np.var(count_values)
    
    # Get top differential words
    top_diff_words = sorted(word_variances.items(), key=lambda x: x[1], reverse=True)[:top_n]
    differential_words = [word for word, _ in top_diff_words]
    
    # Format results
    result = {
        "model_word_counts": model_bows,
        "important_words": model_important_words,
        "comparisons": comparisons,
        "differential_words": differential_words,
        "word_count_matrix": {word: word_count_matrix[word] for word in differential_words},
        "models": model_names
    }
    
    return result

def compare_bow(texts, model_names, top_n=25):
    """
    Compare bag of words between different texts

    Convenience wrapper around compare_bow_across_texts; the pairwise
    comparison of two precomputed bags is handled by compare_bow_pair.

    Args:
        texts (list): List of text responses to compare
        model_names (list): Names of models corresponding to responses
        top_n (int): Number of top words to consider

    Returns:
        dict: Comparative analysis
    """
    return compare_bow_across_texts(texts, model_names, top_n)
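
# Minimal usage sketch (illustrative only): the sample texts and model names
# below are made up to show the expected call shape and output keys.
if __name__ == "__main__":
    sample_texts = [
        "The quick brown fox jumps over the lazy dog.",
        "A fast brown fox leaped over a sleeping dog.",
    ]
    sample_models = ["model_a", "model_b"]

    analysis = compare_bow(sample_texts, sample_models, top_n=10)

    for pair, metrics in analysis["comparisons"].items():
        print(f"{pair}: jaccard={metrics['jaccard_similarity']:.3f}, "
              f"cosine={metrics['cosine_similarity']:.3f}")
    print("Differential words:", analysis["differential_words"])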