File size: 7,525 Bytes
e633a26
2d9e425
 
 
e633a26
8e34de3
 
 
 
e633a26
8e34de3
2d9e425
 
 
 
 
 
1a44569
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e34de3
e633a26
8e34de3
e633a26
8e34de3
 
e633a26
 
8e34de3
 
 
e633a26
8e34de3
e633a26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e34de3
 
 
 
 
 
 
 
 
 
 
 
e633a26
 
 
 
1a44569
e633a26
1a44569
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
"""
Updated bow_analysis.py to include similarity metrics.
Preprocessing here is more advanced than n-gram version.
Lowercase, tokenize, remove stopwords, non-alphabetic characters removal, short words removal, lemmatization.
"""
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from processors.metrics import calculate_similarity

# Currently unused imports, kept in case they are needed again
import numpy as np
from collections import Counter
import re
import nltk

# Define the compare_bow_across_texts function directly in this file
def compare_bow_across_texts(texts, model_names, top_n=25):
    """
    Compare bag-of-words representations across multiple texts.

    Preprocessing: lowercase, tokenize, drop stopwords, non-alphabetic
    tokens, and words shorter than 3 characters, then lemmatize.

    Args:
        texts (list): List of text responses to compare
        model_names (list): Names of models corresponding to responses
        top_n (int): Number of top words to consider

    Returns:
        dict: Bag-of-words analysis results with keys "models",
            "important_words", "word_count_matrix", "differential_words",
            and (when two or more models are given) "comparisons".
    """
    # Initialize the results dictionary
    result = {
        "models": model_names,
        "important_words": {},
        "word_count_matrix": {},
        "differential_words": []
    }

    # Nothing to analyze ("not texts" already covers len(texts) < 1)
    if not texts:
        return result

    # Preprocess texts (tokenize, remove stopwords, lemmatize)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    preprocessed_texts = []
    for text in texts:
        tokens = word_tokenize(text.lower())
        filtered_tokens = [
            lemmatizer.lemmatize(token)
            for token in tokens
            if token.isalpha() and token not in stop_words and len(token) > 2
        ]
        preprocessed_texts.append(" ".join(filtered_tokens))

    # BUG FIX: CountVectorizer raises ValueError("empty vocabulary") when
    # every document is empty after preprocessing (e.g. all-stopword input).
    # Return the empty result structure instead of crashing.
    vectorizer = CountVectorizer(max_features=1000)
    try:
        X = vectorizer.fit_transform(preprocessed_texts)
    except ValueError:
        return result

    # Get feature names (words)
    feature_names = vectorizer.get_feature_names_out()

    # Build per-model word counts and the global word-count matrix.
    # BUG FIX: guard against model_names being longer than texts, which
    # previously raised IndexError on X[i].
    word_counts = {}
    for i, model in enumerate(model_names):
        if i >= X.shape[0]:
            break
        counts = X[i].toarray()[0]
        word_counts[model] = {}

        for j, word in enumerate(feature_names):
            if counts[j] > 0:  # Only store words that appear
                count = int(counts[j])
                word_counts[model][word] = count
                result["word_count_matrix"].setdefault(word, {})[model] = count

    # Top-N most frequent words for each model
    for model, word_freq in word_counts.items():
        sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
        result["important_words"][model] = [
            {"word": word, "count": count}
            for word, count in sorted_words[:top_n]
        ]

    # Differential words: biggest absolute frequency gap between the
    # first two models (current implementation is two-way only).
    if len(model_names) >= 2:
        model1, model2 = model_names[0], model_names[1]

        diff_scores = {
            word: abs(per_model.get(model1, 0) - per_model.get(model2, 0))
            for word, per_model in result["word_count_matrix"].items()
        }
        sorted_diffs = sorted(diff_scores.items(), key=lambda x: x[1], reverse=True)
        result["differential_words"] = [word for word, _ in sorted_diffs[:top_n]]

        # Overlap statistics between the two models' vocabularies
        model1_words = set(word_counts.get(model1, {}))
        model2_words = set(word_counts.get(model2, {}))
        common_words = model1_words & model2_words

        result.setdefault("comparisons", {})[f"{model1} vs {model2}"] = {
            "common_word_count": len(common_words)
        }

    return result

def add_similarity_metrics(bow_results, response_texts, model_names):
    """
    Attach pairwise similarity metrics to bag-of-words analysis results.

    Only the first two responses/models are compared (the current
    implementation handles two-way comparisons only).

    Args:
        bow_results (dict): The bag of words analysis results
        response_texts (list): List of response texts to compare
        model_names (list): List of model names corresponding to responses

    Returns:
        dict: Updated bag of words results with similarity metrics
    """
    # A pairwise comparison needs at least two texts and two model names.
    if len(response_texts) < 2 or len(model_names) < 2:
        print("Need at least two responses to calculate similarity metrics")
        return bow_results

    first_text, second_text = response_texts[:2]
    first_model, second_model = model_names[:2]
    pair_key = f"{first_model} vs {second_model}"

    # Ensure the nested comparison entry exists before writing metrics.
    comparisons = bow_results.setdefault("comparisons", {})
    entry = comparisons.setdefault(pair_key, {})

    # Compute and store the similarity metrics for this pair.
    similarity = calculate_similarity(first_text, second_text)
    for metric_name in ("cosine_similarity", "jaccard_similarity", "semantic_similarity"):
        entry[metric_name] = similarity.get(metric_name, 0)

    # Keep a common_word_count supplied by the BOW analysis; otherwise
    # derive one from each model's top words as a fallback.
    if "common_word_count" not in entry and "important_words" in bow_results:
        top_first = {item["word"] for item in bow_results["important_words"].get(first_model, [])}
        top_second = {item["word"] for item in bow_results["important_words"].get(second_model, [])}
        entry["common_word_count"] = len(top_first & top_second)

    return bow_results

def compare_bow(texts, model_names, top_n=25):
    """
    Compare bag of words between different texts.

    Runs the bag-of-words comparison, then augments the result with
    similarity metrics whenever at least two responses are available.

    Args:
        texts (list): List of text responses to compare
        model_names (list): Names of models corresponding to responses
        top_n (int): Number of top words to consider

    Returns:
        dict: Comparative analysis
    """
    results = compare_bow_across_texts(texts, model_names, top_n)

    # Similarity metrics only make sense for a pairwise comparison.
    have_pair = len(texts) >= 2 and len(model_names) >= 2
    if have_pair:
        results = add_similarity_metrics(results, texts, model_names)

    return results