"""
Updated bow_analysis.py to include similarity metrics.
Preprocessing here is more advanced than n-gram version.
Lowercase, tokenize, remove stopwords, non-alphabetic characters removal, short words removal, lemmatization.
"""
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from processors.metrics import calculate_similarity
# Currently unused imports, kept in case they are needed again
import numpy as np
from collections import Counter
import re
import nltk
def compare_bow_across_texts(texts, model_names, top_n=25):
    """
    Compare bag of words representations across multiple texts.

    Each text is preprocessed (lowercased, tokenized, stopwords and
    non-alphabetic tokens removed, tokens of length <= 2 dropped,
    lemmatized) before being vectorized with CountVectorizer.

    Args:
        texts (list): List of text responses to compare
        model_names (list): Names of models corresponding to responses
        top_n (int): Number of top words to consider

    Returns:
        dict: Bag of words analysis results with keys "models",
            "important_words", "word_count_matrix", "differential_words"
            and, when at least two model names are given, "comparisons".
    """
    result = {
        "models": model_names,
        "important_words": {},
        "word_count_matrix": {},
        "differential_words": []
    }
    # Nothing to analyze (empty list is falsy; the old `len(texts) < 1`
    # check was redundant)
    if not texts:
        return result

    # Preprocess: lowercase, tokenize, filter, lemmatize.
    # Stopword set and lemmatizer are built once, outside the per-text loop.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    preprocessed_texts = []
    for text in texts:
        tokens = word_tokenize(text.lower())
        preprocessed_texts.append(" ".join(
            lemmatizer.lemmatize(token)
            for token in tokens
            if token.isalpha() and token not in stop_words and len(token) > 2
        ))

    # Vectorize. CountVectorizer raises ValueError when the vocabulary is
    # empty (e.g. every token was filtered out during preprocessing);
    # return the empty result instead of crashing.
    vectorizer = CountVectorizer(max_features=1000)
    try:
        X = vectorizer.fit_transform(preprocessed_texts)
    except ValueError:
        return result
    feature_names = vectorizer.get_feature_names_out()

    # Per-model word counts. The bound check guards against model_names
    # being longer than texts, which previously raised IndexError on X[i].
    n_rows = X.shape[0]
    word_counts = {}
    for i, model in enumerate(model_names):
        if i >= n_rows:
            break
        counts = X[i].toarray()[0]
        word_counts[model] = {}
        for j, word in enumerate(feature_names):
            if counts[j] > 0:  # only store words that actually appear
                count = int(counts[j])
                word_counts[model][word] = count
                # word -> {model: count} cross-model matrix
                result["word_count_matrix"].setdefault(word, {})[model] = count

    # Top-N most frequent words for each model
    for model, word_freq in word_counts.items():
        sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
        result["important_words"][model] = [
            {"word": word, "count": count}
            for word, count in sorted_words[:top_n]
        ]

    # Differential words: biggest absolute frequency gap between the first
    # two models (the current implementation only handles two-way comparisons).
    if len(model_names) >= 2:
        model1, model2 = model_names[0], model_names[1]
        diff_scores = {
            word: abs(per_model.get(model1, 0) - per_model.get(model2, 0))
            for word, per_model in result["word_count_matrix"].items()
        }
        sorted_diffs = sorted(diff_scores.items(), key=lambda x: x[1], reverse=True)
        result["differential_words"] = [word for word, _ in sorted_diffs[:top_n]]

        # Overlap statistics for the compared pair
        common_words = set(word_counts.get(model1, {})) & set(word_counts.get(model2, {}))
        result.setdefault("comparisons", {})[f"{model1} vs {model2}"] = {
            "common_word_count": len(common_words)
        }
    return result
def add_similarity_metrics(bow_results, response_texts, model_names):
    """
    Add similarity metrics to the bag of words analysis results.

    Args:
        bow_results (dict): The bag of words analysis results
        response_texts (list): List of response texts to compare
        model_names (list): List of model names corresponding to responses

    Returns:
        dict: Updated bag of words results with similarity metrics
    """
    # A pairwise comparison needs at least two responses and two models
    if len(response_texts) < 2 or len(model_names) < 2:
        print("Need at least two responses to calculate similarity metrics")
        return bow_results

    # Only the first two responses are compared (two-way implementation)
    first_text, second_text = response_texts[:2]
    first_model, second_model = model_names[:2]
    comparison_key = f"{first_model} vs {second_model}"

    # Ensure the nested comparison entry exists before writing into it
    comparisons = bow_results.setdefault("comparisons", {})
    entry = comparisons.setdefault(comparison_key, {})

    # Compute and record the similarity metrics (missing ones default to 0)
    metrics = calculate_similarity(first_text, second_text)
    for metric_name in ("cosine_similarity", "jaccard_similarity", "semantic_similarity"):
        entry[metric_name] = metrics.get(metric_name, 0)

    # Keep an existing common_word_count from the BOW analysis; otherwise
    # fall back to deriving one from the per-model important-word lists.
    if "common_word_count" not in entry and "important_words" in bow_results:
        important = bow_results["important_words"]
        words_a = {item["word"] for item in important.get(first_model, [])}
        words_b = {item["word"] for item in important.get(second_model, [])}
        entry["common_word_count"] = len(words_a & words_b)

    return bow_results
def compare_bow(texts, model_names, top_n=25):
    """
    Compare bag of words between different texts.

    Args:
        texts (list): List of text responses to compare
        model_names (list): Names of models corresponding to responses
        top_n (int): Number of top words to consider

    Returns:
        dict: Comparative analysis
    """
    results = compare_bow_across_texts(texts, model_names, top_n)
    # Similarity metrics only make sense for a pairwise comparison
    has_pair = len(texts) >= 2 and len(model_names) >= 2
    if has_pair:
        results = add_similarity_metrics(results, texts, model_names)
    return results