"""
N-gram analysis for comparing text responses.
Preprocessing is intentionally minimal: just tokenization and English stop-word
removal (handled here by scikit-learn's CountVectorizer), which is a good
combination for n-gram analysis.
"""
from sklearn.feature_extraction.text import CountVectorizer
# The imports below are not currently used; they supported earlier experimental
# versions of this analysis (that code has since been removed), but are kept in
# case they are needed again.
from collections import Counter
import numpy as np
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Helper function to flatten nested lists
def flatten_list(nested_list):
"""
Recursively flattens a nested list.
Args:
nested_list (list): A potentially nested list.
    Yields:
        Items of the nested list in order, with all nesting removed.
"""
for item in nested_list:
if isinstance(item, list):
yield from flatten_list(item)
else:
yield item
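
# Note: flatten_list is a generator, so wrap it in list() to materialize the
# result. Illustrative example (not taken from the original module's usage):
#     list(flatten_list([["a", ["b"]], "c"]))  ->  ["a", "b", "c"]
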
def compare_ngrams(texts, model_names, n=2, top_n=25):
"""
Compare n-gram representations across multiple texts.
Args:
texts (list): List of text responses to compare
model_names (list): Names of models corresponding to responses
n (int): Size of n-grams (1 for unigrams, 2 for bigrams, etc.)
top_n (int): Number of top n-grams to consider
Returns:
dict: N-gram analysis results
"""
# Initialize the results dictionary
result = {
"models": model_names,
"ngram_size": n,
"important_ngrams": {},
"ngram_count_matrix": {},
"differential_ngrams": []
}
    # Make sure we have texts to analyze
    if not texts:
        return result
# Convert n to integer if it's a string
if isinstance(n, str):
n = int(n)
# Convert top_n to integer if necessary
if isinstance(top_n, str):
top_n = int(top_n)
try:
# Create n-gram representations using CountVectorizer
vectorizer = CountVectorizer(
ngram_range=(n, n), # Use the specified n-gram size
max_features=1000,
stop_words='english'
)
# Ensure each text is a string, without attempting complex preprocessing
processed_texts = [str(text) if not isinstance(text, str) else text for text in texts]
X = vectorizer.fit_transform(processed_texts)
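        # X is a sparse (num_texts x num_ngrams) count matrix; row i corresponds
        # to processed_texts[i] and, by assumption, to model_names[i].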
# Get feature names (n-grams)
feature_names = vectorizer.get_feature_names_out()
# Create n-gram count matrix
ngram_counts = {}
for i, model in enumerate(model_names):
counts = X[i].toarray()[0]
ngram_counts[model] = {}
# Store n-gram frequencies for this model
for j, ngram in enumerate(feature_names):
if counts[j] > 0: # Only store n-grams that appear
ngram_counts[model][ngram] = int(counts[j])
# Add to n-gram count matrix
if ngram not in result["ngram_count_matrix"]:
result["ngram_count_matrix"][ngram] = {}
result["ngram_count_matrix"][ngram][model] = int(counts[j])
# Find important n-grams for each model
for model, ngram_freq in ngram_counts.items():
# Sort by frequency
sorted_ngrams = sorted(ngram_freq.items(), key=lambda x: x[1], reverse=True)
# Store top N n-grams
result["important_ngrams"][model] = [
{"ngram": ngram, "count": count}
for ngram, count in sorted_ngrams[:top_n]
]
        # Calculate differential n-grams: the n-grams with the largest frequency
        # difference between the first two models (only this pair is compared)
        if len(model_names) >= 2:
            model1, model2 = model_names[0], model_names[1]
# Calculate differences
diff_scores = {}
for ngram in result["ngram_count_matrix"]:
count1 = result["ngram_count_matrix"][ngram].get(model1, 0)
count2 = result["ngram_count_matrix"][ngram].get(model2, 0)
# Absolute difference
diff_scores[ngram] = abs(count1 - count2)
# Sort by difference
sorted_diffs = sorted(diff_scores.items(), key=lambda x: x[1], reverse=True)
result["differential_ngrams"] = [ngram for ngram, _ in sorted_diffs[:top_n]]
# Calculate overlap statistics
model1_ngrams = set(ngram_counts.get(model1, {}).keys())
model2_ngrams = set(ngram_counts.get(model2, {}).keys())
common_ngrams = model1_ngrams.intersection(model2_ngrams)
# Initialize comparisons if needed
if "comparisons" not in result:
result["comparisons"] = {}
comparison_key = f"{model1} vs {model2}"
result["comparisons"][comparison_key] = {
"common_ngram_count": len(common_ngrams)
}
return result
except Exception as e:
import traceback
error_msg = f"N-gram analysis error: {str(e)}\n{traceback.format_exc()}"
print(error_msg)
# Return basic structure with error
return {
"models": model_names,
"ngram_size": n,
"error": str(e)
}
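
# Minimal usage sketch (illustrative only: the texts and model names below are
# made-up examples, not part of the original module).
if __name__ == "__main__":
    sample_texts = [
        "The quick brown fox jumps over the lazy dog.",
        "A quick brown dog jumps over a sleeping fox.",
    ]
    sample_models = ["model_a", "model_b"]
    analysis = compare_ngrams(sample_texts, sample_models, n=2, top_n=10)
    for model, top_ngrams in analysis["important_ngrams"].items():
        print(model, top_ngrams)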