Spaces:
Sleeping
Sleeping
File size: 4,441 Bytes
39cf944 08f222a 39cf944 08f222a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
"""
N-gram analysis for comparing text responses
"""
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from collections import Counter
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
def compare_ngrams(texts, model_names, n=2, top_n=25):
    """
    Compare n-gram representations across multiple texts.

    Args:
        texts (list): List of text responses to compare
        model_names (list): Names of models corresponding to responses
            (paired positionally with ``texts``; extras are ignored)
        n (int or str): Size of n-grams (1 for unigrams, 2 for bigrams, etc.)
        top_n (int or str): Number of top n-grams to keep per model

    Returns:
        dict: N-gram analysis results with keys:
            - "models": the input model names
            - "ngram_size": n (as int)
            - "important_ngrams": {model: [{"ngram": ..., "count": ...}, ...]}
            - "ngram_count_matrix": {ngram: {model: count}}
            - "differential_ngrams": n-grams with the largest count gap
              between the first two models (only populated with >= 2 models)
            - "comparisons": overlap stats for the first two models
              (only present with >= 2 models)
            - "error": present only if the analysis raised
    """
    # Coerce string arguments (e.g. values arriving from a web form) to
    # integers BEFORE building the result skeleton, so "ngram_size" is
    # always recorded as an int (the original coerced after, leaking the
    # raw string into the result).
    if isinstance(n, str):
        n = int(n)
    if isinstance(top_n, str):
        top_n = int(top_n)

    result = {
        "models": model_names,
        "ngram_size": n,
        "important_ngrams": {},
        "ngram_count_matrix": {},
        "differential_ngrams": [],
    }

    # Nothing to analyze: return the empty skeleton.
    if not texts:
        return result

    try:
        # Bag-of-ngrams representation; English stop words removed,
        # capped at 1000 features to bound memory.
        vectorizer = CountVectorizer(
            ngram_range=(n, n),
            max_features=1000,
            stop_words='english',
        )
        X = vectorizer.fit_transform(texts)
        feature_names = vectorizer.get_feature_names_out()

        # Per-model n-gram frequencies. zip() pairs models with rows and
        # stops at the shorter sequence — the original indexed X[i] by
        # model position, which raised IndexError when model_names was
        # longer than texts.
        ngram_counts = {}
        for model, row in zip(model_names, X.toarray()):
            model_freqs = {}
            for ngram, count in zip(feature_names, row):
                if count > 0:  # only keep n-grams that actually appear
                    c = int(count)
                    model_freqs[ngram] = c
                    result["ngram_count_matrix"].setdefault(ngram, {})[model] = c
            ngram_counts[model] = model_freqs

        # Top-N most frequent n-grams for each model.
        for model, ngram_freq in ngram_counts.items():
            sorted_ngrams = sorted(ngram_freq.items(), key=lambda kv: kv[1], reverse=True)
            result["important_ngrams"][model] = [
                {"ngram": ngram, "count": count}
                for ngram, count in sorted_ngrams[:top_n]
            ]

        # Differential n-grams: largest absolute count gap between the
        # FIRST TWO models only (additional models are ignored here,
        # matching the original behavior).
        if len(model_names) >= 2:
            model1, model2 = model_names[0], model_names[1]
            diff_scores = {
                ngram: abs(counts.get(model1, 0) - counts.get(model2, 0))
                for ngram, counts in result["ngram_count_matrix"].items()
            }
            sorted_diffs = sorted(diff_scores.items(), key=lambda kv: kv[1], reverse=True)
            result["differential_ngrams"] = [ngram for ngram, _ in sorted_diffs[:top_n]]

            # Overlap statistics between the first two models.
            common_ngrams = set(ngram_counts.get(model1, {})) & set(ngram_counts.get(model2, {}))
            result.setdefault("comparisons", {})[f"{model1} vs {model2}"] = {
                "common_ngram_count": len(common_ngrams)
            }

        return result
    except Exception as e:
        import traceback
        print(f"N-gram analysis error: {str(e)}\n{traceback.format_exc()}")
        # Preserve the standard result shape on failure so callers can
        # rely on the usual keys (the original returned a truncated dict
        # here, crashing any caller that indexed them).
        result["error"] = str(e)
        return result
|