""" | |
N-gram analysis for comparing text responses. | |
Minimal preprocessing is done here, basically just removing stop words and tokenization. From my research this is a good combination for n-gram analysis. | |
""" | |
from sklearn.feature_extraction.text import CountVectorizer

# These imports are currently unused; they supported earlier experimental
# versions of this analysis. That code has been removed, but I decided to
# leave the imports in place in case I start using them again.
from collections import Counter
import numpy as np
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
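
# The helper below is a minimal sketch of how the unused NLTK imports above
# might be applied (assuming the required NLTK corpora, i.e. the punkt
# tokenizer models and the stopwords list, are installed). It is illustrative
# only, not a reconstruction of the removed experimental code, and is not
# part of the current analysis pipeline.
def _nltk_ngram_counts(text, n=2):
    """Count n-grams in text using NLTK tokenization and stop-word removal."""
    stop_words = set(stopwords.words('english'))
    tokens = [
        tok for tok in word_tokenize(text.lower())
        if tok.isalpha() and tok not in stop_words
    ]
    return Counter(ngrams(tokens, n))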
# Helper function to flatten nested lists
def flatten_list(nested_list):
    """
    Recursively flattens a nested list.

    Args:
        nested_list (list): A potentially nested list.

    Yields:
        The elements of nested_list in order, with all nesting removed.
    """
    for item in nested_list:
        if isinstance(item, list):
            yield from flatten_list(item)
        else:
            yield item
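
# Usage sketch: flatten_list is a generator, so wrap it in list() to
# materialize the result, e.g. list(flatten_list([1, [2, [3]], 4]))
# produces [1, 2, 3, 4].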
def compare_ngrams(texts, model_names, n=2, top_n=25):
    """
    Compare n-gram representations across multiple texts.

    Args:
        texts (list): List of text responses to compare
        model_names (list): Names of models corresponding to responses
        n (int): Size of n-grams (1 for unigrams, 2 for bigrams, etc.)
        top_n (int): Number of top n-grams to consider

    Returns:
        dict: N-gram analysis results with keys "models", "ngram_size",
            "important_ngrams", "ngram_count_matrix", and
            "differential_ngrams" (plus "comparisons" when at least two
            models are supplied).
    """
    # Initialize the results dictionary
    result = {
        "models": model_names,
        "ngram_size": n,
        "important_ngrams": {},
        "ngram_count_matrix": {},
        "differential_ngrams": []
    }
    # Make sure we have texts to analyze
    if not texts:
        return result

    # Coerce n and top_n to integers if they arrive as strings
    if isinstance(n, str):
        n = int(n)
    if isinstance(top_n, str):
        top_n = int(top_n)
    try:
        # Create n-gram representations using CountVectorizer
        vectorizer = CountVectorizer(
            ngram_range=(n, n),  # Use the specified n-gram size
            max_features=1000,
            stop_words='english'
        )

        # Ensure each text is a string, without attempting complex preprocessing
        processed_texts = [str(text) for text in texts]
        X = vectorizer.fit_transform(processed_texts)
        # Get feature names (n-grams)
        feature_names = vectorizer.get_feature_names_out()

        # Build per-model n-gram counts; texts and model_names are assumed
        # to be parallel lists
        ngram_counts = {}
        for i, model in enumerate(model_names):
            counts = X[i].toarray()[0]
            ngram_counts[model] = {}
            # Store n-gram frequencies for this model
            for j, ngram in enumerate(feature_names):
                if counts[j] > 0:  # Only store n-grams that appear
                    ngram_counts[model][ngram] = int(counts[j])
                    # Add to the shared n-gram count matrix
                    if ngram not in result["ngram_count_matrix"]:
                        result["ngram_count_matrix"][ngram] = {}
                    result["ngram_count_matrix"][ngram][model] = int(counts[j])
        # Find important n-grams for each model
        for model, ngram_freq in ngram_counts.items():
            # Sort by frequency, descending
            sorted_ngrams = sorted(ngram_freq.items(), key=lambda x: x[1], reverse=True)
            # Store the top N n-grams
            result["important_ngrams"][model] = [
                {"ngram": ngram, "count": count}
                for ngram, count in sorted_ngrams[:top_n]
            ]
        # Calculate differential n-grams: those with the biggest frequency
        # difference between the first two models (additional models are
        # ignored in this comparison)
        if len(model_names) >= 2:
            model1, model2 = model_names[0], model_names[1]

            # Absolute frequency difference per n-gram
            diff_scores = {}
            for ngram in result["ngram_count_matrix"]:
                count1 = result["ngram_count_matrix"][ngram].get(model1, 0)
                count2 = result["ngram_count_matrix"][ngram].get(model2, 0)
                diff_scores[ngram] = abs(count1 - count2)

            # Sort by difference, descending
            sorted_diffs = sorted(diff_scores.items(), key=lambda x: x[1], reverse=True)
            result["differential_ngrams"] = [ngram for ngram, _ in sorted_diffs[:top_n]]

            # Calculate overlap statistics
            model1_ngrams = set(ngram_counts.get(model1, {}).keys())
            model2_ngrams = set(ngram_counts.get(model2, {}).keys())
            common_ngrams = model1_ngrams.intersection(model2_ngrams)

            # Initialize comparisons if needed
            if "comparisons" not in result:
                result["comparisons"] = {}
            comparison_key = f"{model1} vs {model2}"
            result["comparisons"][comparison_key] = {
                "common_ngram_count": len(common_ngrams)
            }

        return result
    except Exception as e:
        import traceback
        error_msg = f"N-gram analysis error: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)

        # Return the basic structure along with the error message
        return {
            "models": model_names,
            "ngram_size": n,
            "error": str(e)
        }
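
# A small usage sketch. The texts and model names below are hypothetical
# examples, not real model outputs.
if __name__ == "__main__":
    sample_texts = [
        "The quick brown fox jumps over the lazy dog",
        "The quick red fox runs past the sleeping dog",
    ]
    sample_models = ["model_a", "model_b"]

    analysis = compare_ngrams(sample_texts, sample_models, n=2, top_n=5)
    print("Top bigrams per model:", analysis["important_ngrams"])
    print("Differential bigrams:", analysis["differential_ngrams"])
    print("Overlap:", analysis.get("comparisons", {}))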