""" N-gram analysis for comparing text responses. Minimal preprocessing is done here, basically just removing stop words and tokenization. From my research this is a good combination for n-gram analysis. """ from sklearn.feature_extraction.text import CountVectorizer # these aren't used currently, as they were imports for testing versions with them. the code is removed also, but I decided to just leave these imports incase I start using them again. from collections import Counter import numpy as np import nltk from nltk.util import ngrams from nltk.tokenize import word_tokenize from nltk.corpus import stopwords # Helper function to flatten nested lists def flatten_list(nested_list): """ Recursively flattens a nested list. Args: nested_list (list): A potentially nested list. Returns: list: A flattened list. """ for item in nested_list: if isinstance(item, list): yield from flatten_list(item) else: yield item def compare_ngrams(texts, model_names, n=2, top_n=25): """ Compare n-gram representations across multiple texts. Args: texts (list): List of text responses to compare model_names (list): Names of models corresponding to responses n (int): Size of n-grams (1 for unigrams, 2 for bigrams, etc.) top_n (int): Number of top n-grams to consider Returns: dict: N-gram analysis results """ # Initialize the results dictionary result = { "models": model_names, "ngram_size": n, "important_ngrams": {}, "ngram_count_matrix": {}, "differential_ngrams": [] } # Make sure we have texts to analyze if not texts or len(texts) < 1: return result # Convert n to integer if it's a string if isinstance(n, str): n = int(n) # Convert top_n to integer if necessary if isinstance(top_n, str): top_n = int(top_n) try: # Create n-gram representations using CountVectorizer vectorizer = CountVectorizer( ngram_range=(n, n), # Use the specified n-gram size max_features=1000, stop_words='english' ) # Ensure each text is a string, without attempting complex preprocessing processed_texts = [str(text) if not isinstance(text, str) else text for text in texts] X = vectorizer.fit_transform(processed_texts) # Get feature names (n-grams) feature_names = vectorizer.get_feature_names_out() # Create n-gram count matrix ngram_counts = {} for i, model in enumerate(model_names): counts = X[i].toarray()[0] ngram_counts[model] = {} # Store n-gram frequencies for this model for j, ngram in enumerate(feature_names): if counts[j] > 0: # Only store n-grams that appear ngram_counts[model][ngram] = int(counts[j]) # Add to n-gram count matrix if ngram not in result["ngram_count_matrix"]: result["ngram_count_matrix"][ngram] = {} result["ngram_count_matrix"][ngram][model] = int(counts[j]) # Find important n-grams for each model for model, ngram_freq in ngram_counts.items(): # Sort by frequency sorted_ngrams = sorted(ngram_freq.items(), key=lambda x: x[1], reverse=True) # Store top N n-grams result["important_ngrams"][model] = [ {"ngram": ngram, "count": count} for ngram, count in sorted_ngrams[:top_n] ] # Calculate differential n-grams (n-grams with biggest frequency difference between models) if len(model_names) >= 2: model1, model2 = model_names[0], model_names[1] # Calculate differences diff_scores = {} for ngram in result["ngram_count_matrix"]: count1 = result["ngram_count_matrix"][ngram].get(model1, 0) count2 = result["ngram_count_matrix"][ngram].get(model2, 0) # Absolute difference diff_scores[ngram] = abs(count1 - count2) # Sort by difference sorted_diffs = sorted(diff_scores.items(), key=lambda x: x[1], reverse=True) result["differential_ngrams"] = [ngram for ngram, _ in sorted_diffs[:top_n]] # Calculate overlap statistics model1_ngrams = set(ngram_counts.get(model1, {}).keys()) model2_ngrams = set(ngram_counts.get(model2, {}).keys()) common_ngrams = model1_ngrams.intersection(model2_ngrams) # Initialize comparisons if needed if "comparisons" not in result: result["comparisons"] = {} comparison_key = f"{model1} vs {model2}" result["comparisons"][comparison_key] = { "common_ngram_count": len(common_ngrams) } return result except Exception as e: import traceback error_msg = f"N-gram analysis error: {str(e)}\n{traceback.format_exc()}" print(error_msg) # Return basic structure with error return { "models": model_names, "ngram_size": n, "error": str(e) }