# 525GradioApp/processors/bow_analysis.py
"""
Updated bow_analysis.py to include similarity metrics.
Preprocessing here is more advanced than in the n-gram version:
lowercasing, tokenization, stopword removal, removal of non-alphabetic
tokens, short-word removal, and lemmatization.
"""
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from processors.metrics import calculate_similarity
# Currently unused imports, kept in case they are needed again
import numpy as np
from collections import Counter
import re
import nltk
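
# NOTE: word_tokenize, stopwords, and WordNetLemmatizer all rely on NLTK data
# packages being installed. A one-time setup sketch (run once per environment):
#   nltk.download('punkt')      # tokenizer models for word_tokenize
#   nltk.download('stopwords')  # English stopword list
#   nltk.download('wordnet')    # lemmatizer dictionary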
# Define the compare_bow_across_texts function directly in this file
def compare_bow_across_texts(texts, model_names, top_n=25):
"""
Compare bag of words representations across multiple texts.
Args:
texts (list): List of text responses to compare
model_names (list): Names of models corresponding to responses
top_n (int): Number of top words to consider
Returns:
dict: Bag of words analysis results
"""
# Initialize the results dictionary
result = {
"models": model_names,
"important_words": {},
"word_count_matrix": {},
"differential_words": []
}
    # Make sure we have texts to analyze, with one model name per text
    # (a length mismatch would cause an index error in the loops below)
    if not texts or len(texts) != len(model_names):
        return result
# Preprocess texts (tokenize, remove stopwords, etc.)
preprocessed_texts = []
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
for text in texts:
# Convert to lowercase and tokenize
tokens = word_tokenize(text.lower())
# Remove stopwords, punctuation, and lemmatize
filtered_tokens = []
for token in tokens:
if token.isalpha() and token not in stop_words and len(token) > 2:
filtered_tokens.append(lemmatizer.lemmatize(token))
preprocessed_texts.append(" ".join(filtered_tokens))
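    # Illustrative example: "The cats were running quickly!" preprocesses to
    # "cat running quickly" (stopwords and punctuation dropped, "cats"
    # lemmatized to "cat"; "running" stays as-is since no POS tag is passed).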
    # Create bag-of-words representations using CountVectorizer.
    # fit_transform raises ValueError on an empty vocabulary (e.g. every
    # input was only stopwords/punctuation), so bail out early in that case.
    if not any(preprocessed_texts):
        return result
    vectorizer = CountVectorizer(max_features=1000)
    X = vectorizer.fit_transform(preprocessed_texts)
# Get feature names (words)
feature_names = vectorizer.get_feature_names_out()
# Create word count matrix
word_counts = {}
for i, model in enumerate(model_names):
counts = X[i].toarray()[0]
word_counts[model] = {}
# Store word frequencies for this model
for j, word in enumerate(feature_names):
if counts[j] > 0: # Only store words that appear
word_counts[model][word] = int(counts[j])
# Add to word count matrix
if word not in result["word_count_matrix"]:
result["word_count_matrix"][word] = {}
result["word_count_matrix"][word][model] = int(counts[j])
# Find important words for each model
for model, word_freq in word_counts.items():
# Sort by frequency
sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
# Store top N words
result["important_words"][model] = [
{"word": word, "count": count}
for word, count in sorted_words[:top_n]
]
# Calculate differential words (words with biggest frequency difference between models)
if len(model_names) >= 2:
model1, model2 = model_names[0], model_names[1]
# Calculate differences
diff_scores = {}
for word in result["word_count_matrix"]:
count1 = result["word_count_matrix"][word].get(model1, 0)
count2 = result["word_count_matrix"][word].get(model2, 0)
# Absolute difference
diff_scores[word] = abs(count1 - count2)
# Sort by difference
sorted_diffs = sorted(diff_scores.items(), key=lambda x: x[1], reverse=True)
result["differential_words"] = [word for word, _ in sorted_diffs[:top_n]]
# Calculate overlap statistics
model1_words = set(word_counts.get(model1, {}).keys())
model2_words = set(word_counts.get(model2, {}).keys())
common_words = model1_words.intersection(model2_words)
# Initialize comparisons if needed
if "comparisons" not in result:
result["comparisons"] = {}
comparison_key = f"{model1} vs {model2}"
result["comparisons"][comparison_key] = {
"common_word_count": len(common_words)
}
return result
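
# Example usage (a minimal sketch; model names and texts are made up):
#   result = compare_bow_across_texts(
#       ["The quick brown fox jumps over the lazy dog.",
#        "A quick red fox leaped over a sleeping dog."],
#       ["model_a", "model_b"],
#       top_n=5,
#   )
#   result["important_words"]["model_a"]
#   # -> [{"word": "quick", "count": 1}, ...] sorted by count
#   result["differential_words"]
#   # -> words whose counts differ most between the two models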
def add_similarity_metrics(bow_results, response_texts, model_names):
"""
Add similarity metrics to the bag of words analysis results
Args:
bow_results (dict): The bag of words analysis results
response_texts (list): List of response texts to compare
model_names (list): List of model names corresponding to responses
Returns:
dict: Updated bag of words results with similarity metrics
"""
# Make sure we have at least two responses to compare
if len(response_texts) < 2 or len(model_names) < 2:
print("Need at least two responses to calculate similarity metrics")
return bow_results
# Get the first two responses (current implementation only handles two-way comparisons)
text1, text2 = response_texts[0], response_texts[1]
model1, model2 = model_names[0], model_names[1]
# Generate the comparison key
comparison_key = f"{model1} vs {model2}"
# Initialize comparisons if needed
if "comparisons" not in bow_results:
bow_results["comparisons"] = {}
# Initialize the comparison entry if needed
if comparison_key not in bow_results["comparisons"]:
bow_results["comparisons"][comparison_key] = {}
# Calculate similarity metrics
metrics = calculate_similarity(text1, text2)
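    # calculate_similarity is expected to return a dict with (at least) the
    # keys used below; an assumed shape, based on the .get() defaults:
    #   {"cosine_similarity": 0.87, "jaccard_similarity": 0.42,
    #    "semantic_similarity": 0.91}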
# Add metrics to the comparison
bow_results["comparisons"][comparison_key].update({
"cosine_similarity": metrics.get("cosine_similarity", 0),
"jaccard_similarity": metrics.get("jaccard_similarity", 0),
"semantic_similarity": metrics.get("semantic_similarity", 0)
})
    # Keep common_word_count from the BOW analysis if it is already present;
    # otherwise fall back to computing it from the top important-words lists
    if "common_word_count" not in bow_results["comparisons"][comparison_key]:
        if "important_words" in bow_results:
            words1 = {item["word"] for item in bow_results["important_words"].get(model1, [])}
            words2 = {item["word"] for item in bow_results["important_words"].get(model2, [])}
            common_words = words1.intersection(words2)
            bow_results["comparisons"][comparison_key]["common_word_count"] = len(common_words)
return bow_results
def compare_bow(texts, model_names, top_n=25):
"""
    Compare bag-of-words representations across texts, adding similarity metrics
    when at least two responses are provided.
Args:
texts (list): List of text responses to compare
model_names (list): Names of models corresponding to responses
top_n (int): Number of top words to consider
Returns:
dict: Comparative analysis
"""
bow_results = compare_bow_across_texts(texts, model_names, top_n)
# Add similarity metrics to the results
if len(texts) >= 2 and len(model_names) >= 2:
bow_results = add_similarity_metrics(bow_results, texts, model_names)
return bow_results
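

if __name__ == "__main__":
    # Quick smoke test (a sketch; assumes the NLTK data noted above is
    # installed and processors.metrics is on the import path).
    demo_texts = [
        "The quick brown fox jumps over the lazy dog.",
        "A quick red fox leaped over a sleeping dog.",
    ]
    demo_results = compare_bow(demo_texts, ["model_a", "model_b"], top_n=10)
    print("Important words:", demo_results["important_words"])
    print("Comparisons:", demo_results.get("comparisons", {}))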