"""
Updated bow_analysis.py to include similarity metrics.
Preprocessing here is more advanced than n-gram version.
Lowercase, tokenize, remove stopwords, non-alphabetic characters removal, short words removal, lemmatization.
"""
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from processors.metrics import calculate_similarity
# Currently unused imports, kept in case they are needed again
import numpy as np
from collections import Counter
import re
import nltk
def compare_bow_across_texts(texts, model_names, top_n=25):
    """
    Compare bag of words representations across multiple texts.

    Each text is preprocessed (lowercased, tokenized, stopwords and
    non-alphabetic tokens removed, tokens of length <= 2 dropped,
    lemmatized) before being vectorized with CountVectorizer.

    Args:
        texts (list): List of text responses to compare
        model_names (list): Names of models corresponding to responses
        top_n (int): Number of top words to consider

    Returns:
        dict: Bag of words analysis results with keys "models",
            "important_words", "word_count_matrix", "differential_words"
            and, when at least two model names are given, "comparisons".
    """
    result = {
        "models": model_names,
        "important_words": {},
        "word_count_matrix": {},
        "differential_words": []
    }
    # Nothing to analyze (empty list is falsy; the old `len(texts) < 1`
    # check was redundant)
    if not texts:
        return result

    # Preprocess: lowercase, tokenize, filter, lemmatize.
    # Stopword set and lemmatizer are built once, outside the per-text loop.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    preprocessed_texts = []
    for text in texts:
        tokens = word_tokenize(text.lower())
        preprocessed_texts.append(" ".join(
            lemmatizer.lemmatize(token)
            for token in tokens
            if token.isalpha() and token not in stop_words and len(token) > 2
        ))

    # Vectorize. CountVectorizer raises ValueError when the vocabulary is
    # empty (e.g. every token was filtered out during preprocessing);
    # return the empty result instead of crashing.
    vectorizer = CountVectorizer(max_features=1000)
    try:
        X = vectorizer.fit_transform(preprocessed_texts)
    except ValueError:
        return result
    feature_names = vectorizer.get_feature_names_out()

    # Per-model word counts. The bound check guards against model_names
    # being longer than texts, which previously raised IndexError on X[i].
    n_rows = X.shape[0]
    word_counts = {}
    for i, model in enumerate(model_names):
        if i >= n_rows:
            break
        counts = X[i].toarray()[0]
        word_counts[model] = {}
        for j, word in enumerate(feature_names):
            if counts[j] > 0:  # only store words that actually appear
                count = int(counts[j])
                word_counts[model][word] = count
                # word -> {model: count} cross-model matrix
                result["word_count_matrix"].setdefault(word, {})[model] = count

    # Top-N most frequent words for each model
    for model, word_freq in word_counts.items():
        sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
        result["important_words"][model] = [
            {"word": word, "count": count}
            for word, count in sorted_words[:top_n]
        ]

    # Differential words: biggest absolute frequency gap between the first
    # two models (the current implementation only handles two-way comparisons).
    if len(model_names) >= 2:
        model1, model2 = model_names[0], model_names[1]
        diff_scores = {
            word: abs(per_model.get(model1, 0) - per_model.get(model2, 0))
            for word, per_model in result["word_count_matrix"].items()
        }
        sorted_diffs = sorted(diff_scores.items(), key=lambda x: x[1], reverse=True)
        result["differential_words"] = [word for word, _ in sorted_diffs[:top_n]]

        # Overlap statistics for the compared pair
        common_words = set(word_counts.get(model1, {})) & set(word_counts.get(model2, {}))
        result.setdefault("comparisons", {})[f"{model1} vs {model2}"] = {
            "common_word_count": len(common_words)
        }
    return result
def add_similarity_metrics(bow_results, response_texts, model_names):
    """
    Add similarity metrics to the bag of words analysis results.

    Args:
        bow_results (dict): The bag of words analysis results
        response_texts (list): List of response texts to compare
        model_names (list): List of model names corresponding to responses

    Returns:
        dict: Updated bag of words results with similarity metrics
    """
    # A pairwise comparison needs at least two responses and two models
    if len(response_texts) < 2 or len(model_names) < 2:
        print("Need at least two responses to calculate similarity metrics")
        return bow_results

    # Only the first two responses are compared (two-way implementation)
    first_text, second_text = response_texts[:2]
    first_model, second_model = model_names[:2]
    comparison_key = f"{first_model} vs {second_model}"

    # Ensure the nested comparison entry exists before writing into it
    comparisons = bow_results.setdefault("comparisons", {})
    entry = comparisons.setdefault(comparison_key, {})

    # Compute and record the similarity metrics (missing ones default to 0)
    metrics = calculate_similarity(first_text, second_text)
    for metric_name in ("cosine_similarity", "jaccard_similarity", "semantic_similarity"):
        entry[metric_name] = metrics.get(metric_name, 0)

    # Keep an existing common_word_count from the BOW analysis; otherwise
    # fall back to deriving one from the per-model important-word lists.
    if "common_word_count" not in entry and "important_words" in bow_results:
        important = bow_results["important_words"]
        words_a = {item["word"] for item in important.get(first_model, [])}
        words_b = {item["word"] for item in important.get(second_model, [])}
        entry["common_word_count"] = len(words_a & words_b)

    return bow_results
def compare_bow(texts, model_names, top_n=25):
    """
    Compare bag of words between different texts.

    Args:
        texts (list): List of text responses to compare
        model_names (list): Names of models corresponding to responses
        top_n (int): Number of top words to consider

    Returns:
        dict: Comparative analysis
    """
    results = compare_bow_across_texts(texts, model_names, top_n)
    # Similarity metrics only make sense for a pairwise comparison
    has_pair = len(texts) >= 2 and len(model_names) >= 2
    if has_pair:
        results = add_similarity_metrics(results, texts, model_names)
    return results