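"""Bag-of-words utilities for comparing model text responses.

Covers text preprocessing (lowercasing, stopword removal, lemmatization),
bag-of-words construction, and pairwise / multi-model comparison metrics
(Jaccard similarity, cosine similarity, and differential-word analysis).
"""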
import numpy as np
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download necessary NLTK data (each download is skipped if already present)
for resource, path in [
    ('punkt', 'tokenizers/punkt'),
    ('stopwords', 'corpora/stopwords'),
    ('wordnet', 'corpora/wordnet'),
]:
    try:
        nltk.data.find(path)
    except LookupError:
        nltk.download(resource)

def preprocess_text(text):
    """
    Preprocess text for bag of words analysis
    
    Args:
        text (str): Input text
        
    Returns:
        str: Preprocessed text
    """
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Filter out short words (likely not meaningful)
    tokens = [token for token in tokens if len(token) > 2]
    
    # Join back to string
    return ' '.join(tokens)
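# Example (illustrative): with the default WordNet noun lemmatization above,
# preprocess_text("The cats are running!") yields "cat running".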

def create_bow(text):
    """
    Create bag of words representation
    
    Args:
        text (str): Input text
        
    Returns:
        dict: Bag of words representation with word counts
    """
    # Preprocess text
    preprocessed_text = preprocess_text(text)
    
    # Tokenize
    tokens = preprocessed_text.split()
    
    # Count occurrences
    word_counts = Counter(tokens)
    
    return dict(word_counts)
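# Example (illustrative): create_bow("Dogs chase dogs.") gives
# {"dog": 2, "chase": 1} after preprocessing and lemmatization.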

def compare_bow_pair(bow1, bow2):
    """
    Compare two bag of words representations
    
    Args:
        bow1 (dict): First bag of words
        bow2 (dict): Second bag of words
        
    Returns:
        dict: Comparison metrics
    """
    # Get all unique words
    all_words = set(bow1.keys()).union(set(bow2.keys()))
    
    # Words in both
    common_words = set(bow1.keys()).intersection(set(bow2.keys()))
    
    # Words unique to each
    unique_to_1 = set(bow1.keys()) - set(bow2.keys())
    unique_to_2 = set(bow2.keys()) - set(bow1.keys())
    
    # Calculate Jaccard similarity
    jaccard = len(common_words) / len(all_words) if len(all_words) > 0 else 0
    
    # Calculate cosine similarity
    vec1 = np.zeros(len(all_words))
    vec2 = np.zeros(len(all_words))
    
    for i, word in enumerate(all_words):
        vec1[i] = bow1.get(word, 0)
        vec2[i] = bow2.get(word, 0)
    
    # Normalize vectors
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    
    if norm1 == 0 or norm2 == 0:
        cosine = 0.0
    else:
        # Cast to a plain float so the result serializes cleanly (e.g. to JSON)
        cosine = float(np.dot(vec1, vec2) / (norm1 * norm2))
    
    return {
        "jaccard_similarity": jaccard,
        "cosine_similarity": cosine,
        "common_word_count": len(common_words),
        "unique_to_first": list(unique_to_1)[:20],  # Limit for readability
        "unique_to_second": list(unique_to_2)[:20]  # Limit for readability
    }
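# Example (illustrative): for bow1 = {"cat": 2, "dog": 1} and
# bow2 = {"cat": 1, "bird": 3}, "cat" is the only common word among three
# total words, so jaccard_similarity = 1/3 and cosine_similarity ≈ 0.28.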

def important_words(bow, top_n=10):
    """
    Extract most important/distinctive words
    
    Args:
        bow (dict): Bag of words representation
        top_n (int): Number of top words to return
        
    Returns:
        list: Top words with counts
    """
    # Sort by count
    sorted_words = sorted(bow.items(), key=lambda x: x[1], reverse=True)
    
    # Return top N
    return [{"word": word, "count": count} for word, count in sorted_words[:top_n]]
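# Example (illustrative): important_words({"cat": 3, "dog": 1}, top_n=1)
# returns [{"word": "cat", "count": 3}].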

def compare_bow_across_texts(texts, model_names, top_n=25):
    """
    Compare bag of words across multiple texts
    
    Args:
        texts (list): List of text responses
        model_names (list): List of model names corresponding to responses
        top_n (int): Number of top words to include
        
    Returns:
        dict: Comparative bag of words analysis
    """
    # Create bag of words for each text
    bows = [create_bow(text) for text in texts]
    
    # Map to models
    model_bows = {model: bow for model, bow in zip(model_names, bows)}
    
    # Get important words for each model
    model_important_words = {model: important_words(bow, top_n) for model, bow in model_bows.items()}
    
    # Compare pairwise
    comparisons = {}
    for i, model1 in enumerate(model_names):
        for j, model2 in enumerate(model_names):
            if j <= i:  # Avoid duplicate comparisons
                continue
            
            comparison_key = f"{model1} vs {model2}"
            comparisons[comparison_key] = compare_bow_pair(model_bows[model1], model_bows[model2])
    
    # Create combined word list across all models
    all_words = set()
    for bow in bows:
        all_words.update(bow.keys())
    
    # Create a matrix of word counts across models; every word in all_words
    # already appears in at least one bow by construction
    word_count_matrix = {}
    for word in sorted(all_words):
        word_count_matrix[word] = {model: bow.get(word, 0) for model, bow in zip(model_names, bows)}
    
    # Sort matrix by most differential words (words with biggest variance across models)
    word_variances = {}
    for word, counts in word_count_matrix.items():
        count_values = list(counts.values())
        if len(count_values) > 1:
            word_variances[word] = np.var(count_values)
    
    # Get top differential words
    top_diff_words = sorted(word_variances.items(), key=lambda x: x[1], reverse=True)[:top_n]
    differential_words = [word for word, _ in top_diff_words]
    
    # Format results
    result = {
        "model_word_counts": model_bows,
        "important_words": model_important_words,
        "comparisons": comparisons,
        "differential_words": differential_words,
        "word_count_matrix": {word: word_count_matrix[word] for word in differential_words},
        "models": model_names
    }
    
    return result

def compare_bow(texts, model_names, top_n=25):
    """
    Compare bag of words between different texts

    Convenience wrapper around compare_bow_across_texts; the pairwise
    comparison of two precomputed bags is handled by compare_bow_pair.

    Args:
        texts (list): List of text responses to compare
        model_names (list): Names of models corresponding to responses
        top_n (int): Number of top words to consider

    Returns:
        dict: Comparative analysis
    """
    return compare_bow_across_texts(texts, model_names, top_n)
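
# Minimal usage sketch (illustrative only): the sample texts and model names
# below are made up to show the expected call shape and output keys.
if __name__ == "__main__":
    sample_texts = [
        "The quick brown fox jumps over the lazy dog.",
        "A fast brown fox leaped over a sleeping dog.",
    ]
    sample_models = ["model_a", "model_b"]

    analysis = compare_bow(sample_texts, sample_models, top_n=10)

    for pair, metrics in analysis["comparisons"].items():
        print(f"{pair}: jaccard={metrics['jaccard_similarity']:.3f}, "
              f"cosine={metrics['cosine_similarity']:.3f}")
    print("Differential words:", analysis["differential_words"])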