File size: 7,743 Bytes
7138f76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7731b47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7138f76
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import statistics
import re

def download_nltk_resources():
    """Download required NLTK resources (best-effort).

    Failures (e.g. no network access) are deliberately ignored so that
    importing this module never crashes; `classify_sentiment` degrades
    gracefully when the lexicon is missing.
    """
    try:
        nltk.download('vader_lexicon', quiet=True)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; download problems remain non-fatal by design.
        pass

# Ensure NLTK resources are available at import time (best-effort; see above).
download_nltk_resources()

def classify_formality(text):
    """
    Heuristically rate how formal a piece of text is.

    Args:
        text (str): Text to analyze

    Returns:
        str: Formality level (Formal, Neutral, or Informal)
    """
    # Regex cues that tend to appear in formal prose.
    formal_patterns = (
        r'\b(therefore|thus|consequently|furthermore|moreover|however)\b',
        r'\b(in accordance with|with respect to|regarding|concerning)\b',
        r'\b(shall|must|may|will be required to)\b',
        r'\b(it is|there are|there is)\b',
        r'\b(Mr\.|Ms\.|Dr\.|Prof\.)\b',
    )

    # Regex cues typical of casual writing.
    informal_patterns = (
        r'\b(like|yeah|cool|awesome|gonna|wanna|gotta)\b',
        r'(\!{2,}|\?{2,})',
        r'\b(lol|haha|wow|omg|btw)\b',
        r'\b(don\'t|can\'t|won\'t|shouldn\'t)\b',
        r'(\.{3,})',
    )

    word_count = len(text.split())

    def hits_per_hundred_words(patterns):
        # Total case-insensitive matches, normalized to a per-100-words rate
        # so long and short texts compare fairly.
        total = sum(len(re.findall(p, text, re.IGNORECASE)) for p in patterns)
        if word_count > 0:
            total = total / (word_count / 100)
        return total

    formal_rate = hits_per_hundred_words(formal_patterns)
    informal_rate = hits_per_hundred_words(informal_patterns)

    # One side must beat the other by a 1.5x margin to win outright.
    if formal_rate > informal_rate * 1.5:
        return "Formal"
    if informal_rate > formal_rate * 1.5:
        return "Informal"
    return "Neutral"

def classify_sentiment(text):
    """
    Classify text sentiment using NLTK's VADER analyzer.

    Args:
        text (str): Text to analyze

    Returns:
        str: Sentiment (Positive, Neutral, or Negative). "Neutral" is also
        the fallback when VADER is unusable (e.g. the vader_lexicon was
        never downloaded).
    """
    try:
        sia = SentimentIntensityAnalyzer()
        # VADER's compound score lies in [-1, 1]; +/-0.05 are the
        # thresholds recommended by the VADER authors.
        compound = sia.polarity_scores(text)['compound']
        if compound >= 0.05:
            return "Positive"
        elif compound <= -0.05:
            return "Negative"
        else:
            return "Neutral"
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # propagate; analyzer failures degrade to a neutral verdict.
        return "Neutral"

def classify_complexity(text):
    """
    Rate text complexity from average sentence and word lengths.

    Args:
        text (str): Text to analyze

    Returns:
        str: Complexity level (Simple, Average, or Complex)
    """
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return "Average"

    # Mean number of whitespace-separated words per sentence.
    lengths = [len(sentence.split()) for sentence in sentences]
    mean_sentence_len = statistics.mean(lengths) if lengths else 0

    # Mean character length over alphanumeric tokens only, so punctuation
    # tokens don't drag the average down.
    tokens = [
        token
        for sentence in sentences
        for token in nltk.word_tokenize(sentence)
        if token.isalnum()
    ]
    mean_word_len = statistics.mean(len(t) for t in tokens) if tokens else 0

    # Long sentences or long words push toward Complex; short ones toward
    # Simple; everything in between is Average.
    if mean_sentence_len > 20 or mean_word_len > 6:
        return "Complex"
    if mean_sentence_len < 12 or mean_word_len < 4:
        return "Simple"
    return "Average"

def compare_classifications(text1, text2):
    """
    Summarize writing-style differences between two texts.

    Args:
        text1 (str): First text
        text2 (str): Second text

    Returns:
        dict: Human-readable difference descriptions keyed by dimension;
        a single "Summary" entry when the texts classify identically.
    """
    differences = {}

    f1, f2 = classify_formality(text1), classify_formality(text2)
    if f1 != f2:
        differences["Formality"] = (
            f"Model 1 is {f1.lower()}, while Model 2 is {f2.lower()}"
        )

    s1, s2 = classify_sentiment(text1), classify_sentiment(text2)
    if s1 != s2:
        differences["Sentiment"] = (
            f"Model 1 has a {s1.lower()} tone, while Model 2 has a {s2.lower()} tone"
        )

    c1, c2 = classify_complexity(text1), classify_complexity(text2)
    if c1 != c2:
        differences["Complexity"] = (
            f"Model 1 uses {c1.lower()} language, while Model 2 uses {c2.lower()} language"
        )

    # No dimension differed — report overall similarity instead.
    if not differences:
        differences["Summary"] = "Both responses have similar writing characteristics"

    return differences

def classify_with_roberta(text, task="sentiment", model_name=None):
    """
    Classify text with a RoBERTa-family model via Hugging Face pipelines.

    Args:
        text (str): Text to analyze
        task (str): Classification task ('sentiment', 'toxicity', 'topic', 'person')
        model_name (str, optional): Explicit model id; when None a
            task-appropriate default is chosen.

    Returns:
        dict: Classification results with labels and scores, or an
        {"error": ...} dict when dependencies are missing or inference fails.
    """
    try:
        # Imported lazily so the rest of the module works without these
        # heavy optional dependencies installed.
        import torch
        from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

        # Default pre-trained model for each supported task.
        default_models = {
            "sentiment": "cardiffnlp/twitter-roberta-base-sentiment",
            "toxicity": "cardiffnlp/twitter-roberta-base-hate",
            "topic": "facebook/bart-large-mnli",  # Zero-shot classification for topics
            "person": "roberta-base",  # Default for person detection - could be fine-tuned
        }

        # Explicit model wins; otherwise fall back to the task default,
        # then to plain roberta-base.
        if model_name is not None:
            chosen_model = model_name
        else:
            chosen_model = default_models.get(task, "roberta-base")

        if task == "topic":
            # Topics use zero-shot classification over a fixed label set.
            zero_shot = pipeline("zero-shot-classification", model=chosen_model)
            candidate_labels = ["economy", "foreign policy", "healthcare", "environment", "immigration"]
            outcome = zero_shot(text, candidate_labels, multi_label=False)
            return {
                "labels": outcome["labels"],
                "scores": outcome["scores"],
            }

        classifier = pipeline("text-classification", model=chosen_model, return_all_scores=True)
        outcome = classifier(text)

        # Unwrap the single-text result for a consistent output shape.
        if isinstance(outcome, list) and len(outcome) == 1:
            outcome = outcome[0]

        return {
            "task": task,
            "model": chosen_model,
            "results": outcome,
        }

    except ImportError:
        return {"error": "Required packages not installed. Please install transformers and torch."}
    except Exception as e:
        return {"error": f"Classification failed: {str(e)}"}

def analyze_dataset_with_roberta(dataset_texts, task="topic"):
    """
    Run a RoBERTa classification task over a batch of texts.

    Args:
        dataset_texts (dict): Maps text identifiers to text content
        task (str): Classification task to perform

    Returns:
        dict: Classification results keyed by text identifier
    """
    # One classification call per text, preserving the caller's keys.
    return {
        text_id: classify_with_roberta(content, task=task)
        for text_id, content in dataset_texts.items()
    }