import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import statistics
import re


def download_nltk_resources():
    """Download required NLTK resources if not already downloaded"""
    # 'vader_lexicon' is needed by SentimentIntensityAnalyzer; 'punkt'
    # (and 'punkt_tab' on newer NLTK releases) is needed by sent_tokenize
    # and word_tokenize in classify_complexity.
    for resource in ('vader_lexicon', 'punkt', 'punkt_tab'):
        try:
            nltk.download(resource, quiet=True)
        except Exception:
            pass  # offline or already downloaded


# Ensure NLTK resources are available
download_nltk_resources()


def classify_formality(text):
    """
    Classify text formality based on simple heuristics

    Args:
        text (str): Text to analyze

    Returns:
        str: Formality level (Formal, Neutral, or Informal)
    """
    # Simple formality indicators
    formal_indicators = [
        r'\b(therefore|thus|consequently|furthermore|moreover|however)\b',
        r'\b(in accordance with|with respect to|regarding|concerning)\b',
        r'\b(shall|must|may|will be required to)\b',
        r'\b(it is|there are|there is)\b',
        # no trailing \b: a word boundary never follows the period in "Mr. Smith"
        r'\b(Mr|Ms|Dr|Prof)\.'
    ]

    informal_indicators = [
        r'\b(like|yeah|cool|awesome|gonna|wanna|gotta)\b',
        r'(\!{2,}|\?{2,})',
        r'\b(lol|haha|wow|omg|btw)\b',
        r'\b(don\'t|can\'t|won\'t|shouldn\'t)\b',
        r'(\.{3,})'
    ]

    # Count indicator matches in the text
    formal_score = sum(len(re.findall(pattern, text, re.IGNORECASE))
                       for pattern in formal_indicators)
    informal_score = sum(len(re.findall(pattern, text, re.IGNORECASE))
                         for pattern in informal_indicators)

    # Normalize to matches per 100 words so texts of different lengths compare fairly
    words = len(text.split())
    if words > 0:
        formal_score = formal_score / (words / 100)
        informal_score = informal_score / (words / 100)

    # Determine formality: require one score to clearly dominate the other
    if formal_score > informal_score * 1.5:
        return "Formal"
    elif informal_score > formal_score * 1.5:
        return "Informal"
    else:
        return "Neutral"


def classify_sentiment(text):
    """
    Classify text sentiment using NLTK's VADER

    Args:
        text (str): Text to analyze

    Returns:
        str: Sentiment (Positive, Neutral, or Negative)
    """
    try:
        sia = SentimentIntensityAnalyzer()
        sentiment = sia.polarity_scores(text)

        # Standard VADER thresholds on the compound score
        if sentiment['compound'] >= 0.05:
            return "Positive"
        elif sentiment['compound'] <= -0.05:
            return "Negative"
        else:
            return "Neutral"
    except Exception:
        # Fall back to Neutral if the VADER lexicon is unavailable
        return "Neutral"


def classify_complexity(text):
    """
    Classify text complexity based on sentence length and word length

    Args:
        text (str): Text to analyze

    Returns:
        str: Complexity level (Simple, Average, or Complex)
    """
    # Split into sentences
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return "Average"

    # Calculate average sentence length (in words)
    sentence_lengths = [len(s.split()) for s in sentences]
    avg_sentence_length = statistics.mean(sentence_lengths) if sentence_lengths else 0

    # Calculate average word length, considering only alphanumeric tokens
    words = [word for sentence in sentences
             for word in nltk.word_tokenize(sentence) if word.isalnum()]
    avg_word_length = statistics.mean([len(word) for word in words]) if words else 0

    # Long sentences or long words push the text toward "Complex"
    if avg_sentence_length > 20 or avg_word_length > 6:
        return "Complex"
    elif avg_sentence_length < 12 or avg_word_length < 4:
        return "Simple"
    else:
        return "Average"


def compare_classifications(text1, text2):
    """
    Compare classifications between two texts

    Args:
        text1 (str): First text
        text2 (str): Second text

    Returns:
        dict: Comparison results
    """
    formality1 = classify_formality(text1)
    formality2 = classify_formality(text2)
    sentiment1 = classify_sentiment(text1)
    sentiment2 = classify_sentiment(text2)
    complexity1 = classify_complexity(text1)
    complexity2 = classify_complexity(text2)

    results = {}
    if formality1 != formality2:
        results["Formality"] = (f"Model 1 is {formality1.lower()}, "
                                f"while Model 2 is {formality2.lower()}")
    if sentiment1 != sentiment2:
        results["Sentiment"] = (f"Model 1 has a {sentiment1.lower()} tone, "
                                f"while Model 2 has a {sentiment2.lower()} tone")
    if complexity1 != complexity2:
        results["Complexity"] = (f"Model 1 uses {complexity1.lower()} language, "
                                 f"while Model 2 uses {complexity2.lower()} language")
    if not results:
        results["Summary"] = "Both responses have similar writing characteristics"
    return results
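
# Illustrative usage of the heuristic comparison above. The sample strings
# are hypothetical, not taken from any dataset:
#
#   formal_reply = "Therefore, the applicant shall submit the form accordingly."
#   casual_reply = "yeah just send that form whenever lol... no biggie!!"
#   compare_classifications(formal_reply, casual_reply)
#   # e.g. {'Formality': 'Model 1 is formal, while Model 2 is informal', ...}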

def classify_with_roberta(text, task="sentiment", model_name=None):
    """
    Classify text using a RoBERTa model from the dataset directory

    Args:
        text (str): Text to analyze
        task (str): Classification task ('sentiment', 'toxicity', 'topic', 'person')
        model_name (str, optional): Specific model to use; if None, a
            task-appropriate model is chosen

    Returns:
        dict: Classification results with labels and scores
    """
    try:
        import torch  # noqa: F401 -- verifies the backend is installed
        from transformers import pipeline

        # Map tasks to appropriate pre-trained models
        task_model_map = {
            "sentiment": "cardiffnlp/twitter-roberta-base-sentiment",
            "toxicity": "cardiffnlp/twitter-roberta-base-hate",
            "topic": "facebook/bart-large-mnli",  # zero-shot classification for topics
            "person": "roberta-base"  # default for person detection - could be fine-tuned
        }

        # Use the mapped model if none is specified
        if model_name is None and task in task_model_map:
            model_to_use = task_model_map[task]
        elif model_name is not None:
            model_to_use = model_name
        else:
            model_to_use = "roberta-base"

        # Special handling for zero-shot topic classification
        if task == "topic":
            classifier = pipeline("zero-shot-classification", model=model_to_use)
            topics = ["economy", "foreign policy", "healthcare", "environment", "immigration"]
            results = classifier(text, topics, multi_label=False)
            return {
                "labels": results["labels"],
                "scores": results["scores"]
            }
        else:
            # Initialize the classification pipeline
            # (return_all_scores is deprecated in newer transformers releases;
            # top_k=None is the modern equivalent)
            classifier = pipeline("text-classification", model=model_to_use,
                                  return_all_scores=True)

            # Get classification results
            results = classifier(text)

            # Format results for consistent output
            if isinstance(results, list) and len(results) == 1:
                results = results[0]

            return {
                "task": task,
                "model": model_to_use,
                "results": results
            }
    except ImportError:
        return {"error": "Required packages not installed. Please install transformers and torch."}
    except Exception as e:
        return {"error": f"Classification failed: {str(e)}"}


def analyze_dataset_with_roberta(dataset_texts, task="topic"):
    """
    Analyze a collection of dataset texts using RoBERTa models

    Args:
        dataset_texts (dict): Dictionary mapping text identifiers to text content
        task (str): Classification task to perform

    Returns:
        dict: Classification results keyed by text identifier
    """
    results = {}
    for text_id, text_content in dataset_texts.items():
        results[text_id] = classify_with_roberta(text_content, task=task)
    return results
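

# Minimal smoke test when running this module directly. The sample texts and
# the 'sentiment' task choice are illustrative assumptions, not fixtures from
# the dataset this module analyzes.
if __name__ == "__main__":
    sample_1 = "Furthermore, the committee shall review the proposal in accordance with policy."
    sample_2 = "omg this proposal is awesome!! gonna love it lol"

    print("Heuristic comparison:")
    for aspect, summary in compare_classifications(sample_1, sample_2).items():
        print(f"  {aspect}: {summary}")

    # The transformer path downloads model weights on first use; if torch or
    # transformers is missing it returns an {'error': ...} dict rather than raising.
    print("RoBERTa sentiment:", classify_with_roberta(sample_2, task="sentiment"))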