File size: 7,743 Bytes
7138f76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7731b47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7138f76
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import statistics
import re

def download_nltk_resources():
    """Download required NLTK resources (best-effort).

    Failures (e.g. no network access) are deliberately ignored so that
    importing this module never crashes; `classify_sentiment` degrades
    gracefully when the lexicon is missing.
    """
    try:
        nltk.download('vader_lexicon', quiet=True)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; download problems remain non-fatal by design.
        pass

# Ensure NLTK resources are available at import time (best-effort; see above).
download_nltk_resources()

def classify_formality(text):
    """
    Heuristically rate how formal a piece of text is.

    Args:
        text (str): Text to analyze

    Returns:
        str: Formality level (Formal, Neutral, or Informal)
    """
    # Regex cues that tend to appear in formal prose.
    formal_patterns = (
        r'\b(therefore|thus|consequently|furthermore|moreover|however)\b',
        r'\b(in accordance with|with respect to|regarding|concerning)\b',
        r'\b(shall|must|may|will be required to)\b',
        r'\b(it is|there are|there is)\b',
        r'\b(Mr\.|Ms\.|Dr\.|Prof\.)\b',
    )

    # Regex cues typical of casual writing.
    informal_patterns = (
        r'\b(like|yeah|cool|awesome|gonna|wanna|gotta)\b',
        r'(\!{2,}|\?{2,})',
        r'\b(lol|haha|wow|omg|btw)\b',
        r'\b(don\'t|can\'t|won\'t|shouldn\'t)\b',
        r'(\.{3,})',
    )

    word_count = len(text.split())

    def hits_per_hundred_words(patterns):
        # Total case-insensitive matches, normalized to a per-100-words rate
        # so long and short texts compare fairly.
        total = sum(len(re.findall(p, text, re.IGNORECASE)) for p in patterns)
        if word_count > 0:
            total = total / (word_count / 100)
        return total

    formal_rate = hits_per_hundred_words(formal_patterns)
    informal_rate = hits_per_hundred_words(informal_patterns)

    # One side must beat the other by a 1.5x margin to win outright.
    if formal_rate > informal_rate * 1.5:
        return "Formal"
    if informal_rate > formal_rate * 1.5:
        return "Informal"
    return "Neutral"

def classify_sentiment(text):
    """
    Classify text sentiment using NLTK's VADER analyzer.

    Args:
        text (str): Text to analyze

    Returns:
        str: Sentiment (Positive, Neutral, or Negative). "Neutral" is also
        the fallback when VADER is unusable (e.g. the vader_lexicon was
        never downloaded).
    """
    try:
        sia = SentimentIntensityAnalyzer()
        # VADER's compound score lies in [-1, 1]; +/-0.05 are the
        # thresholds recommended by the VADER authors.
        compound = sia.polarity_scores(text)['compound']
        if compound >= 0.05:
            return "Positive"
        elif compound <= -0.05:
            return "Negative"
        else:
            return "Neutral"
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # propagate; analyzer failures degrade to a neutral verdict.
        return "Neutral"

def classify_complexity(text):
    """
    Rate text complexity from average sentence and word lengths.

    Args:
        text (str): Text to analyze

    Returns:
        str: Complexity level (Simple, Average, or Complex)
    """
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return "Average"

    # Mean number of whitespace-separated words per sentence.
    lengths = [len(sentence.split()) for sentence in sentences]
    mean_sentence_len = statistics.mean(lengths) if lengths else 0

    # Mean character length over alphanumeric tokens only, so punctuation
    # tokens don't drag the average down.
    tokens = [
        token
        for sentence in sentences
        for token in nltk.word_tokenize(sentence)
        if token.isalnum()
    ]
    mean_word_len = statistics.mean(len(t) for t in tokens) if tokens else 0

    # Long sentences or long words push toward Complex; short ones toward
    # Simple; everything in between is Average.
    if mean_sentence_len > 20 or mean_word_len > 6:
        return "Complex"
    if mean_sentence_len < 12 or mean_word_len < 4:
        return "Simple"
    return "Average"

def compare_classifications(text1, text2):
    """
    Summarize writing-style differences between two texts.

    Args:
        text1 (str): First text
        text2 (str): Second text

    Returns:
        dict: Human-readable difference descriptions keyed by dimension;
        a single "Summary" entry when the texts classify identically.
    """
    differences = {}

    f1, f2 = classify_formality(text1), classify_formality(text2)
    if f1 != f2:
        differences["Formality"] = (
            f"Model 1 is {f1.lower()}, while Model 2 is {f2.lower()}"
        )

    s1, s2 = classify_sentiment(text1), classify_sentiment(text2)
    if s1 != s2:
        differences["Sentiment"] = (
            f"Model 1 has a {s1.lower()} tone, while Model 2 has a {s2.lower()} tone"
        )

    c1, c2 = classify_complexity(text1), classify_complexity(text2)
    if c1 != c2:
        differences["Complexity"] = (
            f"Model 1 uses {c1.lower()} language, while Model 2 uses {c2.lower()} language"
        )

    # No dimension differed — report overall similarity instead.
    if not differences:
        differences["Summary"] = "Both responses have similar writing characteristics"

    return differences

def classify_with_roberta(text, task="sentiment", model_name=None):
    """
    Classify text with a RoBERTa-family model via Hugging Face pipelines.

    Args:
        text (str): Text to analyze
        task (str): Classification task ('sentiment', 'toxicity', 'topic', 'person')
        model_name (str, optional): Explicit model id; when None a
            task-appropriate default is chosen.

    Returns:
        dict: Classification results with labels and scores, or an
        {"error": ...} dict when dependencies are missing or inference fails.
    """
    try:
        # Imported lazily so the rest of the module works without these
        # heavy optional dependencies installed.
        import torch
        from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

        # Default pre-trained model for each supported task.
        default_models = {
            "sentiment": "cardiffnlp/twitter-roberta-base-sentiment",
            "toxicity": "cardiffnlp/twitter-roberta-base-hate",
            "topic": "facebook/bart-large-mnli",  # Zero-shot classification for topics
            "person": "roberta-base",  # Default for person detection - could be fine-tuned
        }

        # Explicit model wins; otherwise fall back to the task default,
        # then to plain roberta-base.
        if model_name is not None:
            chosen_model = model_name
        else:
            chosen_model = default_models.get(task, "roberta-base")

        if task == "topic":
            # Topics use zero-shot classification over a fixed label set.
            zero_shot = pipeline("zero-shot-classification", model=chosen_model)
            candidate_labels = ["economy", "foreign policy", "healthcare", "environment", "immigration"]
            outcome = zero_shot(text, candidate_labels, multi_label=False)
            return {
                "labels": outcome["labels"],
                "scores": outcome["scores"],
            }

        classifier = pipeline("text-classification", model=chosen_model, return_all_scores=True)
        outcome = classifier(text)

        # Unwrap the single-text result for a consistent output shape.
        if isinstance(outcome, list) and len(outcome) == 1:
            outcome = outcome[0]

        return {
            "task": task,
            "model": chosen_model,
            "results": outcome,
        }

    except ImportError:
        return {"error": "Required packages not installed. Please install transformers and torch."}
    except Exception as e:
        return {"error": f"Classification failed: {str(e)}"}

def analyze_dataset_with_roberta(dataset_texts, task="topic"):
    """
    Run a RoBERTa classification task over a batch of texts.

    Args:
        dataset_texts (dict): Maps text identifiers to text content
        task (str): Classification task to perform

    Returns:
        dict: Classification results keyed by text identifier
    """
    # One classification call per text, preserving the caller's keys.
    return {
        text_id: classify_with_roberta(content, task=task)
        for text_id, content in dataset_texts.items()
    }