# 525GradioApp/processors/text_classifiers.py
import re
import statistics

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

def download_nltk_resources():
    """Download required NLTK resources if not already present."""
    try:
        # vader_lexicon backs SentimentIntensityAnalyzer; punkt backs
        # nltk.sent_tokenize / nltk.word_tokenize used further down.
        nltk.download('vader_lexicon', quiet=True)
        nltk.download('punkt', quiet=True)
    except Exception:
        # Offline or permission issues: let the classifiers degrade at call time
        pass


# Ensure NLTK resources are available at import time
download_nltk_resources()


def classify_formality(text):
    """
    Classify text formality based on simple heuristics.

    Args:
        text (str): Text to analyze

    Returns:
        str: Formality level (Formal, Neutral, or Informal)
    """
    # Simple formality indicators
    formal_indicators = [
        r'\b(therefore|thus|consequently|furthermore|moreover|however)\b',
        r'\b(in accordance with|with respect to|regarding|concerning)\b',
        r'\b(shall|must|may|will be required to)\b',
        r'\b(it is|there are|there is)\b',
        r'\b(?:Mr|Ms|Dr|Prof)\.'  # no trailing \b: "." followed by a space is not a word boundary
    ]
    informal_indicators = [
        r'\b(like|yeah|cool|awesome|gonna|wanna|gotta)\b',
        r'(!{2,}|\?{2,})',
        r'\b(lol|haha|wow|omg|btw)\b',
        r"\b(don't|can't|won't|shouldn't)\b",
        r'(\.{3,})'
    ]

    # Count indicator hits across the text
    formal_score = sum(len(re.findall(pattern, text, re.IGNORECASE)) for pattern in formal_indicators)
    informal_score = sum(len(re.findall(pattern, text, re.IGNORECASE)) for pattern in informal_indicators)

    # Normalize to hits per 100 words so longer texts aren't over-counted
    words = len(text.split())
    if words > 0:
        formal_score = formal_score / (words / 100)
        informal_score = informal_score / (words / 100)

    # Require a clear (1.5x) margin before committing either way
    if formal_score > informal_score * 1.5:
        return "Formal"
    elif informal_score > formal_score * 1.5:
        return "Informal"
    else:
        return "Neutral"


def classify_sentiment(text):
    """
    Classify text sentiment using NLTK's VADER.

    Args:
        text (str): Text to analyze

    Returns:
        str: Sentiment (Positive, Neutral, or Negative)
    """
    try:
        sia = SentimentIntensityAnalyzer()
        sentiment = sia.polarity_scores(text)

        # Standard VADER thresholds on the compound score
        if sentiment['compound'] >= 0.05:
            return "Positive"
        elif sentiment['compound'] <= -0.05:
            return "Negative"
        else:
            return "Neutral"
    except Exception:
        # Most likely the vader_lexicon download failed; fall back to Neutral
        return "Neutral"


def classify_complexity(text):
    """
    Classify text complexity based on sentence length and word length.

    Args:
        text (str): Text to analyze

    Returns:
        str: Complexity level (Simple, Average, or Complex)
    """
    # Split into sentences (requires the NLTK 'punkt' tokenizer)
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return "Average"

    # Average sentence length in words
    sentence_lengths = [len(s.split()) for s in sentences]
    avg_sentence_length = statistics.mean(sentence_lengths) if sentence_lengths else 0

    # Average word length over alphanumeric tokens only
    words = [word for sentence in sentences for word in nltk.word_tokenize(sentence)
             if word.isalnum()]
    avg_word_length = statistics.mean([len(word) for word in words]) if words else 0

    # Long sentences or long words push toward Complex; short ones toward Simple
    if avg_sentence_length > 20 or avg_word_length > 6:
        return "Complex"
    elif avg_sentence_length < 12 or avg_word_length < 4:
        return "Simple"
    else:
        return "Average"


def compare_classifications(text1, text2):
    """
    Compare classifications between two texts.

    Args:
        text1 (str): First text
        text2 (str): Second text

    Returns:
        dict: Comparison results, keyed by the dimension that differs
    """
    formality1 = classify_formality(text1)
    formality2 = classify_formality(text2)
    sentiment1 = classify_sentiment(text1)
    sentiment2 = classify_sentiment(text2)
    complexity1 = classify_complexity(text1)
    complexity2 = classify_complexity(text2)

    results = {}
    if formality1 != formality2:
        results["Formality"] = f"Model 1 is {formality1.lower()}, while Model 2 is {formality2.lower()}"
    if sentiment1 != sentiment2:
        results["Sentiment"] = f"Model 1 has a {sentiment1.lower()} tone, while Model 2 has a {sentiment2.lower()} tone"
    if complexity1 != complexity2:
        results["Complexity"] = f"Model 1 uses {complexity1.lower()} language, while Model 2 uses {complexity2.lower()} language"
    if not results:
        results["Summary"] = "Both responses have similar writing characteristics"
    return results
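
# Illustrative usage (a sketch; only dimensions where the two texts disagree
# appear in the returned dict):
#   compare_classifications("Furthermore, the committee shall convene.",
#                           "yeah cool, gonna grab lunch lol")
#   # -> e.g. {"Formality": "Model 1 is formal, while Model 2 is informal", ...}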


def classify_with_roberta(text, task="sentiment", model_name=None):
    """
    Classify text using a RoBERTa-based model from the Hugging Face Hub.

    Args:
        text (str): Text to analyze
        task (str): Classification task ('sentiment', 'toxicity', 'topic', 'person')
        model_name (str, optional): Specific model to use; if None, a
            task-appropriate default is chosen

    Returns:
        dict: Classification results with labels and scores
    """
    try:
        import torch  # noqa: F401 -- imported only to fail fast if unavailable
        from transformers import pipeline

        # Map tasks to appropriate pre-trained models
        task_model_map = {
            "sentiment": "cardiffnlp/twitter-roberta-base-sentiment",
            "toxicity": "cardiffnlp/twitter-roberta-base-hate",
            "topic": "facebook/bart-large-mnli",  # zero-shot classification for topics
            "person": "roberta-base"  # default for person detection -- could be fine-tuned
        }

        # Use the mapped model unless the caller specified one
        if model_name is not None:
            model_to_use = model_name
        elif task in task_model_map:
            model_to_use = task_model_map[task]
        else:
            model_to_use = "roberta-base"

        # Special handling for zero-shot topic classification
        if task == "topic":
            classifier = pipeline("zero-shot-classification", model=model_to_use)
            topics = ["economy", "foreign policy", "healthcare", "environment", "immigration"]
            results = classifier(text, topics, multi_label=False)
            return {
                "labels": results["labels"],
                "scores": results["scores"]
            }
        else:
            # Standard text-classification pipeline, returning a score per label
            classifier = pipeline("text-classification", model=model_to_use, return_all_scores=True)
            results = classifier(text)

            # Unwrap the single-item list the pipeline returns for one input
            if isinstance(results, list) and len(results) == 1:
                results = results[0]

            return {
                "task": task,
                "model": model_to_use,
                "results": results
            }
    except ImportError:
        return {"error": "Required packages not installed. Please install transformers and torch."}
    except Exception as e:
        return {"error": f"Classification failed: {str(e)}"}


def analyze_dataset_with_roberta(dataset_texts, task="topic"):
    """
    Analyze a collection of dataset texts using RoBERTa models.

    Args:
        dataset_texts (dict): Dictionary mapping text identifiers to text content
        task (str): Classification task to perform

    Returns:
        dict: Classification results keyed by text identifier
    """
    results = {}
    for text_id, text_content in dataset_texts.items():
        results[text_id] = classify_with_roberta(text_content, task=task)
    return results
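

if __name__ == "__main__":
    # Minimal smoke test for the heuristic classifiers (a sketch; the sample
    # strings are illustrative and not part of the original module).
    sample = "Furthermore, the committee shall convene at noon."
    print("Formality: ", classify_formality(sample))
    print("Sentiment: ", classify_sentiment(sample))
    print("Complexity:", classify_complexity(sample))
    print(compare_classifications(sample, "yeah cool, see ya there lol"))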