# processors/text_classifiers.py
# Heuristic text classifiers (formality, sentiment, complexity) for the
# 525GradioApp Gradio UI.
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import statistics
import re
def download_nltk_resources():
    """Download the NLTK data this module depends on (best-effort).

    Fetches 'vader_lexicon' (used by SentimentIntensityAnalyzer in
    classify_sentiment) and the 'punkt' tokenizer models (used by
    sent_tokenize / word_tokenize in classify_complexity). Downloads are
    quiet and failures are ignored so that importing this module offline
    does not crash; the classifiers themselves degrade gracefully.
    """
    # 'punkt_tab' replaces 'punkt' on NLTK >= 3.8.2; request both so the
    # code works on either NLTK version (an unknown resource just fails
    # quietly here).
    for resource in ('vader_lexicon', 'punkt', 'punkt_tab'):
        try:
            nltk.download(resource, quiet=True)
        except Exception:
            # Was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt; narrow to Exception, still best-effort.
            pass
# Fetch required NLTK data once at import time (quiet, best-effort; never raises)
download_nltk_resources()
def classify_formality(text):
    """
    Classify text formality based on simple heuristics.

    Counts regex hits for formal vs. informal markers, normalizes each
    count to hits-per-100-words, and requires a 1.5x margin between the
    two scores before leaving "Neutral".

    Args:
        text (str): Text to analyze
    Returns:
        str: Formality level (Formal, Neutral, or Informal)
    """
    # Simple formality indicators
    formal_indicators = [
        r'\b(therefore|thus|consequently|furthermore|moreover|however)\b',
        r'\b(in accordance with|with respect to|regarding|concerning)\b',
        r'\b(shall|must|may|will be required to)\b',
        r'\b(it is|there are|there is)\b',
        # BUG FIX: the old pattern r'\b(Mr\.|Ms\.|Dr\.|Prof\.)\b' ended with
        # \b right after the period, which only matches when a word character
        # follows the "." — so "Mr. Smith" was never counted. No trailing \b.
        r'\b(Mr|Ms|Dr|Prof)\.'
    ]
    informal_indicators = [
        r'\b(like|yeah|cool|awesome|gonna|wanna|gotta)\b',
        r'(\!{2,}|\?{2,})',
        r'\b(lol|haha|wow|omg|btw)\b',
        r'\b(don\'t|can\'t|won\'t|shouldn\'t)\b',
        r'(\.{3,})'
    ]
    # Raw hit counts across all patterns (case-insensitive)
    formal_score = sum(len(re.findall(pattern, text, re.IGNORECASE))
                       for pattern in formal_indicators)
    informal_score = sum(len(re.findall(pattern, text, re.IGNORECASE))
                         for pattern in informal_indicators)
    # Normalize by text length so long texts aren't biased; skip for empty
    # input (both scores are 0 there, yielding "Neutral" below).
    words = len(text.split())
    if words > 0:
        formal_score = formal_score / (words / 100)  # hits per 100 words
        informal_score = informal_score / (words / 100)  # hits per 100 words
    # Determine formality: require a clear (1.5x) margin either way
    if formal_score > informal_score * 1.5:
        return "Formal"
    elif informal_score > formal_score * 1.5:
        return "Informal"
    else:
        return "Neutral"
def classify_sentiment(text):
    """
    Classify text sentiment using NLTK's VADER.

    Uses the standard VADER compound-score thresholds (+/-0.05) to bucket
    the text into three classes.

    Args:
        text (str): Text to analyze
    Returns:
        str: Sentiment (Positive, Neutral, or Negative)
    """
    try:
        sia = SentimentIntensityAnalyzer()
        compound = sia.polarity_scores(text)['compound']
    except Exception:
        # VADER lexicon missing or analyzer failure: fall back to Neutral
        # rather than crashing the UI. (Was a bare `except:`, which also
        # swallowed SystemExit/KeyboardInterrupt.)
        return "Neutral"
    if compound >= 0.05:
        return "Positive"
    if compound <= -0.05:
        return "Negative"
    return "Neutral"
def classify_complexity(text):
    """
    Classify text complexity from sentence length and word length.

    Averages words-per-sentence and characters-per-word (alphanumeric
    tokens only), then buckets the text into Simple / Average / Complex.

    Args:
        text (str): Text to analyze
    Returns:
        str: Complexity level (Simple, Average, or Complex)
    """
    sentences = nltk.sent_tokenize(text)
    # No sentences (e.g. empty input): nothing to measure, call it Average
    if not sentences:
        return "Average"
    # Mean words per sentence (whitespace-split)
    mean_sentence_len = statistics.mean(len(s.split()) for s in sentences)
    # Mean characters per word, counting only alphanumeric tokens so
    # punctuation tokens from word_tokenize don't skew the average
    word_lengths = [
        len(token)
        for sentence in sentences
        for token in nltk.word_tokenize(sentence)
        if token.isalnum()
    ]
    mean_word_len = statistics.mean(word_lengths) if word_lengths else 0
    # Bucket by thresholds: long sentences OR long words => Complex;
    # short sentences OR short words => Simple; otherwise Average
    if mean_sentence_len > 20 or mean_word_len > 6:
        return "Complex"
    if mean_sentence_len < 12 or mean_word_len < 4:
        return "Simple"
    return "Average"
def compare_classifications(text1, text2):
    """
    Compare writing-style classifications between two texts.

    Runs the formality, sentiment, and complexity classifiers on both
    texts and reports a human-readable difference for each dimension
    where the two texts disagree.

    Args:
        text1 (str): First text
        text2 (str): Second text
    Returns:
        dict: Comparison results keyed by dimension name; a single
              "Summary" entry when all dimensions match.
    """
    # (label, classifier, message template) for each dimension compared
    checks = (
        ("Formality", classify_formality,
         "Model 1 is {}, while Model 2 is {}"),
        ("Sentiment", classify_sentiment,
         "Model 1 has a {} tone, while Model 2 has a {} tone"),
        ("Complexity", classify_complexity,
         "Model 1 uses {} language, while Model 2 uses {} language"),
    )
    results = {}
    for label, classifier, template in checks:
        first = classifier(text1)
        second = classifier(text2)
        if first != second:
            results[label] = template.format(first.lower(), second.lower())
    if not results:
        results["Summary"] = "Both responses have similar writing characteristics"
    return results