# processors/text_classifiers.py
# Heuristic text classifiers (formality, sentiment, complexity) for the
# 525GradioApp Gradio UI.
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import statistics
import re
def download_nltk_resources():
    """Download the NLTK data this module depends on (best-effort).

    Fetches 'vader_lexicon' (used by SentimentIntensityAnalyzer in
    classify_sentiment) and the 'punkt' tokenizer models (used by
    sent_tokenize / word_tokenize in classify_complexity). Downloads are
    quiet and failures are ignored so that importing this module offline
    does not crash; the classifiers themselves degrade gracefully.
    """
    # 'punkt_tab' replaces 'punkt' on NLTK >= 3.8.2; request both so the
    # code works on either NLTK version (an unknown resource just fails
    # quietly here).
    for resource in ('vader_lexicon', 'punkt', 'punkt_tab'):
        try:
            nltk.download(resource, quiet=True)
        except Exception:
            # Was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt; narrow to Exception, still best-effort.
            pass
# Fetch required NLTK data once at import time (quiet, best-effort; never raises)
download_nltk_resources()
def classify_formality(text):
    """
    Classify text formality based on simple heuristics.

    Counts regex hits for formal vs. informal markers, normalizes each
    count to hits-per-100-words, and requires a 1.5x margin between the
    two scores before leaving "Neutral".

    Args:
        text (str): Text to analyze
    Returns:
        str: Formality level (Formal, Neutral, or Informal)
    """
    # Simple formality indicators
    formal_indicators = [
        r'\b(therefore|thus|consequently|furthermore|moreover|however)\b',
        r'\b(in accordance with|with respect to|regarding|concerning)\b',
        r'\b(shall|must|may|will be required to)\b',
        r'\b(it is|there are|there is)\b',
        # BUG FIX: the old pattern r'\b(Mr\.|Ms\.|Dr\.|Prof\.)\b' ended with
        # \b right after the period, which only matches when a word character
        # follows the "." — so "Mr. Smith" was never counted. No trailing \b.
        r'\b(Mr|Ms|Dr|Prof)\.'
    ]
    informal_indicators = [
        r'\b(like|yeah|cool|awesome|gonna|wanna|gotta)\b',
        r'(\!{2,}|\?{2,})',
        r'\b(lol|haha|wow|omg|btw)\b',
        r'\b(don\'t|can\'t|won\'t|shouldn\'t)\b',
        r'(\.{3,})'
    ]
    # Raw hit counts across all patterns (case-insensitive)
    formal_score = sum(len(re.findall(pattern, text, re.IGNORECASE))
                       for pattern in formal_indicators)
    informal_score = sum(len(re.findall(pattern, text, re.IGNORECASE))
                         for pattern in informal_indicators)
    # Normalize by text length so long texts aren't biased; skip for empty
    # input (both scores are 0 there, yielding "Neutral" below).
    words = len(text.split())
    if words > 0:
        formal_score = formal_score / (words / 100)  # hits per 100 words
        informal_score = informal_score / (words / 100)  # hits per 100 words
    # Determine formality: require a clear (1.5x) margin either way
    if formal_score > informal_score * 1.5:
        return "Formal"
    elif informal_score > formal_score * 1.5:
        return "Informal"
    else:
        return "Neutral"
def classify_sentiment(text):
    """
    Classify text sentiment using NLTK's VADER.

    Uses the standard VADER compound-score thresholds (+/-0.05) to bucket
    the text into three classes.

    Args:
        text (str): Text to analyze
    Returns:
        str: Sentiment (Positive, Neutral, or Negative)
    """
    try:
        sia = SentimentIntensityAnalyzer()
        compound = sia.polarity_scores(text)['compound']
    except Exception:
        # VADER lexicon missing or analyzer failure: fall back to Neutral
        # rather than crashing the UI. (Was a bare `except:`, which also
        # swallowed SystemExit/KeyboardInterrupt.)
        return "Neutral"
    if compound >= 0.05:
        return "Positive"
    if compound <= -0.05:
        return "Negative"
    return "Neutral"
def classify_complexity(text):
    """
    Classify text complexity from sentence length and word length.

    Averages words-per-sentence and characters-per-word (alphanumeric
    tokens only), then buckets the text into Simple / Average / Complex.

    Args:
        text (str): Text to analyze
    Returns:
        str: Complexity level (Simple, Average, or Complex)
    """
    sentences = nltk.sent_tokenize(text)
    # No sentences (e.g. empty input): nothing to measure, call it Average
    if not sentences:
        return "Average"
    # Mean words per sentence (whitespace-split)
    mean_sentence_len = statistics.mean(len(s.split()) for s in sentences)
    # Mean characters per word, counting only alphanumeric tokens so
    # punctuation tokens from word_tokenize don't skew the average
    word_lengths = [
        len(token)
        for sentence in sentences
        for token in nltk.word_tokenize(sentence)
        if token.isalnum()
    ]
    mean_word_len = statistics.mean(word_lengths) if word_lengths else 0
    # Bucket by thresholds: long sentences OR long words => Complex;
    # short sentences OR short words => Simple; otherwise Average
    if mean_sentence_len > 20 or mean_word_len > 6:
        return "Complex"
    if mean_sentence_len < 12 or mean_word_len < 4:
        return "Simple"
    return "Average"
def compare_classifications(text1, text2):
    """
    Compare writing-style classifications between two texts.

    Runs the formality, sentiment, and complexity classifiers on both
    texts and reports a human-readable difference for each dimension
    where the two texts disagree.

    Args:
        text1 (str): First text
        text2 (str): Second text
    Returns:
        dict: Comparison results keyed by dimension name; a single
              "Summary" entry when all dimensions match.
    """
    # (label, classifier, message template) for each dimension compared
    checks = (
        ("Formality", classify_formality,
         "Model 1 is {}, while Model 2 is {}"),
        ("Sentiment", classify_sentiment,
         "Model 1 has a {} tone, while Model 2 has a {} tone"),
        ("Complexity", classify_complexity,
         "Model 1 uses {} language, while Model 2 uses {} language"),
    )
    results = {}
    for label, classifier, template in checks:
        first = classifier(text1)
        second = classifier(text2)
        if first != second:
            results[label] = template.format(first.lower(), second.lower())
    if not results:
        results["Summary"] = "Both responses have similar writing characteristics"
    return results