# 525GradioApp/processors/text_classifiers.py
import re
import statistics

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

def download_nltk_resources():
    """Download required NLTK resources if not already present."""
    try:
        # vader_lexicon backs SentimentIntensityAnalyzer; punkt backs
        # nltk.sent_tokenize / nltk.word_tokenize used further down.
        nltk.download('vader_lexicon', quiet=True)
        nltk.download('punkt', quiet=True)
    except Exception:
        # Offline or permission issues: let the classifiers degrade at call time
        pass


# Ensure NLTK resources are available at import time
download_nltk_resources()


def classify_formality(text):
    """
    Classify text formality based on simple heuristics.

    Args:
        text (str): Text to analyze

    Returns:
        str: Formality level (Formal, Neutral, or Informal)
    """
    # Simple formality indicators
    formal_indicators = [
        r'\b(therefore|thus|consequently|furthermore|moreover|however)\b',
        r'\b(in accordance with|with respect to|regarding|concerning)\b',
        r'\b(shall|must|may|will be required to)\b',
        r'\b(it is|there are|there is)\b',
        r'\b(?:Mr|Ms|Dr|Prof)\.'  # no trailing \b: "." followed by a space is not a word boundary
    ]
    informal_indicators = [
        r'\b(like|yeah|cool|awesome|gonna|wanna|gotta)\b',
        r'(!{2,}|\?{2,})',
        r'\b(lol|haha|wow|omg|btw)\b',
        r"\b(don't|can't|won't|shouldn't)\b",
        r'(\.{3,})'
    ]

    # Count indicator hits across the text
    formal_score = sum(len(re.findall(pattern, text, re.IGNORECASE)) for pattern in formal_indicators)
    informal_score = sum(len(re.findall(pattern, text, re.IGNORECASE)) for pattern in informal_indicators)

    # Normalize to hits per 100 words so longer texts aren't over-counted
    words = len(text.split())
    if words > 0:
        formal_score = formal_score / (words / 100)
        informal_score = informal_score / (words / 100)

    # Require a clear (1.5x) margin before committing either way
    if formal_score > informal_score * 1.5:
        return "Formal"
    elif informal_score > formal_score * 1.5:
        return "Informal"
    else:
        return "Neutral"


def classify_sentiment(text):
    """
    Classify text sentiment using NLTK's VADER.

    Args:
        text (str): Text to analyze

    Returns:
        str: Sentiment (Positive, Neutral, or Negative)
    """
    try:
        sia = SentimentIntensityAnalyzer()
        sentiment = sia.polarity_scores(text)

        # Standard VADER thresholds on the compound score
        if sentiment['compound'] >= 0.05:
            return "Positive"
        elif sentiment['compound'] <= -0.05:
            return "Negative"
        else:
            return "Neutral"
    except Exception:
        # Most likely the vader_lexicon download failed; fall back to Neutral
        return "Neutral"


def classify_complexity(text):
    """
    Classify text complexity based on sentence length and word length.

    Args:
        text (str): Text to analyze

    Returns:
        str: Complexity level (Simple, Average, or Complex)
    """
    # Split into sentences (requires the NLTK 'punkt' tokenizer)
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return "Average"

    # Average sentence length in words
    sentence_lengths = [len(s.split()) for s in sentences]
    avg_sentence_length = statistics.mean(sentence_lengths) if sentence_lengths else 0

    # Average word length over alphanumeric tokens only
    words = [word for sentence in sentences for word in nltk.word_tokenize(sentence)
             if word.isalnum()]
    avg_word_length = statistics.mean([len(word) for word in words]) if words else 0

    # Long sentences or long words push toward Complex; short ones toward Simple
    if avg_sentence_length > 20 or avg_word_length > 6:
        return "Complex"
    elif avg_sentence_length < 12 or avg_word_length < 4:
        return "Simple"
    else:
        return "Average"


def compare_classifications(text1, text2):
    """
    Compare classifications between two texts.

    Args:
        text1 (str): First text
        text2 (str): Second text

    Returns:
        dict: Comparison results, keyed by the dimension that differs
    """
    formality1 = classify_formality(text1)
    formality2 = classify_formality(text2)
    sentiment1 = classify_sentiment(text1)
    sentiment2 = classify_sentiment(text2)
    complexity1 = classify_complexity(text1)
    complexity2 = classify_complexity(text2)

    results = {}
    if formality1 != formality2:
        results["Formality"] = f"Model 1 is {formality1.lower()}, while Model 2 is {formality2.lower()}"
    if sentiment1 != sentiment2:
        results["Sentiment"] = f"Model 1 has a {sentiment1.lower()} tone, while Model 2 has a {sentiment2.lower()} tone"
    if complexity1 != complexity2:
        results["Complexity"] = f"Model 1 uses {complexity1.lower()} language, while Model 2 uses {complexity2.lower()} language"
    if not results:
        results["Summary"] = "Both responses have similar writing characteristics"
    return results
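
# Illustrative usage (a sketch; only dimensions where the two texts disagree
# appear in the returned dict):
#   compare_classifications("Furthermore, the committee shall convene.",
#                           "yeah cool, gonna grab lunch lol")
#   # -> e.g. {"Formality": "Model 1 is formal, while Model 2 is informal", ...}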


def classify_with_roberta(text, task="sentiment", model_name=None):
    """
    Classify text using a RoBERTa-based model from the Hugging Face Hub.

    Args:
        text (str): Text to analyze
        task (str): Classification task ('sentiment', 'toxicity', 'topic', 'person')
        model_name (str, optional): Specific model to use; if None, a
            task-appropriate default is chosen

    Returns:
        dict: Classification results with labels and scores
    """
    try:
        import torch  # noqa: F401 -- imported only to fail fast if unavailable
        from transformers import pipeline

        # Map tasks to appropriate pre-trained models
        task_model_map = {
            "sentiment": "cardiffnlp/twitter-roberta-base-sentiment",
            "toxicity": "cardiffnlp/twitter-roberta-base-hate",
            "topic": "facebook/bart-large-mnli",  # zero-shot classification for topics
            "person": "roberta-base"  # default for person detection -- could be fine-tuned
        }

        # Use the mapped model unless the caller specified one
        if model_name is not None:
            model_to_use = model_name
        elif task in task_model_map:
            model_to_use = task_model_map[task]
        else:
            model_to_use = "roberta-base"

        # Special handling for zero-shot topic classification
        if task == "topic":
            classifier = pipeline("zero-shot-classification", model=model_to_use)
            topics = ["economy", "foreign policy", "healthcare", "environment", "immigration"]
            results = classifier(text, topics, multi_label=False)
            return {
                "labels": results["labels"],
                "scores": results["scores"]
            }
        else:
            # Standard text-classification pipeline, returning a score per label
            classifier = pipeline("text-classification", model=model_to_use, return_all_scores=True)
            results = classifier(text)

            # Unwrap the single-item list the pipeline returns for one input
            if isinstance(results, list) and len(results) == 1:
                results = results[0]

            return {
                "task": task,
                "model": model_to_use,
                "results": results
            }
    except ImportError:
        return {"error": "Required packages not installed. Please install transformers and torch."}
    except Exception as e:
        return {"error": f"Classification failed: {str(e)}"}


def analyze_dataset_with_roberta(dataset_texts, task="topic"):
    """
    Analyze a collection of dataset texts using RoBERTa models.

    Args:
        dataset_texts (dict): Dictionary mapping text identifiers to text content
        task (str): Classification task to perform

    Returns:
        dict: Classification results keyed by text identifier
    """
    results = {}
    for text_id, text_content in dataset_texts.items():
        results[text_id] = classify_with_roberta(text_content, task=task)
    return results
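

if __name__ == "__main__":
    # Minimal smoke test for the heuristic classifiers (a sketch; the sample
    # strings are illustrative and not part of the original module).
    sample = "Furthermore, the committee shall convene at noon."
    print("Formality: ", classify_formality(sample))
    print("Sentiment: ", classify_sentiment(sample))
    print("Complexity:", classify_complexity(sample))
    print(compare_classifications(sample, "yeah cool, see ya there lol"))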