import gradio as gr
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, AutoConfig
import numpy as np
from datetime import datetime
from TorchCRF import CRF

from bert_score import score as bert_score_fn
import re

def calc_bertscore(orig_text, rewritten_text):
    """Semantic similarity (BERTScore F1) between the original and the rewrite."""
    P, R, F1 = bert_score_fn([rewritten_text], [orig_text], lang="ko")
    return round(F1[0].item(), 3)
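

# NOTE (illustrative sketch, not wired into the app): bert_score_fn reloads its
# scoring model on every call, which is slow inside the Reflect loop. A cached
# variant built on bert_score.BERTScorer could look like the function below;
# the helper name calc_bertscore_cached is hypothetical.
_BERT_SCORER = None


def calc_bertscore_cached(orig_text, rewritten_text):
    global _BERT_SCORER
    from bert_score import BERTScorer  # local import; bert_score is already a dependency
    if _BERT_SCORER is None:
        _BERT_SCORER = BERTScorer(lang="ko")  # the scoring model is loaded once and reused
    _, _, F1 = _BERT_SCORER.score([rewritten_text], [orig_text])
    return round(F1[0].item(), 3)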


def calc_ppl(text):
    """Fluency proxy: a heuristic score that grows with candidate length and
    lexical diversity. This is not a language-model perplexity."""
    try:
        tokens = text.split()
        if len(tokens) < 2:
            return 1.0
        word_count = len(tokens)
        base_ppl = 50.0
        length_factor = min(word_count / 10.0, 2.0)
        complexity_factor = 1.0 + (len(set(tokens)) / word_count) * 0.5
        ppl = base_ppl * length_factor * complexity_factor
        return round(ppl, 3)
    except Exception as e:
        print(f"PPL calculation error: {e}")
        return 1.0
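

# NOTE (illustrative sketch, not wired into the app): calc_ppl above is a
# length/diversity heuristic, not a true perplexity. Real perplexity could be
# computed with an already-loaded causal LM roughly as below; the helper name
# calc_ppl_lm and its arguments are hypothetical.
def calc_ppl_lm(text, lm_model, lm_tokenizer):
    try:
        enc = lm_tokenizer(text, return_tensors="pt").to(lm_model.device)
        with torch.no_grad():
            # Passing input_ids as labels makes the model return the mean
            # token-level cross-entropy loss; exp(loss) is the perplexity.
            out = lm_model(**enc, labels=enc["input_ids"])
        return round(torch.exp(out.loss).item(), 3)
    except Exception as e:
        print(f"LM perplexity error: {e}")
        return float("inf")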


def calc_toxicity_reduction(orig_text, rewritten_text, detector_model, detector_tokenizer):
    """Drop in detector toxicity (1 - P(normal)) from the original to the rewrite."""
    try:
        device = next(detector_model.parameters()).device

        # Score the original text.
        orig_enc = detector_tokenizer(orig_text, return_tensors="pt", truncation=True,
                                      padding="max_length", max_length=128)
        orig_input_ids = orig_enc["input_ids"].to(device)
        orig_attention_mask = orig_enc["attention_mask"].to(device)
        with torch.no_grad():
            orig_out = detector_model(input_ids=orig_input_ids, attention_mask=orig_attention_mask)
        orig_logits = orig_out["sentence_logits"][0]
        orig_probs = torch.softmax(orig_logits, dim=-1)
        orig_toxicity = 1.0 - orig_probs[0].item()

        # Score the rewritten text.
        rewritten_enc = detector_tokenizer(rewritten_text, return_tensors="pt", truncation=True,
                                           padding="max_length", max_length=128)
        rewritten_input_ids = rewritten_enc["input_ids"].to(device)
        rewritten_attention_mask = rewritten_enc["attention_mask"].to(device)
        with torch.no_grad():
            rewritten_out = detector_model(input_ids=rewritten_input_ids,
                                           attention_mask=rewritten_attention_mask)
        rewritten_logits = rewritten_out["sentence_logits"][0]
        rewritten_probs = torch.softmax(rewritten_logits, dim=-1)
        rewritten_toxicity = 1.0 - rewritten_probs[0].item()

        # Positive delta means the rewrite is less toxic than the original.
        delta = orig_toxicity - rewritten_toxicity
        return round(delta, 3)
    except Exception as e:
        print(f"Toxicity reduction calculation error: {e}")
        return 0.0
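

# NOTE (illustrative sketch, not wired into the app): the two near-identical
# blocks inside calc_toxicity_reduction could be factored into a single-text
# scorer like the one below; the helper name _toxicity_score is hypothetical.
def _toxicity_score(text, detector_model, detector_tokenizer):
    device = next(detector_model.parameters()).device
    enc = detector_tokenizer(text, return_tensors="pt", truncation=True,
                             padding="max_length", max_length=128)
    with torch.no_grad():
        out = detector_model(input_ids=enc["input_ids"].to(device),
                             attention_mask=enc["attention_mask"].to(device))
    probs = torch.softmax(out["sentence_logits"][0], dim=-1)
    # Probability mass on everything except the "normal" class (index 0).
    return 1.0 - probs[0].item()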


class HateSpeechDetector(nn.Module):
    """KcELECTRA encoder with three heads: sentence classification,
    BIO span tagging (CRF), and multi-label target prediction."""

    def __init__(self, model_name="beomi/KcELECTRA-base", num_sentence_labels=4, num_bio_labels=5, num_targets=9):
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_name)
        self.encoder = AutoModel.from_pretrained(model_name, config=self.config)
        hidden_size = self.config.hidden_size
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(hidden_size, num_sentence_labels)  # sentence-level label head
        self.bio_linear = nn.Linear(hidden_size, num_bio_labels)       # token-level BIO emissions
        self.crf = CRF(num_bio_labels)
        self.target_head = nn.Linear(hidden_size, num_targets)         # multi-label target-group head

    def forward(self, input_ids, attention_mask, bio_tags=None, sentence_labels=None, targets=None):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        pooled_output = sequence_output[:, 0, :]  # [CLS] representation

        dropped = self.dropout(pooled_output)
        sentence_logits = self.classifier(dropped)

        bio_feats = self.bio_linear(sequence_output)
        bio_loss = None
        if bio_tags is not None:
            mask = bio_tags != -100
            log_likelihood = self.crf.forward(bio_feats, bio_tags, mask=mask)
            bio_loss = -log_likelihood

        tgt_dropped = self.dropout(pooled_output)
        target_logits = self.target_head(tgt_dropped)

        loss = 0.0
        if sentence_labels is not None:
            cls_loss = nn.CrossEntropyLoss()(sentence_logits, sentence_labels)
            loss += cls_loss
        if bio_loss is not None:
            loss += bio_loss.sum()
        if targets is not None:
            bce_loss = nn.BCEWithLogitsLoss()(target_logits, targets)
            loss += 2.0 * bce_loss

        # Decode BIO spans; fall back to the attention mask at inference time.
        if bio_tags is not None:
            decode_mask = bio_tags != -100
        else:
            decode_mask = attention_mask.bool()
        bio_preds = self.crf.viterbi_decode(bio_feats, mask=decode_mask)

        return {
            'loss': loss,
            'sentence_logits': sentence_logits,
            'bio_logits': bio_feats,
            'bio_preds': bio_preds,
            'target_logits': target_logits
        }
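

# NOTE (illustrative sketch, not executed on import): a minimal smoke test of
# the detector's forward pass and output shapes under the default label sizes,
# mirroring how the service calls the model. Running it downloads the
# KcELECTRA backbone; call it manually if needed. The function name
# _smoke_test_detector is hypothetical.
def _smoke_test_detector():
    tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")
    model = HateSpeechDetector()
    model.eval()
    enc = tokenizer("예시 문장입니다.", return_tensors="pt",  # "This is an example sentence."
                    truncation=True, padding="max_length", max_length=128)
    with torch.no_grad():
        out = model(input_ids=enc["input_ids"], attention_mask=enc["attention_mask"])
    # Expected: sentence_logits (1, 4), bio_logits (1, 128, 5), target_logits (1, 9)
    print({k: tuple(v.shape) for k, v in out.items() if torch.is_tensor(v)})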


class HateSpeechDetectorService:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")
        self.model = HateSpeechDetector()

        # Download the fine-tuned detector checkpoint from the Hub.
        from huggingface_hub import hf_hub_download
        MODEL_CKPT_PATH = hf_hub_download(repo_id="alohaboy/hate_detector_ko", filename="best_model.pt")
        checkpoint = torch.load(MODEL_CKPT_PATH, map_location=self.device)

        # Remap checkpoint parameter names (training-time layer/CRF names) to the
        # attribute names used by this module.
        key_map = {
            'sentence_classifier.weight': 'classifier.weight',
            'sentence_classifier.bias': 'classifier.bias',
            'bio_classifier.weight': 'bio_linear.weight',
            'bio_classifier.bias': 'bio_linear.bias',
            'crf.transitions': 'crf.trans_matrix',
            'crf.start_transitions': 'crf.start_trans',
            'crf.end_transitions': 'crf.end_trans',
        }
        new_state_dict = {}
        if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
            state_dict = checkpoint['model_state_dict']
        else:
            state_dict = checkpoint
        for k, v in state_dict.items():
            new_key = key_map.get(k, k)
            new_state_dict[new_key] = v
        self.model.load_state_dict(new_state_dict, strict=True)
        self.model.to(self.device)
        self.model.eval()

        # Load the Bllossom LLM used for mitigation.
        print("Bllossom LLM loading...")
        self.llm_model_name = "Bllossom/llama-3.2-Korean-Bllossom-3B"
        self.llm_tokenizer = AutoTokenizer.from_pretrained(self.llm_model_name)
        self.llm_model = AutoModelForCausalLM.from_pretrained(
            self.llm_model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto"
        )
        print("LLM loading complete!")

        self.label_names = ["normal", "offensive", "L1_hate", "L2_hate"]
        self.bio_names = {0: "O", 1: "B-SOFT", 2: "I-SOFT", 3: "B-HARD", 4: "I-HARD"}

        val_acc = checkpoint['val_acc'] if 'val_acc' in checkpoint else None
        if val_acc is not None:
            print(f"Model loaded - Validation accuracy: {val_acc:.2f}%")
        else:
            print("Model loaded - Validation accuracy: N/A")

    def detect_hate_speech(self, text, strategy="Detection Only"):
        """Hate speech detection and mitigation entry point."""
        if not text.strip():
            return "Please enter text", ""
        if len(text.strip()) < 2:
            return "Input text is too short. Please enter at least 2 characters.", ""

        result_msg, mitigation, debug_info = self._detection_only(text)
        label = debug_info.get('label', 'normal')

        # Normal text never needs mitigation, regardless of the chosen strategy.
        if label == "normal" and strategy != "Detection Only":
            result_msg += "\n\n✅ **Normal Text Detected**\n"
            result_msg += "This text is classified as normal and does not require mitigation.\n"
            result_msg += f"**Original text:** {text}\n"
            result_msg += "**Mitigation:** No changes needed - text is already appropriate."
            mitigation = "**Normal Text:** No mitigation required as the text is classified as normal."
            return result_msg, mitigation

        if strategy == "Detection Only":
            return result_msg, mitigation
        elif strategy == "Guided":
            return self._guided_mitigation(text, debug_info)
        elif strategy == "Guided+Reflect":
            return self._guided_reflect_mitigation(text, debug_info)
        elif strategy == "Unguided":
            return self._unguided_mitigation(text)
        else:
            return "Invalid strategy", ""

    def _detection_only(self, text):
        """Run the detector only and build the detection report."""
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=128,
            return_attention_mask=True,
            return_tensors="pt"
        )
        input_ids = encoding["input_ids"].to(self.device)
        attention_mask = encoding["attention_mask"].to(self.device)

        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            sentence_logits = outputs["sentence_logits"]
            bio_logits = outputs["bio_logits"]

        # Sentence-level prediction and confidence.
        sentence_probs = torch.softmax(sentence_logits, dim=1)
        sentence_pred = torch.argmax(sentence_logits, dim=1).item()
        sentence_prob = sentence_probs[0][sentence_pred].item()

        # Token-level BIO predictions (argmax over emission scores).
        bio_preds = torch.argmax(bio_logits, dim=2)[0]

        hate_tokens = []
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids[0])

        # Re-tokenize with offsets so flagged tokens can be mapped back to the
        # original surface text.
        tokenized = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=128,
            return_offsets_mapping=True
        )
        offset_mapping = tokenized["offset_mapping"]

        for j, (token, pred) in enumerate(zip(tokens, bio_preds)):
            if pred.item() != 0:
                if j < len(offset_mapping):
                    start, end = offset_mapping[j]
                    if start != end:
                        original_text = text[start:end]
                        hate_tokens.append((j, original_text, self.bio_names[pred.item()]))
                    else:
                        # Empty offset span (special token); fall back to the raw token.
                        decoded_token = token[1:] if token.startswith('Ġ') else token
                        hate_tokens.append((j, decoded_token, self.bio_names[pred.item()]))
                else:
                    # No offset available; fall back to the raw token.
                    decoded_token = token[1:] if token.startswith('Ġ') else token
                    hate_tokens.append((j, decoded_token, self.bio_names[pred.item()]))

        label = self.label_names[sentence_pred]

        # Escalate to L2_hate whenever an explicit (HARD) span was tagged.
        if any(bio_label in ["B-HARD", "I-HARD"] for _, _, bio_label in hate_tokens):
            label = "L2_hate"

        result_msg = "🔍 **Detection Result**\n\n"
        result_msg += f"**Classification:** {label}\n"
        result_msg += f"**Confidence:** {sentence_prob:.2f}\n"
        if hate_tokens:
            result_msg += f"**Identified Expressions:** {hate_tokens}"

        mitigation = "**Detection Only:** Performed only detection."
        debug_info = {
            'sentence_logits': sentence_logits,
            'sentence_probs': sentence_probs,
            'sentence_pred': sentence_pred,
            'label': label,
            'confidence': sentence_prob,
            'hate_tokens': hate_tokens
        }
        return result_msg, mitigation, debug_info

    def _unguided_mitigation(self, text):
        """Unguided mode: the LLM rewrites the text without detector guidance."""
        try:
            # Run detection first so normal text can be returned unchanged.
            detection_result, _, debug_info = self._detection_only(text)
            label = debug_info.get('label', 'normal')

            if label == "normal":
                result_msg = "🔍 **Detection Result**\n\n"
                result_msg += f"**Classification:** {label}\n"
                result_msg += f"**Confidence:** {debug_info.get('confidence', 0.0):.2f}\n"
                result_msg += "\n\n✅ **Normal Text Detected**\n"
                result_msg += "This text is classified as normal and does not require mitigation.\n"
                result_msg += f"**Original text:** {text}\n"
                result_msg += "**Mitigation:** No changes needed - text is already appropriate."
                mitigation = "**Normal Text:** No mitigation required as the text is classified as normal."
                return result_msg, mitigation

            prompt = f"""Please remove hate speech or aggressive expressions from the following sentence, while maintaining the original intent (criticism, complaint, opinion, etc.).

Original: {text}

Mitigated sentence:"""

            inputs = self.llm_tokenizer(prompt, return_tensors="pt").to(self.llm_model.device)
            with torch.no_grad():
                outputs = self.llm_model.generate(
                    **inputs,
                    do_sample=True,
                    top_k=50,
                    top_p=0.9,
                    max_new_tokens=300,
                    pad_token_id=self.llm_tokenizer.pad_token_id,
                    eos_token_id=self.llm_tokenizer.eos_token_id
                )

            full_response = self.llm_tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Strip the prompt and keep only the generated rewrite.
            mitigated_text = full_response.replace(prompt, "").strip()
            if len(mitigated_text) < 10:
                mitigated_text = full_response
            if "Mitigated sentence:" in mitigated_text:
                mitigated_text = mitigated_text.split("Mitigated sentence:")[-1].strip()

            # Keep the first clean line of the generation.
            lines = mitigated_text.split('\n')
            clean_lines = []
            for line in lines:
                line = line.strip()
                if line and not line.startswith('**') and not line.startswith('Original:'):
                    clean_lines.append(line)
            if clean_lines:
                mitigated_text = clean_lines[0]

            result_msg = "🤖 **Bllossom LLM Mitigation Result**\n\n"
            result_msg += f"**Original:** {text}\n\n"
            result_msg += f"**Mitigated Sentence:** {mitigated_text}"

            mitigation = "**Unguided Mode:** LLM detected and mitigated harmful expressions autonomously."
            return result_msg, mitigation

        except Exception as e:
            error_msg = f"❌ **Bllossom LLM Error**\n\nError occurred: {str(e)}"
            return error_msg, "An error occurred during LLM processing."

    def _guided_mitigation(self, text, debug_info=None):
        """Guided mode: the LLM rewrites the text using the detector's findings."""
        try:
            if debug_info is None:
                detection_result, _, debug_info = self._detection_only(text)
            else:
                # Rebuild the detection summary from the cached debug info.
                label = debug_info.get('label', 'normal')
                confidence = debug_info.get('confidence', 0.0)
                hate_tokens = debug_info.get('hate_tokens', [])
                detection_result = f"🔍 **Detection Result**\n\n**Classification:** {label}\n**Confidence:** {confidence:.2f}\n"
                if hate_tokens:
                    detection_result += f"**Identified Expressions:** {hate_tokens}"

            label = debug_info.get('label', 'normal')
            hate_tokens = debug_info.get('hate_tokens', [])

            if label == "normal":
                result_msg = "🔍 **Detection Result**\n\n"
                result_msg += f"**Classification:** {label}\n"
                result_msg += f"**Confidence:** {debug_info.get('confidence', 0.0):.2f}\n"
                result_msg += "\n\n✅ **Normal Text Detected**\n"
                result_msg += "This text is classified as normal and does not require mitigation.\n"
                result_msg += f"**Original text:** {text}\n"
                result_msg += "**Mitigation:** No changes needed - text is already appropriate."
                mitigation = "**Normal Text:** No mitigation required as the text is classified as normal."
                return result_msg, mitigation

            label_desc = {
                "offensive": "Aggressive",
                "L1_hate": "Mild Hate",
                "L2_hate": "Severe Hate"
            }
            hate_tokens_str = ""
            if hate_tokens:
                hate_tokens_str = "\nExpressions causing issues:\n" + "\n".join(
                    [f"• {token} ({bio_label})" for _, token, bio_label in hate_tokens[:5]])

            prompt = f"""The following sentence is classified as {label_desc.get(label, "harmful")} expression.
Please remove hate speech or aggressive expressions, while maintaining the original intent (criticism, complaint, opinion, etc.).

Original: {text}
Classification: {label_desc.get(label, "harmful")} expression
{hate_tokens_str}

[Important] All offensive, derogatory, and explicit hate expressions (e.g., 씨발, 좆, 병신) must be deleted.

Mitigated sentence:"""

            inputs = self.llm_tokenizer(prompt, return_tensors="pt").to(self.llm_model.device)
            with torch.no_grad():
                outputs = self.llm_model.generate(
                    **inputs,
                    do_sample=True,
                    top_k=50,
                    top_p=0.9,
                    max_new_tokens=300,
                    pad_token_id=self.llm_tokenizer.pad_token_id,
                    eos_token_id=self.llm_tokenizer.eos_token_id
                )
            full_response = self.llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
            mitigated_text = full_response.replace(prompt, "").strip()
            if len(mitigated_text) < 10:
                mitigated_text = full_response
            if "Mitigated sentence:" in mitigated_text:
                mitigated_text = mitigated_text.split("Mitigated sentence:")[-1].strip()
            lines = mitigated_text.split('\n')
            clean_lines = []
            for line in lines:
                line = line.strip()
                if line and not line.startswith('**') and not line.startswith('Original:') and not line.startswith('Classification:'):
                    clean_lines.append(line)
            if clean_lines:
                mitigated_text = clean_lines[0]

            result_msg = "🎯 **Guided Mitigation Result**\n\n"
            result_msg += f"**Detection Result:**\n{detection_result}\n\n"
            result_msg += f"**LLM Mitigation Result:**\n{mitigated_text}"
            mitigation = "**Guided Mode:** LLM performed specific mitigation based on detection information."
            return result_msg, mitigation
        except Exception as e:
            error_msg = f"❌ **Guided Mitigation Error**\n\nError occurred: {str(e)}"
            return error_msg, "An error occurred during guided mitigation processing."

    def _guided_reflect_mitigation(self, text, debug_info=None):
        """Guided+Reflect mode: iterative rewriting scored by toxicity drop,
        BERTScore, and the fluency proxy."""
        try:
            if debug_info is None:
                detection_result, _, debug_info = self._detection_only(text)
            else:
                # Rebuild the detection summary from the cached debug info.
                label = debug_info.get('label', 'normal')
                confidence = debug_info.get('confidence', 0.0)
                hate_tokens = debug_info.get('hate_tokens', [])
                detection_result = f"🔍 **Detection Result**\n\n**Classification:** {label}\n**Confidence:** {confidence:.2f}\n"
                if hate_tokens:
                    detection_result += f"**Identified Expressions:** {hate_tokens}"

            label = debug_info.get('label', 'normal')
            hate_tokens = debug_info.get('hate_tokens', [])

            if label == "normal":
                result_msg = "🔍 **Detection Result**\n\n"
                result_msg += f"**Classification:** {label}\n"
                result_msg += f"**Confidence:** {debug_info.get('confidence', 0.0):.2f}\n"
                result_msg += "\n\n✅ **Normal Text Detected**\n"
                result_msg += "This text is classified as normal and does not require mitigation.\n"
                result_msg += f"**Original text:** {text}\n"
                result_msg += "**Mitigation:** No changes needed - text is already appropriate."
                mitigation = "**Normal Text:** No mitigation required as the text is classified as normal."
                return result_msg, mitigation

            label_desc = {
                "offensive": "Aggressive",
                "L1_hate": "Mild Hate",
                "L2_hate": "Severe Hate"
            }
            hate_tokens_str = ""
            if hate_tokens:
                hate_tokens_str = "\nExpressions causing issues:\n" + "\n".join(
                    [f"• {token} ({bio_label})" for _, token, bio_label in hate_tokens[:5]])

            initial_prompt = f"""The following sentence is classified as {label_desc.get(label, "harmful")} expression.
Expressions containing offensive words (e.g., 좆, 씨발, 병신) must be deleted.
Other aggressive or inappropriate expressions should be mitigated by expressing them more politely and inclusively.

Original: {text}
Classification: {label_desc.get(label, "harmful")} expression
{hate_tokens_str}

Mitigated sentence:"""

            max_iter = 3
            metrics_history = []
            best_candidate = None
            best_score = -float('inf')
            for i in range(max_iter):
                # Sample a new candidate from the same guided prompt.
                inputs = self.llm_tokenizer(initial_prompt, return_tensors="pt").to(self.llm_model.device)
                with torch.no_grad():
                    outputs = self.llm_model.generate(
                        **inputs,
                        do_sample=True,
                        top_k=50,
                        top_p=0.9,
                        max_new_tokens=300,
                        pad_token_id=self.llm_tokenizer.pad_token_id,
                        eos_token_id=self.llm_tokenizer.eos_token_id
                    )
                candidate = self.llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
                mitigated_text = candidate.replace(initial_prompt, "").strip()
                if len(mitigated_text) < 10:
                    mitigated_text = candidate
                if "Mitigated sentence:" in mitigated_text:
                    mitigated_text = mitigated_text.split("Mitigated sentence:")[-1].strip()
                lines = mitigated_text.split('\n')
                clean_lines = []
                for line in lines:
                    line = line.strip()
                    if line and not line.startswith('**') and not line.startswith('Original:') and not line.startswith('Classification:'):
                        clean_lines.append(line)
                if clean_lines:
                    mitigated_text = clean_lines[0]

                # Hard filter: reject candidates that still contain profanity.
                if contains_badword(mitigated_text):
                    continue

                # Critic scores: toxicity drop, semantic similarity, fluency proxy.
                toxicity = calc_toxicity_reduction(text, mitigated_text, self.model, self.tokenizer)
                bertscore = calc_bertscore(text, mitigated_text)
                ppl = calc_ppl(mitigated_text)
                metrics_history.append({'iteration': i + 1, 'candidate': mitigated_text,
                                        'toxicity': toxicity, 'bertscore': bertscore, 'ppl': ppl})

                total_score = toxicity + bertscore - ppl * 0.01
                if total_score > best_score:
                    best_score = total_score
                    best_candidate = mitigated_text

                # Early stop once the candidate is good enough on all three metrics.
                if toxicity > 0.3 and bertscore > 0.7 and ppl < 100:
                    break

            iter_log_str = ""
            for log in metrics_history:
                iter_log_str += (f"\nIteration {log['iteration']}:\n- Candidate: {log['candidate']}\n"
                                 f"- Toxicity reduction: {log['toxicity']}, bertscore: {log['bertscore']}, ppl: {log['ppl']}")

            result_msg = "🔄 **Guided+Reflect Mitigation Result**\n\n"
            result_msg += f"**Detection Result:**\n{detection_result}\n\n"
            result_msg += f"**Iterative Mitigation Log:**{iter_log_str}\n\n"
            result_msg += f"**Best Mitigation:** {best_candidate}"
            mitigation = "**Guided+Reflect Mode:** Selected the optimal candidate after iterative mitigation and evaluation (maximum 3 iterations)."
            return result_msg, mitigation
        except Exception as e:
            error_msg = f"❌ **Guided+Reflect Mitigation Error**\n\nError occurred: {str(e)}"
            return error_msg, "An error occurred during guided+reflect mitigation processing."


def contains_badword(text):
    """Substring check against a small profanity list used by the Reflect filter."""
    # The original list was partially corrupted by a text-encoding error; the
    # entries below are the recoverable words plus consonant-abbreviation
    # variants reconstructed approximately (e.g., "ㅅㅂ", "ㅂㅅ", "ㅈㄹ").
    badwords = ["좆", "씨발", "병신", "개새끼", "ㅅㅂ", "ㅂㅅ", "ㅈㄹ"]
    return any(bad in text for bad in badwords)


# Instantiate the service once at startup (loads the detector and the LLM).
service = HateSpeechDetectorService()


def create_demo():
    with gr.Blocks(
        title="Korean Hate Speech Detection and Mitigation System",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            max-width: 800px;
            margin: 0 auto;
        }
        .result-box {
            border-radius: 10px;
            padding: 15px;
            margin: 10px 0;
        }
        .normal { background-color: #d4edda; border: 1px solid #c3e6cb; }
        .offensive { background-color: #fff3cd; border: 1px solid #ffeaa7; }
        .hate { background-color: #f8d7da; border: 1px solid #f5c6cb; }
        """
    ) as demo:
        gr.Markdown("""
# 🔍 Korean Hate Speech Detection and Mitigation System

This system detects hate speech in Korean text and provides mitigation suggestions.

**🟢 Normal**:
- It is a normal sentence.

**🟡 Offensive**:
- For example: "Don't say such a stupid thing", "How can you do such a stupid thing"

**🟠 L1_hate (Implicit Hate)**: Mild hate expression
- **Implicit hate expression** toward protected attribute groups
- For example: "Those people are all the same", prejudicial expressions toward a specific group

**🔴 L2_hate (Explicit Hate)**: Severe hate expression
- **Explicit hate expression** toward protected attribute groups

**🤖 Mitigation Mode:**
- 🔍 **Detection Only**: hate speech detection only
- 🎯 **Guided**: mitigation guided by the detection result
- 🔄 **Guided+Reflect**: guided mitigation followed by iterative refinement
- 🤖 **Unguided**: the LLM rewrites the text without any guidance
""")

        with gr.Row():
            with gr.Column(scale=2):
                input_text = gr.Textbox(
                    label="Enter text",
                    lines=3
                )

                strategy = gr.Radio(
                    ["Detection Only", "Guided", "Guided+Reflect", "Unguided"],
                    value="Detection Only",
                    label="Select Mitigation Mode",
                    container=True
                )

                analyze_btn = gr.Button("🔍 Detect & Mitigate", variant="primary", size="lg")

        with gr.Row():
            with gr.Column():
                result_output = gr.Markdown(
                    label="Detection Result",
                    value="Input text and click the button above."
                )

            with gr.Column():
                mitigation_output = gr.Markdown(
                    label="Mitigation Suggestion",
                    value="Based on the analysis result, mitigation suggestions will be provided."
                )

        analyze_btn.click(
            fn=service.detect_hate_speech,
            inputs=[input_text, strategy],
            outputs=[result_output, mitigation_output]
        )

        input_text.submit(
            fn=service.detect_hate_speech,
            inputs=[input_text, strategy],
            outputs=[result_output, mitigation_output]
        )

    return demo


if __name__ == "__main__":
    demo = create_demo()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        show_error=True
    )