Spaces:

alohaboy
/

hate-speech-mitigation-demo

Running

App Files Files Community

alohaboy committed on Jul 20

Commit

f09939b

0 Parent(s):

최소 파일만 포함한 완전 클린 Space push

Browse files

Files changed (3) hide show

README.md +14 -0
app.py +648 -0
requirements.txt +9 -0

README.md ADDED Viewed

	@@ -0,0 +1,14 @@

+---
+title: Korean Hate Speech Mitigation Demo
+emoji: "🛡️"
+colorFrom: indigo
+colorTo: blue
+sdk: gradio
+sdk_version: "4.44.0"
+app_file: app.py
+pinned: false
+---
+# Korean Hate Speech Mitigation Demo
+이 Space는 한국어 혐오 표현 탐지 및 순화 데모입니다.

app.py ADDED Viewed

	@@ -0,0 +1,648 @@

+import gradio as gr
+import torch
+import torch.nn as nn
+from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, AutoConfig
+import numpy as np
+from datetime import datetime
+from TorchCRF import CRF
+from bert_score import score as bert_score_fn
+import re
+from huggingface_hub import hf_hub_download
+def calc_bertscore(orig_text, rewritten_text):
+    P, R, F1 = bert_score_fn([rewritten_text], [orig_text], lang="ko")
+    return round(F1[0].item(), 3)
+def calc_ppl(text):
+    try:
+        tokens = text.split()
+        if len(tokens) < 2:
+            return 1.0
+        word_count = len(tokens)
+        base_ppl = 50.0
+        length_factor = min(word_count / 10.0, 2.0)
+        complexity_factor = 1.0 + (len(set(tokens)) / word_count) * 0.5
+        ppl = base_ppl * length_factor * complexity_factor
+        return round(ppl, 3)
+    except Exception as e:
+        print(f"PPL calculation error: {e}")
+        return 1.0
+def calc_toxicity_reduction(orig_text, rewritten_text, detector_model, detector_tokenizer):
+    try:
+        # Original toxicity score
+        orig_enc = detector_tokenizer(orig_text, return_tensors="pt", padding="max_length", max_length=128)
+        device = next(detector_model.parameters()).device
+        orig_input_ids = orig_enc["input_ids"].to(device)
+        orig_attention_mask = orig_enc["attention_mask"].to(device)
+        with torch.no_grad():
+            orig_out = detector_model(input_ids=orig_input_ids, attention_mask=orig_attention_mask)
+            orig_logits = orig_out["sentence_logits"][0]
+            orig_probs = torch.softmax(orig_logits, dim=-1)
+            orig_toxicity = 1.0 - orig_probs[0].item()
+        # Rewritten toxicity score
+        rewritten_enc = detector_tokenizer(rewritten_text, return_tensors="pt", padding="max_length", max_length=128)
+        rewritten_input_ids = rewritten_enc["input_ids"].to(device)
+        rewritten_attention_mask = rewritten_enc["attention_mask"].to(device)
+        with torch.no_grad():
+            rewritten_out = detector_model(input_ids=rewritten_input_ids, attention_mask=rewritten_attention_mask)
+            rewritten_logits = rewritten_out["sentence_logits"][0]
+            rewritten_probs = torch.softmax(rewritten_logits, dim=-1)
+            rewritten_toxicity = 1.0 - rewritten_probs[0].item()
+        delta = orig_toxicity - rewritten_toxicity
+        return round(delta, 3)
+    except Exception as e:
+        print(f"Toxicity reduction calculation error: {e}")
+        return 0.0
+class HateSpeechDetector(nn.Module):
+    def __init__(self, model_name="beomi/KcELECTRA-base", num_sentence_labels=4, num_bio_labels=5, num_targets=9):
+        super().__init__()
+        self.config = AutoConfig.from_pretrained(model_name)
+        self.encoder = AutoModel.from_pretrained(model_name, config=self.config)
+        hidden_size = self.config.hidden_size
+        self.dropout = nn.Dropout(0.1)
+        self.classifier = nn.Linear(hidden_size, num_sentence_labels)  # Sentence classification
+        self.bio_linear = nn.Linear(hidden_size, num_bio_labels)      # BIO tagging
+        self.crf = CRF(num_bio_labels)
+        self.target_head = nn.Linear(hidden_size, num_targets)        # Target classification
+    def forward(self, input_ids, attention_mask, bio_tags=None, sentence_labels=None, targets=None):
+        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
+        sequence_output = outputs.last_hidden_state
+        pooled_output = sequence_output[:, 0, :]
+        dropped = self.dropout(pooled_output)
+        sentence_logits = self.classifier(dropped)
+        bio_feats = self.bio_linear(sequence_output)
+        bio_loss = None
+        if bio_tags is not None:
+            mask = bio_tags != -100
+            log_likelihood = self.crf.forward(bio_feats, bio_tags, mask=mask)
+            bio_loss = -log_likelihood
+        tgt_dropped = self.dropout(pooled_output)
+        target_logits = self.target_head(tgt_dropped)
+        loss = 0.0
+        if sentence_labels is not None:
+            cls_loss = nn.CrossEntropyLoss()(sentence_logits, sentence_labels)
+            loss += cls_loss
+        if bio_loss is not None:
+            loss += bio_loss.sum()
+        if targets is not None:
+            bce_loss = nn.BCEWithLogitsLoss()(target_logits, targets)
+            loss += 2.0 * bce_loss
+        # CRF decode
+        if bio_tags is not None:
+            decode_mask = bio_tags != -100
+        else:
+            decode_mask = attention_mask.bool()
+        print("[DEBUG] bio_tags:", bio_tags)
+        print("[DEBUG] attention_mask.shape:", attention_mask.shape)
+        print("[DEBUG] decode_mask.shape:", decode_mask.shape)
+        print("[DEBUG] decode_mask[:, 0]:", decode_mask[:, 0] if decode_mask.dim() > 1 else decode_mask[0])
+        print("[DEBUG] bio_feats.shape:", bio_feats.shape)
+        bio_preds = self.crf.viterbi_decode(bio_feats, mask=decode_mask)
+        return {
+            'loss': loss,
+            'sentence_logits': sentence_logits,
+            'bio_logits': bio_feats,
+            'bio_preds': bio_preds,
+            'target_logits': target_logits
+        }
+class HateSpeechDetectorService:
+    def __init__(self):
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")
+        self.model = HateSpeechDetector()
+        # Model loading
+        MODEL_CKPT_PATH = hf_hub_download(repo_id="alohaboy/hate_detector_ko", filename="best_model.pt")
+        checkpoint = torch.load(MODEL_CKPT_PATH, map_location=self.device)
+        # state_dict key conversion
+        key_map = {
+            'sentence_classifier.weight': 'classifier.weight',
+            'sentence_classifier.bias': 'classifier.bias',
+            'bio_classifier.weight': 'bio_linear.weight',
+            'bio_classifier.bias': 'bio_linear.bias',
+            # CRF related keys (reverse)
+            'crf.transitions': 'crf.trans_matrix',
+            'crf.start_transitions': 'crf.start_trans',
+            'crf.end_transitions': 'crf.end_trans',
+        }
+        new_state_dict = {}
+        # If checkpoint is a dict and model_state_dict key exists, load from it
+        if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
+            state_dict = checkpoint['model_state_dict']
+        else:
+            state_dict = checkpoint
+        for k, v in state_dict.items():
+            new_key = key_map.get(k, k)
+            new_state_dict[new_key] = v
+        self.model.load_state_dict(new_state_dict, strict=True)
+        self.model.to(self.device)
+        self.model.eval()
+        # Blossom LLM loading
+        print("Blossom LLM loading...")
+        self.llm_model_name = "Bllossom/llama-3.2-Korean-Bllossom-3B"
+        self.llm_tokenizer = AutoTokenizer.from_pretrained(self.llm_model_name)
+        self.llm_model = AutoModelForCausalLM.from_pretrained(
+            self.llm_model_name,
+            torch_dtype=torch.bfloat16,
+            device_map="auto"
+        )
+        print("LLM loading complete!")
+        self.label_names = ["normal", "offensive", "L1_hate", "L2_hate"]
+        self.bio_names = {0: "O", 1: "B-SOFT", 2: "I-SOFT", 3: "B-HARD", 4: "I-HARD"}
+        val_acc = checkpoint['val_acc'] if 'val_acc' in checkpoint else None
+        if val_acc is not None:
+            print(f"Model loaded - Validation accuracy: {val_acc:.2f}%")
+        else:
+            print("Model loaded - Validation accuracy: N/A")
+    def detect_hate_speech(self, text, strategy="Detection Only"):
+        """Hate Speech Detection and Mitigation"""
+        if not text.strip():
+            return "Please enter text", ""
+        if len(text.strip()) < 2:
+            return "Input text is too short. Please enter at least 2 characters.", ""
+        if strategy == "Detection Only":
+            result_msg, mitigation, debug_info = self._detection_only(text)
+            print("[DEBUG] Input text:", text)
+            print("[DEBUG] sentence_logits:", debug_info.get('sentence_logits'))
+            print("[DEBUG] sentence_probs:", debug_info.get('sentence_probs'))
+            print("[DEBUG] sentence_pred:", debug_info.get('sentence_pred'))
+            print("[DEBUG] label:", debug_info.get('label'))
+            print("[DEBUG] confidence:", debug_info.get('confidence'))
+            return result_msg, mitigation
+        elif strategy == "Guided":
+            return self._guided_mitigation(text)
+        elif strategy == "Guided+Reflect":
+            return self._guided_reflect_mitigation(text)
+        elif strategy == "Unguided":
+            return self._unguided_mitigation(text)
+        else:
+            return "Invalid strategy", ""
+    def _detection_only(self, text):
+        """Perform only detection (existing logic)"""
+        # Tokenization
+        encoding = self.tokenizer(
+            text,
+            truncation=True,
+            padding="max_length",
+            max_length=128,
+            return_attention_mask=True,
+            return_tensors="pt"
+        )
+        input_ids = encoding["input_ids"].to(self.device)
+        attention_mask = encoding["attention_mask"].to(self.device)
+        print("[DEBUG] attention_mask[:, 0] =", attention_mask[:, 0])
+        # Prediction
+        with torch.no_grad():
+            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
+            sentence_logits = outputs["sentence_logits"]
+            bio_logits = outputs["bio_logits"]
+            # Sentence classification result
+            sentence_probs = torch.softmax(sentence_logits, dim=1)
+            sentence_pred = torch.argmax(sentence_logits, dim=1).item()
+            sentence_prob = sentence_probs[0][sentence_pred].item()
+            # BIO tagging result
+            bio_preds = torch.argmax(bio_logits, dim=2)[0]
+            # Find hate/aggressive tokens
+            hate_tokens = []
+            tokens = self.tokenizer.convert_ids_to_tokens(input_ids[0])
+            # Tokenize original text to get offset mapping
+            tokenized = self.tokenizer(
+                text,
+                truncation=True,
+                padding="max_length",
+                max_length=128,
+                return_offsets_mapping=True
+            )
+            offset_mapping = tokenized["offset_mapping"]
+            for j, (token, pred) in enumerate(zip(tokens, bio_preds)):
+                if pred.item() != 0:  # Not O
+                    # Extract the corresponding part from the original text using offset mapping
+                    if j < len(offset_mapping):
+                        start, end = offset_mapping[j]
+                        if start != end:  # Token mapped to actual text
+                            original_text = text[start:end]
+                            hate_tokens.append((j, original_text, self.bio_names[pred.item()]))
+                        else:
+                            # Special token handling
+                            if token.startswith('Ġ'):
+                                decoded_token = token[1:]  # Remove Ġ
+                            elif token in ['[CLS]', '[SEP]', '[PAD]', '[UNK]']:
+                                decoded_token = token
+                            else:
+                                decoded_token = token
+                            hate_tokens.append((j, decoded_token, self.bio_names[pred.item()]))
+                    else:
+                        # Fallback
+                        if token.startswith('Ġ'):
+                            decoded_token = token[1:]
+                        elif token in ['[CLS]', '[SEP]', '[PAD]', '[UNK]']:
+                            decoded_token = token
+                        else:
+                            decoded_token = token
+                        hate_tokens.append((j, decoded_token, self.bio_names[pred.item()]))
+        # Determine label
+        label = self.label_names[sentence_pred]
+        # If hate_tokens contain B-HARD, I-HARD, increase label to L2_hate
+        if any(bio_label in ["B-HARD", "I-HARD"] for _, _, bio_label in hate_tokens):
+            label = "L2_hate"
+        # Construct result message
+        result_msg = f"Detection result: {label}\nConfidence: {sentence_prob:.2f}"
+        if hate_tokens:
+            result_msg += f"\nIdentified hate/aggressive expressions: {hate_tokens}"
+        mitigation = "Performed only detection."
+        debug_info = {
+            'sentence_logits': sentence_logits,
+            'sentence_probs': sentence_probs,
+            'sentence_pred': sentence_pred,
+            'label': label,
+            'confidence': sentence_prob,
+            'hate_tokens': hate_tokens
+        }
+        return result_msg, mitigation, debug_info
+    def _unguided_mitigation(self, text):
+        """Unguided Mode: Only Using Generation"""
+        try:
+            # Blossom LLM prompt
+            prompt = f"""Please remove hate speech or aggressive expressions from the following sentence, while maintaining the original intent (criticism, complaint, opinion, etc.).
+Original: {text}
+Mitigated sentence:"""
+            # LLM inference
+            inputs = self.llm_tokenizer(prompt, return_tensors="pt").to(self.llm_model.device)
+            with torch.no_grad():
+                outputs = self.llm_model.generate(
+                    **inputs,
+                    do_sample=True,
+                    top_k=50,
+                    top_p=0.9,
+                    max_new_tokens=300,
+                    pad_token_id=self.llm_tokenizer.pad_token_id,
+                    eos_token_id=self.llm_tokenizer.eos_token_id
+                )
+            # Decode result
+            full_response = self.llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
+            # Remove prompt part and extract mitigated sentence
+            mitigated_text = full_response.replace(prompt, "").strip()
+            # Handle truncated sentences
+            if len(mitigated_text) < 10:  # Too short, use original response
+                mitigated_text = full_response
+            # Prevent repetitive output: extract only the first mitigated sentence
+            if "Mitigated sentence:" in mitigated_text:
+                mitigated_text = mitigated_text.split("Mitigated sentence:")[-1].strip()
+            # Use only the first meaningful line if multiple lines
+            lines = mitigated_text.split('\n')
+            clean_lines = []
+            for line in lines:
+                line = line.strip()
+                if line and not line.startswith('**') and not line.startswith('Original:'):
+                    clean_lines.append(line)
+            if clean_lines:
+                mitigated_text = clean_lines[0]
+            # Result message
+            result_msg = f"🤖 **Blossom LLM Mitigation Result**\n\n"
+            result_msg += f"**Original:** {text}\n\n"
+            result_msg += f"**Mitigated Sentence:** {mitigated_text}"
+            # Mitigation info
+            mitigation = "**Unguided Mode:** Blossom LLM detected and mitigated harmful expressions autonomously."
+            return result_msg, mitigation
+        except Exception as e:
+            error_msg = f"❌ **Blossom LLM Error**\n\nError occurred: {str(e)}"
+            return error_msg, "An error occurred during LLM processing."
+    def _guided_mitigation(self, text):
+        """Guided Mode: Mitigate based on KcELECTRA detection result using Blossom LLM"""
+        try:
+            # First, perform detection with KcELECTRA
+            detection_result, _, debug_info = self._detection_only(text)
+            label = debug_info.get('label', 'normal')
+            hate_tokens = debug_info.get('hate_tokens', [])
+            # Construct Blossom LLM prompt
+            if label == "normal":
+                prompt = f"""The following sentence is classified as a normal sentence. Please improve it by expressing it more politely and respectfully, while maintaining the original intent.\n\nOriginal: {text}\n\nImproved sentence:"""
+            else:
+                label_desc = {
+                    "offensive": "Aggressive",
+                    "L1_hate": "Mild Hate",
+                    "L2_hate": "Severe Hate"
+                }
+                hate_tokens_str = ""
+                if hate_tokens:
+                    hate_tokens_str = "\nExpressions causing issues:\n" + "\n".join([f"• {token} ({bio_label})" for _, token, bio_label in hate_tokens[:5]])
+                prompt = f"""The following sentence is classified as {label_desc.get(label, "harmful")} expression. \nPlease remove hate speech or aggressive expressions, while maintaining the original intent (criticism, complaint, opinion, etc.).\n\nOriginal: {text}\nClassification: {label_desc.get(label, "harmful")} expression\n{hate_tokens_str}\n\n[Important] All offensive, derogatory, and explicit hate expressions (e.g., 씨발, 좆, 병신) must be deleted.\n\nMitigated sentence:"""
+            # LLM inference
+            inputs = self.llm_tokenizer(prompt, return_tensors="pt").to(self.llm_model.device)
+            with torch.no_grad():
+                outputs = self.llm_model.generate(
+                    **inputs,
+                    do_sample=True,
+                    top_k=50,
+                    top_p=0.9,
+                    max_new_tokens=300,
+                    pad_token_id=self.llm_tokenizer.pad_token_id,
+                    eos_token_id=self.llm_tokenizer.eos_token_id
+                )
+            full_response = self.llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
+            mitigated_text = full_response.replace(prompt, "").strip()
+            if len(mitigated_text) < 10:
+                mitigated_text = full_response
+            if "Mitigated sentence:" in mitigated_text:
+                mitigated_text = mitigated_text.split("Mitigated sentence:")[-1].strip()
+            lines = mitigated_text.split('\n')
+            clean_lines = []
+            for line in lines:
+                line = line.strip()
+                if line and not line.startswith('**') and not line.startswith('Original:') and not line.startswith('Classification:'):
+                    clean_lines.append(line)
+            if clean_lines:
+                mitigated_text = clean_lines[0]
+            result_msg = f"🎯 **Guided Mitigation Result**\n\n"
+            result_msg += f"**KcELECTRA Detection Result:**\n{detection_result}\n\n"
+            result_msg += f"**Blossom LLM Mitigation Result:**\n{mitigated_text}"
+            mitigation = "**Guided Mode:** Blossom LLM performed specific mitigation based on KcELECTRA's detection information."
+            return result_msg, mitigation
+        except Exception as e:
+            error_msg = f"❌ **Guided Mitigation Error**\n\nError occurred: {str(e)}"
+            return error_msg, "An error occurred during guided mitigation processing."
+    def _guided_reflect_mitigation(self, text):
+        """Guided+Reflect Mode: iterative refinement + critic evaluation"""
+        try:
+            detection_result, _, debug_info = self._detection_only(text)
+            label = debug_info.get('label', 'normal')
+            hate_tokens = debug_info.get('hate_tokens', [])
+            # Step 1: Initial mitigation
+            if label == "normal":
+                initial_prompt = f"""The following sentence is classified as a normal sentence. Please improve it by expressing it more politely and respectfully, while maintaining the original intent.\n\nOriginal: {text}\n\nImproved sentence:"""
+            else:
+                label_desc = {
+                    "offensive": "Aggressive",
+                    "L1_hate": "Mild Hate",
+                    "L2_hate": "Severe Hate"
+                }
+                hate_tokens_str = ""
+                if hate_tokens:
+                    hate_tokens_str = "\nExpressions causing issues:\n" + "\n".join([f"• {token} ({bio_label})" for _, token, bio_label in hate_tokens[:5]])
+                initial_prompt = f"""The following sentence is classified as {label_desc.get(label, "harmful")} expression. \nExpressions containing offensive words (e.g., 좃, 씨발, 병신) must be deleted.\nOther aggressive or inappropriate expressions should be mitigated by expressing them more politely and inclusively.\n\nOriginal: {text}\nClassification: {label_desc.get(label, "harmful")} expression\n{hate_tokens_str}\n\nMitigated sentence:"""
+            # Iterative mitigation and evaluation
+            max_iter = 5
+            metrics_history = []
+            best_candidate = None
+            best_score = -float('inf')
+            current_input = text
+            for i in range(max_iter):
+                # Generate candidate
+                inputs = self.llm_tokenizer(initial_prompt, return_tensors="pt").to(self.llm_model.device)
+                with torch.no_grad():
+                    outputs = self.llm_model.generate(
+                        **inputs,
+                        do_sample=True,
+                        top_k=50,
+                        top_p=0.9,
+                        max_new_tokens=300,
+                        pad_token_id=self.llm_tokenizer.pad_token_id,
+                        eos_token_id=self.llm_tokenizer.eos_token_id
+                    )
+                candidate = self.llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
+                mitigated_text = candidate.replace(initial_prompt, "").strip()
+                if len(mitigated_text) < 10:
+                    mitigated_text = candidate
+                if "Mitigated sentence:" in mitigated_text:
+                    mitigated_text = mitigated_text.split("Mitigated sentence:")[-1].strip()
+                lines = mitigated_text.split('\n')
+                clean_lines = []
+                for line in lines:
+                    line = line.strip()
+                    if line and not line.startswith('**') and not line.startswith('Original:') and not line.startswith('Classification:'):
+                        clean_lines.append(line)
+                if clean_lines:
+                    mitigated_text = clean_lines[0]
+                # Exclude candidates containing offensive words
+                if contains_badword(mitigated_text):
+                    continue
+                # Evaluation
+                toxicity = calc_toxicity_reduction(text, mitigated_text, self.model, self.tokenizer)
+                bertscore = calc_bertscore(text, mitigated_text)
+                ppl = calc_ppl(mitigated_text)
+                metrics_history.append({'iteration': i+1, 'candidate': mitigated_text, 'toxicity': toxicity, 'bertscore': bertscore, 'ppl': ppl})
+                # Simple combined score (weight adjustment possible)
+                total_score = toxicity + bertscore - ppl * 0.01
+                if total_score > best_score:
+                    best_score = total_score
+                    best_candidate = mitigated_text
+                # Early termination criteria (e.g., toxicity>0.3, bertscore>0.7, ppl<100)
+                if toxicity > 0.3 and bertscore > 0.7 and ppl < 100:
+                    break
+            # Log output
+            iter_log_str = ""
+            for log in metrics_history:
+                iter_log_str += f"\nIteration {log['iteration']}:\n- Candidate: {log['candidate']}\n- Toxicity reduction: {log['toxicity']}, bertscore: {log['bertscore']}, ppl: {log['ppl']}"
+            # Result message
+            result_msg = f"🔄 **Guided+Reflect Mitigation Result**\n\n"
+            result_msg += f"**Detection Result:**\n{detection_result}\n\n"
+            result_msg += f"**Iterative Mitigation Log:**{iter_log_str}\n\n"
+            result_msg += f"**Best Mitigation:** {best_candidate}"
+            mitigation = "**Guided+Reflect Mode:** Selected the optimal candidate after iterative mitigation and evaluation (maximum 5 iterations)."
+            return result_msg, mitigation
+        except Exception as e:
+            error_msg = f"❌ **Guided+Reflect Mitigation Error**\n\nError occurred: {str(e)}"
+            return error_msg, "An error occurred during guided+reflect mitigation processing."
+    def _suggest_mitigation(self, label, confidence, hate_tokens):
+        """Suggest mitigation for hate speech expressions"""
+        if label == "normal":
+            return "✅ **Mitigation Suggestion**: This sentence does not require correction."
+        mitigation = f"**🔧 Mitigation Suggestion for Hate Speech:**\n\n"
+        if label == "offensive":
+            mitigation += "**Aggressive Expression Mitigation Options:**\n"
+            mitigation += "• Try to change aggressive expressions to more polite expressions\n"
+            mitigation += "• Use objective expressions instead of emotional expressions\n"
+            mitigation += "• Reconstruct with a mind to be considerate\n"
+            mitigation += "• When criticizing, provide specific and constructive feedback"
+        elif label == "L1_hate":
+            mitigation += "**Implicit Hate Expression Mitigation Options:**\n"
+            mitigation += "• Remove expressions that discriminate or show prejudice\n"
+            mitigation += "• Avoid generalizing about specific groups\n"
+            mitigation += "• Use more inclusive and respectful expressions\n"
+            mitigation += "• Change to expressions that acknowledge diversity"
+        else:  # L2_hate
+            mitigation += "**Explicit Hate Expression Mitigation Options:**\n"
+            mitigation += "• Completely remove severe hate expressions\n"
+            mitigation += "• Do not use violent or threatening expressions\n"
+            mitigation += "• Use expressions that respect everyone's dignity\n"
+            mitigation += "• Change to expressions that discriminate or promote hate\n"
+            mitigation += "• If necessary, seek professional help"
+        return mitigation
+def contains_badword(text):
+    badwords = ["좃", "씨발", "병신", "개새끼", "염병", "좆", "ㅅㅂ", "ㅄ", "ㅂㅅ", "ㅗ", "ㅉ"]
+    return any(bad in text for bad in badwords)
+# Service initialization: runs at import time, so the detector checkpoint and the LLM are
+# loaded once and the same instance backs every Gradio callback wired up in create_demo().
+service = HateSpeechDetectorService()
+# Gradio interface
+def create_demo():
+    with gr.Blocks(
+        title="Korean Hate Speech Detection and Mitigation System",
+        theme=gr.themes.Soft(),
+        css="""
+        .gradio-container {
+            max-width: 800px;
+            margin: 0 auto;
+        }
+        .result-box {
+            border-radius: 10px;
+            padding: 15px;
+            margin: 10px 0;
+        }
+        .normal { background-color: #d4edda; border: 1px solid #c3e6cb; }
+        .offensive { background-color: #fff3cd; border: 1px solid #ffeaa7; }
+        .hate { background-color: #f8d7da; border: 1px solid #f5c6cb; }
+        """
+    ) as demo:
+        gr.Markdown("""
+        # Korean Hate Speech Detection and Mitigation System
+        This system detects hate speech in Korean text and provides mitigation suggestions.
+        **🟢 Normal**:
+        - It is a normal sentence.
+        **🟡 Offensive**
+        - For example: "Don't say such a stupid thing", "How can you do such a stupid thing"
+        **🟠 L1_hate (Implicit Hate)**: Mild hate expression
+        - **Implicit hate expression** for protected attribute groups
+        - For example: "Those people are all the same", "Prejudicial expression towards a specific group"
+        **🔴 L2_hate (Explicit Hate)**: Severe hate expression
+        - **Explicit hate expression** for protected attribute groups
+        **🤖 Mitigation Mode:**
+        - 🔍 **Detection Only**: Hate Speech Detection Only
+        - 🎯 **Guided**: Guided Mitigation
+        - 🔄 **Guided+Reflect**: After Guided Mitigation, Iterative Refinement
+        - 🤖 **Unguided**: LLM generates text without any guidance
+        """)
+        with gr.Row():
+            with gr.Column(scale=2):
+                input_text = gr.Textbox(
+                    label="Enter text to detect hate speech & mitigate",
+                    lines=3
+                )
+                strategy = gr.Radio(
+                    ["Detection Only", "Guided", "Guided+Reflect", "Unguided"],
+                    value="Detection Only",
+                    label="Select Mitigation Mode",
+                    container=True
+                )
+                analyze_btn = gr.Button("🔍 Detect & Mitigate", variant="primary", size="lg")
+            # with gr.Column(scale=1):
+            #     gr.Markdown("""
+            #     **🧪 Test Examples:**
+            #     **🟢 Normal:**
+            #     - "Hello, today's weather is nice."
+            #     - "This movie was really fun."
+            #     **🟡 Offensive:**
+            #     - "How can you do such a stupid thing"
+            #     - "Don't say such a stupid thing"
+            #     **🟠 L1_hate (Implicit):**
+            #     - "Those people are all the same"
+            #     - "Prejudicial expression towards a specific group"
+            #     **🔴 L2_hate (Explicit):**
+            #     - "All women are useless"
+            #     - "People with disabilities are a burden to society"
+            #     """)
+        with gr.Row():
+            with gr.Column():
+                result_output = gr.Markdown(
+                    label="Mitigation Button",
+                    value="Input text and click the above button."
+                )
+            with gr.Column():
+                mitigation_output = gr.Markdown(
+                    label="Mitigation Suggestion",
+                    value="Based on the analysis result, mitigation suggestions will be provided."
+                )
+        # Event handlers
+        analyze_btn.click(
+            fn=service.detect_hate_speech,
+            inputs=[input_text, strategy],
+            outputs=[result_output, mitigation_output]
+        )
+        # Allow analysis via Enter key
+        input_text.submit(
+            fn=service.detect_hate_speech,
+            inputs=[input_text, strategy],
+            outputs=[result_output, mitigation_output]
+        )
+        # gr.Markdown("""
+        # ---
+        # **Model Information:**
+        # - Detection Model: KcELECTRA-base (Validation Accuracy: 67.67%)
+        # - Mitigation Model: Blossom LLM (llama-3.2-Korean-Bllossom-3B)
+        # - Training Data: K-HATERS Dataset
+        # """)
+    return demo
+if __name__ == "__main__":
+    demo = create_demo()
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7863,
+        share=True,
+        show_error=True
+    )

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+gradio>=4.44.0
+torch>=2.0.0
+transformers>=4.30.0
+bert-score>=0.3.13
+numpy>=1.21.0
+scikit-learn>=1.0.0
+accelerate>=0.20.0
+TorchCRF==1.1.0