Update to latest Korean hate speech detection and mitigation system
README.md
CHANGED
@@ -1,14 +1,18 @@
---
-title:
-emoji:
+title: Hate Speech Mitigation Demo
+emoji: 🛡️
colorFrom: indigo
colorTo: blue
sdk: gradio
-sdk_version: "4.
+sdk_version: "4.27.0"
app_file: app.py
pinned: false
---

-#
-This Space is a Korean hate speech
+# Hate Speech Mitigation Demo
+
+This Space is a demo for mitigating Korean hate speech (혐오 표현).
+
+- Gradio-based interface
+- Electra + CRF based hate speech detection
+- LLM-based sentence mitigation (see the sketch below)
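The bullets above summarize the moving parts of the Space. Below is a minimal, hedged sketch of how they fit together, reusing the helpers exactly as app.py calls them; the internals of `detect_spans` (and its return format) live in the repo's `detector` module and are assumed here, not guaranteed.

```python
# Minimal sketch of the detect -> rewrite flow described above.
# detect_spans is assumed to return {"category": ..., "spans": [{"text": ...}, ...], "targets": [...]}
# exactly as app.py below uses it.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from detector import detect_spans

def mitigate(text: str, llm_name: str = "Bllossom/llama-3.2-Korean-Bllossom-3B") -> str:
    det = detect_spans(text)
    if det["category"] == "normal":
        return text  # nothing to rewrite
    tok = AutoTokenizer.from_pretrained(llm_name)
    llm = AutoModelForCausalLM.from_pretrained(llm_name, torch_dtype=torch.float32)
    prompt = (
        "문장을 완화해주세요.\n\n"
        f"- 원문: {text}\n"
        f"- 혐오 표현: {[s['text'] for s in det['spans']]}\n\n"
        "혐오 표현을 더 평화로운 표현으로 완화해주세요."
    )
    inputs = tok(prompt, return_tensors="pt")
    out = llm.generate(**inputs, do_sample=True, top_p=0.9, max_new_tokens=64,
                       pad_token_id=tok.eos_token_id)
    # return only the newly generated continuation, not the echoed prompt
    return tok.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
```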
app.py
CHANGED
@@ -1,5 +1,13 @@
import gradio as gr
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, AutoConfig
import numpy as np
@@ -8,636 +16,292 @@ from TorchCRF import CRF

from bert_score import score as bert_score_fn
import re
-from huggingface_hub import hf_hub_download

def calc_bertscore(orig_text, rewritten_text):
    P, R, F1 = bert_score_fn([rewritten_text], [orig_text], lang="ko")
    return round(F1[0].item(), 3)

def calc_ppl(text):
-    return 1.0

def calc_toxicity_reduction(orig_text, rewritten_text, detector_model, detector_tokenizer):
-        orig_attention_mask = orig_enc["attention_mask"].to(device)
-        with torch.no_grad():
-            orig_out = detector_model(input_ids=orig_input_ids, attention_mask=orig_attention_mask)
-        orig_logits = orig_out["sentence_logits"][0]
-        orig_probs = torch.softmax(orig_logits, dim=-1)
-        orig_toxicity = 1.0 - orig_probs[0].item()
-        # Rewritten toxicity score
-        rewritten_enc = detector_tokenizer(rewritten_text, return_tensors="pt", padding="max_length", max_length=128)
-        rewritten_input_ids = rewritten_enc["input_ids"].to(device)
-        rewritten_attention_mask = rewritten_enc["attention_mask"].to(device)
-        with torch.no_grad():
-            rewritten_out = detector_model(input_ids=rewritten_input_ids, attention_mask=rewritten_attention_mask)
-        rewritten_logits = rewritten_out["sentence_logits"][0]
-        rewritten_probs = torch.softmax(rewritten_logits, dim=-1)
-        rewritten_toxicity = 1.0 - rewritten_probs[0].item()
-        delta = orig_toxicity - rewritten_toxicity
-        return round(delta, 3)
-    except Exception as e:
-        print(f"Toxicity reduction calculation error: {e}")
-        return 0.0

-        bio_loss = -log_likelihood
-        tgt_dropped = self.dropout(pooled_output)
-        target_logits = self.target_head(tgt_dropped)
-        loss = 0.0
-        if sentence_labels is not None:
-            cls_loss = nn.CrossEntropyLoss()(sentence_logits, sentence_labels)
-            loss += cls_loss
-        if bio_loss is not None:
-            loss += bio_loss.sum()
-        if targets is not None:
-            bce_loss = nn.BCEWithLogitsLoss()(target_logits, targets)
-            loss += 2.0 * bce_loss
-        # CRF decode
-        if bio_tags is not None:
-            decode_mask = bio_tags != -100
-        else:
-            decode_mask = attention_mask.bool()
-        print("[DEBUG] bio_tags:", bio_tags)
-        print("[DEBUG] attention_mask.shape:", attention_mask.shape)
-        print("[DEBUG] decode_mask.shape:", decode_mask.shape)
-        print("[DEBUG] decode_mask[:, 0]:", decode_mask[:, 0] if decode_mask.dim() > 1 else decode_mask[0])
-        print("[DEBUG] bio_feats.shape:", bio_feats.shape)
-        bio_preds = self.crf.viterbi_decode(bio_feats, mask=decode_mask)
-        return {
-            'loss': loss,
-            'sentence_logits': sentence_logits,
-            'bio_logits': bio_feats,
-            'bio_preds': bio_preds,
-            'target_logits': target_logits
-        }
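The removed forward pass above depends on a `CRF` layer from the TorchCRF package for the BIO span head, but its construction is not visible in this hunk. A small hedged sketch of the usage pattern (the constructor signature is assumed from the TorchCRF package; the call shapes follow the code above):

```python
# Hedged sketch of the TorchCRF usage the forward pass above relies on.
import torch
from TorchCRF import CRF

num_bio_labels = 5                                    # O, B-SOFT, I-SOFT, B-HARD, I-HARD
crf = CRF(num_bio_labels)                             # assumed constructor: CRF(num_labels)

batch, seq_len = 2, 8
bio_feats = torch.randn(batch, seq_len, num_bio_labels)         # emission scores from the encoder head
bio_tags = torch.randint(0, num_bio_labels, (batch, seq_len))
mask = torch.ones(batch, seq_len, dtype=torch.bool)             # first timestep must be 1, hence the decode_mask[:, 0] debug print above

log_likelihood = crf(bio_feats, bio_tags, mask)       # per-example log likelihood
bio_loss = -log_likelihood                            # summed into the multi-task loss above
bio_preds = crf.viterbi_decode(bio_feats, mask)       # best tag sequence per example
```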
-        if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
-            state_dict = checkpoint['model_state_dict']
-        else:
-            state_dict = checkpoint
-        for k, v in state_dict.items():
-            new_key = key_map.get(k, k)
-            new_state_dict[new_key] = v
-        self.model.load_state_dict(new_state_dict, strict=True)
-        self.model.to(self.device)
-        self.model.eval()
-
-        # Blossom LLM loading
-        print("Blossom LLM loading...")
-        self.llm_model_name = "Bllossom/llama-3.2-Korean-Bllossom-3B"
-        self.llm_tokenizer = AutoTokenizer.from_pretrained(self.llm_model_name)
-        self.llm_model = AutoModelForCausalLM.from_pretrained(
-            self.llm_model_name,
-            torch_dtype=torch.bfloat16,
-            device_map="auto"
-        )
-        print("LLM loading complete!")
-
-        self.label_names = ["normal", "offensive", "L1_hate", "L2_hate"]
-        self.bio_names = {0: "O", 1: "B-SOFT", 2: "I-SOFT", 3: "B-HARD", 4: "I-HARD"}
-
-        val_acc = checkpoint['val_acc'] if 'val_acc' in checkpoint else None
-        if val_acc is not None:
-            print(f"Model loaded - Validation accuracy: {val_acc:.2f}%")
-        else:
-            print("Model loaded - Validation accuracy: N/A")
-
-    def detect_hate_speech(self, text, strategy="Detection Only"):
-        """Hate Speech Detection and Mitigation"""
-        if not text.strip():
-            return "Please enter text", ""
-        if len(text.strip()) < 2:
-            return "Input text is too short. Please enter at least 2 characters.", ""
-
-        if strategy == "Detection Only":
-            result_msg, mitigation, debug_info = self._detection_only(text)
-            print("[DEBUG] Input text:", text)
-            print("[DEBUG] sentence_logits:", debug_info.get('sentence_logits'))
-            print("[DEBUG] sentence_probs:", debug_info.get('sentence_probs'))
-            print("[DEBUG] sentence_pred:", debug_info.get('sentence_pred'))
-            print("[DEBUG] label:", debug_info.get('label'))
-            print("[DEBUG] confidence:", debug_info.get('confidence'))
-            return result_msg, mitigation
-        elif strategy == "Guided":
-            return self._guided_mitigation(text)
-        elif strategy == "Guided+Reflect":
-            return self._guided_reflect_mitigation(text)
-        elif strategy == "Unguided":
-            return self._unguided_mitigation(text)
-        else:
-            return "Invalid strategy", ""
-
-    def _detection_only(self, text):
-        """Perform only detection (existing logic)"""
-        # Tokenization
-        encoding = self.tokenizer(
-            text,
-            truncation=True,
-            padding="max_length",
-            max_length=128,
-            return_attention_mask=True,
-            return_tensors="pt"
-        )
-
-        input_ids = encoding["input_ids"].to(self.device)
-        attention_mask = encoding["attention_mask"].to(self.device)
-        print("[DEBUG] attention_mask[:, 0] =", attention_mask[:, 0])
-
-        # Prediction
-        with torch.no_grad():
-            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
-            sentence_logits = outputs["sentence_logits"]
-            bio_logits = outputs["bio_logits"]
-
-        # Sentence classification result
-        sentence_probs = torch.softmax(sentence_logits, dim=1)
-        sentence_pred = torch.argmax(sentence_logits, dim=1).item()
-        sentence_prob = sentence_probs[0][sentence_pred].item()
-
-        # BIO tagging result
-        bio_preds = torch.argmax(bio_logits, dim=2)[0]
-
-        # Find hate/aggressive tokens
-        hate_tokens = []
-        tokens = self.tokenizer.convert_ids_to_tokens(input_ids[0])
-
-        # Tokenize original text to get offset mapping
-        tokenized = self.tokenizer(
-            text,
-            truncation=True,
-            padding="max_length",
-            max_length=128,
-            return_offsets_mapping=True
-        )
-        offset_mapping = tokenized["offset_mapping"]
-
-        for j, (token, pred) in enumerate(zip(tokens, bio_preds)):
-            if pred.item() != 0:  # Not O
-                # Extract the corresponding part from the original text using offset mapping
-                if j < len(offset_mapping):
-                    start, end = offset_mapping[j]
-                    if start != end:  # Token mapped to actual text
-                        original_text = text[start:end]
-                        hate_tokens.append((j, original_text, self.bio_names[pred.item()]))
-                    else:
-                        # Special token handling
-                        if token.startswith('Ġ'):
-                            decoded_token = token[1:]  # Remove Ġ
-                        elif token in ['[CLS]', '[SEP]', '[PAD]', '[UNK]']:
-                            decoded_token = token
-                        else:
-                            decoded_token = token
-                        hate_tokens.append((j, decoded_token, self.bio_names[pred.item()]))
-                else:
-                    # Fallback
-                    if token.startswith('Ġ'):
-                        decoded_token = token[1:]
-                    elif token in ['[CLS]', '[SEP]', '[PAD]', '[UNK]']:
-                        decoded_token = token
-                    else:
-                        decoded_token = token
-                    hate_tokens.append((j, decoded_token, self.bio_names[pred.item()]))
-
-        # Determine label
-        label = self.label_names[sentence_pred]
-        # If hate_tokens contain B-HARD or I-HARD, raise the label to L2_hate
-        if any(bio_label in ["B-HARD", "I-HARD"] for _, _, bio_label in hate_tokens):
-            label = "L2_hate"
-        # Construct result message
-        result_msg = f"Detection result: {label}\nConfidence: {sentence_prob:.2f}"
-        if hate_tokens:
-            result_msg += f"\nIdentified hate/aggressive expressions: {hate_tokens}"
-        mitigation = "Performed only detection."
-        debug_info = {
-            'sentence_logits': sentence_logits,
-            'sentence_probs': sentence_probs,
-            'sentence_pred': sentence_pred,
-            'label': label,
-            'confidence': sentence_prob,
-            'hate_tokens': hate_tokens
-        }
-        return result_msg, mitigation, debug_info
-
-    def _unguided_mitigation(self, text):
-        """Unguided Mode: Only Using Generation"""
-        try:
-            # Blossom LLM prompt
-            prompt = f"""Please remove hate speech or aggressive expressions from the following sentence, while maintaining the original intent (criticism, complaint, opinion, etc.).

-            if clean_lines:
-                mitigated_text = clean_lines[0]
-
-            # Result message
-            result_msg = f"🤖 **Blossom LLM Mitigation Result**\n\n"
-            result_msg += f"**Original:** {text}\n\n"
-            result_msg += f"**Mitigated Sentence:** {mitigated_text}"
-
-            # Mitigation info
-            mitigation = "**Unguided Mode:** Blossom LLM detected and mitigated harmful expressions autonomously."
-
-            return result_msg, mitigation
-
-        except Exception as e:
-            error_msg = f"❌ **Blossom LLM Error**\n\nError occurred: {str(e)}"
-            return error_msg, "An error occurred during LLM processing."
-
-    def _guided_mitigation(self, text):
-        """Guided Mode: Mitigate based on KcELECTRA detection result using Blossom LLM"""
-        try:
-            # First, perform detection with KcELECTRA
-            detection_result, _, debug_info = self._detection_only(text)
-            label = debug_info.get('label', 'normal')
-            hate_tokens = debug_info.get('hate_tokens', [])
-
-            # Construct Blossom LLM prompt
-            if label == "normal":
-                prompt = f"""The following sentence is classified as a normal sentence. Please improve it by expressing it more politely and respectfully, while maintaining the original intent.\n\nOriginal: {text}\n\nImproved sentence:"""
-            else:
-                label_desc = {
-                    "offensive": "Aggressive",
-                    "L1_hate": "Mild Hate",
-                    "L2_hate": "Severe Hate"
-                }
-                hate_tokens_str = ""
-                if hate_tokens:
-                    hate_tokens_str = "\nExpressions causing issues:\n" + "\n".join([f"• {token} ({bio_label})" for _, token, bio_label in hate_tokens[:5]])
-                prompt = f"""The following sentence is classified as {label_desc.get(label, "harmful")} expression. \nPlease remove hate speech or aggressive expressions, while maintaining the original intent (criticism, complaint, opinion, etc.).\n\nOriginal: {text}\nClassification: {label_desc.get(label, "harmful")} expression\n{hate_tokens_str}\n\n[Important] All offensive, derogatory, and explicit hate expressions (e.g., 씨발, 좆, 병신) must be deleted.\n\nMitigated sentence:"""
-            # LLM inference
-            inputs = self.llm_tokenizer(prompt, return_tensors="pt").to(self.llm_model.device)
-            with torch.no_grad():
-                outputs = self.llm_model.generate(
-                    **inputs,
-                    do_sample=True,
-                    top_k=50,
-                    top_p=0.9,
-                    max_new_tokens=300,
-                    pad_token_id=self.llm_tokenizer.pad_token_id,
-                    eos_token_id=self.llm_tokenizer.eos_token_id
-                )
-            full_response = self.llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
-            mitigated_text = full_response.replace(prompt, "").strip()
-            if len(mitigated_text) < 10:
-                mitigated_text = full_response
-            if "Mitigated sentence:" in mitigated_text:
-                mitigated_text = mitigated_text.split("Mitigated sentence:")[-1].strip()
-            lines = mitigated_text.split('\n')
-            clean_lines = []
-            for line in lines:
-                line = line.strip()
-                if line and not line.startswith('**') and not line.startswith('Original:') and not line.startswith('Classification:'):
-                    clean_lines.append(line)
-            if clean_lines:
-                mitigated_text = clean_lines[0]
-            result_msg = f"🎯 **Guided Mitigation Result**\n\n"
-            result_msg += f"**KcELECTRA Detection Result:**\n{detection_result}\n\n"
-            result_msg += f"**Blossom LLM Mitigation Result:**\n{mitigated_text}"
-            mitigation = "**Guided Mode:** Blossom LLM performed specific mitigation based on KcELECTRA's detection information."
-            return result_msg, mitigation
-        except Exception as e:
-            error_msg = f"❌ **Guided Mitigation Error**\n\nError occurred: {str(e)}"
-            return error_msg, "An error occurred during guided mitigation processing."
-
-    def _guided_reflect_mitigation(self, text):
-        """Guided+Reflect Mode: iterative refinement + critic evaluation"""
-        try:
-            detection_result, _, debug_info = self._detection_only(text)
-            label = debug_info.get('label', 'normal')
-            hate_tokens = debug_info.get('hate_tokens', [])
-            # Step 1: Initial mitigation
-            if label == "normal":
-                initial_prompt = f"""The following sentence is classified as a normal sentence. Please improve it by expressing it more politely and respectfully, while maintaining the original intent.\n\nOriginal: {text}\n\nImproved sentence:"""
-            else:
-                label_desc = {
-                    "offensive": "Aggressive",
-                    "L1_hate": "Mild Hate",
-                    "L2_hate": "Severe Hate"
-                }
-                hate_tokens_str = ""
-                if hate_tokens:
-                    hate_tokens_str = "\nExpressions causing issues:\n" + "\n".join([f"• {token} ({bio_label})" for _, token, bio_label in hate_tokens[:5]])
-                initial_prompt = f"""The following sentence is classified as {label_desc.get(label, "harmful")} expression. \nExpressions containing offensive words (e.g., 좆, 씨발, 병신) must be deleted.\nOther aggressive or inappropriate expressions should be mitigated by expressing them more politely and inclusively.\n\nOriginal: {text}\nClassification: {label_desc.get(label, "harmful")} expression\n{hate_tokens_str}\n\nMitigated sentence:"""
-            # Iterative mitigation and evaluation
-            max_iter = 5
-            metrics_history = []
-            best_candidate = None
-            best_score = -float('inf')
-            current_input = text
-            for i in range(max_iter):
-                # Generate candidate
-                inputs = self.llm_tokenizer(initial_prompt, return_tensors="pt").to(self.llm_model.device)
-                with torch.no_grad():
-                    outputs = self.llm_model.generate(
-                        **inputs,
-                        do_sample=True,
-                        top_k=50,
-                        top_p=0.9,
-                        max_new_tokens=300,
-                        pad_token_id=self.llm_tokenizer.pad_token_id,
-                        eos_token_id=self.llm_tokenizer.eos_token_id
-                    )
-                candidate = self.llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
-                mitigated_text = candidate.replace(initial_prompt, "").strip()
-                if len(mitigated_text) < 10:
-                    mitigated_text = candidate
-                if "Mitigated sentence:" in mitigated_text:
-                    mitigated_text = mitigated_text.split("Mitigated sentence:")[-1].strip()
-                lines = mitigated_text.split('\n')
-                clean_lines = []
-                for line in lines:
-                    line = line.strip()
-                    if line and not line.startswith('**') and not line.startswith('Original:') and not line.startswith('Classification:'):
-                        clean_lines.append(line)
-                if clean_lines:
-                    mitigated_text = clean_lines[0]
-                # Exclude candidates containing offensive words
-                if contains_badword(mitigated_text):
-                    continue
-                # Evaluation
-                toxicity = calc_toxicity_reduction(text, mitigated_text, self.model, self.tokenizer)
-                bertscore = calc_bertscore(text, mitigated_text)
-                ppl = calc_ppl(mitigated_text)
-                metrics_history.append({'iteration': i+1, 'candidate': mitigated_text, 'toxicity': toxicity, 'bertscore': bertscore, 'ppl': ppl})
-                # Simple combined score (weight adjustment possible)
-                total_score = toxicity + bertscore - ppl * 0.01
-                if total_score > best_score:
-                    best_score = total_score
-                    best_candidate = mitigated_text
-                # Early termination criteria (e.g., toxicity > 0.3, bertscore > 0.7, ppl < 100)
-                if toxicity > 0.3 and bertscore > 0.7 and ppl < 100:
-                    break
-            # Log output
-            iter_log_str = ""
-            for log in metrics_history:
-                iter_log_str += f"\nIteration {log['iteration']}:\n- Candidate: {log['candidate']}\n- Toxicity reduction: {log['toxicity']}, bertscore: {log['bertscore']}, ppl: {log['ppl']}"
-            # Result message
-            result_msg = f"🔄 **Guided+Reflect Mitigation Result**\n\n"
-            result_msg += f"**Detection Result:**\n{detection_result}\n\n"
-            result_msg += f"**Iterative Mitigation Log:**{iter_log_str}\n\n"
-            result_msg += f"**Best Mitigation:** {best_candidate}"
-            mitigation = "**Guided+Reflect Mode:** Selected the optimal candidate after iterative mitigation and evaluation (maximum 5 iterations)."
-            return result_msg, mitigation
-        except Exception as e:
-            error_msg = f"❌ **Guided+Reflect Mitigation Error**\n\nError occurred: {str(e)}"
-            return error_msg, "An error occurred during guided+reflect mitigation processing."
-
-    def _suggest_mitigation(self, label, confidence, hate_tokens):
-        """Suggest mitigation for hate speech expressions"""
-        if label == "normal":
-            return "✅ **Mitigation Suggestion**: This sentence does not require correction."
-
-        mitigation = f"**🔧 Mitigation Suggestion for Hate Speech:**\n\n"
-
-        if label == "offensive":
-            mitigation += "**Aggressive Expression Mitigation Options:**\n"
-            mitigation += "• Try to change aggressive expressions to more polite expressions\n"
-            mitigation += "• Use objective expressions instead of emotional expressions\n"
-            mitigation += "• Reconstruct with a mind to be considerate\n"
-            mitigation += "• When criticizing, provide specific and constructive feedback"
-        elif label == "L1_hate":
-            mitigation += "**Implicit Hate Expression Mitigation Options:**\n"
-            mitigation += "• Remove expressions that discriminate or show prejudice\n"
-            mitigation += "• Avoid generalizing about specific groups\n"
-            mitigation += "• Use more inclusive and respectful expressions\n"
-            mitigation += "• Change to expressions that acknowledge diversity"
-        else:  # L2_hate
-            mitigation += "**Explicit Hate Expression Mitigation Options:**\n"
-            mitigation += "• Completely remove severe hate expressions\n"
-            mitigation += "• Do not use violent or threatening expressions\n"
-            mitigation += "• Use expressions that respect everyone's dignity\n"
-            mitigation += "• Do not use expressions that discriminate or promote hate\n"
-            mitigation += "• If necessary, seek professional help"
-
-        return mitigation

-    def

-def create_demo():
-    with gr.Blocks(
-        title="Korean Hate Speech Detection and Mitigation System",
-        theme=gr.themes.Soft(),
-        css="""
-        .gradio-container {
-            max-width: 800px;
-            margin: 0 auto;
-        }
-        .result-box {
-            border-radius: 10px;
-            padding: 15px;
-            margin: 10px 0;
-        }
-        .normal { background-color: #d4edda; border: 1px solid #c3e6cb; }
-        .offensive { background-color: #fff3cd; border: 1px solid #ffeaa7; }
-        .hate { background-color: #f8d7da; border: 1px solid #f5c6cb; }
-        """
-    ) as demo:
-        gr.Markdown("""
-        # Korean Hate Speech Detection and Mitigation System
-
-        This system detects hate speech in Korean text and provides mitigation suggestions.
-
-        **🟢 Normal**:
-        - It is a normal sentence.
-
-        **🟡 Offensive**
-        - For example: "Don't say such a stupid thing", "How can you do such a stupid thing"
-
-        **🟠 L1_hate (Implicit Hate)**: Mild hate expression
-        - **Implicit hate expression** toward protected attribute groups
-        - For example: "Those people are all the same", "Prejudicial expression towards a specific group"
-
-        **🔴 L2_hate (Explicit Hate)**: Severe hate expression
-        - **Explicit hate expression** toward protected attribute groups
-
-        **🤖 Mitigation Mode:**
-        - 🔍 **Detection Only**: Hate Speech Detection Only
-        - 🎯 **Guided**: Guided Mitigation
-        - 🔄 **Guided+Reflect**: After Guided Mitigation, Iterative Refinement
-        - 🤖 **Unguided**: LLM generates text without any guidance
-        """)
-
-        with gr.Row():
-            with gr.Column(scale=2):
-                input_text = gr.Textbox(
-                    label="Enter text to detect hate speech & mitigate",
-                    lines=3
-                )
-
-                strategy = gr.Radio(
-                    ["Detection Only", "Guided", "Guided+Reflect", "Unguided"],
-                    value="Detection Only",
-                    label="Select Mitigation Mode",
-                    container=True
-                )
-
-                analyze_btn = gr.Button("🔍 Detect & Mitigate", variant="primary", size="lg")
-
-            # with gr.Column(scale=1):
-            #     gr.Markdown("""
-            #     **🧪 Test Examples:**
-            #     **🟢 Normal:**
-            #     - "Hello, today's weather is nice."
-            #     - "This movie was really fun."
-            #     **🟡 Offensive:**
-            #     - "How can you do such a stupid thing"
-            #     - "Don't say such a stupid thing"
-            #     **🟠 L1_hate (Implicit):**
-            #     - "Those people are all the same"
-            #     - "Prejudicial expression towards a specific group"
-            #     **🔴 L2_hate (Explicit):**
-            #     - "All women are useless"
-            #     - "People with disabilities are a burden to society"
-            #     """)
-
-        with gr.Row():
-            with gr.Column():
-                result_output = gr.Markdown(
-                    label="Mitigation Result",
-                    value="Input text and click the above button."
-                )
-
-            with gr.Column():
-                mitigation_output = gr.Markdown(
-                    label="Mitigation Suggestion",
-                    value="Based on the analysis result, mitigation suggestions will be provided."
-                )
-
-        # Event handlers
-        analyze_btn.click(
-            fn=service.detect_hate_speech,
-            inputs=[input_text, strategy],
-            outputs=[result_output, mitigation_output]
-        )
-
-        # Allow analysis via Enter key
-        input_text.submit(
-            fn=service.detect_hate_speech,
-            inputs=[input_text, strategy],
-            outputs=[result_output, mitigation_output]
-        )
-
-        # gr.Markdown("""
-        # ---
-        # **Model Information:**
-        # - Detection Model: KcELECTRA-base (Validation Accuracy: 67.67%)
-        # - Mitigation Model: Blossom LLM (llama-3.2-Korean-Bllossom-3B)
-        # - Training Data: K-HATERS Dataset
-        # """)
-
-    return demo
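Before the new file: a quick worked example of the candidate-selection arithmetic used by the removed Guided+Reflect loop above (the metric values are invented; the weights and early-stop thresholds are the ones in that code):

```python
candidates = [
    {"toxicity": 0.15, "bertscore": 0.82, "ppl": 140.0},   # fluent, but barely less toxic
    {"toxicity": 0.42, "bertscore": 0.74, "ppl": 65.0},    # satisfies all three early-stop thresholds
]
for c in candidates:
    c["total"] = c["toxicity"] + c["bertscore"] - 0.01 * c["ppl"]
    c["early_stop"] = c["toxicity"] > 0.3 and c["bertscore"] > 0.7 and c["ppl"] < 100
print(candidates[0]["total"], candidates[1]["total"])        # -0.43 vs. 0.51 -> the second candidate wins
```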
import gradio as gr
import torch
+from detector import detect_spans, KHatersModelCRF
+from generator import critic_score, mitigate_with_strategy
+from huggingface_hub import hf_hub_download  # added for loading the checkpoint from the Hub
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, AutoConfig
import numpy as np

from bert_score import score as bert_score_fn
import re
+# --- placeholder helper functions for score calculation ---
+# 1. BERTScore (semantic similarity)
def calc_bertscore(orig_text, rewritten_text):
    P, R, F1 = bert_score_fn([rewritten_text], [orig_text], lang="ko")
    return round(F1[0].item(), 3)

+# 2. PPL (perplexity: linguistic fluency)
+# Uses a Korean causal LM such as KoGPT2 (model/tokenizer are loaded only once)
+_kogpt2_tokenizer = None
+_kogpt2_model = None
+
def calc_ppl(text):
+    global _kogpt2_tokenizer, _kogpt2_model
+    if _kogpt2_tokenizer is None or _kogpt2_model is None:
+        _kogpt2_tokenizer = AutoTokenizer.from_pretrained("skt/kogpt2-base-v2")
+        _kogpt2_model = AutoModelForCausalLM.from_pretrained("skt/kogpt2-base-v2")
+        _kogpt2_model.eval()
+    encodings = _kogpt2_tokenizer(text, return_tensors="pt")
+    input_ids = encodings.input_ids
+    with torch.no_grad():
+        outputs = _kogpt2_model(input_ids, labels=input_ids)
+    loss = outputs.loss
+    ppl = torch.exp(loss).item()
+    return round(ppl, 3)
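A quick usage sketch for the two helpers above (the example strings are hypothetical; the first call to calc_ppl downloads skt/kogpt2-base-v2, and bert_score downloads its multilingual scoring model on first use):

```python
original = "야, 이런 것도 못해?"                           # hypothetical blunt input
rewritten = "이 부분은 조금만 더 신경 써 주시면 좋겠어요."    # hypothetical polite rewrite

print("PPL(rewritten):", calc_ppl(rewritten))                  # lower = more fluent under KoGPT2
print("BERTScore F1:", calc_bertscore(original, rewritten))    # closer to 1 = meaning better preserved
```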
+# 3. ΔTox (toxicity reduction)
def calc_toxicity_reduction(orig_text, rewritten_text, detector_model, detector_tokenizer):
+    # score both sentences with critic_score (softmax score of the "normal" class)
+    orig_score = critic_score(orig_text, detector_model, detector_tokenizer)
+    rewritten_score = critic_score(rewritten_text, detector_model, detector_tokenizer)
+    delta = orig_score - rewritten_score
+    return round(delta, 3)
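critic_score is imported from the repo's generator module and is not shown in this diff; the comment above suggests it returns the detector's softmax probability for the "normal" class. A hedged sketch under that assumption, which also makes the sign convention explicit (a rewrite that looks cleaner to the detector yields a negative delta above, the opposite of the old 1 − P(normal) formulation):

```python
import torch

def critic_score_sketch(text, detector_model, detector_tokenizer):
    # Assumption: mirrors generator.critic_score by returning P(normal) from the
    # detector's sentence head (index 0 in ["normal", "offensive", "L1_hate", "L2_hate"]).
    enc = detector_tokenizer(text, return_tensors="pt", truncation=True,
                             padding="max_length", max_length=128)
    with torch.no_grad():
        out = detector_model(input_ids=enc["input_ids"], attention_mask=enc["attention_mask"])
    probs = torch.softmax(out["sentence_logits"][0], dim=-1)
    return probs[0].item()
```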
+# prepare the detector
+base_model_name = "beomi/KcELECTRA-base"
+detector_tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_fast=False)
+num_labels, num_bio_labels, num_targets = 4, 5, 9  # 4 sentence labels, 5 BIO tags, 9 target groups (pre-fix values were 5, 3, 9)
+detector_model = KHatersModelCRF(base_model_name, num_labels, num_bio_labels, num_targets)
+# ckpt = torch.load("/root/PROJECT-ROOT/backend/kcelectra_crf_ckpt/best_model.pt", map_location="cpu")
+ckpt_path = hf_hub_download(repo_id="alohaboy/hate_detector_ko", filename="best_model.pt", use_auth_token=True)
+ckpt = torch.load(ckpt_path, map_location="cpu")
+if "model_state_dict" in ckpt:
+    state_dict = ckpt["model_state_dict"]
+else:
+    state_dict = ckpt
+# patch to rename old-format state_dict keys to the new CRF key names
+crf_key_map = {
+    "crf.trans_matrix": "crf.transitions",
+    "crf.start_trans": "crf.start_transitions",
+    "crf.end_trans": "crf.end_transitions",
+}
+for old_key, new_key in crf_key_map.items():
+    if old_key in state_dict:
+        state_dict[new_key] = state_dict.pop(old_key)
+# delete any old-format keys that are still left over
+for k in ["crf.trans_matrix", "crf.start_trans", "crf.end_trans"]:
+    if k in state_dict:
+        del state_dict[k]
+detector_model.load_state_dict(state_dict)
+# detector_model.to("cuda")  # move to GPU (removed; the Space runs on CPU)
+detector_model.eval()

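use_auth_token=True still works, but newer huggingface_hub releases prefer the token argument. A hedged alternative for the checkpoint download above (HF_TOKEN is a hypothetical environment variable holding a read token for the private repo):

```python
import os
from huggingface_hub import hf_hub_download

ckpt_path = hf_hub_download(
    repo_id="alohaboy/hate_detector_ko",
    filename="best_model.pt",
    token=os.environ.get("HF_TOKEN"),   # falls back to the cached login when unset
)
```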
+# prepare the LLM
+LLM_MODEL_NAME = "Bllossom/llama-3.2-Korean-Bllossom-3B"
+llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME, use_auth_token=True)
+llm_model = AutoModelForCausalLM.from_pretrained(
+    LLM_MODEL_NAME,
+    torch_dtype=torch.float32,  # works on CPU
+    device_map="cpu",           # run on the CPU
+    use_auth_token=True
+)
+llm_model.to("cpu")
+print("llm_model device:", llm_model.device)

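The prompts below are passed to the instruction-tuned Bllossom model as plain strings. If the tokenizer ships a chat template (plausible for a Llama-3.2 derivative, but not verified here), transformers' apply_chat_template can wrap the same instruction in the model's expected chat format:

```python
def chat_generate(instruction: str, max_new_tokens: int = 64) -> str:
    # assumes llm_tokenizer has a chat template; otherwise keep the plain-string prompts below
    messages = [{"role": "user", "content": instruction}]
    input_ids = llm_tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(llm_model.device)
    gen_ids = llm_model.generate(
        input_ids,
        do_sample=True, top_p=0.9, max_new_tokens=max_new_tokens,
        pad_token_id=llm_tokenizer.eos_token_id,
    )
    return llm_tokenizer.decode(gen_ids[0, input_ids.shape[1]:], skip_special_tokens=True)
```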
+def phrase_replacement(text, spans):
+    # mitigate only the detected hate spans (example: replace them with "[완화]")
+    new_text = text
+    for span in spans:
+        new_text = new_text.replace(span['text'], "[완화]")
+    return new_text
+
+def unguided_rewrite(text):
+    # plain LLM paraphrase without an explicit mitigation prompt (example)
+    prompt = f"다음 문장을 더 부드럽게 바꿔주세요: {text}"
+    inputs = llm_tokenizer(prompt, return_tensors="pt").to(llm_model.device)
+    gen_ids = llm_model.generate(
+        **inputs,
+        do_sample=True,
+        top_k=50,
+        top_p=0.9,
+        num_return_sequences=1,
+        max_new_tokens=64,
+        pad_token_id=llm_tokenizer.pad_token_id,
+        eos_token_id=llm_tokenizer.eos_token_id
+    )
+    return llm_tokenizer.decode(gen_ids[0], skip_special_tokens=True)

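phrase_replacement only needs spans shaped like the dicts detect_spans produces. A tiny usage sketch with a hypothetical span list (the placeholder token "[완화]" is the one hard-coded above):

```python
example_spans = [{"text": "바보"}]                 # hypothetical detected span
print(phrase_replacement("야, 바보야. 왜 그래?", example_spans))
# -> "야, [완화]야. 왜 그래?"
```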
+def hate_speech_demo(text, strategy):
+    det = detect_spans(text)
+    detector_result = {
+        "완화문장": text,
+        "유해카테고리": det["category"],
+        "탐지스팬들": det["spans"],
+        "타겟예측": det["targets"]
+    }
+    # apply the selected mitigation strategy
+    if det["category"] == "normal":
+        output = text
+        candidate_info = "-"
+    elif strategy in ("Guided", "Full Rewrite"):  # "Guided" is the name exposed in the UI below
+        num_return_sequences = 5
+        current_text = text
+        best_candidate = current_text
+        best_score = 0.0
+        all_candidates = []
+        all_scores = []
+        for _ in range(3):  # max_iter=3
+            prompt = (
+                f"문장을 완화해주세요.\n\n"
+                f"- 원문: {current_text}\n"
+                f"- 혐오 표현: {[span['text'] for span in det['spans']]}\n"
+                f"- 혐오 유형: {det['category']}\n"
+                f"- 타겟 그룹: {det['targets']}\n\n"
+                f"위 내용을 고려하여, 혐오 표현을 더 평화로운 표현으로 완화해주세요."
+            )
+            inputs = llm_tokenizer(prompt, return_tensors="pt").to(llm_model.device)
+            gen_ids = llm_model.generate(
+                **inputs,
+                do_sample=True,
+                top_k=50,
+                top_p=0.9,
+                num_return_sequences=num_return_sequences,
+                max_new_tokens=64,
+                pad_token_id=llm_tokenizer.pad_token_id,
+                eos_token_id=llm_tokenizer.eos_token_id
+            )
+            candidates = [llm_tokenizer.decode(g, skip_special_tokens=True) for g in gen_ids]
+            scores = [critic_score(c, detector_model, detector_tokenizer) for c in candidates]
+            all_candidates.extend(candidates)
+            all_scores.extend(scores)
+            best_idx = int(torch.tensor(scores).argmax())
+            if scores[best_idx] >= 0.7:
+                best_candidate = candidates[best_idx]
+                best_score = scores[best_idx]
+                break
+            best_candidate = candidates[best_idx]
+            best_score = scores[best_idx]
+            current_text = best_candidate
+        output = best_candidate
+        candidate_info = "\n".join([f"[{i+1}] {c} (score={s:.3f})" for i, (c, s) in enumerate(zip(all_candidates, all_scores))])
+    elif strategy == "Phrase Replacement":
+        output = phrase_replacement(text, det["spans"])
+        candidate_info = "-"
+    elif strategy == "Unguided":
+        output = unguided_rewrite(text)
+        candidate_info = "-"
+    else:
+        output = text
+        candidate_info = "-"
+    # compute the scores
+    tox_score = calc_toxicity_reduction(text, output, detector_model, detector_tokenizer)
+    ppl_score = calc_ppl(output)
+    bert_score = calc_bertscore(text, output)
+    soft_or_hard = det.get("soft_or_hard", "-")
+    return (
+        det["category"],
+        str(det["spans"]),
+        str(det["targets"]),
+        output,
+        f"Toxicity Reduction: {tox_score}",
+        f"PPL: {ppl_score}",
+        f"BERTScore: {bert_score}",
+        candidate_info,
+        soft_or_hard
+    )

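hate_speech_demo returns nine values in the order shown above, while the Gradio outputs list further down consumes only eight of them; that is why the hate_speech_multilingual wrapper reorders the tuple and drops the soft/hard flag. A small unpacking sketch (the input is hypothetical, and running it loads the models):

```python
(category, spans, targets, rewritten,
 tox_msg, ppl_msg, bert_msg, cand_info, soft_or_hard) = hate_speech_demo(
    "예시 문장입니다", "Phrase Replacement"
)
print(category, rewritten, tox_msg)
```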
+with gr.Blocks(theme=gr.themes.Monochrome(primary_hue="blue", secondary_hue="slate")) as demo:
+    gr.HTML("""
+    <style>
+    .modern-input textarea { font-size: 1.1em; border-radius: 8px; }
+    .modern-btn { background: linear-gradient(90deg,#2563eb,#60a5fa); color: white; border-radius: 8px; font-weight: bold; }
+    .modern-label { font-size: 1.05em; font-weight: 600; color: #2563eb; }
+    .modern-badge { background: #e0e7ff; color: #1e293b; border-radius: 6px; padding: 0.2em 0.6em; font-weight: 500; }
+    .modern-output textarea { background: #f1f5f9; border-radius: 8px; font-size: 1.1em; }
+    .modern-score input { color: #0ea5e9; font-weight: bold; font-size: 1.1em; }
+    .modern-candidate textarea { background: #f8fafc; border-radius: 8px; }
+    </style>
+    """)
+    gr.Markdown("""
+    <div style='text-align:center; margin-bottom: 1em;'>
+    <h1 style='color:#2563eb;'>Hate Speech Mitigation Demo</h1>
+    <p style='font-size:1.1em;'>Enter a sentence and select a mitigation strategy to see the results and various scores.</p>
+    </div>
+    """)
+    with gr.Row():
+        lang = gr.Radio(["Korean", "English"], value="Korean", label="Language", container=True)
+        input_box = gr.Textbox(label="Input text", lines=2, value="야, 병신아. 이런 것도 못해?", elem_classes="modern-input")
+        strategy = gr.Radio([
+            "Guided",
+            "Phrase Replacement",
+            "Unguided"
+        ], value="Guided", label="Mitigation Strategy", container=True)  # default must be one of the listed choices
+    run_btn = gr.Button("Non-Toxic", variant="primary", elem_classes="modern-btn")
+    with gr.Row():
+        out1 = gr.Label(label="Hate type", elem_classes="modern-label")
+        out2 = gr.HighlightedText(label="Detected spans", elem_classes="modern-badge")
+        out3 = gr.Label(label="Target", elem_classes="modern-label")
+        out5 = gr.Number(label="Toxicity Reduction", elem_classes="modern-score")
+        out6 = gr.Number(label="PPL", elem_classes="modern-score")
+        out7 = gr.Number(label="BERTScore", elem_classes="modern-score")
+    with gr.Accordion("Candidate Info", open=False):
+        out8 = gr.Textbox(label="Candidates", lines=3, elem_classes="modern-candidate")
+    out4 = gr.Textbox(label="Output", lines=2, elem_classes="modern-output")

+    def hate_speech_multilingual(text, strategy, lang):
+        if lang == "Korean":
+            # reorder the Korean result to match the outputs list below (the soft/hard flag is dropped)
+            category, spans, targets, rewritten, tox, ppl, bert, cand_info, _soft_or_hard = hate_speech_demo(text, strategy)
+            return category, spans, targets, tox, ppl, bert, cand_info, rewritten
+        else:
+            # set up the English hate speech detection pipeline
+            from english_detector import detect_spans as detect_spans_en, EnglishElectraHateDetector
+            from transformers import AutoTokenizer
+            import torch
+            # build the label mappings
+            tag2id = {"O": 0, "B-HATE": 1, "I-HATE": 2, "B-OFF": 3, "I-OFF": 4}
+            id2tag = {v: k for k, v in tag2id.items()}
+            sev2id = {"NORMAL": 0, "O1": 1, "O2": 2, "H1": 3, "H2": 4}
+            id2sev = {v: k for k, v in sev2id.items()}
+            # example category list (the full HateXplain category set should be reflected in practice)
+            cat_list = ["African", "Arab", "Asian", "Atheist", "Buddhist", "Christian", "Female", "Hispanic", "Homosexual_gay_or_lesbian", "Immigrant", "Jewish", "Male", "Other_religions", "Physical_disability", "Transgender", "Other"]
+            cat2id = {c: i for i, c in enumerate(cat_list)}
+            id2cat = {v: k for k, v in cat2id.items()}
+            # prepare the model/tokenizer (note: this reloads the checkpoint on every call)
+            base_model_name = "google/electra-base-discriminator"
+            tokenizer = AutoTokenizer.from_pretrained(base_model_name)
+            model = EnglishElectraHateDetector(base_model_name, num_severity=5, num_bio_labels=5, num_targets=len(cat2id))
+            ckpt = torch.load("/root/PROJECT-ROOT/backend/english_detector_ckpt/best_model.pt", map_location="cuda")
+            model.load_state_dict(ckpt)
+            model.eval()
+            model.to("cuda")  # only the detection model is moved to the GPU
+            # run detection
+            det = detect_spans_en(text, model, tokenizer, tag2id, id2tag, sev2id, id2sev, cat2id, id2cat, device="cuda")
+            # convert for HighlightedText: [(text, class)]
+            spans_for_highlight = [(span["text"], span["label"]) for span in det["spans"] if span.get("text")]
+            # convert the targets
+            targets_str = ", ".join(det["targets"]) if det["targets"] else "-"
+            # per-strategy mitigation (the original text is returned as-is here; hook up an English generator if needed)
+            output = text
+            candidate_info = "-"
+            # score placeholders
+            tox_score = "-"
+            ppl_score = "-"
+            bert_score = "-"
+            soft_or_hard = det.get("soft_or_hard", "-")
+            return (
+                det["category"] if "category" in det else det.get("severity", "-"),
+                spans_for_highlight,
+                targets_str,
+                tox_score,
+                ppl_score,
+                bert_score,
+                candidate_info,
+                output
+            )

+    run_btn.click(
+        hate_speech_multilingual,
+        inputs=[input_box, strategy, lang],
+        outputs=[out1, out2, out3, out5, out6, out7, out8, out4]
+    )

+demo.launch(share=True)
+# leftover module-level debug prints (these variables are not defined at this scope), kept disabled:
+# print("input_ids:", input_ids)
+# print("attention_mask:", attention_mask)
+# print("decode_mask:", decode_mask)
+# print("bio_feats.shape:", bio_feats.shape)