import gradio as gr
import torch
from transformers import AlbertForSequenceClassification, AlbertTokenizer
import os
import gdown
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define Google Drive file IDs for each model's config and safetensors
model_file_ids = {
    "sentiment": {
        "config": "11jwMJmQMGkiVZWBRQ5BLFyot1520FYIQ",
        "model": "115N5yiu9lfw4uJE5YxHNoHauHeYSSusu"
    },
    "emotion": {
        "config": "1dSxK10jbZyRpMDCm6MCRf9Jy0weOzLP9",
        "model": "1Y3rTtPfo4zu28OhsRybdJF6czZN46I0Y"
    },
    "hate_speech": {
        "config": "1QTejES8BZQs3qnxom9ymiZkLRUAZ91NP",
        "model": "1ol2xO4XbdHwP_HHCYsnX8iVutA6javy_"
    },
    "sarcasm": {
        "config": "1ypl0j1Yp_-0szR4-P1-0CMyDYBwUn5Wz",
        "model": "1pbByLvTIHO_sT9HMeypvXbsdHsLVzTdk"
    }
}

# Define local directory to store downloaded models
save_dir = "./saved_models"
os.makedirs(save_dir, exist_ok=True)

# Download individual model files
for task, files in model_file_ids.items():
    output_dir = os.path.join(save_dir, task)
    os.makedirs(output_dir, exist_ok=True)
    
    config_path = os.path.join(output_dir, "config.json")
    model_path = os.path.join(output_dir, "model.safetensors")
    
    if not os.path.exists(config_path):
        logger.info(f"Downloading {task} config.json from Google Drive...")
        gdown.download(f"https://drive.google.com/uc?id={files['config']}", config_path, quiet=False)
    else:
        logger.info(f"Config for {task} already exists, skipping download.")
    
    if not os.path.exists(model_path):
        logger.info(f"Downloading {task} model.safetensors from Google Drive...")
        gdown.download(f"https://drive.google.com/uc?id={files['model']}", model_path, quiet=False)
    else:
        logger.info(f"Model for {task} already exists, skipping download.")

# Define model paths
tasks = ["sentiment", "emotion", "hate_speech", "sarcasm"]
model_paths = {task: f"{save_dir}/{task}" for task in tasks}

# Define label mappings
label_mappings = {
    "sentiment": ["negative", "neutral", "positive"],
    "emotion": ["happy", "sad", "angry", "fear"],
    "hate_speech": ["no", "yes"],
    "sarcasm": ["no", "yes"]
}

# Load tokenizer
logger.info("Loading tokenizer...")
try:
    # Explicitly use AlbertTokenizer with SentencePiece
    tokenizer = AlbertTokenizer.from_pretrained("ai4bharat/indic-bert", use_fast=False)
except Exception as e:
    logger.error(f"Failed to load tokenizer: {str(e)}")
    raise
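
# Quick sanity check (an added, illustrative step): tokenize a short Telugu
# sample so a broken SentencePiece install fails loudly before model loading.
_sample = tokenizer("నమస్తే", return_tensors="pt")
logger.info(f"Tokenizer sanity check: {_sample['input_ids'].shape[1]} tokens for sample input.")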

# Load all models
models = {}
for task in tasks:
    logger.info(f"Loading model for {task}...")
    if not os.path.exists(model_paths[task]):
        raise FileNotFoundError(f"Model directory {model_paths[task]} not found.")
    try:
        models[task] = AlbertForSequenceClassification.from_pretrained(model_paths[task])
    except Exception as e:
        logger.error(f"Failed to load model for {task}: {str(e)}")
        raise
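
# Added sketches (not part of the original script): a label-count consistency
# check, plus optional GPU placement when CUDA is available.
for task in tasks:
    expected = len(label_mappings[task])
    actual = models[task].config.num_labels
    if actual != expected:
        logger.warning(f"{task}: model head has {actual} labels but the mapping lists {expected}.")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Running inference on {device}")
for task in tasks:
    models[task] = models[task].to(device)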

# Function to predict for a single task
def predict_task(text, task, model, tokenizer, max_length=128):
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )
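    # Keep inputs on the same device as the model (a no-op on CPU).
    inputs = {k: v.to(model.device) for k, v in inputs.items()}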
    
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1).squeeze().cpu().numpy()
    
    labels = label_mappings[task]
    return {label: f"{prob*100:.2f}%" for label, prob in zip(labels, probabilities)}
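
# Illustrative call (hypothetical numbers, shown only for the return shape):
#   predict_task("నమస్తే", "sentiment", models["sentiment"], tokenizer)
#   -> {"negative": "4.10%", "neutral": "12.35%", "positive": "83.55%"}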

# Gradio interface function
def predict_all_tasks(text):
    if not text.strip():
        return "Please enter some text."
    
    results = {}
    for task in tasks:
        results[task] = predict_task(text, task, models[task], tokenizer)
    
    output = ""
    for task, probs in results.items():
        output += f"\n{task.capitalize()} Prediction:\n"
        for label, prob in probs.items():
            output += f"  {label}: {prob}\n"
    
    return output

# Create Gradio interface
iface = gr.Interface(
    fn=predict_all_tasks,
    inputs=gr.Textbox(lines=2, placeholder="Enter Telugu text here..."),
    outputs="text",
    title="Telugu Text Analysis",
    description="Enter Telugu text to predict sentiment, emotion, hate speech, and sarcasm."
)

if __name__ == "__main__":
    logger.info("Launching Gradio interface...")
    iface.launch(server_name="0.0.0.0", server_port=7860)