import gradio as gr
import torch
from transformers import AlbertForSequenceClassification, AlbertTokenizer
import os
import gdown
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define Google Drive file IDs for each model's config and safetensors
model_file_ids = {
    "sentiment": {
        "config": "11jwMJmQMGkiVZWBRQ5BLFyot1520FYIQ",
        "model": "115N5yiu9lfw4uJE5YxHNoHauHeYSSusu"
    },
    "emotion": {
        "config": "1dSxK10jbZyRpMDCm6MCRf9Jy0weOzLP9",
        "model": "1Y3rTtPfo4zu28OhsRybdJF6czZN46I0Y"
    },
    "hate_speech": {
        "config": "1QTejES8BZQs3qnxom9ymiZkLRUAZ91NP",
        "model": "1ol2xO4XbdHwP_HHCYsnX8iVutA6javy_"
    },
    "sarcasm": {
        "config": "1ypl0j1Yp_-0szR4-P1-0CMyDYBwUn5Wz",
        "model": "1pbByLvTIHO_sT9HMeypvXbsdHsLVzTdk"
    }
}

# Define local directory to store downloaded models
save_dir = "./saved_models"
os.makedirs(save_dir, exist_ok=True)

# Download individual model files
for task, files in model_file_ids.items():
    output_dir = os.path.join(save_dir, task)
    os.makedirs(output_dir, exist_ok=True)
    
    config_path = os.path.join(output_dir, "config.json")
    model_path = os.path.join(output_dir, "model.safetensors")
    
    if not os.path.exists(config_path):
        logger.info(f"Downloading {task} config.json from Google Drive...")
        gdown.download(f"https://drive.google.com/uc?id={files['config']}", config_path, quiet=False)
    else:
        logger.info(f"Config for {task} already exists, skipping download.")
    
    if not os.path.exists(model_path):
        logger.info(f"Downloading {task} model.safetensors from Google Drive...")
        gdown.download(f"https://drive.google.com/uc?id={files['model']}", model_path, quiet=False)
    else:
        logger.info(f"Model for {task} already exists, skipping download.")

# Define model paths
tasks = ["sentiment", "emotion", "hate_speech", "sarcasm"]
model_paths = {task: f"{save_dir}/{task}" for task in tasks}

# Define label mappings
label_mappings = {
    "sentiment": ["negative", "neutral", "positive"],
    "emotion": ["happy", "sad", "angry", "fear"],
    "hate_speech": ["no", "yes"],
    "sarcasm": ["no", "yes"]
}

# Load tokenizer
logger.info("Loading tokenizer...")
try:
    # Explicitly use AlbertTokenizer with SentencePiece
    tokenizer = AlbertTokenizer.from_pretrained("ai4bharat/indic-bert", use_fast=False)
except Exception as e:
    logger.error(f"Failed to load tokenizer: {str(e)}")
    raise
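
# Quick sanity check (an added, illustrative step): tokenize a short Telugu
# sample so a broken SentencePiece install fails loudly before model loading.
_sample = tokenizer("నమస్తే", return_tensors="pt")
logger.info(f"Tokenizer sanity check: {_sample['input_ids'].shape[1]} tokens for sample input.")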

# Load all models
models = {}
for task in tasks:
    logger.info(f"Loading model for {task}...")
    if not os.path.exists(model_paths[task]):
        raise FileNotFoundError(f"Model directory {model_paths[task]} not found.")
    try:
        models[task] = AlbertForSequenceClassification.from_pretrained(model_paths[task])
    except Exception as e:
        logger.error(f"Failed to load model for {task}: {str(e)}")
        raise
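
# Added sketches (not part of the original script): a label-count consistency
# check, plus optional GPU placement when CUDA is available.
for task in tasks:
    expected = len(label_mappings[task])
    actual = models[task].config.num_labels
    if actual != expected:
        logger.warning(f"{task}: model head has {actual} labels but the mapping lists {expected}.")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Running inference on {device}")
for task in tasks:
    models[task] = models[task].to(device)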

# Function to predict for a single task
def predict_task(text, task, model, tokenizer, max_length=128):
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )
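    # Keep inputs on the same device as the model (a no-op on CPU).
    inputs = {k: v.to(model.device) for k, v in inputs.items()}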
    
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1).squeeze().cpu().numpy()
    
    labels = label_mappings[task]
    return {label: f"{prob*100:.2f}%" for label, prob in zip(labels, probabilities)}
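
# Illustrative call (hypothetical numbers, shown only for the return shape):
#   predict_task("నమస్తే", "sentiment", models["sentiment"], tokenizer)
#   -> {"negative": "4.10%", "neutral": "12.35%", "positive": "83.55%"}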

# Gradio interface function
def predict_all_tasks(text):
    if not text.strip():
        return "Please enter some text."
    
    results = {}
    for task in tasks:
        results[task] = predict_task(text, task, models[task], tokenizer)
    
    output = ""
    for task, probs in results.items():
        output += f"\n{task.capitalize()} Prediction:\n"
        for label, prob in probs.items():
            output += f"  {label}: {prob}\n"
    
    return output

# Create Gradio interface
iface = gr.Interface(
    fn=predict_all_tasks,
    inputs=gr.Textbox(lines=2, placeholder="Enter Telugu text here..."),
    outputs="text",
    title="Telugu Text Analysis",
    description="Enter Telugu text to predict sentiment, emotion, hate speech, and sarcasm."
)

if __name__ == "__main__":
    logger.info("Launching Gradio interface...")
    iface.launch(server_name="0.0.0.0", server_port=7860)