from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
from datasets import load_dataset

model_name = "microsoft/Multilingual-MiniLM-L12-H384"
dataset = load_dataset("Goodmotion/spam-mail")
tokenizer = AutoTokenizer.from_pretrained(model_name)


# map the string labels to integer ids
def encode_labels(data):
    label_map = {"SPAM": 1, "NOSPAM": 0}
    data["label"] = label_map[data["label"]]
    return data


def tokenize_data(data):
    return tokenizer(
        data["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )


# encode the labels, then tokenize the dataset
dataset = dataset.map(encode_labels)
tokenized_dataset = dataset.map(tokenize_data, batched=True)

# define the model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# re-initialize the classification head
model.classifier.weight.data.normal_(mean=0.0, std=0.02)
model.classifier.bias.data.zero_()

training_args = TrainingArguments(
    output_dir="./results",
    # learning rate
    learning_rate=5e-5,
    # 16 examples per device
    per_device_train_batch_size=16,
    # three passes over the training data
    num_train_epochs=3,
    # weight-decay coefficient for regularization
    weight_decay=0.01,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
)

# train the model
trainer.train()

# save the model and the tokenizer
model.save_pretrained("./spam-classifier")
tokenizer.save_pretrained("./spam-classifier")
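
# A minimal inference sketch for the saved classifier, assuming the
# "./spam-classifier" directory produced by the save calls above. Since the
# model config was not given an id2label mapping, the pipeline reports
# generic names: LABEL_1 corresponds to SPAM and LABEL_0 to NOSPAM, mirroring
# label_map above. The sample message is a hypothetical input.
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="./spam-classifier",
    tokenizer="./spam-classifier",
)

# returns something like [{'label': 'LABEL_1', 'score': ...}]
print(classifier("You won a free iPhone! Click here to claim your prize."))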