from datasets import load_dataset
import numpy as np
import evaluate
from transformers import (
    ASTFeatureExtractor,
    ASTForAudioClassification,
    Trainer,
    TrainingArguments,
)

# Load audio files from the local "data" directory (audiofolder layout:
# data/<label>/<file>.wav) and hold out 10% of the examples for evaluation.
dataset = load_dataset("audiofolder", data_dir="data")
dataset = dataset["train"].train_test_split(seed=42, shuffle=True, test_size=0.1)

# Hyperparameters and label mappings.
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 10
labels = ["noise", "speech"]
num_labels = len(labels)
max_duration = 5  # seconds of audio per example
model_id = "bookbot/distil-ast-audioset"
model_name = "speechVSnoise"

label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

# Replace the pretrained AudioSet classification head with a fresh two-class
# head; ignore_mismatched_sizes allows the new head's shape to differ from
# the checkpoint's.
model = ASTForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)

feature_extractor = ASTFeatureExtractor.from_pretrained(
    model_id,
    do_normalize=True,
    return_attention_mask=False,
)

def preprocess_function(examples):
    # Convert raw waveforms into the model's spectrogram inputs at the
    # extractor's expected sampling rate (16 kHz for AST).
    audio_arrays = [x["array"] for x in examples["audio"]]
    return feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=int(feature_extractor.sampling_rate * max_duration),
        truncation=True,
    )

dataset_encoded = dataset.map(
    preprocess_function,
    batched=True,
    batch_size=1674,
    num_proc=1,
)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

training_args = TrainingArguments(
    model_name,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # push_to_hub=True,
)

trainer = Trainer(
    model,
    training_args,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["test"],  # evaluate on the held-out split, not the training data
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)
trainer.train()
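
# --- Optional: sanity-check the fine-tuned model (a minimal sketch) ---
# Assumes training has finished and, thanks to load_best_model_at_end=True,
# the best checkpoint is back in trainer.model. "example.wav" is a placeholder
# path, not a file from the original setup; decoding local audio files through
# the pipeline requires ffmpeg to be installed.
from transformers import pipeline

trainer.save_model(model_name)  # persist the fine-tuned weights alongside the checkpoints

classifier = pipeline(
    "audio-classification",
    model=trainer.model,
    feature_extractor=feature_extractor,
)
# Returns the top predictions with scores, using the id2label names set above,
# e.g. [{"label": "speech", "score": 0.98}, {"label": "noise", "score": 0.02}]
print(classifier("example.wav"))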