"""Fine-tune an Audio Spectrogram Transformer checkpoint on a local audio folder."""

from datasets import load_dataset

# The "audiofolder" builder reads the audio files found under data/.
dataset = load_dataset(path="audiofolder", data_dir="data")
import evaluate
import numpy as np
from datasets import Audio
from transformers import ASTFeatureExtractor
from transformers import ASTForAudioClassification
from transformers import Trainer
from transformers import TrainingArguments
# --- Training hyper-parameters ---------------------------------------------
batch_size = 8  # per-device batch size, used for both training and evaluation
gradient_accumulation_steps = 1
num_train_epochs = 10

# --- Task definition -------------------------------------------------------
# NOTE(review): the order here must match the class ids the dataset assigns
# to its labels -- confirm against dataset["train"].features["label"].
labels = ["noise", "speech"]
# Derived from `labels` so the two can never drift apart.
num_labels = len(labels)
max_duration = 5  # seconds; multiplied by the sampling rate in preprocessing
model_id = "bookbot/distil-ast-audioset"  # pretrained checkpoint to fine-tune
model_name = "speechVSnoise"  # passed to TrainingArguments as the output dir

# Class ids are plain ints in both directions, so
# id2label[label2id[name]] == name holds without any str/int conversion.
id2label = dict(enumerate(labels))
label2id = {label: class_id for class_id, label in id2label.items()}
# Classification model initialised from the pretrained checkpoint and sized
# for this task's label set.
model = ASTForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    # Let loading proceed when checkpoint weights do not match the shapes
    # implied by num_labels, instead of raising on the mismatch.
    ignore_mismatched_sizes=True,
)

# Feature extractor loaded from the same checkpoint as the model.
feature_extractor = ASTFeatureExtractor.from_pretrained(
    model_id,
    do_normalize=True,
    return_attention_mask=False,
)
def preprocess_function(examples):
    """Turn a batch of decoded audio clips into model inputs.

    Args:
        examples: A batch whose ``"audio"`` entries each expose the raw
            waveform under the ``"array"`` key.

    Returns:
        The output of ``feature_extractor`` for the whole batch.
    """
    target_rate = feature_extractor.sampling_rate
    waveforms = [clip["array"] for clip in examples["audio"]]

    # NOTE(review): the clips are declared to be at the extractor's sampling
    # rate without being checked or resampled here -- confirm the audio
    # really is at that rate by the time it reaches this function.
    # NOTE(review): confirm ASTFeatureExtractor honours a per-call
    # max_length/truncation; it may only use the max_length it was built with.
    return feature_extractor(
        waveforms,
        sampling_rate=target_rate,
        max_length=int(target_rate * max_duration),
        truncation=True,
    )
# Decode every clip at the rate the feature extractor expects.
# preprocess_function tells the extractor the audio is already at
# feature_extractor.sampling_rate, so files recorded at any other rate must
# be resampled first; casting the column makes decoding resample on the fly.
resampled_dataset = dataset.cast_column(
    "audio", Audio(sampling_rate=feature_extractor.sampling_rate)
)

# Run feature extraction over every split.
# NOTE(review): batch_size=1674 decodes that many clips per call --
# presumably the size of the dataset; lower it if memory becomes a problem.
dataset_encoded = resampled_dataset.map(
    preprocess_function,
    batched=True,
    batch_size=1674,
    num_proc=1,
)
# Accuracy metric shared by every evaluation pass.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (the reference class ids).

    Returns:
        The result of ``metric.compute`` for the predicted class ids.
    """
    scores, references = eval_pred.predictions, eval_pred.label_ids
    # The predicted class is the highest-scoring column of each row.
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)
# Training configuration; checkpoints and logs go to a directory named after
# the model.
training_args = TrainingArguments(
    output_dir=model_name,
    # Optimisation schedule.
    num_train_epochs=num_train_epochs,
    learning_rate=5e-5,
    warmup_ratio=0.1,
    # Batching.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # Evaluate and checkpoint once per epoch; the two strategies match, which
    # load_best_model_at_end needs in order to pick a saved checkpoint.
    # NOTE(review): newer transformers releases rename evaluation_strategy to
    # eval_strategy -- confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # No metric_for_best_model is set, so "best" is judged by the library's
    # default (evaluation loss, per the TrainingArguments documentation).
    load_best_model_at_end=True,
    logging_steps=5,
)
from transformers import Trainer

# Evaluate on clips the model is not trained on; otherwise the per-epoch
# scores (and the checkpoint chosen by load_best_model_at_end) only measure
# how well the training data was memorised.
held_out_name = next(
    (name for name in ("validation", "test") if name in dataset_encoded),
    None,
)
if held_out_name is not None:
    train_split = dataset_encoded["train"]
    eval_split = dataset_encoded[held_out_name]
else:
    # Only a "train" split exists: carve out a reproducible 20% hold-out.
    holdout = dataset_encoded["train"].train_test_split(test_size=0.2, seed=42)
    train_split, eval_split = holdout["train"], holdout["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    # NOTE(review): newer transformers releases prefer `processing_class`
    # over `tokenizer` -- confirm against the installed version.
    tokenizer=feature_extractor,
    # Report accuracy at every evaluation instead of loss alone.
    compute_metrics=compute_metrics,
)

trainer.train()