pere committed on
Commit 1cf8cf6 · 1 Parent(s): 8b460b7
Files changed (4)
  1. run.sh +6 -6
  2. run_npsc.sh +37 -0
  3. run_nst.sh +38 -0
  4. run_whisper_finetuning.py +78 -31
run.sh CHANGED
@@ -2,18 +2,17 @@
 python run_whisper_finetuning.py \
     --model_name_or_path="openai/whisper-small" \
     --output_dir="../whisper-testrun1" \
-    --repo_id="NbAiLab/whisper-testrun1" \
     --overwrite_output_dir=True \
     --language="Norwegian" \
     --task="transcribe" \
-    --dataset_name="NbAiLab/NPSC" \
-    --dataset_config="16K_mp3" \
+    --dataset_name="mozilla-foundation/common_voice_11_0" \
+    --dataset_config="nn-NO" \
     --do_train=True \
     --do_eval=True \
     --audio_column_name="audio" \
-    --text_column_name="normsentence_text" \
-    --per_device_train_batch_size=16 \
-    --per_device_train_batch_size=16 \
+    --text_column_name="sentence" \
+    --per_device_train_batch_size=32 \
+    --per_device_eval_batch_size=32 \
     --learning_rate=2e-5 \
     --warmup_steps=500 \
     --max_steps=10000 \
@@ -23,6 +22,7 @@ python run_whisper_finetuning.py \
     --evaluation_strategy="steps" \
     --save_steps=1000 \
     --eval_steps=1000 \
+    --max_eval_samples=10 \
     --logging_steps=250 \
     --fp16=True \
     --load_best_model_at_end=True \
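run.sh now points at Common Voice 11 (nn-NO), whose transcript column is "sentence" and whose audio is typically distributed as 48 kHz mp3, while the Python changes further down comment out the 16 kHz cast on the grounds that the data "is already 16000" — an assumption that holds for NPSC's 16K_mp3 configs but not obviously for Common Voice. A minimal streaming peek (not part of the commit) to verify both column names and sampling rate before committing to a 10 000-step run:

    # Hypothetical pre-flight check: stream one example and confirm the
    # column names and sampling rate that run.sh's flags assume.
    from datasets import load_dataset

    ds = load_dataset(
        "mozilla-foundation/common_voice_11_0",
        "nn-NO",
        split="train",
        streaming=True,
        use_auth_token=True,  # same auth style run_whisper_finetuning.py uses
    )
    sample = next(iter(ds))
    print(sorted(sample.keys()))             # expect "audio" and "sentence" among them
    print(sample["audio"]["sampling_rate"])  # Whisper's feature extractor expects 16000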
run_npsc.sh ADDED
@@ -0,0 +1,37 @@
+
+python run_whisper_finetuning.py \
+    --model_name_or_path="openai/whisper-small" \
+    --output_dir="../whisper-testrun1" \
+    --overwrite_output_dir=True \
+    --language="Norwegian" \
+    --task="transcribe" \
+    --dataset_name="NbAiLab/NPSC" \
+    --dataset_config="16K_mp3_bokmaal" \
+    --do_train=True \
+    --do_eval=True \
+    --audio_column_name="audio" \
+    --text_column_name="text" \
+    --per_device_train_batch_size=16 \
+    --per_device_eval_batch_size=16 \
+    --learning_rate=2e-5 \
+    --warmup_steps=500 \
+    --max_steps=10000 \
+    --gradient_checkpointing=True \
+    --gradient_accumulation_steps=1 \
+    --group_by_length=False \
+    --evaluation_strategy="steps" \
+    --save_steps=1000 \
+    --eval_steps=1000 \
+    --logging_steps=250 \
+    --fp16=True \
+    --load_best_model_at_end=True \
+    --metric_for_best_model="wer" \
+    --greater_is_better=False \
+    --report_to="tensorboard" \
+    --predict_with_generate=True \
+    --generation_max_length=225 \
+    --print_training_arguments=True \
+    --push_to_hub=True
+
+
+
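Note that run_npsc.sh streams NPSC (16K_mp3_bokmaal) but, unlike run.sh and run_nst.sh, does not pass --max_eval_samples, even though the updated help text in run_whisper_finetuning.py says the flag "should also be set when streaming". The diff does not show how the flag is consumed; since a streamed split has no length, one plausible implementation (a sketch, not the commit's code) is IterableDataset.take():

    # Sketch: cap a streaming eval split. An IterableDataset has no __len__,
    # so Dataset.select(range(n)) is unavailable; take(n) yields the first n.
    if data_args.max_eval_samples is not None:
        eval_dataset = eval_dataset.take(data_args.max_eval_samples)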
run_nst.sh ADDED
@@ -0,0 +1,38 @@
+
+python run_whisper_finetuning.py \
+    --model_name_or_path="openai/whisper-small" \
+    --output_dir="../whisper-testrun1" \
+    --overwrite_output_dir=True \
+    --language="Norwegian" \
+    --task="transcribe" \
+    --dataset_name="NbAiLab/NST" \
+    --dataset_config="no-close" \
+    --do_train=True \
+    --do_eval=True \
+    --audio_column_name="audio" \
+    --text_column_name="text" \
+    --per_device_train_batch_size=16 \
+    --per_device_eval_batch_size=16 \
+    --learning_rate=2e-5 \
+    --warmup_steps=500 \
+    --max_steps=10000 \
+    --gradient_checkpointing=True \
+    --gradient_accumulation_steps=1 \
+    --group_by_length=False \
+    --evaluation_strategy="steps" \
+    --save_steps=1000 \
+    --eval_steps=10 \
+    --max_eval_samples=100 \
+    --logging_steps=250 \
+    --fp16=True \
+    --load_best_model_at_end=True \
+    --metric_for_best_model="wer" \
+    --greater_is_better=False \
+    --report_to="tensorboard" \
+    --predict_with_generate=True \
+    --generation_max_length=225 \
+    --print_training_arguments=True \
+    --push_to_hub=True
+
+
+
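Since both new scripts stream their datasets, training length is bounded by --max_steps rather than epochs; the eval_steps=10 / max_eval_samples=100 pair here (against save_steps=1000) reads like a debugging setting, as it runs a 100-sample evaluation every 10 optimizer steps. A back-of-envelope check of what max_steps implies, assuming a single GPU (the scripts do not say):

    # Rough arithmetic for the shared settings (single-GPU assumption).
    per_device_train_batch_size = 16
    gradient_accumulation_steps = 1
    num_gpus = 1  # assumption, not stated in the scripts
    max_steps = 10_000

    effective_batch = per_device_train_batch_size * gradient_accumulation_steps * num_gpus
    print(effective_batch)              # 16 examples per optimizer step
    print(effective_batch * max_steps)  # 160000 examples seen over the run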
run_whisper_finetuning.py CHANGED
@@ -51,6 +51,48 @@ from transformers.utils.versions import require_version
 def list_field(default=None, metadata=None):
     return field(default_factory=lambda: default, metadata=metadata)
 
+@dataclass
+class Seq2SeqTrainingArguments(TrainingArguments):
+    """
+    Args:
+        sortish_sampler (`bool`, *optional*, defaults to `False`):
+            Whether to use a *sortish sampler* or not. Only possible if the underlying datasets are *Seq2SeqDataset*
+            for now but will become generally available in the near future.
+            It sorts the inputs according to lengths in order to minimize the padding size, with a bit of randomness
+            for the training set.
+        predict_with_generate (`bool`, *optional*, defaults to `False`):
+            Whether to use generate to calculate generative metrics (ROUGE, BLEU).
+        generation_max_length (`int`, *optional*):
+            The `max_length` to use on each evaluation loop when `predict_with_generate=True`. Will default to the
+            `max_length` value of the model configuration.
+        generation_num_beams (`int`, *optional*):
+            The `num_beams` to use on each evaluation loop when `predict_with_generate=True`. Will default to the
+            `num_beams` value of the model configuration.
+    """
+
+    sortish_sampler: bool = field(default=False, metadata={"help": "Whether to use SortishSampler or not."})
+    predict_with_generate: bool = field(
+        default=False, metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."}
+    )
+    generation_max_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "The `max_length` to use on each evaluation loop when `predict_with_generate=True`. Will default "
+                "to the `max_length` value of the model configuration."
+            )
+        },
+    )
+    generation_num_beams: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": (
+                "The `num_beams` to use on each evaluation loop when `predict_with_generate=True`. Will default "
+                "to the `num_beams` value of the model configuration."
+            )
+        },
+    )
+
 
 @dataclass
 class ModelArguments:
@@ -134,6 +176,7 @@ class ModelArguments:
     )
 
 
+
 @dataclass
 class DataTrainingArguments:
     """
@@ -191,7 +234,7 @@ class DataTrainingArguments:
         default=None,
         metadata={
             "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
-            "value if set."
+            "value if set. Should also be set when streaming."
         },
     )
     chars_to_ignore: Optional[List[str]] = list_field(
@@ -240,19 +283,11 @@ class DataTrainingArguments:
         default="|",
         metadata={"help": "The word delimiter token for the tokenizer"},
     )
-    predict_with_generate: bool = field(
-        default=True,
-        metadata={
-            "help": "Output tokens in addition to loss and digits for calculating metrics"},
-    )
-    generation_max_length: int = field(
-        default=225,
-        metadata={"help": "Maximum number of tokens generated"},
-    )
+
     phoneme_language: Optional[str] = field(
         default=None,
         metadata={
-            "help": "The target language that should be used be"
+            "help": "The target language that should be used be"
            " passed to the tokenizer for tokenization. Note that"
            " this is only relevant if the model classifies the"
            " input audio to a sequence of phoneme sequences."
@@ -303,7 +338,7 @@ def main():
     # or by passing the --help flag to this script.
     # We now keep distinct sets of args, for a cleaner separation of concerns.
     parser = HfArgumentParser(
-        (ModelArguments, DataTrainingArguments, TrainingArguments))
+        (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
     model_args, data_args, training_args = parser.parse_args_into_dataclasses()
 
     # Metrics
@@ -351,7 +386,7 @@ def main():
 
     # Load dataset
     train_dataset = load_dataset(data_args.dataset_name, data_args.dataset_config_name, split="train", streaming=True, use_auth_token=True)
-    eval_dataset = load_dataset(data_args.dataset_name, data_args.dataset_config_name, split="validation", streaming=True, use_auth_token=True)
+    eval_dataset = load_dataset(data_args.dataset_name, data_args.dataset_config_name, split="test", streaming=True, use_auth_token=True)
 
 
     # Rename columns
@@ -373,15 +408,17 @@ def main():
         model_args.model_name_or_path, language=model_args.language, task=model_args.task)
     data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
 
-
+
     # Prepare data
-    train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))
-    eval_dataset = eval_dataset.cast_column("audio", Audio(sampling_rate=16000))
+    # Not working yet... but since the audio is already 16000 Hz, maybe I don't need it?
+    # train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))
+    # eval_dataset = eval_dataset.cast_column("audio", Audio(sampling_rate=16000))
 
     # TODO Not able to implement in streaming mode. Can not find a way to list columns. But is it necessary?
     # train_data = train_data.map(prepare_dataset, remove_columns=train_data.column_names, num_proc=1)
 
     train_dataset = train_dataset.map(prepare_dataset)
+    eval_dataset = eval_dataset.map(prepare_dataset)
 
     # Metrics
     metric = evaluate.load("wer")
@@ -407,8 +444,10 @@ def main():
 
     # use last checkpoint if one exists
     if last_checkpoint is not None:
+        print("*** Found a checkpoint!")
         checkpoint = last_checkpoint
     elif os.path.isdir(model_args.model_name_or_path):
+        print("*** Loading checkpoint from parameters")
         checkpoint = model_args.model_name_or_path
     else:
         checkpoint = None
@@ -423,7 +462,13 @@ def main():
 
     # Set seed before initializing model.
     set_seed(training_args.seed)
-
+
+    # TODO - I think the number of epochs needs to be set manually? Now it seems to be calculated based on the save steps. How do I do this?
+    # Code here
+
+    # Save the processor as well, since we need it later
+    processor.save_pretrained(training_args.output_dir)
+
     trainer = Seq2SeqTrainer(
         args=training_args,
         model=model,
@@ -433,6 +478,7 @@ def main():
         compute_metrics=compute_metrics,
         tokenizer=processor.feature_extractor,
     )
+
 
     train_result = trainer.train(resume_from_checkpoint=checkpoint)
     trainer.save_model()
@@ -448,21 +494,22 @@ def main():
     trainer.create_model_card(**kwargs)
 
     # TODO - Look closer into the evaluation and the model card writing.
-
+
+    # breakpoint()
     # Evaluation
-    results = {}
-    if training_args.do_eval:
-        logger.info("*** Evaluate ***")
-        metrics = trainer.evaluate()
-        max_eval_samples = (
-            data_args.max_eval_samples if data_args.max_eval_samples is not None else len(
-                vectorized_datasets["eval"])
-        )
-        metrics["eval_samples"] = min(
-            max_eval_samples, len(vectorized_datasets["eval"]))
-
-        trainer.log_metrics("eval", metrics)
-        trainer.save_metrics("eval", metrics)
+    # results = {}
+    # if training_args.do_eval:
+    #     logger.info("*** Evaluate ***")
+    #     metrics = trainer.evaluate()
+    #     max_eval_samples = (
+    #         data_args.max_eval_samples if data_args.max_eval_samples is not None else len(
+    #             vectorized_datasets["eval"])
+    #     )
+    #     metrics["eval_samples"] = min(
+    #         max_eval_samples, len(vectorized_datasets["eval"]))
+
+    #     trainer.log_metrics("eval", metrics)
+    #     trainer.save_metrics("eval", metrics)
 
     # Write model card and (optionally) push to hub
     config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
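Two of the TODOs above can likely be settled together. With streaming, train_dataset is an IterableDataset, so the Trainer cannot derive epochs from a dataset length; --max_steps (set in all three scripts) is what bounds training, which answers the epoch question. The commented-out evaluation block, for its part, references vectorized_datasets["eval"], a name this script never defines, and calls len() on a split that has none. A hedged sketch of that block adapted to the script's own names (eval_dataset, data_args.max_eval_samples):

    # Sketch, not part of the commit: the evaluation block adapted to a
    # streamed split. len(eval_dataset) is unavailable, so report the CLI
    # cap, when given, as eval_samples instead of min(cap, len(dataset)).
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate()
        if data_args.max_eval_samples is not None:
            metrics["eval_samples"] = data_args.max_eval_samples
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)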