NbAiLab
/

nb-distil-whisper-large-pytorch2

TensorBoard

Safetensors

whisper

🇪🇺 Region: EU

Model card Files Files and versions Metrics Training metrics Community

pere committed on Nov 7, 2024

Commit

7f3bd04

verified ·

1 Parent(s): d4cf8c0

Update run_distillation.py

Browse files

Files changed (1) hide show

run_distillation.py +28 -12

run_distillation.py CHANGED Viewed

@@ -1141,12 +1141,14 @@ def main():
         if whisper_transcript is not None and whisper_transcript.upper() == whisper_transcript:
             # filter entirely upper-case transcriptions: these are erroneous generations from large-v3
             return False
-        elif len(norm_ground_truth) > 0 and whisper_transcript is not None:
             norm_whisper_transcript = normalizer(whisper_transcript)
             wer = 100 * metric.compute(predictions=[norm_whisper_transcript], references=[norm_ground_truth])
             return wer < wer_threshold
         else:
-            # filter automatically since we can't know the WER
             return False
     filter_by_wer_threshold = partial(
@@ -1327,16 +1329,30 @@ def main():
         label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)
         wer_ortho = 100 * metric.compute(predictions=pred_str, references=label_str)
-        # normalize everything and re-compute the WER
-        norm_pred_str = [normalizer(pred) for pred in pred_str]
-        norm_label_str = [normalizer(label) for label in label_str]
-        # for logging, we need the pred/labels to match the norm_pred/norm_labels, so discard any filtered samples here
-        pred_str = [pred_str[i] for i in range(len(norm_pred_str)) if len(norm_label_str[i]) > 0]
-        label_str = [label_str[i] for i in range(len(norm_label_str)) if len(norm_label_str[i]) > 0]
-        # filtering step to only evaluate the samples that correspond to non-zero normalized references:
-        norm_pred_str = [norm_pred_str[i] for i in range(len(norm_pred_str)) if len(norm_label_str[i]) > 0]
-        norm_label_str = [norm_label_str[i] for i in range(len(norm_label_str)) if len(norm_label_str[i]) > 0]
         wer = 100 * metric.compute(predictions=norm_pred_str, references=norm_label_str)
         return {"wer": wer, "wer_ortho": wer_ortho}, pred_str, label_str, norm_pred_str, norm_label_str
@@ -1808,4 +1824,4 @@ def main():
 if __name__ == "__main__":
-    main()

         if whisper_transcript is not None and whisper_transcript.upper() == whisper_transcript:
             # filter entirely upper-case transcriptions: these are erroneous generations from large-v3
             return False
+        elif len(norm_ground_truth) == 0 and len(normalizer(whisper_transcript)) == 0:
+            return True
+        elif len(norm_ground_truth.strip()) > 0 and whisper_transcript is not None and len(normalizer(whisper_transcript).strip()) > 0:
             norm_whisper_transcript = normalizer(whisper_transcript)
             wer = 100 * metric.compute(predictions=[norm_whisper_transcript], references=[norm_ground_truth])
             return wer < wer_threshold
         else:
+            # filter automatically since we can't know the WER
             return False
     filter_by_wer_threshold = partial(
         label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)
         wer_ortho = 100 * metric.compute(predictions=pred_str, references=label_str)
+        # Normalize everything
+        norm_pred_str = []
+        norm_label_str = []
+        # Iterate through all predictions and labels
+        for pred, label in zip(pred_str, label_str):
+            # Normalize the prediction and label
+            normalized_pred = normalizer(pred)
+            normalized_label = normalizer(label)
+            # If either normalized string is empty after normalization, replace with "<|nocaptions|>"
+            if not normalized_pred.strip():
+                normalized_pred = "<|nocaptions|>"
+            if not normalized_label.strip():
+                normalized_label = "<|nocaptions|>"
+            norm_pred_str.append(normalized_pred)
+            norm_label_str.append(normalized_label)
+        # Replace original strings with "<|nocaptions|>" where necessary for consistency
+        pred_str = [pred if len(pred.strip()) > 0 else "<|nocaptions|>" for pred in pred_str]
+        label_str = [label if len(label.strip()) > 0 else "<|nocaptions|>" for label in label_str]
+        # Compute WER using all entries, including those with "<|nocaptions|>"
         wer = 100 * metric.compute(predictions=norm_pred_str, references=norm_label_str)
         return {"wer": wer, "wer_ortho": wer_ortho}, pred_str, label_str, norm_pred_str, norm_label_str
 if __name__ == "__main__":
+    main()