mskov committed on
Commit
3f0eaa9
·
1 Parent(s): 5ce6e7e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -3
app.py CHANGED
@@ -6,7 +6,7 @@ from transformers import pipeline, WhisperModel, WhisperTokenizer, WhisperFeatur
6
  os.system("pip install jiwer")
7
  from jiwer import wer
8
  os.system("pip install datasets[audio]")
9
- from evaluate import evaluator
10
  import evaluate
11
  from datasets import load_dataset, Audio, disable_caching, set_caching_enabled
12
  import gradio as gr
@@ -18,7 +18,7 @@ disable_caching()
18
  huggingface_token = os.environ["huggingface_token"]
19
  pipe = pipeline(model="mskov/whisper-small-esc50")
20
  print(pipe)
21
-
22
  dataset = load_dataset("mskov/miso_test", split="test").cast_column("audio", Audio(sampling_rate=16000))
23
 
24
  print(dataset, "and at 0[audio][array] ", dataset[0]["audio"]["array"], type(dataset[0]["audio"]["array"]), "and at audio : ", dataset[0]["audio"])
@@ -27,6 +27,23 @@ model = pipe
27
  # Evaluate the model
28
  # model.eval()
29
  #print("model.eval ", model.eval())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  with torch.no_grad():
31
  outputs = model(input_ids=input_ids, attention_mask=attention_mask)
32
  print("outputs ", outputs)
@@ -43,7 +60,7 @@ wer_score = wer(labels, predicted_text)
43
 
44
  # Print or return WER score
45
  print(f"Word Error Rate (WER): {wer_score}")
46
-
47
 
48
  def transcribe(audio):
49
  text = pipe(audio)["text"]
 
6
  os.system("pip install jiwer")
7
  from jiwer import wer
8
  os.system("pip install datasets[audio]")
9
+ from evaluate import evaluator, load
10
  import evaluate
11
  from datasets import load_dataset, Audio, disable_caching, set_caching_enabled
12
  import gradio as gr
 
18
  huggingface_token = os.environ["huggingface_token"]
19
  pipe = pipeline(model="mskov/whisper-small-esc50")
20
  print(pipe)
21
+ processor = WhisperProcessor.from_pretrained("openai/whisper-medium").to("cuda")
22
  dataset = load_dataset("mskov/miso_test", split="test").cast_column("audio", Audio(sampling_rate=16000))
23
 
24
  print(dataset, "and at 0[audio][array] ", dataset[0]["audio"]["array"], type(dataset[0]["audio"]["array"]), "and at audio : ", dataset[0]["audio"])
 
27
  # Evaluate the model
28
  # model.eval()
29
  #print("model.eval ", model.eval())
30
+
31
+ audio = batch["audio"]
32
+ input_features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features
33
+ batch["reference"] = processor.tokenizer._normalize(batch['text'])
34
+
35
+
36
+ with torch.no_grad():
37
+ predicted_ids = model.generate(input_features.to("cuda"))[0]
38
+ transcription = processor.decode(predicted_ids)
39
+ batch["prediction"] = processor.tokenizer._normalize(transcription)
40
+ return batch
41
+
42
+ result = dataset.map(map_to_pred)
43
+
44
+ wer = load("wer")
45
+ print(100 * wer.compute(references=result["reference"], predictions=result["prediction"]))
46
+ '''
47
  with torch.no_grad():
48
  outputs = model(input_ids=input_ids, attention_mask=attention_mask)
49
  print("outputs ", outputs)
 
60
 
61
  # Print or return WER score
62
  print(f"Word Error Rate (WER): {wer_score}")
63
+ '''
64
 
65
  def transcribe(audio):
66
  text = pipe(audio)["text"]