mskov committed on
Commit
0597769
·
1 Parent(s): 68ed0e8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -62
app.py CHANGED
@@ -7,90 +7,35 @@ os.system("pip install jiwer")
7
  from jiwer import wer
8
  os.system("pip install datasets[audio]")
9
  from evaluate import evaluator, load
10
- import evaluate
11
  from datasets import load_dataset, Audio, disable_caching, set_caching_enabled
12
  import gradio as gr
13
  import torch
14
- import re
 
15
 
16
- set_caching_enabled(False)
17
- disable_caching()
18
 
19
- huggingface_token = os.environ["huggingface_token"]
20
- pipe = pipeline(model="mskov/whisper-small-esc50")
21
- print(pipe)
22
  processor = WhisperProcessor.from_pretrained("mskov/whisper-small-esc50")
23
- dataset = load_dataset("ashraq/esc50", split="train").cast_column("audio", Audio(sampling_rate=16000))
24
 
25
- # print(dataset, "and at 0[audio][array] ", dataset[0]["audio"]["array"], type(dataset[0]["audio"]["array"]), "and at audio : ", dataset[0]["audio"])
26
-
27
- model = WhisperForConditionalGeneration.from_pretrained("mskov/whisper-small-esc50")
28
-
29
- # Evaluate the model
30
- # model.eval()
31
- #print("model.eval ", model.eval())
32
-
33
-
34
- # Remove brackets and extra spaces
35
  def map_to_pred(batch):
36
  audio = batch["audio"]
37
- input_features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features
38
  batch["reference"] = processor.tokenizer._normalize(batch['category'])
39
 
40
  with torch.no_grad():
41
  predicted_ids = model.generate(input_features.to("cuda"))[0]
42
  transcription = processor.decode(predicted_ids)
43
  batch["prediction"] = processor.tokenizer._normalize(transcription)
 
44
  return batch
45
 
46
- result = dataset.map(map_to_pred)
47
-
48
- wer = load("wer")
49
- print(100 * wer.compute(references=result["reference"], predictions=result["prediction"]))
50
-
51
- '''
52
- def map_to_pred(batch):
53
- cleaned_transcription = re.sub(r'\[[^\]]+\]', '', batch['category']).strip()
54
- print("cleaned transcript", cleaned_transcription)
55
- cleaned_transcription = preprocess_transcription(batch['category'])
56
- normalized_transcription = processor.tokenizer._normalize(cleaned_transcription)
57
-
58
- audio = batch["audio"]
59
- input_features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features
60
- batch["reference"] = processor.tokenizer._normalize(batch['category'])
61
-
62
-
63
- with torch.no_grad():
64
- predicted_ids = model.generate(input_features)[0]
65
-
66
- transcription = processor.decode(predicted_ids)
67
- batch["prediction"] = processor.tokenizer._normalize(transcription)
68
- return batch
69
-
70
- result = dataset.map(map_to_pred)
71
 
72
  wer = load("wer")
73
  print(100 * wer.compute(references=result["reference"], predictions=result["prediction"]))
74
- '''
75
- '''
76
- with torch.no_grad():
77
- outputs = model(input_ids=input_ids, attention_mask=attention_mask)
78
- print("outputs ", outputs)
79
-
80
- # Convert predicted token IDs back to text
81
- predicted_text = tokenizer.batch_decode(outputs.logits.argmax(dim=-1), skip_special_tokens=True)
82
 
83
- # Get ground truth labels from the dataset
84
- labels = dataset["audio"] # Replace "labels" with the appropriate key in your dataset
85
- print("labels are ", labels)
86
 
87
- # Compute WER
88
- wer = load("wer")
89
- wer_score = wer(labels, predicted_text)
90
-
91
- # Print or return WER score
92
- print(f"Word Error Rate (WER): {wer_score}")
93
- '''
94
 
95
  def transcribe(audio):
96
  text = pipe(audio)["text"]
 
7
  from jiwer import wer
8
  os.system("pip install datasets[audio]")
9
  from evaluate import evaluator, load
10
+ from transformers import AutoModelForSequenceClassification, pipeline, BertTokenizer, AutoTokenizer, GPT2Model
11
  from datasets import load_dataset, Audio, disable_caching, set_caching_enabled
12
  import gradio as gr
13
  import torch
14
+ from datasets import load_dataset
15
+ from transformers import WhisperForConditionalGeneration, WhisperProcessor
16
 
 
 
17
 
 
 
 
18
  processor = WhisperProcessor.from_pretrained("mskov/whisper-small-esc50")
19
+ model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small-esc50").to("cuda")
20
 
 
 
 
 
 
 
 
 
 
 
21
  def map_to_pred(batch):
22
  audio = batch["audio"]
23
+ input_features = processor(audio["array"], sampling_rate=16000, return_tensors="pt").input_features
24
  batch["reference"] = processor.tokenizer._normalize(batch['category'])
25
 
26
  with torch.no_grad():
27
  predicted_ids = model.generate(input_features.to("cuda"))[0]
28
  transcription = processor.decode(predicted_ids)
29
  batch["prediction"] = processor.tokenizer._normalize(transcription)
30
+ print(batch["prediction"])
31
  return batch
32
 
33
+ result = librispeech_test_clean.map(map_to_pred)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  wer = load("wer")
36
  print(100 * wer.compute(references=result["reference"], predictions=result["prediction"]))
 
 
 
 
 
 
 
 
37
 
 
 
 
38
 
 
 
 
 
 
 
 
39
 
40
  def transcribe(audio):
41
  text = pipe(audio)["text"]