Update app.py
Browse files
app.py
CHANGED
@@ -6,7 +6,7 @@ from transformers import pipeline, WhisperModel, WhisperTokenizer, WhisperFeatur
|
|
6 |
os.system("pip install jiwer")
|
7 |
from jiwer import wer
|
8 |
os.system("pip install datasets[audio]")
|
9 |
-
from evaluate import evaluator
|
10 |
import evaluate
|
11 |
from datasets import load_dataset, Audio, disable_caching, set_caching_enabled
|
12 |
import gradio as gr
|
@@ -18,7 +18,7 @@ disable_caching()
|
|
18 |
huggingface_token = os.environ["huggingface_token"]
|
19 |
pipe = pipeline(model="mskov/whisper-small-esc50")
|
20 |
print(pipe)
|
21 |
-
|
22 |
dataset = load_dataset("mskov/miso_test", split="test").cast_column("audio", Audio(sampling_rate=16000))
|
23 |
|
24 |
print(dataset, "and at 0[audio][array] ", dataset[0]["audio"]["array"], type(dataset[0]["audio"]["array"]), "and at audio : ", dataset[0]["audio"])
|
@@ -27,6 +27,23 @@ model = pipe
|
|
27 |
# Evaluate the model
|
28 |
# model.eval()
|
29 |
#print("model.eval ", model.eval())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
with torch.no_grad():
|
31 |
outputs = model(input_ids=input_ids, attention_mask=attention_mask)
|
32 |
print("outputs ", outputs)
|
@@ -43,7 +60,7 @@ wer_score = wer(labels, predicted_text)
|
|
43 |
|
44 |
# Print or return WER score
|
45 |
print(f"Word Error Rate (WER): {wer_score}")
|
46 |
-
|
47 |
|
48 |
def transcribe(audio):
|
49 |
text = pipe(audio)["text"]
|
|
|
6 |
os.system("pip install jiwer")
|
7 |
from jiwer import wer
|
8 |
os.system("pip install datasets[audio]")
|
9 |
+
from evaluate import evaluator, load
|
10 |
import evaluate
|
11 |
from datasets import load_dataset, Audio, disable_caching, set_caching_enabled
|
12 |
import gradio as gr
|
|
|
18 |
huggingface_token = os.environ["huggingface_token"]
|
19 |
pipe = pipeline(model="mskov/whisper-small-esc50")
|
20 |
print(pipe)
|
21 |
+
# The processor (feature extractor + tokenizer) has no `.to()` method, so
# calling `.to("cuda")` on it raises AttributeError. Only the tensors it
# returns are moved to a device, at the point where they are fed to the model.
processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
|
22 |
dataset = load_dataset("mskov/miso_test", split="test").cast_column("audio", Audio(sampling_rate=16000))
|
23 |
|
24 |
print(dataset, "and at 0[audio][array] ", dataset[0]["audio"]["array"], type(dataset[0]["audio"]["array"]), "and at audio : ", dataset[0]["audio"])
|
|
|
27 |
# Evaluate the model
|
28 |
# model.eval()
|
29 |
#print("model.eval ", model.eval())
|
30 |
+
|
31 |
+
def map_to_pred(batch):
    """Transcribe one dataset example and attach normalized texts to it.

    Used as the callback passed to ``dataset.map``.

    Args:
        batch: A single example. This function reads ``batch["audio"]``
            (its ``"array"`` and ``"sampling_rate"`` entries) and
            ``batch["text"]``.

    Returns:
        The same example with two keys added: ``"reference"`` (the normalized
        ground-truth text) and ``"prediction"`` (the normalized transcription).
    """
    audio = batch["audio"]
    input_features = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt",
    ).input_features
    batch["reference"] = processor.tokenizer._normalize(batch["text"])

    # `pipe` is a pipeline wrapper and has no `generate` method itself;
    # generation is done by the model it wraps. Inputs are moved to that
    # model's own device so this works on both CPU-only and GPU hosts.
    asr_model = pipe.model
    with torch.no_grad():
        predicted_ids = asr_model.generate(input_features.to(asr_model.device))[0]

    transcription = processor.decode(predicted_ids)
    batch["prediction"] = processor.tokenizer._normalize(transcription)
    return batch
|
41 |
+
|
42 |
+
result = dataset.map(map_to_pred)
|
43 |
+
|
44 |
+
wer = load("wer")
|
45 |
+
print(100 * wer.compute(references=result["reference"], predictions=result["prediction"]))
|
46 |
+
'''
|
47 |
with torch.no_grad():
|
48 |
outputs = model(input_ids=input_ids, attention_mask=attention_mask)
|
49 |
print("outputs ", outputs)
|
|
|
60 |
|
61 |
# Print or return WER score
|
62 |
print(f"Word Error Rate (WER): {wer_score}")
|
63 |
+
'''
|
64 |
|
65 |
def transcribe(audio):
|
66 |
text = pipe(audio)["text"]
|