Spaces:

mskov
/

test

Runtime error

App Files Community

mskov committed on Sep 5, 2023

Commit

0597769

1 Parent(s): 68ed0e8

Update app.py

Browse files

Files changed (1) hide show

app.py +7 -62

app.py CHANGED Viewed

@@ -7,90 +7,35 @@ os.system("pip install jiwer")
 from jiwer import wer
 os.system("pip install datasets[audio]")
 from evaluate import evaluator, load
-import evaluate
 from datasets import load_dataset, Audio, disable_caching, set_caching_enabled
 import gradio as gr
 import torch
-import re
-set_caching_enabled(False)
-disable_caching()
-huggingface_token = os.environ["huggingface_token"]
-pipe = pipeline(model="mskov/whisper-small-esc50")
-print(pipe)
 processor = WhisperProcessor.from_pretrained("mskov/whisper-small-esc50")
-dataset = load_dataset("ashraq/esc50", split="train").cast_column("audio", Audio(sampling_rate=16000))
-# print(dataset, "and at 0[audio][array] ", dataset[0]["audio"]["array"], type(dataset[0]["audio"]["array"]), "and at audio : ", dataset[0]["audio"])
-model = WhisperForConditionalGeneration.from_pretrained("mskov/whisper-small-esc50")
-# Evaluate the model
-# model.eval()
-#print("model.eval ", model.eval())
-# Remove brackets and extra spaces
 def map_to_pred(batch):
     audio = batch["audio"]
-    input_features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features
     batch["reference"] = processor.tokenizer._normalize(batch['category'])
     with torch.no_grad():
         predicted_ids = model.generate(input_features.to("cuda"))[0]
     transcription = processor.decode(predicted_ids)
     batch["prediction"] = processor.tokenizer._normalize(transcription)
     return batch
-result = dataset.map(map_to_pred)
-wer = load("wer")
-print(100 * wer.compute(references=result["reference"], predictions=result["prediction"]))
-'''
-def map_to_pred(batch):
-    cleaned_transcription = re.sub(r'\[[^\]]+\]', '', batch['category']).strip()
-    print("cleaned transcript", cleaned_transcription)
-    cleaned_transcription = preprocess_transcription(batch['category'])
-    normalized_transcription = processor.tokenizer._normalize(cleaned_transcription)
-    audio = batch["audio"]
-    input_features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features
-    batch["reference"] = processor.tokenizer._normalize(batch['category'])
-    with torch.no_grad():
-        predicted_ids = model.generate(input_features)[0]
-    transcription = processor.decode(predicted_ids)
-    batch["prediction"] = processor.tokenizer._normalize(transcription)
-    return batch
-result = dataset.map(map_to_pred)
 wer = load("wer")
 print(100 * wer.compute(references=result["reference"], predictions=result["prediction"]))
-'''
-'''
-with torch.no_grad():
-    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
-    print("outputs ", outputs)
-# Convert predicted token IDs back to text
-predicted_text = tokenizer.batch_decode(outputs.logits.argmax(dim=-1), skip_special_tokens=True)
-# Get ground truth labels from the dataset
-labels = dataset["audio"]  # Replace "labels" with the appropriate key in your dataset
-print("labels are ", labels)
-# Compute WER
-wer = load("wer")
-wer_score = wer(labels, predicted_text)
-# Print or return WER score
-print(f"Word Error Rate (WER): {wer_score}")
-'''
 def transcribe(audio):
     text = pipe(audio)["text"]

 from jiwer import wer
 os.system("pip install datasets[audio]")
 from evaluate import evaluator, load
+from transformers import AutoModelForSequenceClassification, pipeline, BertTokenizer, AutoTokenizer, GPT2Model
 from datasets import load_dataset, Audio, disable_caching, set_caching_enabled
 import gradio as gr
 import torch
+from datasets import load_dataset
+from transformers import WhisperForConditionalGeneration, WhisperProcessor
 processor = WhisperProcessor.from_pretrained("mskov/whisper-small-esc50")
+model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small-esc50").to("cuda")
 def map_to_pred(batch):
     audio = batch["audio"]
+    input_features = processor(audio["array"], sampling_rate=16000, return_tensors="pt").input_features
     batch["reference"] = processor.tokenizer._normalize(batch['category'])
     with torch.no_grad():
         predicted_ids = model.generate(input_features.to("cuda"))[0]
     transcription = processor.decode(predicted_ids)
     batch["prediction"] = processor.tokenizer._normalize(transcription)
+    print(batch["prediction"])
     return batch
+result = librispeech_test_clean.map(map_to_pred)
 wer = load("wer")
 print(100 * wer.compute(references=result["reference"], predictions=result["prediction"]))
 def transcribe(audio):
     text = pipe(audio)["text"]