Spaces:
Sleeping
Sleeping
File size: 1,225 Bytes
2417517 7380009 6465ad1 d1f9a9c 7380009 d1f9a9c 7380009 d1f9a9c 7380009 d1f9a9c 7380009 8e3c59e d1f9a9c 8e3c59e d1f9a9c 7380009 d1f9a9c 8e3c59e d1f9a9c 8e3c59e d1f9a9c 8e3c59e d1f9a9c 7380009 d1f9a9c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import requests
import soundfile as sf
import io
# Transcribe a remote WAV file with OpenAI Whisper.
#
# Steps: load the model, download the audio, convert it to the mono waveform
# at the sampling rate Whisper expects, run generation, and print the text.

# Load the Whisper model and processor from the Hugging Face Model Hub.
model_name = "openai/whisper-base"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

# Use GPU if available, otherwise use CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# URL of the audio file to transcribe.
audio_url = "https://www.signalogic.com/melp/EngSamples/Orig/male.wav"

# Download the audio file. The timeout prevents hanging forever on a stalled
# connection, and raise_for_status() stops an HTTP error page from being
# handed to the audio decoder as if it were WAV data.
response = requests.get(audio_url, timeout=30)
response.raise_for_status()
audio_data = io.BytesIO(response.content)

# Read the audio using soundfile, keeping the file's real sampling rate.
# float32 matches the dtype the model works in (soundfile defaults to float64).
audio_input, source_rate = sf.read(audio_data, dtype="float32")
if audio_input.size == 0:
    raise ValueError(f"No audio samples could be read from {audio_url}")

# soundfile returns (frames, channels) for multi-channel files; Whisper
# expects a mono 1-D waveform, so average the channels.
if audio_input.ndim > 1:
    audio_input = audio_input.mean(axis=1)

# Whisper's feature extractor only accepts audio at its own sampling rate.
# Declaring a rate the audio does not actually have would make the model see
# sped-up or slowed-down speech, so resample whenever the rates differ.
# NOTE: linear interpolation applies no anti-aliasing filter. It is adequate
# for upsampling low-rate speech; a dedicated resampler would be preferable
# when downsampling from much higher rates.
target_rate = processor.feature_extractor.sampling_rate
if source_rate != target_rate:
    waveform = torch.from_numpy(audio_input).reshape(1, 1, -1)
    target_length = max(1, round(waveform.shape[-1] * target_rate / source_rate))
    audio_input = (
        torch.nn.functional.interpolate(
            waveform, size=target_length, mode="linear", align_corners=False
        )
        .reshape(-1)
        .numpy()
    )

# Preprocess the audio for Whisper (log-mel features) and move the tensors to
# the same device as the model.
inputs = processor(audio_input, return_tensors="pt", sampling_rate=target_rate)
inputs = {key: value.to(device) for key, value in inputs.items()}

# Generate the transcription; gradients are not needed for inference.
with torch.no_grad():
    predicted_ids = model.generate(inputs["input_features"])

# Decode the token ids into text, dropping Whisper's special tokens.
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

# Print the transcription result.
print("Transcription:", transcription)
|