File size: 1,225 Bytes
2417517
7380009
6465ad1
d1f9a9c
 
7380009
d1f9a9c
7380009
d1f9a9c
 
7380009
d1f9a9c
7380009
 
8e3c59e
d1f9a9c
 
8e3c59e
d1f9a9c
 
 
7380009
d1f9a9c
 
8e3c59e
d1f9a9c
 
 
8e3c59e
d1f9a9c
 
 
8e3c59e
d1f9a9c
 
7380009
d1f9a9c
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import io

import requests
import soundfile as sf
import torch
from scipy.signal import resample_poly
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Transcribe a remote audio file with Whisper.
#
# Steps: load the model/processor, download the audio, convert it to the
# mono waveform at the sampling rate the feature extractor expects, run
# generation, and print the decoded text.

# Load the Whisper model and processor from Hugging Face Model Hub
model_name = "openai/whisper-base"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

# Use GPU if available, otherwise use CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# URL of the audio file
audio_url = "https://www.signalogic.com/melp/EngSamples/Orig/male.wav"

# Download the audio file. A timeout keeps the script from hanging forever on
# a stalled connection, and raise_for_status() surfaces HTTP errors here
# instead of as a confusing decode failure in soundfile.
response = requests.get(audio_url, timeout=30)
response.raise_for_status()
audio_data = io.BytesIO(response.content)

# Read the audio using soundfile, keeping the file's real sample rate so it
# can be reconciled with what the model expects.
audio_input, sample_rate = sf.read(audio_data, dtype="float32")

# soundfile returns (frames, channels) for multi-channel files; the processor
# is given a single 1-D waveform, so average the channels down to mono.
if audio_input.ndim > 1:
    audio_input = audio_input.mean(axis=1)

# Resample to the rate the feature extractor was configured for. Declaring a
# rate that does not match the actual samples would silently distort the
# spectrogram features and corrupt the transcription.
target_sample_rate = processor.feature_extractor.sampling_rate
if sample_rate != target_sample_rate:
    audio_input = resample_poly(audio_input, target_sample_rate, sample_rate)

# Preprocess the audio for Whisper
inputs = processor(
    audio_input,
    return_tensors="pt",
    sampling_rate=target_sample_rate,
)
inputs = {key: value.to(device) for key, value in inputs.items()}

# Generate the transcription (inference only, so skip gradient tracking)
with torch.no_grad():
    predicted_ids = model.generate(inputs["input_features"])

# Decode the transcription
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

# Print the transcription result
print("Transcription:", transcription)