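"""Gradio Space demo that transcribes microphone speech with sarvamai/shuka_v1."""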
import transformers
import gradio as gr
import librosa
import torch
import spaces
import numpy as np
@spaces.GPU(duration=60)
def transcribe_audio(audio_file):
    try:
        pipe = transformers.pipeline(
            model='sarvamai/shuka_v1',
            trust_remote_code=True,
            device=0,
            torch_dtype=torch.bfloat16
        )
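        # Note: the pipeline is rebuilt on every call. On ZeroGPU Spaces, CUDA is
        # only available inside @spaces.GPU-decorated functions, which is
        # presumably why the model is loaded here rather than at import time.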
        # Load the audio file at 16 kHz
        audio, sr = librosa.load(audio_file, sr=16000)
        # Ensure audio is a floating-point numpy array
        audio = np.array(audio, dtype=np.float32)
        # Convert multi-channel audio to mono if needed (librosa loads mono by
        # default, so this is a safeguard)
        if audio.ndim > 1:
            audio = np.mean(audio, axis=-1)
        # Debug: print audio properties for troubleshooting
        print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")
        # Conversation turns that instruct the model to transcribe; the
        # '<|audio|>' placeholder marks where the audio clip enters the prompt
        turns = [
            {'role': 'system', 'content': 'Please transcribe the following audio exactly.'},
            {'role': 'user', 'content': '<|audio|>'}
        ]
        # Debug: print the initial turns
        print(f"Initial turns: {turns}")
        # Call the model with the audio and transcription prompt
        output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
        # Debug: print the final output from the model
        print(f"Model output: {output}")
        return output
    except Exception as e:
        return f"Error: {str(e)}"
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="text",
    title="Shuka ASR Demo",
    description="Speak into your microphone, and the model will transcribe your speech.",
    live=True
)
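# With live=True, Gradio reruns transcribe_audio automatically whenever the
# recording changes, so the interface needs no explicit submit button.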
if __name__ == "__main__":
    iface.launch()
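# A minimal sketch for exercising the function outside Gradio, assuming a local
# audio file named 'sample.wav' exists (hypothetical path):
#
#     print(transcribe_audio("sample.wav"))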