import transformers
import gradio as gr
import librosa
import torch
import spaces
import numpy as np
# Cache the ASR pipeline at module level so repeated requests do not pay the
# full model-load cost on every invocation.
_pipe = None


def _get_pipe():
    """Create the Shuka v1 audio pipeline once and return the cached instance."""
    global _pipe
    if _pipe is None:
        _pipe = transformers.pipeline(
            model='sarvamai/shuka_v1',
            trust_remote_code=True,
            device=0,  # first CUDA device; assumes a GPU is available — TODO confirm on CPU-only hosts
            torch_dtype=torch.bfloat16,
        )
    return _pipe


@spaces.GPU(duration=60)
def transcribe_and_respond(audio_file):
    """Transcribe a recorded audio file with the sarvamai/shuka_v1 model.

    Parameters
    ----------
    audio_file : str
        Filesystem path to the recording (Gradio ``type="filepath"`` input).

    Returns
    -------
    The raw pipeline output on success, or an ``"Error: ..."`` string on
    failure so the Gradio UI displays the message instead of crashing.
    """
    try:
        pipe = _get_pipe()
        # Load at 16 kHz; librosa.load also downmixes to mono by default,
        # but the defensive checks below guard against unusual decoders.
        audio, sr = librosa.load(audio_file, sr=16000)
        # The model expects a contiguous float32 waveform.
        audio = np.ascontiguousarray(audio, dtype=np.float32)
        # Collapse any remaining channel dimension to mono.
        if audio.ndim > 1:
            audio = np.mean(audio, axis=-1)
        # Debug: surface the waveform properties in the Space logs.
        print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")
        # Although 16000 Hz was requested, double-check and force-resample if needed.
        if sr != 16000:
            audio = librosa.resample(audio.astype(np.float32), orig_sr=sr, target_sr=16000)
            sr = 16000
        # Chat-style prompt; '<|audio|>' is the placeholder the model replaces
        # with the audio embedding.
        turns = [
            {'role': 'system', 'content': 'Please transcribe the following audio exactly.'},
            {'role': 'user', 'content': '<|audio|>'}
        ]
        print(f"Initial turns: {turns}")
        output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
        print(f"Model output: {output}")
        return output
    except Exception as e:
        # Best-effort boundary: report the error as text rather than raising
        # into the Gradio worker.
        return f"Error: {str(e)}"
# Gradio UI: stream microphone audio into the transcription handler.
_iface_config = dict(
    fn=transcribe_and_respond,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs="text",
    title="Live Transcription and Response",
    description="Speak into your microphone, and the model will transcribe your speech.",
    live=True,
)
iface = gr.Interface(**_iface_config)

if __name__ == "__main__":
    # Launch the app only when executed as a script, not on import.
    iface.launch()