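# ShukaNotesApp: a Gradio Space that transcribes/summarizes uploaded or recorded
# audio with the sarvamai/shuka_v1 audio-language model and returns the text
# plus a downloadable transcript file. (Header comment added for orientation;
# the app's title and description below are the source of these claims.)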
import tempfile

import gradio as gr
import librosa
import spaces
import torch
import transformers
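# @spaces.GPU marks this function for Hugging Face ZeroGPU: a GPU is allocated
# per call, and duration=60 caps that allocation at 60 seconds.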
@spaces.GPU(duration=60)
def transcribe_and_respond(audio_file):
    try:
        # Build the pipeline inside the GPU-decorated function so the model
        # loads onto the GPU allocated for this call.
        pipe = transformers.pipeline(
            model='sarvamai/shuka_v1',
            trust_remote_code=True,
            device=0,
            torch_dtype=torch.bfloat16
        )
        # Load the audio file, resampling to the 16 kHz the model expects
        audio, sr = librosa.load(audio_file, sr=16000)
        # Print audio properties for debugging
        print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")
        # Conversation turns: the <|audio|> placeholder marks where the clip goes
        turns = [
            {'role': 'system', 'content': 'Compile the information'},
            {'role': 'user', 'content': '<|audio|>'}
        ]
        # Debug: print the initial turns
        print(f"Initial turns: {turns}")
        # Call the model with the audio and prompt
        output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=1000)
        # Debug: print the final output from the model
        print(f"Model output: {output}")
        # The interface declares two outputs, so return two values: the
        # transcript text and a .txt file path for the download component.
        transcript = str(output)
        with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8') as f:
            f.write(transcript)
            transcript_path = f.name
        return transcript, transcript_path
    except Exception as e:
        # Surface the error in the textbox; no file to offer on failure
        return f"Error: {str(e)}", None
iface = gr.Interface(
    fn=transcribe_and_respond,
    inputs=[
        gr.Audio(sources=["upload", "microphone"], type="filepath"),
    ],
    outputs=[
        gr.Textbox(label="Transcript"),
        gr.File(label="Download Transcript")
    ],
    title="ShukaNotesApp",
    description="Note Maker for Indian Offices and Their Many Languages.",
    live=True
)
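# Note: live=True re-runs the function whenever the input changes; with a
# GPU-backed pipeline, each trigger costs a fresh (up to 60 s) GPU allocation.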
if __name__ == "__main__":
    iface.launch()