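# ShukaNotesApp: a Gradio Space that transcribes/summarizes uploaded or recorded
# audio with the sarvamai/shuka_v1 audio-language model and returns the text
# plus a downloadable transcript file. (Header comment added for orientation;
# the app's title and description below are the source of these claims.)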
import tempfile

import gradio as gr
import librosa
import spaces
import torch
import transformers
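# @spaces.GPU marks this function for Hugging Face ZeroGPU: a GPU is allocated
# per call, and duration=60 caps that allocation at 60 seconds.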
@spaces.GPU(duration=60)
def transcribe_and_respond(audio_file):
    try:
        # Build the pipeline inside the GPU-decorated function so the model
        # loads onto the GPU allocated for this call.
        pipe = transformers.pipeline(
            model='sarvamai/shuka_v1',
            trust_remote_code=True,
            device=0,
            torch_dtype=torch.bfloat16
        )
        # Load the audio file, resampling to the 16 kHz the model expects
        audio, sr = librosa.load(audio_file, sr=16000)
        # Print audio properties for debugging
        print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")
        # Conversation turns: the <|audio|> placeholder marks where the clip goes
        turns = [
            {'role': 'system', 'content': 'Compile the information'},
            {'role': 'user', 'content': '<|audio|>'}
        ]
        # Debug: print the initial turns
        print(f"Initial turns: {turns}")
        # Call the model with the audio and prompt
        output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=1000)
        # Debug: print the final output from the model
        print(f"Model output: {output}")
        # The interface declares two outputs, so return two values: the
        # transcript text and a .txt file path for the download component.
        transcript = str(output)
        with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False, encoding='utf-8') as f:
            f.write(transcript)
            transcript_path = f.name
        return transcript, transcript_path
    except Exception as e:
        # Surface the error in the textbox; no file to offer on failure
        return f"Error: {str(e)}", None
iface = gr.Interface(
    fn=transcribe_and_respond,
    inputs=[
        gr.Audio(sources=["upload", "microphone"], type="filepath"),
    ],
    outputs=[
        gr.Textbox(label="Transcript"),
        gr.File(label="Download Transcript")
    ],
    title="ShukaNotesApp",
    description="Note Maker for Indian Offices and Their Many Languages.",
    live=True
)
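# Note: live=True re-runs the function whenever the input changes; with a
# GPU-backed pipeline, each trigger costs a fresh (up to 60 s) GPU allocation.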
if __name__ == "__main__":
    iface.launch()