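"""Gradio demo: prompt Sarvam AI's shuka_v1 audio language model to
transcribe speech recorded from the microphone."""
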
import transformers
import gradio as gr
import librosa
import torch
import spaces
import numpy as np

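# ZeroGPU: each call to this function gets a GPU allocation of up to 60 s.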
@spaces.GPU(duration=60)
def transcribe_audio(audio_file):
    # Gradio passes None before any recording exists, so bail out early.
    if audio_file is None:
        return ""
    try:
        # Build the pipeline inside the GPU-decorated call so the model is
        # placed on the allocated device. This reloads the weights on every
        # request, trading latency for simplicity.
        pipe = transformers.pipeline(
            model='sarvamai/shuka_v1',
            trust_remote_code=True,
            device=0,
            torch_dtype=torch.bfloat16
        )

        # Load the audio resampled to the 16 kHz rate the model expects
        audio, sr = librosa.load(audio_file, sr=16000)

        # librosa already returns mono float32 by default; keep these as
        # defensive checks in case the loading behaviour changes.
        audio = np.asarray(audio, dtype=np.float32)
        if audio.ndim > 1:
            audio = np.mean(audio, axis=-1)

        # Debug: Print audio properties for troubleshooting
        print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")

        # Conversation turns: the system prompt asks for a verbatim
        # transcription, and <|audio|> marks where the clip is injected.
        turns = [
            {'role': 'system', 'content': 'Please transcribe the following audio exactly.'},
            {'role': 'user', 'content': '<|audio|>'}
        ]
        
        # Debug: Print the initial turns
        print(f"Initial turns: {turns}")

        # Call the model with the audio and transcription prompt
        output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
        
        # Debug: Print the final output from the model
        print(f"Model output: {output}")

        return output

    except Exception as e:
        return f"Error: {str(e)}"

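# Minimal UI: a microphone recorder wired straight to the transcription function.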
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="text",
    title="Shuka ASR Demo",
    description="Speak into your microphone, and the model will transcribe your speech.",
    live=True  # re-run automatically whenever the recording changes
)

if __name__ == "__main__":
    iface.launch()