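"""Gradio demo: record speech from the microphone and transcribe it with
sarvamai/shuka_v1, an audio-language model loaded via transformers.pipeline
with trust_remote_code. The @spaces.GPU decorator targets Hugging Face
ZeroGPU Spaces, which attach a GPU only for the duration of the decorated call."""
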
import transformers
import gradio as gr
import librosa
import torch
import spaces
import numpy as np

@spaces.GPU(duration=60)
def transcribe_and_respond(audio_file):
    try:
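        # The pipeline is built inside the GPU-decorated call so device=0 is
        # valid once ZeroGPU attaches a GPU; the trade-off is that the model
        # is reloaded on every request.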
        pipe = transformers.pipeline(
            model='sarvamai/shuka_v1',
            trust_remote_code=True,
            device=0,
            torch_dtype=torch.bfloat16
        )

        # Load the audio file, requesting a sample rate of 16000
        audio, sr = librosa.load(audio_file, sr=16000)
        
        # Convert the loaded audio to a contiguous float32 array
        audio = np.ascontiguousarray(audio, dtype=np.float32)
        
        # librosa.load returns mono by default, but guard anyway: multi-channel
        # audio comes back as (channels, samples), so average over axis 0
        if audio.ndim > 1:
            audio = np.mean(audio, axis=0)
        
        # Debug: Print audio properties
        print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")
        
        # librosa.load was asked for 16 kHz, but double-check and resample
        # defensively in case a decoding backend returned a different rate
        if sr != 16000:
            audio = librosa.resample(audio.astype(np.float32), orig_sr=sr, target_sr=16000)
            sr = 16000

        # Build the conversation turns; <|audio|> marks where the audio clip
        # is injected into the prompt
        turns = [
            {'role': 'system', 'content': 'Please transcribe the following audio exactly.'},
            {'role': 'user', 'content': '<|audio|>'}
        ]
        
        # Debug: Print the initial turns
        print(f"Initial turns: {turns}")
        
        # Call the model with the audio and prompt
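        # (With trust_remote_code the output format is model-defined; shuka_v1's
        # pipeline is expected to return the generated text directly.)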
        output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
        
        # Debug: Print the final output from the model
        print(f"Model output: {output}")
        
        return output

    except Exception as e:
        return f"Error: {str(e)}"

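# live=True re-runs the handler whenever the audio input changes; combined with
# reloading the model on each call this adds latency, so a submit button
# (live=False) may respond more predictably.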
iface = gr.Interface(
    fn=transcribe_and_respond,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs="text",
    title="Live Transcription and Response",
    description="Speak into your microphone, and the model will transcribe your speech.",
    live=True
)

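# When run directly (python app.py, or as the entry point of a Space),
# launch() starts the Gradio server.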
if __name__ == "__main__":
    iface.launch()