shubham24 commited on
Commit
2f9ee0a
·
1 Parent(s): 3d72142

share true

Browse files
Files changed (1) hide show
  1. app.py +19 -35
app.py CHANGED
@@ -1,61 +1,45 @@
1
- import transformers
2
  import gradio as gr
3
  import librosa
4
  import torch
5
- import numpy as np
6
  import spaces
7
 
8
@spaces.GPU
def transcribe_and_respond(audio_file):
    """Transcribe/respond to an audio clip with SarvamAI's Shuka v1 model.

    Parameters
    ----------
    audio_file : str
        Filesystem path to the uploaded or recorded audio.

    Returns
    -------
    The raw pipeline output on success, or an ``"Error: ..."`` string if
    anything raises along the way.
    """
    try:
        # Decode the clip to a 16 kHz waveform (the rate the model expects).
        waveform, rate = librosa.load(audio_file, sr=16000)

        # Build the Shuka v1 pipeline; the repo ships custom code, hence
        # trust_remote_code. Device 0 = first GPU, -1 = CPU fallback.
        shuka = transformers.pipeline(
            model="sarvamai/shuka_v1",
            trust_remote_code=True,
            device=0 if torch.cuda.is_available() else -1,
        )

        # Chat-style turns; "<|audio|>" marks where the audio is injected.
        conversation = [
            {"role": "system", "content": "Respond naturally and informatively."},
            {"role": "user", "content": "<|audio|>"},
        ]

        return shuka(
            {"audio": waveform, "sampling_rate": rate, "turns": conversation},
            max_new_tokens=256,
        )

    except Exception as e:
        # Report the failure to the UI instead of crashing the Space.
        return f"Error: {str(e)}"
37
 
38
# Gradio UI: one row with an audio source on the left and the model's
# reply on the right; inference fires whenever the audio input changes.
with gr.Blocks(title="Live Transcription with Shuka v1") as iface:
    gr.Markdown("## 🎙️ Live Transcription with Shuka v1")
    gr.Markdown("Upload or speak, and the model will respond naturally using SarvamAI's voice foundation model.")

    with gr.Row():
        audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", label="🎧 Audio Input")
        text_output = gr.Textbox(label="📝 Model Response", placeholder="Response will appear here...")

    # Wire the callback: new/changed audio -> transcription shown in the textbox.
    audio_input.change(fn=transcribe_and_respond, inputs=audio_input, outputs=text_output)

if __name__ == "__main__":
    # share=True exposes a public gradio.live URL in addition to localhost.
    iface.launch(share=True)
 
 
1
  import gradio as gr
2
  import librosa
3
  import torch
4
+ from transformers import pipeline
5
  import spaces
6
 
7
# Cache for the loaded pipeline. Building it downloads and materializes the
# whole model, so doing it once per process (instead of once per request, as
# before) removes the dominant per-call cost.
_PIPE = None


def _get_pipe():
    """Return the Shuka v1 pipeline, creating it lazily on first use."""
    global _PIPE
    if _PIPE is None:
        _PIPE = pipeline(
            model="sarvamai/shuka_v1",
            # The model repo ships custom pipeline code.
            trust_remote_code=True,
            # 0 = first GPU when available; -1 is transformers' CPU sentinel.
            device=0 if torch.cuda.is_available() else -1,
        )
    return _PIPE


@spaces.GPU
def transcribe_and_respond(audio_file):
    """Run Shuka v1 on an audio file and return its response.

    Parameters
    ----------
    audio_file : str
        Filesystem path to the uploaded or recorded clip (gradio ``filepath``).

    Returns
    -------
    The pipeline's raw output on success, or an ``"Error: ..."`` string on
    any failure (gradio renders either via ``str``).
    """
    try:
        # Resample to 16 kHz, the rate the model expects.
        audio, sr = librosa.load(audio_file, sr=16000)

        # Shuka's expected input: waveform + chat turns, with "<|audio|>"
        # marking where the audio is injected.
        # NOTE(review): the previous revision passed max_new_tokens=256 here;
        # it now falls back to the model's default generation length — confirm
        # this was intentional.
        output = _get_pipe()({
            "audio": audio,
            "sampling_rate": sr,
            "turns": [
                {"role": "system", "content": "Respond naturally and informatively."},
                {"role": "user", "content": "<|audio|>"},
            ],
        })

        return output

    except Exception as e:
        # Surface the failure in the UI instead of crashing the Space.
        return f"Error: {str(e)}"
34
 
35
# Gradio interface: audio in, model response out; inference runs whenever
# the audio input changes.
with gr.Blocks(title="Shuka v1 Transcription") as iface:
    gr.Markdown("## 🎙️ Shuka v1 - Voice Transcription")

    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="Audio",
        )
        text_output = gr.Textbox(label="Response")

    audio_input.change(
        fn=transcribe_and_respond,
        inputs=audio_input,
        outputs=text_output,
    )

if __name__ == "__main__":
    # share=True exposes a public gradio.live URL in addition to localhost.
    iface.launch(share=True)