import gradio as gr
import librosa
import torch
from transformers import pipeline
import spaces
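
# Load SarvamAI's Shuka v1 audio language model; trust_remote_code pulls in the
# custom pipeline code that ships with the model. Use the first GPU if present.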
pipe = pipeline(
    model="sarvamai/shuka_v1",
    trust_remote_code=True,
    device=0 if torch.cuda.is_available() else -1,
)

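
# Light signal cleanup before inference: mono downmix, peak normalization,
# silence trimming, and pre-emphasis filtering.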
def preprocess_audio(audio, sr):
    # Downmix first so every later step sees a 1-D signal.
    if audio.ndim > 1:
        audio = librosa.to_mono(audio)

    # Peak-normalize to the [-1, 1] range.
    audio = librosa.util.normalize(audio)

    # Trim leading/trailing silence more than 20 dB below peak.
    audio, _ = librosa.effects.trim(audio, top_db=20)

    # Apply a pre-emphasis filter to boost high-frequency content.
    audio = librosa.effects.preemphasis(audio)

    return audio, sr

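
# On Hugging Face Spaces, @spaces.GPU requests ZeroGPU hardware for the
# duration of each call to this function.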
@spaces.GPU
def transcribe_and_respond(audio_file):
    try:
        # Decode and resample to 16 kHz mono with high-quality resampling.
        audio, sr = librosa.load(
            audio_file,
            sr=16000,
            mono=True,
            res_type='kaiser_best'
        )

        audio, sr = preprocess_audio(audio, sr)

        # Reject clips too short or too long to yield useful feedback.
        if len(audio) < sr * 0.5:
            return "Error: Audio is too short. Please speak for at least 0.5 seconds."
        if len(audio) > sr * 30:
            return "Error: Audio is too long. Please keep it under 30 seconds."
        output = pipe({
            "audio": audio,
            "sampling_rate": sr,
            "turns": [
                {"role": "system", "content": """You are an expert English pronunciation teacher specializing in teaching Indian English learners. Your role is to:
1. Listen carefully to the student's pronunciation
2. Provide specific feedback on pronunciation accuracy
3. Break down difficult words into syllables
4. Explain the correct mouth positions and sounds
5. Use simple, clear language
6. Be encouraging and supportive
7. Focus on common Indian English pronunciation challenges
8. Provide examples of correct pronunciation

Format your response in this structure:
- What you heard
- Specific pronunciation feedback
- Tips for improvement
- Example words to practice"""},
                {"role": "user", "content": "<|audio|>"}
            ]
        }, max_new_tokens=256)

        return output

    except Exception as e:
        return f"Error: {str(e)}"

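
# Assemble the Gradio interface: audio in, text feedback out.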
with gr.Blocks(title="Shuka v1 Pronunciation Coach") as iface:
    gr.Markdown("## Shuka v1 - Pronunciation Coach")
    gr.Markdown("""Record or upload a short English sample, and SarvamAI's Shuka v1 voice foundation model will respond with pronunciation feedback.

Tips for best results:
- Speak clearly and at a moderate pace
- Keep background noise to a minimum
- Maintain a distance of 6-12 inches from the microphone
- Speak for at least 0.5 seconds but no more than 30 seconds""")

    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="Audio Input",
            format="wav"
        )
        text_output = gr.Textbox(label="Model Response", placeholder="Response will appear here...")

    # Run the model whenever a new recording or upload arrives in the widget.
    audio_input.change(fn=transcribe_and_respond, inputs=audio_input, outputs=text_output)

if __name__ == "__main__":
    iface.launch()