File size: 3,681 Bytes
2ed7223
2ba8923
c621812
2f9ee0a
62ab8e6
7092158
011a958
2fe8145
 
 
 
 
 
 
7092158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62ab8e6
60f64df
dc03737
7092158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee53056
2f9ee0a
 
 
 
 
7092158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f9ee0a
 
d1e68ca
ee53056
3d72142
2ed7223
ab07d9e
60f64df
2ed7223
2f9ee0a
 
d1e68ca
7092158
 
 
 
 
 
 
d1e68ca
ed509ec
7092158
 
 
 
 
 
d1e68ca
3d72142
2f9ee0a
2ed7223
c621812
d1e68ca
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import gradio as gr
import librosa
import torch
from transformers import pipeline
import spaces
import numpy as np

# Initialize model once at startup
# Load SarvamAI's Shuka v1 voice model a single time at import so every request
# reuses the same pipeline instance. trust_remote_code=True is required because
# the model repository ships custom pipeline code. device=0 selects the first
# GPU when CUDA is available, -1 falls back to CPU.
pipe = pipeline(
    model="sarvamai/shuka_v1",
    trust_remote_code=True,
    device=0 if torch.cuda.is_available() else -1
)

def preprocess_audio(audio, sr):
    """Clean a raw audio signal for speech-model input.

    Steps: downmix to mono, peak-normalize, trim leading/trailing silence,
    and apply pre-emphasis as a simple high-pass filter.

    Parameters
    ----------
    audio : np.ndarray
        Audio samples, mono ``(n,)`` or multi-channel ``(channels, n)``.
    sr : int
        Sample rate of ``audio`` in Hz; passed through unchanged.

    Returns
    -------
    tuple[np.ndarray, int]
        The processed mono signal and the (unchanged) sample rate.
    """
    # Downmix to mono FIRST so every later step sees a single channel.
    # (The original ran this check last, after the per-channel processing,
    # so stereo input was normalized/trimmed/filtered before being mixed down.)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio)

    # Scale peak amplitude to 1.0 so trimming and filtering see a consistent level.
    audio = librosa.util.normalize(audio)

    # Strip leading/trailing spans quieter than 20 dB below peak.
    audio, _ = librosa.effects.trim(audio, top_db=20)

    # Pre-emphasis boosts high frequencies — a cheap high-pass-style noise reduction.
    audio = librosa.effects.preemphasis(audio)

    return audio, sr

@spaces.GPU
def transcribe_and_respond(audio_file):
    """Load a recorded/uploaded clip, preprocess it, and ask Shuka v1 for
    pronunciation feedback.

    Parameters
    ----------
    audio_file : str | None
        Filesystem path to the audio (Gradio ``type="filepath"``); ``None``
        when the audio component is cleared.

    Returns
    -------
    str | object
        The pipeline output on success, an ``"Error: ..."`` string on any
        failure, or ``""`` when no audio is present.
    """
    # Gradio's change event also fires with None when the input is cleared;
    # without this guard librosa.load(None) raises and the user sees a
    # confusing error message for simply clearing the recording.
    if audio_file is None:
        return ""

    try:
        # Load audio with higher quality settings
        audio, sr = librosa.load(
            audio_file,
            sr=16000,  # Standard sample rate for speech
            mono=True,  # Ensure mono audio
            res_type='kaiser_best'  # High-quality resampling
        )

        # Preprocess audio (mono downmix, normalize, trim silence, pre-emphasis)
        audio, sr = preprocess_audio(audio, sr)

        # Reject clips outside the 0.5 s – 30 s window the UI promises.
        if len(audio) < sr * 0.5:  # Less than 0.5 seconds
            return "Error: Audio is too short. Please speak for at least 0.5 seconds."
        if len(audio) > sr * 30:  # More than 30 seconds
            return "Error: Audio is too long. Please keep it under 30 seconds."

        # Shuka expects a dict with the waveform, its rate, and chat-style
        # "turns"; the "<|audio|>" placeholder marks where the audio is
        # injected into the conversation.
        output = pipe({
            "audio": audio,
            "sampling_rate": sr,
            "turns": [
                {"role": "system", "content": """You are an expert English pronunciation teacher specializing in teaching Indian English learners. Your role is to:
                    1. Listen carefully to the student's pronunciation
                    2. Provide specific feedback on pronunciation accuracy
                    3. Break down difficult words into syllables
                    4. Explain the correct mouth positions and sounds
                    5. Use simple, clear language
                    6. Be encouraging and supportive
                    7. Focus on common Indian English pronunciation challenges
                    8. Provide examples of correct pronunciation

                    Format your response in this structure:
                    - What you heard
                    - Specific pronunciation feedback
                    - Tips for improvement
                    - Example words to practice"""},
                {"role": "user", "content": "<|audio|>"}
            ]
        }, max_new_tokens=256)

        return output

    except Exception as e:
        # Surface the failure in the UI textbox instead of crashing the app.
        return f"Error: {str(e)}"

# Gradio interface
# Two-column layout: audio input (mic or file upload) on the left, the model's
# text response on the right. WAV is requested from the component for lossless
# capture before librosa resamples to 16 kHz.
with gr.Blocks(title="Shuka v1 Transcription") as iface:
    gr.Markdown("## Shuka v1 - Voice Transcription")
    gr.Markdown("""Upload or speak, and the model will respond naturally using SarvamAI's voice foundation model.
    
Tips for best results:
- Speak clearly and at a moderate pace
- Keep background noise to a minimum
- Maintain a distance of 6-12 inches from the microphone
- Speak for at least 0.5 seconds but no more than 30 seconds""")
    
    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="Audio Input",
            format="wav"  # Ensure WAV format for best quality
        )
        text_output = gr.Textbox(label="Model Response", placeholder="Response will appear here...")

    # NOTE(review): change fires on every value change, including clearing the
    # input — transcribe_and_respond must therefore tolerate a None filepath.
    audio_input.change(fn=transcribe_and_respond, inputs=audio_input, outputs=text_output)

# Launch the app only when run as a script (not when imported, e.g. by Spaces).
if __name__ == "__main__":
    iface.launch()