Spaces:

Mohan-diffuser
/

indian-asr-with-sarvam

Running

File size: 5,942 Bytes

import gradio as gr
import time  
import numpy as np
import os
import requests
import io
from pydub import AudioSegment



def translate_audio(audio, language_code, SARVAM_API_KEY, timeout=30):
    """Send one audio segment to the Sarvam AI speech-to-text-translate API.

    Args:
        audio: Object exposing ``export(file_obj, format="wav")``; the caller
            in this file passes a ``pydub.AudioSegment``.
        language_code: Value sent as the ``language_code`` form field.
        SARVAM_API_KEY: Key sent in the ``api-subscription-key`` header.
            Surrounding whitespace is stripped before use.
        timeout: Seconds handed to ``requests.post`` so a stalled connection
            cannot block the caller forever.

    Returns:
        A ``(transcript, detected_language)`` tuple read from the
        ``"transcript"`` and ``"language_code"`` fields of the JSON response.
        A missing or null field is returned as ``""``.

    Raises:
        ValueError: If the key is blank, or the API answers 401/403.
        RuntimeError: If the API answers with any other status than 200/201.
        requests.exceptions.RequestException: On network failures, including
            the timeout expiring.
    """
    # A blank key can never authenticate; fail with the same message the
    # 401/403 branch uses instead of spending a network round trip on it.
    api_key = (SARVAM_API_KEY or "").strip()
    if not api_key:
        raise ValueError("❌ Invalid API key. Please check your Sarvam AI key.")

    # API endpoint for speech-to-text translation
    api_url = "https://api.sarvam.ai/speech-to-text-translate"

    # Headers containing the API subscription key
    headers = {"api-subscription-key": api_key}

    # Data payload for the translation request
    model_data = {
        "model": "saaras:v2",  # Specify the model to be used
        "with_diarization": False,  # Set to True for speaker diarization
        "language_code": language_code,
    }

    # The context manager closes the in-memory WAV buffer on every exit path.
    with io.BytesIO() as wav_buffer:
        audio.export(wav_buffer, format="wav")
        wav_buffer.seek(0)  # Reset the pointer to the start of the stream

        # Prepare the file for the API request
        files = {'file': ('audiofile.wav', wav_buffer, 'audio/wav')}
        response = requests.post(
            api_url,
            headers=headers,
            files=files,
            data=model_data,
            timeout=timeout,
        )

    if response.status_code in (401, 403):
        raise ValueError("❌ Invalid API key. Please check your Sarvam AI key.")
    if response.status_code not in (200, 201):
        raise RuntimeError(f"❌ Request failed with status code: {response.status_code}. Details: {response.text}")

    response_data = response.json()
    # `or ""` also covers a JSON null, which would otherwise reach the caller
    # as None and break its string concatenation.
    transcript = response_data.get("transcript") or ""
    detected_language = response_data.get("language_code") or ""
    return transcript, detected_language

def stream_transcribe(history, new_chunk, language_code, SARVAM_API_KEY):
    """Transcribe one streamed audio chunk and append it to the transcript.

    Args:
        history: Transcript accumulated so far, or ``None`` (treated as "").
        new_chunk: ``(sample_rate, samples)`` tuple where ``samples`` is a
            NumPy array, or ``None`` when there is nothing to process.
        language_code: Forwarded unchanged to ``translate_audio``.
        SARVAM_API_KEY: Forwarded unchanged to ``translate_audio``.

    Returns:
        A ``(state, display_text)`` tuple. On success both items are the
        updated transcript. On failure the state keeps the previous transcript
        and the display text is the error message.
    """
    if history is None:
        history = ""

    # A missing chunk has nothing to transcribe; return the transcript as-is
    # instead of surfacing an unpacking error to the user.
    if new_chunk is None:
        return history, history

    try:
        sr, y = new_chunk

        # An empty chunk carries no audio, so skip the API call entirely.
        if y is None or np.size(y) == 0:
            return history, history

        # Convert to mono if stereo
        if y.ndim > 1:
            y = y.mean(axis=1)

        # Convert to int16 for AudioSegment
        y_int16 = y.astype(np.int16)

        # Create AudioSegment from raw PCM data (2 bytes per int16 sample)
        audio_segment = AudioSegment(
            data=y_int16.tobytes(),
            sample_width=2,
            frame_rate=sr,
            channels=1,
        )

        transcription, detected_language = translate_audio(
            audio_segment, language_code, SARVAM_API_KEY
        )

        entry = f'({detected_language})==> ' + transcription
        # Only insert a separator once there is earlier text, so the
        # transcript does not begin with a blank line.
        history = f"{history}\n{entry}" if history else entry

        return history, history
    except ValueError as ve:
        # User-correctable problems (e.g. a rejected API key): show them
        # without logging.
        return history, str(ve)
    except Exception as e:
        print(f"Error during Transcription: {e}")
        return history, str(e)




def clear() -> str:
    """Return the empty string used to blank the transcription textbox."""
    blank = ""
    return blank

def clear_state() -> None:
    """Return ``None`` so the stored transcript state is reset."""
    reset_value = None
    return reset_value

def clear_api_key() -> str:
    """Return the empty string used to wipe the API key textbox."""
    emptied = ""
    return emptied


# UI definition. Components are declared in the order they should appear on
# the page, so the statement order below is significant.
with gr.Blocks(theme=gr.themes.Soft()) as microphone:
    with gr.Column():

        # Intro and API-key instructions shown at the top of the page.
        # The privacy sentence describes what the code does: the key is an
        # input to stream_transcribe, which forwards it to the Sarvam API.
        gr.Markdown(
                    """
                    ## Translate simultaneously from multiple Indian languages to **English**. 
                    ### It supports **22 Indian languages**, including **Hindi, Oriya, Tamil, Telugu, Gujarati**, and more. 

                    ### 🔑 Sarvam AI API Key Required
                    To use this app, you need a free API key from [Sarvam AI](https://sarvam.ai).
                    
                    👉 **Step 1:** Visit [https://sarvam.ai](https://sarvam.ai)  
                    👉 **Step 2:** Sign up or log in  
                    👉 **Step 3:** Generate your API key and paste it below

                    Your key is sent to this app only to call the Sarvam AI API for each request; the app does not store it.
                    """
                        )

        # Password-type textbox so the key is masked while typing.
        api_key_box = gr.Textbox(label="Enter SARVAM AI API Key", type="password")

        # Language codes offered to the user; the default is "unknown".
        language_options = [
                                "hi-IN", "bn-IN", "kn-IN", "ml-IN", "mr-IN", "od-IN",
                                "pa-IN", "ta-IN", "te-IN", "en-IN", "gu-IN", "unknown"
                            ]
        language_code_box = gr.Dropdown(
                                    choices=language_options,
                                    label="Select Language Code",
                                    value="unknown"  # optional: default selected value
                                )

        # Streaming audio input and the textbox that shows the running transcript.
        input_audio_microphone = gr.Audio(streaming=True)
        output = gr.Textbox(label="Transcription", lines=10,max_lines=100, show_copy_button=True, value="")

        with gr.Row():
            clear_button = gr.Button("Clear Output")
            clear_api_key_button = gr.Button("Clear API Key")

        # Holds the accumulated transcript between stream callbacks.
        state = gr.State(value="")

        # Each streamed chunk goes to stream_transcribe together with the
        # current transcript, language code and API key; the handler returns
        # the new state and the text to display.
        input_audio_microphone.stream(
            stream_transcribe,
            [state, input_audio_microphone,language_code_box, api_key_box],
            [state, output],
            time_limit=30,
            stream_every=5,
            concurrency_limit=None,
        )

        # Reset the stored transcript first, then blank the visible textbox.
        clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
        clear_api_key_button.click(clear_api_key, outputs=[api_key_box])

        # Author blurb shown at the bottom of the page.
        gr.Markdown(
                    """
                    ---

                    ### 👋 Who am I?

                    I am **Dr. Mohan Dash**, a PhD in Industrial Computer Science and an AI Research Engineer.  
                    I run a YouTube channel called **[Intelligent Machines](https://www.youtube.com/@Mohankumardash)** where I share practical tutorials and insights on building real-world AI applications.

                    If you find this app useful, you'll definitely enjoy the tutorials and breakdowns I post there.
                    ![YouTube Channel](https://yt3.googleusercontent.com/UYcIFCkqev-zwJemtbOPmmOzRU26gk-hetSSU18GWO-1wBbGHd7pjx5oTsz4x1sJ8riWg35TQw=w1707-fcrop64=1,00005a57ffffa5a8-k-c0xffffffff-no-nd-rj)

                    ---
                    """
                        )

# Module-level alias for the Blocks app defined above.
demo = microphone

# Start the server only when the file is executed as a script, so importing
# this module does not launch a blocking web server.
if __name__ == "__main__":
    demo.launch()