File size: 4,657 Bytes
6af1e98 86a618f 6af1e98 86a618f 6af1e98 86a618f 6af1e98 86a618f 6af1e98 86a618f 6af1e98 394c2b1 6af1e98 86a618f 6af1e98 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
import gradio as gr
import time
import numpy as np
import os
import requests
import io
from pydub import AudioSegment
def translate_audio(audio, SARVAM_API_KEY):
    """Translate speech in an audio segment to English text via Sarvam AI.

    Exports the given pydub ``AudioSegment`` to an in-memory WAV stream and
    posts it to Sarvam AI's speech-to-text-translate endpoint.

    Args:
        audio: pydub ``AudioSegment`` containing the speech to translate.
        SARVAM_API_KEY: Sarvam AI subscription key used for authentication.

    Returns:
        The transcript string returned by the API (empty string if the
        response carries no ``transcript`` field).

    Raises:
        ValueError: if the API rejects the key (HTTP 401/403).
        RuntimeError: for any other non-success HTTP status.
        requests.RequestException: on network-level failures.
    """
    # API endpoint for speech-to-text translation
    api_url = "https://api.sarvam.ai/speech-to-text-translate"
    # Headers containing the API subscription key
    headers = {
        "api-subscription-key": SARVAM_API_KEY
    }
    # Data payload for the translation request
    model_data = {
        "model": "saaras:v2",        # Specify the model to be used
        "with_diarization": False    # Set to True for speaker diarization
    }
    # Serialize the audio to WAV in memory so nothing touches disk
    chunk_buffer = io.BytesIO()
    audio.export(chunk_buffer, format="wav")
    chunk_buffer.seek(0)  # Reset the pointer to the start of the stream
    # Prepare the file for the multipart API request
    files = {'file': ('audiofile.wav', chunk_buffer, 'audio/wav')}
    try:
        response = requests.post(api_url, headers=headers, files=files, data=model_data)
        if response.status_code in (200, 201):
            transcript = response.json().get("transcript", "")
        elif response.status_code in (401, 403):
            raise ValueError("β Invalid API key. Please check your Sarvam AI key.")
        else:
            raise RuntimeError(f"β Request failed with status code: {response.status_code}. Details: {response.text}")
    finally:
        # Always release the in-memory buffer, even when an exception is raised
        chunk_buffer.close()
    return transcript
def stream_transcribe(history, new_chunk, SARVAM_API_KEY):
    """Transcribe one streamed audio chunk and append it to the running text.

    Args:
        history: Accumulated transcription so far (``None`` on first call).
        new_chunk: Tuple ``(sample_rate, samples)`` as emitted by
            ``gr.Audio(streaming=True)``; ``samples`` is a NumPy array,
            possibly stereo. Assumes int16-range PCM samples — TODO confirm
            against the Gradio audio component's output dtype.
        SARVAM_API_KEY: Sarvam AI subscription key forwarded to the API call.

    Returns:
        Tuple ``(new_state, display_text, latency_text)``:
        on success the updated history twice plus the elapsed seconds;
        on failure the unchanged history, the error message, and a short
        status label.
    """
    start_time = time.time()
    if history is None:
        history = ""
    try:
        sr, y = new_chunk
        # Downmix to mono if the chunk is stereo/multi-channel
        if y.ndim > 1:
            y = y.mean(axis=1)
        # AudioSegment expects raw 16-bit PCM bytes
        y_int16 = y.astype(np.int16)
        audio_segment = AudioSegment(
            data=y_int16.tobytes(),
            sample_width=2,
            frame_rate=sr,
            channels=1
        )
        transcription = translate_audio(audio_segment, SARVAM_API_KEY)
        latency = time.time() - start_time
        # Join with a newline, but avoid a spurious leading newline on the
        # very first chunk (previously the output always started with '\n')
        history = f"{history}\n{transcription}" if history else transcription
        return history, history, f"{latency:.2f}"
    except ValueError as ve:
        # Raised by translate_audio on a rejected API key
        return history, str(ve), "Invalid Key"
    except Exception as e:
        print(f"Error during Transcription: {e}")
        return history, str(e), "Error"
def clear():
    """Return an empty string, used to reset the transcription textbox."""
    return ""
def clear_state():
    """Return ``None``, used to reset the Gradio session state."""
    return None
def clear_api_key():
    """Return an empty string, used to blank out the API-key textbox."""
    return ""
# --- Gradio UI: layout and event wiring for the streaming transcriber ---
with gr.Blocks(theme=gr.themes.Glass()) as microphone:
    with gr.Column():
        # Intro text: what the app does and how to obtain a Sarvam AI key
        gr.Markdown(
            """
            ### This app is designed to **transcribe and translate simultaneously from multiple Indian languages**. It supports **22 Indian languages**, including **Hindi, Oriya, Tamil, Telugu, Gujarati**, and more. It can **translate the transcribed text in real-time to English**, making it incredibly useful for multilingual audio processing.
            ### π Sarvam AI API Key Required
            To use this app, you need a free API key from [Sarvam AI](https://sarvam.ai).
            π **Step 1:** Visit [https://sarvam.ai](https://sarvam.ai)
            π **Step 2:** Sign up or log in
            π **Step 3:** Generate your API key and paste it below
            Your key stays on your device and is not stored.
            """
        )
        # Password-type textbox so the key is masked in the browser
        api_key_box = gr.Textbox(label="Enter SARVAM AI API Key", type="password")
        with gr.Row():
            # Streaming microphone input; chunks are fed to stream_transcribe
            input_audio_microphone = gr.Audio(streaming=True)
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
        with gr.Row():
            clear_button = gr.Button("Clear Output")
            clear_api_key_button = gr.Button("Clear API Key")
        # Per-session accumulated transcription text
        state = gr.State(value="")
        # Thin pass-through wrapper kept so the stream handler signature
        # matches the (state, audio, key) input list below
        def wrapped_stream_transcribe(history, new_chunk, api_key):
            return stream_transcribe(history, new_chunk, api_key)
        # Stream mic audio every 5 s (30 s cap per session) into the
        # transcriber; outputs update state, the transcript box, and latency
        input_audio_microphone.stream(
            wrapped_stream_transcribe,
            [state, input_audio_microphone, api_key_box],
            [state, output, latency_textbox],
            time_limit=30,
            stream_every=5,
            concurrency_limit=None,
        )
        # Reset the session state first, then blank the visible textbox
        clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
        clear_api_key_button.click(clear_api_key, outputs=[api_key_box])
demo = microphone
demo.launch()