import io
import os
import time

import gradio as gr
import numpy as np
import requests
from pydub import AudioSegment

# Sarvam AI endpoint for speech-to-text translation.
SARVAM_API_URL = "https://api.sarvam.ai/speech-to-text-translate"

# Upper bound (seconds) on a single API call. Without a timeout, `requests`
# waits forever on a stalled connection and would block the streaming handler.
REQUEST_TIMEOUT_SECONDS = 30

# Shown both for a blank key (checked locally) and for a 401/403 response.
INVALID_API_KEY_MESSAGE = "❌ Invalid API key. Please check your Sarvam AI key."


def translate_audio(audio, language_code, SARVAM_API_KEY):
    """Send one audio segment to the Sarvam speech-to-text-translate API.

    Args:
        audio: Object exposing ``export(file_obj, format="wav")``; the caller
            in this file passes a ``pydub.AudioSegment``.
        language_code: Value forwarded as the ``language_code`` form field.
        SARVAM_API_KEY: Value sent in the ``api-subscription-key`` header.

    Returns:
        A ``(transcript, detected_language)`` tuple read from the
        ``transcript`` and ``language_code`` fields of the JSON response.

    Raises:
        ValueError: If the key is blank or the API answers 401/403.
        RuntimeError: If the API answers with any other non-200/201 status.
        requests.RequestException: On network failures, including timeouts.
    """
    # Fail fast on a blank key instead of spending a network round-trip on it.
    if not SARVAM_API_KEY or not str(SARVAM_API_KEY).strip():
        raise ValueError(INVALID_API_KEY_MESSAGE)

    headers = {"api-subscription-key": SARVAM_API_KEY}
    model_data = {
        "model": "saaras:v2",
        "with_diarization": False,  # Set to True for speaker diarization
        "language_code": language_code,
    }

    # The context manager closes the in-memory WAV buffer on every exit path.
    with io.BytesIO() as chunk_buffer:
        audio.export(chunk_buffer, format="wav")
        chunk_buffer.seek(0)  # Rewind so the upload starts at the WAV header
        files = {"file": ("audiofile.wav", chunk_buffer, "audio/wav")}
        response = requests.post(
            SARVAM_API_URL,
            headers=headers,
            files=files,
            data=model_data,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )

    if response.status_code in (401, 403):
        raise ValueError(INVALID_API_KEY_MESSAGE)
    if response.status_code not in (200, 201):
        raise RuntimeError(
            f"❌ Request failed with status code: {response.status_code}. "
            f"Details: {response.text}"
        )

    response_data = response.json()
    # `or ""` also covers an explicit JSON null, which would otherwise break
    # the string formatting done by the caller.
    transcript = response_data.get("transcript") or ""
    detected_language = response_data.get("language_code", "")
    return transcript, detected_language


def stream_transcribe(history, new_chunk, language_code, SARVAM_API_KEY):
    """Translate one streamed microphone chunk and append it to the history.

    Args:
        history: Accumulated transcript text, or ``None`` (treated as empty).
        new_chunk: ``(sample_rate, samples)`` pair where ``samples`` is a
            NumPy array, or ``None`` when there is no audio to process.
        language_code: Language code forwarded to ``translate_audio``.
        SARVAM_API_KEY: API key forwarded to ``translate_audio``.

    Returns:
        A ``(state, output_text)`` tuple. On success both items are the
        updated history. On failure the state is the unchanged history and
        the output text is the error message.
    """
    if history is None:
        history = ""

    # No audio in this event: leave the transcript as it is instead of
    # failing on the tuple unpacking below.
    if new_chunk is None:
        return history, history

    try:
        sr, y = new_chunk

        # Down-mix multi-channel audio to mono by averaging the channels.
        if y.ndim > 1:
            y = y.mean(axis=1)

        # AudioSegment is built from raw 16-bit PCM bytes (sample_width=2).
        y_int16 = y.astype(np.int16)
        audio_segment = AudioSegment(
            data=y_int16.tobytes(),
            sample_width=2,
            frame_rate=sr,
            channels=1,
        )

        transcription, detected_language = translate_audio(
            audio_segment, language_code, SARVAM_API_KEY
        )

        entry = f"({detected_language})==> {transcription}"
        # Only separate with a newline when there is earlier text, so the
        # first entry does not start with a blank line.
        history = f"{history}\n{entry}" if history else entry
        return history, history
    except ValueError as ve:
        return history, str(ve)
    except Exception as e:
        # Top-level UI boundary: report the failure in the output box rather
        # than letting the streaming event crash.
        print(f"Error during Transcription: {e}")
        return history, str(e)


def clear():
    """Return an empty string, used to blank the transcription textbox."""
    return ""


def clear_state():
    """Return ``None``, used to reset the accumulated transcript state."""
    return None


def clear_api_key():
    """Return an empty string, used to blank the API key textbox."""
    return ""


with gr.Blocks(theme=gr.themes.Soft()) as microphone:
    with gr.Column():
        # NOTE(review): the text below says the key stays on the user's
        # device, but the key is passed to `stream_transcribe` and sent to the
        # Sarvam API in a request header — confirm the wording is intended.
        gr.Markdown(
            """
            ## Translate simultaneously from multiple Indian languages to **English**.

            ### It supports **22 Indian languages**, including **Hindi, Oriya, Tamil, Telugu, Gujarati**, and more.

            ### 🔑 Sarvam AI API Key Required
            To use this app, you need a free API key from [Sarvam AI](https://sarvam.ai).

            👉 **Step 1:** Visit [https://sarvam.ai](https://sarvam.ai)
            👉 **Step 2:** Sign up or log in
            👉 **Step 3:** Generate your API key and paste it below

            Your key stays on your device and is not stored.
            """
        )

        api_key_box = gr.Textbox(label="Enter SARVAM AI API Key", type="password")

        language_options = [
            "hi-IN", "bn-IN", "kn-IN", "ml-IN", "mr-IN", "od-IN",
            "pa-IN", "ta-IN", "te-IN", "en-IN", "gu-IN", "unknown",
        ]
        language_code_box = gr.Dropdown(
            choices=language_options,
            label="Select Language Code",
            value="unknown",  # optional: default selected value
        )

        input_audio_microphone = gr.Audio(streaming=True)
        output = gr.Textbox(
            label="Transcription",
            lines=10,
            max_lines=100,
            show_copy_button=True,
            value="",
        )

        with gr.Row():
            clear_button = gr.Button("Clear Output")
            clear_api_key_button = gr.Button("Clear API Key")

        # Holds the accumulated transcript between streaming events.
        state = gr.State(value="")

        input_audio_microphone.stream(
            stream_transcribe,
            [state, input_audio_microphone, language_code_box, api_key_box],
            [state, output],
            time_limit=30,
            stream_every=5,
            concurrency_limit=None,
        )

        # Reset the stored history first, then blank the visible textbox.
        clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
        clear_api_key_button.click(clear_api_key, outputs=[api_key_box])

        gr.Markdown(
            """
            ---
            ### 👋 Who am I?

            I am **Dr. Mohan Dash**, a PhD in Industrial Computer Science and an AI Research Engineer.
            I run a YouTube channel called **[Intelligent Machines](https://www.youtube.com/@Mohankumardash)** where I share practical tutorials and insights on building real-world AI applications.

            If you find this app useful, you'll definitely enjoy the tutorials and breakdowns I post there.

            ![YouTube Channel](https://yt3.googleusercontent.com/UYcIFCkqev-zwJemtbOPmmOzRU26gk-hetSSU18GWO-1wBbGHd7pjx5oTsz4x1sJ8riWg35TQw=w1707-fcrop64=1,00005a57ffffa5a8-k-c0xffffffff-no-nd-rj)

            ---
            """
        )

demo = microphone
demo.launch()