mohan696matlab
edit:readme
f57aae5
raw
history blame
5.24 kB
import gradio as gr
import time
import numpy as np
import os
import requests
import io
from pydub import AudioSegment
def translate_audio(audio, SARVAM_API_KEY, timeout=30):
    """Translate the speech in ``audio`` to English text via the Sarvam AI API.

    The audio is exported to an in-memory WAV file and posted to the
    ``speech-to-text-translate`` endpoint using the ``saaras:v2`` model with
    speaker diarization disabled.

    Args:
        audio: Object exposing ``export(file_obj, format="wav")``; the caller
            in this file passes a pydub ``AudioSegment``.
        SARVAM_API_KEY: Subscription key sent in the
            ``api-subscription-key`` header.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        The ``transcript`` field of the JSON response, or ``""`` when the
        field is missing or null.

    Raises:
        ValueError: The API answered 401 or 403 (the key was rejected).
        RuntimeError: Any other status besides 200/201, or a success
            response whose body is not valid JSON.
        requests.exceptions.RequestException: Network failure or timeout.
    """
    api_url = "https://api.sarvam.ai/speech-to-text-translate"
    headers = {"api-subscription-key": SARVAM_API_KEY}
    model_data = {
        "model": "saaras:v2",  # Model used for translation
        "with_diarization": False,  # Set to True for speaker diarization
    }

    # The context manager closes the buffer even if export or the request fails.
    with io.BytesIO() as chunk_buffer:
        audio.export(chunk_buffer, format="wav")
        chunk_buffer.seek(0)  # Rewind so the upload starts at the WAV header
        files = {"file": ("audiofile.wav", chunk_buffer, "audio/wav")}
        response = requests.post(
            api_url,
            headers=headers,
            files=files,
            data=model_data,
            timeout=timeout,
        )

    if response.status_code in (401, 403):
        raise ValueError("❌ Invalid API key. Please check your Sarvam AI key.")
    if response.status_code not in (200, 201):
        raise RuntimeError(
            f"❌ Request failed with status code: {response.status_code}. "
            f"Details: {response.text}"
        )

    try:
        response_data = response.json()
    except ValueError as err:
        # JSON decode errors subclass ValueError; re-raise as RuntimeError so
        # callers that map ValueError to "invalid key" are not misled.
        raise RuntimeError(
            "❌ Sarvam AI returned a response that is not valid JSON."
        ) from err

    return response_data.get("transcript") or ""
def stream_transcribe(history, new_chunk, SARVAM_API_KEY):
    """Transcribe one streamed audio chunk and append it to the transcript.

    Args:
        history: Transcript accumulated so far; ``None`` is treated as ``""``.
        new_chunk: ``(sample_rate, samples)`` pair where ``samples`` is a
            NumPy array, or ``None`` when no audio is available.
        SARVAM_API_KEY: Key forwarded to ``translate_audio``.

    Returns:
        A 3-tuple ``(state, display_text, latency_text)``. On success the
        first two items are the updated transcript and the third is the
        elapsed time formatted with two decimals. On failure the transcript
        is left unchanged, ``display_text`` carries the error message and
        ``latency_text`` is ``"Invalid Key"`` (for ``ValueError``) or
        ``"Error"``. A missing or empty chunk is a no-op that reports
        ``"0.00"``.
    """
    start_time = time.perf_counter()
    if history is None:
        history = ""

    # No audio yet: keep the transcript as-is instead of surfacing an
    # unpacking error in the output box.
    if new_chunk is None:
        return history, history, "0.00"

    try:
        sample_rate, samples = new_chunk
        if samples is None or samples.size == 0:
            return history, history, "0.00"

        # Down-mix multi-channel audio to mono by averaging the channels.
        if samples.ndim > 1:
            samples = samples.mean(axis=1)

        # AudioSegment is built from 16-bit PCM below (sample_width=2).
        # NOTE(review): assumes samples are already in int16 range; float
        # audio normalised to [-1, 1] would truncate to silence - confirm.
        pcm = samples.astype(np.int16)
        audio_segment = AudioSegment(
            data=pcm.tobytes(),
            sample_width=2,
            frame_rate=sample_rate,
            channels=1,
        )

        transcription = translate_audio(audio_segment, SARVAM_API_KEY)
        latency = time.perf_counter() - start_time

        # Skip blank results and avoid a leading newline on the first line.
        text = (transcription or "").strip()
        if text:
            history = f"{history}\n{text}" if history else text
        return history, history, f"{latency:.2f}"
    except ValueError as ve:
        return history, str(ve), "Invalid Key"
    except Exception as e:
        print(f"Error during Transcription: {e}")
        return history, str(e), "Error"
def clear():
    """Return an empty string, used to blank the transcription textbox."""
    blank = ""
    return blank
def clear_state():
    """Return ``None``, used to reset the stored transcript state."""
    return
def clear_api_key():
    """Return an empty string, used to wipe the API key textbox."""
    return str()
# Build the streaming transcription UI: intro text, API key entry, a live
# audio input with transcript/latency outputs, and two reset buttons.
# The theme must be an instance (``Citrus()``), not the class itself.
with gr.Blocks(theme=gr.themes.Citrus()) as microphone:
    with gr.Column():
        gr.Markdown(
            """
### This app is designed to **transcribe and translate simultaneously from multiple Indian languages**. It supports **22 Indian languages**, including **Hindi, Oriya, Tamil, Telugu, Gujarati**, and more. It can **translate the transcribed text in real-time to English**, making it incredibly useful for multilingual audio processing.
### 🔑 Sarvam AI API Key Required
To use this app, you need a free API key from [Sarvam AI](https://sarvam.ai).
👉 **Step 1:** Visit [https://sarvam.ai](https://sarvam.ai)
👉 **Step 2:** Sign up or log in
👉 **Step 3:** Generate your API key and paste it below
Your key is used only to call the Sarvam AI API and is not stored by this app.
"""
        )
        api_key_box = gr.Textbox(label="Enter SARVAM AI API Key", type="password")

        with gr.Row():
            input_audio_microphone = gr.Audio(streaming=True)
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(
                label="Latency (seconds)", value="0.0", scale=0
            )

        with gr.Row():
            clear_button = gr.Button("Clear Output")
            clear_api_key_button = gr.Button("Clear API Key")

        # Per-session transcript accumulated across streamed chunks.
        state = gr.State(value="")

        # Thin forwarding wrapper; kept under its own name so the handler
        # registered with Gradio keeps the same function name as before.
        def wrapped_stream_transcribe(history, new_chunk, api_key):
            return stream_transcribe(history, new_chunk, api_key)

        # Each streamed chunk is sent together with the current transcript
        # and the API key; the handler returns the new transcript state, the
        # text to display, and the latency string.
        input_audio_microphone.stream(
            wrapped_stream_transcribe,
            [state, input_audio_microphone, api_key_box],
            [state, output, latency_textbox],
            time_limit=30,
            stream_every=5,
            concurrency_limit=None,
        )

        # Reset the stored transcript first, then blank the visible textbox.
        clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
        clear_api_key_button.click(clear_api_key, outputs=[api_key_box])

        gr.Markdown(
            """
---
### 👋 Who am I?
I'm **Dr. Mohan Dash**, a PhD in Industrial Computer Science and an AI Research Engineer.
I run a YouTube channel called **[Intelligent Machines](https://www.youtube.com/@Mohankumardash)** where I share practical tutorials and insights on building real-world AI applications.
If you find this app useful, you'll definitely enjoy the tutorials and breakdowns I post there.
"""
        )
# Alias the Blocks app as ``demo``.
demo = microphone

if __name__ == "__main__":
    # Start the web server only when this file is executed as a script, so
    # importing the module does not launch the app as a side effect.
    demo.launch()