# mohan696matlab
# citrus theme
# c36c82b
# raw
# history blame
# 4.66 kB
import gradio as gr
import time
import numpy as np
import os
import requests
import io
from pydub import AudioSegment
def translate_audio(audio, SARVAM_API_KEY):
    """Translate an audio segment to English text via Sarvam AI.

    Exports the given pydub ``AudioSegment`` to an in-memory WAV file and
    posts it to Sarvam AI's speech-to-text-translate endpoint.

    Args:
        audio: pydub AudioSegment holding the speech to translate.
        SARVAM_API_KEY: Sarvam AI subscription key used for authentication.

    Returns:
        The translated transcript string ("" if the API returns none).

    Raises:
        ValueError: on HTTP 401/403 (invalid/unauthorized API key).
        RuntimeError: on any other non-success HTTP status.
        requests.RequestException: on network-level failures.
    """
    # API endpoint for speech-to-text translation
    api_url = "https://api.sarvam.ai/speech-to-text-translate"
    # Headers containing the API subscription key
    headers = {
        "api-subscription-key": SARVAM_API_KEY
    }
    # Data payload for the translation request
    model_data = {
        "model": "saaras:v2",  # Specify the model to be used
        "with_diarization": False  # Set to True for speaker diarization
    }
    # Serialize the segment to WAV in memory; the context manager guarantees
    # the buffer is released even if export or the request fails.
    with io.BytesIO() as chunk_buffer:
        audio.export(chunk_buffer, format="wav")
        chunk_buffer.seek(0)  # Rewind so requests reads from the start
        files = {'file': ('audiofile.wav', chunk_buffer, 'audio/wav')}
        response = requests.post(api_url, headers=headers, files=files, data=model_data)
    # Map HTTP status to result / caller-visible exceptions. The messages are
    # displayed verbatim in the UI by stream_transcribe, so keep them stable.
    if response.status_code in (200, 201):
        return response.json().get("transcript", "")
    if response.status_code in (401, 403):
        raise ValueError("❌ Invalid API key. Please check your Sarvam AI key.")
    raise RuntimeError(f"❌ Request failed with status code: {response.status_code}. Details: {response.text}")
def stream_transcribe(history, new_chunk, SARVAM_API_KEY):
    """Process one streamed microphone chunk and append its translation.

    Args:
        history: Accumulated transcript so far (None on the first chunk).
        new_chunk: Tuple of (sample_rate, numpy samples) from gr.Audio.
        SARVAM_API_KEY: Sarvam AI subscription key, forwarded to the API call.

    Returns:
        (new_history, display_text, latency_label) — on an invalid key or any
        other failure the history is left unchanged and the error text is
        shown instead of the transcript.
    """
    start_time = time.time()
    history = "" if history is None else history
    try:
        sample_rate, samples = new_chunk
        # Stereo input: average the channels down to mono.
        if samples.ndim > 1:
            samples = samples.mean(axis=1)
        # pydub expects 16-bit PCM bytes (sample_width=2).
        pcm = samples.astype(np.int16)
        segment = AudioSegment(
            data=pcm.tobytes(),
            sample_width=2,
            frame_rate=sample_rate,
            channels=1,
        )
        transcription = translate_audio(segment, SARVAM_API_KEY)
        latency = time.time() - start_time
        history = history + '\n' + transcription
        return history, history, f"{latency:.2f}"
    except ValueError as ve:
        # Invalid API key raised by translate_audio — surface it to the UI.
        return history, str(ve), "Invalid Key"
    except Exception as e:
        print(f"Error during Transcription: {e}")
        return history, str(e), "Error"
def clear():
    """Reset value for the transcription textbox: an empty string."""
    return ""
def clear_state():
    """Reset value for the gr.State holding the transcript history."""
    return None
def clear_api_key():
    """Reset value for the API-key textbox: an empty string."""
    return ""
# ---------------------------------------------------------------------------
# Gradio UI: streaming microphone transcription/translation demo.
# NOTE(review): `theme=gr.themes.Citrus` passes the theme *class*; Gradio
# examples typically instantiate it (`gr.themes.Citrus()`) — confirm this is
# accepted by the pinned Gradio version.
with gr.Blocks(theme=gr.themes.Citrus) as microphone:
    with gr.Column():
        gr.Markdown(
            """
### This app is designed to **transcribe and translate simultaneously from multiple Indian languages**. It supports **22 Indian languages**, including **Hindi, Oriya, Tamil, Telugu, Gujarati**, and more. It can **translate the transcribed text in real-time to English**, making it incredibly useful for multilingual audio processing.
### πŸ”‘ Sarvam AI API Key Required
To use this app, you need a free API key from [Sarvam AI](https://sarvam.ai).
πŸ‘‰ **Step 1:** Visit [https://sarvam.ai](https://sarvam.ai)
πŸ‘‰ **Step 2:** Sign up or log in
πŸ‘‰ **Step 3:** Generate your API key and paste it below
Your key stays on your device and is not stored.
"""
        )
        # Masked textbox so the key is not shown on screen.
        api_key_box = gr.Textbox(label="Enter SARVAM AI API Key", type="password")
        with gr.Row():
            # streaming=True makes gr.Audio emit periodic chunks while recording.
            input_audio_microphone = gr.Audio(streaming=True)
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
        with gr.Row():
            clear_button = gr.Button("Clear Output")
            clear_api_key_button = gr.Button("Clear API Key")
        # Per-session transcript history, threaded through stream_transcribe.
        state = gr.State(value="")

        # NOTE(review): this wrapper adds nothing — stream_transcribe could be
        # wired to .stream() directly with the same inputs/outputs.
        def wrapped_stream_transcribe(history, new_chunk, api_key):
            return stream_transcribe(history, new_chunk, api_key)

        # Stream mic chunks every 5s, capped at 30s per recording session;
        # concurrency_limit=None leaves concurrent streams unlimited.
        input_audio_microphone.stream(
            wrapped_stream_transcribe,
            [state, input_audio_microphone, api_key_box],
            [state, output, latency_textbox],
            time_limit=30,
            stream_every=5,
            concurrency_limit=None,
        )
        # Clear the history state first, then the visible textbox.
        clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
        clear_api_key_button.click(clear_api_key, outputs=[api_key_box])

# Launch the Blocks app when the script is run.
demo = microphone
demo.launch()