import io
import os
import time

import gradio as gr
import numpy as np
import requests
from pydub import AudioSegment


def translate_audio(audio, SARVAM_API_KEY, timeout=30):
    """Send one audio clip to Sarvam AI's speech-to-text-translate endpoint.

    Args:
        audio: Object exposing ``export(file_like, format="wav")``; the caller
            in this file passes a ``pydub.AudioSegment``.
        SARVAM_API_KEY: Value sent in the ``api-subscription-key`` header.
        timeout: Seconds handed to ``requests.post`` so a stalled connection
            cannot block the caller indefinitely.

    Returns:
        The ``transcript`` field of the JSON response, or ``""`` when the
        field is missing or null.

    Raises:
        ValueError: The API answered 401 or 403 (rejected key).
        RuntimeError: The API answered with any other non-200/201 status.
        requests.exceptions.RequestException: Network failures and timeouts
            propagate unchanged.
    """
    api_url = "https://api.sarvam.ai/speech-to-text-translate"
    headers = {"api-subscription-key": SARVAM_API_KEY}
    model_data = {
        "model": "saaras:v2",
        "with_diarization": False,
    }

    # The context manager releases the buffer even when export() or the
    # request itself raises.
    with io.BytesIO() as chunk_buffer:
        audio.export(chunk_buffer, format="wav")
        chunk_buffer.seek(0)  # rewind so the upload starts at the WAV header
        files = {'file': ('audiofile.wav', chunk_buffer, 'audio/wav')}
        response = requests.post(
            api_url,
            headers=headers,
            files=files,
            data=model_data,
            timeout=timeout,
        )

    # NOTE(review): the leading "β" in both messages looks like a mis-decoded
    # emoji; the text is kept verbatim until the intended glyph is confirmed.
    if response.status_code in (401, 403):
        raise ValueError("β Invalid API key. Please check your Sarvam AI key.")
    if response.status_code not in (200, 201):
        raise RuntimeError(f"β Request failed with status code: {response.status_code}. Details: {response.text}")

    # `or ""` also covers an explicit JSON null, which would otherwise break
    # string concatenation in the caller.
    return response.json().get("transcript") or ""


def stream_transcribe(history, new_chunk, SARVAM_API_KEY):
    """Translate one streamed audio chunk and append it to the transcript.

    Args:
        history: Transcript accumulated so far; ``None`` is treated as ``""``.
        new_chunk: ``(sample_rate, samples)`` pair where ``samples`` is a
            NumPy array, or ``None`` when there is no audio to process.
        SARVAM_API_KEY: API key forwarded to ``translate_audio``.

    Returns:
        A 3-tuple ``(history, display_text, latency)``:
        on success both text fields hold the updated transcript and
        ``latency`` is the elapsed seconds formatted with two decimals;
        on ``ValueError`` it is ``(history, message, "Invalid Key")``;
        on any other exception it is ``(history, message, "Error")``.
    """
    start_time = time.time()

    if history is None:
        history = ""

    # No audio at all: leave the transcript untouched instead of surfacing an
    # unpacking error to the user.
    if new_chunk is None:
        return history, history, f"{time.time() - start_time:.2f}"

    try:
        sr, y = new_chunk

        # An empty buffer has nothing to transcribe; skip the API round-trip.
        if y is None or y.size == 0:
            return history, history, f"{time.time() - start_time:.2f}"

        # Down-mix multi-channel audio by averaging across axis 1.
        if y.ndim > 1:
            y = y.mean(axis=1)

        # NOTE(review): assumes samples are already in int16 range; float
        # audio in [-1, 1] would truncate to silence here - confirm the dtype
        # Gradio delivers.
        y_int16 = y.astype(np.int16)

        audio_segment = AudioSegment(
            data=y_int16.tobytes(),
            sample_width=2,  # bytes per int16 sample
            frame_rate=sr,
            channels=1,
        )

        transcription = translate_audio(audio_segment, SARVAM_API_KEY)
        latency = time.time() - start_time
        history = _append_transcript(history, transcription)

        return history, history, f"{latency:.2f}"

    except ValueError as ve:
        # translate_audio raises ValueError when the API rejects the key.
        return history, str(ve), "Invalid Key"

    except Exception as e:
        print(f"Error during Transcription: {e}")
        return history, str(e), "Error"


def _append_transcript(history, transcription):
    """Append *transcription* to *history* on its own line, skipping blank text."""
    if not transcription or not transcription.strip():
        return history
    # No separator before the very first entry, so the transcript never starts
    # with an empty line.
    return f"{history}\n{transcription}" if history else transcription


def clear() -> str:
    """Return an empty string; wired to the transcription textbox to blank it."""
    return ""


def clear_state() -> None:
    """Return ``None`` to reset the stored transcript state.

    ``stream_transcribe`` treats a ``None`` history as an empty transcript.
    """
    return None


def clear_api_key() -> str:
    """Return an empty string; wired to the API-key textbox to wipe its value."""
    return ""


# Build the UI: instructions, API-key field, streaming microphone input,
# transcript/latency outputs and the two reset buttons.
# NOTE(review): the nesting below is reconstructed - indentation was lost in
# the stored copy; confirm which widgets belong inside the Column.
with gr.Blocks(theme=gr.themes.Glass()) as microphone:
    with gr.Column():

        # The Markdown text is kept flush-left so no line can be rendered as
        # an indented code block.
        # NOTE(review): the "π" markers look like mis-decoded emoji; kept
        # verbatim until the intended glyphs are confirmed.
        # NOTE(review): "Your key stays on your device" does not match the
        # code path - the key is an input to the stream handler and is sent
        # in the request header by translate_audio; confirm the wording.
        gr.Markdown(
            """
### This app is designed to **transcribe and translate simultaneously from multiple Indian languages**. It supports **22 Indian languages**, including **Hindi, Oriya, Tamil, Telugu, Gujarati**, and more. It can **translate the transcribed text in real-time to English**, making it incredibly useful for multilingual audio processing.

### π Sarvam AI API Key Required
To use this app, you need a free API key from [Sarvam AI](https://sarvam.ai).

π **Step 1:** Visit [https://sarvam.ai](https://sarvam.ai)
π **Step 2:** Sign up or log in
π **Step 3:** Generate your API key and paste it below

Your key stays on your device and is not stored.
"""
        )
        api_key_box = gr.Textbox(label="Enter SARVAM AI API Key", type="password")

        with gr.Row():
            input_audio_microphone = gr.Audio(streaming=True)
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)

        with gr.Row():
            clear_button = gr.Button("Clear Output")
            clear_api_key_button = gr.Button("Clear API Key")

        # Holds the transcript accumulated across stream callbacks.
        state = gr.State(value="")

        def wrapped_stream_transcribe(history, new_chunk, api_key):
            """Forward the stream event's inputs to ``stream_transcribe``."""
            return stream_transcribe(history, new_chunk, api_key)

        # Inputs map to (history, new_chunk, api_key); outputs map to the
        # (state, transcript text, latency) tuple the handler returns.
        input_audio_microphone.stream(
            wrapped_stream_transcribe,
            [state, input_audio_microphone, api_key_box],
            [state, output, latency_textbox],
            time_limit=30,
            stream_every=5,
            concurrency_limit=None,
        )

        # Reset the stored transcript first, then blank the visible textbox.
        clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
        clear_api_key_button.click(clear_api_key, outputs=[api_key_box])


# Expose the Blocks app under the name `demo`.
demo = microphone

# Start the server only when run as a script, so importing this module does
# not launch it as a side effect.
if __name__ == "__main__":
    demo.launch()