# Hugging Face Space: Mohan-diffuser/indian-asr-with-sarvam (status: Running)
# Last commit: mohan696matlab, "edit:readme", f57aae5, 3 months ago
# File size: 5.24 kB

	import gradio as gr
	import time
	import numpy as np
	import os
	import requests
	import io
	from pydub import AudioSegment



	def translate_audio(audio, SARVAM_API_KEY, timeout=30.0):
	    """Send one audio segment to Sarvam AI and return the translated transcript.

	    The segment is exported to an in-memory WAV file and posted to the
	    ``speech-to-text-translate`` endpoint with the ``saaras:v2`` model and
	    diarization disabled.

	    Args:
	        audio: Object exposing ``export(file_obj, format="wav")``; the caller
	            in this file passes a pydub ``AudioSegment``.
	        SARVAM_API_KEY: Subscription key sent in the
	            ``api-subscription-key`` header.
	        timeout: Seconds to wait for the HTTP request before giving up, so a
	            stalled connection cannot block the caller forever.

	    Returns:
	        The ``transcript`` field of the JSON response, or ``""`` when the
	        field is missing or null.

	    Raises:
	        ValueError: If the key is blank, or the API answers 401/403.
	        RuntimeError: For any other status code that is not 200/201.
	        requests.RequestException: Propagated unchanged on network errors
	            and timeouts.
	    """
	    # Reject a blank key up front: no point exporting audio and paying for a
	    # network round-trip that can only be refused.
	    if not SARVAM_API_KEY or not str(SARVAM_API_KEY).strip():
	        raise ValueError("❌ Invalid API key. Please check your Sarvam AI key.")

	    # API endpoint for speech-to-text translation
	    api_url = "https://api.sarvam.ai/speech-to-text-translate"

	    # Header carrying the caller-supplied subscription key
	    headers = {
	        "api-subscription-key": SARVAM_API_KEY,
	    }

	    # Form fields sent alongside the audio file
	    model_data = {
	        "model": "saaras:v2",  # Specify the model to be used
	        "with_diarization": False,  # Set to True for speaker diarization
	    }

	    # The context manager closes the buffer on every exit path.
	    with io.BytesIO() as chunk_buffer:
	        audio.export(chunk_buffer, format="wav")
	        chunk_buffer.seek(0)  # Reset the pointer to the start of the stream

	        # Prepare the file for the API request
	        files = {'file': ('audiofile.wav', chunk_buffer, 'audio/wav')}

	        response = requests.post(
	            api_url,
	            headers=headers,
	            files=files,
	            data=model_data,
	            timeout=timeout,
	        )

	    if response.status_code in (200, 201):
	        # `or ""` also covers a JSON null so callers always receive a string.
	        return response.json().get("transcript") or ""

	    if response.status_code in (401, 403):
	        raise ValueError("❌ Invalid API key. Please check your Sarvam AI key.")

	    raise RuntimeError(f"❌ Request failed with status code: {response.status_code}. Details: {response.text}")

	def stream_transcribe(history, new_chunk, SARVAM_API_KEY):
	    """Transcribe one streamed audio chunk and append it to the transcript.

	    Args:
	        history: Transcript accumulated so far; ``None`` is treated as ``""``.
	        new_chunk: ``(sample_rate, samples)`` pair where ``samples`` is a
	            NumPy array, or ``None`` when there is no audio to process.
	        SARVAM_API_KEY: Key forwarded to ``translate_audio``.

	    Returns:
	        A 3-tuple ``(state, display_text, latency)``. On success ``state`` and
	        ``display_text`` are both the updated transcript and ``latency`` is
	        the elapsed seconds formatted with two decimals. On failure ``state``
	        is the unchanged transcript, ``display_text`` is the error message and
	        ``latency`` is ``"Invalid Key"`` (``ValueError``) or ``"Error"``.
	    """
	    if history is None:
	        history = ""

	    # A missing chunk is not an error: keep the transcript on screen instead
	    # of replacing it with an unpacking exception message.
	    if new_chunk is None:
	        return history, history, "0.00"

	    start_time = time.time()

	    try:
	        sr, y = new_chunk
	        y = np.asarray(y)

	        # Nothing recorded in this window; skip the API call entirely.
	        if y.size == 0:
	            return history, history, "0.00"

	        # Convert to mono if stereo
	        if y.ndim > 1:
	            y = y.mean(axis=1)

	        # Convert to int16 for AudioSegment.
	        # NOTE(review): assumes integer PCM samples; float audio normalised
	        # to [-1, 1] would be truncated to zeros here — confirm input dtype.
	        y_int16 = y.astype(np.int16)

	        # Create AudioSegment from raw PCM data (2 bytes per sample, mono)
	        audio_segment = AudioSegment(
	            data=y_int16.tobytes(),
	            sample_width=2,
	            frame_rate=sr,
	            channels=1,
	        )

	        transcription = translate_audio(audio_segment, SARVAM_API_KEY)
	        latency = time.time() - start_time

	        # Append only real text, and add the separator only between entries,
	        # so the transcript has no leading newline or blank lines.
	        if transcription:
	            history = f"{history}\n{transcription}" if history else transcription

	        return history, history, f"{latency:.2f}"
	    except ValueError as ve:
	        return history, str(ve), "Invalid Key"
	    except Exception as e:
	        print(f"Error during Transcription: {e}")
	        return history, str(e), "Error"




	def clear():
	    """Return the empty string that blanks the transcription textbox."""
	    blank_text = ""
	    return blank_text

	def clear_state():
	    """Return ``None`` so the stored transcript state is reset."""
	    reset_value = None
	    return reset_value

	def clear_api_key():
	    """Return the empty string that wipes the API-key textbox."""
	    blank_key = ""
	    return blank_key


	# Build the streaming-microphone UI.
	# The theme must be an *instance*: passing the bare class `gr.themes.Citrus`
	# is not recognised as a theme object, so it is instantiated here.
	# NOTE(review): block nesting below was reconstructed from a copy whose
	# indentation had been flattened — confirm the intended layout.
	with gr.Blocks(theme=gr.themes.Citrus()) as microphone:
	    with gr.Column():

	        # NOTE(review): the text below says the key "stays on your device",
	        # but the key textbox is an input to the stream handler and is
	        # forwarded in the request header to Sarvam — consider rewording.
	        gr.Markdown(
	            """
	            ### This app is designed to transcribe and translate simultaneously from multiple Indian languages. It supports 22 Indian languages, including Hindi, Oriya, Tamil, Telugu, Gujarati, and more. It can translate the transcribed text in real-time to English, making it incredibly useful for multilingual audio processing.

	            ### 🔑 Sarvam AI API Key Required
	            To use this app, you need a free API key from [Sarvam AI](https://sarvam.ai).

	            👉 Step 1: Visit [https://sarvam.ai](https://sarvam.ai)
	            👉 Step 2: Sign up or log in
	            👉 Step 3: Generate your API key and paste it below

	            Your key stays on your device and is not stored.
	            """
	        )
	        api_key_box = gr.Textbox(label="Enter SARVAM AI API Key", type="password")

	        with gr.Row():
	            input_audio_microphone = gr.Audio(streaming=True)
	            output = gr.Textbox(label="Transcription", value="")
	            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
	        with gr.Row():
	            clear_button = gr.Button("Clear Output")
	            clear_api_key_button = gr.Button("Clear API Key")

	        # Holds the transcript accumulated across stream events.
	        state = gr.State(value="")

	        def wrapped_stream_transcribe(history, new_chunk, api_key):
	            """Forward the stream event's inputs to ``stream_transcribe``."""
	            return stream_transcribe(history, new_chunk, api_key)

	        # Inputs: transcript state, latest audio, API key.
	        # Outputs: new state, transcript textbox, latency textbox.
	        input_audio_microphone.stream(
	            wrapped_stream_transcribe,
	            [state, input_audio_microphone, api_key_box],
	            [state, output, latency_textbox],
	            time_limit=30,
	            stream_every=5,
	            concurrency_limit=None,
	        )

	        # Reset the stored transcript first, then blank the visible textbox.
	        clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
	        clear_api_key_button.click(clear_api_key, outputs=[api_key_box])

	        gr.Markdown(
	            """
	            ---

	            ### 👋 Who am I?

	            I'm Dr. Mohan Dash, a PhD in Industrial Computer Science and an AI Research Engineer.
	            I run a YouTube channel called [Intelligent Machines](https://www.youtube.com/@Mohankumardash) where I share practical tutorials and insights on building real-world AI applications.

	            If you find this app useful, you'll definitely enjoy the tutorials and breakdowns I post there.
	            """
	        )



	# Alias the Blocks app as `demo` at module level.
	demo = microphone

	# Start the server only when this file is run as a script, so importing the
	# module (for tooling or tests) does not launch a blocking web server.
	if __name__ == "__main__":
	    demo.launch()