Spaces:

Mohan-diffuser
/

indian-asr-with-sarvam

Running

mohan696matlab

ui update

401279f 14 days ago

5.94 kB

	import gradio as gr
	import time
	import numpy as np
	import os
	import requests
	import io
	from pydub import AudioSegment



	def translate_audio(audio, language_code, SARVAM_API_KEY, timeout=30):
	    """Translate one audio segment to English text via the Sarvam AI API.

	    The segment is exported to an in-memory WAV file and posted to the
	    ``speech-to-text-translate`` endpoint.

	    Args:
	        audio: Object exposing ``export(file_obj, format="wav")``; the caller
	            in this file passes a ``pydub.AudioSegment``.
	        language_code: Language code forwarded to the API as-is
	            (for example ``"hi-IN"`` or ``"unknown"``).
	        SARVAM_API_KEY: Subscription key sent in the
	            ``api-subscription-key`` header.
	        timeout: Seconds to wait for the HTTP request before giving up.

	    Returns:
	        A ``(transcript, detected_language)`` tuple of strings. Either value
	        is ``""`` when the response omits it or reports it as null.

	    Raises:
	        ValueError: If the key is blank, or the API answers 401/403.
	        RuntimeError: If the API answers with any other non-2xx status.
	        requests.exceptions.RequestException: On network failure or timeout.
	    """
	    # Fail fast on a blank key instead of spending a network round trip
	    # just to be told the key is invalid.
	    if not SARVAM_API_KEY or not str(SARVAM_API_KEY).strip():
	        raise ValueError("❌ Missing API key. Please enter your Sarvam AI key.")

	    # API endpoint for speech-to-text translation
	    api_url = "https://api.sarvam.ai/speech-to-text-translate"

	    # Headers containing the API subscription key
	    headers = {"api-subscription-key": SARVAM_API_KEY}

	    # Data payload for the translation request
	    model_data = {
	        "model": "saaras:v2",  # Specify the model to be used
	        "with_diarization": False,  # Set to True for speaker diarization
	        "language_code": language_code,
	    }

	    # The context manager closes the buffer even when export or the
	    # request itself raises.
	    with io.BytesIO() as chunk_buffer:
	        audio.export(chunk_buffer, format="wav")
	        chunk_buffer.seek(0)  # Reset the pointer to the start of the stream

	        # Prepare the file for the API request
	        files = {"file": ("audiofile.wav", chunk_buffer, "audio/wav")}

	        # Without a timeout a stalled connection would block the caller
	        # indefinitely.
	        response = requests.post(
	            api_url,
	            headers=headers,
	            files=files,
	            data=model_data,
	            timeout=timeout,
	        )

	    if response.status_code in (401, 403):
	        raise ValueError("❌ Invalid API key. Please check your Sarvam AI key.")
	    if response.status_code not in (200, 201):
	        raise RuntimeError(
	            f"❌ Request failed with status code: {response.status_code}. Details: {response.text}"
	        )

	    response_data = response.json()
	    # ``or ""`` also covers an explicit JSON null, which ``.get(key, "")``
	    # would pass through as None.
	    transcript = response_data.get("transcript") or ""
	    detected_language = response_data.get("language_code") or ""
	    return transcript, detected_language

	def stream_transcribe(history, new_chunk, language_code, SARVAM_API_KEY):
	    """Translate one streamed microphone chunk and append it to the transcript.

	    Args:
	        history: Transcript accumulated so far, or ``None`` when empty.
	        new_chunk: ``(sample_rate, samples)`` pair where ``samples`` is a
	            NumPy array, or ``None`` when there is no audio to process.
	        language_code: Language code forwarded to ``translate_audio``.
	        SARVAM_API_KEY: Sarvam AI subscription key forwarded to
	            ``translate_audio``.

	    Returns:
	        A ``(state, display_text)`` tuple. On success both are the updated
	        transcript. On failure the state is the unchanged transcript and the
	        display text is the error message.
	    """
	    if history is None:
	        history = ""

	    # No audio to process: keep showing the current transcript instead of
	    # surfacing an unpacking error to the user.
	    if new_chunk is None:
	        return history, history

	    try:
	        sr, y = new_chunk
	        y = np.asarray(y)
	        if y.size == 0:
	            return history, history

	        # Convert to mono if stereo
	        if y.ndim > 1:
	            y = y.mean(axis=1)

	        # Convert to int16 for AudioSegment
	        y_int16 = y.astype(np.int16)

	        # Create AudioSegment from raw PCM data
	        audio_segment = AudioSegment(
	            data=y_int16.tobytes(),
	            sample_width=2,
	            frame_rate=sr,
	            channels=1,
	        )

	        transcription, detected_language = translate_audio(
	            audio_segment, language_code, SARVAM_API_KEY
	        )

	        # Nothing was recognised in this chunk (for example silence), so do
	        # not clutter the transcript with an empty entry.
	        if not transcription:
	            return history, history

	        entry = f"({detected_language})==> {transcription}"
	        # Only separate with a newline when there is earlier text, so the
	        # transcript does not start with a blank line.
	        history = f"{history}\n{entry}" if history else entry

	        return history, history
	    except ValueError as ve:
	        # Covers blank/invalid API key errors raised by translate_audio.
	        return history, str(ve)
	    except Exception as e:
	        print(f"Error during Transcription: {e}")
	        return history, str(e)




	def clear() -> str:
	    """Return an empty string, used to blank the transcription textbox."""
	    return ""

	def clear_state() -> None:
	    """Return ``None``, used to reset the stored transcript state."""
	    return None

	def clear_api_key() -> str:
	    """Return an empty string, used to wipe the API key textbox."""
	    return ""


	# Help text shown above the controls. The key is handed to the
	# transcription handler and forwarded to Sarvam AI, so the text says that
	# rather than claiming the key never leaves the user's device.
	_INTRO_MARKDOWN = """
	## Translate simultaneously from multiple Indian languages to English.
	### It supports 22 Indian languages, including Hindi, Oriya, Tamil, Telugu, Gujarati, and more.

	### 🔑 Sarvam AI API Key Required
	To use this app, you need a free API key from [Sarvam AI](https://sarvam.ai).

	👉 Step 1: Visit [https://sarvam.ai](https://sarvam.ai)
	👉 Step 2: Sign up or log in
	👉 Step 3: Generate your API key and paste it below

	Your key is used only to call the Sarvam AI API and is not stored by this app.
	"""

	# Author blurb shown below the controls.
	_ABOUT_MARKDOWN = """
	---

	### 👋 Who am I?

	I am Dr. Mohan Dash, a PhD in Industrial Computer Science and an AI Research Engineer.
	I run a YouTube channel called [Intelligent Machines](https://www.youtube.com/@Mohankumardash) where I share practical tutorials and insights on building real-world AI applications.

	If you find this app useful, you'll definitely enjoy the tutorials and breakdowns I post there.
	![YouTube Channel](https://yt3.googleusercontent.com/UYcIFCkqev-zwJemtbOPmmOzRU26gk-hetSSU18GWO-1wBbGHd7pjx5oTsz4x1sJ8riWg35TQw=w1707-fcrop64=1,00005a57ffffa5a8-k-c0xffffffff-no-nd-rj)

	---
	"""

	with gr.Blocks(theme=gr.themes.Soft()) as microphone:
	    with gr.Column():
	        gr.Markdown(_INTRO_MARKDOWN)

	        api_key_box = gr.Textbox(label="Enter SARVAM AI API Key", type="password")

	        # Language codes offered to the user; "unknown" is the default.
	        language_options = [
	            "hi-IN", "bn-IN", "kn-IN", "ml-IN", "mr-IN", "od-IN",
	            "pa-IN", "ta-IN", "te-IN", "en-IN", "gu-IN", "unknown",
	        ]
	        language_code_box = gr.Dropdown(
	            choices=language_options,
	            label="Select Language Code",
	            value="unknown",  # optional: default selected value
	        )

	        input_audio_microphone = gr.Audio(streaming=True)
	        output = gr.Textbox(
	            label="Transcription",
	            lines=10,
	            max_lines=100,
	            show_copy_button=True,
	            value="",
	        )

	        with gr.Row():
	            clear_button = gr.Button("Clear Output")
	            clear_api_key_button = gr.Button("Clear API Key")

	        # Holds the accumulated transcript between streamed chunks.
	        state = gr.State(value="")

	        # Each streamed chunk is passed to stream_transcribe together with
	        # the current transcript, the chosen language and the API key.
	        input_audio_microphone.stream(
	            stream_transcribe,
	            [state, input_audio_microphone, language_code_box, api_key_box],
	            [state, output],
	            time_limit=30,
	            stream_every=5,
	            concurrency_limit=None,
	        )

	        # Reset the stored transcript first, then blank the visible textbox.
	        clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
	        clear_api_key_button.click(clear_api_key, outputs=[api_key_box])

	        gr.Markdown(_ABOUT_MARKDOWN)

	demo = microphone
	demo.launch()