Spaces:

Mohan-diffuser
/

indian-asr-with-sarvam

Sleeping

mohan696matlab

ui update

65a7d00 about 2 months ago

6.13 kB

	import gradio as gr
	import time
	import numpy as np
	import os
	import requests
	import io
	from pydub import AudioSegment



	def translate_audio(audio, language_code, SARVAM_API_KEY, timeout=60):
	    """Send one audio clip to Sarvam AI's speech-to-text-translate endpoint.

	    Args:
	        audio: Object exposing ``export(file_like, format="wav")``; the
	            caller in this file passes a pydub ``AudioSegment``.
	        language_code: Value sent as the ``language_code`` form field.
	        SARVAM_API_KEY: Value sent in the ``api-subscription-key`` header.
	        timeout: Seconds to wait on the HTTP request before giving up, so a
	            stalled API call cannot block the caller forever.

	    Returns:
	        A ``(transcript, detected_language)`` tuple of strings taken from the
	        ``transcript`` and ``language_code`` fields of the JSON response.
	        A missing or null field is returned as ``""``.

	    Raises:
	        ValueError: The API answered 401 or 403 (rejected API key).
	        RuntimeError: The API answered with any other non-200/201 status.
	        requests.RequestException: The request itself failed (connection
	            error, timeout, ...). Left to the caller to handle.
	    """
	    # API endpoint for speech-to-text translation
	    api_url = "https://api.sarvam.ai/speech-to-text-translate"

	    # Headers containing the API subscription key
	    headers = {"api-subscription-key": SARVAM_API_KEY}

	    # Data payload for the translation request
	    model_data = {
	        "model": "saaras:v2",  # Specify the model to be used
	        "with_diarization": False,  # Set to True for speaker diarization
	        "language_code": language_code,
	    }

	    # The context manager closes the in-memory WAV buffer on every path,
	    # including when export() or the HTTP call raises.
	    with io.BytesIO() as chunk_buffer:
	        audio.export(chunk_buffer, format="wav")
	        chunk_buffer.seek(0)  # Reset the pointer to the start of the stream

	        # Prepare the file for the API request
	        files = {"file": ("audiofile.wav", chunk_buffer, "audio/wav")}
	        response = requests.post(
	            api_url,
	            headers=headers,
	            files=files,
	            data=model_data,
	            timeout=timeout,
	        )

	    if response.status_code in (401, 403):
	        raise ValueError("❌ Invalid API key. Please check your Sarvam AI key.")
	    if response.status_code not in (200, 201):
	        raise RuntimeError(f"❌ Request failed with status code: {response.status_code}. Details: {response.text}")

	    response_data = response.json()
	    # `or ""` also covers a JSON null, which .get(key, "") would pass through
	    # as None and break string concatenation in the caller.
	    transcript = response_data.get("transcript") or ""
	    detected_language = response_data.get("language_code") or ""
	    return transcript, detected_language

	def stream_transcribe(history, new_chunk, language_code, SARVAM_API_KEY):
	    """Translate one streamed audio chunk and append it to the transcript.

	    Args:
	        history: Transcript accumulated so far, or ``None`` (treated as "").
	        new_chunk: ``(sample_rate, samples)`` tuple where ``samples`` is a
	            NumPy array, or ``None`` when there is no audio to process.
	        language_code: Forwarded unchanged to ``translate_audio``.
	        SARVAM_API_KEY: Forwarded unchanged to ``translate_audio``.

	    Returns:
	        A ``(state, display_text)`` tuple. On success both are the updated
	        transcript. With no audio both are the unchanged transcript. On
	        failure the transcript is kept as state and the error message is
	        returned as the display text.
	    """
	    if history is None:
	        history = ""

	    # No chunk at all: keep showing the transcript instead of surfacing a
	    # "cannot unpack NoneType" error in the output box.
	    if new_chunk is None:
	        return history, history

	    try:
	        sr, y = new_chunk

	        # An empty chunk carries nothing to send to the API.
	        if y is None or np.size(y) == 0:
	            return history, history

	        # Convert to mono if stereo
	        if y.ndim > 1:
	            y = y.mean(axis=1)

	        # Convert to int16 for AudioSegment
	        y_int16 = y.astype(np.int16)

	        # Create AudioSegment from raw PCM data
	        audio_segment = AudioSegment(
	            data=y_int16.tobytes(),
	            sample_width=2,
	            frame_rate=sr,
	            channels=1,
	        )

	        transcription, detected_language = translate_audio(
	            audio_segment, language_code, SARVAM_API_KEY
	        )

	        entry = f"({detected_language})==> {transcription or ''}"
	        # Only separate with a newline once there is earlier text, so the
	        # transcript does not start with a blank line.
	        history = f"{history}\n{entry}" if history else entry

	        return history, history
	    except ValueError as ve:
	        # Raised by translate_audio for a rejected API key; show it as-is.
	        return history, str(ve)
	    except Exception as e:
	        print(f"Error during Transcription: {e}")
	        return history, str(e)




	def clear() -> str:
	    """Return an empty string; wired below to blank the transcription box."""
	    return ""

	def clear_state() -> None:
	    """Return ``None``; wired below to reset the transcript state."""
	    return None

	def clear_api_key() -> str:
	    """Return an empty string; wired below to wipe the API-key textbox."""
	    return ""


	# Build the Gradio UI: intro text, API-key and language inputs, a streaming
	# audio input, the transcript textbox, two clear buttons and an about section.
	# NOTE(review): the indentation that decides which components sit inside the
	# Column/Row containers is not visible in this listing — confirm the nesting
	# against the original file before re-indenting.
	with gr.Blocks(theme=gr.themes.Soft()) as microphone:
	with gr.Column():

	# Intro and instructions for obtaining a Sarvam AI key.
	gr.Markdown(
	"""
	## Translate simultaneously from multiple Indian languages to English.
	### It supports 22 Indian languages, including Hindi, Oriya, Tamil, Telugu, Gujarati, and more.

	### 🔑 Sarvam AI API Key Required
	To use this app, you need a free API key from [Sarvam AI](https://sarvam.ai).

	👉 Step 1: Visit [https://sarvam.ai](https://sarvam.ai)
	👉 Step 2: Sign up or log in
	👉 Step 3: Generate your API key and paste it below

	Your key stays on your device and is not stored.
	"""
	)


	# Masked textbox; its value is passed to the stream handler on every chunk.
	api_key_box = gr.Textbox(label="Enter SARVAM AI API Key", type="password")

	# Language codes offered in the dropdown; "unknown" is the default selection.
	language_options = [
	"hi-IN", "bn-IN", "kn-IN", "ml-IN", "mr-IN", "od-IN",
	"pa-IN", "ta-IN", "te-IN", "en-IN", "gu-IN", "unknown"
	]
	language_code_box = gr.Dropdown(
	choices=language_options,
	label="Select Language Code",
	value="unknown" # optional: default selected value
	)


	# Streaming audio input and the textbox that displays the transcript.
	input_audio_microphone = gr.Audio(streaming=True)
	output = gr.Textbox(label="Transcription", lines=10,max_lines=100, show_copy_button=True, value="")

	with gr.Row():
	clear_button = gr.Button("Clear Output")
	clear_api_key_button = gr.Button("Clear API Key")
	# Holds the accumulated transcript between stream events.
	state = gr.State(value="")
	# Thin pass-through to stream_transcribe with the same four arguments.
	def wrapped_stream_transcribe(history, new_chunk,language_code, api_key):
	return stream_transcribe(history, new_chunk,language_code, api_key)

	# Each stream event receives (state, audio chunk, language, API key) and
	# writes back (state, output).
	# NOTE(review): time_limit and stream_every are presumably in seconds —
	# confirm against the Gradio streaming documentation.
	# NOTE(review): the API key is an input to this handler, which forwards it
	# to translate_audio for the HTTP request — confirm the "stays on your
	# device" wording in the intro text above is accurate.
	input_audio_microphone.stream(
	wrapped_stream_transcribe,
	[state, input_audio_microphone,language_code_box, api_key_box],
	[state, output],
	time_limit=30,
	stream_every=5,
	concurrency_limit=None,
	)

	# "Clear Output" resets the state first, then empties the textbox.
	clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
	# "Clear API Key" empties the key textbox.
	clear_api_key_button.click(clear_api_key, outputs=[api_key_box])

	# About-the-author footer.
	gr.Markdown(
	"""
	---

	### 👋 Who am I?

	I am Dr. Mohan Dash, a PhD in Industrial Computer Science and an AI Research Engineer.
	I run a YouTube channel called [Intelligent Machines](https://www.youtube.com/@Mohankumardash) where I share practical tutorials and insights on building real-world AI applications.

	If you find this app useful, you'll definitely enjoy the tutorials and breakdowns I post there.
	![YouTube Channel](https://yt3.googleusercontent.com/UYcIFCkqev-zwJemtbOPmmOzRU26gk-hetSSU18GWO-1wBbGHd7pjx5oTsz4x1sJ8riWg35TQw=w1707-fcrop64=1,00005a57ffffa5a8-k-c0xffffffff-no-nd-rj)

	---
	"""
	)

	# Expose the Blocks app under the name `demo`, then start serving it.
	# `demo` and `microphone` are the same object, so launching either is equivalent.
	demo = microphone
	microphone.launch()