import collections
import contextlib
import os
import time
import wave
from concurrent.futures import ThreadPoolExecutor

import librosa
import numpy as np
import pyaudio
import webrtcvad

from models.es_fastconformer import stt_es_process
from models.nllb import nllb_translate
from models.noise_red import noise_reduction
from models.parakeet import parakeet_ctc_process
from models.TTS_utils import append_text_order


class Frame(object):
    """
    Represents a "frame" of audio data.

    Args:
        bytes (bytes): The audio data.
        timestamp (float): The timestamp of the frame.
        duration (float): The duration of the frame.
    """
    def __init__(self, bytes, timestamp, duration):
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration


def read_audio(stream, frame_duration_ms, rate):
    """
    Generates audio frames from the input stream.

    Args:
        stream (pyaudio.Stream): The audio stream.
        frame_duration_ms (int): Duration of each frame in milliseconds.
        rate (int): The sample rate of the audio.

    Yields:
        Frame: Audio frames carrying the raw bytes, a timestamp, and a
        duration, as expected by vad_collector.
    """
    frames_per_buffer = int(rate * frame_duration_ms / 1000)
    duration = frame_duration_ms / 1000.0
    timestamp = 0.0
    while True:
        yield Frame(stream.read(frames_per_buffer), timestamp, duration)
        timestamp += duration
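

# A minimal usage sketch for read_audio (assumed setup, not part of the
# original pipeline): webrtcvad only accepts 16-bit mono PCM at 8, 16, 32,
# or 48 kHz in 10/20/30 ms frames, so the stream must be opened to match.
#
#   pa = pyaudio.PyAudio()
#   stream = pa.open(format=pyaudio.paInt16, channels=1, rate=16000,
#                    input=True, frames_per_buffer=480)  # 480 samples = 30 ms
#   frames = read_audio(stream, frame_duration_ms=30, rate=16000)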


def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames):
    """
    Filters out non-voiced audio frames.

    Args:
        sample_rate (int): The sample rate of the audio.
        frame_duration_ms (int): Duration of each frame in milliseconds.
        padding_duration_ms (int): Duration of padding in milliseconds.
        vad (webrtcvad.Vad): The VAD object.
        frames (generator): A generator yielding Frame objects.

    Yields:
        bytes: Voiced audio segments.
    """
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    triggered = False
    voiced_frames = []
    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            # Start collecting once 90% of the ring buffer is voiced.
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                voiced_frames.extend(f for f, speech in ring_buffer)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            # Close the segment once 90% of the ring buffer is unvoiced.
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                yield b''.join([f.bytes for f in voiced_frames])
                ring_buffer.clear()
                voiced_frames = []
                triggered = False
    if voiced_frames:
        yield b''.join([f.bytes for f in voiced_frames])
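

# Sketch of one way to persist the output of vad_collector so that the
# folder watcher below can consume it. This helper is an assumption, not
# part of the original pipeline; the directory and naming scheme are
# illustrative.
def save_voiced_segments(segments, out_dir="audio_segments", sample_rate=16000):
    """Write each voiced segment as a 16-bit mono WAV file in out_dir."""
    os.makedirs(out_dir, exist_ok=True)
    for i, segment in enumerate(segments):
        path = os.path.join(out_dir, f"segment_{i}.wav")
        with contextlib.closing(wave.open(path, 'wb')) as wf:
            wf.setnchannels(1)           # mono
            wf.setsampwidth(2)           # 16-bit samples
            wf.setframerate(sample_rate)
            wf.writeframes(segment)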


def is_segment_empty(file_path):
    """
    Check if the audio segment is empty.

    Args:
        file_path (str): Path to the audio file.

    Returns:
        bool: True if the segment is empty, False otherwise.
    """
    audio, _ = librosa.load(file_path)
    rms = librosa.feature.rms(y=audio)
    rms_mean = np.mean(rms)
    print("Mean RMS:", rms_mean)
    # Treat segments below this energy threshold as silence.
    return rms_mean < 0.015


def process_segment(asr_model, model_nllb, tokenizer_nllb, path_segments, path_results, target_lang, order, json_path_temp, json_path_record):
    """
    Process an audio segment: noise reduction, transcription, translation, and append results.

    Args:
        asr_model: The ASR model for transcription.
        model_nllb: The NLLB model for translation.
        tokenizer_nllb: The tokenizer for the NLLB model.
        path_segments (str): Path to the audio segment.
        path_results (str): Path to save the results.
        target_lang (str): Target language for translation.
        order (int): Order index of the segment.
        json_path_temp (str): Path to the temporary JSON file.
        json_path_record (str): Path to the record JSON file.
    """
    print("Processing segment...")
    if is_segment_empty(path_segments):
        print("No speech detected.")
        # Remove the empty segment.
        os.remove(path_segments)
        return

    # Noise reduction (in place: the denoised audio overwrites the segment).
    start_time = time.time()
    noise_reduction(path_segments, path_segments)
    print("Noise removed. Time:", time.time() - start_time)

    # Transcription
    transcription = transcribe(asr_model, path_segments, target_lang)

    # Translation
    print("Translating...")
    translation = translate(model_nllb, tokenizer_nllb, transcription, target_lang)

    # Text-to-speech is currently disabled:
    # process_tts(tts_model, translation, path_segments, target_lang, path_results)

    lang_code = "es" if target_lang == "spanish" else "en"
    append_text_order(json_path_temp, translation, order, path_segments, path_results, lang_code, transcription)
    append_text_order(json_path_record, translation, order, path_segments, path_results, lang_code, transcription)


def transcribe(asr_model, path_segments, target_lang):
    """
    Transcribe an audio segment using the specified ASR model.

    Args:
        asr_model: The ASR model for transcription.
        path_segments (str): Path to the audio segment.
        target_lang (str): Target language of the translation step.

    Returns:
        str: The transcription of the audio segment.
    """
    start_time = time.time()
    # The source language is the opposite of the translation target:
    # translating into Spanish means the audio is English (Parakeet CTC),
    # and translating into English means the audio is Spanish (FastConformer).
    transcription_func = {
        "spanish": parakeet_ctc_process,
        "english": stt_es_process
    }[target_lang]
    transcription = transcription_func(asr_model, path_segments)
    print("Transcription:", transcription[0])
    print("Transcription time:", time.time() - start_time)
    return transcription[0]


def translate(model_nllb, tokenizer_nllb, text, target_lang):
    """
    Translate text using the specified NLLB model and tokenizer.

    Args:
        model_nllb: The NLLB model for translation.
        tokenizer_nllb: The tokenizer for the NLLB model.
        text (str): The text to translate.
        target_lang (str): Target language for translation.

    Returns:
        str: The translated text.
    """
    print("Processing translation...")
    start_time = time.time()
    translation = nllb_translate(model_nllb, tokenizer_nllb, text, target_lang)
    print("Translation:", translation)
    print("Translation time:", time.time() - start_time)
    return translation
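

# Hedged example call for process_segment. The model objects are
# placeholders: how they are loaded depends on the models package,
# which is not shown here.
#
# process_segment(asr_model, model_nllb, tokenizer_nllb,
#                 path_segments="audio_segments/segment_0.wav",
#                 path_results="results/result_segment_0.wav",
#                 target_lang="english", order=0,
#                 json_path_temp="temp.json", json_path_record="record.json")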


from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

# The watcher below relies on process_segment and on pre-loaded asr_model,
# model_nllb, and tokenizer_nllb objects, all defined or created above.


class NewAudioHandler(FileSystemEventHandler):
    """Watchdog handler that submits each new .wav file for processing."""

    def __init__(self, asr_model, model_nllb, tokenizer_nllb, source_lang, target_lang, json_file_temp, json_file_record, result_dir):
        self.asr_model = asr_model
        self.model_nllb = model_nllb
        self.tokenizer_nllb = tokenizer_nllb
        self.source_lang = source_lang
        self.target_lang = target_lang
        self.json_file_temp = json_file_temp
        self.json_file_record = json_file_record
        self.result_dir = result_dir
        self.executor = ThreadPoolExecutor(max_workers=2)

    def on_created(self, event):
        if not event.is_directory and event.src_path.endswith(".wav"):
            self.process_new_audio(event.src_path)

    def process_new_audio(self, audio_path):
        file_name = os.path.basename(audio_path)
        result_path = os.path.join(self.result_dir, f"result_{file_name}")
        print(f"Processing {audio_path}...")
        self.executor.submit(process_segment, self.asr_model, self.model_nllb, self.tokenizer_nllb,
                             audio_path, result_path, self.target_lang, file_name,
                             self.json_file_temp, self.json_file_record)


def watch_folder(asr_model, model_nllb, tokenizer_nllb, source_lang, target_lang, json_file_temp, json_file_record, watch_dir="audio_segments", result_dir="results"):
    """
    Watch a folder for new audio files and process them.

    Args:
        asr_model: The ASR model for transcription.
        model_nllb: The NLLB model for translation.
        tokenizer_nllb: The tokenizer for the NLLB model.
        source_lang (str): Source language of the audio.
        target_lang (str): Target language for translation.
        json_file_temp (str): Path to the temporary JSON file.
        json_file_record (str): Path to the record JSON file.
        watch_dir (str, optional): Directory to watch for new audio files. Default is "audio_segments".
        result_dir (str, optional): Directory to save the results. Default is "results".
    """
    os.makedirs(watch_dir, exist_ok=True)
    os.makedirs(result_dir, exist_ok=True)

    event_handler = NewAudioHandler(asr_model, model_nllb, tokenizer_nllb, source_lang, target_lang, json_file_temp, json_file_record, result_dir)
    observer = Observer()
    observer.schedule(event_handler, watch_dir, recursive=False)
    observer.start()
    print(f"Watching directory: {watch_dir}")
    try:
        # Block until interrupted; the observer thread handles events.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()


# Example usage (only "english" and "spanish" are supported as languages):
# watch_folder(asr_model, model_nllb, tokenizer_nllb, "english", "spanish", "temp.json", "record.json")
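
# Producer-side sketch (an assumption about how segments reach watch_dir):
# capture microphone audio, run it through the VAD, and save segments from
# a background thread while watch_folder blocks in the main thread.
#
# import threading
# vad = webrtcvad.Vad(3)  # aggressiveness: 0 (least) to 3 (most aggressive)
# pa = pyaudio.PyAudio()
# stream = pa.open(format=pyaudio.paInt16, channels=1, rate=16000,
#                  input=True, frames_per_buffer=480)
# segments = vad_collector(16000, 30, 300, vad, read_audio(stream, 30, 16000))
# threading.Thread(target=save_voiced_segments, args=(segments,), daemon=True).start()
# watch_folder(asr_model, model_nllb, tokenizer_nllb, "english", "spanish",
#              "temp.json", "record.json")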