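"""Audio processing pipeline for WhisperLiveKit.

Decodes incoming WebM audio with FFmpeg, feeds 16 kHz mono PCM to the
streaming transcription and (optional) diarization backends, and formats
the combined results for streaming back to the client.
"""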
import asyncio
import numpy as np
import ffmpeg
from time import time, sleep
import math
import logging
import traceback
from datetime import timedelta
from whisperlivekit.timed_objects import ASRToken
from whisperlivekit.whisper_streaming_custom.whisper_online import online_factory
from whisperlivekit.core import WhisperLiveKit
# Set up logging once
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
SENTINEL = object() # unique sentinel object for end of stream marker
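# ffmpeg_stdout_reader puts SENTINEL on the transcription/diarization queues once
# FFmpeg's stdout closes, so each consumer can drain its queue and exit its loop cleanly.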
def format_time(seconds: float) -> str:
"""Format seconds as HH:MM:SS."""
return str(timedelta(seconds=int(seconds)))
class AudioProcessor:
"""
Processes audio streams for transcription and diarization.
Handles audio processing, state management, and result formatting.
"""
def __init__(self):
"""Initialize the audio processor with configuration, models, and state."""
models = WhisperLiveKit()
# Audio processing settings
self.args = models.args
self.sample_rate = 16000
self.channels = 1
self.samples_per_sec = int(self.sample_rate * self.args.min_chunk_size)
self.bytes_per_sample = 2
self.bytes_per_sec = self.samples_per_sec * self.bytes_per_sample
self.max_bytes_per_sec = 32000 * 5 # 5 seconds of 16 kHz, 16-bit mono audio (32 kB/s)
self.last_ffmpeg_activity = time()
self.ffmpeg_health_check_interval = 5
self.ffmpeg_max_idle_time = 10
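# Thresholds are in seconds: ffmpeg_stdout_reader restarts FFmpeg after ffmpeg_max_idle_time
# with no output, while the watchdog applies its own 15 s / 30 s idle checks.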
# State management
self.is_stopping = False
self.tokens = []
self.buffer_transcription = ""
self.buffer_diarization = ""
self.full_transcription = ""
self.end_buffer = 0
self.end_attributed_speaker = 0
self.lock = asyncio.Lock()
self.beg_loop = time()
self.sep = " " # Default separator
self.last_response_content = ""
# Models and processing
self.asr = models.asr
self.tokenizer = models.tokenizer
self.diarization = models.diarization
self.ffmpeg_process = self.start_ffmpeg_decoder()
self.transcription_queue = asyncio.Queue() if self.args.transcription else None
self.diarization_queue = asyncio.Queue() if self.args.diarization else None
self.pcm_buffer = bytearray()
# Task references
self.transcription_task = None
self.diarization_task = None
self.ffmpeg_reader_task = None
self.watchdog_task = None
self.all_tasks_for_cleanup = []
# Initialize transcription engine if enabled
if self.args.transcription:
self.online = online_factory(self.args, models.asr, models.tokenizer)
def convert_pcm_to_float(self, pcm_buffer):
"""Convert PCM buffer in s16le format to normalized NumPy array."""
return np.frombuffer(pcm_buffer, dtype=np.int16).astype(np.float32) / 32768.0
def start_ffmpeg_decoder(self):
"""Start FFmpeg process for WebM to PCM conversion."""
return (ffmpeg.input("pipe:0", format="webm")
.output("pipe:1", format="s16le", acodec="pcm_s16le",
ac=self.channels, ar=str(self.sample_rate))
.run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True))
async def restart_ffmpeg(self):
"""Restart the FFmpeg process after failure."""
logger.warning("Restarting FFmpeg process...")
if self.ffmpeg_process:
try:
# we check if process is still running
if self.ffmpeg_process.poll() is None:
logger.info("Terminating existing FFmpeg process")
self.ffmpeg_process.stdin.close()
self.ffmpeg_process.terminate()
# wait for termination with timeout
try:
await asyncio.wait_for(
asyncio.get_event_loop().run_in_executor(None, self.ffmpeg_process.wait),
timeout=5.0
)
except asyncio.TimeoutError:
logger.warning("FFmpeg process did not terminate, killing forcefully")
self.ffmpeg_process.kill()
await asyncio.get_event_loop().run_in_executor(None, self.ffmpeg_process.wait)
except Exception as e:
logger.error(f"Error during FFmpeg process termination: {e}")
logger.error(traceback.format_exc())
# we start new process
try:
logger.info("Starting new FFmpeg process")
self.ffmpeg_process = self.start_ffmpeg_decoder()
self.pcm_buffer = bytearray()
self.last_ffmpeg_activity = time()
logger.info("FFmpeg process restarted successfully")
except Exception as e:
logger.error(f"Failed to restart FFmpeg process: {e}")
logger.error(traceback.format_exc())
# try again after 5s
await asyncio.sleep(5)
try:
self.ffmpeg_process = self.start_ffmpeg_decoder()
self.pcm_buffer = bytearray()
self.last_ffmpeg_activity = time()
logger.info("FFmpeg process restarted successfully on second attempt")
except Exception as e2:
logger.critical(f"Failed to restart FFmpeg process on second attempt: {e2}")
logger.critical(traceback.format_exc())
async def update_transcription(self, new_tokens, buffer, end_buffer, full_transcription, sep):
"""Thread-safe update of transcription with new data."""
async with self.lock:
self.tokens.extend(new_tokens)
self.buffer_transcription = buffer
self.end_buffer = end_buffer
self.full_transcription = full_transcription
self.sep = sep
async def update_diarization(self, end_attributed_speaker, buffer_diarization=""):
"""Thread-safe update of diarization with new data."""
async with self.lock:
self.end_attributed_speaker = end_attributed_speaker
if buffer_diarization:
self.buffer_diarization = buffer_diarization
async def add_dummy_token(self):
"""Placeholder token when no transcription is available."""
async with self.lock:
current_time = time() - self.beg_loop
self.tokens.append(ASRToken(
start=current_time, end=current_time + 1,
text=".", speaker=-1, is_dummy=True
))
async def get_current_state(self):
"""Get current state."""
async with self.lock:
current_time = time()
# Calculate remaining times
remaining_transcription = 0
if self.end_buffer > 0:
remaining_transcription = max(0, round(current_time - self.beg_loop - self.end_buffer, 2))
remaining_diarization = 0
if self.tokens:
latest_end = max(self.end_buffer, self.tokens[-1].end if self.tokens else 0)
remaining_diarization = max(0, round(latest_end - self.end_attributed_speaker, 2))
return {
"tokens": self.tokens.copy(),
"buffer_transcription": self.buffer_transcription,
"buffer_diarization": self.buffer_diarization,
"end_buffer": self.end_buffer,
"end_attributed_speaker": self.end_attributed_speaker,
"sep": self.sep,
"remaining_time_transcription": remaining_transcription,
"remaining_time_diarization": remaining_diarization
}
async def reset(self):
"""Reset all state variables to initial values."""
async with self.lock:
self.tokens = []
self.buffer_transcription = self.buffer_diarization = ""
self.end_buffer = self.end_attributed_speaker = 0
self.full_transcription = self.last_response_content = ""
self.beg_loop = time()
async def ffmpeg_stdout_reader(self):
"""Read audio data from FFmpeg stdout and process it."""
loop = asyncio.get_event_loop()
beg = time()
while True:
try:
current_time = time()
elapsed_time = math.floor((current_time - beg) * 10) / 10
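# Read roughly as many bytes as FFmpeg should have produced since the last
# iteration (32 kB/s of 16 kHz s16le mono), with a 4 KiB floor.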
buffer_size = max(int(32000 * elapsed_time), 4096)
beg = current_time
# Restart FFmpeg if it has produced no output for longer than the allowed idle time
if current_time - self.last_ffmpeg_activity > self.ffmpeg_max_idle_time:
logger.warning(f"FFmpeg process idle for {current_time - self.last_ffmpeg_activity:.2f}s. Restarting...")
await self.restart_ffmpeg()
beg = time()
self.last_ffmpeg_activity = time()
continue
chunk = await loop.run_in_executor(None, self.ffmpeg_process.stdout.read, buffer_size)
if chunk:
self.last_ffmpeg_activity = time()
if not chunk:
logger.info("FFmpeg stdout closed, no more data to read.")
break
self.pcm_buffer.extend(chunk)
# Send to diarization if enabled
if self.args.diarization and self.diarization_queue:
await self.diarization_queue.put(
self.convert_pcm_to_float(self.pcm_buffer).copy()
)
# Process when enough data
if len(self.pcm_buffer) >= self.bytes_per_sec:
if len(self.pcm_buffer) > self.max_bytes_per_sec:
logger.warning(
f"Audio buffer too large: {len(self.pcm_buffer) / self.bytes_per_sec:.2f}s. "
f"Consider using a smaller model."
)
# Process audio chunk
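# At most max_bytes_per_sec (~5 s of audio) is transcribed per pass;
# any excess stays buffered for the next iteration.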
pcm_array = self.convert_pcm_to_float(self.pcm_buffer[:self.max_bytes_per_sec])
self.pcm_buffer = self.pcm_buffer[self.max_bytes_per_sec:]
# Send to transcription if enabled
if self.args.transcription and self.transcription_queue:
await self.transcription_queue.put(pcm_array.copy())
# Sleep if no processing is happening
if not self.args.transcription and not self.args.diarization:
await asyncio.sleep(0.1)
except Exception as e:
logger.warning(f"Exception in ffmpeg_stdout_reader: {e}")
logger.warning(f"Traceback: {traceback.format_exc()}")
break
logger.info("FFmpeg stdout processing finished. Signaling downstream processors.")
if self.args.transcription and self.transcription_queue:
await self.transcription_queue.put(SENTINEL)
logger.debug("Sentinel put into transcription_queue.")
if self.args.diarization and self.diarization_queue:
await self.diarization_queue.put(SENTINEL)
logger.debug("Sentinel put into diarization_queue.")
async def transcription_processor(self):
"""Process audio chunks for transcription."""
self.full_transcription = ""
self.sep = self.online.asr.sep
while True:
try:
pcm_array = await self.transcription_queue.get()
if pcm_array is SENTINEL:
logger.debug("Transcription processor received sentinel. Finishing.")
self.transcription_queue.task_done()
break
if not self.online: # Should not happen if queue is used
logger.warning("Transcription processor: self.online not initialized.")
self.transcription_queue.task_done()
continue
logger.info(f"{len(self.online.audio_buffer) / self.online.SAMPLING_RATE} seconds of audio to process.")
# Process transcription
self.online.insert_audio_chunk(pcm_array)
new_tokens = self.online.process_iter()
if new_tokens:
self.full_transcription += self.sep.join([t.text for t in new_tokens])
# Get buffer information
_buffer = self.online.get_buffer()
buffer = _buffer.text
end_buffer = _buffer.end if _buffer.end else (
new_tokens[-1].end if new_tokens else 0
)
# Avoid duplicating content
if buffer in self.full_transcription:
buffer = ""
await self.update_transcription(
new_tokens, buffer, end_buffer, self.full_transcription, self.sep
)
self.transcription_queue.task_done()
except Exception as e:
logger.warning(f"Exception in transcription_processor: {e}")
logger.warning(f"Traceback: {traceback.format_exc()}")
if 'pcm_array' in locals() and pcm_array is not SENTINEL: # Check if pcm_array was assigned from queue
self.transcription_queue.task_done()
logger.info("Transcription processor task finished.")
async def diarization_processor(self, diarization_obj):
"""Process audio chunks for speaker diarization."""
buffer_diarization = ""
while True:
try:
pcm_array = await self.diarization_queue.get()
if pcm_array is SENTINEL:
logger.debug("Diarization processor received sentinel. Finishing.")
self.diarization_queue.task_done()
break
# Process diarization
await diarization_obj.diarize(pcm_array)
# Get current state and update speakers
state = await self.get_current_state()
new_end = diarization_obj.assign_speakers_to_tokens(
state["end_attributed_speaker"], state["tokens"]
)
await self.update_diarization(new_end, buffer_diarization)
self.diarization_queue.task_done()
except Exception as e:
logger.warning(f"Exception in diarization_processor: {e}")
logger.warning(f"Traceback: {traceback.format_exc()}")
if 'pcm_array' in locals() and pcm_array is not SENTINEL:
self.diarization_queue.task_done()
logger.info("Diarization processor task finished.")
async def results_formatter(self):
"""Format processing results for output."""
while True:
try:
# Get current state
state = await self.get_current_state()
tokens = state["tokens"]
buffer_transcription = state["buffer_transcription"]
buffer_diarization = state["buffer_diarization"]
end_attributed_speaker = state["end_attributed_speaker"]
sep = state["sep"]
# Add dummy tokens if needed
if (not tokens or tokens[-1].is_dummy) and not self.args.transcription and self.args.diarization:
await self.add_dummy_token()
await asyncio.sleep(0.5)  # give the dummy token time to propagate without blocking the event loop
state = await self.get_current_state()
tokens = state["tokens"]
# Format output
previous_speaker = -1
lines = []
last_end_diarized = 0
undiarized_text = []
# Process each token
for token in tokens:
speaker = token.speaker
# Handle diarization
if self.args.diarization:
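# Tokens at or beyond end_attributed_speaker have no speaker yet: collect their text
# as undiarized; earlier tokens without a speaker inherit the previous speaker.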
if (speaker in [-1, 0]) and token.end >= end_attributed_speaker:
undiarized_text.append(token.text)
continue
elif (speaker in [-1, 0]) and token.end < end_attributed_speaker:
speaker = previous_speaker
if speaker not in [-1, 0]:
last_end_diarized = max(token.end, last_end_diarized)
# Group by speaker
if speaker != previous_speaker or not lines:
lines.append({
"speaker": speaker,
"text": token.text,
"beg": format_time(token.start),
"end": format_time(token.end),
"diff": round(token.end - last_end_diarized, 2)
})
previous_speaker = speaker
elif token.text: # Only append if text isn't empty
lines[-1]["text"] += sep + token.text
lines[-1]["end"] = format_time(token.end)
lines[-1]["diff"] = round(token.end - last_end_diarized, 2)
# Handle undiarized text
if undiarized_text:
combined = sep.join(undiarized_text)
if buffer_transcription:
combined += sep
await self.update_diarization(end_attributed_speaker, combined)
buffer_diarization = combined
# Create response object
if not lines:
lines = [{
"speaker": 1,
"text": "",
"beg": format_time(0),
"end": format_time(tokens[-1].end if tokens else 0),
"diff": 0
}]
response = {
"lines": lines,
"buffer_transcription": buffer_transcription,
"buffer_diarization": buffer_diarization,
"remaining_time_transcription": state["remaining_time_transcription"],
"remaining_time_diarization": state["remaining_time_diarization"]
}
# Only yield if content has changed
response_content = ' '.join([f"{line['speaker']} {line['text']}" for line in lines]) + \
f" | {buffer_transcription} | {buffer_diarization}"
if response_content != self.last_response_content and (lines or buffer_transcription or buffer_diarization):
yield response
self.last_response_content = response_content
# Check for termination condition
if self.is_stopping:
all_processors_done = True
if self.args.transcription and self.transcription_task and not self.transcription_task.done():
all_processors_done = False
if self.args.diarization and self.diarization_task and not self.diarization_task.done():
all_processors_done = False
if all_processors_done:
logger.info("Results formatter: All upstream processors are done and in stopping state. Terminating.")
final_state = await self.get_current_state()
return
await asyncio.sleep(0.1) # Avoid overwhelming the client
except Exception as e:
logger.warning(f"Exception in results_formatter: {e}")
logger.warning(f"Traceback: {traceback.format_exc()}")
await asyncio.sleep(0.5) # Back off on error
async def create_tasks(self):
"""Create and start processing tasks."""
self.all_tasks_for_cleanup = []
processing_tasks_for_watchdog = []
if self.args.transcription and self.online:
self.transcription_task = asyncio.create_task(self.transcription_processor())
self.all_tasks_for_cleanup.append(self.transcription_task)
processing_tasks_for_watchdog.append(self.transcription_task)
if self.args.diarization and self.diarization:
self.diarization_task = asyncio.create_task(self.diarization_processor(self.diarization))
self.all_tasks_for_cleanup.append(self.diarization_task)
processing_tasks_for_watchdog.append(self.diarization_task)
self.ffmpeg_reader_task = asyncio.create_task(self.ffmpeg_stdout_reader())
self.all_tasks_for_cleanup.append(self.ffmpeg_reader_task)
processing_tasks_for_watchdog.append(self.ffmpeg_reader_task)
# Monitor overall system health
self.watchdog_task = asyncio.create_task(self.watchdog(processing_tasks_for_watchdog))
self.all_tasks_for_cleanup.append(self.watchdog_task)
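# The caller iterates the returned async generator to stream formatted results to the client.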
return self.results_formatter()
async def watchdog(self, tasks_to_monitor):
"""Monitors the health of critical processing tasks."""
while True:
try:
await asyncio.sleep(10)
current_time = time()
for i, task in enumerate(tasks_to_monitor):
if task.done():
exc = task.exception()
task_name = task.get_name() if hasattr(task, 'get_name') else f"Monitored Task {i}"
if exc:
logger.error(f"{task_name} unexpectedly completed with exception: {exc}")
else:
logger.info(f"{task_name} completed normally.")
ffmpeg_idle_time = current_time - self.last_ffmpeg_activity
if ffmpeg_idle_time > 15:
logger.warning(f"FFmpeg idle for {ffmpeg_idle_time:.2f}s - may need attention.")
if ffmpeg_idle_time > 30 and not self.is_stopping:
logger.error("FFmpeg idle for too long and not in stopping phase, forcing restart.")
await self.restart_ffmpeg()
except asyncio.CancelledError:
logger.info("Watchdog task cancelled.")
break
except Exception as e:
logger.error(f"Error in watchdog task: {e}", exc_info=True)
async def cleanup(self):
"""Clean up resources when processing is complete."""
logger.info("Starting cleanup of AudioProcessor resources.")
for task in self.all_tasks_for_cleanup:
if task and not task.done():
task.cancel()
created_tasks = [t for t in self.all_tasks_for_cleanup if t]
if created_tasks:
await asyncio.gather(*created_tasks, return_exceptions=True)
logger.info("All processing tasks cancelled or finished.")
if self.ffmpeg_process:
if self.ffmpeg_process.stdin and not self.ffmpeg_process.stdin.closed:
try:
self.ffmpeg_process.stdin.close()
except Exception as e:
logger.warning(f"Error closing ffmpeg stdin during cleanup: {e}")
# Wait for ffmpeg process to terminate
if self.ffmpeg_process.poll() is None: # Check if process is still running
logger.info("Waiting for FFmpeg process to terminate...")
try:
# Run wait in executor to avoid blocking async loop
await asyncio.get_event_loop().run_in_executor(None, self.ffmpeg_process.wait, 5.0) # 5s timeout
except Exception as e: # includes subprocess.TimeoutExpired raised by wait(timeout=5.0) via run_in_executor
logger.warning(f"FFmpeg did not terminate gracefully, killing. Error: {e}")
self.ffmpeg_process.kill()
await asyncio.get_event_loop().run_in_executor(None, self.ffmpeg_process.wait) # Wait for kill
logger.info("FFmpeg process terminated.")
if self.args.diarization and hasattr(self, 'diarization') and hasattr(self.diarization, 'close'):
self.diarization.close()
logger.info("AudioProcessor cleanup complete.")
async def process_audio(self, message):
"""Process incoming audio data."""
# If already stopping or stdin is closed, ignore further audio, especially residual chunks.
if self.is_stopping or (self.ffmpeg_process and self.ffmpeg_process.stdin and self.ffmpeg_process.stdin.closed):
logger.warning(f"AudioProcessor is stopping or stdin is closed. Ignoring incoming audio message (length: {len(message)}).")
if not message and self.ffmpeg_process and self.ffmpeg_process.stdin and not self.ffmpeg_process.stdin.closed:
logger.info("Received empty message while already in stopping state; ensuring stdin is closed.")
try:
self.ffmpeg_process.stdin.close()
except Exception as e:
logger.warning(f"Error closing ffmpeg stdin on redundant stop signal during stopping state: {e}")
return
if not message: # primary signal to start stopping
logger.info("Empty audio message received, initiating stop sequence.")
self.is_stopping = True
if self.ffmpeg_process and self.ffmpeg_process.stdin and not self.ffmpeg_process.stdin.closed:
try:
self.ffmpeg_process.stdin.close()
logger.info("FFmpeg stdin closed due to primary stop signal.")
except Exception as e:
logger.warning(f"Error closing ffmpeg stdin on stop: {e}")
return
retry_count = 0
max_retries = 3
# Log a periodic heartbeat showing that audio processing is ongoing
current_time = time()
if not hasattr(self, '_last_heartbeat') or current_time - self._last_heartbeat >= 10:
logger.debug(f"Processing audio chunk, last FFmpeg activity: {current_time - self.last_ffmpeg_activity:.2f}s ago")
self._last_heartbeat = current_time
while retry_count < max_retries:
try:
if not self.ffmpeg_process or not hasattr(self.ffmpeg_process, 'stdin') or self.ffmpeg_process.poll() is not None:
logger.warning("FFmpeg process not available, restarting...")
await self.restart_ffmpeg()
loop = asyncio.get_running_loop()
try:
await asyncio.wait_for(
loop.run_in_executor(None, lambda: self.ffmpeg_process.stdin.write(message)),
timeout=2.0
)
except asyncio.TimeoutError:
logger.warning("FFmpeg write operation timed out, restarting...")
await self.restart_ffmpeg()
retry_count += 1
continue
try:
await asyncio.wait_for(
loop.run_in_executor(None, self.ffmpeg_process.stdin.flush),
timeout=2.0
)
except asyncio.TimeoutError:
logger.warning("FFmpeg flush operation timed out, restarting...")
await self.restart_ffmpeg()
retry_count += 1
continue
self.last_ffmpeg_activity = time()
return
except (BrokenPipeError, AttributeError, OSError) as e:
retry_count += 1
logger.warning(f"Error writing to FFmpeg: {e}. Retry {retry_count}/{max_retries}...")
if retry_count < max_retries:
await self.restart_ffmpeg()
await asyncio.sleep(0.5)
else:
logger.error("Maximum retries reached for FFmpeg process")
await self.restart_ffmpeg()
return
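# --- Usage sketch (illustrative only; not executed by the library) -----------
# A minimal sketch of how AudioProcessor is meant to be driven, assuming the caller
# receives WebM-encoded audio chunks (e.g. over a WebSocket). `receive_chunks` is a
# hypothetical placeholder for that source.
#
# async def handle_stream(receive_chunks):
#     processor = AudioProcessor()
#     results = await processor.create_tasks()      # async generator of response dicts
#
#     async def forward_results():
#         async for response in results:
#             ...                                    # send the response dict to the client
#
#     forwarder = asyncio.create_task(forward_results())
#     async for chunk in receive_chunks():
#         await processor.process_audio(chunk)       # raw WebM bytes are piped to FFmpeg
#     await processor.process_audio(b"")             # empty message = end-of-stream signal
#     await forwarder
#     await processor.cleanup()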