refacto 0

This commit extracts the audio pipeline, shared state, response formatting, and CLI parsing out of the monolithic whisper_fastapi_online_server.py into dedicated modules:

- audio.py +306 -0 (new)
- diarization/diarization_online.py +3 -3
- formatters.py +91 -0 (new)
- parse_args.py +52 -0 (new)
- state.py +96 -0 (new)
- whisper_fastapi_online_server.py +23 -400
- whisper_streaming_custom/whisper_online.py +1 -1
audio.py (ADDED, +306 lines)
```python
import asyncio
import logging
import math
import traceback
from time import time

import ffmpeg
import numpy as np

from whisper_streaming_custom.whisper_online import online_factory
from state import SharedState
from formatters import format_time

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logging.getLogger().setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


class AudioProcessor:

    def __init__(self, args, asr, tokenizer):
        self.args = args
        self.sample_rate = 16000
        self.channels = 1
        self.samples_per_sec = int(self.sample_rate * args.min_chunk_size)
        self.bytes_per_sample = 2  # s16le = 2 bytes per sample
        self.bytes_per_sec = self.samples_per_sec * self.bytes_per_sample
        self.max_bytes_per_sec = 32000 * 5  # 5 seconds of audio at 32,000 bytes/s (16 kHz s16le)
        self.shared_state = SharedState()
        self.asr = asr
        self.tokenizer = tokenizer

    def convert_pcm_to_float(self, pcm_buffer):
        """
        Convert a PCM buffer in s16le format to a normalized NumPy array.

        Arg: pcm_buffer. PCM buffer containing raw audio data in s16le format.
        Returns: np.ndarray. float32 array normalized between -1.0 and 1.0.
        """
        pcm_array = (np.frombuffer(pcm_buffer, dtype=np.int16).astype(np.float32)
                     / 32768.0)
        return pcm_array

    async def start_ffmpeg_decoder(self):
        """
        Start an FFmpeg process in async streaming mode that reads WebM from stdin
        and outputs raw s16le PCM on stdout. Returns the process object.
        """
        process = (
            ffmpeg.input("pipe:0", format="webm")
            .output(
                "pipe:1",
                format="s16le",
                acodec="pcm_s16le",
                ac=self.channels,
                ar=str(self.sample_rate),
            )
            .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True)
        )
        return process

    async def restart_ffmpeg(self, ffmpeg_process, online, pcm_buffer):
        """Kill the current FFmpeg process (if any), start a fresh one, and reset state."""
        if ffmpeg_process:
            try:
                ffmpeg_process.kill()
                await asyncio.get_event_loop().run_in_executor(None, ffmpeg_process.wait)
            except Exception as e:
                logger.warning(f"Error killing FFmpeg process: {e}")
        ffmpeg_process = await self.start_ffmpeg_decoder()
        pcm_buffer = bytearray()

        if self.args.transcription:
            online = online_factory(self.args, self.asr, self.tokenizer)

        await self.shared_state.reset()
        logger.info("FFmpeg process started.")
        return ffmpeg_process, online, pcm_buffer

    async def ffmpeg_stdout_reader(self, ffmpeg_process, pcm_buffer, diarization_queue, transcription_queue):
        loop = asyncio.get_event_loop()
        beg = time()
        # Not created here; restart_ffmpeg() builds a fresh online instance when
        # transcription is enabled. (Without this initialization, the restart path
        # below would raise a NameError.)
        online = None

        while True:
            try:
                elapsed_time = math.floor((time() - beg) * 10) / 10  # Round down to 0.1 s
                ffmpeg_buffer_from_duration = max(int(32000 * elapsed_time), 4096)
                beg = time()

                # Read chunk with timeout
                try:
                    chunk = await asyncio.wait_for(
                        loop.run_in_executor(
                            None, ffmpeg_process.stdout.read, ffmpeg_buffer_from_duration
                        ),
                        timeout=15.0
                    )
                except asyncio.TimeoutError:
                    logger.warning("FFmpeg read timeout. Restarting...")
                    ffmpeg_process, online, pcm_buffer = await self.restart_ffmpeg(ffmpeg_process, online, pcm_buffer)
                    beg = time()
                    continue  # Skip processing and read from the new process

                if not chunk:
                    logger.info("FFmpeg stdout closed.")
                    break
                pcm_buffer.extend(chunk)

                if self.args.diarization and diarization_queue:
                    await diarization_queue.put(self.convert_pcm_to_float(pcm_buffer).copy())

                if len(pcm_buffer) >= self.bytes_per_sec:
                    if len(pcm_buffer) > self.max_bytes_per_sec:
                        logger.warning(
                            f"Audio buffer is too large: {len(pcm_buffer) / self.bytes_per_sec:.2f} seconds. "
                            "The model probably struggles to keep up. Consider using a smaller model."
                        )

                    pcm_array = self.convert_pcm_to_float(pcm_buffer[:self.max_bytes_per_sec])
                    pcm_buffer = pcm_buffer[self.max_bytes_per_sec:]

                    if self.args.transcription and transcription_queue:
                        await transcription_queue.put(pcm_array.copy())

                if not self.args.transcription and not self.args.diarization:
                    await asyncio.sleep(0.1)

            except Exception as e:
                logger.warning(f"Exception in ffmpeg_stdout_reader: {e}")
                logger.warning(f"Traceback: {traceback.format_exc()}")
                break
        logger.info("Exiting ffmpeg_stdout_reader...")

    async def transcription_processor(self, pcm_queue, online):
        full_transcription = ""
        sep = online.asr.sep

        while True:
            try:
                pcm_array = await pcm_queue.get()

                logger.info(f"{len(online.audio_buffer) / online.SAMPLING_RATE} seconds of audio will be processed by the model.")

                # Process transcription
                online.insert_audio_chunk(pcm_array)
                new_tokens = online.process_iter()

                if new_tokens:
                    full_transcription += sep.join([t.text for t in new_tokens])

                _buffer = online.get_buffer()
                buffer = _buffer.text
                end_buffer = _buffer.end if _buffer.end else (new_tokens[-1].end if new_tokens else 0)

                if buffer in full_transcription:
                    buffer = ""

                await self.shared_state.update_transcription(
                    new_tokens, buffer, end_buffer, full_transcription, sep)

            except Exception as e:
                logger.warning(f"Exception in transcription_processor: {e}")
                logger.warning(f"Traceback: {traceback.format_exc()}")
            finally:
                pcm_queue.task_done()

    async def diarization_processor(self, pcm_queue, diarization_obj):
        buffer_diarization = ""

        while True:
            try:
                pcm_array = await pcm_queue.get()

                # Process diarization
                await diarization_obj.diarize(pcm_array)

                # Get current state
                state = await self.shared_state.get_current_state()
                tokens = state["tokens"]
                end_attributed_speaker = state["end_attributed_speaker"]

                # Update speaker information
                new_end_attributed_speaker = diarization_obj.assign_speakers_to_tokens(
                    end_attributed_speaker, tokens)

                await self.shared_state.update_diarization(new_end_attributed_speaker, buffer_diarization)

            except Exception as e:
                logger.warning(f"Exception in diarization_processor: {e}")
                logger.warning(f"Traceback: {traceback.format_exc()}")
            finally:
                pcm_queue.task_done()

    async def results_formatter(self, websocket):
        while True:
            try:
                # Get the current state
                state = await self.shared_state.get_current_state()
                tokens = state["tokens"]
                buffer_transcription = state["buffer_transcription"]
                buffer_diarization = state["buffer_diarization"]
                end_attributed_speaker = state["end_attributed_speaker"]
                remaining_time_transcription = state["remaining_time_transcription"]
                remaining_time_diarization = state["remaining_time_diarization"]
                sep = state["sep"]

                # If diarization is enabled but transcription is not, add dummy tokens periodically
                if (not tokens or tokens[-1].is_dummy) and not self.args.transcription and self.args.diarization:
                    await self.shared_state.add_dummy_token()
                    # Non-blocking pause; a blocking time.sleep() here would stall the event loop.
                    await asyncio.sleep(0.5)
                    state = await self.shared_state.get_current_state()
                    tokens = state["tokens"]

                # Process tokens to create the response
                previous_speaker = -1
                lines = []
                last_end_diarized = 0
                undiarized_text = []

                for token in tokens:
                    speaker = token.speaker
                    if self.args.diarization:
                        if (speaker == -1 or speaker == 0) and token.end >= end_attributed_speaker:
                            undiarized_text.append(token.text)
                            continue
                        elif (speaker == -1 or speaker == 0) and token.end < end_attributed_speaker:
                            speaker = previous_speaker
                        if speaker not in [-1, 0]:
                            last_end_diarized = max(token.end, last_end_diarized)

                    if speaker != previous_speaker or not lines:
                        lines.append(
                            {
                                "speaker": speaker,
                                "text": token.text,
                                "beg": format_time(token.start),
                                "end": format_time(token.end),
                                "diff": round(token.end - last_end_diarized, 2)
                            }
                        )
                        previous_speaker = speaker
                    elif token.text:  # Only append if text isn't empty
                        lines[-1]["text"] += sep + token.text
                        lines[-1]["end"] = format_time(token.end)
                        lines[-1]["diff"] = round(token.end - last_end_diarized, 2)

                if undiarized_text:
                    combined_buffer_diarization = sep.join(undiarized_text)
                    if buffer_transcription:
                        combined_buffer_diarization += sep
                    await self.shared_state.update_diarization(end_attributed_speaker, combined_buffer_diarization)
                    buffer_diarization = combined_buffer_diarization

                if lines:
                    response = {
                        "lines": lines,
                        "buffer_transcription": buffer_transcription,
                        "buffer_diarization": buffer_diarization,
                        "remaining_time_transcription": remaining_time_transcription,
                        "remaining_time_diarization": remaining_time_diarization
                    }
                else:
                    response = {
                        "lines": [{
                            "speaker": 1,
                            "text": "",
                            "beg": format_time(0),
                            "end": format_time(tokens[-1].end) if tokens else format_time(0),
                            "diff": 0
                        }],
                        "buffer_transcription": buffer_transcription,
                        "buffer_diarization": buffer_diarization,
                        "remaining_time_transcription": remaining_time_transcription,
                        "remaining_time_diarization": remaining_time_diarization
                    }

                response_content = ' '.join([str(line['speaker']) + ' ' + line["text"] for line in lines]) + ' | ' + buffer_transcription + ' | ' + buffer_diarization

                if response_content != self.shared_state.last_response_content:
                    if lines or buffer_transcription or buffer_diarization:
                        await websocket.send_json(response)
                        self.shared_state.last_response_content = response_content

                # Add a small delay to avoid overwhelming the client
                await asyncio.sleep(0.1)

            except Exception as e:
                logger.warning(f"Exception in results_formatter: {e}")
                logger.warning(f"Traceback: {traceback.format_exc()}")
                await asyncio.sleep(0.5)  # Back off on error
```
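The byte-rate arithmetic behind `bytes_per_sec` and `max_bytes_per_sec` is easy to sanity-check offline. A minimal sketch, assuming only NumPy (the 440 Hz test tone is illustrative, not from the repo), mirroring the same s16le conversion `convert_pcm_to_float` performs:

```python
import numpy as np

SAMPLE_RATE = 16000   # Hz, matches AudioProcessor.sample_rate
BYTES_PER_SAMPLE = 2  # s16le
assert SAMPLE_RATE * BYTES_PER_SAMPLE == 32000  # so max_bytes_per_sec = 32000 * 5 is 5 s of audio

# Round-trip one second of a test tone through the s16le encoding.
t = np.linspace(0, 1, SAMPLE_RATE, endpoint=False)
tone = 0.5 * np.sin(2 * np.pi * 440 * t)            # float values in [-1, 1]
pcm = (tone * 32768.0).astype(np.int16).tobytes()   # encode to s16le bytes
assert len(pcm) == 32000                            # 1 s of 16 kHz mono s16le

decoded = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0
assert np.abs(decoded - tone).max() < 2 / 32768     # quantization error only
```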
diarization/diarization_online.py (CHANGED)

```diff
@@ -5,7 +5,7 @@ import numpy as np
 import logging
 
 
-from diart import SpeakerDiarization
+from diart import SpeakerDiarization, SpeakerDiarizationConfig
 from diart.inference import StreamingInference
 from diart.sources import AudioSource
 from timed_objects import SpeakerSegment
@@ -103,8 +103,8 @@ class WebSocketAudioSource(AudioSource):
 
 
 class DiartDiarization:
-    def __init__(self, sample_rate: int, use_microphone: bool = False):
-        self.pipeline = SpeakerDiarization()
+    def __init__(self, sample_rate: int, config: SpeakerDiarizationConfig = None, use_microphone: bool = False):
+        self.pipeline = SpeakerDiarization(config=config)
         self.observer = DiarizationObserver()
 
         if use_microphone:
```
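The new `config` parameter lets callers tune the diart pipeline instead of taking its defaults. A hedged sketch: the parameter names follow diart's documented `SpeakerDiarizationConfig`, but the values are illustrative and should be checked against the installed diart version:

```python
from diart import SpeakerDiarizationConfig
from diarization.diarization_online import DiartDiarization

# Hypothetical tuning: half-second inference steps with matching latency,
# and a slightly stricter speaker-activity threshold.
config = SpeakerDiarizationConfig(
    step=0.5,        # seconds of audio consumed per inference step
    latency=0.5,     # output latency in seconds (must be >= step)
    tau_active=0.6,  # speaker-activity detection threshold
)
diarization = DiartDiarization(sample_rate=16000, config=config)
```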
formatters.py (ADDED, +91 lines)

```python
from typing import Dict, Any
from datetime import timedelta


def format_time(seconds: float) -> str:
    """Format a duration in seconds as H:MM:SS (hours are not zero-padded)."""
    return str(timedelta(seconds=int(seconds)))


def format_response(state: Dict[str, Any], with_diarization: bool = False) -> Dict[str, Any]:
    """
    Format the shared state into a client-friendly response.

    Args:
        state: Current shared state dictionary
        with_diarization: Whether to include diarization formatting

    Returns:
        Formatted response dictionary ready to send to the client
    """
    tokens = state["tokens"]
    buffer_transcription = state["buffer_transcription"]
    buffer_diarization = state["buffer_diarization"]
    end_attributed_speaker = state["end_attributed_speaker"]
    remaining_time_transcription = state["remaining_time_transcription"]
    remaining_time_diarization = state["remaining_time_diarization"]
    sep = state["sep"]

    # Default response for empty state
    if not tokens:
        return {
            "lines": [{
                "speaker": 1,
                "text": "",
                "beg": format_time(0),
                "end": format_time(0),
                "diff": 0
            }],
            "buffer_transcription": buffer_transcription,
            "buffer_diarization": buffer_diarization,
            "remaining_time_transcription": remaining_time_transcription,
            "remaining_time_diarization": remaining_time_diarization
        }

    # Process tokens to create the response
    previous_speaker = -1
    lines = []
    last_end_diarized = 0
    undiarized_text = []

    for token in tokens:
        speaker = token.speaker

        # Handle diarization logic
        if with_diarization:
            if (speaker == -1 or speaker == 0) and token.end >= end_attributed_speaker:
                undiarized_text.append(token.text)
                continue
            elif (speaker == -1 or speaker == 0) and token.end < end_attributed_speaker:
                speaker = previous_speaker

            if speaker not in [-1, 0]:
                last_end_diarized = max(token.end, last_end_diarized)

        # Add a new line or append to the existing one
        if speaker != previous_speaker or not lines:
            lines.append({
                "speaker": speaker,
                "text": token.text,
                "beg": format_time(token.start),
                "end": format_time(token.end),
                "diff": round(token.end - last_end_diarized, 2)
            })
            previous_speaker = speaker
        elif token.text:  # Only append if text isn't empty
            lines[-1]["text"] += sep + token.text
            lines[-1]["end"] = format_time(token.end)
            lines[-1]["diff"] = round(token.end - last_end_diarized, 2)

    # If we have undiarized text, include it in the buffer
    if undiarized_text:
        combined_buffer = sep.join(undiarized_text)
        if buffer_transcription:
            combined_buffer += sep + buffer_transcription
        buffer_diarization = combined_buffer

    return {
        "lines": lines,
        "buffer_transcription": buffer_transcription,
        "buffer_diarization": buffer_diarization,
        "remaining_time_transcription": remaining_time_transcription,
        "remaining_time_diarization": remaining_time_diarization
    }
```
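A quick check of the formatter behavior. The `SimpleNamespace` token below is a stand-in for `timed_objects.ASRToken` so the snippet runs standalone; its field names match what `format_response` reads:

```python
from types import SimpleNamespace
from formatters import format_time, format_response

print(format_time(0))     # '0:00:00'
print(format_time(3661))  # '1:01:01' (note: hours are not zero-padded)

state = {
    "tokens": [SimpleNamespace(speaker=1, text="hello", start=0.0, end=1.2)],
    "buffer_transcription": "wor",
    "buffer_diarization": "",
    "end_attributed_speaker": 0.0,
    "remaining_time_transcription": 0.3,
    "remaining_time_diarization": 0.0,
    "sep": " ",
}
resp = format_response(state, with_diarization=False)
print(resp["lines"])
# [{'speaker': 1, 'text': 'hello', 'beg': '0:00:00', 'end': '0:00:01', 'diff': 1.2}]
```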
parse_args.py (ADDED, +52 lines)

```python
import argparse

from whisper_streaming_custom.whisper_online import add_shared_args


def parse_args():
    parser = argparse.ArgumentParser(description="Whisper FastAPI Online Server")
    parser.add_argument(
        "--host",
        type=str,
        default="localhost",
        help="The host address to bind the server to.",
    )
    parser.add_argument(
        "--port", type=int, default=8000, help="The port number to bind the server to."
    )
    parser.add_argument(
        "--warmup-file",
        type=str,
        default=None,
        dest="warmup_file",
        help="""
        The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast.
        If not set, uses https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav.
        If False, no warmup is performed.
        """,
    )

    # NOTE: type=bool coerces any non-empty string to True, so passing e.g.
    # "--confidence-validation False" on the command line still yields True.
    parser.add_argument(
        "--confidence-validation",
        type=bool,
        default=False,
        help="Accelerates validation of tokens using confidence scores. Transcription will be faster but punctuation might be less accurate.",
    )

    parser.add_argument(
        "--diarization",
        type=bool,
        default=True,
        help="Whether to enable speaker diarization.",
    )

    parser.add_argument(
        "--transcription",
        type=bool,
        default=True,
        help="Disable to only see live diarization results.",
    )

    add_shared_args(parser)
    args = parser.parse_args()
    return args
```
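One caveat worth knowing about the boolean flags above: `type=bool` does not parse booleans the way users expect, because `bool()` of any non-empty string is `True`. A short sketch of the behavior, plus a more predictable alternative (`argparse.BooleanOptionalAction`, available from Python 3.9; switching to it would change the CLI surface, so it is only a suggestion):

```python
import argparse

p = argparse.ArgumentParser()
p.add_argument("--diarization", type=bool, default=True)
print(p.parse_args(["--diarization", "False"]).diarization)  # True: "False" is a non-empty string

# A more predictable alternative (Python >= 3.9): adds paired --diarization/--no-diarization flags.
q = argparse.ArgumentParser()
q.add_argument("--diarization", action=argparse.BooleanOptionalAction, default=True)
print(q.parse_args(["--no-diarization"]).diarization)  # False
```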
state.py (ADDED, +96 lines)

```python
import asyncio
import logging
from time import time
from typing import List, Dict, Any

from timed_objects import ASRToken

logger = logging.getLogger(__name__)


class SharedState:
    """
    Coroutine-safe state manager for streaming transcription and diarization
    (all mutation goes through an asyncio.Lock). Coordinates the audio
    processing, transcription, and diarization tasks.
    """

    def __init__(self):
        self.tokens: List[ASRToken] = []
        self.buffer_transcription: str = ""
        self.buffer_diarization: str = ""
        self.full_transcription: str = ""
        self.end_buffer: float = 0
        self.end_attributed_speaker: float = 0
        self.lock = asyncio.Lock()
        self.beg_loop: float = time()
        self.sep: str = " "  # Default separator
        self.last_response_content: str = ""  # To track changes in the response

    async def update_transcription(self, new_tokens: List[ASRToken], buffer: str,
                                   end_buffer: float, full_transcription: str, sep: str) -> None:
        """Update the state with new transcription data."""
        async with self.lock:
            self.tokens.extend(new_tokens)
            self.buffer_transcription = buffer
            self.end_buffer = end_buffer
            self.full_transcription = full_transcription
            self.sep = sep

    async def update_diarization(self, end_attributed_speaker: float, buffer_diarization: str = "") -> None:
        """Update the state with new diarization data."""
        async with self.lock:
            self.end_attributed_speaker = end_attributed_speaker
            if buffer_diarization:
                self.buffer_diarization = buffer_diarization

    async def add_dummy_token(self) -> None:
        """Add a dummy token to keep the state updated even without transcription."""
        async with self.lock:
            current_time = time() - self.beg_loop
            dummy_token = ASRToken(
                start=current_time,
                end=current_time + 1,
                text=".",
                speaker=-1,
                is_dummy=True
            )
            self.tokens.append(dummy_token)

    async def get_current_state(self) -> Dict[str, Any]:
        """Get a snapshot of the current state with calculated timing information."""
        async with self.lock:
            current_time = time()
            remaining_time_transcription = 0
            remaining_time_diarization = 0

            # Lag of the transcription buffer behind real time
            if self.end_buffer > 0:
                remaining_time_transcription = max(0, round(current_time - self.beg_loop - self.end_buffer, 2))

            # Lag of speaker attribution behind the newest transcribed audio
            if self.tokens:
                latest_end = max(self.end_buffer, self.tokens[-1].end)
                remaining_time_diarization = max(0, round(latest_end - self.end_attributed_speaker, 2))

            return {
                "tokens": self.tokens.copy(),
                "buffer_transcription": self.buffer_transcription,
                "buffer_diarization": self.buffer_diarization,
                "end_buffer": self.end_buffer,
                "end_attributed_speaker": self.end_attributed_speaker,
                "sep": self.sep,
                "remaining_time_transcription": remaining_time_transcription,
                "remaining_time_diarization": remaining_time_diarization
            }

    async def reset(self) -> None:
        """Reset the state to initial values."""
        async with self.lock:
            self.tokens = []
            self.buffer_transcription = ""
            self.buffer_diarization = ""
            self.end_buffer = 0
            self.end_attributed_speaker = 0
            self.full_transcription = ""
            self.beg_loop = time()
            self.last_response_content = ""
```
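A minimal sketch of how `SharedState` is meant to be driven from concurrent coroutines. The inline `Token` dataclass is a stand-in for `timed_objects.ASRToken` so the snippet runs standalone:

```python
import asyncio
from dataclasses import dataclass
from state import SharedState

@dataclass
class Token:  # stand-in for timed_objects.ASRToken
    start: float
    end: float
    text: str
    speaker: int = -1
    is_dummy: bool = False

async def main():
    state = SharedState()
    # Producers and readers can run concurrently; the asyncio.Lock serializes access.
    await state.update_transcription([Token(0.0, 0.8, "hello")], buffer="wor",
                                     end_buffer=1.2, full_transcription="hello", sep=" ")
    snapshot = await state.get_current_state()
    print(snapshot["tokens"][0].text, snapshot["remaining_time_transcription"])

asyncio.run(main())
```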
whisper_fastapi_online_server.py (CHANGED, +23 -400)

The server shrinks from roughly 500 lines to about 140: the CLI parser moves to parse_args.py, the SharedState class to state.py, format_time to formatters.py, and the FFmpeg decoding, transcription, diarization, and result-formatting coroutines into the AudioProcessor class in audio.py (all shown above, so the deleted bodies are summarized rather than repeated here). Note one behavioral change hidden in the move: the `--diarization` default flips from `False` here to `True` in parse_args.py. The remaining diff:

```diff
@@ -17,147 +17,28 @@ import math
 import logging
 from datetime import timedelta
 import traceback
+from state import SharedState
+from formatters import format_time
+from parse_args import parse_args
+from audio import AudioProcessor
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 logging.getLogger().setLevel(logging.WARNING)
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
 
-##### LOAD ARGS #####
-
-parser = argparse.ArgumentParser(description="Whisper FastAPI Online Server")
-# ... the --host, --port, --warmup-file, --confidence-validation, --diarization,
-# ... and --transcription arguments: moved to parse_args.py ...
-
-args = parser.parse_args()
+args = parse_args()
 
 SAMPLE_RATE = 16000
-CHANNELS = 1
-SAMPLES_PER_SEC = int(SAMPLE_RATE * args.min_chunk_size)
-BYTES_PER_SAMPLE = 2  # s16le = 2 bytes per sample
-BYTES_PER_SEC = SAMPLES_PER_SEC * BYTES_PER_SAMPLE
-MAX_BYTES_PER_SEC = 32000 * 5  # 5 seconds of audio at 32 kHz
-
-class SharedState:
-    # ... moved verbatim to state.py ...
+# CHANNELS = 1
+# SAMPLES_PER_SEC = int(SAMPLE_RATE * args.min_chunk_size)
+# BYTES_PER_SAMPLE = 2  # s16le = 2 bytes per sample
+# BYTES_PER_SEC = SAMPLES_PER_SEC * BYTES_PER_SAMPLE
+# MAX_BYTES_PER_SEC = 32000 * 5  # 5 seconds of audio at 32 kHz
 
 ##### LOAD APP #####
 
 @asynccontextmanager
@@ -190,300 +71,45 @@ app.add_middleware(
 with open("web/live_transcription.html", "r", encoding="utf-8") as f:
     html = f.read()
 
-# convert_pcm_to_float(), start_ffmpeg_decoder(), transcription_processor(),
-# diarization_processor(), results_formatter(), and the ##### ENDPOINTS #####
-# banner: removed here; these now live on AudioProcessor in audio.py.
-
 @app.get("/")
 async def get():
     return HTMLResponse(html)
 
 @app.websocket("/asr")
 async def websocket_endpoint(websocket: WebSocket):
+    audio_processor = AudioProcessor(args, asr, tokenizer)
+
     await websocket.accept()
     logger.info("WebSocket connection opened.")
 
     ffmpeg_process = None
     pcm_buffer = bytearray()
-    shared_state = SharedState()
 
     transcription_queue = asyncio.Queue() if args.transcription else None
     diarization_queue = asyncio.Queue() if args.diarization else None
 
     online = None
 
-    async def restart_ffmpeg():
-        nonlocal ffmpeg_process, online, pcm_buffer
-        # ... body now AudioProcessor.restart_ffmpeg() ...
-
-    await restart_ffmpeg()
+    ffmpeg_process, online, pcm_buffer = await audio_processor.restart_ffmpeg(ffmpeg_process, online, pcm_buffer)
 
     tasks = []
     if args.transcription and online:
         tasks.append(asyncio.create_task(
-            transcription_processor(shared_state, transcription_queue, online)))
+            audio_processor.transcription_processor(transcription_queue, online)))
     if args.diarization and diarization:
         tasks.append(asyncio.create_task(
-            diarization_processor(shared_state, diarization_queue, diarization)))
-    formatter_task = asyncio.create_task(results_formatter(shared_state, websocket))
+            audio_processor.diarization_processor(diarization_queue, diarization)))
+    formatter_task = asyncio.create_task(audio_processor.results_formatter(websocket))
     tasks.append(formatter_task)
-
-    async def ffmpeg_stdout_reader():
-        # ... body now AudioProcessor.ffmpeg_stdout_reader() ...
-
-    stdout_reader_task = asyncio.create_task(ffmpeg_stdout_reader())
+    stdout_reader_task = asyncio.create_task(audio_processor.ffmpeg_stdout_reader(ffmpeg_process, pcm_buffer, diarization_queue, transcription_queue))
     tasks.append(stdout_reader_task)
     try:
         while True:
             # Receive incoming WebM audio chunks from the client
@@ -493,7 +119,7 @@ async def websocket_endpoint(websocket: WebSocket):
             ffmpeg_process.stdin.flush()
         except (BrokenPipeError, AttributeError) as e:
             logger.warning(f"Error writing to FFmpeg: {e}. Restarting...")
-            await restart_ffmpeg()
+            ffmpeg_process, online, pcm_buffer = await audio_processor.restart_ffmpeg(ffmpeg_process, online, pcm_buffer)
             ffmpeg_process.stdin.write(message)
             ffmpeg_process.stdin.flush()
     except WebSocketDisconnect:
@@ -501,17 +127,14 @@ async def websocket_endpoint(websocket: WebSocket):
     finally:
         for task in tasks:
             task.cancel()
-
         try:
             await asyncio.gather(*tasks, return_exceptions=True)
             ffmpeg_process.stdin.close()
             ffmpeg_process.wait()
         except Exception as e:
             logger.warning(f"Error during cleanup: {e}")
-
         if args.diarization and diarization:
             diarization.close()
-
         logger.info("WebSocket endpoint cleaned up.")
 
 if __name__ == "__main__":
```
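For reference, a hedged client sketch against the `/asr` endpoint. It assumes the third-party `websockets` package and a pre-recorded WebM file; the chunk size and pacing are illustrative, roughly imitating a live microphone stream:

```python
import asyncio
import json
import websockets  # pip install websockets

async def stream(path="sample.webm", url="ws://localhost:8000/asr"):
    async with websockets.connect(url) as ws:
        async def receive():
            async for message in ws:  # server pushes JSON responses as the state changes
                print(json.dumps(json.loads(message), indent=2))
        recv_task = asyncio.create_task(receive())
        with open(path, "rb") as f:
            while chunk := f.read(16000):
                await ws.send(chunk)      # server pipes these bytes into FFmpeg's stdin
                await asyncio.sleep(0.5)  # pace the upload like a live capture
        recv_task.cancel()

asyncio.run(stream())
```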
whisper_streaming_custom/whisper_online.py (CHANGED; the previous default value is truncated in the source)

```diff
@@ -71,7 +71,7 @@ def add_shared_args(parser):
     parser.add_argument(
         "--min-chunk-size",
         type=float,
-        default=
+        default=0.5,
         help="Minimum audio chunk size in seconds. It waits up to this time to do processing. If the processing takes shorter time, it waits, otherwise it processes the whole segment that was received by this time.",
     )
     parser.add_argument(
```
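This default feeds directly into the AudioProcessor constructor shown earlier. With `min_chunk_size=0.5`, the processing threshold works out as follows (plain arithmetic mirroring the constructor; note the repo's `samples_per_sec` name actually holds the samples per chunk):

```python
sample_rate = 16000
min_chunk_size = 0.5  # new default
samples_per_sec = int(sample_rate * min_chunk_size)  # 8000 samples per chunk
bytes_per_sec = samples_per_sec * 2                  # 16000 bytes: process every 0.5 s of audio
print(samples_per_sec, bytes_per_sec)                # 8000 16000
```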