nuernie committed
Commit · 7222c68
Parent(s): d17f6c7
initial commit
Browse files
- .gitignore +127 -0
- Dockerfile +17 -0
- LICENSE +21 -0
- README.md +209 -11
- TensorRT_whisper.md +47 -0
- app.py +23 -0
- docker/Dockerfile.cpu +25 -0
- docker/Dockerfile.gpu +26 -0
- docker/Dockerfile.openvino +19 -0
- docker/Dockerfile.tensorrt +30 -0
- requirements.txt +7 -0
- requirements/client.txt +4 -0
- requirements/server.txt +21 -0
- run_server.py +54 -0
- scripts/build_whisper_tensorrt.sh +120 -0
- scripts/setup.sh +3 -0
- setup.py +67 -0
- tests/__init__.py +0 -0
- tests/test_client.py +162 -0
- tests/test_server.py +148 -0
- tests/test_vad.py +26 -0
- whisper_live/__init__.py +0 -0
- whisper_live/__version__.py +1 -0
- whisper_live/backend/__init__.py +0 -0
- whisper_live/backend/base.py +361 -0
- whisper_live/backend/faster_whisper_backend.py +216 -0
- whisper_live/backend/openvino_backend.py +148 -0
- whisper_live/backend/trt_backend.py +210 -0
- whisper_live/client.py +782 -0
- whisper_live/server.py +446 -0
- whisper_live/transcriber/__init__.py +0 -0
- whisper_live/transcriber/tensorrt_utils.py +364 -0
- whisper_live/transcriber/transcriber_faster_whisper.py +1889 -0
- whisper_live/transcriber/transcriber_openvino.py +23 -0
- whisper_live/transcriber/transcriber_tensorrt.py +479 -0
- whisper_live/utils.py +82 -0
- whisper_live/vad.py +157 -0
.gitignore
ADDED
@@ -0,0 +1,127 @@
+# Editors
+.vscode/
+.idea/
+
+# Vagrant
+.vagrant/
+
+# Mac/OSX
+.DS_Store
+
+# Windows
+Thumbs.db
+
+# Source for the following rules: https://raw.githubusercontent.com/github/gitignore/master/Python.gitignore
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+docs/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
Dockerfile
ADDED
@@ -0,0 +1,17 @@
+FROM python:3.10-slim
+
+WORKDIR /app
+
+RUN apt-get update && \
+    apt-get install -y portaudio19-dev python3-dev gcc && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . .
+
+EXPOSE 7860
+
+CMD ["python", "app.py"]
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Vineet Suryan, Collabora Ltd.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
CHANGED
@@ -1,11 +1,209 @@
-
-
-
-
-
-
-
-
-
-
-
+# WhisperLive
+
+<h2 align="center">
+  <a href="https://www.youtube.com/watch?v=0PHWCApIcCI"><img
+src="https://img.youtube.com/vi/0PHWCApIcCI/0.jpg" style="background-color:rgba(0,0,0,0);" height=300 alt="WhisperLive"></a>
+  <br><br>A nearly-live implementation of OpenAI's Whisper.
+  <br><br>
+</h2>
+
+This project is a real-time transcription application that uses the OpenAI Whisper model
+to convert speech input into text output. It can be used to transcribe both live audio
+input from a microphone and pre-recorded audio files.
+
+- [Installation](#installation)
+- [Getting Started](#getting-started)
+- [Running the Server](#running-the-server)
+- [Running the Client](#running-the-client)
+- [Browser Extensions](#browser-extensions)
+- [Whisper Live Server in Docker](#whisper-live-server-in-docker)
+- [Future Work](#future-work)
+- [Contact](#contact)
+- [Citations](#citations)
+
+## Installation
+- Install PyAudio
+```bash
+bash scripts/setup.sh
+```
+
+- Install whisper-live from pip
+```bash
+pip install whisper-live
+```
+
+### Setting up NVIDIA/TensorRT-LLM for TensorRT backend
+- Please follow the [TensorRT_whisper readme](https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md) to set up [NVIDIA/TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) and build the Whisper-TensorRT engine.
+
+## Getting Started
+The server supports three backends: `faster_whisper`, `tensorrt` and `openvino`. If running the `tensorrt` backend, follow the [TensorRT_whisper readme](https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md).
+
+### Running the Server
+- [Faster Whisper](https://github.com/SYSTRAN/faster-whisper) backend
+```bash
+python3 run_server.py --port 9090 \
+                      --backend faster_whisper
+
+# running with custom model
+python3 run_server.py --port 9090 \
+                      --backend faster_whisper \
+                      -fw "/path/to/custom/faster/whisper/model"
+```
+
+- TensorRT backend. Currently, we recommend using only the Docker setup for TensorRT. Follow the [TensorRT_whisper readme](https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md), which works as expected. Make sure to build your TensorRT engines before running the server with the TensorRT backend.
+```bash
+# Run English only model
+python3 run_server.py -p 9090 \
+                      -b tensorrt \
+                      -trt /home/TensorRT-LLM/examples/whisper/whisper_small_en
+
+# Run Multilingual model
+python3 run_server.py -p 9090 \
+                      -b tensorrt \
+                      -trt /home/TensorRT-LLM/examples/whisper/whisper_small \
+                      -m
+```
+
+- WhisperLive now supports the [OpenVINO](https://github.com/openvinotoolkit/openvino) backend for efficient inference on Intel CPUs, iGPUs and dGPUs. So far, we have tested the models uploaded to [Hugging Face by OpenVINO](https://huggingface.co/OpenVINO?search_models=whisper).
+  - > **Docker Recommended:** Running WhisperLive with OpenVINO inside Docker automatically enables GPU support (iGPU/dGPU) without requiring additional host setup.
+  - > **Native (non-Docker) Use:** If you prefer running outside Docker, ensure the Intel drivers and OpenVINO runtime are installed and properly configured on your system. Refer to the documentation for [installing OpenVINO](https://docs.openvino.ai/2025/get-started/install-openvino.html?PACKAGE=OPENVINO_BASE&VERSION=v_2025_0_0&OP_SYSTEM=LINUX&DISTRIBUTION=PIP#).
+
+```bash
+python3 run_server.py -p 9090 -b openvino
+```
+
+#### Controlling OpenMP Threads
+To control the number of threads used by OpenMP, you can set the `OMP_NUM_THREADS` environment variable. This is useful for managing CPU resources and ensuring consistent performance. If not specified, `OMP_NUM_THREADS` is set to `1` by default. You can change this by using the `--omp_num_threads` argument:
+```bash
+python3 run_server.py --port 9090 \
+                      --backend faster_whisper \
+                      --omp_num_threads 4
+```
+
+#### Single model mode
+By default, when running the server without specifying a model, the server will instantiate a new whisper model for every client connection. This has the advantage that the server can use different model sizes based on the client's requested model size. On the other hand, it also means you have to wait for the model to be loaded upon client connection, and (V)RAM usage increases.
+
+When serving a custom TensorRT model using the `-trt` option or a custom faster_whisper model using the `-fw` option, the server will instead only instantiate the custom model once and then reuse it for all client connections.
+
+If you don't want this, set `--no_single_model`.
+
+### Running the Client
+- Initialize the client with the parameters below:
+  - `lang`: Language of the input audio, applicable only if using a multilingual model.
+  - `translate`: If set to `True`, translate from any language to `en`.
+  - `model`: Whisper model size.
+  - `use_vad`: Whether to use `Voice Activity Detection` on the server.
+  - `save_output_recording`: Set to `True` to save the microphone input as a `.wav` file during live transcription. This option is helpful for recording sessions for later playback or analysis. Defaults to `False`.
+  - `output_recording_filename`: Specifies the `.wav` file path where the microphone input will be saved if `save_output_recording` is set to `True`.
+  - `max_clients`: Specifies the maximum number of clients the server should allow. Defaults to 4.
+  - `max_connection_time`: Maximum connection time for each client in seconds. Defaults to 600.
+  - `mute_audio_playback`: Whether to mute audio playback when transcribing an audio file. Defaults to `False`.
+
+```python
+from whisper_live.client import TranscriptionClient
+client = TranscriptionClient(
+  "localhost",
+  9090,
+  lang="en",
+  translate=False,
+  model="small",                                      # also supports hf_model => `Systran/faster-whisper-small`
+  use_vad=False,
+  save_output_recording=True,                         # Only used for microphone input, False by Default
+  output_recording_filename="./output_recording.wav", # Only used for microphone input
+  max_clients=4,
+  max_connection_time=600,
+  mute_audio_playback=False,                          # Only used for file input, False by Default
+)
+```
+It connects to the server running on localhost at port 9090. When using a multilingual model, the language for the transcription will be detected automatically. You can also use the language option to specify the target language for the transcription, in this case English ("en"). The translate option should be set to `True` if we want to translate from the source language to English, and `False` if we want to transcribe in the source language.
+
+- Transcribe an audio file:
+```python
+client("tests/jfk.wav")
+```
+
+- To transcribe from microphone:
+```python
+client()
+```
+
+- To transcribe from an RTSP stream:
+```python
+client(rtsp_url="rtsp://admin:[email protected]/rtsp")
+```
+
+- To transcribe from an HLS stream:
+```python
+client(hls_url="http://as-hls-ww-live.akamaized.net/pool_904/live/ww/bbc_1xtra/bbc_1xtra.isml/bbc_1xtra-audio%3d96000.norewind.m3u8")
+```
+
+## Browser Extensions
+- Run the server with your desired backend as shown [here](https://github.com/collabora/WhisperLive?tab=readme-ov-file#running-the-server).
+- Transcribe audio directly from your browser using our Chrome or Firefox extensions. Refer to [Audio-Transcription-Chrome](https://github.com/collabora/whisper-live/tree/main/Audio-Transcription-Chrome#readme) and https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md
+
+## Whisper Live Server in Docker
+- GPU
+  - Faster-Whisper
+  ```bash
+  docker run -it --gpus all -p 9090:9090 ghcr.io/collabora/whisperlive-gpu:latest
+  ```
+
+  - TensorRT. Refer to the [TensorRT_whisper readme](https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md) for setup and more TensorRT backend configurations.
+  ```bash
+  docker build . -f docker/Dockerfile.tensorrt -t whisperlive-tensorrt
+  docker run -p 9090:9090 --runtime=nvidia --gpus all --entrypoint /bin/bash -it whisperlive-tensorrt
+
+  # Build small.en engine
+  bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en      # float16
+  bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int8 # int8 weight only quantization
+  bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int4 # int4 weight only quantization
+
+  # Run server with small.en
+  python3 run_server.py --port 9090 \
+                        --backend tensorrt \
+                        --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_float16"
+                        --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_int8"
+                        --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_int4"
+  ```
+
+  - OpenVINO
+  ```bash
+  docker run -it --device=/dev/dri -p 9090:9090 ghcr.io/collabora/whisperlive-openvino
+  ```
+
+- CPU
+  - Faster-whisper
+  ```bash
+  docker run -it -p 9090:9090 ghcr.io/collabora/whisperlive-cpu:latest
+  ```
+
+## Future Work
+- [ ] Add translation to other languages on top of transcription.
+
+## Contact
+
+We are available to help you with both Open Source and proprietary AI projects. You can reach us via the Collabora website or [[email protected]](mailto:[email protected]) and [[email protected]](mailto:[email protected]).
+
+## Citations
+```bibtex
+@article{Whisper,
+  title = {Robust Speech Recognition via Large-Scale Weak Supervision},
+  url = {https://arxiv.org/abs/2212.04356},
+  author = {Radford, Alec and Kim, Jong Wook and Xu, Tao and Brockman, Greg and McLeavey, Christine and Sutskever, Ilya},
+  publisher = {arXiv},
+  year = {2022},
+}
+```
+
+```bibtex
+@misc{Silero VAD,
+  author = {Silero Team},
+  title = {Silero VAD: pre-trained enterprise-grade Voice Activity Detector (VAD), Number Detector and Language Classifier},
+  year = {2021},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  howpublished = {\url{https://github.com/snakers4/silero-vad}},
+  email = {[email protected]}
+}
+```
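For reference, a minimal end-to-end smoke test of the server/client workflow the README describes. This is an illustrative sketch only, not part of the committed code: it assumes the package layout added in this commit, the `faster_whisper` backend, and the sample audio path used in the README example.

```python
# Minimal local smoke test (illustrative sketch, not part of this commit):
# start the WhisperLive server in a background thread, then stream a file.
import threading

from whisper_live.server import TranscriptionServer
from whisper_live.client import TranscriptionClient

server = TranscriptionServer()
threading.Thread(
    target=server.run,
    kwargs={"host": "0.0.0.0", "port": 9090, "backend": "faster_whisper"},
    daemon=True,
).start()

client = TranscriptionClient("localhost", 9090, lang="en", model="small", use_vad=True)
client("tests/jfk.wav")  # audio path taken from the README example
```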
TensorRT_whisper.md
ADDED
@@ -0,0 +1,47 @@
+# WhisperLive-TensorRT
+We have only tested the TensorRT backend in Docker, so we recommend Docker for a smooth TensorRT backend setup.
+**Note**: We use `tensorrt_llm==0.18.2`
+
+## Installation
+- Install [docker](https://docs.docker.com/engine/install/)
+- Install [nvidia-container-toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
+
+- Run WhisperLive TensorRT in docker
+```bash
+docker build . -f docker/Dockerfile.tensorrt -t whisperlive-tensorrt
+docker run -p 9090:9090 --runtime=nvidia --gpus all --entrypoint /bin/bash -it whisperlive-tensorrt
+```
+
+## Whisper TensorRT Engine
+- We build `small.en` and `small` multilingual TensorRT engines as examples below. The script logs the path of the directory containing the Whisper TensorRT engine. We need that model path to run the server.
+```bash
+# convert small.en
+bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en      # float16
+bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int8 # int8 weight only quantization
+bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int4 # int4 weight only quantization
+
+# convert small multilingual model
+bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small
+```
+
+## Run WhisperLive Server with TensorRT Backend
+```bash
+# Run English only model
+python3 run_server.py --port 9090 \
+                      --backend tensorrt \
+                      --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_float16"
+
+# Run Multilingual model
+python3 run_server.py --port 9090 \
+                      --backend tensorrt \
+                      --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_float16" \
+                      --trt_multilingual
+```
+
+By default the TensorRT backend uses the C++ session; to use the Python session, pass `--trt_py_session` to run_server.py:
+```bash
+python3 run_server.py --port 9090 \
+                      --backend tensorrt \
+                      --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_float16" \
+                      --trt_py_session
+```
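The build-then-serve flow above can also be scripted; a rough sketch, assuming it runs inside the `whisperlive-tensorrt` container with the same paths as the examples above:

```python
# Illustrative sketch only: drive the engine build and server start from Python
# inside the whisperlive-tensorrt container. Paths mirror the examples above.
import subprocess

examples_dir = "/app/TensorRT-LLM-examples"

# Build the small.en float16 engine (wraps scripts/build_whisper_tensorrt.sh).
subprocess.run(
    ["bash", "build_whisper_tensorrt.sh", examples_dir, "small.en"],
    check=True,
)

# Serve the freshly built engine with the TensorRT backend.
subprocess.run(
    [
        "python3", "run_server.py",
        "--port", "9090",
        "--backend", "tensorrt",
        "--trt_model_path", f"{examples_dir}/whisper/whisper_small_en_float16",
    ],
    check=True,
)
```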
app.py
ADDED
@@ -0,0 +1,23 @@
+from fastapi import FastAPI
+import uvicorn
+from whisper_live.server import TranscriptionServer
+
+app = FastAPI(title="Whisper Live Server")
+
+@app.on_event("startup")
+async def startup_event():
+    # Start the transcription server in the background
+    server = TranscriptionServer()
+    server.run(
+        host="0.0.0.0",
+        port=7860,  # Hugging Face Spaces uses port 7860
+        backend="faster_whisper",  # Using faster_whisper as the backend
+        single_model=True  # Use single model mode for better resource usage
+    )
+
+@app.get("/health")
+def health_check():
+    return {"status": "healthy"}
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=7860)
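As a quick way to verify the Space wrapper above is up, one can poll the `/health` route it defines. A small sketch using only the standard library, assuming the app is listening on localhost:7860 as configured:

```python
# Illustrative health probe for the FastAPI wrapper in app.py.
# Assumes the app is running locally on port 7860.
import json
from urllib.request import urlopen

with urlopen("http://localhost:7860/health", timeout=5) as resp:
    payload = json.loads(resp.read().decode("utf-8"))

print(payload)  # expected: {'status': 'healthy'}
```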
docker/Dockerfile.cpu
ADDED
@@ -0,0 +1,25 @@
+FROM python:3.10-bookworm
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# install lib required for pyaudio
+RUN apt update && apt install -y portaudio19-dev && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# update pip to support for whl.metadata -> less downloading
+RUN pip install --no-cache-dir -U "pip>=24"
+
+# create a working directory
+RUN mkdir /app
+WORKDIR /app
+
+# install pytorch, but without the nvidia-libs that are only necessary for gpu
+RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu
+
+# install the requirements for running the whisper-live server
+COPY requirements/server.txt /app/
+RUN pip install --no-cache-dir -r server.txt && rm server.txt
+
+COPY whisper_live /app/whisper_live
+COPY run_server.py /app
+
+CMD ["python", "run_server.py"]
docker/Dockerfile.gpu
ADDED
@@ -0,0 +1,26 @@
+FROM python:3.10-bookworm
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# install lib required for pyaudio
+RUN apt update && apt install -y portaudio19-dev && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# update pip to support for whl.metadata -> less downloading
+RUN pip install --no-cache-dir -U "pip>=24"
+
+# create a working directory
+RUN mkdir /app
+WORKDIR /app
+
+# install the requirements for running the whisper-live server
+COPY requirements/server.txt /app/
+RUN pip install --no-cache-dir -r server.txt && rm server.txt
+
+# make the paths of the nvidia libs installed as wheels visible. equivalent to:
+# export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'`
+ENV LD_LIBRARY_PATH="/usr/local/lib/python3.10/site-packages/nvidia/cublas/lib:/usr/local/lib/python3.10/site-packages/nvidia/cudnn/lib"
+
+COPY whisper_live /app/whisper_live
+COPY run_server.py /app
+
+CMD ["python", "run_server.py"]
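The hard-coded `LD_LIBRARY_PATH` above corresponds to the Python one-liner quoted in the Dockerfile comment; expanded for readability below. It only works in an environment where the cuBLAS/cuDNN pip wheels are installed.

```python
# Sketch of the path computation referenced in the Dockerfile.gpu comment:
# locate the cuBLAS/cuDNN libraries that ship as pip wheels so LD_LIBRARY_PATH
# can point at them.
import os
import nvidia.cublas.lib
import nvidia.cudnn.lib

ld_library_path = ":".join([
    os.path.dirname(nvidia.cublas.lib.__file__),
    os.path.dirname(nvidia.cudnn.lib.__file__),
])
print(ld_library_path)
```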
docker/Dockerfile.openvino
ADDED
@@ -0,0 +1,19 @@
+FROM openvino/ubuntu22_runtime:latest
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+USER root
+
+RUN apt update && apt install -y portaudio19-dev python-is-python3 && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+RUN pip install --no-cache-dir -U "pip>=24"
+
+RUN mkdir /app
+WORKDIR /app
+
+COPY requirements/server.txt /app/
+RUN pip install --no-cache-dir -r server.txt && rm server.txt
+
+COPY whisper_live /app/whisper_live
+COPY run_server.py /app
+CMD ["python", "run_server.py", "--backend", "openvino"]
docker/Dockerfile.tensorrt
ADDED
@@ -0,0 +1,30 @@
+FROM nvidia/cuda:12.8.1-base-ubuntu22.04 AS base
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && apt-get install -y \
+    python3.10 python3-pip openmpi-bin libopenmpi-dev git git-lfs wget \
+    && apt install python-is-python3 \
+    && pip install --upgrade pip setuptools \
+    && rm -rf /var/lib/apt/lists/*
+
+FROM base AS devel
+RUN pip install --no-cache-dir -U tensorrt_llm==0.18.2 --extra-index-url https://pypi.nvidia.com
+WORKDIR /app
+RUN git clone -b v0.18.2 https://github.com/NVIDIA/TensorRT-LLM.git \
+    && mv TensorRT-LLM/examples ./TensorRT-LLM-examples \
+    && rm -rf TensorRT-LLM
+
+FROM devel AS release
+WORKDIR /app
+COPY assets/ ./assets
+RUN wget -nc -P assets/ https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz
+
+COPY scripts/setup.sh ./
+RUN apt update && bash setup.sh && rm setup.sh
+
+COPY requirements/server.txt .
+RUN pip install --no-cache-dir -r server.txt && rm server.txt
+COPY whisper_live ./whisper_live
+COPY scripts/build_whisper_tensorrt.sh .
+COPY run_server.py .
requirements.txt
ADDED
@@ -0,0 +1,7 @@
+faster-whisper
+numpy
+websockets
+pyaudio
+soundfile
+torch
+torchaudio
requirements/client.txt
ADDED
@@ -0,0 +1,4 @@
+PyAudio
+av
+scipy
+websocket-client
requirements/server.txt
ADDED
@@ -0,0 +1,21 @@
+faster-whisper==1.1.0
+websockets
+onnxruntime==1.17.0
+numba
+kaldialign
+soundfile
+scipy
+av
+jiwer
+evaluate
+numpy<2
+openai-whisper==20240930
+tokenizers==0.20.3
+
+# openvino
+librosa
+openvino
+openvino-genai
+openvino-tokenizers
+optimum
+optimum-intel
run_server.py
ADDED
@@ -0,0 +1,54 @@
+import argparse
+import os
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--port', '-p',
+                        type=int,
+                        default=9090,
+                        help="Websocket port to run the server on.")
+    parser.add_argument('--backend', '-b',
+                        type=str,
+                        default='faster_whisper',
+                        help='Backends from ["tensorrt", "faster_whisper", "openvino"]')
+    parser.add_argument('--faster_whisper_custom_model_path', '-fw',
+                        type=str, default=None,
+                        help="Custom Faster Whisper Model")
+    parser.add_argument('--trt_model_path', '-trt',
+                        type=str,
+                        default=None,
+                        help='Whisper TensorRT model path')
+    parser.add_argument('--trt_multilingual', '-m',
+                        action="store_true",
+                        help='Boolean only for TensorRT model. True if multilingual.')
+    parser.add_argument('--trt_py_session',
+                        action="store_true",
+                        help='Boolean only for TensorRT model. Use python session or cpp session, By default uses Cpp.')
+    parser.add_argument('--omp_num_threads', '-omp',
+                        type=int,
+                        default=1,
+                        help="Number of threads to use for OpenMP")
+    parser.add_argument('--no_single_model', '-nsm',
+                        action='store_true',
+                        help='Set this if every connection should instantiate its own model. Only relevant for custom model, passed using -trt or -fw.')
+    args = parser.parse_args()
+
+    if args.backend == "tensorrt":
+        if args.trt_model_path is None:
+            raise ValueError("Please Provide a valid tensorrt model path")
+
+    if "OMP_NUM_THREADS" not in os.environ:
+        os.environ["OMP_NUM_THREADS"] = str(args.omp_num_threads)
+
+    from whisper_live.server import TranscriptionServer
+    server = TranscriptionServer()
+    server.run(
+        "0.0.0.0",
+        port=args.port,
+        backend=args.backend,
+        faster_whisper_custom_model_path=args.faster_whisper_custom_model_path,
+        whisper_tensorrt_path=args.trt_model_path,
+        trt_multilingual=args.trt_multilingual,
+        trt_py_session=args.trt_py_session,
+        single_model=not args.no_single_model,
+    )
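Note that run_server.py sets `OMP_NUM_THREADS` before importing `whisper_live.server`; the environment variable is read by OpenMP-backed libraries when their runtime initializes, so it has to be in place before those imports happen. A minimal sketch of the same pattern when embedding the server yourself (the thread count here is an arbitrary example value):

```python
# Illustrative sketch: set the OpenMP thread count before the server import,
# mirroring the ordering used by run_server.py above.
import os

os.environ.setdefault("OMP_NUM_THREADS", "4")  # must happen before the import below

from whisper_live.server import TranscriptionServer

server = TranscriptionServer()
server.run("0.0.0.0", port=9090, backend="faster_whisper")
```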
scripts/build_whisper_tensorrt.sh
ADDED
@@ -0,0 +1,120 @@
+#!/bin/bash
+
+download_and_build_model() {
+    local model_name="$1"
+    local model_url=""
+
+    case "$model_name" in
+        "tiny.en")
+            model_url="https://openaipublic.azureedge.net/main/whisper/models/d3dd57d32accea0b295c96e26691aa14d8822fac7d9d27d5dc00b4ca2826dd03/tiny.en.pt"
+            ;;
+        "tiny")
+            model_url="https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt"
+            ;;
+        "base.en")
+            model_url="https://openaipublic.azureedge.net/main/whisper/models/25a8566e1d0c1e2231d1c762132cd20e0f96a85d16145c3a00adf5d1ac670ead/base.en.pt"
+            ;;
+        "base")
+            model_url="https://openaipublic.azureedge.net/main/whisper/models/ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e/base.pt"
+            ;;
+        "small.en")
+            model_url="https://openaipublic.azureedge.net/main/whisper/models/f953ad0fd29cacd07d5a9eda5624af0f6bcf2258be67c92b79389873d91e0872/small.en.pt"
+            ;;
+        "small")
+            model_url="https://openaipublic.azureedge.net/main/whisper/models/9ecf779972d90ba49c06d968637d720dd632c55bbf19d441fb42bf17a411e794/small.pt"
+            ;;
+        "medium.en")
+            model_url="https://openaipublic.azureedge.net/main/whisper/models/d7440d1dc186f76616474e0ff0b3b6b879abc9d1a4926b7adfa41db2d497ab4f/medium.en.pt"
+            ;;
+        "medium")
+            model_url="https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt"
+            ;;
+        "large-v1")
+            model_url="https://openaipublic.azureedge.net/main/whisper/models/e4b87e7e0bf463eb8e6956e646f1e277e901512310def2c24bf0e11bd3c28e9a/large-v1.pt"
+            ;;
+        "large-v2")
+            model_url="https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt"
+            ;;
+        "large-v3" | "large")
+            model_url="https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt"
+            ;;
+        "large-v3-turbo" | "turbo")
+            model_url="https://openaipublic.azureedge.net/main/whisper/models/aff26ae408abcba5fbf8813c21e62b0941638c5f6eebfb145be0c9839262a19a/large-v3-turbo.pt"
+            ;;
+        *)
+            echo "Invalid model name: $model_name"
+            exit 1
+            ;;
+    esac
+
+    if [ "$model_name" == "turbo" ]; then
+        model_name="large-v3-turbo"
+    fi
+
+    local inference_precision="float16"
+    local weight_only_precision="${2:-float16}"
+    local max_beam_width=4
+    local max_batch_size=4
+
+    echo "Downloading $model_name..."
+    # wget --directory-prefix=assets "$model_url"
+    # echo "Download completed: ${model_name}.pt"
+    if [ ! -f "assets/${model_name}.pt" ]; then
+        wget --directory-prefix=assets "$model_url"
+        echo "Download completed: ${model_name}.pt"
+    else
+        echo "${model_name}.pt already exists in assets directory."
+    fi
+
+    local sanitized_model_name="${model_name//./_}"
+    local checkpoint_dir="whisper_${sanitized_model_name}_weights_${weight_only_precision}"
+    local output_dir="whisper_${sanitized_model_name}_${weight_only_precision}"
+    echo "$output_dir"
+    echo "Converting model weights for $model_name..."
+    python3 convert_checkpoint.py \
+        $( [[ "$weight_only_precision" == "int8" || "$weight_only_precision" == "int4" ]] && echo "--use_weight_only --weight_only_precision $weight_only_precision" ) \
+        --output_dir "$checkpoint_dir" --model_name "$model_name"
+
+    echo "Building encoder for $model_name..."
+    trtllm-build \
+        --checkpoint_dir "${checkpoint_dir}/encoder" \
+        --output_dir "${output_dir}/encoder" \
+        --moe_plugin disable \
+        --max_batch_size "$max_batch_size" \
+        --gemm_plugin disable \
+        --bert_attention_plugin "$inference_precision" \
+        --max_input_len 3000 \
+        --max_seq_len 3000
+
+    echo "Building decoder for $model_name..."
+    trtllm-build \
+        --checkpoint_dir "${checkpoint_dir}/decoder" \
+        --output_dir "${output_dir}/decoder" \
+        --moe_plugin disable \
+        --max_beam_width "$max_beam_width" \
+        --max_batch_size "$max_batch_size" \
+        --max_seq_len 225 \
+        --max_input_len 32 \
+        --max_encoder_input_len 3000 \
+        --gemm_plugin "$inference_precision" \
+        --bert_attention_plugin "$inference_precision" \
+        --gpt_attention_plugin "$inference_precision"
+
+    echo "TensorRT LLM engine built for $model_name."
+    echo "========================================="
+    echo "Model is located at: $(pwd)/$output_dir"
+}
+
+if [ "$#" -lt 1 ]; then
+    echo "Usage: $0 <path-to-tensorrt-examples-dir> [model-name]"
+    exit 1
+fi
+
+tensorrt_examples_dir="$1"
+model_name="${2:-small.en}"
+weight_only_precision="${3:-float16}" # Default to float16 if not provided
+
+cd $tensorrt_examples_dir/whisper
+pip install --no-deps -r requirements.txt
+
+download_and_build_model "$model_name" "$weight_only_precision"
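The script above derives the engine output directory from the model name and precision (dots replaced with underscores). A small hypothetical helper mirroring that naming can be handy for locating the engine before passing it to `--trt_model_path`:

```python
# Hypothetical helper mirroring the directory naming used by
# scripts/build_whisper_tensorrt.sh (not part of the committed code).
def tensorrt_engine_dir(model_name: str, weight_only_precision: str = "float16") -> str:
    if model_name == "turbo":
        model_name = "large-v3-turbo"
    sanitized = model_name.replace(".", "_")
    return f"whisper_{sanitized}_{weight_only_precision}"

# e.g. "small.en" with int8 quantization lands in "whisper_small_en_int8",
# relative to <tensorrt-examples-dir>/whisper where the script runs.
print(tensorrt_engine_dir("small.en", "int8"))
```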
scripts/setup.sh
ADDED
@@ -0,0 +1,3 @@
+#! /bin/bash
+
+apt-get install portaudio19-dev wget -y
setup.py
ADDED
@@ -0,0 +1,67 @@
+import pathlib
+from setuptools import find_packages, setup
+from whisper_live.__version__ import __version__
+
+
+# The directory containing this file
+HERE = pathlib.Path(__file__).parent
+
+# The text of the README file
+README = (HERE / "README.md").read_text()
+
+# This call to setup() does all the work
+setup(
+    name="whisper_live",
+    version=__version__,
+    description="A nearly-live implementation of OpenAI's Whisper.",
+    long_description=README,
+    long_description_content_type="text/markdown",
+    include_package_data=True,
+    url="https://github.com/collabora/WhisperLive",
+    author="Collabora Ltd",
+    author_email="[email protected]",
+    license="MIT",
+    classifiers=[
+        "Development Status :: 4 - Beta",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: MIT License",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3 :: Only",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    ],
+    packages=find_packages(
+        exclude=(
+            "examples",
+            "Audio-Transcription-Chrome",
+            "Audio-Transcription-Firefox",
+            "requirements",
+            "whisper-finetuning"
+        )
+    ),
+    install_requires=[
+        "PyAudio",
+        "faster-whisper==1.1.0",
+        "torch",
+        "torchaudio",
+        "websockets",
+        "onnxruntime==1.17.0",
+        "scipy",
+        "websocket-client",
+        "numba",
+        "openai-whisper==20240930",
+        "kaldialign",
+        "soundfile",
+        "tokenizers==0.20.3",
+        "librosa",
+        "numpy==1.26.4",
+        "openvino",
+        "openvino-genai",
+        "openvino-tokenizers",
+        "optimum",
+        "optimum-intel",
+    ],
+    python_requires=">=3.9"
+)
tests/__init__.py
ADDED
File without changes
tests/test_client.py
ADDED
@@ -0,0 +1,162 @@
+import json
+import os
+import scipy
+import websocket
+import copy
+import unittest
+from unittest.mock import patch, MagicMock
+from whisper_live.client import Client, TranscriptionClient, TranscriptionTeeClient
+from whisper_live.utils import resample
+from pathlib import Path
+
+
+class BaseTestCase(unittest.TestCase):
+    @patch('whisper_live.client.websocket.WebSocketApp')
+    @patch('whisper_live.client.pyaudio.PyAudio')
+    def setUp(self, mock_pyaudio, mock_websocket):
+        self.mock_pyaudio_instance = MagicMock()
+        mock_pyaudio.return_value = self.mock_pyaudio_instance
+        self.mock_stream = MagicMock()
+        self.mock_pyaudio_instance.open.return_value = self.mock_stream
+
+        self.mock_ws_app = mock_websocket.return_value
+        self.mock_ws_app.send = MagicMock()
+
+        self.client = TranscriptionClient(host='localhost', port=9090, lang="en").client
+
+        self.mock_pyaudio = mock_pyaudio
+        self.mock_websocket = mock_websocket
+        self.mock_audio_packet = b'\x00\x01\x02\x03'
+
+    def tearDown(self):
+        self.client.close_websocket()
+        self.mock_pyaudio.stop()
+        self.mock_websocket.stop()
+        del self.client
+
+class TestClientWebSocketCommunication(BaseTestCase):
+    def test_websocket_communication(self):
+        expected_url = 'ws://localhost:9090'
+        self.mock_websocket.assert_called()
+        self.assertEqual(self.mock_websocket.call_args[0][0], expected_url)
+
+
+class TestClientCallbacks(BaseTestCase):
+    def test_on_open(self):
+        expected_message = json.dumps({
+            "uid": self.client.uid,
+            "language": self.client.language,
+            "task": self.client.task,
+            "model": self.client.model,
+            "use_vad": True,
+            "max_clients": 4,
+            "max_connection_time": 600,
+            "send_last_n_segments": 10,
+            "no_speech_thresh": 0.45,
+            "clip_audio": False,
+            "same_output_threshold": 10,
+        })
+        self.client.on_open(self.mock_ws_app)
+        self.mock_ws_app.send.assert_called_with(expected_message)
+
+    def test_on_message(self):
+        message = json.dumps(
+            {
+                "uid": self.client.uid,
+                "message": "SERVER_READY",
+                "backend": "faster_whisper"
+            }
+        )
+        self.client.on_message(self.mock_ws_app, message)
+
+        message = json.dumps({
+            "uid": self.client.uid,
+            "segments": [
+                {"start": 0, "end": 1, "text": "Test transcript", "completed": True},
+                {"start": 1, "end": 2, "text": "Test transcript 2", "completed": True},
+                {"start": 2, "end": 3, "text": "Test transcript 3", "completed": True}
+            ]
+        })
+        self.client.on_message(self.mock_ws_app, message)
+
+        # Assert that the transcript was updated correctly
+        self.assertEqual(len(self.client.transcript), 3)
+        self.assertEqual(self.client.transcript[1]['text'], "Test transcript 2")
+
+    def test_on_close(self):
+        close_status_code = 1000
+        close_msg = "Normal closure"
+        self.client.on_close(self.mock_ws_app, close_status_code, close_msg)
+
+        self.assertFalse(self.client.recording)
+        self.assertFalse(self.client.server_error)
+        self.assertFalse(self.client.waiting)
+
+    def test_on_error(self):
+        error_message = "Test Error"
+        self.client.on_error(self.mock_ws_app, error_message)
+
+        self.assertTrue(self.client.server_error)
+        self.assertEqual(self.client.error_message, error_message)
+
+
+class TestAudioResampling(unittest.TestCase):
+    def test_resample_audio(self):
+        original_audio = "assets/jfk.flac"
+        expected_sr = 16000
+        resampled_audio = resample(original_audio, expected_sr)
+
+        sr, _ = scipy.io.wavfile.read(resampled_audio)
+        self.assertEqual(sr, expected_sr)
+
+        os.remove(resampled_audio)
+
+
+class TestSendingAudioPacket(BaseTestCase):
+    def test_send_packet(self):
+        self.client.send_packet_to_server(self.mock_audio_packet)
+        self.client.client_socket.send.assert_called_with(self.mock_audio_packet, websocket.ABNF.OPCODE_BINARY)
+
+class TestTee(BaseTestCase):
+    @patch('whisper_live.client.websocket.WebSocketApp')
+    @patch('whisper_live.client.pyaudio.PyAudio')
+    def setUp(self, mock_audio, mock_websocket):
+        super().setUp()
+        self.client2 = Client(host='localhost', port=9090, lang="es", translate=False, srt_file_path="transcript.srt")
+        self.client3 = Client(host='localhost', port=9090, lang="es", translate=True, srt_file_path="translation.srt")
+        # need a separate mock for each websocket
+        self.client3.client_socket = copy.deepcopy(self.client3.client_socket)
+        self.tee = TranscriptionTeeClient([self.client2, self.client3])
+
+    def tearDown(self):
+        self.tee.close_all_clients()
+        del self.tee
+        super().tearDown()
+
+    def test_invalid_constructor(self):
+        with self.assertRaises(Exception) as context:
+            TranscriptionTeeClient([])
+
+    def test_multicast_unconditional(self):
+        self.tee.multicast_packet(self.mock_audio_packet, True)
+        for client in self.tee.clients:
+            client.client_socket.send.assert_called_with(self.mock_audio_packet, websocket.ABNF.OPCODE_BINARY)
+
+    def test_multicast_conditional(self):
+        self.client2.recording = False
+        self.client3.recording = True
+        self.tee.multicast_packet(self.mock_audio_packet, False)
+        self.client2.client_socket.send.assert_not_called()
+        self.client3.client_socket.send.assert_called_with(self.mock_audio_packet, websocket.ABNF.OPCODE_BINARY)
+
+    def test_close_all(self):
+        self.tee.close_all_clients()
+        for client in self.tee.clients:
+            client.client_socket.close.assert_called()
+
+    def test_write_all_srt(self):
+        for client in self.tee.clients:
+            client.server_backend = "faster_whisper"
+        self.tee.write_all_clients_srt()
+        self.assertTrue(Path("transcript.srt").is_file())
+        self.assertTrue(Path("translation.srt").is_file())
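The `test_on_open` case above documents the JSON options message the client sends when its WebSocket opens. A hedged sketch of that handshake with a raw `websocket-client` connection; the field values are the defaults exercised by the test, and the `uid` is made up:

```python
# Illustrative sketch of the client's opening handshake, based on the
# expected_message asserted in test_on_open; not part of the committed code.
import json
import uuid

from websocket import create_connection  # provided by the websocket-client package

ws = create_connection("ws://localhost:9090")
ws.send(json.dumps({
    "uid": str(uuid.uuid4()),  # any unique client id
    "language": "en",
    "task": "transcribe",
    "model": "small",
    "use_vad": True,
    "max_clients": 4,
    "max_connection_time": 600,
    "send_last_n_segments": 10,
    "no_speech_thresh": 0.45,
    "clip_audio": False,
    "same_output_threshold": 10,
}))
print(ws.recv())  # the server is expected to answer with a SERVER_READY message
ws.close()
```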
tests/test_server.py
ADDED
@@ -0,0 +1,148 @@
+import subprocess
+import time
+import json
+import unittest
+from unittest import mock
+
+import numpy as np
+import jiwer
+
+from websockets.exceptions import ConnectionClosed
+from whisper_live.server import TranscriptionServer, BackendType, ClientManager
+from whisper_live.client import Client, TranscriptionClient, TranscriptionTeeClient
+from whisper.normalizers import EnglishTextNormalizer
+
+
+class TestTranscriptionServerInitialization(unittest.TestCase):
+    def test_initialization(self):
+        server = TranscriptionServer()
+        server.client_manager = ClientManager(max_clients=4, max_connection_time=600)
+        self.assertEqual(server.client_manager.max_clients, 4)
+        self.assertEqual(server.client_manager.max_connection_time, 600)
+        self.assertDictEqual(server.client_manager.clients, {})
+        self.assertDictEqual(server.client_manager.start_times, {})
+
+
+class TestGetWaitTime(unittest.TestCase):
+    def setUp(self):
+        self.server = TranscriptionServer()
+        self.server.client_manager = ClientManager(max_clients=4, max_connection_time=600)
+        self.server.client_manager.start_times = {
+            'client1': time.time() - 120,
+            'client2': time.time() - 300
+        }
+        self.server.client_manager.max_connection_time = 600
+
+    def test_get_wait_time(self):
+        expected_wait_time = (600 - (time.time() - self.server.client_manager.start_times['client2'])) / 60
+        print(self.server.client_manager.get_wait_time(), expected_wait_time)
+        self.assertAlmostEqual(self.server.client_manager.get_wait_time(), expected_wait_time, places=2)
+
+
+class TestServerConnection(unittest.TestCase):
+    def setUp(self):
+        self.server = TranscriptionServer()
+
+    @mock.patch('websockets.WebSocketCommonProtocol')
+    def test_connection(self, mock_websocket):
+        mock_websocket.recv.return_value = json.dumps({
+            'uid': 'test_client',
+            'language': 'en',
+            'task': 'transcribe',
+            'model': 'tiny.en'
+        })
+        self.server.recv_audio(mock_websocket, BackendType("faster_whisper"))
+
+    @mock.patch('websockets.WebSocketCommonProtocol')
+    def test_recv_audio_exception_handling(self, mock_websocket):
+        mock_websocket.recv.side_effect = [json.dumps({
+            'uid': 'test_client',
+            'language': 'en',
+            'task': 'transcribe',
+            'model': 'tiny.en'
+        }), np.array([1, 2, 3]).tobytes()]
+
+        with self.assertLogs(level="ERROR"):
+            self.server.recv_audio(mock_websocket, BackendType("faster_whisper"))
+
+        self.assertNotIn(mock_websocket, self.server.client_manager.clients)
+
+
+class TestServerInferenceAccuracy(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.mock_pyaudio_patch = mock.patch('pyaudio.PyAudio')
+        cls.mock_pyaudio = cls.mock_pyaudio_patch.start()
+        cls.mock_pyaudio.return_value.open.return_value = mock.MagicMock()
+
+        cls.server_process = subprocess.Popen(["python", "run_server.py"])
+        time.sleep(2)
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.server_process.terminate()
+        cls.server_process.wait()
+
+    def setUp(self):
+        self.normalizer = EnglishTextNormalizer()
+
+    def check_prediction(self, srt_path):
+        gt = "And so my fellow Americans, ask not, what your country can do for you. Ask what you can do for your country!"
+        with open(srt_path, "r") as f:
+            lines = f.readlines()
+            prediction = " ".join([line.strip() for line in lines[2::4]])
+        prediction_normalized = self.normalizer(prediction)
+        gt_normalized = self.normalizer(gt)
+
+        # calculate WER
+        wer_score = jiwer.wer(gt_normalized, prediction_normalized)
+        self.assertLess(wer_score, 0.05)
+
+    def test_inference(self):
+        client = TranscriptionClient(
+            "localhost", "9090", model="base.en", lang="en",
+        )
+        client("assets/jfk.flac")
+        self.check_prediction("output.srt")
+
+    def test_simultaneous_inference(self):
+        client1 = Client(
+            "localhost", "9090", model="base.en", lang="en", srt_file_path="transcript1.srt")
+        client2 = Client(
+            "localhost", "9090", model="base.en", lang="en", srt_file_path="transcript2.srt")
+        tee = TranscriptionTeeClient([client1, client2])
+        tee("assets/jfk.flac")
+        self.check_prediction("transcript1.srt")
+        self.check_prediction("transcript2.srt")
+
+
+class TestExceptionHandling(unittest.TestCase):
+    def setUp(self):
+        self.server = TranscriptionServer()
+
+    @mock.patch('websockets.WebSocketCommonProtocol')
+    def test_connection_closed_exception(self, mock_websocket):
+        mock_websocket.recv.side_effect = ConnectionClosed(1001, "testing connection closed", rcvd_then_sent=mock.Mock())
+
+        with self.assertLogs(level="INFO") as log:
+            self.server.recv_audio(mock_websocket, BackendType("faster_whisper"))
+            self.assertTrue(any("Connection closed by client" in message for message in log.output))
+
+    @mock.patch('websockets.WebSocketCommonProtocol')
+    def test_json_decode_exception(self, mock_websocket):
+        mock_websocket.recv.return_value = "invalid json"
+
+        with self.assertLogs(level="ERROR") as log:
+            self.server.recv_audio(mock_websocket, BackendType("faster_whisper"))
+            self.assertTrue(any("Failed to decode JSON from client" in message for message in log.output))
+
+    @mock.patch('websockets.WebSocketCommonProtocol')
+    def test_unexpected_exception_handling(self, mock_websocket):
+        mock_websocket.recv.side_effect = RuntimeError("Unexpected error")
+
+        with self.assertLogs(level="ERROR") as log:
+            self.server.recv_audio(mock_websocket, BackendType("faster_whisper"))
+            for message in log.output:
+                print(message)
+                print()
+            self.assertTrue(any("Unexpected error" in message for message in log.output))
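`check_prediction` above scores the transcript against the reference with word error rate. A standalone sketch of that metric using `jiwer` and the same normalizer; the hypothesis string here is invented for the example:

```python
# Illustrative WER computation mirroring check_prediction; the hypothesis
# string is made up and not produced by the server.
import jiwer
from whisper.normalizers import EnglishTextNormalizer

normalizer = EnglishTextNormalizer()

reference = "And so my fellow Americans, ask not, what your country can do for you. Ask what you can do for your country!"
hypothesis = "and so my fellow americans ask not what your country can do for you ask what you can do for your country"

wer = jiwer.wer(normalizer(reference), normalizer(hypothesis))
print(f"WER: {wer:.3f}")  # the server test asserts this stays below 0.05
```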
tests/test_vad.py
ADDED
@@ -0,0 +1,26 @@
+import unittest
+import numpy as np
+from whisper_live.transcriber.tensorrt_utils import load_audio
+from whisper_live.vad import VoiceActivityDetector
+
+
+class TestVoiceActivityDetection(unittest.TestCase):
+    def setUp(self):
+        self.vad = VoiceActivityDetector()
+        self.sample_rate = 16000
+
+    def generate_silence(self, duration_seconds):
+        return np.zeros(int(self.sample_rate * duration_seconds), dtype=np.float32)
+
+    def load_speech_segment(self, filepath):
+        return load_audio(filepath)
+
+    def test_vad_silence_detection(self):
+        silence = self.generate_silence(3)
+        is_speech_present = self.vad(silence.copy())
+        self.assertFalse(is_speech_present, "VAD incorrectly identified silence as speech.")
+
+    def test_vad_speech_detection(self):
+        audio_tensor = load_audio("assets/jfk.flac")
+        is_speech_present = self.vad(audio_tensor)
+        self.assertTrue(is_speech_present, "VAD failed to identify speech segment.")
whisper_live/__init__.py
ADDED
File without changes
whisper_live/__version__.py
ADDED
@@ -0,0 +1 @@
+__version__ = "0.7.1"
whisper_live/backend/__init__.py
ADDED
File without changes
whisper_live/backend/base.py
ADDED
@@ -0,0 +1,361 @@
+import json
+import logging
+import threading
+import time
+import numpy as np
+
+
+class ServeClientBase(object):
+    RATE = 16000
+    SERVER_READY = "SERVER_READY"
+    DISCONNECT = "DISCONNECT"
+
+    client_uid: str
+    """A unique identifier for the client."""
+    websocket: object
+    """The WebSocket connection for the client."""
+    send_last_n_segments: int
+    """Number of most recent segments to send to the client."""
+    no_speech_thresh: float
+    """Segments with no speech probability above this threshold will be discarded."""
+    clip_audio: bool
+    """Whether to clip audio with no valid segments."""
+    same_output_threshold: int
+    """Number of repeated outputs before considering it as a valid segment."""
+
+    def __init__(
+        self,
+        client_uid,
+        websocket,
+        send_last_n_segments=10,
+        no_speech_thresh=0.45,
+        clip_audio=False,
+        same_output_threshold=10,
+    ):
+        self.client_uid = client_uid
+        self.websocket = websocket
+        self.send_last_n_segments = send_last_n_segments
+        self.no_speech_thresh = no_speech_thresh
+        self.clip_audio = clip_audio
+        self.same_output_threshold = same_output_threshold
+
+        self.frames = b""
+        self.timestamp_offset = 0.0
+        self.frames_np = None
+        self.frames_offset = 0.0
+        self.text = []
+        self.current_out = ""
+        self.prev_out = ""
+        self.exit = False
+        self.same_output_count = 0
+        self.transcript = []
+        self.end_time_for_same_output = None
+
+        # threading
+        self.lock = threading.Lock()
+
+    def speech_to_text(self):
+        """
+        Process an audio stream in an infinite loop, continuously transcribing the speech.
+
+        This method continuously receives audio frames, performs real-time transcription, and sends
+        transcribed segments to the client via a WebSocket connection.
+
+        If the client's language is not detected, it waits for 30 seconds of audio input to make a language prediction.
+        It utilizes the Whisper ASR model to transcribe the audio, continuously processing and streaming results. Segments
+        are sent to the client in real-time, and a history of segments is maintained to provide context.
+
+        Raises:
+            Exception: If there is an issue with audio processing or WebSocket communication.
+
+        """
+        while True:
+            if self.exit:
+                logging.info("Exiting speech to text thread")
+                break
+
+            if self.frames_np is None:
+                continue
+
+            if self.clip_audio:
+                self.clip_audio_if_no_valid_segment()
+
+            input_bytes, duration = self.get_audio_chunk_for_processing()
+            if duration < 1.0:
+                time.sleep(0.1)  # wait for audio chunks to arrive
+                continue
+            try:
+                input_sample = input_bytes.copy()
+                result = self.transcribe_audio(input_sample)
+
+                if result is None or self.language is None:
+                    self.timestamp_offset += duration
+                    time.sleep(0.25)  # wait for voice activity, result is None when no voice activity
+                    continue
+                self.handle_transcription_output(result, duration)
+
+            except Exception as e:
+                logging.error(f"[ERROR]: Failed to transcribe audio chunk: {e}")
+                time.sleep(0.01)
+
+    def transcribe_audio(self):
+        raise NotImplementedError
+
+    def handle_transcription_output(self, result, duration):
|
106 |
+
|
107 |
+
def format_segment(self, start, end, text, completed=False):
|
108 |
+
"""
|
109 |
+
Formats a transcription segment with precise start and end times alongside the transcribed text.
|
110 |
+
|
111 |
+
Args:
|
112 |
+
start (float): The start time of the transcription segment in seconds.
|
113 |
+
end (float): The end time of the transcription segment in seconds.
|
114 |
+
text (str): The transcribed text corresponding to the segment.
|
115 |
+
|
116 |
+
Returns:
|
117 |
+
dict: A dictionary representing the formatted transcription segment, including
|
118 |
+
'start' and 'end' times as strings with three decimal places and the 'text'
|
119 |
+
of the transcription.
|
120 |
+
"""
|
121 |
+
return {
|
122 |
+
'start': "{:.3f}".format(start),
|
123 |
+
'end': "{:.3f}".format(end),
|
124 |
+
'text': text,
|
125 |
+
'completed': completed
|
126 |
+
}
|
127 |
+
|
128 |
+
def add_frames(self, frame_np):
|
129 |
+
"""
|
130 |
+
Add audio frames to the ongoing audio stream buffer.
|
131 |
+
|
132 |
+
This method is responsible for maintaining the audio stream buffer, allowing the continuous addition
|
133 |
+
of audio frames as they are received. It also ensures that the buffer does not exceed a specified size
|
134 |
+
to prevent excessive memory usage.
|
135 |
+
|
136 |
+
If the buffer size exceeds a threshold (45 seconds of audio data), it discards the oldest 30 seconds
|
137 |
+
of audio data to maintain a reasonable buffer size. If the buffer is empty, it initializes it with the provided
|
138 |
+
audio frame. The audio stream buffer is used for real-time processing of audio data for transcription.
|
139 |
+
|
140 |
+
Args:
|
141 |
+
frame_np (numpy.ndarray): The audio frame data as a NumPy array.
|
142 |
+
|
143 |
+
"""
|
144 |
+
self.lock.acquire()
|
145 |
+
if self.frames_np is not None and self.frames_np.shape[0] > 45*self.RATE:
|
146 |
+
self.frames_offset += 30.0
|
147 |
+
self.frames_np = self.frames_np[int(30*self.RATE):]
|
148 |
+
# check timestamp offset (should be >= self.frames_offset)
|
149 |
+
# this basically means that there is no speech as the timestamp offset hasn't updated
|
150 |
+
# and is less than frames_offset
|
151 |
+
if self.timestamp_offset < self.frames_offset:
|
152 |
+
self.timestamp_offset = self.frames_offset
|
153 |
+
if self.frames_np is None:
|
154 |
+
self.frames_np = frame_np.copy()
|
155 |
+
else:
|
156 |
+
self.frames_np = np.concatenate((self.frames_np, frame_np), axis=0)
|
157 |
+
self.lock.release()
|
158 |
+
|
159 |
+
def clip_audio_if_no_valid_segment(self):
|
160 |
+
"""
|
161 |
+
Update the timestamp offset based on audio buffer status.
|
162 |
+
Clip audio if the current chunk exceeds 30 seconds; this implies that whisper produced
|
163 |
+
no valid segment for the last 30 seconds of audio.
|
164 |
+
"""
|
165 |
+
with self.lock:
|
166 |
+
if self.frames_np[int((self.timestamp_offset - self.frames_offset)*self.RATE):].shape[0] > 25 * self.RATE:
|
167 |
+
duration = self.frames_np.shape[0] / self.RATE
|
168 |
+
self.timestamp_offset = self.frames_offset + duration - 5
|
169 |
+
|
170 |
+
def get_audio_chunk_for_processing(self):
|
171 |
+
"""
|
172 |
+
Retrieves the next chunk of audio data for processing based on the current offsets.
|
173 |
+
|
174 |
+
Calculates which part of the audio data should be processed next, based on
|
175 |
+
the difference between the current timestamp offset and the frame's offset, scaled by
|
176 |
+
the audio sample rate (RATE). It then returns this chunk of audio data along with its
|
177 |
+
duration in seconds.
|
178 |
+
|
179 |
+
Returns:
|
180 |
+
tuple: A tuple containing:
|
181 |
+
- input_bytes (np.ndarray): The next chunk of audio data to be processed.
|
182 |
+
- duration (float): The duration of the audio chunk in seconds.
|
183 |
+
"""
|
184 |
+
with self.lock:
|
185 |
+
samples_take = max(0, (self.timestamp_offset - self.frames_offset) * self.RATE)
|
186 |
+
input_bytes = self.frames_np[int(samples_take):].copy()
|
187 |
+
duration = input_bytes.shape[0] / self.RATE
|
188 |
+
return input_bytes, duration
|
189 |
+
|
190 |
+
def prepare_segments(self, last_segment=None):
|
191 |
+
"""
|
192 |
+
Prepares the segments of transcribed text to be sent to the client.
|
193 |
+
|
194 |
+
This method compiles the recent segments of transcribed text, ensuring that only the
|
195 |
+
specified number of the most recent segments are included. It also appends the most
|
196 |
+
recent segment of text if provided (which is considered incomplete because of the possibility
|
197 |
+
of the last word being truncated in the audio chunk).
|
198 |
+
|
199 |
+
Args:
|
200 |
+
last_segment (str, optional): The most recent segment of transcribed text to be added
|
201 |
+
to the list of segments. Defaults to None.
|
202 |
+
|
203 |
+
Returns:
|
204 |
+
list: A list of transcribed text segments to be sent to the client.
|
205 |
+
"""
|
206 |
+
segments = []
|
207 |
+
if len(self.transcript) >= self.send_last_n_segments:
|
208 |
+
segments = self.transcript[-self.send_last_n_segments:].copy()
|
209 |
+
else:
|
210 |
+
segments = self.transcript.copy()
|
211 |
+
if last_segment is not None:
|
212 |
+
segments = segments + [last_segment]
|
213 |
+
return segments
|
214 |
+
|
215 |
+
def get_audio_chunk_duration(self, input_bytes):
|
216 |
+
"""
|
217 |
+
Calculates the duration of the provided audio chunk.
|
218 |
+
|
219 |
+
Args:
|
220 |
+
input_bytes (numpy.ndarray): The audio chunk for which to calculate the duration.
|
221 |
+
|
222 |
+
Returns:
|
223 |
+
float: The duration of the audio chunk in seconds.
|
224 |
+
"""
|
225 |
+
return input_bytes.shape[0] / self.RATE
|
226 |
+
|
227 |
+
def send_transcription_to_client(self, segments):
|
228 |
+
"""
|
229 |
+
Sends the specified transcription segments to the client over the websocket connection.
|
230 |
+
|
231 |
+
This method formats the transcription segments into a JSON object and attempts to send
|
232 |
+
this object to the client. If an error occurs during the send operation, it logs the error.
|
233 |
+
|
234 |
+
Args:
|
235 |
+
segments (list): A list of transcription segments to be sent to the client.
|
236 |
+
"""
|
237 |
+
try:
|
238 |
+
self.websocket.send(
|
239 |
+
json.dumps({
|
240 |
+
"uid": self.client_uid,
|
241 |
+
"segments": segments,
|
242 |
+
})
|
243 |
+
)
|
244 |
+
except Exception as e:
|
245 |
+
logging.error(f"[ERROR]: Sending data to client: {e}")
|
246 |
+
|
247 |
+
def disconnect(self):
|
248 |
+
"""
|
249 |
+
Notify the client of disconnection and send a disconnect message.
|
250 |
+
|
251 |
+
This method sends a disconnect message to the client via the WebSocket connection to notify them
|
252 |
+
that the transcription service is disconnecting gracefully.
|
253 |
+
|
254 |
+
"""
|
255 |
+
self.websocket.send(json.dumps({
|
256 |
+
"uid": self.client_uid,
|
257 |
+
"message": self.DISCONNECT
|
258 |
+
}))
|
259 |
+
|
260 |
+
def cleanup(self):
|
261 |
+
"""
|
262 |
+
Perform cleanup tasks before exiting the transcription service.
|
263 |
+
|
264 |
+
This method performs necessary cleanup tasks, including stopping the transcription thread, marking
|
265 |
+
the exit flag to indicate the transcription thread should exit gracefully, and destroying resources
|
266 |
+
associated with the transcription process.
|
267 |
+
|
268 |
+
"""
|
269 |
+
logging.info("Cleaning up.")
|
270 |
+
self.exit = True
|
271 |
+
|
272 |
+
def get_segment_no_speech_prob(self, segment):
|
273 |
+
return getattr(segment, "no_speech_prob", 0)
|
274 |
+
|
275 |
+
def get_segment_start(self, segment):
|
276 |
+
return getattr(segment, "start", getattr(segment, "start_ts", 0))
|
277 |
+
|
278 |
+
def get_segment_end(self, segment):
|
279 |
+
return getattr(segment, "end", getattr(segment, "end_ts", 0))
|
280 |
+
|
281 |
+
def update_segments(self, segments, duration):
|
282 |
+
"""
|
283 |
+
Processes the segments from Whisper and updates the transcript.
|
284 |
+
Uses helper methods to account for differences between backends.
|
285 |
+
|
286 |
+
Args:
|
287 |
+
segments (list): List of segments returned by the transcriber.
|
288 |
+
duration (float): Duration of the current audio chunk.
|
289 |
+
|
290 |
+
Returns:
|
291 |
+
dict or None: The last processed segment (if any).
|
292 |
+
"""
|
293 |
+
offset = None
|
294 |
+
self.current_out = ''
|
295 |
+
last_segment = None
|
296 |
+
|
297 |
+
# Process complete segments only if there are more than one
|
298 |
+
# and if the last segment's no_speech_prob is below the threshold.
|
299 |
+
if len(segments) > 1 and self.get_segment_no_speech_prob(segments[-1]) <= self.no_speech_thresh:
|
300 |
+
for s in segments[:-1]:
|
301 |
+
text_ = s.text
|
302 |
+
self.text.append(text_)
|
303 |
+
with self.lock:
|
304 |
+
start = self.timestamp_offset + self.get_segment_start(s)
|
305 |
+
end = self.timestamp_offset + min(duration, self.get_segment_end(s))
|
306 |
+
if start >= end:
|
307 |
+
continue
|
308 |
+
if self.get_segment_no_speech_prob(s) > self.no_speech_thresh:
|
309 |
+
continue
|
310 |
+
self.transcript.append(self.format_segment(start, end, text_, completed=True))
|
311 |
+
offset = min(duration, self.get_segment_end(s))
|
312 |
+
|
313 |
+
# Process the last segment if its no_speech_prob is acceptable.
|
314 |
+
if self.get_segment_no_speech_prob(segments[-1]) <= self.no_speech_thresh:
|
315 |
+
self.current_out += segments[-1].text
|
316 |
+
with self.lock:
|
317 |
+
last_segment = self.format_segment(
|
318 |
+
self.timestamp_offset + self.get_segment_start(segments[-1]),
|
319 |
+
self.timestamp_offset + min(duration, self.get_segment_end(segments[-1])),
|
320 |
+
self.current_out,
|
321 |
+
completed=False
|
322 |
+
)
|
323 |
+
|
324 |
+
# Handle repeated output logic.
|
325 |
+
if self.current_out.strip() == self.prev_out.strip() and self.current_out != '':
|
326 |
+
self.same_output_count += 1
|
327 |
+
|
328 |
+
# If we remove audio because the output repeated for the nth time, we might drop audio
|
329 |
+
# that has not been transcribed yet, so capture the time when the output first repeated.
|
330 |
+
if self.end_time_for_same_output is None:
|
331 |
+
self.end_time_for_same_output = self.get_segment_end(segments[-1])
|
332 |
+
time.sleep(0.1) # wait briefly for any new voice activity
|
333 |
+
else:
|
334 |
+
self.same_output_count = 0
|
335 |
+
self.end_time_for_same_output = None
|
336 |
+
|
337 |
+
# If the same incomplete segment is repeated too many times,
|
338 |
+
# append it to the transcript and update the offset.
|
339 |
+
if self.same_output_count > self.same_output_threshold:
|
340 |
+
if not self.text or self.text[-1].strip().lower() != self.current_out.strip().lower():
|
341 |
+
self.text.append(self.current_out)
|
342 |
+
with self.lock:
|
343 |
+
self.transcript.append(self.format_segment(
|
344 |
+
self.timestamp_offset,
|
345 |
+
self.timestamp_offset + min(duration, self.end_time_for_same_output),
|
346 |
+
self.current_out,
|
347 |
+
completed=True
|
348 |
+
))
|
349 |
+
self.current_out = ''
|
350 |
+
offset = min(duration, self.end_time_for_same_output)
|
351 |
+
self.same_output_count = 0
|
352 |
+
last_segment = None
|
353 |
+
self.end_time_for_same_output = None
|
354 |
+
else:
|
355 |
+
self.prev_out = self.current_out
|
356 |
+
|
357 |
+
if offset is not None:
|
358 |
+
with self.lock:
|
359 |
+
self.timestamp_offset += offset
|
360 |
+
|
361 |
+
return last_segment
|
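
ServeClientBase deliberately leaves transcribe_audio and handle_transcription_output unimplemented; each backend supplies them. A hypothetical minimal subclass (the ServeClientDummy name and the SimpleNamespace segments below are illustrative only, not part of this commit) might look roughly like:

    # Sketch of a custom backend on top of ServeClientBase; SimpleNamespace
    # objects stand in for whatever a real model would return.
    from types import SimpleNamespace

    from whisper_live.backend.base import ServeClientBase


    class ServeClientDummy(ServeClientBase):
        def transcribe_audio(self, input_sample):
            # A real backend runs a model here; this emits one fake segment
            # covering the whole chunk.
            duration = input_sample.shape[0] / self.RATE
            return [SimpleNamespace(start=0.0, end=duration,
                                    text="(dummy)", no_speech_prob=0.0)]

        def handle_transcription_output(self, result, duration):
            # Same flow the shipped backends use: fold finished segments into
            # the transcript, then push the most recent ones to the client.
            if len(result):
                last_segment = self.update_segments(result, duration)
                segments = self.prepare_segments(last_segment)
                if len(segments):
                    self.send_transcription_to_client(segments)
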
whisper_live/backend/faster_whisper_backend.py
ADDED
@@ -0,0 +1,216 @@
1 |
+
import json
|
2 |
+
import logging
|
3 |
+
import threading
|
4 |
+
import time
|
5 |
+
import torch
|
6 |
+
|
7 |
+
from whisper_live.transcriber.transcriber_faster_whisper import WhisperModel
|
8 |
+
from whisper_live.backend.base import ServeClientBase
|
9 |
+
|
10 |
+
|
11 |
+
class ServeClientFasterWhisper(ServeClientBase):
|
12 |
+
SINGLE_MODEL = None
|
13 |
+
SINGLE_MODEL_LOCK = threading.Lock()
|
14 |
+
|
15 |
+
def __init__(
|
16 |
+
self,
|
17 |
+
websocket,
|
18 |
+
task="transcribe",
|
19 |
+
device=None,
|
20 |
+
language=None,
|
21 |
+
client_uid=None,
|
22 |
+
model="small.en",
|
23 |
+
initial_prompt=None,
|
24 |
+
vad_parameters=None,
|
25 |
+
use_vad=True,
|
26 |
+
single_model=False,
|
27 |
+
send_last_n_segments=10,
|
28 |
+
no_speech_thresh=0.45,
|
29 |
+
clip_audio=False,
|
30 |
+
same_output_threshold=10,
|
31 |
+
):
|
32 |
+
"""
|
33 |
+
Initialize a ServeClient instance.
|
34 |
+
The Whisper model is initialized based on the client's language and device availability.
|
35 |
+
The transcription thread is started upon initialization. A "SERVER_READY" message is sent
|
36 |
+
to the client to indicate that the server is ready.
|
37 |
+
|
38 |
+
Args:
|
39 |
+
websocket (WebSocket): The WebSocket connection for the client.
|
40 |
+
task (str, optional): The task type, e.g., "transcribe". Defaults to "transcribe".
|
41 |
+
device (str, optional): The device type for Whisper, "cuda" or "cpu". Defaults to None.
|
42 |
+
language (str, optional): The language for transcription. Defaults to None.
|
43 |
+
client_uid (str, optional): A unique identifier for the client. Defaults to None.
|
44 |
+
model (str, optional): The whisper model size. Defaults to 'small.en'
|
45 |
+
initial_prompt (str, optional): Prompt for whisper inference. Defaults to None.
|
46 |
+
single_model (bool, optional): Whether to share a single model instance across all client connections instead of loading one per client. Defaults to False.
|
47 |
+
send_last_n_segments (int, optional): Number of most recent segments to send to the client. Defaults to 10.
|
48 |
+
no_speech_thresh (float, optional): Segments with no speech probability above this threshold will be discarded. Defaults to 0.45.
|
49 |
+
clip_audio (bool, optional): Whether to clip audio with no valid segments. Defaults to False.
|
50 |
+
same_output_threshold (int, optional): Number of repeated outputs before considering it as a valid segment. Defaults to 10.
|
51 |
+
|
52 |
+
"""
|
53 |
+
super().__init__(
|
54 |
+
client_uid,
|
55 |
+
websocket,
|
56 |
+
send_last_n_segments,
|
57 |
+
no_speech_thresh,
|
58 |
+
clip_audio,
|
59 |
+
same_output_threshold,
|
60 |
+
)
|
61 |
+
self.model_sizes = [
|
62 |
+
"tiny", "tiny.en", "base", "base.en", "small", "small.en",
|
63 |
+
"medium", "medium.en", "large-v2", "large-v3", "distil-small.en",
|
64 |
+
"distil-medium.en", "distil-large-v2", "distil-large-v3",
|
65 |
+
"large-v3-turbo", "turbo"
|
66 |
+
]
|
67 |
+
|
68 |
+
self.model_size_or_path = model
|
69 |
+
self.language = "en" if self.model_size_or_path.endswith("en") else language
|
70 |
+
self.task = task
|
71 |
+
self.initial_prompt = initial_prompt
|
72 |
+
self.vad_parameters = vad_parameters or {"onset": 0.5}
|
73 |
+
|
74 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
75 |
+
if device == "cuda":
|
76 |
+
major, _ = torch.cuda.get_device_capability(device)
|
77 |
+
self.compute_type = "float16" if major >= 7 else "float32"
|
78 |
+
else:
|
79 |
+
self.compute_type = "int8"
|
80 |
+
|
81 |
+
if self.model_size_or_path is None:
|
82 |
+
return
|
83 |
+
logging.info(f"Using Device={device} with precision {self.compute_type}")
|
84 |
+
|
85 |
+
try:
|
86 |
+
if single_model:
|
87 |
+
if ServeClientFasterWhisper.SINGLE_MODEL is None:
|
88 |
+
self.create_model(device)
|
89 |
+
ServeClientFasterWhisper.SINGLE_MODEL = self.transcriber
|
90 |
+
else:
|
91 |
+
self.transcriber = ServeClientFasterWhisper.SINGLE_MODEL
|
92 |
+
else:
|
93 |
+
self.create_model(device)
|
94 |
+
except Exception as e:
|
95 |
+
logging.error(f"Failed to load model: {e}")
|
96 |
+
self.websocket.send(json.dumps({
|
97 |
+
"uid": self.client_uid,
|
98 |
+
"status": "ERROR",
|
99 |
+
"message": f"Failed to load model: {str(self.model_size_or_path)}"
|
100 |
+
}))
|
101 |
+
self.websocket.close()
|
102 |
+
return
|
103 |
+
|
104 |
+
self.use_vad = use_vad
|
105 |
+
|
106 |
+
# threading
|
107 |
+
self.trans_thread = threading.Thread(target=self.speech_to_text)
|
108 |
+
self.trans_thread.start()
|
109 |
+
self.websocket.send(
|
110 |
+
json.dumps(
|
111 |
+
{
|
112 |
+
"uid": self.client_uid,
|
113 |
+
"message": self.SERVER_READY,
|
114 |
+
"backend": "faster_whisper"
|
115 |
+
}
|
116 |
+
)
|
117 |
+
)
|
118 |
+
|
119 |
+
def create_model(self, device):
|
120 |
+
"""
|
121 |
+
Instantiates a new model, sets it as the transcriber.
|
122 |
+
"""
|
123 |
+
self.transcriber = WhisperModel(
|
124 |
+
self.model_size_or_path,
|
125 |
+
device=device,
|
126 |
+
compute_type=self.compute_type,
|
127 |
+
local_files_only=False,
|
128 |
+
)
|
129 |
+
|
130 |
+
def check_valid_model(self, model_size):
|
131 |
+
"""
|
132 |
+
Check if it's a valid whisper model size.
|
133 |
+
|
134 |
+
Args:
|
135 |
+
model_size (str): The name of the model size to check.
|
136 |
+
|
137 |
+
Returns:
|
138 |
+
str: The model size if valid, None otherwise.
|
139 |
+
"""
|
140 |
+
if model_size not in self.model_sizes:
|
141 |
+
self.websocket.send(
|
142 |
+
json.dumps(
|
143 |
+
{
|
144 |
+
"uid": self.client_uid,
|
145 |
+
"status": "ERROR",
|
146 |
+
"message": f"Invalid model size {model_size}. Available choices: {self.model_sizes}"
|
147 |
+
}
|
148 |
+
)
|
149 |
+
)
|
150 |
+
return None
|
151 |
+
return model_size
|
152 |
+
|
153 |
+
def set_language(self, info):
|
154 |
+
"""
|
155 |
+
Updates the language attribute based on the detected language information.
|
156 |
+
|
157 |
+
Args:
|
158 |
+
info (object): An object containing the detected language and its probability. This object
|
159 |
+
must have at least two attributes: `language`, a string indicating the detected
|
160 |
+
language, and `language_probability`, a float representing the confidence level
|
161 |
+
of the language detection.
|
162 |
+
"""
|
163 |
+
if info.language_probability > 0.5:
|
164 |
+
self.language = info.language
|
165 |
+
logging.info(f"Detected language {self.language} with probability {info.language_probability}")
|
166 |
+
self.websocket.send(json.dumps(
|
167 |
+
{"uid": self.client_uid, "language": self.language, "language_prob": info.language_probability}))
|
168 |
+
|
169 |
+
def transcribe_audio(self, input_sample):
|
170 |
+
"""
|
171 |
+
Transcribes the provided audio sample using the configured transcriber instance.
|
172 |
+
|
173 |
+
If the language has not been set, it updates the session's language based on the transcription
|
174 |
+
information.
|
175 |
+
|
176 |
+
Args:
|
177 |
+
input_sample (np.array): The audio chunk to be transcribed. This should be a NumPy
|
178 |
+
array representing the audio data.
|
179 |
+
|
180 |
+
Returns:
|
181 |
+
The transcription result from the transcriber. The exact format of this result
|
182 |
+
depends on the implementation of the `transcriber.transcribe` method but typically
|
183 |
+
includes the transcribed text.
|
184 |
+
"""
|
185 |
+
if ServeClientFasterWhisper.SINGLE_MODEL:
|
186 |
+
ServeClientFasterWhisper.SINGLE_MODEL_LOCK.acquire()
|
187 |
+
result, info = self.transcriber.transcribe(
|
188 |
+
input_sample,
|
189 |
+
initial_prompt=self.initial_prompt,
|
190 |
+
language=self.language,
|
191 |
+
task=self.task,
|
192 |
+
vad_filter=self.use_vad,
|
193 |
+
vad_parameters=self.vad_parameters if self.use_vad else None)
|
194 |
+
if ServeClientFasterWhisper.SINGLE_MODEL:
|
195 |
+
ServeClientFasterWhisper.SINGLE_MODEL_LOCK.release()
|
196 |
+
|
197 |
+
if self.language is None and info is not None:
|
198 |
+
self.set_language(info)
|
199 |
+
return result
|
200 |
+
|
201 |
+
def handle_transcription_output(self, result, duration):
|
202 |
+
"""
|
203 |
+
Handle the transcription output, updating the transcript and sending data to the client.
|
204 |
+
|
205 |
+
Args:
|
206 |
+
result (str): The result from whisper inference i.e. the list of segments.
|
207 |
+
duration (float): Duration of the transcribed audio chunk.
|
208 |
+
"""
|
209 |
+
segments = []
|
210 |
+
if len(result):
|
211 |
+
self.t_start = None
|
212 |
+
last_segment = self.update_segments(result, duration)
|
213 |
+
segments = self.prepare_segments(last_segment)
|
214 |
+
|
215 |
+
if len(segments):
|
216 |
+
self.send_transcription_to_client(segments)
|
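
The per-client flow for this backend: the server pushes float32 PCM at 16 kHz into add_frames, and the speech_to_text thread started in __init__ pulls chunks, transcribes them, and streams segments back over the websocket. A rough usage sketch outside the server (the FakeSocket stub is illustrative; a real deployment passes the live websocket connection):

    # Feed audio into a ServeClientFasterWhisper instance for local
    # experimentation. Loads the tiny.en model on first use.
    import numpy as np

    from whisper_live.backend.faster_whisper_backend import ServeClientFasterWhisper


    class FakeSocket:
        def send(self, message):
            print("to client:", message[:120])

        def close(self):
            pass


    client = ServeClientFasterWhisper(
        websocket=FakeSocket(),
        client_uid="demo",
        model="tiny.en",
        use_vad=False,
    )
    client.add_frames(np.zeros(3 * 16000, dtype=np.float32))  # 3 s of silence
    # ... later, stop the transcription thread:
    client.cleanup()
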
whisper_live/backend/openvino_backend.py
ADDED
@@ -0,0 +1,148 @@
1 |
+
import json
|
2 |
+
import logging
|
3 |
+
import threading
|
4 |
+
import time
|
5 |
+
|
6 |
+
from openvino import Core
|
7 |
+
from whisper_live.backend.base import ServeClientBase
|
8 |
+
from whisper_live.transcriber.transcriber_openvino import WhisperOpenVINO
|
9 |
+
|
10 |
+
|
11 |
+
class ServeClientOpenVINO(ServeClientBase):
|
12 |
+
SINGLE_MODEL = None
|
13 |
+
SINGLE_MODEL_LOCK = threading.Lock()
|
14 |
+
|
15 |
+
def __init__(
|
16 |
+
self,
|
17 |
+
websocket,
|
18 |
+
task="transcribe",
|
19 |
+
device=None,
|
20 |
+
language=None,
|
21 |
+
client_uid=None,
|
22 |
+
model="small.en",
|
23 |
+
initial_prompt=None,
|
24 |
+
vad_parameters=None,
|
25 |
+
use_vad=True,
|
26 |
+
single_model=False,
|
27 |
+
send_last_n_segments=10,
|
28 |
+
no_speech_thresh=0.45,
|
29 |
+
clip_audio=False,
|
30 |
+
same_output_threshold=10,
|
31 |
+
):
|
32 |
+
"""
|
33 |
+
Initialize a ServeClient instance.
|
34 |
+
The Whisper model is initialized based on the client's language and device availability.
|
35 |
+
The transcription thread is started upon initialization. A "SERVER_READY" message is sent
|
36 |
+
to the client to indicate that the server is ready.
|
37 |
+
|
38 |
+
Args:
|
39 |
+
websocket (WebSocket): The WebSocket connection for the client.
|
40 |
+
task (str, optional): The task type, e.g., "transcribe." Defaults to "transcribe".
|
41 |
+
device (str, optional): The device type for Whisper, "cuda" or "cpu". Defaults to None.
|
42 |
+
language (str, optional): The language for transcription. Defaults to None.
|
43 |
+
client_uid (str, optional): A unique identifier for the client. Defaults to None.
|
44 |
+
model (str, optional): Huggingface model_id for a valid OpenVINO model.
|
45 |
+
initial_prompt (str, optional): Prompt for whisper inference. Defaults to None.
|
46 |
+
single_model (bool, optional): Whether to share a single model instance across all client connections instead of loading one per client. Defaults to False.
|
47 |
+
send_last_n_segments (int, optional): Number of most recent segments to send to the client. Defaults to 10.
|
48 |
+
no_speech_thresh (float, optional): Segments with no speech probability above this threshold will be discarded. Defaults to 0.45.
|
49 |
+
clip_audio (bool, optional): Whether to clip audio with no valid segments. Defaults to False.
|
50 |
+
same_output_threshold (int, optional): Number of repeated outputs before considering it as a valid segment. Defaults to 10.
|
51 |
+
"""
|
52 |
+
super().__init__(
|
53 |
+
client_uid,
|
54 |
+
websocket,
|
55 |
+
send_last_n_segments,
|
56 |
+
no_speech_thresh,
|
57 |
+
clip_audio,
|
58 |
+
same_output_threshold,
|
59 |
+
)
|
60 |
+
self.language = "en" if language is None else language
|
61 |
+
if not self.language.startswith("<|"):
|
62 |
+
self.language = f"<|{self.language}|>"
|
63 |
+
|
64 |
+
self.task = "transcribe" if task is None else task
|
65 |
+
|
66 |
+
self.clip_audio = True
|
67 |
+
|
68 |
+
core = Core()
|
69 |
+
available_devices = core.available_devices
|
70 |
+
if 'GPU' in available_devices:
|
71 |
+
selected_device = 'GPU'
|
72 |
+
else:
|
73 |
+
gpu_devices = [d for d in available_devices if d.startswith('GPU')]
|
74 |
+
selected_device = gpu_devices[0] if gpu_devices else 'CPU'
|
75 |
+
self.device = selected_device
|
76 |
+
|
77 |
+
|
78 |
+
if single_model:
|
79 |
+
if ServeClientOpenVINO.SINGLE_MODEL is None:
|
80 |
+
self.create_model(model)
|
81 |
+
ServeClientOpenVINO.SINGLE_MODEL = self.transcriber
|
82 |
+
else:
|
83 |
+
self.transcriber = ServeClientOpenVINO.SINGLE_MODEL
|
84 |
+
else:
|
85 |
+
self.create_model(model)
|
86 |
+
|
87 |
+
# threading
|
88 |
+
self.trans_thread = threading.Thread(target=self.speech_to_text)
|
89 |
+
self.trans_thread.start()
|
90 |
+
|
91 |
+
self.websocket.send(json.dumps({
|
92 |
+
"uid": self.client_uid,
|
93 |
+
"message": self.SERVER_READY,
|
94 |
+
"backend": "openvino"
|
95 |
+
}))
|
96 |
+
logging.info(f"Using OpenVINO device: {self.device}")
|
97 |
+
logging.info(f"Running OpenVINO backend with language: {self.language} and task: {self.task}")
|
98 |
+
|
99 |
+
def create_model(self, model_id):
|
100 |
+
"""
|
101 |
+
Instantiates a new model, sets it as the transcriber.
|
102 |
+
"""
|
103 |
+
self.transcriber = WhisperOpenVINO(
|
104 |
+
model_id,
|
105 |
+
device=self.device,
|
106 |
+
language=self.language,
|
107 |
+
task=self.task
|
108 |
+
)
|
109 |
+
|
110 |
+
def transcribe_audio(self, input_sample):
|
111 |
+
"""
|
112 |
+
Transcribes the provided audio sample using the configured transcriber instance.
|
113 |
+
|
114 |
+
If the language has not been set, it updates the session's language based on the transcription
|
115 |
+
information.
|
116 |
+
|
117 |
+
Args:
|
118 |
+
input_sample (np.array): The audio chunk to be transcribed. This should be a NumPy
|
119 |
+
array representing the audio data.
|
120 |
+
|
121 |
+
Returns:
|
122 |
+
The transcription result from the transcriber. The exact format of this result
|
123 |
+
depends on the implementation of the `transcriber.transcribe` method but typically
|
124 |
+
includes the transcribed text.
|
125 |
+
"""
|
126 |
+
if ServeClientOpenVINO.SINGLE_MODEL:
|
127 |
+
ServeClientOpenVINO.SINGLE_MODEL_LOCK.acquire()
|
128 |
+
result = self.transcriber.transcribe(input_sample)
|
129 |
+
if ServeClientOpenVINO.SINGLE_MODEL:
|
130 |
+
ServeClientOpenVINO.SINGLE_MODEL_LOCK.release()
|
131 |
+
return result
|
132 |
+
|
133 |
+
def handle_transcription_output(self, result, duration):
|
134 |
+
"""
|
135 |
+
Handle the transcription output, updating the transcript and sending data to the client.
|
136 |
+
|
137 |
+
Args:
|
138 |
+
result (str): The result from whisper inference i.e. the list of segments.
|
139 |
+
duration (float): Duration of the transcribed audio chunk.
|
140 |
+
"""
|
141 |
+
segments = []
|
142 |
+
if len(result):
|
143 |
+
self.t_start = None
|
144 |
+
last_segment = self.update_segments(result, duration)
|
145 |
+
segments = self.prepare_segments(last_segment)
|
146 |
+
|
147 |
+
if len(segments):
|
148 |
+
self.send_transcription_to_client(segments)
|
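
One detail specific to this backend: WhisperOpenVINO expects the language as a Whisper special token, so a plain ISO code is wrapped before the model is created. An illustrative check of that wrapping logic:

    # Mirrors the language handling in ServeClientOpenVINO.__init__:
    # a bare ISO code becomes a Whisper language token.
    language = "de"
    if not language.startswith("<|"):
        language = f"<|{language}|>"
    assert language == "<|de|>"
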
whisper_live/backend/trt_backend.py
ADDED
@@ -0,0 +1,210 @@
1 |
+
import json
|
2 |
+
import logging
|
3 |
+
import threading
|
4 |
+
import time
|
5 |
+
|
6 |
+
from whisper_live.backend.base import ServeClientBase
|
7 |
+
from whisper_live.transcriber.transcriber_tensorrt import WhisperTRTLLM
|
8 |
+
|
9 |
+
|
10 |
+
class ServeClientTensorRT(ServeClientBase):
|
11 |
+
SINGLE_MODEL = None
|
12 |
+
SINGLE_MODEL_LOCK = threading.Lock()
|
13 |
+
|
14 |
+
def __init__(
|
15 |
+
self,
|
16 |
+
websocket,
|
17 |
+
task="transcribe",
|
18 |
+
multilingual=False,
|
19 |
+
language=None,
|
20 |
+
client_uid=None,
|
21 |
+
model=None,
|
22 |
+
single_model=False,
|
23 |
+
use_py_session=False,
|
24 |
+
max_new_tokens=225,
|
25 |
+
send_last_n_segments=10,
|
26 |
+
no_speech_thresh=0.45,
|
27 |
+
clip_audio=False,
|
28 |
+
same_output_threshold=10,
|
29 |
+
):
|
30 |
+
"""
|
31 |
+
Initialize a ServeClient instance.
|
32 |
+
The Whisper model is initialized based on the client's language and device availability.
|
33 |
+
The transcription thread is started upon initialization. A "SERVER_READY" message is sent
|
34 |
+
to the client to indicate that the server is ready.
|
35 |
+
|
36 |
+
Args:
|
37 |
+
websocket (WebSocket): The WebSocket connection for the client.
|
38 |
+
task (str, optional): The task type, e.g., "transcribe." Defaults to "transcribe".
|
39 |
+
device (str, optional): The device type for Whisper, "cuda" or "cpu". Defaults to None.
|
40 |
+
multilingual (bool, optional): Whether the client supports multilingual transcription. Defaults to False.
|
41 |
+
language (str, optional): The language for transcription. Defaults to None.
|
42 |
+
client_uid (str, optional): A unique identifier for the client. Defaults to None.
|
43 |
+
single_model (bool, optional): Whether to share a single model instance across all client connections instead of loading one per client. Defaults to False.
|
44 |
+
use_py_session (bool, optional): Whether to use the Python session instead of the C++ session. Defaults to the C++ session.
|
45 |
+
max_new_tokens (int, optional): Max number of tokens to generate.
|
46 |
+
send_last_n_segments (int, optional): Number of most recent segments to send to the client. Defaults to 10.
|
47 |
+
no_speech_thresh (float, optional): Segments with no speech probability above this threshold will be discarded. Defaults to 0.45.
|
48 |
+
clip_audio (bool, optional): Whether to clip audio with no valid segments. Defaults to False.
|
49 |
+
same_output_threshold (int, optional): Number of repeated outputs before considering it as a valid segment. Defaults to 10.
|
50 |
+
"""
|
51 |
+
super().__init__(
|
52 |
+
client_uid,
|
53 |
+
websocket,
|
54 |
+
send_last_n_segments,
|
55 |
+
no_speech_thresh,
|
56 |
+
clip_audio,
|
57 |
+
same_output_threshold,
|
58 |
+
)
|
59 |
+
|
60 |
+
self.language = language if multilingual else "en"
|
61 |
+
self.task = task
|
62 |
+
self.eos = False
|
63 |
+
self.max_new_tokens = max_new_tokens
|
64 |
+
|
65 |
+
if single_model:
|
66 |
+
if ServeClientTensorRT.SINGLE_MODEL is None:
|
67 |
+
self.create_model(model, multilingual, use_py_session=use_py_session)
|
68 |
+
ServeClientTensorRT.SINGLE_MODEL = self.transcriber
|
69 |
+
else:
|
70 |
+
self.transcriber = ServeClientTensorRT.SINGLE_MODEL
|
71 |
+
else:
|
72 |
+
self.create_model(model, multilingual, use_py_session=use_py_session)
|
73 |
+
|
74 |
+
# threading
|
75 |
+
self.trans_thread = threading.Thread(target=self.speech_to_text)
|
76 |
+
self.trans_thread.start()
|
77 |
+
|
78 |
+
self.websocket.send(json.dumps({
|
79 |
+
"uid": self.client_uid,
|
80 |
+
"message": self.SERVER_READY,
|
81 |
+
"backend": "tensorrt"
|
82 |
+
}))
|
83 |
+
|
84 |
+
def create_model(self, model, multilingual, warmup=True, use_py_session=False):
|
85 |
+
"""
|
86 |
+
Instantiates a new model, sets it as the transcriber and does warmup if desired.
|
87 |
+
"""
|
88 |
+
self.transcriber = WhisperTRTLLM(
|
89 |
+
model,
|
90 |
+
assets_dir="assets",
|
91 |
+
device="cuda",
|
92 |
+
is_multilingual=multilingual,
|
93 |
+
language=self.language,
|
94 |
+
task=self.task,
|
95 |
+
use_py_session=use_py_session,
|
96 |
+
max_output_len=self.max_new_tokens,
|
97 |
+
)
|
98 |
+
if warmup:
|
99 |
+
self.warmup()
|
100 |
+
|
101 |
+
def warmup(self, warmup_steps=10):
|
102 |
+
"""
|
103 |
+
Warmup TensorRT since first few inferences are slow.
|
104 |
+
|
105 |
+
Args:
|
106 |
+
warmup_steps (int): Number of steps to warm up the model for.
|
107 |
+
"""
|
108 |
+
logging.info("[INFO:] Warming up TensorRT engine..")
|
109 |
+
mel, _ = self.transcriber.log_mel_spectrogram("assets/jfk.flac")
|
110 |
+
for i in range(warmup_steps):
|
111 |
+
self.transcriber.transcribe(mel)
|
112 |
+
|
113 |
+
def set_eos(self, eos):
|
114 |
+
"""
|
115 |
+
Sets the End of Speech (EOS) flag.
|
116 |
+
|
117 |
+
Args:
|
118 |
+
eos (bool): The value to set for the EOS flag.
|
119 |
+
"""
|
120 |
+
self.lock.acquire()
|
121 |
+
self.eos = eos
|
122 |
+
self.lock.release()
|
123 |
+
|
124 |
+
def handle_transcription_output(self, last_segment, duration):
|
125 |
+
"""
|
126 |
+
Handle the transcription output, updating the transcript and sending data to the client.
|
127 |
+
|
128 |
+
Args:
|
129 |
+
last_segment (str): The last segment from the whisper output which is considered to be incomplete because
|
130 |
+
of the possibility of a word being truncated.
|
131 |
+
duration (float): Duration of the transcribed audio chunk.
|
132 |
+
"""
|
133 |
+
segments = self.prepare_segments({"text": last_segment})
|
134 |
+
self.send_transcription_to_client(segments)
|
135 |
+
if self.eos:
|
136 |
+
self.update_timestamp_offset(last_segment, duration)
|
137 |
+
|
138 |
+
def transcribe_audio(self, input_bytes):
|
139 |
+
"""
|
140 |
+
Transcribe the audio chunk and send the results to the client.
|
141 |
+
|
142 |
+
Args:
|
143 |
+
input_bytes (np.array): The audio chunk to transcribe.
|
144 |
+
"""
|
145 |
+
if ServeClientTensorRT.SINGLE_MODEL:
|
146 |
+
ServeClientTensorRT.SINGLE_MODEL_LOCK.acquire()
|
147 |
+
logging.info(f"[WhisperTensorRT:] Processing audio with duration: {input_bytes.shape[0] / self.RATE}")
|
148 |
+
mel, duration = self.transcriber.log_mel_spectrogram(input_bytes)
|
149 |
+
last_segment = self.transcriber.transcribe(
|
150 |
+
mel,
|
151 |
+
text_prefix=f"<|startoftranscript|><|{self.language}|><|{self.task}|><|notimestamps|>",
|
152 |
+
)
|
153 |
+
if ServeClientTensorRT.SINGLE_MODEL:
|
154 |
+
ServeClientTensorRT.SINGLE_MODEL_LOCK.release()
|
155 |
+
if last_segment:
|
156 |
+
self.handle_transcription_output(last_segment, duration)
|
157 |
+
|
158 |
+
def update_timestamp_offset(self, last_segment, duration):
|
159 |
+
"""
|
160 |
+
Update timestamp offset and transcript.
|
161 |
+
|
162 |
+
Args:
|
163 |
+
last_segment (str): Last transcribed audio from the whisper model.
|
164 |
+
duration (float): Duration of the last audio chunk.
|
165 |
+
"""
|
166 |
+
if not len(self.transcript):
|
167 |
+
self.transcript.append({"text": last_segment + " "})
|
168 |
+
elif self.transcript[-1]["text"].strip() != last_segment:
|
169 |
+
self.transcript.append({"text": last_segment + " "})
|
170 |
+
|
171 |
+
with self.lock:
|
172 |
+
self.timestamp_offset += duration
|
173 |
+
|
174 |
+
def speech_to_text(self):
|
175 |
+
"""
|
176 |
+
Process an audio stream in an infinite loop, continuously transcribing the speech.
|
177 |
+
|
178 |
+
This method continuously receives audio frames, performs real-time transcription, and sends
|
179 |
+
transcribed segments to the client via a WebSocket connection.
|
180 |
+
|
181 |
+
If the client's language is not detected, it waits for 30 seconds of audio input to make a language prediction.
|
182 |
+
It utilizes the Whisper ASR model to transcribe the audio, continuously processing and streaming results. Segments
|
183 |
+
are sent to the client in real-time, and a history of segments is maintained to provide context.
|
184 |
+
|
185 |
+
Raises:
|
186 |
+
Exception: If there is an issue with audio processing or WebSocket communication.
|
187 |
+
|
188 |
+
"""
|
189 |
+
while True:
|
190 |
+
if self.exit:
|
191 |
+
logging.info("Exiting speech to text thread")
|
192 |
+
break
|
193 |
+
|
194 |
+
if self.frames_np is None:
|
195 |
+
time.sleep(0.02) # wait for any audio to arrive
|
196 |
+
continue
|
197 |
+
|
198 |
+
self.clip_audio_if_no_valid_segment()
|
199 |
+
|
200 |
+
input_bytes, duration = self.get_audio_chunk_for_processing()
|
201 |
+
if duration < 0.4:
|
202 |
+
continue
|
203 |
+
|
204 |
+
try:
|
205 |
+
input_sample = input_bytes.copy()
|
206 |
+
logging.info(f"[WhisperTensorRT:] Processing audio with duration: {duration}")
|
207 |
+
self.transcribe_audio(input_sample)
|
208 |
+
|
209 |
+
except Exception as e:
|
210 |
+
logging.error(f"[ERROR]: {e}")
|
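
For the TensorRT backend, the decoding prompt is assembled from the language and task each time a chunk is transcribed. An illustrative reconstruction of that prefix for English transcription without timestamps:

    # The text prefix ServeClientTensorRT passes to WhisperTRTLLM.transcribe.
    language, task = "en", "transcribe"
    text_prefix = f"<|startoftranscript|><|{language}|><|{task}|><|notimestamps|>"
    print(text_prefix)  # <|startoftranscript|><|en|><|transcribe|><|notimestamps|>
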
whisper_live/client.py
ADDED
@@ -0,0 +1,782 @@
1 |
+
import os
|
2 |
+
import shutil
|
3 |
+
import wave
|
4 |
+
|
5 |
+
import logging
|
6 |
+
import numpy as np
|
7 |
+
import pyaudio
|
8 |
+
import threading
|
9 |
+
import json
|
10 |
+
import websocket
|
11 |
+
import uuid
|
12 |
+
import time
|
13 |
+
import av
|
14 |
+
import whisper_live.utils as utils
|
15 |
+
|
16 |
+
|
17 |
+
class Client:
|
18 |
+
"""
|
19 |
+
Handles communication with a server using WebSocket.
|
20 |
+
"""
|
21 |
+
INSTANCES = {}
|
22 |
+
END_OF_AUDIO = "END_OF_AUDIO"
|
23 |
+
|
24 |
+
def __init__(
|
25 |
+
self,
|
26 |
+
host=None,
|
27 |
+
port=None,
|
28 |
+
lang=None,
|
29 |
+
translate=False,
|
30 |
+
model="small",
|
31 |
+
srt_file_path="output.srt",
|
32 |
+
use_vad=True,
|
33 |
+
use_wss=False,
|
34 |
+
log_transcription=True,
|
35 |
+
max_clients=4,
|
36 |
+
max_connection_time=600,
|
37 |
+
send_last_n_segments=10,
|
38 |
+
no_speech_thresh=0.45,
|
39 |
+
clip_audio=False,
|
40 |
+
same_output_threshold=10,
|
41 |
+
transcription_callback=None,
|
42 |
+
):
|
43 |
+
"""
|
44 |
+
Initializes a Client instance for audio recording and streaming to a server.
|
45 |
+
|
46 |
+
If host and port are not provided, the WebSocket connection will not be established.
|
47 |
+
When translate is True, the task will be set to "translate" instead of "transcribe".
|
48 |
+
The audio recording starts immediately upon initialization.
|
49 |
+
|
50 |
+
Args:
|
51 |
+
host (str): The hostname or IP address of the server.
|
52 |
+
port (int): The port number for the WebSocket server.
|
53 |
+
lang (str, optional): The selected language for transcription. Default is None.
|
54 |
+
translate (bool, optional): Specifies if the task is translation. Default is False.
|
55 |
+
model (str, optional): The whisper model to use (e.g., "small", "medium", "large"). Default is "small".
|
56 |
+
srt_file_path (str, optional): The file path to save the output SRT file. Default is "output.srt".
|
57 |
+
use_vad (bool, optional): Whether to enable voice activity detection. Default is True.
|
58 |
+
log_transcription (bool, optional): Whether to log transcription output to the console. Default is True.
|
59 |
+
max_clients (int, optional): Maximum number of client connections allowed. Default is 4.
|
60 |
+
max_connection_time (int, optional): Maximum allowed connection time in seconds. Default is 600.
|
61 |
+
send_last_n_segments (int, optional): Number of most recent segments to send to the client. Defaults to 10.
|
62 |
+
no_speech_thresh (float, optional): Segments with no speech probability above this threshold will be discarded. Defaults to 0.45.
|
63 |
+
clip_audio (bool, optional): Whether to clip audio with no valid segments. Defaults to False.
|
64 |
+
same_output_threshold (int, optional): Number of repeated outputs before considering it as a valid segment. Defaults to 10.
|
65 |
+
transcription_callback (callable, optional): A callback function to handle transcription results. Default is None.
|
66 |
+
"""
|
67 |
+
self.recording = False
|
68 |
+
self.task = "transcribe"
|
69 |
+
self.uid = str(uuid.uuid4())
|
70 |
+
self.waiting = False
|
71 |
+
self.last_response_received = None
|
72 |
+
self.disconnect_if_no_response_for = 15
|
73 |
+
self.language = lang
|
74 |
+
self.model = model
|
75 |
+
self.server_error = False
|
76 |
+
self.srt_file_path = srt_file_path
|
77 |
+
self.use_vad = use_vad
|
78 |
+
self.use_wss = use_wss
|
79 |
+
self.last_segment = None
|
80 |
+
self.last_received_segment = None
|
81 |
+
self.log_transcription = log_transcription
|
82 |
+
self.max_clients = max_clients
|
83 |
+
self.max_connection_time = max_connection_time
|
84 |
+
self.send_last_n_segments = send_last_n_segments
|
85 |
+
self.no_speech_thresh = no_speech_thresh
|
86 |
+
self.clip_audio = clip_audio
|
87 |
+
self.same_output_threshold = same_output_threshold
|
88 |
+
self.transcription_callback = transcription_callback
|
89 |
+
|
90 |
+
if translate:
|
91 |
+
self.task = "translate"
|
92 |
+
|
93 |
+
self.audio_bytes = None
|
94 |
+
|
95 |
+
if host is not None and port is not None:
|
96 |
+
socket_protocol = 'wss' if self.use_wss else "ws"
|
97 |
+
socket_url = f"{socket_protocol}://{host}:{port}"
|
98 |
+
self.client_socket = websocket.WebSocketApp(
|
99 |
+
socket_url,
|
100 |
+
on_open=lambda ws: self.on_open(ws),
|
101 |
+
on_message=lambda ws, message: self.on_message(ws, message),
|
102 |
+
on_error=lambda ws, error: self.on_error(ws, error),
|
103 |
+
on_close=lambda ws, close_status_code, close_msg: self.on_close(
|
104 |
+
ws, close_status_code, close_msg
|
105 |
+
),
|
106 |
+
)
|
107 |
+
else:
|
108 |
+
print("[ERROR]: No host or port specified.")
|
109 |
+
return
|
110 |
+
|
111 |
+
Client.INSTANCES[self.uid] = self
|
112 |
+
|
113 |
+
# start websocket client in a thread
|
114 |
+
self.ws_thread = threading.Thread(target=self.client_socket.run_forever)
|
115 |
+
self.ws_thread.daemon = True
|
116 |
+
self.ws_thread.start()
|
117 |
+
|
118 |
+
self.transcript = []
|
119 |
+
print("[INFO]: * recording")
|
120 |
+
|
121 |
+
def handle_status_messages(self, message_data):
|
122 |
+
"""Handles server status messages."""
|
123 |
+
status = message_data["status"]
|
124 |
+
if status == "WAIT":
|
125 |
+
self.waiting = True
|
126 |
+
print(f"[INFO]: Server is full. Estimated wait time {round(message_data['message'])} minutes.")
|
127 |
+
elif status == "ERROR":
|
128 |
+
print(f"Message from Server: {message_data['message']}")
|
129 |
+
self.server_error = True
|
130 |
+
elif status == "WARNING":
|
131 |
+
print(f"Message from Server: {message_data['message']}")
|
132 |
+
|
133 |
+
def process_segments(self, segments):
|
134 |
+
"""Processes transcript segments."""
|
135 |
+
text = []
|
136 |
+
for i, seg in enumerate(segments):
|
137 |
+
if not text or text[-1] != seg["text"]:
|
138 |
+
text.append(seg["text"])
|
139 |
+
if i == len(segments) - 1 and not seg.get("completed", False):
|
140 |
+
self.last_segment = seg
|
141 |
+
elif (self.server_backend == "faster_whisper" and seg.get("completed", False) and
|
142 |
+
(not self.transcript or
|
143 |
+
float(seg['start']) >= float(self.transcript[-1]['end']))):
|
144 |
+
self.transcript.append(seg)
|
145 |
+
# update last received segment and last valid response time
|
146 |
+
if self.last_received_segment is None or self.last_received_segment != segments[-1]["text"]:
|
147 |
+
self.last_response_received = time.time()
|
148 |
+
self.last_received_segment = segments[-1]["text"]
|
149 |
+
|
150 |
+
# call the transcription callback if provided
|
151 |
+
if self.transcription_callback and callable(self.transcription_callback):
|
152 |
+
try:
|
153 |
+
self.transcription_callback(" ".join(text), segments) # string, list
|
154 |
+
except Exception as e:
|
155 |
+
print(f"[WARN] transcription_callback raised: {e}")
|
156 |
+
return
|
157 |
+
|
158 |
+
if self.log_transcription:
|
159 |
+
# Truncate to last 3 entries for brevity.
|
160 |
+
text = text[-3:]
|
161 |
+
utils.clear_screen()
|
162 |
+
utils.print_transcript(text)
|
163 |
+
|
164 |
+
def on_message(self, ws, message):
|
165 |
+
"""
|
166 |
+
Callback function called when a message is received from the server.
|
167 |
+
|
168 |
+
It updates various attributes of the client based on the received message, including
|
169 |
+
recording status, language detection, and server messages. If a disconnect message
|
170 |
+
is received, it sets the recording status to False.
|
171 |
+
|
172 |
+
Args:
|
173 |
+
ws (websocket.WebSocketApp): The WebSocket client instance.
|
174 |
+
message (str): The received message from the server.
|
175 |
+
|
176 |
+
"""
|
177 |
+
message = json.loads(message)
|
178 |
+
|
179 |
+
if self.uid != message.get("uid"):
|
180 |
+
print("[ERROR]: invalid client uid")
|
181 |
+
return
|
182 |
+
|
183 |
+
if "status" in message.keys():
|
184 |
+
self.handle_status_messages(message)
|
185 |
+
return
|
186 |
+
|
187 |
+
if "message" in message.keys() and message["message"] == "DISCONNECT":
|
188 |
+
print("[INFO]: Server disconnected due to overtime.")
|
189 |
+
self.recording = False
|
190 |
+
|
191 |
+
if "message" in message.keys() and message["message"] == "SERVER_READY":
|
192 |
+
self.last_response_received = time.time()
|
193 |
+
self.recording = True
|
194 |
+
self.server_backend = message["backend"]
|
195 |
+
print(f"[INFO]: Server Running with backend {self.server_backend}")
|
196 |
+
return
|
197 |
+
|
198 |
+
if "language" in message.keys():
|
199 |
+
self.language = message.get("language")
|
200 |
+
lang_prob = message.get("language_prob")
|
201 |
+
print(
|
202 |
+
f"[INFO]: Server detected language {self.language} with probability {lang_prob}"
|
203 |
+
)
|
204 |
+
return
|
205 |
+
|
206 |
+
if "segments" in message.keys():
|
207 |
+
self.process_segments(message["segments"])
|
208 |
+
|
209 |
+
def on_error(self, ws, error):
|
210 |
+
print(f"[ERROR] WebSocket Error: {error}")
|
211 |
+
self.server_error = True
|
212 |
+
self.error_message = error
|
213 |
+
|
214 |
+
def on_close(self, ws, close_status_code, close_msg):
|
215 |
+
print(f"[INFO]: Websocket connection closed: {close_status_code}: {close_msg}")
|
216 |
+
self.recording = False
|
217 |
+
self.waiting = False
|
218 |
+
|
219 |
+
def on_open(self, ws):
|
220 |
+
"""
|
221 |
+
Callback function called when the WebSocket connection is successfully opened.
|
222 |
+
|
223 |
+
Sends an initial configuration message to the server, including client UID,
|
224 |
+
language selection, and task type.
|
225 |
+
|
226 |
+
Args:
|
227 |
+
ws (websocket.WebSocketApp): The WebSocket client instance.
|
228 |
+
|
229 |
+
"""
|
230 |
+
print("[INFO]: Opened connection")
|
231 |
+
ws.send(
|
232 |
+
json.dumps(
|
233 |
+
{
|
234 |
+
"uid": self.uid,
|
235 |
+
"language": self.language,
|
236 |
+
"task": self.task,
|
237 |
+
"model": self.model,
|
238 |
+
"use_vad": self.use_vad,
|
239 |
+
"max_clients": self.max_clients,
|
240 |
+
"max_connection_time": self.max_connection_time,
|
241 |
+
"send_last_n_segments": self.send_last_n_segments,
|
242 |
+
"no_speech_thresh": self.no_speech_thresh,
|
243 |
+
"clip_audio": self.clip_audio,
|
244 |
+
"same_output_threshold": self.same_output_threshold,
|
245 |
+
}
|
246 |
+
)
|
247 |
+
)
|
248 |
+
|
249 |
+
def send_packet_to_server(self, message):
|
250 |
+
"""
|
251 |
+
Send an audio packet to the server using WebSocket.
|
252 |
+
|
253 |
+
Args:
|
254 |
+
message (bytes): The audio data packet in bytes to be sent to the server.
|
255 |
+
|
256 |
+
"""
|
257 |
+
try:
|
258 |
+
self.client_socket.send(message, websocket.ABNF.OPCODE_BINARY)
|
259 |
+
except Exception as e:
|
260 |
+
print(e)
|
261 |
+
|
262 |
+
def close_websocket(self):
|
263 |
+
"""
|
264 |
+
Close the WebSocket connection and join the WebSocket thread.
|
265 |
+
|
266 |
+
First attempts to close the WebSocket connection using `self.client_socket.close()`. After
|
267 |
+
closing the connection, it joins the WebSocket thread to ensure proper termination.
|
268 |
+
|
269 |
+
"""
|
270 |
+
try:
|
271 |
+
self.client_socket.close()
|
272 |
+
except Exception as e:
|
273 |
+
print("[ERROR]: Error closing WebSocket:", e)
|
274 |
+
|
275 |
+
try:
|
276 |
+
self.ws_thread.join()
|
277 |
+
except Exception as e:
|
278 |
+
print("[ERROR:] Error joining WebSocket thread:", e)
|
279 |
+
|
280 |
+
def get_client_socket(self):
|
281 |
+
"""
|
282 |
+
Get the WebSocket client socket instance.
|
283 |
+
|
284 |
+
Returns:
|
285 |
+
WebSocketApp: The WebSocket client socket instance currently in use by the client.
|
286 |
+
"""
|
287 |
+
return self.client_socket
|
288 |
+
|
289 |
+
def write_srt_file(self, output_path="output.srt"):
|
290 |
+
"""
|
291 |
+
Writes out the transcript in .srt format.
|
292 |
+
|
293 |
+
Args:
|
294 |
+
output_path (str, optional): The path to the target file. Default is "output.srt".
|
295 |
+
|
296 |
+
"""
|
297 |
+
if self.server_backend == "faster_whisper":
|
298 |
+
if not self.transcript and self.last_segment is not None:
|
299 |
+
self.transcript.append(self.last_segment)
|
300 |
+
elif self.last_segment and self.transcript[-1]["text"] != self.last_segment["text"]:
|
301 |
+
self.transcript.append(self.last_segment)
|
302 |
+
utils.create_srt_file(self.transcript, output_path)
|
303 |
+
|
304 |
+
def wait_before_disconnect(self):
|
305 |
+
"""Waits a bit before disconnecting in order to process pending responses."""
|
306 |
+
assert self.last_response_received
|
307 |
+
while time.time() - self.last_response_received < self.disconnect_if_no_response_for:
|
308 |
+
continue
|
309 |
+
|
310 |
+
|
311 |
+
class TranscriptionTeeClient:
|
312 |
+
"""
|
313 |
+
Client for handling audio recording, streaming, and transcription tasks via one or more
|
314 |
+
WebSocket connections.
|
315 |
+
|
316 |
+
Acts as a high-level client for audio transcription tasks using a WebSocket connection. It can be used
|
317 |
+
to send audio data for transcription to one or more servers, and receive transcribed text segments.
|
318 |
+
Args:
|
319 |
+
clients (list): one or more previously initialized Client instances
|
320 |
+
|
321 |
+
Attributes:
|
322 |
+
clients (list): the underlying Client instances responsible for handling WebSocket connections.
|
323 |
+
"""
|
324 |
+
def __init__(self, clients, save_output_recording=False, output_recording_filename="./output_recording.wav", mute_audio_playback=False):
|
325 |
+
self.clients = clients
|
326 |
+
if not self.clients:
|
327 |
+
raise Exception("At least one client is required.")
|
328 |
+
self.chunk = 4096
|
329 |
+
self.format = pyaudio.paInt16
|
330 |
+
self.channels = 1
|
331 |
+
self.rate = 16000
|
332 |
+
self.record_seconds = 60000
|
333 |
+
self.save_output_recording = save_output_recording
|
334 |
+
self.output_recording_filename = output_recording_filename
|
335 |
+
self.mute_audio_playback = mute_audio_playback
|
336 |
+
self.frames = b""
|
337 |
+
self.p = pyaudio.PyAudio()
|
338 |
+
try:
|
339 |
+
self.stream = self.p.open(
|
340 |
+
format=self.format,
|
341 |
+
channels=self.channels,
|
342 |
+
rate=self.rate,
|
343 |
+
input=True,
|
344 |
+
frames_per_buffer=self.chunk,
|
345 |
+
)
|
346 |
+
except OSError as error:
|
347 |
+
print(f"[WARN]: Unable to access microphone. {error}")
|
348 |
+
self.stream = None
|
349 |
+
|
350 |
+
def __call__(self, audio=None, rtsp_url=None, hls_url=None, save_file=None):
|
351 |
+
"""
|
352 |
+
Start the transcription process.
|
353 |
+
|
354 |
+
Initiates the transcription process by connecting to the server via a WebSocket. It waits for the server
|
355 |
+
to be ready to receive audio data and then sends audio for transcription. If an audio file is provided, it
|
356 |
+
will be played and streamed to the server; otherwise, it will perform live recording.
|
357 |
+
|
358 |
+
Args:
|
359 |
+
audio (str, optional): Path to an audio file for transcription. Default is None, which triggers live recording.
|
360 |
+
|
361 |
+
"""
|
362 |
+
assert sum(
|
363 |
+
source is not None for source in [audio, rtsp_url, hls_url]
|
364 |
+
) <= 1, 'You must provide only one selected source'
|
365 |
+
|
366 |
+
print("[INFO]: Waiting for server ready ...")
|
367 |
+
for client in self.clients:
|
368 |
+
while not client.recording:
|
369 |
+
if client.waiting or client.server_error:
|
370 |
+
self.close_all_clients()
|
371 |
+
return
|
372 |
+
|
373 |
+
print("[INFO]: Server Ready!")
|
374 |
+
if hls_url is not None:
|
375 |
+
self.process_hls_stream(hls_url, save_file)
|
376 |
+
elif audio is not None:
|
377 |
+
resampled_file = utils.resample(audio)
|
378 |
+
self.play_file(resampled_file)
|
379 |
+
elif rtsp_url is not None:
|
380 |
+
self.process_rtsp_stream(rtsp_url)
|
381 |
+
else:
|
382 |
+
self.record()
|
383 |
+
|
384 |
+
def close_all_clients(self):
|
385 |
+
"""Closes all client websockets."""
|
386 |
+
for client in self.clients:
|
387 |
+
client.close_websocket()
|
388 |
+
|
389 |
+
def write_all_clients_srt(self):
|
390 |
+
"""Writes out .srt files for all clients."""
|
391 |
+
for client in self.clients:
|
392 |
+
client.write_srt_file(client.srt_file_path)
|
393 |
+
|
394 |
+
def multicast_packet(self, packet, unconditional=False):
|
395 |
+
"""
|
396 |
+
Sends an identical packet via all clients.
|
397 |
+
|
398 |
+
Args:
|
399 |
+
packet (bytes): The audio data packet in bytes to be sent.
|
400 |
+
unconditional (bool, optional): If true, send regardless of whether clients are recording. Default is False.
|
401 |
+
"""
|
402 |
+
for client in self.clients:
|
403 |
+
if (unconditional or client.recording):
|
404 |
+
client.send_packet_to_server(packet)
|
405 |
+
|
406 |
+
def play_file(self, filename):
|
407 |
+
"""
|
408 |
+
Play an audio file and send it to the server for processing.
|
409 |
+
|
410 |
+
Reads an audio file, plays it through the audio output, and simultaneously sends
|
411 |
+
the audio data to the server for processing. It uses PyAudio to create an audio
|
412 |
+
stream for playback. The audio data is read from the file in chunks, converted to
|
413 |
+
floating-point format, and sent to the server using WebSocket communication.
|
414 |
+
This method is typically used when you want to process pre-recorded audio and send it
|
415 |
+
to the server in real-time.
|
416 |
+
|
417 |
+
Args:
|
418 |
+
filename (str): The path to the audio file to be played and sent to the server.
|
419 |
+
"""
|
420 |
+
|
421 |
+
# read audio and create pyaudio stream
|
422 |
+
with wave.open(filename, "rb") as wavfile:
|
423 |
+
self.stream = self.p.open(
|
424 |
+
format=self.p.get_format_from_width(wavfile.getsampwidth()),
|
425 |
+
channels=wavfile.getnchannels(),
|
426 |
+
rate=wavfile.getframerate(),
|
427 |
+
input=True,
|
428 |
+
output=True,
|
429 |
+
frames_per_buffer=self.chunk,
|
430 |
+
)
|
431 |
+
chunk_duration = self.chunk / float(wavfile.getframerate())
|
432 |
+
try:
|
433 |
+
while any(client.recording for client in self.clients):
|
434 |
+
data = wavfile.readframes(self.chunk)
|
435 |
+
if data == b"":
|
436 |
+
break
|
437 |
+
|
438 |
+
audio_array = self.bytes_to_float_array(data)
|
439 |
+
self.multicast_packet(audio_array.tobytes())
|
440 |
+
if self.mute_audio_playback:
|
441 |
+
time.sleep(chunk_duration)
|
442 |
+
else:
|
443 |
+
self.stream.write(data)
|
444 |
+
|
445 |
+
wavfile.close()
|
446 |
+
|
447 |
+
for client in self.clients:
|
448 |
+
client.wait_before_disconnect()
|
449 |
+
self.multicast_packet(Client.END_OF_AUDIO.encode('utf-8'), True)
|
450 |
+
self.write_all_clients_srt()
|
451 |
+
self.stream.close()
|
452 |
+
self.close_all_clients()
|
453 |
+
|
454 |
+
except KeyboardInterrupt:
|
455 |
+
wavfile.close()
|
456 |
+
self.stream.stop_stream()
|
457 |
+
self.stream.close()
|
458 |
+
self.p.terminate()
|
459 |
+
self.close_all_clients()
|
460 |
+
self.write_all_clients_srt()
|
461 |
+
print("[INFO]: Keyboard interrupt.")
|
462 |
+
|
463 |
+
def process_rtsp_stream(self, rtsp_url):
|
464 |
+
"""
|
465 |
+
Connect to an RTSP source, process the audio stream, and send it for transcription.
|
466 |
+
|
467 |
+
Args:
|
468 |
+
rtsp_url (str): The URL of the RTSP stream source.
|
469 |
+
"""
|
470 |
+
print("[INFO]: Connecting to RTSP stream...")
|
471 |
+
try:
|
472 |
+
container = av.open(rtsp_url, format="rtsp", options={"rtsp_transport": "tcp"})
|
473 |
+
self.process_av_stream(container, stream_type="RTSP")
|
474 |
+
except Exception as e:
|
475 |
+
print(f"[ERROR]: Failed to process RTSP stream: {e}")
|
476 |
+
finally:
|
477 |
+
for client in self.clients:
|
478 |
+
client.wait_before_disconnect()
|
479 |
+
self.multicast_packet(Client.END_OF_AUDIO.encode('utf-8'), True)
|
480 |
+
self.close_all_clients()
|
481 |
+
self.write_all_clients_srt()
|
482 |
+
print("[INFO]: RTSP stream processing finished.")
|
483 |
+
|
484 |
+
def process_hls_stream(self, hls_url, save_file=None):
|
485 |
+
"""
|
486 |
+
Connect to an HLS source, process the audio stream, and send it for transcription.
|
487 |
+
|
488 |
+
Args:
|
489 |
+
hls_url (str): The URL of the HLS stream source.
|
490 |
+
save_file (str, optional): Local path to save the network stream.
|
491 |
+
"""
|
492 |
+
print("[INFO]: Connecting to HLS stream...")
|
493 |
+
try:
|
494 |
+
container = av.open(hls_url, format="hls")
|
495 |
+
self.process_av_stream(container, stream_type="HLS", save_file=save_file)
|
496 |
+
except Exception as e:
|
497 |
+
print(f"[ERROR]: Failed to process HLS stream: {e}")
|
498 |
+
finally:
|
499 |
+
for client in self.clients:
|
500 |
+
client.wait_before_disconnect()
|
501 |
+
self.multicast_packet(Client.END_OF_AUDIO.encode('utf-8'), True)
|
502 |
+
self.close_all_clients()
|
503 |
+
self.write_all_clients_srt()
|
504 |
+
print("[INFO]: HLS stream processing finished.")
|
505 |
+
|
506 |
+
def process_av_stream(self, container, stream_type, save_file=None):
|
507 |
+
"""
|
508 |
+
Process an AV container stream and send audio packets to the server.
|
509 |
+
|
510 |
+
Args:
|
511 |
+
container (av.container.InputContainer): The input container to process.
|
512 |
+
stream_type (str): The type of stream being processed ("RTSP" or "HLS").
|
513 |
+
save_file (str, optional): Local path to save the stream. Default is None.
|
514 |
+
"""
|
515 |
+
audio_stream = next((s for s in container.streams if s.type == "audio"), None)
|
516 |
+
if not audio_stream:
|
517 |
+
print(f"[ERROR]: No audio stream found in {stream_type} source.")
|
518 |
+
return
|
519 |
+
|
520 |
+
output_container = None
|
521 |
+
if save_file:
|
522 |
+
output_container = av.open(save_file, mode="w")
|
523 |
+
output_audio_stream = output_container.add_stream(codec_name="pcm_s16le", rate=self.rate)
|
524 |
+
|
525 |
+
try:
|
526 |
+
for packet in container.demux(audio_stream):
|
527 |
+
for frame in packet.decode():
|
528 |
+
audio_data = frame.to_ndarray().tobytes()
|
529 |
+
self.multicast_packet(audio_data)
|
530 |
+
|
531 |
+
if save_file:
|
532 |
+
output_container.mux(frame)
|
533 |
+
except Exception as e:
|
534 |
+
print(f"[ERROR]: Error during {stream_type} stream processing: {e}")
|
535 |
+
finally:
|
536 |
+
# Wait for server to send any leftover transcription.
|
537 |
+
time.sleep(5)
|
538 |
+
self.multicast_packet(Client.END_OF_AUDIO.encode('utf-8'), True)
|
539 |
+
if output_container:
|
540 |
+
output_container.close()
|
541 |
+
container.close()
|
542 |
+
|
543 |
+
def save_chunk(self, n_audio_file):
|
544 |
+
"""
|
545 |
+
Saves the current audio frames to a WAV file in a separate thread.
|
546 |
+
|
547 |
+
Args:
|
548 |
+
n_audio_file (int): The index of the audio file which determines the filename.
|
549 |
+
This helps in maintaining the order and uniqueness of each chunk.
|
550 |
+
"""
|
551 |
+
t = threading.Thread(
|
552 |
+
target=self.write_audio_frames_to_file,
|
553 |
+
args=(self.frames[:], f"chunks/{n_audio_file}.wav",),
|
554 |
+
)
|
555 |
+
t.start()
|
556 |
+
|
557 |
+
def finalize_recording(self, n_audio_file):
|
558 |
+
"""
|
559 |
+
Finalizes the recording process by saving any remaining audio frames,
|
560 |
+
closing the audio stream, and terminating the process.
|
561 |
+
|
562 |
+
Args:
|
563 |
+
n_audio_file (int): The file index to be used if there are remaining audio frames to be saved.
|
564 |
+
This index is incremented before use if the last chunk is saved.
|
565 |
+
"""
|
566 |
+
if self.save_output_recording and len(self.frames):
|
567 |
+
self.write_audio_frames_to_file(
|
568 |
+
self.frames[:], f"chunks/{n_audio_file}.wav"
|
569 |
+
)
|
570 |
+
n_audio_file += 1
|
571 |
+
self.stream.stop_stream()
|
572 |
+
self.stream.close()
|
573 |
+
self.p.terminate()
|
574 |
+
self.close_all_clients()
|
575 |
+
if self.save_output_recording:
|
576 |
+
self.write_output_recording(n_audio_file)
|
577 |
+
self.write_all_clients_srt()
|
578 |
+
|
579 |
+
def record(self):
|
580 |
+
"""
|
581 |
+
Record audio data from the input stream and save it to a WAV file.
|
582 |
+
|
583 |
+
Continuously records audio data from the input stream, sends it to the server via a WebSocket
|
584 |
+
connection, and simultaneously saves it to multiple WAV files in chunks. It stops recording when
|
585 |
+
the `RECORD_SECONDS` duration is reached or when the `RECORDING` flag is set to `False`.
|
586 |
+
|
587 |
+
Audio data is saved in chunks to the "chunks" directory. Each chunk is saved as a separate WAV file.
|
588 |
+
The recording will continue until the specified duration is reached or until the `RECORDING` flag is set to `False`.
|
589 |
+
The recording process can be interrupted by sending a KeyboardInterrupt (e.g., pressing Ctrl+C). After recording,
|
590 |
+
the method combines all the saved audio chunks into the specified `out_file`.
|
591 |
+
"""
|
592 |
+
n_audio_file = 0
|
593 |
+
if self.save_output_recording:
|
594 |
+
if os.path.exists("chunks"):
|
595 |
+
shutil.rmtree("chunks")
|
596 |
+
os.makedirs("chunks")
|
597 |
+
try:
|
598 |
+
for _ in range(0, int(self.rate / self.chunk * self.record_seconds)):
|
599 |
+
if not any(client.recording for client in self.clients):
|
600 |
+
break
|
601 |
+
data = self.stream.read(self.chunk, exception_on_overflow=False)
|
602 |
+
self.frames += data
|
603 |
+
|
604 |
+
audio_array = self.bytes_to_float_array(data)
|
605 |
+
|
606 |
+
self.multicast_packet(audio_array.tobytes())
|
607 |
+
|
608 |
+
# save frames if more than a minute
|
609 |
+
if len(self.frames) > 60 * self.rate:
|
610 |
+
if self.save_output_recording:
|
611 |
+
self.save_chunk(n_audio_file)
|
612 |
+
n_audio_file += 1
|
613 |
+
self.frames = b""
|
614 |
+
self.write_all_clients_srt()
|
615 |
+
|
616 |
+
except KeyboardInterrupt:
|
617 |
+
self.finalize_recording(n_audio_file)
|
618 |
+
|
619 |
+
def write_audio_frames_to_file(self, frames, file_name):
|
620 |
+
"""
|
621 |
+
Write audio frames to a WAV file.
|
622 |
+
|
623 |
+
The WAV file is created or overwritten with the specified name. The audio frames should be
|
624 |
+
in the correct format and match the specified channel, sample width, and sample rate.
|
625 |
+
|
626 |
+
Args:
|
627 |
+
frames (bytes): The audio frames to be written to the file.
|
628 |
+
file_name (str): The name of the WAV file to which the frames will be written.
|
629 |
+
|
630 |
+
"""
|
631 |
+
with wave.open(file_name, "wb") as wavfile:
|
632 |
+
wavfile: wave.Wave_write
|
633 |
+
wavfile.setnchannels(self.channels)
|
634 |
+
wavfile.setsampwidth(2)
|
635 |
+
wavfile.setframerate(self.rate)
|
636 |
+
wavfile.writeframes(frames)
|
637 |
+
|
638 |
+
def write_output_recording(self, n_audio_file):
|
639 |
+
"""
|
640 |
+
Combine and save recorded audio chunks into a single WAV file.
|
641 |
+
|
642 |
+
The individual audio chunk files are expected to be located in the "chunks" directory. Reads each chunk
|
643 |
+
file, appends its audio data to the final recording, and then deletes the chunk file. After combining
|
644 |
+
and saving, the final recording is stored in the specified `out_file`.
|
645 |
+
|
646 |
+
|
647 |
+
Args:
|
648 |
+
n_audio_file (int): The number of audio chunk files to combine.
|
649 |
+
out_file (str): The name of the output WAV file to save the final recording.
|
650 |
+
|
651 |
+
"""
|
652 |
+
input_files = [
|
653 |
+
f"chunks/{i}.wav"
|
654 |
+
for i in range(n_audio_file)
|
655 |
+
if os.path.exists(f"chunks/{i}.wav")
|
656 |
+
]
|
657 |
+
with wave.open(self.output_recording_filename, "wb") as wavfile:
|
658 |
+
wavfile: wave.Wave_write
|
659 |
+
wavfile.setnchannels(self.channels)
|
660 |
+
wavfile.setsampwidth(2)
|
661 |
+
wavfile.setframerate(self.rate)
|
662 |
+
for in_file in input_files:
|
663 |
+
with wave.open(in_file, "rb") as wav_in:
|
664 |
+
while True:
|
665 |
+
data = wav_in.readframes(self.chunk)
|
666 |
+
if data == b"":
|
667 |
+
break
|
668 |
+
wavfile.writeframes(data)
|
669 |
+
# remove this file
|
670 |
+
os.remove(in_file)
|
671 |
+
wavfile.close()
|
672 |
+
# clean up temporary directory to store chunks
|
673 |
+
if os.path.exists("chunks"):
|
674 |
+
shutil.rmtree("chunks")
|
675 |
+
|
676 |
+
@staticmethod
|
677 |
+
def bytes_to_float_array(audio_bytes):
|
678 |
+
"""
|
679 |
+
Convert audio data from bytes to a NumPy float array.
|
680 |
+
|
681 |
+
It assumes that the audio data is in 16-bit PCM format. The audio data is normalized to
|
682 |
+
have values between -1 and 1.
|
683 |
+
|
684 |
+
Args:
|
685 |
+
audio_bytes (bytes): Audio data in bytes.
|
686 |
+
|
687 |
+
Returns:
|
688 |
+
np.ndarray: A NumPy array containing the audio data as float values normalized between -1 and 1.
|
689 |
+
"""
|
690 |
+
raw_data = np.frombuffer(buffer=audio_bytes, dtype=np.int16)
|
691 |
+
return raw_data.astype(np.float32) / 32768.0
|
692 |
+
|
693 |
+
|
694 |
+
class TranscriptionClient(TranscriptionTeeClient):
|
695 |
+
"""
|
696 |
+
Client for handling audio transcription tasks via a single WebSocket connection.
|
697 |
+
|
698 |
+
Acts as a high-level client for audio transcription tasks using a WebSocket connection. It can be used
|
699 |
+
to send audio data for transcription to a server and receive transcribed text segments.
|
700 |
+
|
701 |
+
Args:
|
702 |
+
host (str): The hostname or IP address of the server.
|
703 |
+
port (int): The port number to connect to on the server.
|
704 |
+
lang (str, optional): The primary language for transcription. Default is None, which defaults to English ('en').
|
705 |
+
translate (bool, optional): If True, the task will be translation instead of transcription. Default is False.
|
706 |
+
model (str, optional): The whisper model to use (e.g., "small", "base"). Default is "small".
|
707 |
+
use_vad (bool, optional): Whether to enable voice activity detection. Default is True.
|
708 |
+
save_output_recording (bool, optional): Whether to save the microphone recording. Default is False.
|
709 |
+
output_recording_filename (str, optional): Path to save the output recording WAV file. Default is "./output_recording.wav".
|
710 |
+
output_transcription_path (str, optional): File path to save the output transcription (SRT file). Default is "./output.srt".
|
711 |
+
log_transcription (bool, optional): Whether to log transcription output to the console. Default is True.
|
712 |
+
max_clients (int, optional): Maximum number of client connections allowed. Default is 4.
|
713 |
+
max_connection_time (int, optional): Maximum allowed connection time in seconds. Default is 600.
|
714 |
+
mute_audio_playback (bool, optional): If True, mutes audio playback during file playback. Default is False.
|
715 |
+
send_last_n_segments (int, optional): Number of most recent segments to send to the client. Defaults to 10.
|
716 |
+
no_speech_thresh (float, optional): Segments with no speech probability above this threshold will be discarded. Defaults to 0.45.
|
717 |
+
clip_audio (bool, optional): Whether to clip audio with no valid segments. Defaults to False.
|
718 |
+
same_output_threshold (int, optional): Number of repeated outputs before considering it as a valid segment. Defaults to 10.
|
719 |
+
transcription_callback (callable, optional): A callback function to handle transcription results. Default is None.
|
720 |
+
|
721 |
+
Attributes:
|
722 |
+
client (Client): An instance of the underlying Client class responsible for handling the WebSocket connection.
|
723 |
+
|
724 |
+
Example:
|
725 |
+
To create a TranscriptionClient and start transcription on microphone audio:
|
726 |
+
```python
|
727 |
+
transcription_client = TranscriptionClient(host="localhost", port=9090)
|
728 |
+
transcription_client()
|
729 |
+
```
|
730 |
+
"""
|
731 |
+
def __init__(
|
732 |
+
self,
|
733 |
+
host,
|
734 |
+
port,
|
735 |
+
lang=None,
|
736 |
+
translate=False,
|
737 |
+
model="small",
|
738 |
+
use_vad=True,
|
739 |
+
use_wss=False,
|
740 |
+
save_output_recording=False,
|
741 |
+
output_recording_filename="./output_recording.wav",
|
742 |
+
output_transcription_path="./output.srt",
|
743 |
+
log_transcription=True,
|
744 |
+
max_clients=4,
|
745 |
+
max_connection_time=600,
|
746 |
+
mute_audio_playback=False,
|
747 |
+
send_last_n_segments=10,
|
748 |
+
no_speech_thresh=0.45,
|
749 |
+
clip_audio=False,
|
750 |
+
same_output_threshold=10,
|
751 |
+
transcription_callback=None,
|
752 |
+
):
|
753 |
+
self.client = Client(
|
754 |
+
host,
|
755 |
+
port,
|
756 |
+
lang,
|
757 |
+
translate,
|
758 |
+
model,
|
759 |
+
srt_file_path=output_transcription_path,
|
760 |
+
use_vad=use_vad,
|
761 |
+
use_wss=use_wss,
|
762 |
+
log_transcription=log_transcription,
|
763 |
+
max_clients=max_clients,
|
764 |
+
max_connection_time=max_connection_time,
|
765 |
+
send_last_n_segments=send_last_n_segments,
|
766 |
+
no_speech_thresh=no_speech_thresh,
|
767 |
+
clip_audio=clip_audio,
|
768 |
+
same_output_threshold=same_output_threshold,
|
769 |
+
transcription_callback=transcription_callback,
|
770 |
+
)
|
771 |
+
|
772 |
+
if save_output_recording and not output_recording_filename.endswith(".wav"):
|
773 |
+
raise ValueError(f"Please provide a valid `output_recording_filename`: {output_recording_filename}")
|
774 |
+
if not output_transcription_path.endswith(".srt"):
|
775 |
+
raise ValueError(f"Please provide a valid `output_transcription_path`: {output_transcription_path}. The file extension should be `.srt`.")
|
776 |
+
TranscriptionTeeClient.__init__(
|
777 |
+
self,
|
778 |
+
[self.client],
|
779 |
+
save_output_recording=save_output_recording,
|
780 |
+
output_recording_filename=output_recording_filename,
|
781 |
+
mute_audio_playback=mute_audio_playback
|
782 |
+
)
|
whisper_live/server.py
ADDED
@@ -0,0 +1,446 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import time
|
3 |
+
import threading
|
4 |
+
import json
|
5 |
+
import functools
|
6 |
+
import logging
|
7 |
+
from enum import Enum
|
8 |
+
from typing import List, Optional
|
9 |
+
|
10 |
+
import numpy as np
|
11 |
+
from websockets.sync.server import serve
|
12 |
+
from websockets.exceptions import ConnectionClosed
|
13 |
+
from whisper_live.vad import VoiceActivityDetector
|
14 |
+
from whisper_live.backend.base import ServeClientBase
|
15 |
+
|
16 |
+
logging.basicConfig(level=logging.INFO)
|
17 |
+
|
18 |
+
|
19 |
+
class ClientManager:
|
20 |
+
def __init__(self, max_clients=4, max_connection_time=600):
|
21 |
+
"""
|
22 |
+
Initializes the ClientManager with specified limits on client connections and connection durations.
|
23 |
+
|
24 |
+
Args:
|
25 |
+
max_clients (int, optional): The maximum number of simultaneous client connections allowed. Defaults to 4.
|
26 |
+
max_connection_time (int, optional): The maximum duration (in seconds) a client can stay connected. Defaults
|
27 |
+
to 600 seconds (10 minutes).
|
28 |
+
"""
|
29 |
+
self.clients = {}
|
30 |
+
self.start_times = {}
|
31 |
+
self.max_clients = max_clients
|
32 |
+
self.max_connection_time = max_connection_time
|
33 |
+
|
34 |
+
def add_client(self, websocket, client):
|
35 |
+
"""
|
36 |
+
Adds a client and their connection start time to the tracking dictionaries.
|
37 |
+
|
38 |
+
Args:
|
39 |
+
websocket: The websocket associated with the client to add.
|
40 |
+
client: The client object to be added and tracked.
|
41 |
+
"""
|
42 |
+
self.clients[websocket] = client
|
43 |
+
self.start_times[websocket] = time.time()
|
44 |
+
|
45 |
+
def get_client(self, websocket):
|
46 |
+
"""
|
47 |
+
Retrieves a client associated with the given websocket.
|
48 |
+
|
49 |
+
Args:
|
50 |
+
websocket: The websocket associated with the client to retrieve.
|
51 |
+
|
52 |
+
Returns:
|
53 |
+
The client object if found, False otherwise.
|
54 |
+
"""
|
55 |
+
if websocket in self.clients:
|
56 |
+
return self.clients[websocket]
|
57 |
+
return False
|
58 |
+
|
59 |
+
def remove_client(self, websocket):
|
60 |
+
"""
|
61 |
+
Removes a client and their connection start time from the tracking dictionaries. Performs cleanup on the
|
62 |
+
client if necessary.
|
63 |
+
|
64 |
+
Args:
|
65 |
+
websocket: The websocket associated with the client to be removed.
|
66 |
+
"""
|
67 |
+
client = self.clients.pop(websocket, None)
|
68 |
+
if client:
|
69 |
+
client.cleanup()
|
70 |
+
self.start_times.pop(websocket, None)
|
71 |
+
|
72 |
+
def get_wait_time(self):
|
73 |
+
"""
|
74 |
+
Calculates the estimated wait time for new clients based on the remaining connection times of current clients.
|
75 |
+
|
76 |
+
Returns:
|
77 |
+
The estimated wait time in minutes for new clients to connect. Returns 0 if there are available slots.
|
78 |
+
"""
|
79 |
+
wait_time = None
|
80 |
+
for start_time in self.start_times.values():
|
81 |
+
current_client_time_remaining = self.max_connection_time - (time.time() - start_time)
|
82 |
+
if wait_time is None or current_client_time_remaining < wait_time:
|
83 |
+
wait_time = current_client_time_remaining
|
84 |
+
return wait_time / 60 if wait_time is not None else 0
|
85 |
+
|
86 |
+
def is_server_full(self, websocket, options):
|
87 |
+
"""
|
88 |
+
Checks if the server is at its maximum client capacity and sends a wait message to the client if necessary.
|
89 |
+
|
90 |
+
Args:
|
91 |
+
websocket: The websocket of the client attempting to connect.
|
92 |
+
options: A dictionary of options that may include the client's unique identifier.
|
93 |
+
|
94 |
+
Returns:
|
95 |
+
True if the server is full, False otherwise.
|
96 |
+
"""
|
97 |
+
if len(self.clients) >= self.max_clients:
|
98 |
+
wait_time = self.get_wait_time()
|
99 |
+
response = {"uid": options["uid"], "status": "WAIT", "message": wait_time}
|
100 |
+
websocket.send(json.dumps(response))
|
101 |
+
return True
|
102 |
+
return False
|
103 |
+
|
104 |
+
def is_client_timeout(self, websocket):
|
105 |
+
"""
|
106 |
+
Checks if a client has exceeded the maximum allowed connection time and disconnects them if so, issuing a warning.
|
107 |
+
|
108 |
+
Args:
|
109 |
+
websocket: The websocket associated with the client to check.
|
110 |
+
|
111 |
+
Returns:
|
112 |
+
True if the client's connection time has exceeded the maximum limit, False otherwise.
|
113 |
+
"""
|
114 |
+
elapsed_time = time.time() - self.start_times[websocket]
|
115 |
+
if elapsed_time >= self.max_connection_time:
|
116 |
+
self.clients[websocket].disconnect()
|
117 |
+
logging.warning(f"Client with uid '{self.clients[websocket].client_uid}' disconnected due to overtime.")
|
118 |
+
return True
|
119 |
+
return False
|
120 |
+
|
121 |
+
|
122 |
+
class BackendType(Enum):
|
123 |
+
FASTER_WHISPER = "faster_whisper"
|
124 |
+
TENSORRT = "tensorrt"
|
125 |
+
OPENVINO = "openvino"
|
126 |
+
|
127 |
+
@staticmethod
|
128 |
+
def valid_types() -> List[str]:
|
129 |
+
return [backend_type.value for backend_type in BackendType]
|
130 |
+
|
131 |
+
@staticmethod
|
132 |
+
def is_valid(backend: str) -> bool:
|
133 |
+
return backend in BackendType.valid_types()
|
134 |
+
|
135 |
+
def is_faster_whisper(self) -> bool:
|
136 |
+
return self == BackendType.FASTER_WHISPER
|
137 |
+
|
138 |
+
def is_tensorrt(self) -> bool:
|
139 |
+
return self == BackendType.TENSORRT
|
140 |
+
|
141 |
+
def is_openvino(self) -> bool:
|
142 |
+
return self == BackendType.OPENVINO
|
143 |
+
|
144 |
+
|
145 |
+
class TranscriptionServer:
|
146 |
+
RATE = 16000
|
147 |
+
|
148 |
+
def __init__(self):
|
149 |
+
self.client_manager = None
|
150 |
+
self.no_voice_activity_chunks = 0
|
151 |
+
self.use_vad = True
|
152 |
+
self.single_model = False
|
153 |
+
|
154 |
+
def initialize_client(
|
155 |
+
self, websocket, options, faster_whisper_custom_model_path,
|
156 |
+
whisper_tensorrt_path, trt_multilingual, trt_py_session=False,
|
157 |
+
):
|
158 |
+
client: Optional[ServeClientBase] = None
|
159 |
+
|
160 |
+
if self.backend.is_tensorrt():
|
161 |
+
try:
|
162 |
+
from whisper_live.backend.trt_backend import ServeClientTensorRT
|
163 |
+
client = ServeClientTensorRT(
|
164 |
+
websocket,
|
165 |
+
multilingual=trt_multilingual,
|
166 |
+
language=options["language"],
|
167 |
+
task=options["task"],
|
168 |
+
client_uid=options["uid"],
|
169 |
+
model=whisper_tensorrt_path,
|
170 |
+
single_model=self.single_model,
|
171 |
+
use_py_session=trt_py_session,
|
172 |
+
send_last_n_segments=options.get("send_last_n_segments", 10),
|
173 |
+
no_speech_thresh=options.get("no_speech_thresh", 0.45),
|
174 |
+
clip_audio=options.get("clip_audio", False),
|
175 |
+
same_output_threshold=options.get("same_output_threshold", 10),
|
176 |
+
)
|
177 |
+
logging.info("Running TensorRT backend.")
|
178 |
+
except Exception as e:
|
179 |
+
logging.error(f"TensorRT-LLM not supported: {e}")
|
180 |
+
self.client_uid = options["uid"]
|
181 |
+
websocket.send(json.dumps({
|
182 |
+
"uid": self.client_uid,
|
183 |
+
"status": "WARNING",
|
184 |
+
"message": "TensorRT-LLM not supported on Server yet. "
|
185 |
+
"Reverting to available backend: 'faster_whisper'"
|
186 |
+
}))
|
187 |
+
self.backend = BackendType.FASTER_WHISPER
|
188 |
+
|
189 |
+
if self.backend.is_openvino():
|
190 |
+
try:
|
191 |
+
from whisper_live.backend.openvino_backend import ServeClientOpenVINO
|
192 |
+
client = ServeClientOpenVINO(
|
193 |
+
websocket,
|
194 |
+
language=options["language"],
|
195 |
+
task=options["task"],
|
196 |
+
client_uid=options["uid"],
|
197 |
+
model=options["model"],
|
198 |
+
single_model=self.single_model,
|
199 |
+
send_last_n_segments=options.get("send_last_n_segments", 10),
|
200 |
+
no_speech_thresh=options.get("no_speech_thresh", 0.45),
|
201 |
+
clip_audio=options.get("clip_audio", False),
|
202 |
+
same_output_threshold=options.get("same_output_threshold", 10),
|
203 |
+
)
|
204 |
+
logging.info("Running OpenVINO backend.")
|
205 |
+
except Exception as e:
|
206 |
+
logging.error(f"OpenVINO not supported: {e}")
|
207 |
+
self.backend = BackendType.FASTER_WHISPER
|
208 |
+
self.client_uid = options["uid"]
|
209 |
+
websocket.send(json.dumps({
|
210 |
+
"uid": self.client_uid,
|
211 |
+
"status": "WARNING",
|
212 |
+
"message": "OpenVINO not supported on Server yet. "
|
213 |
+
"Reverting to available backend: 'faster_whisper'"
|
214 |
+
}))
|
215 |
+
|
216 |
+
try:
|
217 |
+
if self.backend.is_faster_whisper():
|
218 |
+
from whisper_live.backend.faster_whisper_backend import ServeClientFasterWhisper
|
219 |
+
if faster_whisper_custom_model_path is not None and os.path.exists(faster_whisper_custom_model_path):
|
220 |
+
logging.info(f"Using custom model {faster_whisper_custom_model_path}")
|
221 |
+
options["model"] = faster_whisper_custom_model_path
|
222 |
+
client = ServeClientFasterWhisper(
|
223 |
+
websocket,
|
224 |
+
language=options["language"],
|
225 |
+
task=options["task"],
|
226 |
+
client_uid=options["uid"],
|
227 |
+
model=options["model"],
|
228 |
+
initial_prompt=options.get("initial_prompt"),
|
229 |
+
vad_parameters=options.get("vad_parameters"),
|
230 |
+
use_vad=self.use_vad,
|
231 |
+
single_model=self.single_model,
|
232 |
+
send_last_n_segments=options.get("send_last_n_segments", 10),
|
233 |
+
no_speech_thresh=options.get("no_speech_thresh", 0.45),
|
234 |
+
clip_audio=options.get("clip_audio", False),
|
235 |
+
same_output_threshold=options.get("same_output_threshold", 10),
|
236 |
+
)
|
237 |
+
|
238 |
+
logging.info("Running faster_whisper backend.")
|
239 |
+
except Exception as e:
|
240 |
+
logging.error(e)
|
241 |
+
return
|
242 |
+
|
243 |
+
if client is None:
|
244 |
+
raise ValueError(f"Backend type {self.backend.value} not recognised or not handled.")
|
245 |
+
|
246 |
+
self.client_manager.add_client(websocket, client)
|
247 |
+
|
248 |
+
def get_audio_from_websocket(self, websocket):
|
249 |
+
"""
|
250 |
+
Receives audio buffer from websocket and creates a numpy array out of it.
|
251 |
+
|
252 |
+
Args:
|
253 |
+
websocket: The websocket to receive audio from.
|
254 |
+
|
255 |
+
Returns:
|
256 |
+
A numpy array containing the audio.
|
257 |
+
"""
|
258 |
+
frame_data = websocket.recv()
|
259 |
+
if frame_data == b"END_OF_AUDIO":
|
260 |
+
return False
|
261 |
+
return np.frombuffer(frame_data, dtype=np.float32)
|
262 |
+
|
263 |
+
def handle_new_connection(self, websocket, faster_whisper_custom_model_path,
|
264 |
+
whisper_tensorrt_path, trt_multilingual, trt_py_session=False):
|
265 |
+
try:
|
266 |
+
logging.info("New client connected")
|
267 |
+
options = websocket.recv()
|
268 |
+
options = json.loads(options)
|
269 |
+
|
270 |
+
if self.client_manager is None:
|
271 |
+
max_clients = options.get('max_clients', 4)
|
272 |
+
max_connection_time = options.get('max_connection_time', 600)
|
273 |
+
self.client_manager = ClientManager(max_clients, max_connection_time)
|
274 |
+
|
275 |
+
self.use_vad = options.get('use_vad')
|
276 |
+
if self.client_manager.is_server_full(websocket, options):
|
277 |
+
websocket.close()
|
278 |
+
return False # Indicates that the connection should not continue
|
279 |
+
|
280 |
+
if self.backend.is_tensorrt():
|
281 |
+
self.vad_detector = VoiceActivityDetector(frame_rate=self.RATE)
|
282 |
+
self.initialize_client(websocket, options, faster_whisper_custom_model_path,
|
283 |
+
whisper_tensorrt_path, trt_multilingual, trt_py_session=trt_py_session)
|
284 |
+
return True
|
285 |
+
except json.JSONDecodeError:
|
286 |
+
logging.error("Failed to decode JSON from client")
|
287 |
+
return False
|
288 |
+
except ConnectionClosed:
|
289 |
+
logging.info("Connection closed by client")
|
290 |
+
return False
|
291 |
+
except Exception as e:
|
292 |
+
logging.error(f"Error during new connection initialization: {str(e)}")
|
293 |
+
return False
|
294 |
+
|
295 |
+
def process_audio_frames(self, websocket):
|
296 |
+
frame_np = self.get_audio_from_websocket(websocket)
|
297 |
+
client = self.client_manager.get_client(websocket)
|
298 |
+
if frame_np is False:
|
299 |
+
if self.backend.is_tensorrt():
|
300 |
+
client.set_eos(True)
|
301 |
+
return False
|
302 |
+
|
303 |
+
if self.backend.is_tensorrt():
|
304 |
+
voice_active = self.voice_activity(websocket, frame_np)
|
305 |
+
if voice_active:
|
306 |
+
self.no_voice_activity_chunks = 0
|
307 |
+
client.set_eos(False)
|
308 |
+
if self.use_vad and not voice_active:
|
309 |
+
return True
|
310 |
+
|
311 |
+
client.add_frames(frame_np)
|
312 |
+
return True
|
313 |
+
|
314 |
+
def recv_audio(self,
|
315 |
+
websocket,
|
316 |
+
backend: BackendType = BackendType.FASTER_WHISPER,
|
317 |
+
faster_whisper_custom_model_path=None,
|
318 |
+
whisper_tensorrt_path=None,
|
319 |
+
trt_multilingual=False,
|
320 |
+
trt_py_session=False):
|
321 |
+
"""
|
322 |
+
Receive audio chunks from a client in an infinite loop.
|
323 |
+
|
324 |
+
Continuously receives audio frames from a connected client
|
325 |
+
over a WebSocket connection. It processes the audio frames using a
|
326 |
+
voice activity detection (VAD) model to determine if they contain speech
|
327 |
+
or not. If the audio frame contains speech, it is added to the client's
|
328 |
+
audio data for ASR.
|
329 |
+
If the maximum number of clients is reached, the method sends a
|
330 |
+
"WAIT" status to the client, indicating that they should wait
|
331 |
+
until a slot is available.
|
332 |
+
If a client's connection exceeds the maximum allowed time, it will
|
333 |
+
be disconnected, and the client's resources will be cleaned up.
|
334 |
+
|
335 |
+
Args:
|
336 |
+
websocket (WebSocket): The WebSocket connection for the client.
|
337 |
+
backend (str): The backend to run the server with.
|
338 |
+
faster_whisper_custom_model_path (str): path to custom faster whisper model.
|
339 |
+
whisper_tensorrt_path (str): Required for tensorrt backend.
|
340 |
+
trt_multilingual(bool): Only used for tensorrt, True if multilingual model.
|
341 |
+
|
342 |
+
Raises:
|
343 |
+
Exception: If there is an error during the audio frame processing.
|
344 |
+
"""
|
345 |
+
self.backend = backend
|
346 |
+
if not self.handle_new_connection(websocket, faster_whisper_custom_model_path,
|
347 |
+
whisper_tensorrt_path, trt_multilingual, trt_py_session=trt_py_session):
|
348 |
+
return
|
349 |
+
|
350 |
+
try:
|
351 |
+
while not self.client_manager.is_client_timeout(websocket):
|
352 |
+
if not self.process_audio_frames(websocket):
|
353 |
+
break
|
354 |
+
except ConnectionClosed:
|
355 |
+
logging.info("Connection closed by client")
|
356 |
+
except Exception as e:
|
357 |
+
logging.error(f"Unexpected error: {str(e)}")
|
358 |
+
finally:
|
359 |
+
if self.client_manager.get_client(websocket):
|
360 |
+
self.cleanup(websocket)
|
361 |
+
websocket.close()
|
362 |
+
del websocket
|
363 |
+
|
364 |
+
def run(self,
|
365 |
+
host,
|
366 |
+
port=9090,
|
367 |
+
backend="tensorrt",
|
368 |
+
faster_whisper_custom_model_path=None,
|
369 |
+
whisper_tensorrt_path=None,
|
370 |
+
trt_multilingual=False,
|
371 |
+
trt_py_session=False,
|
372 |
+
single_model=False):
|
373 |
+
"""
|
374 |
+
Run the transcription server.
|
375 |
+
|
376 |
+
Args:
|
377 |
+
host (str): The host address to bind the server.
|
378 |
+
port (int): The port number to bind the server.
|
379 |
+
"""
|
380 |
+
if faster_whisper_custom_model_path is not None and not os.path.exists(faster_whisper_custom_model_path):
|
381 |
+
raise ValueError(f"Custom faster_whisper model '{faster_whisper_custom_model_path}' is not a valid path.")
|
382 |
+
if whisper_tensorrt_path is not None and not os.path.exists(whisper_tensorrt_path):
|
383 |
+
raise ValueError(f"TensorRT model '{whisper_tensorrt_path}' is not a valid path.")
|
384 |
+
if single_model:
|
385 |
+
if faster_whisper_custom_model_path or whisper_tensorrt_path:
|
386 |
+
logging.info("Custom model option was provided. Switching to single model mode.")
|
387 |
+
self.single_model = True
|
388 |
+
# TODO: load model initially
|
389 |
+
else:
|
390 |
+
logging.info("Single model mode currently only works with custom models.")
|
391 |
+
if not BackendType.is_valid(backend):
|
392 |
+
raise ValueError(f"{backend} is not a valid backend type. Choose backend from {BackendType.valid_types()}")
|
393 |
+
with serve(
|
394 |
+
functools.partial(
|
395 |
+
self.recv_audio,
|
396 |
+
backend=BackendType(backend),
|
397 |
+
faster_whisper_custom_model_path=faster_whisper_custom_model_path,
|
398 |
+
whisper_tensorrt_path=whisper_tensorrt_path,
|
399 |
+
trt_multilingual=trt_multilingual,
|
400 |
+
trt_py_session=trt_py_session,
|
401 |
+
),
|
402 |
+
host,
|
403 |
+
port
|
404 |
+
) as server:
|
405 |
+
server.serve_forever()
|
406 |
+
|
407 |
+
def voice_activity(self, websocket, frame_np):
|
408 |
+
"""
|
409 |
+
Evaluates the voice activity in a given audio frame and manages the state of voice activity detection.
|
410 |
+
|
411 |
+
This method uses the configured voice activity detection (VAD) model to assess whether the given audio frame
|
412 |
+
contains speech. If the VAD model detects no voice activity for more than three consecutive frames,
|
413 |
+
it sets an end-of-speech (EOS) flag for the associated client. This method aims to efficiently manage
|
414 |
+
speech detection to improve subsequent processing steps.
|
415 |
+
|
416 |
+
Args:
|
417 |
+
websocket: The websocket associated with the current client. Used to retrieve the client object
|
418 |
+
from the client manager for state management.
|
419 |
+
frame_np (numpy.ndarray): The audio frame to be analyzed. This should be a NumPy array containing
|
420 |
+
the audio data for the current frame.
|
421 |
+
|
422 |
+
Returns:
|
423 |
+
bool: True if voice activity is detected in the current frame, False otherwise. When returning False
|
424 |
+
after detecting no voice activity for more than three consecutive frames, it also triggers the
|
425 |
+
end-of-speech (EOS) flag for the client.
|
426 |
+
"""
|
427 |
+
if not self.vad_detector(frame_np):
|
428 |
+
self.no_voice_activity_chunks += 1
|
429 |
+
if self.no_voice_activity_chunks > 3:
|
430 |
+
client = self.client_manager.get_client(websocket)
|
431 |
+
if not client.eos:
|
432 |
+
client.set_eos(True)
|
433 |
+
time.sleep(0.1) # Sleep 100m; wait some voice activity.
|
434 |
+
return False
|
435 |
+
return True
|
436 |
+
|
437 |
+
def cleanup(self, websocket):
|
438 |
+
"""
|
439 |
+
Cleans up resources associated with a given client's websocket.
|
440 |
+
|
441 |
+
Args:
|
442 |
+
websocket: The websocket associated with the client to be cleaned up.
|
443 |
+
"""
|
444 |
+
if self.client_manager.get_client(websocket):
|
445 |
+
self.client_manager.remove_client(websocket)
|
446 |
+
|
whisper_live/transcriber/__init__.py
ADDED
File without changes
|
whisper_live/transcriber/tensorrt_utils.py
ADDED
@@ -0,0 +1,364 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
2 |
+
# SPDX-License-Identifier: Apache-2.0
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
import logging
|
16 |
+
import os
|
17 |
+
from collections import defaultdict
|
18 |
+
from functools import lru_cache
|
19 |
+
from pathlib import Path
|
20 |
+
from subprocess import CalledProcessError, run
|
21 |
+
from typing import Dict, Iterable, List, Optional, TextIO, Tuple, Union
|
22 |
+
|
23 |
+
import kaldialign
|
24 |
+
import numpy as np
|
25 |
+
import soundfile
|
26 |
+
import av
|
27 |
+
import wave
|
28 |
+
import torch
|
29 |
+
import torch.nn.functional as F
|
30 |
+
from whisper_live.utils import resample
|
31 |
+
|
32 |
+
|
33 |
+
Pathlike = Union[str, Path]
|
34 |
+
|
35 |
+
SAMPLE_RATE = 16000
|
36 |
+
N_FFT = 400
|
37 |
+
HOP_LENGTH = 160
|
38 |
+
CHUNK_LENGTH = 30
|
39 |
+
N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE # 480000 samples in a 30-second chunk
|
40 |
+
|
41 |
+
|
42 |
+
def load_audio(file: str, sr: int = 16000):
|
43 |
+
"""
|
44 |
+
Open an audio file, resample it, and read as a mono waveform.
|
45 |
+
|
46 |
+
Parameters
|
47 |
+
----------
|
48 |
+
file: str
|
49 |
+
The audio file to open.
|
50 |
+
|
51 |
+
sr: int
|
52 |
+
The sample rate to resample the audio if necessary.
|
53 |
+
|
54 |
+
Returns
|
55 |
+
-------
|
56 |
+
A NumPy array containing the audio waveform, in float32 dtype.
|
57 |
+
"""
|
58 |
+
resampled_file = resample(file, sr)
|
59 |
+
|
60 |
+
with wave.open(resampled_file, "rb") as wav_file:
|
61 |
+
num_frames = wav_file.getnframes()
|
62 |
+
raw_data = wav_file.readframes(num_frames)
|
63 |
+
|
64 |
+
audio_data = np.frombuffer(raw_data, dtype=np.int16)
|
65 |
+
|
66 |
+
audio_data = audio_data.astype(np.float32) / 32768.0
|
67 |
+
|
68 |
+
return audio_data
|
69 |
+
|
70 |
+
|
71 |
+
def load_audio_wav_format(wav_path):
|
72 |
+
# make sure audio in .wav format
|
73 |
+
assert wav_path.endswith(
|
74 |
+
'.wav'), f"Only support .wav format, but got {wav_path}"
|
75 |
+
waveform, sample_rate = soundfile.read(wav_path)
|
76 |
+
assert sample_rate == 16000, f"Only support 16k sample rate, but got {sample_rate}"
|
77 |
+
return waveform, sample_rate
|
78 |
+
|
79 |
+
|
80 |
+
def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
|
81 |
+
"""
|
82 |
+
Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
|
83 |
+
"""
|
84 |
+
if torch.is_tensor(array):
|
85 |
+
if array.shape[axis] > length:
|
86 |
+
array = array.index_select(dim=axis,
|
87 |
+
index=torch.arange(length,
|
88 |
+
device=array.device))
|
89 |
+
|
90 |
+
if array.shape[axis] < length:
|
91 |
+
pad_widths = [(0, 0)] * array.ndim
|
92 |
+
pad_widths[axis] = (0, length - array.shape[axis])
|
93 |
+
array = F.pad(array,
|
94 |
+
[pad for sizes in pad_widths[::-1] for pad in sizes])
|
95 |
+
else:
|
96 |
+
if array.shape[axis] > length:
|
97 |
+
array = array.take(indices=range(length), axis=axis)
|
98 |
+
|
99 |
+
if array.shape[axis] < length:
|
100 |
+
pad_widths = [(0, 0)] * array.ndim
|
101 |
+
pad_widths[axis] = (0, length - array.shape[axis])
|
102 |
+
array = np.pad(array, pad_widths)
|
103 |
+
|
104 |
+
return array
|
105 |
+
|
106 |
+
|
107 |
+
@lru_cache(maxsize=None)
|
108 |
+
def mel_filters(device,
|
109 |
+
n_mels: int,
|
110 |
+
mel_filters_dir: str = None) -> torch.Tensor:
|
111 |
+
"""
|
112 |
+
load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
|
113 |
+
Allows decoupling librosa dependency; saved using:
|
114 |
+
|
115 |
+
np.savez_compressed(
|
116 |
+
"mel_filters.npz",
|
117 |
+
mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
|
118 |
+
)
|
119 |
+
"""
|
120 |
+
assert n_mels in {80, 128}, f"Unsupported n_mels: {n_mels}"
|
121 |
+
if mel_filters_dir is None:
|
122 |
+
mel_filters_path = os.path.join(os.path.dirname(__file__), "assets",
|
123 |
+
"mel_filters.npz")
|
124 |
+
else:
|
125 |
+
mel_filters_path = os.path.join(mel_filters_dir, "mel_filters.npz")
|
126 |
+
with np.load(mel_filters_path) as f:
|
127 |
+
return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
|
128 |
+
|
129 |
+
|
130 |
+
def log_mel_spectrogram(
|
131 |
+
audio: Union[str, np.ndarray, torch.Tensor],
|
132 |
+
n_mels: int,
|
133 |
+
padding: int = 0,
|
134 |
+
device: Optional[Union[str, torch.device]] = None,
|
135 |
+
return_duration: bool = False,
|
136 |
+
mel_filters_dir: str = None,
|
137 |
+
):
|
138 |
+
"""
|
139 |
+
Compute the log-Mel spectrogram of
|
140 |
+
|
141 |
+
Parameters
|
142 |
+
----------
|
143 |
+
audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
|
144 |
+
The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
|
145 |
+
|
146 |
+
n_mels: int
|
147 |
+
The number of Mel-frequency filters, only 80 and 128 are supported
|
148 |
+
|
149 |
+
padding: int
|
150 |
+
Number of zero samples to pad to the right
|
151 |
+
|
152 |
+
device: Optional[Union[str, torch.device]]
|
153 |
+
If given, the audio tensor is moved to this device before STFT
|
154 |
+
|
155 |
+
Returns
|
156 |
+
-------
|
157 |
+
torch.Tensor, shape = (80 or 128, n_frames)
|
158 |
+
A Tensor that contains the Mel spectrogram
|
159 |
+
"""
|
160 |
+
if not torch.is_tensor(audio):
|
161 |
+
if isinstance(audio, str):
|
162 |
+
if audio.endswith('.wav'):
|
163 |
+
audio, _ = load_audio_wav_format(audio)
|
164 |
+
else:
|
165 |
+
audio = load_audio(audio)
|
166 |
+
assert isinstance(audio,
|
167 |
+
np.ndarray), f"Unsupported audio type: {type(audio)}"
|
168 |
+
duration = audio.shape[-1] / SAMPLE_RATE
|
169 |
+
audio = pad_or_trim(audio, N_SAMPLES)
|
170 |
+
audio = audio.astype(np.float32)
|
171 |
+
audio = torch.from_numpy(audio)
|
172 |
+
|
173 |
+
if device is not None:
|
174 |
+
audio = audio.to(device)
|
175 |
+
if padding > 0:
|
176 |
+
audio = F.pad(audio, (0, padding))
|
177 |
+
window = torch.hann_window(N_FFT).to(audio.device)
|
178 |
+
stft = torch.stft(audio,
|
179 |
+
N_FFT,
|
180 |
+
HOP_LENGTH,
|
181 |
+
window=window,
|
182 |
+
return_complex=True)
|
183 |
+
magnitudes = stft[..., :-1].abs()**2
|
184 |
+
|
185 |
+
filters = mel_filters(audio.device, n_mels, mel_filters_dir)
|
186 |
+
mel_spec = filters @ magnitudes
|
187 |
+
|
188 |
+
log_spec = torch.clamp(mel_spec, min=1e-10).log10()
|
189 |
+
log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
|
190 |
+
log_spec = (log_spec + 4.0) / 4.0
|
191 |
+
if return_duration:
|
192 |
+
return log_spec, duration
|
193 |
+
else:
|
194 |
+
return log_spec
|
195 |
+
|
196 |
+
|
197 |
+
def store_transcripts(filename: Pathlike, texts: Iterable[Tuple[str, str,
|
198 |
+
str]]) -> None:
|
199 |
+
"""Save predicted results and reference transcripts to a file.
|
200 |
+
https://github.com/k2-fsa/icefall/blob/master/icefall/utils.py
|
201 |
+
Args:
|
202 |
+
filename:
|
203 |
+
File to save the results to.
|
204 |
+
texts:
|
205 |
+
An iterable of tuples. The first element is the cur_id, the second is
|
206 |
+
the reference transcript and the third element is the predicted result.
|
207 |
+
Returns:
|
208 |
+
Return None.
|
209 |
+
"""
|
210 |
+
with open(filename, "w") as f:
|
211 |
+
for cut_id, ref, hyp in texts:
|
212 |
+
print(f"{cut_id}:\tref={ref}", file=f)
|
213 |
+
print(f"{cut_id}:\thyp={hyp}", file=f)
|
214 |
+
|
215 |
+
|
216 |
+
def write_error_stats( # noqa: C901
|
217 |
+
f: TextIO,
|
218 |
+
test_set_name: str,
|
219 |
+
results: List[Tuple[str, str]],
|
220 |
+
enable_log: bool = True,
|
221 |
+
) -> float:
|
222 |
+
"""Write statistics based on predicted results and reference transcripts.
|
223 |
+
https://github.com/k2-fsa/icefall/blob/master/icefall/utils.py
|
224 |
+
It will write the following to the given file:
|
225 |
+
|
226 |
+
- WER
|
227 |
+
- number of insertions, deletions, substitutions, corrects and total
|
228 |
+
reference words. For example::
|
229 |
+
|
230 |
+
Errors: 23 insertions, 57 deletions, 212 substitutions, over 2606
|
231 |
+
reference words (2337 correct)
|
232 |
+
|
233 |
+
- The difference between the reference transcript and predicted result.
|
234 |
+
An instance is given below::
|
235 |
+
|
236 |
+
THE ASSOCIATION OF (EDISON->ADDISON) ILLUMINATING COMPANIES
|
237 |
+
|
238 |
+
The above example shows that the reference word is `EDISON`,
|
239 |
+
but it is predicted to `ADDISON` (a substitution error).
|
240 |
+
|
241 |
+
Another example is::
|
242 |
+
|
243 |
+
FOR THE FIRST DAY (SIR->*) I THINK
|
244 |
+
|
245 |
+
The reference word `SIR` is missing in the predicted
|
246 |
+
results (a deletion error).
|
247 |
+
results:
|
248 |
+
An iterable of tuples. The first element is the cur_id, the second is
|
249 |
+
the reference transcript and the third element is the predicted result.
|
250 |
+
enable_log:
|
251 |
+
If True, also print detailed WER to the console.
|
252 |
+
Otherwise, it is written only to the given file.
|
253 |
+
Returns:
|
254 |
+
Return None.
|
255 |
+
"""
|
256 |
+
subs: Dict[Tuple[str, str], int] = defaultdict(int)
|
257 |
+
ins: Dict[str, int] = defaultdict(int)
|
258 |
+
dels: Dict[str, int] = defaultdict(int)
|
259 |
+
|
260 |
+
# `words` stores counts per word, as follows:
|
261 |
+
# corr, ref_sub, hyp_sub, ins, dels
|
262 |
+
words: Dict[str, List[int]] = defaultdict(lambda: [0, 0, 0, 0, 0])
|
263 |
+
num_corr = 0
|
264 |
+
ERR = "*"
|
265 |
+
for cut_id, ref, hyp in results:
|
266 |
+
ali = kaldialign.align(ref, hyp, ERR)
|
267 |
+
for ref_word, hyp_word in ali:
|
268 |
+
if ref_word == ERR:
|
269 |
+
ins[hyp_word] += 1
|
270 |
+
words[hyp_word][3] += 1
|
271 |
+
elif hyp_word == ERR:
|
272 |
+
dels[ref_word] += 1
|
273 |
+
words[ref_word][4] += 1
|
274 |
+
elif hyp_word != ref_word:
|
275 |
+
subs[(ref_word, hyp_word)] += 1
|
276 |
+
words[ref_word][1] += 1
|
277 |
+
words[hyp_word][2] += 1
|
278 |
+
else:
|
279 |
+
words[ref_word][0] += 1
|
280 |
+
num_corr += 1
|
281 |
+
ref_len = sum([len(r) for _, r, _ in results])
|
282 |
+
sub_errs = sum(subs.values())
|
283 |
+
ins_errs = sum(ins.values())
|
284 |
+
del_errs = sum(dels.values())
|
285 |
+
tot_errs = sub_errs + ins_errs + del_errs
|
286 |
+
tot_err_rate = "%.2f" % (100.0 * tot_errs / ref_len)
|
287 |
+
|
288 |
+
if enable_log:
|
289 |
+
logging.info(f"[{test_set_name}] %WER {tot_errs / ref_len:.2%} "
|
290 |
+
f"[{tot_errs} / {ref_len}, {ins_errs} ins, "
|
291 |
+
f"{del_errs} del, {sub_errs} sub ]")
|
292 |
+
|
293 |
+
print(f"%WER = {tot_err_rate}", file=f)
|
294 |
+
print(
|
295 |
+
f"Errors: {ins_errs} insertions, {del_errs} deletions, "
|
296 |
+
f"{sub_errs} substitutions, over {ref_len} reference "
|
297 |
+
f"words ({num_corr} correct)",
|
298 |
+
file=f,
|
299 |
+
)
|
300 |
+
print(
|
301 |
+
"Search below for sections starting with PER-UTT DETAILS:, "
|
302 |
+
"SUBSTITUTIONS:, DELETIONS:, INSERTIONS:, PER-WORD STATS:",
|
303 |
+
file=f,
|
304 |
+
)
|
305 |
+
|
306 |
+
print("", file=f)
|
307 |
+
print("PER-UTT DETAILS: corr or (ref->hyp) ", file=f)
|
308 |
+
for cut_id, ref, hyp in results:
|
309 |
+
ali = kaldialign.align(ref, hyp, ERR)
|
310 |
+
combine_successive_errors = True
|
311 |
+
if combine_successive_errors:
|
312 |
+
ali = [[[x], [y]] for x, y in ali]
|
313 |
+
for i in range(len(ali) - 1):
|
314 |
+
if ali[i][0] != ali[i][1] and ali[i + 1][0] != ali[i + 1][1]:
|
315 |
+
ali[i + 1][0] = ali[i][0] + ali[i + 1][0]
|
316 |
+
ali[i + 1][1] = ali[i][1] + ali[i + 1][1]
|
317 |
+
ali[i] = [[], []]
|
318 |
+
ali = [[
|
319 |
+
list(filter(lambda a: a != ERR, x)),
|
320 |
+
list(filter(lambda a: a != ERR, y)),
|
321 |
+
] for x, y in ali]
|
322 |
+
ali = list(filter(lambda x: x != [[], []], ali))
|
323 |
+
ali = [[
|
324 |
+
ERR if x == [] else " ".join(x),
|
325 |
+
ERR if y == [] else " ".join(y),
|
326 |
+
] for x, y in ali]
|
327 |
+
|
328 |
+
print(
|
329 |
+
f"{cut_id}:\t" + " ".join((ref_word if ref_word == hyp_word else
|
330 |
+
f"({ref_word}->{hyp_word})"
|
331 |
+
for ref_word, hyp_word in ali)),
|
332 |
+
file=f,
|
333 |
+
)
|
334 |
+
|
335 |
+
print("", file=f)
|
336 |
+
print("SUBSTITUTIONS: count ref -> hyp", file=f)
|
337 |
+
|
338 |
+
for count, (ref, hyp) in sorted([(v, k) for k, v in subs.items()],
|
339 |
+
reverse=True):
|
340 |
+
print(f"{count} {ref} -> {hyp}", file=f)
|
341 |
+
|
342 |
+
print("", file=f)
|
343 |
+
print("DELETIONS: count ref", file=f)
|
344 |
+
for count, ref in sorted([(v, k) for k, v in dels.items()], reverse=True):
|
345 |
+
print(f"{count} {ref}", file=f)
|
346 |
+
|
347 |
+
print("", file=f)
|
348 |
+
print("INSERTIONS: count hyp", file=f)
|
349 |
+
for count, hyp in sorted([(v, k) for k, v in ins.items()], reverse=True):
|
350 |
+
print(f"{count} {hyp}", file=f)
|
351 |
+
|
352 |
+
print("", file=f)
|
353 |
+
print("PER-WORD STATS: word corr tot_errs count_in_ref count_in_hyp",
|
354 |
+
file=f)
|
355 |
+
for _, word, counts in sorted([(sum(v[1:]), k, v)
|
356 |
+
for k, v in words.items()],
|
357 |
+
reverse=True):
|
358 |
+
(corr, ref_sub, hyp_sub, ins, dels) = counts
|
359 |
+
tot_errs = ref_sub + hyp_sub + ins + dels
|
360 |
+
ref_count = corr + ref_sub + dels
|
361 |
+
hyp_count = corr + hyp_sub + ins
|
362 |
+
|
363 |
+
print(f"{word} {corr} {tot_errs} {ref_count} {hyp_count}", file=f)
|
364 |
+
return float(tot_err_rate)
|
whisper_live/transcriber/transcriber_faster_whisper.py
ADDED
@@ -0,0 +1,1889 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# original https://github.com/guillaumekln/faster-whisper/blob/master/faster_whisper/transcribe.py
|
2 |
+
|
3 |
+
import itertools
|
4 |
+
import json
|
5 |
+
import logging
|
6 |
+
import os
|
7 |
+
import zlib
|
8 |
+
|
9 |
+
from dataclasses import asdict, dataclass
|
10 |
+
from inspect import signature
|
11 |
+
from math import ceil
|
12 |
+
from typing import BinaryIO, Iterable, List, Optional, Tuple, Union
|
13 |
+
from warnings import warn
|
14 |
+
|
15 |
+
import ctranslate2
|
16 |
+
import numpy as np
|
17 |
+
import tokenizers
|
18 |
+
|
19 |
+
from tqdm import tqdm
|
20 |
+
|
21 |
+
from faster_whisper.audio import decode_audio, pad_or_trim
|
22 |
+
from faster_whisper.feature_extractor import FeatureExtractor
|
23 |
+
from faster_whisper.tokenizer import _LANGUAGE_CODES, Tokenizer
|
24 |
+
from faster_whisper.utils import download_model, format_timestamp, get_end, get_logger
|
25 |
+
from faster_whisper.vad import (
|
26 |
+
SpeechTimestampsMap,
|
27 |
+
VadOptions,
|
28 |
+
collect_chunks,
|
29 |
+
get_speech_timestamps,
|
30 |
+
merge_segments,
|
31 |
+
)
|
32 |
+
|
33 |
+
|
34 |
+
@dataclass
|
35 |
+
class Word:
|
36 |
+
start: float
|
37 |
+
end: float
|
38 |
+
word: str
|
39 |
+
probability: float
|
40 |
+
|
41 |
+
def _asdict(self):
|
42 |
+
warn(
|
43 |
+
"Word._asdict() method is deprecated, use dataclasses.asdict(Word) instead",
|
44 |
+
DeprecationWarning,
|
45 |
+
2,
|
46 |
+
)
|
47 |
+
return asdict(self)
|
48 |
+
|
49 |
+
|
50 |
+
@dataclass
|
51 |
+
class Segment:
|
52 |
+
id: int
|
53 |
+
seek: int
|
54 |
+
start: float
|
55 |
+
end: float
|
56 |
+
text: str
|
57 |
+
tokens: List[int]
|
58 |
+
avg_logprob: float
|
59 |
+
compression_ratio: float
|
60 |
+
no_speech_prob: float
|
61 |
+
words: Optional[List[Word]]
|
62 |
+
temperature: Optional[float]
|
63 |
+
|
64 |
+
def _asdict(self):
|
65 |
+
warn(
|
66 |
+
"Segment._asdict() method is deprecated, use dataclasses.asdict(Segment) instead",
|
67 |
+
DeprecationWarning,
|
68 |
+
2,
|
69 |
+
)
|
70 |
+
return asdict(self)
|
71 |
+
|
72 |
+
|
73 |
+
@dataclass
|
74 |
+
class TranscriptionOptions:
|
75 |
+
beam_size: int
|
76 |
+
best_of: int
|
77 |
+
patience: float
|
78 |
+
length_penalty: float
|
79 |
+
repetition_penalty: float
|
80 |
+
no_repeat_ngram_size: int
|
81 |
+
log_prob_threshold: Optional[float]
|
82 |
+
no_speech_threshold: Optional[float]
|
83 |
+
compression_ratio_threshold: Optional[float]
|
84 |
+
condition_on_previous_text: bool
|
85 |
+
prompt_reset_on_temperature: float
|
86 |
+
temperatures: List[float]
|
87 |
+
initial_prompt: Optional[Union[str, Iterable[int]]]
|
88 |
+
prefix: Optional[str]
|
89 |
+
suppress_blank: bool
|
90 |
+
suppress_tokens: Optional[List[int]]
|
91 |
+
without_timestamps: bool
|
92 |
+
max_initial_timestamp: float
|
93 |
+
word_timestamps: bool
|
94 |
+
prepend_punctuations: str
|
95 |
+
append_punctuations: str
|
96 |
+
multilingual: bool
|
97 |
+
max_new_tokens: Optional[int]
|
98 |
+
clip_timestamps: Union[str, List[float]]
|
99 |
+
hallucination_silence_threshold: Optional[float]
|
100 |
+
hotwords: Optional[str]
|
101 |
+
|
102 |
+
|
103 |
+
@dataclass
|
104 |
+
class TranscriptionInfo:
|
105 |
+
language: str
|
106 |
+
language_probability: float
|
107 |
+
duration: float
|
108 |
+
duration_after_vad: float
|
109 |
+
all_language_probs: Optional[List[Tuple[str, float]]]
|
110 |
+
transcription_options: TranscriptionOptions
|
111 |
+
vad_options: VadOptions
|
112 |
+
|
113 |
+
|
114 |
+
class BatchedInferencePipeline:
|
115 |
+
def __init__(
|
116 |
+
self,
|
117 |
+
model,
|
118 |
+
):
|
119 |
+
self.model: WhisperModel = model
|
120 |
+
self.last_speech_timestamp = 0.0
|
121 |
+
|
122 |
+
def forward(self, features, tokenizer, chunks_metadata, options):
|
123 |
+
encoder_output, outputs = self.generate_segment_batched(
|
124 |
+
features, tokenizer, options
|
125 |
+
)
|
126 |
+
|
127 |
+
segmented_outputs = []
|
128 |
+
segment_sizes = []
|
129 |
+
for chunk_metadata, output in zip(chunks_metadata, outputs):
|
130 |
+
duration = chunk_metadata["end_time"] - chunk_metadata["start_time"]
|
131 |
+
segment_size = int(ceil(duration) * self.model.frames_per_second)
|
132 |
+
segment_sizes.append(segment_size)
|
133 |
+
(
|
134 |
+
subsegments,
|
135 |
+
seek,
|
136 |
+
single_timestamp_ending,
|
137 |
+
) = self.model._split_segments_by_timestamps(
|
138 |
+
tokenizer=tokenizer,
|
139 |
+
tokens=output["tokens"],
|
140 |
+
time_offset=chunk_metadata["start_time"],
|
141 |
+
segment_size=segment_size,
|
142 |
+
segment_duration=duration,
|
143 |
+
seek=0,
|
144 |
+
)
|
145 |
+
segmented_outputs.append(
|
146 |
+
[
|
147 |
+
dict(
|
148 |
+
text=tokenizer.decode(subsegment["tokens"]),
|
149 |
+
avg_logprob=output["avg_logprob"],
|
150 |
+
no_speech_prob=output["no_speech_prob"],
|
151 |
+
tokens=subsegment["tokens"],
|
152 |
+
start=subsegment["start"],
|
153 |
+
end=subsegment["end"],
|
154 |
+
compression_ratio=get_compression_ratio(
|
155 |
+
tokenizer.decode(subsegment["tokens"])
|
156 |
+
),
|
157 |
+
seek=int(
|
158 |
+
chunk_metadata["start_time"] * self.model.frames_per_second
|
159 |
+
),
|
160 |
+
)
|
161 |
+
for subsegment in subsegments
|
162 |
+
]
|
163 |
+
)
|
164 |
+
if options.word_timestamps:
|
165 |
+
self.last_speech_timestamp = self.model.add_word_timestamps(
|
166 |
+
segmented_outputs,
|
167 |
+
tokenizer,
|
168 |
+
encoder_output,
|
169 |
+
segment_sizes,
|
170 |
+
options.prepend_punctuations,
|
171 |
+
options.append_punctuations,
|
172 |
+
self.last_speech_timestamp,
|
173 |
+
)
|
174 |
+
|
175 |
+
return segmented_outputs
|
176 |
+
|
177 |
+
def generate_segment_batched(
|
178 |
+
self,
|
179 |
+
features: np.ndarray,
|
180 |
+
tokenizer: Tokenizer,
|
181 |
+
options: TranscriptionOptions,
|
182 |
+
):
|
183 |
+
batch_size = features.shape[0]
|
184 |
+
|
185 |
+
prompt = self.model.get_prompt(
|
186 |
+
tokenizer,
|
187 |
+
previous_tokens=(
|
188 |
+
tokenizer.encode(options.initial_prompt)
|
189 |
+
if options.initial_prompt is not None
|
190 |
+
else []
|
191 |
+
),
|
192 |
+
without_timestamps=options.without_timestamps,
|
193 |
+
hotwords=options.hotwords,
|
194 |
+
)
|
195 |
+
|
196 |
+
if options.max_new_tokens is not None:
|
197 |
+
max_length = len(prompt) + options.max_new_tokens
|
198 |
+
else:
|
199 |
+
max_length = self.model.max_length
|
200 |
+
|
201 |
+
if max_length > self.model.max_length:
|
202 |
+
raise ValueError(
|
203 |
+
f"The length of the prompt is {len(prompt)}, and the `max_new_tokens` "
|
204 |
+
f"{max_length - len(prompt)}. Thus, the combined length of the prompt "
|
205 |
+
f"and `max_new_tokens` is: {max_length}. This exceeds the "
|
206 |
+
f"`max_length` of the Whisper model: {self.model.max_length}. "
|
207 |
+
"You should either reduce the length of your prompt, or "
|
208 |
+
"reduce the value of `max_new_tokens`, "
|
209 |
+
f"so that their combined length is less that {self.model.max_length}."
|
210 |
+
)
|
211 |
+
|
212 |
+
encoder_output = self.model.encode(features)
|
213 |
+
prompts = [prompt.copy() for _ in range(batch_size)]
|
214 |
+
|
215 |
+
if options.multilingual:
|
216 |
+
language_tokens = [
|
217 |
+
tokenizer.tokenizer.token_to_id(segment_langs[0][0])
|
218 |
+
for segment_langs in self.model.model.detect_language(encoder_output)
|
219 |
+
]
|
220 |
+
language_token_index = prompt.index(tokenizer.language)
|
221 |
+
|
222 |
+
for i, language_token in enumerate(language_tokens):
|
223 |
+
prompts[i][language_token_index] = language_token
|
224 |
+
|
225 |
+
results = self.model.model.generate(
|
226 |
+
encoder_output,
|
227 |
+
prompts,
|
228 |
+
beam_size=options.beam_size,
|
229 |
+
patience=options.patience,
|
230 |
+
length_penalty=options.length_penalty,
|
231 |
+
max_length=max_length,
|
232 |
+
suppress_blank=options.suppress_blank,
|
233 |
+
suppress_tokens=options.suppress_tokens,
|
234 |
+
return_scores=True,
|
235 |
+
return_no_speech_prob=True,
|
236 |
+
sampling_temperature=options.temperatures[0],
|
237 |
+
repetition_penalty=options.repetition_penalty,
|
238 |
+
no_repeat_ngram_size=options.no_repeat_ngram_size,
|
239 |
+
)
|
240 |
+
|
241 |
+
output = []
|
242 |
+
for result in results:
|
243 |
+
# return scores
|
244 |
+
seq_len = len(result.sequences_ids[0])
|
245 |
+
cum_logprob = result.scores[0] * (seq_len**options.length_penalty)
|
246 |
+
|
247 |
+
output.append(
|
248 |
+
dict(
|
249 |
+
avg_logprob=cum_logprob / (seq_len + 1),
|
250 |
+
no_speech_prob=result.no_speech_prob,
|
251 |
+
tokens=result.sequences_ids[0],
|
252 |
+
)
|
253 |
+
)
|
254 |
+
|
255 |
+
return encoder_output, output
|
256 |
+
|
257 |
+
def transcribe(
|
258 |
+
self,
|
259 |
+
audio: Union[str, BinaryIO, np.ndarray],
|
260 |
+
language: Optional[str] = None,
|
261 |
+
task: str = "transcribe",
|
262 |
+
log_progress: bool = False,
|
263 |
+
beam_size: int = 5,
|
264 |
+
best_of: int = 5,
|
265 |
+
patience: float = 1,
|
266 |
+
length_penalty: float = 1,
|
267 |
+
repetition_penalty: float = 1,
|
268 |
+
no_repeat_ngram_size: int = 0,
|
269 |
+
temperature: Union[float, List[float], Tuple[float, ...]] = [
|
270 |
+
0.0,
|
271 |
+
0.2,
|
272 |
+
0.4,
|
273 |
+
0.6,
|
274 |
+
0.8,
|
275 |
+
1.0,
|
276 |
+
],
|
277 |
+
compression_ratio_threshold: Optional[float] = 2.4,
|
278 |
+
log_prob_threshold: Optional[float] = -1.0,
|
279 |
+
no_speech_threshold: Optional[float] = 0.6,
|
280 |
+
condition_on_previous_text: bool = True,
|
281 |
+
prompt_reset_on_temperature: float = 0.5,
|
282 |
+
initial_prompt: Optional[Union[str, Iterable[int]]] = None,
|
283 |
+
prefix: Optional[str] = None,
|
284 |
+
suppress_blank: bool = True,
|
285 |
+
suppress_tokens: Optional[List[int]] = [-1],
|
286 |
+
without_timestamps: bool = True,
|
287 |
+
max_initial_timestamp: float = 1.0,
|
288 |
+
word_timestamps: bool = False,
|
289 |
+
prepend_punctuations: str = "\"'“¿([{-",
|
290 |
+
append_punctuations: str = "\"'.。,,!!??::”)]}、",
|
291 |
+
multilingual: bool = False,
|
292 |
+
vad_filter: bool = True,
|
293 |
+
vad_parameters: Optional[Union[dict, VadOptions]] = None,
|
294 |
+
max_new_tokens: Optional[int] = None,
|
295 |
+
chunk_length: Optional[int] = None,
|
296 |
+
clip_timestamps: Optional[List[dict]] = None,
|
297 |
+
hallucination_silence_threshold: Optional[float] = None,
|
298 |
+
batch_size: int = 8,
|
299 |
+
hotwords: Optional[str] = None,
|
300 |
+
language_detection_threshold: Optional[float] = 0.5,
|
301 |
+
language_detection_segments: int = 1,
|
302 |
+
) -> Tuple[Iterable[Segment], TranscriptionInfo]:
|
303 |
+
"""transcribe audio in chunks in batched fashion and return with language info.
|
304 |
+
|
305 |
+
Arguments:
|
306 |
+
audio: Path to the input file (or a file-like object), or the audio waveform.
|
307 |
+
language: The language spoken in the audio. It should be a language code such
|
308 |
+
as "en" or "fr". If not set, the language will be detected in the first 30 seconds
|
309 |
+
of audio.
|
310 |
+
task: Task to execute (transcribe or translate).
|
311 |
+
log_progress: whether to show progress bar or not.
|
312 |
+
beam_size: Beam size to use for decoding.
|
313 |
+
best_of: Number of candidates when sampling with non-zero temperature.
|
314 |
+
patience: Beam search patience factor.
|
315 |
+
length_penalty: Exponential length penalty constant.
|
316 |
+
repetition_penalty: Penalty applied to the score of previously generated tokens
|
317 |
+
(set > 1 to penalize).
|
318 |
+
no_repeat_ngram_size: Prevent repetitions of ngrams with this size (set 0 to disable).
|
319 |
+
temperature: Temperature for sampling. If a list or tuple is passed,
|
320 |
+
only the first value is used.
|
321 |
+
initial_prompt: Optional text string or iterable of token ids to provide as a
|
322 |
+
prompt for the each window.
|
323 |
+
suppress_blank: Suppress blank outputs at the beginning of the sampling.
|
324 |
+
suppress_tokens: List of token IDs to suppress. -1 will suppress a default set
|
325 |
+
of symbols as defined in `tokenizer.non_speech_tokens()`.
|
326 |
+
without_timestamps: Only sample text tokens.
|
327 |
+
word_timestamps: Extract word-level timestamps using the cross-attention pattern
|
328 |
+
and dynamic time warping, and include the timestamps for each word in each segment.
|
329 |
+
Set as False.
|
330 |
+
prepend_punctuations: If word_timestamps is True, merge these punctuation symbols
|
331 |
+
with the next word
|
332 |
+
append_punctuations: If word_timestamps is True, merge these punctuation symbols
|
333 |
+
with the previous word
|
334 |
+
multilingual: Perform language detection on every segment.
|
335 |
+
vad_filter: Enable the voice activity detection (VAD) to filter out parts of the audio
|
336 |
+
without speech. This step is using the Silero VAD model
|
337 |
+
https://github.com/snakers4/silero-vad.
|
338 |
+
vad_parameters: Dictionary of Silero VAD parameters or VadOptions class (see available
|
339 |
+
parameters and default values in the class `VadOptions`).
|
340 |
+
max_new_tokens: Maximum number of new tokens to generate per-chunk. If not set,
|
341 |
+
the maximum will be set by the default max_length.
|
342 |
+
chunk_length: The length of audio segments. If it is not None, it will overwrite the
|
343 |
+
default chunk_length of the FeatureExtractor.
|
344 |
+
clip_timestamps: Optionally provide list of dictionaries each containing "start" and
|
345 |
+
"end" keys that specify the start and end of the voiced region within
|
346 |
+
`chunk_length` boundary. vad_filter will be ignored if clip_timestamps is used.
|
347 |
+
batch_size: the maximum number of parallel requests to model for decoding.
|
348 |
+
hotwords:
|
349 |
+
Hotwords/hint phrases to the model. Has no effect if prefix is not None.
|
350 |
+
language_detection_threshold: If the maximum probability of the language tokens is
|
351 |
+
higher than this value, the language is detected.
|
352 |
+
language_detection_segments: Number of segments to consider for the language detection.
|
353 |
+
|
354 |
+
Unused Arguments
|
355 |
+
compression_ratio_threshold: If the gzip compression ratio is above this value,
|
356 |
+
treat as failed.
|
357 |
+
log_prob_threshold: If the average log probability over sampled tokens is
|
358 |
+
below this value, treat as failed.
|
359 |
+
no_speech_threshold: If the no_speech probability is higher than this value AND
|
360 |
+
the average log probability over sampled tokens is below `log_prob_threshold`,
|
361 |
+
consider the segment as silent.
|
362 |
+
condition_on_previous_text: If True, the previous output of the model is provided
|
363 |
+
as a prompt for the next window; disabling may make the text inconsistent across
|
364 |
+
windows, but the model becomes less prone to getting stuck in a failure loop,
|
365 |
+
such as repetition looping or timestamps going out of sync. Set as False
|
366 |
+
prompt_reset_on_temperature: Resets prompt if temperature is above this value.
|
367 |
+
Arg has effect only if condition_on_previous_text is True. Set at 0.5
|
368 |
+
prefix: Optional text to provide as a prefix at the beginning of each window.
|
369 |
+
max_initial_timestamp: The initial timestamp cannot be later than this, set at 0.0.
|
370 |
+
hallucination_silence_threshold: Optional[float]
|
371 |
+
When word_timestamps is True, skip silent periods longer than this threshold
|
372 |
+
(in seconds) when a possible hallucination is detected. set as None.
|
373 |
+
Returns:
|
374 |
+
A tuple with:
|
375 |
+
|
376 |
+
- a generator over transcribed segments
|
377 |
+
- an instance of TranscriptionInfo
|
378 |
+
"""
|
379 |
+
|
380 |
+
sampling_rate = self.model.feature_extractor.sampling_rate
|
381 |
+
|
382 |
+
if multilingual and not self.model.model.is_multilingual:
|
383 |
+
self.model.logger.warning(
|
384 |
+
"The current model is English-only but the multilingual parameter is set to"
|
385 |
+
"True; setting to False instead."
|
386 |
+
)
|
387 |
+
multilingual = False
|
388 |
+
|
389 |
+
if not isinstance(audio, np.ndarray):
|
390 |
+
audio = decode_audio(audio, sampling_rate=sampling_rate)
|
391 |
+
duration = audio.shape[0] / sampling_rate
|
392 |
+
|
393 |
+
chunk_length = chunk_length or self.model.feature_extractor.chunk_length
|
394 |
+
# if no segment split is provided, use vad_model and generate segments
|
395 |
+
if not clip_timestamps:
|
396 |
+
if vad_filter:
|
397 |
+
if vad_parameters is None:
|
398 |
+
vad_parameters = VadOptions(
|
399 |
+
max_speech_duration_s=chunk_length,
|
400 |
+
min_silence_duration_ms=160,
|
401 |
+
)
|
402 |
+
elif isinstance(vad_parameters, dict):
|
403 |
+
if "max_speech_duration_s" in vad_parameters.keys():
|
404 |
+
vad_parameters.pop("max_speech_duration_s")
|
405 |
+
|
406 |
+
vad_parameters = VadOptions(
|
407 |
+
**vad_parameters, max_speech_duration_s=chunk_length
|
408 |
+
)
|
409 |
+
|
410 |
+
active_segments = get_speech_timestamps(audio, vad_parameters)
|
411 |
+
clip_timestamps = merge_segments(active_segments, vad_parameters)
|
412 |
+
# run the audio if it is less than 30 sec even without clip_timestamps
|
413 |
+
elif duration < chunk_length:
|
414 |
+
clip_timestamps = [{"start": 0, "end": audio.shape[0]}]
|
415 |
+
else:
|
416 |
+
raise RuntimeError(
|
417 |
+
"No clip timestamps found. "
|
418 |
+
"Set 'vad_filter' to True or provide 'clip_timestamps'."
|
419 |
+
)
|
420 |
+
|
421 |
+
duration_after_vad = (
|
422 |
+
sum((segment["end"] - segment["start"]) for segment in clip_timestamps)
|
423 |
+
/ sampling_rate
|
424 |
+
)
|
425 |
+
|
426 |
+
audio_chunks, chunks_metadata = collect_chunks(audio, clip_timestamps)
|
427 |
+
features = (
|
428 |
+
[self.model.feature_extractor(chunk)[..., :-1] for chunk in audio_chunks]
|
429 |
+
if duration_after_vad
|
430 |
+
else []
|
431 |
+
)
|
432 |
+
|
433 |
+
all_language_probs = None
|
434 |
+
# detecting the language if not provided
|
435 |
+
if language is None:
|
436 |
+
if not self.model.model.is_multilingual:
|
437 |
+
language = "en"
|
438 |
+
language_probability = 1
|
439 |
+
else:
|
440 |
+
(
|
441 |
+
language,
|
442 |
+
language_probability,
|
443 |
+
all_language_probs,
|
444 |
+
) = self.model.detect_language(
|
445 |
+
features=np.concatenate(
|
446 |
+
features
|
447 |
+
+ [
|
448 |
+
np.full((self.model.model.n_mels, 1), -1.5, dtype="float32")
|
449 |
+
],
|
450 |
+
axis=1,
|
451 |
+
), # add a dummy feature to account for empty audio
|
452 |
+
language_detection_segments=language_detection_segments,
|
453 |
+
language_detection_threshold=language_detection_threshold,
|
454 |
+
)
|
455 |
+
|
456 |
+
self.model.logger.info(
|
457 |
+
"Detected language '%s' with probability %.2f",
|
458 |
+
language,
|
459 |
+
language_probability,
|
460 |
+
)
|
461 |
+
else:
|
462 |
+
if not self.model.model.is_multilingual and language != "en":
|
463 |
+
self.model.logger.warning(
|
464 |
+
"The current model is English-only but the language parameter is set to '%s'; "
|
465 |
+
"using 'en' instead." % language
|
466 |
+
)
|
467 |
+
language = "en"
|
468 |
+
|
469 |
+
language_probability = 1
|
470 |
+
|
471 |
+
tokenizer = Tokenizer(
|
472 |
+
self.model.hf_tokenizer,
|
473 |
+
self.model.model.is_multilingual,
|
474 |
+
task=task,
|
475 |
+
language=language,
|
476 |
+
)
|
477 |
+
|
478 |
+
features = (
|
479 |
+
np.stack([pad_or_trim(feature) for feature in features]) if features else []
|
480 |
+
)
|
481 |
+
|
482 |
+
options = TranscriptionOptions(
|
483 |
+
beam_size=beam_size,
|
484 |
+
best_of=best_of,
|
485 |
+
patience=patience,
|
486 |
+
length_penalty=length_penalty,
|
487 |
+
repetition_penalty=repetition_penalty,
|
488 |
+
no_repeat_ngram_size=no_repeat_ngram_size,
|
489 |
+
log_prob_threshold=log_prob_threshold,
|
490 |
+
no_speech_threshold=no_speech_threshold,
|
491 |
+
compression_ratio_threshold=compression_ratio_threshold,
|
492 |
+
temperatures=(
|
493 |
+
temperature[:1]
|
494 |
+
if isinstance(temperature, (list, tuple))
|
495 |
+
else [temperature]
|
496 |
+
),
|
497 |
+
initial_prompt=initial_prompt,
|
498 |
+
prefix=prefix,
|
499 |
+
suppress_blank=suppress_blank,
|
500 |
+
suppress_tokens=get_suppressed_tokens(tokenizer, suppress_tokens),
|
501 |
+
prepend_punctuations=prepend_punctuations,
|
502 |
+
append_punctuations=append_punctuations,
|
503 |
+
max_new_tokens=max_new_tokens,
|
504 |
+
hotwords=hotwords,
|
505 |
+
word_timestamps=word_timestamps,
|
506 |
+
hallucination_silence_threshold=None,
|
507 |
+
condition_on_previous_text=False,
|
508 |
+
clip_timestamps=clip_timestamps,
|
509 |
+
prompt_reset_on_temperature=0.5,
|
510 |
+
multilingual=multilingual,
|
511 |
+
without_timestamps=without_timestamps,
|
512 |
+
max_initial_timestamp=0.0,
|
513 |
+
)
|
514 |
+
|
515 |
+
info = TranscriptionInfo(
|
516 |
+
language=language,
|
517 |
+
language_probability=language_probability,
|
518 |
+
duration=duration,
|
519 |
+
duration_after_vad=duration_after_vad,
|
520 |
+
transcription_options=options,
|
521 |
+
vad_options=vad_parameters,
|
522 |
+
all_language_probs=all_language_probs,
|
523 |
+
)
|
524 |
+
|
525 |
+
segments = self._batched_segments_generator(
|
526 |
+
features,
|
527 |
+
tokenizer,
|
528 |
+
chunks_metadata,
|
529 |
+
batch_size,
|
530 |
+
options,
|
531 |
+
log_progress,
|
532 |
+
)
|
533 |
+
|
534 |
+
return segments, info
|
535 |
+
|
536 |
+
def _batched_segments_generator(
|
537 |
+
self, features, tokenizer, chunks_metadata, batch_size, options, log_progress
|
538 |
+
):
|
539 |
+
pbar = tqdm(total=len(features), disable=not log_progress, position=0)
|
540 |
+
seg_idx = 0
|
541 |
+
for i in range(0, len(features), batch_size):
|
542 |
+
results = self.forward(
|
543 |
+
features[i : i + batch_size],
|
544 |
+
tokenizer,
|
545 |
+
chunks_metadata[i : i + batch_size],
|
546 |
+
options,
|
547 |
+
)
|
548 |
+
|
549 |
+
for result in results:
|
550 |
+
for segment in result:
|
551 |
+
seg_idx += 1
|
552 |
+
yield Segment(
|
553 |
+
seek=segment["seek"],
|
554 |
+
id=seg_idx,
|
555 |
+
text=segment["text"],
|
556 |
+
start=round(segment["start"], 3),
|
557 |
+
end=round(segment["end"], 3),
|
558 |
+
words=(
|
559 |
+
None
|
560 |
+
if not options.word_timestamps
|
561 |
+
else [Word(**word) for word in segment["words"]]
|
562 |
+
),
|
563 |
+
tokens=segment["tokens"],
|
564 |
+
avg_logprob=segment["avg_logprob"],
|
565 |
+
no_speech_prob=segment["no_speech_prob"],
|
566 |
+
compression_ratio=segment["compression_ratio"],
|
567 |
+
temperature=options.temperatures[0],
|
568 |
+
)
|
569 |
+
|
570 |
+
pbar.update(1)
|
571 |
+
|
572 |
+
pbar.close()
|
573 |
+
self.last_speech_timestamp = 0.0
|
574 |
+
|
575 |
+
|
576 |
+
class WhisperModel:
|
577 |
+
def __init__(
|
578 |
+
self,
|
579 |
+
model_size_or_path: str,
|
580 |
+
device: str = "auto",
|
581 |
+
device_index: Union[int, List[int]] = 0,
|
582 |
+
compute_type: str = "default",
|
583 |
+
cpu_threads: int = 0,
|
584 |
+
num_workers: int = 1,
|
585 |
+
download_root: Optional[str] = None,
|
586 |
+
local_files_only: bool = False,
|
587 |
+
files: dict = None,
|
588 |
+
**model_kwargs,
|
589 |
+
):
|
590 |
+
"""Initializes the Whisper model.
|
591 |
+
|
592 |
+
Args:
|
593 |
+
model_size_or_path: Size of the model to use (tiny, tiny.en, base, base.en,
|
594 |
+
small, small.en, distil-small.en, medium, medium.en, distil-medium.en, large-v1,
|
595 |
+
large-v2, large-v3, large, distil-large-v2, distil-large-v3, large-v3-turbo, or turbo),
|
596 |
+
a path to a converted model directory, or a CTranslate2-converted Whisper model ID from
|
597 |
+
the HF Hub. When a size or a model ID is configured, the converted model is downloaded
|
598 |
+
from the Hugging Face Hub.
|
599 |
+
device: Device to use for computation ("cpu", "cuda", "auto").
|
600 |
+
device_index: Device ID to use.
|
601 |
+
The model can also be loaded on multiple GPUs by passing a list of IDs
|
602 |
+
(e.g. [0, 1, 2, 3]). In that case, multiple transcriptions can run in parallel
|
603 |
+
when transcribe() is called from multiple Python threads (see also num_workers).
|
604 |
+
compute_type: Type to use for computation.
|
605 |
+
See https://opennmt.net/CTranslate2/quantization.html.
|
606 |
+
cpu_threads: Number of threads to use when running on CPU (4 by default).
|
607 |
+
A non zero value overrides the OMP_NUM_THREADS environment variable.
|
608 |
+
num_workers: When transcribe() is called from multiple Python threads,
|
609 |
+
having multiple workers enables true parallelism when running the model
|
610 |
+
(concurrent calls to self.model.generate() will run in parallel).
|
611 |
+
This can improve the global throughput at the cost of increased memory usage.
|
612 |
+
download_root: Directory where the models should be saved. If not set, the models
|
613 |
+
are saved in the standard Hugging Face cache directory.
|
614 |
+
local_files_only: If True, avoid downloading the file and return the path to the
|
615 |
+
local cached file if it exists.
|
616 |
+
files: Load model files from the memory. This argument is a dictionary mapping file names
|
617 |
+
to file contents as file-like or bytes objects. If this is set, model_path acts as an
|
618 |
+
identifier for this model.
|
619 |
+
"""
|
620 |
+
self.logger = get_logger()
|
621 |
+
|
622 |
+
tokenizer_bytes, preprocessor_bytes = None, None
|
623 |
+
if files:
|
624 |
+
model_path = model_size_or_path
|
625 |
+
tokenizer_bytes = files.pop("tokenizer.json", None)
|
626 |
+
preprocessor_bytes = files.pop("preprocessor_config.json", None)
|
627 |
+
elif os.path.isdir(model_size_or_path):
|
628 |
+
model_path = model_size_or_path
|
629 |
+
else:
|
630 |
+
model_path = download_model(
|
631 |
+
model_size_or_path,
|
632 |
+
local_files_only=local_files_only,
|
633 |
+
cache_dir=download_root,
|
634 |
+
)
|
635 |
+
|
636 |
+
self.model = ctranslate2.models.Whisper(
|
637 |
+
model_path,
|
638 |
+
device=device,
|
639 |
+
device_index=device_index,
|
640 |
+
compute_type=compute_type,
|
641 |
+
intra_threads=cpu_threads,
|
642 |
+
inter_threads=num_workers,
|
643 |
+
files=files,
|
644 |
+
**model_kwargs,
|
645 |
+
)
|
646 |
+
|
647 |
+
tokenizer_file = os.path.join(model_path, "tokenizer.json")
|
648 |
+
if tokenizer_bytes:
|
649 |
+
self.hf_tokenizer = tokenizers.Tokenizer.from_buffer(tokenizer_bytes)
|
650 |
+
elif os.path.isfile(tokenizer_file):
|
651 |
+
self.hf_tokenizer = tokenizers.Tokenizer.from_file(tokenizer_file)
|
652 |
+
else:
|
653 |
+
self.hf_tokenizer = tokenizers.Tokenizer.from_pretrained(
|
654 |
+
"openai/whisper-tiny" + ("" if self.model.is_multilingual else ".en")
|
655 |
+
)
|
656 |
+
self.feat_kwargs = self._get_feature_kwargs(model_path, preprocessor_bytes)
|
657 |
+
self.feature_extractor = FeatureExtractor(**self.feat_kwargs)
|
658 |
+
self.input_stride = 2
|
659 |
+
self.num_samples_per_token = (
|
660 |
+
self.feature_extractor.hop_length * self.input_stride
|
661 |
+
)
|
662 |
+
self.frames_per_second = (
|
663 |
+
self.feature_extractor.sampling_rate // self.feature_extractor.hop_length
|
664 |
+
)
|
665 |
+
self.tokens_per_second = (
|
666 |
+
self.feature_extractor.sampling_rate // self.num_samples_per_token
|
667 |
+
)
|
668 |
+
self.time_precision = 0.02
|
669 |
+
self.max_length = 448
|
670 |
+
|
671 |
+
@property
|
672 |
+
def supported_languages(self) -> List[str]:
|
673 |
+
"""The languages supported by the model."""
|
674 |
+
return list(_LANGUAGE_CODES) if self.model.is_multilingual else ["en"]
|
675 |
+
|
676 |
+
def _get_feature_kwargs(self, model_path, preprocessor_bytes=None) -> dict:
|
677 |
+
config = {}
|
678 |
+
try:
|
679 |
+
config_path = os.path.join(model_path, "preprocessor_config.json")
|
680 |
+
if preprocessor_bytes:
|
681 |
+
config = json.loads(preprocessor_bytes)
|
682 |
+
elif os.path.isfile(config_path):
|
683 |
+
with open(config_path, "r", encoding="utf-8") as file:
|
684 |
+
config = json.load(file)
|
685 |
+
else:
|
686 |
+
return config
|
687 |
+
valid_keys = signature(FeatureExtractor.__init__).parameters.keys()
|
688 |
+
return {k: v for k, v in config.items() if k in valid_keys}
|
689 |
+
except json.JSONDecodeError as e:
|
690 |
+
self.logger.warning("Could not load preprocessor config: %s", e)
|
691 |
+
|
692 |
+
return config
|
693 |
+
|
694 |
+
def transcribe(
|
695 |
+
self,
|
696 |
+
audio: Union[str, BinaryIO, np.ndarray],
|
697 |
+
language: Optional[str] = None,
|
698 |
+
task: str = "transcribe",
|
699 |
+
log_progress: bool = False,
|
700 |
+
beam_size: int = 5,
|
701 |
+
best_of: int = 5,
|
702 |
+
patience: float = 1,
|
703 |
+
length_penalty: float = 1,
|
704 |
+
repetition_penalty: float = 1,
|
705 |
+
no_repeat_ngram_size: int = 0,
|
706 |
+
temperature: Union[float, List[float], Tuple[float, ...]] = [
|
707 |
+
0.0,
|
708 |
+
0.2,
|
709 |
+
0.4,
|
710 |
+
0.6,
|
711 |
+
0.8,
|
712 |
+
1.0,
|
713 |
+
],
|
714 |
+
compression_ratio_threshold: Optional[float] = 2.4,
|
715 |
+
log_prob_threshold: Optional[float] = -1.0,
|
716 |
+
no_speech_threshold: Optional[float] = 0.6,
|
717 |
+
condition_on_previous_text: bool = True,
|
718 |
+
prompt_reset_on_temperature: float = 0.5,
|
719 |
+
initial_prompt: Optional[Union[str, Iterable[int]]] = None,
|
720 |
+
prefix: Optional[str] = None,
|
721 |
+
suppress_blank: bool = True,
|
722 |
+
suppress_tokens: Optional[List[int]] = [-1],
|
723 |
+
without_timestamps: bool = False,
|
724 |
+
max_initial_timestamp: float = 1.0,
|
725 |
+
word_timestamps: bool = False,
|
726 |
+
prepend_punctuations: str = "\"'“¿([{-",
|
727 |
+
append_punctuations: str = "\"'.。,,!!??::”)]}、",
|
728 |
+
multilingual: bool = False,
|
729 |
+
vad_filter: bool = False,
|
730 |
+
vad_parameters: Optional[Union[dict, VadOptions]] = None,
|
731 |
+
max_new_tokens: Optional[int] = None,
|
732 |
+
chunk_length: Optional[int] = None,
|
733 |
+
clip_timestamps: Union[str, List[float]] = "0",
|
734 |
+
hallucination_silence_threshold: Optional[float] = None,
|
735 |
+
hotwords: Optional[str] = None,
|
736 |
+
language_detection_threshold: Optional[float] = 0.5,
|
737 |
+
language_detection_segments: int = 1,
|
738 |
+
) -> Tuple[Iterable[Segment], TranscriptionInfo]:
|
739 |
+
"""Transcribes an input file.
|
740 |
+
|
741 |
+
Arguments:
|
742 |
+
audio: Path to the input file (or a file-like object), or the audio waveform.
|
743 |
+
language: The language spoken in the audio. It should be a language code such
|
744 |
+
as "en" or "fr". If not set, the language will be detected in the first 30 seconds
|
745 |
+
of audio.
|
746 |
+
task: Task to execute (transcribe or translate).
|
747 |
+
log_progress: whether to show progress bar or not.
|
748 |
+
beam_size: Beam size to use for decoding.
|
749 |
+
best_of: Number of candidates when sampling with non-zero temperature.
|
750 |
+
patience: Beam search patience factor.
|
751 |
+
length_penalty: Exponential length penalty constant.
|
752 |
+
repetition_penalty: Penalty applied to the score of previously generated tokens
|
753 |
+
(set > 1 to penalize).
|
754 |
+
no_repeat_ngram_size: Prevent repetitions of ngrams with this size (set 0 to disable).
|
755 |
+
temperature: Temperature for sampling. It can be a tuple of temperatures,
|
756 |
+
which will be successively used upon failures according to either
|
757 |
+
`compression_ratio_threshold` or `log_prob_threshold`.
|
758 |
+
compression_ratio_threshold: If the gzip compression ratio is above this value,
|
759 |
+
treat as failed.
|
760 |
+
log_prob_threshold: If the average log probability over sampled tokens is
|
761 |
+
below this value, treat as failed.
|
762 |
+
no_speech_threshold: If the no_speech probability is higher than this value AND
|
763 |
+
the average log probability over sampled tokens is below `log_prob_threshold`,
|
764 |
+
consider the segment as silent.
|
765 |
+
condition_on_previous_text: If True, the previous output of the model is provided
|
766 |
+
as a prompt for the next window; disabling may make the text inconsistent across
|
767 |
+
windows, but the model becomes less prone to getting stuck in a failure loop,
|
768 |
+
such as repetition looping or timestamps going out of sync.
|
769 |
+
prompt_reset_on_temperature: Resets prompt if temperature is above this value.
|
770 |
+
Arg has effect only if condition_on_previous_text is True.
|
771 |
+
initial_prompt: Optional text string or iterable of token ids to provide as a
|
772 |
+
prompt for the first window.
|
773 |
+
prefix: Optional text to provide as a prefix for the first window.
|
774 |
+
suppress_blank: Suppress blank outputs at the beginning of the sampling.
|
775 |
+
suppress_tokens: List of token IDs to suppress. -1 will suppress a default set
|
776 |
+
of symbols as defined in `tokenizer.non_speech_tokens()`.
|
777 |
+
without_timestamps: Only sample text tokens.
|
778 |
+
max_initial_timestamp: The initial timestamp cannot be later than this.
|
779 |
+
word_timestamps: Extract word-level timestamps using the cross-attention pattern
|
780 |
+
and dynamic time warping, and include the timestamps for each word in each segment.
|
781 |
+
prepend_punctuations: If word_timestamps is True, merge these punctuation symbols
|
782 |
+
with the next word
|
783 |
+
append_punctuations: If word_timestamps is True, merge these punctuation symbols
|
784 |
+
with the previous word
|
785 |
+
multilingual: Perform language detection on every segment.
|
786 |
+
vad_filter: Enable the voice activity detection (VAD) to filter out parts of the audio
|
787 |
+
without speech. This step is using the Silero VAD model
|
788 |
+
https://github.com/snakers4/silero-vad.
|
789 |
+
vad_parameters: Dictionary of Silero VAD parameters or VadOptions class (see available
|
790 |
+
parameters and default values in the class `VadOptions`).
|
791 |
+
max_new_tokens: Maximum number of new tokens to generate per-chunk. If not set,
|
792 |
+
the maximum will be set by the default max_length.
|
793 |
+
chunk_length: The length of audio segments. If it is not None, it will overwrite the
|
794 |
+
default chunk_length of the FeatureExtractor.
|
795 |
+
clip_timestamps:
|
796 |
+
Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to
|
797 |
+
process. The last end timestamp defaults to the end of the file.
|
798 |
+
vad_filter will be ignored if clip_timestamps is used.
|
799 |
+
hallucination_silence_threshold:
|
800 |
+
When word_timestamps is True, skip silent periods longer than this threshold
|
801 |
+
(in seconds) when a possible hallucination is detected
|
802 |
+
hotwords:
|
803 |
+
Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.
|
804 |
+
language_detection_threshold: If the maximum probability of the language tokens is higher
|
805 |
+
than this value, the language is detected.
|
806 |
+
language_detection_segments: Number of segments to consider for the language detection.
|
807 |
+
Returns:
|
808 |
+
A tuple with:
|
809 |
+
|
810 |
+
- a generator over transcribed segments
|
811 |
+
- an instance of TranscriptionInfo
|
812 |
+
"""
|
813 |
+
sampling_rate = self.feature_extractor.sampling_rate
|
814 |
+
|
815 |
+
if multilingual and not self.model.is_multilingual:
|
816 |
+
self.logger.warning(
|
817 |
+
"The current model is English-only but the multilingual parameter is set to"
|
818 |
+
"True; setting to False instead."
|
819 |
+
)
|
820 |
+
multilingual = False
|
821 |
+
|
822 |
+
if not isinstance(audio, np.ndarray):
|
823 |
+
audio = decode_audio(audio, sampling_rate=sampling_rate)
|
824 |
+
|
825 |
+
duration = audio.shape[0] / sampling_rate
|
826 |
+
duration_after_vad = duration
|
827 |
+
|
828 |
+
self.logger.info(
|
829 |
+
"Processing audio with duration %s", format_timestamp(duration)
|
830 |
+
)
|
831 |
+
|
832 |
+
if vad_filter and clip_timestamps == "0":
|
833 |
+
if vad_parameters is None:
|
834 |
+
vad_parameters = VadOptions()
|
835 |
+
elif isinstance(vad_parameters, dict):
|
836 |
+
vad_parameters = VadOptions(**vad_parameters)
|
837 |
+
speech_chunks = get_speech_timestamps(audio, vad_parameters)
|
838 |
+
audio_chunks, chunks_metadata = collect_chunks(audio, speech_chunks)
|
839 |
+
audio = np.concatenate(audio_chunks, axis=0)
|
840 |
+
duration_after_vad = audio.shape[0] / sampling_rate
|
841 |
+
|
842 |
+
self.logger.info(
|
843 |
+
"VAD filter removed %s of audio",
|
844 |
+
format_timestamp(duration - duration_after_vad),
|
845 |
+
)
|
846 |
+
|
847 |
+
if self.logger.isEnabledFor(logging.DEBUG):
|
848 |
+
self.logger.debug(
|
849 |
+
"VAD filter kept the following audio segments: %s",
|
850 |
+
", ".join(
|
851 |
+
"[%s -> %s]"
|
852 |
+
% (
|
853 |
+
format_timestamp(chunk["start"] / sampling_rate),
|
854 |
+
format_timestamp(chunk["end"] / sampling_rate),
|
855 |
+
)
|
856 |
+
for chunk in speech_chunks
|
857 |
+
),
|
858 |
+
)
|
859 |
+
|
860 |
+
else:
|
861 |
+
speech_chunks = None
|
862 |
+
if audio.shape[0] == 0:
|
863 |
+
return None, None
|
864 |
+
features = self.feature_extractor(audio, chunk_length=chunk_length)
|
865 |
+
|
866 |
+
encoder_output = None
|
867 |
+
all_language_probs = None
|
868 |
+
|
869 |
+
# detecting the language if not provided
|
870 |
+
if language is None:
|
871 |
+
if not self.model.is_multilingual:
|
872 |
+
language = "en"
|
873 |
+
language_probability = 1
|
874 |
+
else:
|
875 |
+
start_timestamp = (
|
876 |
+
float(clip_timestamps.split(",")[0])
|
877 |
+
if isinstance(clip_timestamps, str)
|
878 |
+
else clip_timestamps[0]
|
879 |
+
)
|
880 |
+
content_frames = features.shape[-1] - 1
|
881 |
+
seek = (
|
882 |
+
int(start_timestamp * self.frames_per_second)
|
883 |
+
if start_timestamp * self.frames_per_second < content_frames
|
884 |
+
else 0
|
885 |
+
)
|
886 |
+
(
|
887 |
+
language,
|
888 |
+
language_probability,
|
889 |
+
all_language_probs,
|
890 |
+
) = self.detect_language(
|
891 |
+
features=features[..., seek:],
|
892 |
+
language_detection_segments=language_detection_segments,
|
893 |
+
language_detection_threshold=language_detection_threshold,
|
894 |
+
)
|
895 |
+
|
896 |
+
self.logger.info(
|
897 |
+
"Detected language '%s' with probability %.2f",
|
898 |
+
language,
|
899 |
+
language_probability,
|
900 |
+
)
|
901 |
+
else:
|
902 |
+
if not self.model.is_multilingual and language != "en":
|
903 |
+
self.logger.warning(
|
904 |
+
"The current model is English-only but the language parameter is set to '%s'; "
|
905 |
+
"using 'en' instead." % language
|
906 |
+
)
|
907 |
+
language = "en"
|
908 |
+
|
909 |
+
language_probability = 1
|
910 |
+
|
911 |
+
tokenizer = Tokenizer(
|
912 |
+
self.hf_tokenizer,
|
913 |
+
self.model.is_multilingual,
|
914 |
+
task=task,
|
915 |
+
language=language,
|
916 |
+
)
|
917 |
+
|
918 |
+
options = TranscriptionOptions(
|
919 |
+
beam_size=beam_size,
|
920 |
+
best_of=best_of,
|
921 |
+
patience=patience,
|
922 |
+
length_penalty=length_penalty,
|
923 |
+
repetition_penalty=repetition_penalty,
|
924 |
+
no_repeat_ngram_size=no_repeat_ngram_size,
|
925 |
+
log_prob_threshold=log_prob_threshold,
|
926 |
+
no_speech_threshold=no_speech_threshold,
|
927 |
+
compression_ratio_threshold=compression_ratio_threshold,
|
928 |
+
condition_on_previous_text=condition_on_previous_text,
|
929 |
+
prompt_reset_on_temperature=prompt_reset_on_temperature,
|
930 |
+
temperatures=(
|
931 |
+
temperature if isinstance(temperature, (list, tuple)) else [temperature]
|
932 |
+
),
|
933 |
+
initial_prompt=initial_prompt,
|
934 |
+
prefix=prefix,
|
935 |
+
suppress_blank=suppress_blank,
|
936 |
+
suppress_tokens=(
|
937 |
+
get_suppressed_tokens(tokenizer, suppress_tokens)
|
938 |
+
if suppress_tokens
|
939 |
+
else suppress_tokens
|
940 |
+
),
|
941 |
+
without_timestamps=without_timestamps,
|
942 |
+
max_initial_timestamp=max_initial_timestamp,
|
943 |
+
word_timestamps=word_timestamps,
|
944 |
+
prepend_punctuations=prepend_punctuations,
|
945 |
+
append_punctuations=append_punctuations,
|
946 |
+
multilingual=multilingual,
|
947 |
+
max_new_tokens=max_new_tokens,
|
948 |
+
clip_timestamps=clip_timestamps,
|
949 |
+
hallucination_silence_threshold=hallucination_silence_threshold,
|
950 |
+
hotwords=hotwords,
|
951 |
+
)
|
952 |
+
|
953 |
+
segments = self.generate_segments(
|
954 |
+
features, tokenizer, options, log_progress, encoder_output
|
955 |
+
)
|
956 |
+
|
957 |
+
if speech_chunks:
|
958 |
+
segments = restore_speech_timestamps(segments, speech_chunks, sampling_rate)
|
959 |
+
|
960 |
+
info = TranscriptionInfo(
|
961 |
+
language=language,
|
962 |
+
language_probability=language_probability,
|
963 |
+
duration=duration,
|
964 |
+
duration_after_vad=duration_after_vad,
|
965 |
+
transcription_options=options,
|
966 |
+
vad_options=vad_parameters,
|
967 |
+
all_language_probs=all_language_probs,
|
968 |
+
)
|
969 |
+
|
970 |
+
return segments, info
|
971 |
+
|
972 |
+
def _split_segments_by_timestamps(
|
973 |
+
self,
|
974 |
+
tokenizer: Tokenizer,
|
975 |
+
tokens: List[int],
|
976 |
+
time_offset: float,
|
977 |
+
segment_size: int,
|
978 |
+
segment_duration: float,
|
979 |
+
seek: int,
|
980 |
+
) -> List[List[int]]:
|
981 |
+
current_segments = []
|
982 |
+
single_timestamp_ending = (
|
983 |
+
len(tokens) >= 2 and tokens[-2] < tokenizer.timestamp_begin <= tokens[-1]
|
984 |
+
)
|
985 |
+
|
986 |
+
consecutive_timestamps = [
|
987 |
+
i
|
988 |
+
for i in range(len(tokens))
|
989 |
+
if i > 0
|
990 |
+
and tokens[i] >= tokenizer.timestamp_begin
|
991 |
+
and tokens[i - 1] >= tokenizer.timestamp_begin
|
992 |
+
]
|
993 |
+
|
994 |
+
if len(consecutive_timestamps) > 0:
|
995 |
+
slices = list(consecutive_timestamps)
|
996 |
+
if single_timestamp_ending:
|
997 |
+
slices.append(len(tokens))
|
998 |
+
|
999 |
+
last_slice = 0
|
1000 |
+
for current_slice in slices:
|
1001 |
+
sliced_tokens = tokens[last_slice:current_slice]
|
1002 |
+
start_timestamp_position = sliced_tokens[0] - tokenizer.timestamp_begin
|
1003 |
+
end_timestamp_position = sliced_tokens[-1] - tokenizer.timestamp_begin
|
1004 |
+
start_time = (
|
1005 |
+
time_offset + start_timestamp_position * self.time_precision
|
1006 |
+
)
|
1007 |
+
end_time = time_offset + end_timestamp_position * self.time_precision
|
1008 |
+
|
1009 |
+
current_segments.append(
|
1010 |
+
dict(
|
1011 |
+
seek=seek,
|
1012 |
+
start=start_time,
|
1013 |
+
end=end_time,
|
1014 |
+
tokens=sliced_tokens,
|
1015 |
+
)
|
1016 |
+
)
|
1017 |
+
last_slice = current_slice
|
1018 |
+
|
1019 |
+
if single_timestamp_ending:
|
1020 |
+
# single timestamp at the end means no speech after the last timestamp.
|
1021 |
+
seek += segment_size
|
1022 |
+
else:
|
1023 |
+
# otherwise, ignore the unfinished segment and seek to the last timestamp
|
1024 |
+
last_timestamp_position = (
|
1025 |
+
tokens[last_slice - 1] - tokenizer.timestamp_begin
|
1026 |
+
)
|
1027 |
+
seek += last_timestamp_position * self.input_stride
|
1028 |
+
|
1029 |
+
else:
|
1030 |
+
duration = segment_duration
|
1031 |
+
timestamps = [
|
1032 |
+
token for token in tokens if token >= tokenizer.timestamp_begin
|
1033 |
+
]
|
1034 |
+
if len(timestamps) > 0 and timestamps[-1] != tokenizer.timestamp_begin:
|
1035 |
+
last_timestamp_position = timestamps[-1] - tokenizer.timestamp_begin
|
1036 |
+
duration = last_timestamp_position * self.time_precision
|
1037 |
+
|
1038 |
+
current_segments.append(
|
1039 |
+
dict(
|
1040 |
+
seek=seek,
|
1041 |
+
start=time_offset,
|
1042 |
+
end=time_offset + duration,
|
1043 |
+
tokens=tokens,
|
1044 |
+
)
|
1045 |
+
)
|
1046 |
+
|
1047 |
+
seek += segment_size
|
1048 |
+
|
1049 |
+
return current_segments, seek, single_timestamp_ending
|
1050 |
+
|
1051 |
+
def generate_segments(
|
1052 |
+
self,
|
1053 |
+
features: np.ndarray,
|
1054 |
+
tokenizer: Tokenizer,
|
1055 |
+
options: TranscriptionOptions,
|
1056 |
+
log_progress,
|
1057 |
+
encoder_output: Optional[ctranslate2.StorageView] = None,
|
1058 |
+
) -> Iterable[Segment]:
|
1059 |
+
content_frames = features.shape[-1] - 1
|
1060 |
+
content_duration = float(content_frames * self.feature_extractor.time_per_frame)
|
1061 |
+
|
1062 |
+
if isinstance(options.clip_timestamps, str):
|
1063 |
+
options.clip_timestamps = [
|
1064 |
+
float(ts)
|
1065 |
+
for ts in (
|
1066 |
+
options.clip_timestamps.split(",")
|
1067 |
+
if options.clip_timestamps
|
1068 |
+
else []
|
1069 |
+
)
|
1070 |
+
]
|
1071 |
+
|
1072 |
+
seek_points: List[int] = [
|
1073 |
+
round(ts * self.frames_per_second) for ts in options.clip_timestamps
|
1074 |
+
]
|
1075 |
+
if len(seek_points) == 0:
|
1076 |
+
seek_points.append(0)
|
1077 |
+
if len(seek_points) % 2 == 1:
|
1078 |
+
seek_points.append(content_frames)
|
1079 |
+
seek_clips: List[Tuple[int, int]] = list(
|
1080 |
+
zip(seek_points[::2], seek_points[1::2])
|
1081 |
+
)
|
1082 |
+
|
1083 |
+
punctuation = "\"'“¿([{-\"'.。,,!!??::”)]}、"
|
1084 |
+
|
1085 |
+
idx = 0
|
1086 |
+
clip_idx = 0
|
1087 |
+
seek = seek_clips[clip_idx][0]
|
1088 |
+
all_tokens = []
|
1089 |
+
prompt_reset_since = 0
|
1090 |
+
|
1091 |
+
if options.initial_prompt is not None:
|
1092 |
+
if isinstance(options.initial_prompt, str):
|
1093 |
+
initial_prompt = " " + options.initial_prompt.strip()
|
1094 |
+
initial_prompt_tokens = tokenizer.encode(initial_prompt)
|
1095 |
+
all_tokens.extend(initial_prompt_tokens)
|
1096 |
+
else:
|
1097 |
+
all_tokens.extend(options.initial_prompt)
|
1098 |
+
|
1099 |
+
pbar = tqdm(total=content_duration, unit="seconds", disable=not log_progress)
|
1100 |
+
last_speech_timestamp = 0.0
|
1101 |
+
all_segments = []
|
1102 |
+
# NOTE: This loop is obscurely flattened to make the diff readable.
|
1103 |
+
# A later commit should turn this into a simpler nested loop.
|
1104 |
+
# for seek_clip_start, seek_clip_end in seek_clips:
|
1105 |
+
# while seek < seek_clip_end
|
1106 |
+
while clip_idx < len(seek_clips):
|
1107 |
+
seek_clip_start, seek_clip_end = seek_clips[clip_idx]
|
1108 |
+
if seek_clip_end > content_frames:
|
1109 |
+
seek_clip_end = content_frames
|
1110 |
+
if seek < seek_clip_start:
|
1111 |
+
seek = seek_clip_start
|
1112 |
+
if seek >= seek_clip_end:
|
1113 |
+
clip_idx += 1
|
1114 |
+
if clip_idx < len(seek_clips):
|
1115 |
+
seek = seek_clips[clip_idx][0]
|
1116 |
+
continue
|
1117 |
+
time_offset = seek * self.feature_extractor.time_per_frame
|
1118 |
+
window_end_time = float(
|
1119 |
+
(seek + self.feature_extractor.nb_max_frames)
|
1120 |
+
* self.feature_extractor.time_per_frame
|
1121 |
+
)
|
1122 |
+
segment_size = min(
|
1123 |
+
self.feature_extractor.nb_max_frames,
|
1124 |
+
content_frames - seek,
|
1125 |
+
seek_clip_end - seek,
|
1126 |
+
)
|
1127 |
+
segment = features[:, seek : seek + segment_size]
|
1128 |
+
segment_duration = segment_size * self.feature_extractor.time_per_frame
|
1129 |
+
segment = pad_or_trim(segment)
|
1130 |
+
|
1131 |
+
if self.logger.isEnabledFor(logging.DEBUG):
|
1132 |
+
self.logger.debug(
|
1133 |
+
"Processing segment at %s", format_timestamp(time_offset)
|
1134 |
+
)
|
1135 |
+
|
1136 |
+
previous_tokens = all_tokens[prompt_reset_since:]
|
1137 |
+
|
1138 |
+
if seek > 0 or encoder_output is None:
|
1139 |
+
encoder_output = self.encode(segment)
|
1140 |
+
|
1141 |
+
if options.multilingual:
|
1142 |
+
results = self.model.detect_language(encoder_output)
|
1143 |
+
language_token, language_probability = results[0][0]
|
1144 |
+
language = language_token[2:-2]
|
1145 |
+
|
1146 |
+
tokenizer.language = tokenizer.tokenizer.token_to_id(language_token)
|
1147 |
+
tokenizer.language_code = language
|
1148 |
+
|
1149 |
+
prompt = self.get_prompt(
|
1150 |
+
tokenizer,
|
1151 |
+
previous_tokens,
|
1152 |
+
without_timestamps=options.without_timestamps,
|
1153 |
+
prefix=options.prefix if seek == 0 else None,
|
1154 |
+
hotwords=options.hotwords,
|
1155 |
+
)
|
1156 |
+
|
1157 |
+
(
|
1158 |
+
result,
|
1159 |
+
avg_logprob,
|
1160 |
+
temperature,
|
1161 |
+
compression_ratio,
|
1162 |
+
) = self.generate_with_fallback(encoder_output, prompt, tokenizer, options)
|
1163 |
+
|
1164 |
+
if options.no_speech_threshold is not None:
|
1165 |
+
# no voice activity check
|
1166 |
+
should_skip = result.no_speech_prob > options.no_speech_threshold
|
1167 |
+
|
1168 |
+
if (
|
1169 |
+
options.log_prob_threshold is not None
|
1170 |
+
and avg_logprob > options.log_prob_threshold
|
1171 |
+
):
|
1172 |
+
# don't skip if the logprob is high enough, despite the no_speech_prob
|
1173 |
+
should_skip = False
|
1174 |
+
|
1175 |
+
if should_skip:
|
1176 |
+
self.logger.debug(
|
1177 |
+
"No speech threshold is met (%f > %f)",
|
1178 |
+
result.no_speech_prob,
|
1179 |
+
options.no_speech_threshold,
|
1180 |
+
)
|
1181 |
+
|
1182 |
+
# fast-forward to the next segment boundary
|
1183 |
+
seek += segment_size
|
1184 |
+
continue
|
1185 |
+
|
1186 |
+
tokens = result.sequences_ids[0]
|
1187 |
+
|
1188 |
+
previous_seek = seek
|
1189 |
+
|
1190 |
+
# anomalous words are very long/short/improbable
|
1191 |
+
def word_anomaly_score(word: dict) -> float:
|
1192 |
+
probability = word.get("probability", 0.0)
|
1193 |
+
duration = word["end"] - word["start"]
|
1194 |
+
score = 0.0
|
1195 |
+
if probability < 0.15:
|
1196 |
+
score += 1.0
|
1197 |
+
if duration < 0.133:
|
1198 |
+
score += (0.133 - duration) * 15
|
1199 |
+
if duration > 2.0:
|
1200 |
+
score += duration - 2.0
|
1201 |
+
return score
|
1202 |
+
|
1203 |
+
def is_segment_anomaly(segment: Optional[dict]) -> bool:
|
1204 |
+
if segment is None or not segment["words"]:
|
1205 |
+
return False
|
1206 |
+
words = [w for w in segment["words"] if w["word"] not in punctuation]
|
1207 |
+
words = words[:8]
|
1208 |
+
score = sum(word_anomaly_score(w) for w in words)
|
1209 |
+
return score >= 3 or score + 0.01 >= len(words)
|
1210 |
+
|
1211 |
+
def next_words_segment(segments: List[dict]) -> Optional[dict]:
|
1212 |
+
return next((s for s in segments if s["words"]), None)
|
1213 |
+
|
1214 |
+
(
|
1215 |
+
current_segments,
|
1216 |
+
seek,
|
1217 |
+
single_timestamp_ending,
|
1218 |
+
) = self._split_segments_by_timestamps(
|
1219 |
+
tokenizer=tokenizer,
|
1220 |
+
tokens=tokens,
|
1221 |
+
time_offset=time_offset,
|
1222 |
+
segment_size=segment_size,
|
1223 |
+
segment_duration=segment_duration,
|
1224 |
+
seek=seek,
|
1225 |
+
)
|
1226 |
+
|
1227 |
+
if options.word_timestamps:
|
1228 |
+
self.add_word_timestamps(
|
1229 |
+
[current_segments],
|
1230 |
+
tokenizer,
|
1231 |
+
encoder_output,
|
1232 |
+
segment_size,
|
1233 |
+
options.prepend_punctuations,
|
1234 |
+
options.append_punctuations,
|
1235 |
+
last_speech_timestamp=last_speech_timestamp,
|
1236 |
+
)
|
1237 |
+
if not single_timestamp_ending:
|
1238 |
+
last_word_end = get_end(current_segments)
|
1239 |
+
if last_word_end is not None and last_word_end > time_offset:
|
1240 |
+
seek = round(last_word_end * self.frames_per_second)
|
1241 |
+
|
1242 |
+
# skip silence before possible hallucinations
|
1243 |
+
if options.hallucination_silence_threshold is not None:
|
1244 |
+
threshold = options.hallucination_silence_threshold
|
1245 |
+
|
1246 |
+
# if first segment might be a hallucination, skip leading silence
|
1247 |
+
first_segment = next_words_segment(current_segments)
|
1248 |
+
if first_segment is not None and is_segment_anomaly(first_segment):
|
1249 |
+
gap = first_segment["start"] - time_offset
|
1250 |
+
if gap > threshold:
|
1251 |
+
seek = previous_seek + round(gap * self.frames_per_second)
|
1252 |
+
continue
|
1253 |
+
|
1254 |
+
# skip silence before any possible hallucination that is surrounded
|
1255 |
+
# by silence or more hallucinations
|
1256 |
+
hal_last_end = last_speech_timestamp
|
1257 |
+
for si in range(len(current_segments)):
|
1258 |
+
segment = current_segments[si]
|
1259 |
+
if not segment["words"]:
|
1260 |
+
continue
|
1261 |
+
if is_segment_anomaly(segment):
|
1262 |
+
next_segment = next_words_segment(
|
1263 |
+
current_segments[si + 1 :]
|
1264 |
+
)
|
1265 |
+
if next_segment is not None:
|
1266 |
+
hal_next_start = next_segment["words"][0]["start"]
|
1267 |
+
else:
|
1268 |
+
hal_next_start = time_offset + segment_duration
|
1269 |
+
silence_before = (
|
1270 |
+
segment["start"] - hal_last_end > threshold
|
1271 |
+
or segment["start"] < threshold
|
1272 |
+
or segment["start"] - time_offset < 2.0
|
1273 |
+
)
|
1274 |
+
silence_after = (
|
1275 |
+
hal_next_start - segment["end"] > threshold
|
1276 |
+
or is_segment_anomaly(next_segment)
|
1277 |
+
or window_end_time - segment["end"] < 2.0
|
1278 |
+
)
|
1279 |
+
if silence_before and silence_after:
|
1280 |
+
seek = round(
|
1281 |
+
max(time_offset + 1, segment["start"])
|
1282 |
+
* self.frames_per_second
|
1283 |
+
)
|
1284 |
+
if content_duration - segment["end"] < threshold:
|
1285 |
+
seek = content_frames
|
1286 |
+
current_segments[si:] = []
|
1287 |
+
break
|
1288 |
+
hal_last_end = segment["end"]
|
1289 |
+
|
1290 |
+
last_word_end = get_end(current_segments)
|
1291 |
+
if last_word_end is not None:
|
1292 |
+
last_speech_timestamp = last_word_end
|
1293 |
+
for segment in current_segments:
|
1294 |
+
tokens = segment["tokens"]
|
1295 |
+
text = tokenizer.decode(tokens)
|
1296 |
+
|
1297 |
+
if segment["start"] == segment["end"] or not text.strip():
|
1298 |
+
continue
|
1299 |
+
|
1300 |
+
all_tokens.extend(tokens)
|
1301 |
+
idx += 1
|
1302 |
+
|
1303 |
+
all_segments.append(Segment(
|
1304 |
+
id=idx,
|
1305 |
+
seek=previous_seek,
|
1306 |
+
start=segment["start"],
|
1307 |
+
end=segment["end"],
|
1308 |
+
text=text,
|
1309 |
+
tokens=tokens,
|
1310 |
+
temperature=temperature,
|
1311 |
+
avg_logprob=avg_logprob,
|
1312 |
+
compression_ratio=compression_ratio,
|
1313 |
+
no_speech_prob=result.no_speech_prob,
|
1314 |
+
words=(
|
1315 |
+
[Word(**word) for word in segment["words"]]
|
1316 |
+
if options.word_timestamps
|
1317 |
+
else None
|
1318 |
+
),
|
1319 |
+
))
|
1320 |
+
|
1321 |
+
if (
|
1322 |
+
not options.condition_on_previous_text
|
1323 |
+
or temperature > options.prompt_reset_on_temperature
|
1324 |
+
):
|
1325 |
+
if options.condition_on_previous_text:
|
1326 |
+
self.logger.debug(
|
1327 |
+
"Reset prompt. prompt_reset_on_temperature threshold is met %f > %f",
|
1328 |
+
temperature,
|
1329 |
+
options.prompt_reset_on_temperature,
|
1330 |
+
)
|
1331 |
+
|
1332 |
+
prompt_reset_since = len(all_tokens)
|
1333 |
+
|
1334 |
+
pbar.update(
|
1335 |
+
(min(content_frames, seek) - previous_seek)
|
1336 |
+
* self.feature_extractor.time_per_frame,
|
1337 |
+
)
|
1338 |
+
pbar.close()
|
1339 |
+
return all_segments
|
1340 |
+
|
1341 |
+
def encode(self, features: np.ndarray) -> ctranslate2.StorageView:
|
1342 |
+
# When the model is running on multiple GPUs, the encoder output should be moved
|
1343 |
+
# to the CPU since we don't know which GPU will handle the next job.
|
1344 |
+
to_cpu = self.model.device == "cuda" and len(self.model.device_index) > 1
|
1345 |
+
|
1346 |
+
if features.ndim == 2:
|
1347 |
+
features = np.expand_dims(features, 0)
|
1348 |
+
features = get_ctranslate2_storage(features)
|
1349 |
+
|
1350 |
+
return self.model.encode(features, to_cpu=to_cpu)
|
1351 |
+
|
1352 |
+
def generate_with_fallback(
|
1353 |
+
self,
|
1354 |
+
encoder_output: ctranslate2.StorageView,
|
1355 |
+
prompt: List[int],
|
1356 |
+
tokenizer: Tokenizer,
|
1357 |
+
options: TranscriptionOptions,
|
1358 |
+
) -> Tuple[ctranslate2.models.WhisperGenerationResult, float, float, float]:
|
1359 |
+
decode_result = None
|
1360 |
+
all_results = []
|
1361 |
+
below_cr_threshold_results = []
|
1362 |
+
|
1363 |
+
max_initial_timestamp_index = int(
|
1364 |
+
round(options.max_initial_timestamp / self.time_precision)
|
1365 |
+
)
|
1366 |
+
if options.max_new_tokens is not None:
|
1367 |
+
max_length = len(prompt) + options.max_new_tokens
|
1368 |
+
else:
|
1369 |
+
max_length = self.max_length
|
1370 |
+
|
1371 |
+
if max_length > self.max_length:
|
1372 |
+
raise ValueError(
|
1373 |
+
f"The length of the prompt is {len(prompt)}, and the `max_new_tokens` "
|
1374 |
+
f"{max_length - len(prompt)}. Thus, the combined length of the prompt "
|
1375 |
+
f"and `max_new_tokens` is: {max_length}. This exceeds the "
|
1376 |
+
f"`max_length` of the Whisper model: {self.max_length}. "
|
1377 |
+
"You should either reduce the length of your prompt, or "
|
1378 |
+
"reduce the value of `max_new_tokens`, "
|
1379 |
+
f"so that their combined length is less that {self.max_length}."
|
1380 |
+
)
|
1381 |
+
|
1382 |
+
for temperature in options.temperatures:
|
1383 |
+
if temperature > 0:
|
1384 |
+
kwargs = {
|
1385 |
+
"beam_size": 1,
|
1386 |
+
"num_hypotheses": options.best_of,
|
1387 |
+
"sampling_topk": 0,
|
1388 |
+
"sampling_temperature": temperature,
|
1389 |
+
}
|
1390 |
+
else:
|
1391 |
+
kwargs = {
|
1392 |
+
"beam_size": options.beam_size,
|
1393 |
+
"patience": options.patience,
|
1394 |
+
}
|
1395 |
+
|
1396 |
+
result = self.model.generate(
|
1397 |
+
encoder_output,
|
1398 |
+
[prompt],
|
1399 |
+
length_penalty=options.length_penalty,
|
1400 |
+
repetition_penalty=options.repetition_penalty,
|
1401 |
+
no_repeat_ngram_size=options.no_repeat_ngram_size,
|
1402 |
+
max_length=max_length,
|
1403 |
+
return_scores=True,
|
1404 |
+
return_no_speech_prob=True,
|
1405 |
+
suppress_blank=options.suppress_blank,
|
1406 |
+
suppress_tokens=options.suppress_tokens,
|
1407 |
+
max_initial_timestamp_index=max_initial_timestamp_index,
|
1408 |
+
**kwargs,
|
1409 |
+
)[0]
|
1410 |
+
|
1411 |
+
tokens = result.sequences_ids[0]
|
1412 |
+
|
1413 |
+
# Recover the average log prob from the returned score.
|
1414 |
+
seq_len = len(tokens)
|
1415 |
+
cum_logprob = result.scores[0] * (seq_len**options.length_penalty)
|
1416 |
+
avg_logprob = cum_logprob / (seq_len + 1)
|
1417 |
+
|
1418 |
+
text = tokenizer.decode(tokens).strip()
|
1419 |
+
compression_ratio = get_compression_ratio(text)
|
1420 |
+
|
1421 |
+
decode_result = (
|
1422 |
+
result,
|
1423 |
+
avg_logprob,
|
1424 |
+
temperature,
|
1425 |
+
compression_ratio,
|
1426 |
+
)
|
1427 |
+
all_results.append(decode_result)
|
1428 |
+
|
1429 |
+
needs_fallback = False
|
1430 |
+
|
1431 |
+
if options.compression_ratio_threshold is not None:
|
1432 |
+
if compression_ratio > options.compression_ratio_threshold:
|
1433 |
+
needs_fallback = True # too repetitive
|
1434 |
+
|
1435 |
+
self.logger.debug(
|
1436 |
+
"Compression ratio threshold is not met with temperature %.1f (%f > %f)",
|
1437 |
+
temperature,
|
1438 |
+
compression_ratio,
|
1439 |
+
options.compression_ratio_threshold,
|
1440 |
+
)
|
1441 |
+
else:
|
1442 |
+
below_cr_threshold_results.append(decode_result)
|
1443 |
+
|
1444 |
+
if (
|
1445 |
+
options.log_prob_threshold is not None
|
1446 |
+
and avg_logprob < options.log_prob_threshold
|
1447 |
+
):
|
1448 |
+
needs_fallback = True # average log probability is too low
|
1449 |
+
|
1450 |
+
self.logger.debug(
|
1451 |
+
"Log probability threshold is not met with temperature %.1f (%f < %f)",
|
1452 |
+
temperature,
|
1453 |
+
avg_logprob,
|
1454 |
+
options.log_prob_threshold,
|
1455 |
+
)
|
1456 |
+
|
1457 |
+
if (
|
1458 |
+
options.no_speech_threshold is not None
|
1459 |
+
and result.no_speech_prob > options.no_speech_threshold
|
1460 |
+
and options.log_prob_threshold is not None
|
1461 |
+
and avg_logprob < options.log_prob_threshold
|
1462 |
+
):
|
1463 |
+
needs_fallback = False # silence
|
1464 |
+
|
1465 |
+
if not needs_fallback:
|
1466 |
+
break
|
1467 |
+
else:
|
1468 |
+
# all failed, select the result with the highest average log probability
|
1469 |
+
decode_result = max(
|
1470 |
+
below_cr_threshold_results or all_results, key=lambda x: x[1]
|
1471 |
+
)
|
1472 |
+
# to pass final temperature for prompt_reset_on_temperature
|
1473 |
+
decode_result = (
|
1474 |
+
decode_result[0],
|
1475 |
+
decode_result[1],
|
1476 |
+
temperature,
|
1477 |
+
decode_result[3],
|
1478 |
+
)
|
1479 |
+
|
1480 |
+
return decode_result
|
1481 |
+
|
1482 |
+
def get_prompt(
|
1483 |
+
self,
|
1484 |
+
tokenizer: Tokenizer,
|
1485 |
+
previous_tokens: List[int],
|
1486 |
+
without_timestamps: bool = False,
|
1487 |
+
prefix: Optional[str] = None,
|
1488 |
+
hotwords: Optional[str] = None,
|
1489 |
+
) -> List[int]:
|
1490 |
+
prompt = []
|
1491 |
+
|
1492 |
+
if previous_tokens or (hotwords and not prefix):
|
1493 |
+
prompt.append(tokenizer.sot_prev)
|
1494 |
+
if hotwords and not prefix:
|
1495 |
+
hotwords_tokens = tokenizer.encode(" " + hotwords.strip())
|
1496 |
+
if len(hotwords_tokens) >= self.max_length // 2:
|
1497 |
+
hotwords_tokens = hotwords_tokens[: self.max_length // 2 - 1]
|
1498 |
+
prompt.extend(hotwords_tokens)
|
1499 |
+
if previous_tokens:
|
1500 |
+
prompt.extend(previous_tokens[-(self.max_length // 2 - 1) :])
|
1501 |
+
|
1502 |
+
prompt.extend(tokenizer.sot_sequence)
|
1503 |
+
|
1504 |
+
if without_timestamps:
|
1505 |
+
prompt.append(tokenizer.no_timestamps)
|
1506 |
+
|
1507 |
+
if prefix:
|
1508 |
+
prefix_tokens = tokenizer.encode(" " + prefix.strip())
|
1509 |
+
if len(prefix_tokens) >= self.max_length // 2:
|
1510 |
+
prefix_tokens = prefix_tokens[: self.max_length // 2 - 1]
|
1511 |
+
if not without_timestamps:
|
1512 |
+
prompt.append(tokenizer.timestamp_begin)
|
1513 |
+
prompt.extend(prefix_tokens)
|
1514 |
+
|
1515 |
+
return prompt
|
1516 |
+
|
1517 |
+
def add_word_timestamps(
|
1518 |
+
self,
|
1519 |
+
segments: List[dict],
|
1520 |
+
tokenizer: Tokenizer,
|
1521 |
+
encoder_output: ctranslate2.StorageView,
|
1522 |
+
num_frames: int,
|
1523 |
+
prepend_punctuations: str,
|
1524 |
+
append_punctuations: str,
|
1525 |
+
last_speech_timestamp: float,
|
1526 |
+
) -> float:
|
1527 |
+
if len(segments) == 0:
|
1528 |
+
return
|
1529 |
+
|
1530 |
+
text_tokens = []
|
1531 |
+
text_tokens_per_segment = []
|
1532 |
+
for segment in segments:
|
1533 |
+
segment_tokens = [
|
1534 |
+
[token for token in subsegment["tokens"] if token < tokenizer.eot]
|
1535 |
+
for subsegment in segment
|
1536 |
+
]
|
1537 |
+
text_tokens.append(list(itertools.chain.from_iterable(segment_tokens)))
|
1538 |
+
text_tokens_per_segment.append(segment_tokens)
|
1539 |
+
|
1540 |
+
alignments = self.find_alignment(
|
1541 |
+
tokenizer, text_tokens, encoder_output, num_frames
|
1542 |
+
)
|
1543 |
+
median_max_durations = []
|
1544 |
+
for alignment in alignments:
|
1545 |
+
word_durations = np.array(
|
1546 |
+
[word["end"] - word["start"] for word in alignment]
|
1547 |
+
)
|
1548 |
+
word_durations = word_durations[word_durations.nonzero()]
|
1549 |
+
median_duration = (
|
1550 |
+
np.median(word_durations) if len(word_durations) > 0 else 0.0
|
1551 |
+
)
|
1552 |
+
median_duration = min(0.7, float(median_duration))
|
1553 |
+
max_duration = median_duration * 2
|
1554 |
+
|
1555 |
+
# hack: truncate long words at sentence boundaries.
|
1556 |
+
# a better segmentation algorithm based on VAD should be able to replace this.
|
1557 |
+
if len(word_durations) > 0:
|
1558 |
+
sentence_end_marks = ".。!!??"
|
1559 |
+
# ensure words at sentence boundaries
|
1560 |
+
# are not longer than twice the median word duration.
|
1561 |
+
for i in range(1, len(alignment)):
|
1562 |
+
if alignment[i]["end"] - alignment[i]["start"] > max_duration:
|
1563 |
+
if alignment[i]["word"] in sentence_end_marks:
|
1564 |
+
alignment[i]["end"] = alignment[i]["start"] + max_duration
|
1565 |
+
elif alignment[i - 1]["word"] in sentence_end_marks:
|
1566 |
+
alignment[i]["start"] = alignment[i]["end"] - max_duration
|
1567 |
+
|
1568 |
+
merge_punctuations(alignment, prepend_punctuations, append_punctuations)
|
1569 |
+
median_max_durations.append((median_duration, max_duration))
|
1570 |
+
|
1571 |
+
for segment_idx, segment in enumerate(segments):
|
1572 |
+
word_index = 0
|
1573 |
+
time_offset = segment[0]["seek"] / self.frames_per_second
|
1574 |
+
median_duration, max_duration = median_max_durations[segment_idx]
|
1575 |
+
for subsegment_idx, subsegment in enumerate(segment):
|
1576 |
+
saved_tokens = 0
|
1577 |
+
words = []
|
1578 |
+
|
1579 |
+
while word_index < len(alignments[segment_idx]) and saved_tokens < len(
|
1580 |
+
text_tokens_per_segment[segment_idx][subsegment_idx]
|
1581 |
+
):
|
1582 |
+
timing = alignments[segment_idx][word_index]
|
1583 |
+
|
1584 |
+
if timing["word"]:
|
1585 |
+
words.append(
|
1586 |
+
dict(
|
1587 |
+
word=timing["word"],
|
1588 |
+
start=round(time_offset + timing["start"], 2),
|
1589 |
+
end=round(time_offset + timing["end"], 2),
|
1590 |
+
probability=timing["probability"],
|
1591 |
+
)
|
1592 |
+
)
|
1593 |
+
|
1594 |
+
saved_tokens += len(timing["tokens"])
|
1595 |
+
word_index += 1
|
1596 |
+
|
1597 |
+
# hack: truncate long words at segment boundaries.
|
1598 |
+
# a better segmentation algorithm based on VAD should be able to replace this.
|
1599 |
+
if len(words) > 0:
|
1600 |
+
# ensure the first and second word after a pause is not longer than
|
1601 |
+
# twice the median word duration.
|
1602 |
+
if words[0][
|
1603 |
+
"end"
|
1604 |
+
] - last_speech_timestamp > median_duration * 4 and (
|
1605 |
+
words[0]["end"] - words[0]["start"] > max_duration
|
1606 |
+
or (
|
1607 |
+
len(words) > 1
|
1608 |
+
and words[1]["end"] - words[0]["start"] > max_duration * 2
|
1609 |
+
)
|
1610 |
+
):
|
1611 |
+
if (
|
1612 |
+
len(words) > 1
|
1613 |
+
and words[1]["end"] - words[1]["start"] > max_duration
|
1614 |
+
):
|
1615 |
+
boundary = max(
|
1616 |
+
words[1]["end"] / 2, words[1]["end"] - max_duration
|
1617 |
+
)
|
1618 |
+
words[0]["end"] = words[1]["start"] = boundary
|
1619 |
+
words[0]["start"] = max(0, words[0]["end"] - max_duration)
|
1620 |
+
|
1621 |
+
# prefer the segment-level start timestamp if the first word is too long.
|
1622 |
+
if (
|
1623 |
+
subsegment["start"] < words[0]["end"]
|
1624 |
+
and subsegment["start"] - 0.5 > words[0]["start"]
|
1625 |
+
):
|
1626 |
+
words[0]["start"] = max(
|
1627 |
+
0,
|
1628 |
+
min(words[0]["end"] - median_duration, subsegment["start"]),
|
1629 |
+
)
|
1630 |
+
else:
|
1631 |
+
subsegment["start"] = words[0]["start"]
|
1632 |
+
|
1633 |
+
# prefer the segment-level end timestamp if the last word is too long.
|
1634 |
+
if (
|
1635 |
+
subsegment["end"] > words[-1]["start"]
|
1636 |
+
and subsegment["end"] + 0.5 < words[-1]["end"]
|
1637 |
+
):
|
1638 |
+
words[-1]["end"] = max(
|
1639 |
+
words[-1]["start"] + median_duration, subsegment["end"]
|
1640 |
+
)
|
1641 |
+
else:
|
1642 |
+
subsegment["end"] = words[-1]["end"]
|
1643 |
+
|
1644 |
+
last_speech_timestamp = subsegment["end"]
|
1645 |
+
segments[segment_idx][subsegment_idx]["words"] = words
|
1646 |
+
return last_speech_timestamp
|
1647 |
+
|
1648 |
+
def find_alignment(
|
1649 |
+
self,
|
1650 |
+
tokenizer: Tokenizer,
|
1651 |
+
text_tokens: List[int],
|
1652 |
+
encoder_output: ctranslate2.StorageView,
|
1653 |
+
num_frames: int,
|
1654 |
+
median_filter_width: int = 7,
|
1655 |
+
) -> List[dict]:
|
1656 |
+
if len(text_tokens) == 0:
|
1657 |
+
return []
|
1658 |
+
|
1659 |
+
results = self.model.align(
|
1660 |
+
encoder_output,
|
1661 |
+
tokenizer.sot_sequence,
|
1662 |
+
text_tokens,
|
1663 |
+
num_frames,
|
1664 |
+
median_filter_width=median_filter_width,
|
1665 |
+
)
|
1666 |
+
return_list = []
|
1667 |
+
for result, text_token in zip(results, text_tokens):
|
1668 |
+
text_token_probs = result.text_token_probs
|
1669 |
+
alignments = result.alignments
|
1670 |
+
text_indices = np.array([pair[0] for pair in alignments])
|
1671 |
+
time_indices = np.array([pair[1] for pair in alignments])
|
1672 |
+
|
1673 |
+
words, word_tokens = tokenizer.split_to_word_tokens(
|
1674 |
+
text_token + [tokenizer.eot]
|
1675 |
+
)
|
1676 |
+
if len(word_tokens) <= 1:
|
1677 |
+
# return on eot only
|
1678 |
+
# >>> np.pad([], (1, 0))
|
1679 |
+
# array([0.])
|
1680 |
+
# This results in crashes when we lookup jump_times with float, like
|
1681 |
+
# IndexError: arrays used as indices must be of integer (or boolean) type
|
1682 |
+
return_list.append([])
|
1683 |
+
continue
|
1684 |
+
word_boundaries = np.pad(
|
1685 |
+
np.cumsum([len(t) for t in word_tokens[:-1]]), (1, 0)
|
1686 |
+
)
|
1687 |
+
if len(word_boundaries) <= 1:
|
1688 |
+
return_list.append([])
|
1689 |
+
continue
|
1690 |
+
|
1691 |
+
jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(
|
1692 |
+
bool
|
1693 |
+
)
|
1694 |
+
jump_times = time_indices[jumps] / self.tokens_per_second
|
1695 |
+
start_times = jump_times[word_boundaries[:-1]]
|
1696 |
+
end_times = jump_times[word_boundaries[1:]]
|
1697 |
+
word_probabilities = [
|
1698 |
+
np.mean(text_token_probs[i:j])
|
1699 |
+
for i, j in zip(word_boundaries[:-1], word_boundaries[1:])
|
1700 |
+
]
|
1701 |
+
|
1702 |
+
return_list.append(
|
1703 |
+
[
|
1704 |
+
dict(
|
1705 |
+
word=word,
|
1706 |
+
tokens=tokens,
|
1707 |
+
start=start,
|
1708 |
+
end=end,
|
1709 |
+
probability=probability,
|
1710 |
+
)
|
1711 |
+
for word, tokens, start, end, probability in zip(
|
1712 |
+
words, word_tokens, start_times, end_times, word_probabilities
|
1713 |
+
)
|
1714 |
+
]
|
1715 |
+
)
|
1716 |
+
return return_list
|
1717 |
+
|
1718 |
+
def detect_language(
|
1719 |
+
self,
|
1720 |
+
audio: Optional[np.ndarray] = None,
|
1721 |
+
features: Optional[np.ndarray] = None,
|
1722 |
+
vad_filter: bool = False,
|
1723 |
+
vad_parameters: Union[dict, VadOptions] = None,
|
1724 |
+
language_detection_segments: int = 1,
|
1725 |
+
language_detection_threshold: float = 0.5,
|
1726 |
+
) -> Tuple[str, float, List[Tuple[str, float]]]:
|
1727 |
+
"""
|
1728 |
+
Use Whisper to detect the language of the input audio or features.
|
1729 |
+
|
1730 |
+
Arguments:
|
1731 |
+
audio: Input audio signal, must be a 1D float array sampled at 16khz.
|
1732 |
+
features: Input Mel spectrogram features, must be a float array with
|
1733 |
+
shape (n_mels, n_frames), if `audio` is provided, the features will be ignored.
|
1734 |
+
Either `audio` or `features` must be provided.
|
1735 |
+
vad_filter: Enable the voice activity detection (VAD) to filter out parts of the audio
|
1736 |
+
without speech. This step is using the Silero VAD model.
|
1737 |
+
vad_parameters: Dictionary of Silero VAD parameters or VadOptions class (see available
|
1738 |
+
parameters and default values in the class `VadOptions`).
|
1739 |
+
language_detection_threshold: If the maximum probability of the language tokens is
|
1740 |
+
higher than this value, the language is detected.
|
1741 |
+
language_detection_segments: Number of segments to consider for the language detection.
|
1742 |
+
|
1743 |
+
Returns:
|
1744 |
+
language: Detected language.
|
1745 |
+
language_probability: Probability of the detected language.
|
1746 |
+
all_language_probs: List of tuples with all language names and probabilities.
|
1747 |
+
"""
|
1748 |
+
assert (
|
1749 |
+
audio is not None or features is not None
|
1750 |
+
), "Either `audio` or `features` must be provided."
|
1751 |
+
|
1752 |
+
if audio is not None:
|
1753 |
+
if vad_filter:
|
1754 |
+
speech_chunks = get_speech_timestamps(audio, vad_parameters)
|
1755 |
+
audio_chunks, chunks_metadata = collect_chunks(audio, speech_chunks)
|
1756 |
+
audio = np.concatenate(audio_chunks, axis=0)
|
1757 |
+
|
1758 |
+
audio = audio[
|
1759 |
+
: language_detection_segments * self.feature_extractor.n_samples
|
1760 |
+
]
|
1761 |
+
features = self.feature_extractor(audio)
|
1762 |
+
|
1763 |
+
features = features[
|
1764 |
+
..., : language_detection_segments * self.feature_extractor.nb_max_frames
|
1765 |
+
]
|
1766 |
+
|
1767 |
+
detected_language_info = {}
|
1768 |
+
for i in range(0, features.shape[-1], self.feature_extractor.nb_max_frames):
|
1769 |
+
encoder_output = self.encode(
|
1770 |
+
pad_or_trim(features[..., i : i + self.feature_extractor.nb_max_frames])
|
1771 |
+
)
|
1772 |
+
# results is a list of tuple[str, float] with language names and probabilities.
|
1773 |
+
results = self.model.detect_language(encoder_output)[0]
|
1774 |
+
|
1775 |
+
# Parse language names to strip out markers
|
1776 |
+
all_language_probs = [(token[2:-2], prob) for (token, prob) in results]
|
1777 |
+
# Get top language token and probability
|
1778 |
+
language, language_probability = all_language_probs[0]
|
1779 |
+
if language_probability > language_detection_threshold:
|
1780 |
+
break
|
1781 |
+
detected_language_info.setdefault(language, []).append(language_probability)
|
1782 |
+
else:
|
1783 |
+
# If no language detected for all segments, the majority vote of the highest
|
1784 |
+
# projected languages for all segments is used to determine the language.
|
1785 |
+
language = max(
|
1786 |
+
detected_language_info,
|
1787 |
+
key=lambda lang: len(detected_language_info[lang]),
|
1788 |
+
)
|
1789 |
+
language_probability = max(detected_language_info[language])
|
1790 |
+
|
1791 |
+
return language, language_probability, all_language_probs
|
1792 |
+
|
1793 |
+
|
1794 |
+
def restore_speech_timestamps(
|
1795 |
+
segments: Iterable[Segment],
|
1796 |
+
speech_chunks: List[dict],
|
1797 |
+
sampling_rate: int,
|
1798 |
+
) -> Iterable[Segment]:
|
1799 |
+
ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
|
1800 |
+
|
1801 |
+
for segment in segments:
|
1802 |
+
if segment.words:
|
1803 |
+
words = []
|
1804 |
+
for word in segment.words:
|
1805 |
+
# Ensure the word start and end times are resolved to the same chunk.
|
1806 |
+
middle = (word.start + word.end) / 2
|
1807 |
+
chunk_index = ts_map.get_chunk_index(middle)
|
1808 |
+
word.start = ts_map.get_original_time(word.start, chunk_index)
|
1809 |
+
word.end = ts_map.get_original_time(word.end, chunk_index)
|
1810 |
+
words.append(word)
|
1811 |
+
|
1812 |
+
segment.start = words[0].start
|
1813 |
+
segment.end = words[-1].end
|
1814 |
+
segment.words = words
|
1815 |
+
|
1816 |
+
else:
|
1817 |
+
segment.start = ts_map.get_original_time(segment.start)
|
1818 |
+
segment.end = ts_map.get_original_time(segment.end)
|
1819 |
+
return segments
|
1820 |
+
|
1821 |
+
|
1822 |
+
def get_ctranslate2_storage(segment: np.ndarray) -> ctranslate2.StorageView:
|
1823 |
+
segment = np.ascontiguousarray(segment)
|
1824 |
+
segment = ctranslate2.StorageView.from_array(segment)
|
1825 |
+
return segment
|
1826 |
+
|
1827 |
+
|
1828 |
+
def get_compression_ratio(text: str) -> float:
|
1829 |
+
text_bytes = text.encode("utf-8")
|
1830 |
+
return len(text_bytes) / len(zlib.compress(text_bytes))
|
1831 |
+
|
1832 |
+
|
1833 |
+
def get_suppressed_tokens(
|
1834 |
+
tokenizer: Tokenizer,
|
1835 |
+
suppress_tokens: Tuple[int],
|
1836 |
+
) -> Optional[List[int]]:
|
1837 |
+
if -1 in suppress_tokens:
|
1838 |
+
suppress_tokens = [t for t in suppress_tokens if t >= 0]
|
1839 |
+
suppress_tokens.extend(tokenizer.non_speech_tokens)
|
1840 |
+
elif suppress_tokens is None or len(suppress_tokens) == 0:
|
1841 |
+
suppress_tokens = [] # interpret empty string as an empty list
|
1842 |
+
else:
|
1843 |
+
assert isinstance(suppress_tokens, list), "suppress_tokens must be a list"
|
1844 |
+
|
1845 |
+
suppress_tokens.extend(
|
1846 |
+
[
|
1847 |
+
tokenizer.transcribe,
|
1848 |
+
tokenizer.translate,
|
1849 |
+
tokenizer.sot,
|
1850 |
+
tokenizer.sot_prev,
|
1851 |
+
tokenizer.sot_lm,
|
1852 |
+
]
|
1853 |
+
)
|
1854 |
+
|
1855 |
+
return tuple(sorted(set(suppress_tokens)))
|
1856 |
+
|
1857 |
+
|
1858 |
+
def merge_punctuations(alignment: List[dict], prepended: str, appended: str) -> None:
|
1859 |
+
# merge prepended punctuations
|
1860 |
+
i = len(alignment) - 2
|
1861 |
+
j = len(alignment) - 1
|
1862 |
+
while i >= 0:
|
1863 |
+
previous = alignment[i]
|
1864 |
+
following = alignment[j]
|
1865 |
+
if previous["word"].startswith(" ") and previous["word"].strip() in prepended:
|
1866 |
+
# prepend it to the following word
|
1867 |
+
following["word"] = previous["word"] + following["word"]
|
1868 |
+
following["tokens"] = previous["tokens"] + following["tokens"]
|
1869 |
+
previous["word"] = ""
|
1870 |
+
previous["tokens"] = []
|
1871 |
+
else:
|
1872 |
+
j = i
|
1873 |
+
i -= 1
|
1874 |
+
|
1875 |
+
# merge appended punctuations
|
1876 |
+
i = 0
|
1877 |
+
j = 1
|
1878 |
+
while j < len(alignment):
|
1879 |
+
previous = alignment[i]
|
1880 |
+
following = alignment[j]
|
1881 |
+
if not previous["word"].endswith(" ") and following["word"] in appended:
|
1882 |
+
# append it to the previous word
|
1883 |
+
previous["word"] = previous["word"] + following["word"]
|
1884 |
+
previous["tokens"] = previous["tokens"] + following["tokens"]
|
1885 |
+
following["word"] = ""
|
1886 |
+
following["tokens"] = []
|
1887 |
+
else:
|
1888 |
+
i = j
|
1889 |
+
j += 1
|
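The vendored faster-whisper transcriber above keeps upstream's seek-based decoding loop, temperature fallback, and optional word timestamps. A minimal usage sketch, assuming the module exposes upstream's WhisperModel entry point and that a 16 kHz audio file is available (both are assumptions, not confirmed by this commit):

# Usage sketch (editor's note, not part of the commit); the import path and
# WhisperModel name are assumed from the upstream faster-whisper API.
from whisper_live.transcriber.transcriber_faster_whisper import WhisperModel

model = WhisperModel("small", device="cpu", compute_type="int8")
segments, info = model.transcribe("audio.wav", vad_filter=True, word_timestamps=True)
for segment in segments:
    print(f"[{segment.start:.2f} -> {segment.end:.2f}] {segment.text}")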
whisper_live/transcriber/transcriber_openvino.py
ADDED
@@ -0,0 +1,23 @@
import librosa
import os

import openvino_genai as ov_genai
import huggingface_hub as hf_hub


class WhisperOpenVINO(object):
    def __init__(self, model_id="OpenVINO/whisper-tiny-fp16-ov", device="CPU", language="en", task="transcribe"):
        model_path = model_id.split('/')[-1]
        cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "openvino_whisper_models")
        os.makedirs(cache_dir, exist_ok=True)
        model_path = os.path.join(cache_dir, model_path)
        if not os.path.exists(model_path):
            hf_hub.snapshot_download(model_id, local_dir=model_path)
        self.model = ov_genai.WhisperPipeline(str(model_path), device=device)
        self.language = language
        self.task = task

    def transcribe(self, input_audio):
        outputs = self.model.generate(input_audio, return_timestamps=True, language=self.language, task=self.task)
        outputs = [seg for seg in outputs.chunks]
        return outputs
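A minimal usage sketch for the OpenVINO backend above; the sample file name is a placeholder, and the chunk attribute names follow openvino_genai's chunk objects rather than anything defined in this commit:

# Usage sketch (editor's note, not part of the commit); "sample.wav" is a placeholder.
import librosa
from whisper_live.transcriber.transcriber_openvino import WhisperOpenVINO

audio, _ = librosa.load("sample.wav", sr=16000)
model = WhisperOpenVINO(model_id="OpenVINO/whisper-tiny-fp16-ov", device="CPU", language="en")
for chunk in model.transcribe(audio):
    print(chunk.start_ts, chunk.end_ts, chunk.text)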
whisper_live/transcriber/transcriber_tensorrt.py
ADDED
@@ -0,0 +1,479 @@
1 |
+
import json
|
2 |
+
import re
|
3 |
+
import math
|
4 |
+
from collections import OrderedDict
|
5 |
+
from pathlib import Path
|
6 |
+
from typing import Union
|
7 |
+
|
8 |
+
import torch
|
9 |
+
import numpy as np
|
10 |
+
import torch.nn.functional as F
|
11 |
+
from whisper.tokenizer import get_tokenizer
|
12 |
+
from whisper_live.transcriber.tensorrt_utils import (
|
13 |
+
mel_filters,
|
14 |
+
load_audio_wav_format,
|
15 |
+
pad_or_trim,
|
16 |
+
load_audio
|
17 |
+
)
|
18 |
+
|
19 |
+
import tensorrt_llm
|
20 |
+
import tensorrt_llm.logger as logger
|
21 |
+
from tensorrt_llm._utils import (str_dtype_to_torch, str_dtype_to_trt,
|
22 |
+
trt_dtype_to_torch)
|
23 |
+
from tensorrt_llm.bindings import GptJsonConfig, KVCacheType
|
24 |
+
from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelConfig, SamplingConfig
|
25 |
+
from tensorrt_llm.runtime.session import Session, TensorInfo
|
26 |
+
if PYTHON_BINDINGS:
|
27 |
+
from tensorrt_llm.runtime import ModelRunnerCpp
|
28 |
+
|
29 |
+
SAMPLE_RATE = 16000
|
30 |
+
N_FFT = 400
|
31 |
+
HOP_LENGTH = 160
|
32 |
+
CHUNK_LENGTH = 30
|
33 |
+
N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE # 480000 samples in a 30-second chunk
|
34 |
+
|
35 |
+
def read_config(component, engine_dir):
|
36 |
+
config_path = engine_dir / component / 'config.json'
|
37 |
+
with open(config_path, 'r') as f:
|
38 |
+
config = json.load(f)
|
39 |
+
model_config = OrderedDict()
|
40 |
+
model_config.update(config['pretrained_config'])
|
41 |
+
model_config.update(config['build_config'])
|
42 |
+
return model_config
|
43 |
+
|
44 |
+
|
45 |
+
def remove_tensor_padding(input_tensor,
|
46 |
+
input_tensor_lengths=None,
|
47 |
+
pad_value=None):
|
48 |
+
if pad_value:
|
49 |
+
assert input_tensor_lengths is None, "input_tensor_lengths should be None when pad_value is provided"
|
50 |
+
# Text tensor case: batch, seq_len
|
51 |
+
assert torch.all(
|
52 |
+
input_tensor[:, 0] != pad_value
|
53 |
+
), "First token in each sequence should not be pad_value"
|
54 |
+
assert input_tensor_lengths is None
|
55 |
+
|
56 |
+
# Create a mask for all non-pad tokens
|
57 |
+
mask = input_tensor != pad_value
|
58 |
+
|
59 |
+
# Apply the mask to input_tensor to remove pad tokens
|
60 |
+
output_tensor = input_tensor[mask].view(1, -1)
|
61 |
+
|
62 |
+
else:
|
63 |
+
# Audio tensor case: batch, seq_len, feature_len
|
64 |
+
# position_ids case: batch, seq_len
|
65 |
+
assert input_tensor_lengths is not None, "input_tensor_lengths must be provided for 3D input_tensor"
|
66 |
+
|
67 |
+
# Initialize a list to collect valid sequences
|
68 |
+
valid_sequences = []
|
69 |
+
|
70 |
+
for i in range(input_tensor.shape[0]):
|
71 |
+
valid_length = input_tensor_lengths[i]
|
72 |
+
valid_sequences.append(input_tensor[i, :valid_length])
|
73 |
+
|
74 |
+
# Concatenate all valid sequences along the batch dimension
|
75 |
+
output_tensor = torch.cat(valid_sequences, dim=0)
|
76 |
+
return output_tensor
|
77 |
+
|
78 |
+
|
79 |
+
class WhisperEncoding:
|
80 |
+
|
81 |
+
def __init__(self, engine_dir):
|
82 |
+
self.session = self.get_session(engine_dir)
|
83 |
+
config = read_config('encoder', engine_dir)
|
84 |
+
self.n_mels = config['n_mels']
|
85 |
+
self.dtype = config['dtype']
|
86 |
+
self.num_languages = config['num_languages']
|
87 |
+
self.encoder_config = config
|
88 |
+
|
89 |
+
def get_session(self, engine_dir):
|
90 |
+
serialize_path = engine_dir / 'encoder' / 'rank0.engine'
|
91 |
+
with open(serialize_path, 'rb') as f:
|
92 |
+
session = Session.from_serialized_engine(f.read())
|
93 |
+
return session
|
94 |
+
|
95 |
+
def get_audio_features(self,
|
96 |
+
mel,
|
97 |
+
mel_input_lengths,
|
98 |
+
encoder_downsampling_factor=2):
|
99 |
+
if isinstance(mel, list):
|
100 |
+
longest_mel = max([f.shape[-1] for f in mel])
|
101 |
+
mel = [
|
102 |
+
torch.nn.functional.pad(f, (0, longest_mel - f.shape[-1]),
|
103 |
+
mode='constant') for f in mel
|
104 |
+
]
|
105 |
+
mel = torch.cat(mel, dim=0).type(
|
106 |
+
str_dtype_to_torch("float16")).contiguous()
|
107 |
+
bsz, seq_len = mel.shape[0], mel.shape[2]
|
108 |
+
position_ids = torch.arange(
|
109 |
+
math.ceil(seq_len / encoder_downsampling_factor),
|
110 |
+
dtype=torch.int32,
|
111 |
+
device=mel.device).expand(bsz, -1).contiguous()
|
112 |
+
if self.encoder_config['plugin_config']['remove_input_padding']:
|
113 |
+
# mel B,D,T -> B,T,D -> BxT, D
|
114 |
+
mel = mel.transpose(1, 2)
|
115 |
+
mel = remove_tensor_padding(mel, mel_input_lengths)
|
116 |
+
position_ids = remove_tensor_padding(
|
117 |
+
position_ids, mel_input_lengths // encoder_downsampling_factor)
|
118 |
+
inputs = OrderedDict()
|
119 |
+
inputs['input_features'] = mel
|
120 |
+
inputs['input_lengths'] = mel_input_lengths
|
121 |
+
inputs['position_ids'] = position_ids
|
122 |
+
|
123 |
+
output_list = [
|
124 |
+
TensorInfo('input_features', str_dtype_to_trt(self.dtype),
|
125 |
+
mel.shape),
|
126 |
+
TensorInfo('input_lengths', str_dtype_to_trt('int32'),
|
127 |
+
mel_input_lengths.shape),
|
128 |
+
TensorInfo('position_ids', str_dtype_to_trt('int32'),
|
129 |
+
inputs['position_ids'].shape)
|
130 |
+
]
|
131 |
+
|
132 |
+
output_info = (self.session).infer_shapes(output_list)
|
133 |
+
|
134 |
+
logger.debug(f'output info {output_info}')
|
135 |
+
outputs = {
|
136 |
+
t.name: torch.empty(tuple(t.shape),
|
137 |
+
dtype=trt_dtype_to_torch(t.dtype),
|
138 |
+
device='cuda')
|
139 |
+
for t in output_info
|
140 |
+
}
|
141 |
+
stream = torch.cuda.current_stream()
|
142 |
+
ok = self.session.run(inputs=inputs,
|
143 |
+
outputs=outputs,
|
144 |
+
stream=stream.cuda_stream)
|
145 |
+
assert ok, 'Engine execution failed'
|
146 |
+
stream.synchronize()
|
147 |
+
encoder_output = outputs['encoder_output']
|
148 |
+
encoder_output_lengths = mel_input_lengths // encoder_downsampling_factor
|
149 |
+
return encoder_output, encoder_output_lengths
|
150 |
+
|
151 |
+
|
152 |
+
class WhisperDecoding:
|
153 |
+
|
154 |
+
def __init__(self, engine_dir, runtime_mapping, debug_mode=False):
|
155 |
+
|
156 |
+
self.decoder_config = read_config('decoder', engine_dir)
|
157 |
+
self.decoder_generation_session = self.get_session(
|
158 |
+
engine_dir, runtime_mapping, debug_mode)
|
159 |
+
|
160 |
+
def get_session(self, engine_dir, runtime_mapping, debug_mode=False):
|
161 |
+
serialize_path = engine_dir / 'decoder' / 'rank0.engine'
|
162 |
+
with open(serialize_path, "rb") as f:
|
163 |
+
decoder_engine_buffer = f.read()
|
164 |
+
|
165 |
+
decoder_model_config = ModelConfig(
|
166 |
+
max_batch_size=self.decoder_config['max_batch_size'],
|
167 |
+
max_beam_width=self.decoder_config['max_beam_width'],
|
168 |
+
num_heads=self.decoder_config['num_attention_heads'],
|
169 |
+
num_kv_heads=self.decoder_config['num_attention_heads'],
|
170 |
+
hidden_size=self.decoder_config['hidden_size'],
|
171 |
+
vocab_size=self.decoder_config['vocab_size'],
|
172 |
+
cross_attention=True,
|
173 |
+
num_layers=self.decoder_config['num_hidden_layers'],
|
174 |
+
gpt_attention_plugin=self.decoder_config['plugin_config']
|
175 |
+
['gpt_attention_plugin'],
|
176 |
+
remove_input_padding=self.decoder_config['plugin_config']
|
177 |
+
['remove_input_padding'],
|
178 |
+
kv_cache_type=KVCacheType.PAGED
|
179 |
+
if self.decoder_config['plugin_config']['paged_kv_cache'] == True
|
180 |
+
else KVCacheType.CONTINUOUS,
|
181 |
+
has_position_embedding=self.
|
182 |
+
decoder_config['has_position_embedding'],
|
183 |
+
dtype=self.decoder_config['dtype'],
|
184 |
+
has_token_type_embedding=False,
|
185 |
+
)
|
186 |
+
decoder_generation_session = tensorrt_llm.runtime.GenerationSession(
|
187 |
+
decoder_model_config,
|
188 |
+
decoder_engine_buffer,
|
189 |
+
runtime_mapping,
|
190 |
+
debug_mode=debug_mode)
|
191 |
+
|
192 |
+
return decoder_generation_session
|
193 |
+
|
194 |
+
def generate(self,
|
195 |
+
decoder_input_ids,
|
196 |
+
encoder_outputs,
|
197 |
+
encoder_max_input_length,
|
198 |
+
encoder_input_lengths,
|
199 |
+
eot_id,
|
200 |
+
max_new_tokens=40,
|
201 |
+
num_beams=1):
|
202 |
+
batch_size = decoder_input_ids.shape[0]
|
203 |
+
decoder_input_lengths = torch.tensor([
|
204 |
+
decoder_input_ids.shape[-1]
|
205 |
+
for _ in range(decoder_input_ids.shape[0])
|
206 |
+
],
|
207 |
+
dtype=torch.int32,
|
208 |
+
device='cuda')
|
209 |
+
decoder_max_input_length = torch.max(decoder_input_lengths).item()
|
210 |
+
|
211 |
+
cross_attention_mask = torch.ones([
|
212 |
+
batch_size, decoder_max_input_length + max_new_tokens,
|
213 |
+
encoder_max_input_length
|
214 |
+
]).int().cuda()
|
215 |
+
# generation config
|
216 |
+
sampling_config = SamplingConfig(end_id=eot_id,
|
217 |
+
pad_id=eot_id,
|
218 |
+
num_beams=num_beams)
|
219 |
+
self.decoder_generation_session.setup(
|
220 |
+
decoder_input_lengths.size(0),
|
221 |
+
decoder_max_input_length,
|
222 |
+
max_new_tokens,
|
223 |
+
beam_width=num_beams,
|
224 |
+
encoder_max_input_length=encoder_max_input_length)
|
225 |
+
|
226 |
+
torch.cuda.synchronize()
|
227 |
+
|
228 |
+
decoder_input_ids = decoder_input_ids.type(torch.int32).cuda()
|
229 |
+
if self.decoder_config['plugin_config']['remove_input_padding']:
|
230 |
+
# 50256 is the index of <pad> for all whisper models' decoder
|
231 |
+
WHISPER_PAD_TOKEN_ID = 50256
|
232 |
+
decoder_input_ids = remove_tensor_padding(
|
233 |
+
decoder_input_ids, pad_value=WHISPER_PAD_TOKEN_ID)
|
234 |
+
if encoder_outputs.dim() == 3:
|
235 |
+
encoder_output_lens = torch.full((encoder_outputs.shape[0], ),
|
236 |
+
encoder_outputs.shape[1],
|
237 |
+
dtype=torch.int32,
|
238 |
+
device='cuda')
|
239 |
+
|
240 |
+
encoder_outputs = remove_tensor_padding(encoder_outputs,
|
241 |
+
encoder_output_lens)
|
242 |
+
output_ids = self.decoder_generation_session.decode(
|
243 |
+
decoder_input_ids,
|
244 |
+
decoder_input_lengths,
|
245 |
+
sampling_config,
|
246 |
+
encoder_output=encoder_outputs,
|
247 |
+
encoder_input_lengths=encoder_input_lengths,
|
248 |
+
cross_attention_mask=cross_attention_mask,
|
249 |
+
)
|
250 |
+
torch.cuda.synchronize()
|
251 |
+
|
252 |
+
# get the list of int from output_ids tensor
|
253 |
+
output_ids = output_ids.cpu().numpy().tolist()
|
254 |
+
return output_ids
|
255 |
+
|
256 |
+
|
257 |
+
class WhisperTRTLLM(object):
|
258 |
+
|
259 |
+
def __init__(self,
|
260 |
+
engine_dir,
|
261 |
+
assets_dir=None,
|
262 |
+
device=None,
|
263 |
+
is_multilingual=False,
|
264 |
+
language="en",
|
265 |
+
task="transcribe",
|
266 |
+
use_py_session=False,
|
267 |
+
num_beams=1,
|
268 |
+
debug_mode=False,
|
269 |
+
max_output_len=96):
|
270 |
+
world_size = 1
|
271 |
+
runtime_rank = tensorrt_llm.mpi_rank()
|
272 |
+
runtime_mapping = tensorrt_llm.Mapping(world_size, runtime_rank)
|
273 |
+
torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node)
|
274 |
+
engine_dir = Path(engine_dir)
|
275 |
+
encoder_config = read_config('encoder', engine_dir)
|
276 |
+
decoder_config = read_config('decoder', engine_dir)
|
277 |
+
self.n_mels = encoder_config['n_mels']
|
278 |
+
self.num_languages = encoder_config['num_languages']
|
279 |
+
is_multilingual = (decoder_config['vocab_size'] >= 51865)
|
280 |
+
|
281 |
+
self.device = device
|
282 |
+
self.tokenizer = get_tokenizer(
|
283 |
+
is_multilingual,
|
284 |
+
num_languages=self.num_languages,
|
285 |
+
language=language,
|
286 |
+
task=task,
|
287 |
+
)
|
288 |
+
|
289 |
+
if use_py_session:
|
290 |
+
self.encoder = WhisperEncoding(engine_dir)
|
291 |
+
self.decoder = WhisperDecoding(engine_dir,
|
292 |
+
runtime_mapping,
|
293 |
+
debug_mode=False)
|
294 |
+
else:
|
295 |
+
json_config = GptJsonConfig.parse_file(engine_dir / 'decoder' /
|
296 |
+
'config.json')
|
297 |
+
assert json_config.model_config.supports_inflight_batching
|
298 |
+
runner_kwargs = dict(engine_dir=engine_dir,
|
299 |
+
is_enc_dec=True,
|
300 |
+
max_batch_size=1,
|
301 |
+
max_input_len=3000,
|
302 |
+
max_output_len=max_output_len,
|
303 |
+
max_beam_width=num_beams,
|
304 |
+
debug_mode=debug_mode,
|
305 |
+
kv_cache_free_gpu_memory_fraction=0.9,
|
306 |
+
cross_kv_cache_fraction=0.5)
|
307 |
+
self.model_runner_cpp = ModelRunnerCpp.from_dir(**runner_kwargs)
|
308 |
+
self.filters = mel_filters(self.device, self.n_mels, assets_dir)
|
309 |
+
self.use_py_session = use_py_session
|
310 |
+
|
311 |
+
def log_mel_spectrogram(
|
312 |
+
self,
|
313 |
+
audio: Union[str, np.ndarray, torch.Tensor],
|
314 |
+
padding: int = 0,
|
315 |
+
return_duration=True
|
316 |
+
):
|
317 |
+
"""
|
318 |
+
Compute the log-Mel spectrogram of the input audio.
|
319 |
+
|
320 |
+
Parameters
|
321 |
+
----------
|
322 |
+
audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
|
323 |
+
The path to an audio file, or a NumPy array or Tensor containing the audio waveform sampled at 16 kHz
|
324 |
+
|
325 |
+
n_mels: int
|
326 |
+
The number of Mel-frequency filters, only 80 and 128 are supported
|
327 |
+
|
328 |
+
padding: int
|
329 |
+
Number of zero samples to pad to the right
|
330 |
+
|
331 |
+
device: Optional[Union[str, torch.device]]
|
332 |
+
If given, the audio tensor is moved to this device before STFT
|
333 |
+
|
334 |
+
Returns
|
335 |
+
-------
|
336 |
+
torch.Tensor, shape = (80 or 128, n_frames)
|
337 |
+
A Tensor that contains the Mel spectrogram
|
338 |
+
"""
|
339 |
+
if not torch.is_tensor(audio):
|
340 |
+
if isinstance(audio, str):
|
341 |
+
if audio.endswith('.wav'):
|
342 |
+
audio, _ = load_audio_wav_format(audio)
|
343 |
+
else:
|
344 |
+
audio = load_audio(audio)
|
345 |
+
assert isinstance(audio, np.ndarray), f"Unsupported audio type: {type(audio)}"
|
346 |
+
duration = audio.shape[-1] / SAMPLE_RATE
|
347 |
+
audio = pad_or_trim(audio, N_SAMPLES)
|
348 |
+
audio = audio.astype(np.float32)
|
349 |
+
audio = torch.from_numpy(audio)
|
350 |
+
|
351 |
+
if self.device is not None:
|
352 |
+
audio = audio.to(self.device)
|
353 |
+
if padding > 0:
|
354 |
+
audio = F.pad(audio, (0, padding))
|
355 |
+
window = torch.hann_window(N_FFT).to(audio.device)
|
356 |
+
stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
|
357 |
+
magnitudes = stft[..., :-1].abs()**2
|
358 |
+
|
359 |
+
mel_spec = self.filters @ magnitudes
|
360 |
+
|
361 |
+
log_spec = torch.clamp(mel_spec, min=1e-10).log10()
|
362 |
+
log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
|
363 |
+
log_spec = (log_spec + 4.0) / 4.0
|
364 |
+
if return_duration:
|
365 |
+
return log_spec, duration
|
366 |
+
else:
|
367 |
+
return log_spec
|
368 |
+
|
369 |
+
def process_batch(
|
370 |
+
self,
|
371 |
+
mel,
|
372 |
+
mel_input_lengths,
|
373 |
+
text_prefix="<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
|
374 |
+
num_beams=1,
|
375 |
+
max_new_tokens=96):
|
376 |
+
prompt_id = self.tokenizer.encode(
|
377 |
+
text_prefix, allowed_special=set(self.tokenizer.special_tokens.keys()))
|
378 |
+
|
379 |
+
prompt_id = torch.tensor(prompt_id)
|
380 |
+
batch_size = mel.shape[0]
|
381 |
+
decoder_input_ids = prompt_id.repeat(batch_size, 1)
|
382 |
+
if self.use_py_session:
|
383 |
+
encoder_output, encoder_output_lengths = self.encoder.get_audio_features(mel, mel_input_lengths)
|
384 |
+
encoder_max_input_length = torch.max(encoder_output_lengths).item()
|
385 |
+
output_ids = self.decoder.generate(decoder_input_ids,
|
386 |
+
encoder_output,
|
387 |
+
encoder_max_input_length,
|
388 |
+
encoder_output_lengths,
|
389 |
+
self.tokenizer.eot,
|
390 |
+
max_new_tokens=max_new_tokens,
|
391 |
+
num_beams=num_beams)
|
392 |
+
else:
|
393 |
+
with torch.no_grad():
|
394 |
+
if isinstance(mel, list):
|
395 |
+
mel = [
|
396 |
+
m.transpose(1, 2).type(
|
397 |
+
str_dtype_to_torch("float16")).squeeze(0)
|
398 |
+
for m in mel
|
399 |
+
]
|
400 |
+
else:
|
401 |
+
mel = mel.transpose(1, 2)
|
402 |
+
outputs = self.model_runner_cpp.generate(
|
403 |
+
batch_input_ids=decoder_input_ids,
|
404 |
+
encoder_input_features=mel,
|
405 |
+
encoder_output_lengths=mel_input_lengths // 2,
|
406 |
+
max_new_tokens=max_new_tokens,
|
407 |
+
end_id=self.tokenizer.eot,
|
408 |
+
pad_id=self.tokenizer.eot,
|
409 |
+
num_beams=num_beams,
|
410 |
+
output_sequence_lengths=True,
|
411 |
+
return_dict=True)
|
412 |
+
torch.cuda.synchronize()
|
413 |
+
output_ids = outputs['output_ids'].cpu().numpy().tolist()
|
414 |
+
texts = []
|
415 |
+
for i in range(len(output_ids)):
|
416 |
+
text = self.tokenizer.decode(output_ids[i][0]).strip()
|
417 |
+
texts.append(text)
|
418 |
+
return texts
|
419 |
+
|
420 |
+
def transcribe(
|
421 |
+
self,
|
422 |
+
mel,
|
423 |
+
text_prefix="<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
|
424 |
+
dtype='float16',
|
425 |
+
batch_size=1,
|
426 |
+
num_beams=1,
|
427 |
+
padding_strategy="max",
|
428 |
+
max_new_tokens=96,
|
429 |
+
):
|
430 |
+
mel = mel.type(str_dtype_to_torch(dtype))
|
431 |
+
mel = mel.unsqueeze(0)
|
432 |
+
# repeat the mel spectrogram to match the batch size
|
433 |
+
mel = mel.repeat(batch_size, 1, 1)
|
434 |
+
if padding_strategy == "longest":
|
435 |
+
pass
|
436 |
+
else:
|
437 |
+
mel = torch.nn.functional.pad(mel, (0, 3000 - mel.shape[2]))
|
438 |
+
features_input_lengths = torch.full((mel.shape[0], ),
|
439 |
+
mel.shape[2],
|
440 |
+
dtype=torch.int32,
|
441 |
+
device=mel.device)
|
442 |
+
|
443 |
+
predictions = self.process_batch(
|
444 |
+
mel,
|
445 |
+
features_input_lengths,
|
446 |
+
text_prefix,
|
447 |
+
num_beams,
|
448 |
+
max_new_tokens=max_new_tokens
|
449 |
+
)
|
450 |
+
prediction = predictions[0]
|
451 |
+
|
452 |
+
# remove all special tokens in the prediction
|
453 |
+
prediction = re.sub(r'<\|.*?\|>', '', prediction)
|
454 |
+
return prediction.strip()
|
455 |
+
|
456 |
+
|
457 |
+
def decode_wav_file(
|
458 |
+
model,
|
459 |
+
mel,
|
460 |
+
text_prefix="<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
|
461 |
+
dtype='float16',
|
462 |
+
batch_size=1,
|
463 |
+
num_beams=1,
|
464 |
+
normalizer=None,
|
465 |
+
mel_filters_dir=None):
|
466 |
+
|
467 |
+
mel = mel.type(str_dtype_to_torch(dtype))
|
468 |
+
mel = mel.unsqueeze(0)
|
469 |
+
# repeat the mel spectrogram to match the batch size
|
470 |
+
mel = mel.repeat(batch_size, 1, 1)
|
471 |
+
predictions = model.process_batch(mel, text_prefix, num_beams)
|
472 |
+
prediction = predictions[0]
|
473 |
+
|
474 |
+
# remove all special tokens in the prediction
|
475 |
+
prediction = re.sub(r'<\|.*?\|>', '', prediction)
|
476 |
+
if normalizer:
|
477 |
+
prediction = normalizer(prediction)
|
478 |
+
|
479 |
+
return prediction.strip()
|
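A minimal usage sketch for the TensorRT-LLM backend above, assuming a CUDA device and a prebuilt Whisper engine directory (see scripts/build_whisper_tensorrt.sh); the engine and assets paths are placeholders:

# Usage sketch (editor's note, not part of the commit); paths are placeholders.
from whisper_live.transcriber.transcriber_tensorrt import WhisperTRTLLM

model = WhisperTRTLLM("/path/to/whisper_trt_engine", assets_dir="/path/to/assets", device="cuda")
mel, duration = model.log_mel_spectrogram("sample.wav")
text = model.transcribe(mel, text_prefix="<|startoftranscript|><|en|><|transcribe|><|notimestamps|>")
print(f"{duration:.1f}s: {text}")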
whisper_live/utils.py
ADDED
@@ -0,0 +1,82 @@
import os
import textwrap
import scipy
import numpy as np
import av
from pathlib import Path


def clear_screen():
    """Clears the console screen."""
    os.system("cls" if os.name == "nt" else "clear")


def print_transcript(text):
    """Prints formatted transcript text."""
    wrapper = textwrap.TextWrapper(width=60)
    for line in wrapper.wrap(text="".join(text)):
        print(line)


def format_time(s):
    """Convert seconds (float) to SRT time format."""
    hours = int(s // 3600)
    minutes = int((s % 3600) // 60)
    seconds = int(s % 60)
    milliseconds = int((s - int(s)) * 1000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"


def create_srt_file(segments, resampled_file):
    with open(resampled_file, 'w', encoding='utf-8') as srt_file:
        segment_number = 1
        for segment in segments:
            start_time = format_time(float(segment['start']))
            end_time = format_time(float(segment['end']))
            text = segment['text']

            srt_file.write(f"{segment_number}\n")
            srt_file.write(f"{start_time} --> {end_time}\n")
            srt_file.write(f"{text}\n\n")

            segment_number += 1


def resample(file: str, sr: int = 16000):
    """
    Resample the audio file to 16kHz.

    Args:
        file (str): The audio file to open
        sr (int): The sample rate to resample the audio if necessary

    Returns:
        resampled_file (str): The resampled audio file
    """
    container = av.open(file)
    stream = next(s for s in container.streams if s.type == 'audio')

    resampler = av.AudioResampler(
        format='s16',
        layout='mono',
        rate=sr,
    )

    resampled_file = Path(file).stem + "_resampled.wav"
    output_container = av.open(resampled_file, mode='w')
    output_stream = output_container.add_stream('pcm_s16le', rate=sr)
    output_stream.layout = 'mono'

    for frame in container.decode(audio=0):
        frame.pts = None
        resampled_frames = resampler.resample(frame)
        if resampled_frames is not None:
            for resampled_frame in resampled_frames:
                for packet in output_stream.encode(resampled_frame):
                    output_container.mux(packet)

    for packet in output_stream.encode(None):
        output_container.mux(packet)

    output_container.close()
    return resampled_file
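A minimal usage sketch for the helpers above; the input media file name and the segment dict are placeholders shaped like the live-transcription output ({'start', 'end', 'text'}):

# Usage sketch (editor's note, not part of the commit); file names are placeholders.
from whisper_live.utils import resample, create_srt_file

resampled_wav = resample("meeting.mp3", sr=16000)    # writes "meeting_resampled.wav"
segments = [{"start": "0.00", "end": "2.40", "text": "Hello there."}]
create_srt_file(segments, "meeting.srt")             # second argument is the output path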
whisper_live/vad.py
ADDED
@@ -0,0 +1,157 @@
import os
import subprocess
import torch
import numpy as np
import onnxruntime
import warnings


class VoiceActivityDetection():

    def __init__(self, force_onnx_cpu=True):
        path = self.download()

        opts = onnxruntime.SessionOptions()
        opts.log_severity_level = 3

        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1

        if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers():
            self.session = onnxruntime.InferenceSession(path, providers=['CPUExecutionProvider'], sess_options=opts)
        else:
            self.session = onnxruntime.InferenceSession(path, providers=['CUDAExecutionProvider'], sess_options=opts)

        self.reset_states()
        if '16k' in path:
            warnings.warn('This model supports only a 16000 Hz sampling rate!')
            self.sample_rates = [16000]
        else:
            self.sample_rates = [8000, 16000]

    def _validate_input(self, x, sr: int):
        if x.dim() == 1:
            x = x.unsqueeze(0)
        if x.dim() > 2:
            raise ValueError(f"Too many dimensions for input audio chunk {x.dim()}")

        if sr != 16000 and (sr % 16000 == 0):
            step = sr // 16000
            x = x[:, ::step]
            sr = 16000

        if sr not in self.sample_rates:
            raise ValueError(f"Supported sampling rates: {self.sample_rates} (or a multiple of 16000)")
        if sr / x.shape[1] > 31.25:
            raise ValueError("Input audio chunk is too short")

        return x, sr

    def reset_states(self, batch_size=1):
        self._state = torch.zeros((2, batch_size, 128)).float()
        self._context = torch.zeros(0)
        self._last_sr = 0
        self._last_batch_size = 0

    def __call__(self, x, sr: int):

        x, sr = self._validate_input(x, sr)
        num_samples = 512 if sr == 16000 else 256

        if x.shape[-1] != num_samples:
            raise ValueError(f"Provided number of samples is {x.shape[-1]} (Supported values: 256 for 8000 sample rate, 512 for 16000)")

        batch_size = x.shape[0]
        context_size = 64 if sr == 16000 else 32

        if not self._last_batch_size:
            self.reset_states(batch_size)
        if (self._last_sr) and (self._last_sr != sr):
            self.reset_states(batch_size)
        if (self._last_batch_size) and (self._last_batch_size != batch_size):
            self.reset_states(batch_size)

        if not len(self._context):
            self._context = torch.zeros(batch_size, context_size)

        x = torch.cat([self._context, x], dim=1)
        if sr in [8000, 16000]:
            ort_inputs = {'input': x.numpy(), 'state': self._state.numpy(), 'sr': np.array(sr, dtype='int64')}
            ort_outs = self.session.run(None, ort_inputs)
            out, state = ort_outs
            self._state = torch.from_numpy(state)
        else:
            raise ValueError()

        self._context = x[..., -context_size:]
        self._last_sr = sr
        self._last_batch_size = batch_size

        out = torch.from_numpy(out)
        return out

    def audio_forward(self, x, sr: int):
        outs = []
        x, sr = self._validate_input(x, sr)
        self.reset_states()
        num_samples = 512 if sr == 16000 else 256

        if x.shape[1] % num_samples:
            pad_num = num_samples - (x.shape[1] % num_samples)
            x = torch.nn.functional.pad(x, (0, pad_num), 'constant', value=0.0)

        for i in range(0, x.shape[1], num_samples):
            wavs_batch = x[:, i:i+num_samples]
            out_chunk = self.__call__(wavs_batch, sr)
            outs.append(out_chunk)

        stacked = torch.cat(outs, dim=1)
        return stacked.cpu()

    @staticmethod
    def download(model_url="https://github.com/snakers4/silero-vad/raw/v5.0/files/silero_vad.onnx"):
        target_dir = os.path.expanduser("~/.cache/whisper-live/")

        # Ensure the target directory exists
        os.makedirs(target_dir, exist_ok=True)

        # Define the target file path
        model_filename = os.path.join(target_dir, "silero_vad.onnx")

        # Check if the model file already exists
        if not os.path.exists(model_filename):
            # If it doesn't exist, download the model using wget
            try:
                subprocess.run(["wget", "-O", model_filename, model_url], check=True)
            except subprocess.CalledProcessError:
                print("Failed to download the model using wget.")
        return model_filename


class VoiceActivityDetector:
    def __init__(self, threshold=0.5, frame_rate=16000):
        """
        Initializes the VoiceActivityDetector with a voice activity detection model and a threshold.

        Args:
            threshold (float, optional): The probability threshold for detecting voice activity. Defaults to 0.5.
        """
        self.model = VoiceActivityDetection()
        self.threshold = threshold
        self.frame_rate = frame_rate

    def __call__(self, audio_frame):
        """
        Determines if the given audio frame contains speech by comparing the detected speech probability against
        the threshold.

        Args:
            audio_frame (np.ndarray): The audio frame to be analyzed for voice activity. It is expected to be a
                NumPy array of audio samples.

        Returns:
            bool: True if the speech probability exceeds the threshold, indicating the presence of voice activity;
                False otherwise.
        """
        speech_probs = self.model.audio_forward(torch.from_numpy(audio_frame.copy()), self.frame_rate)[0]
        return torch.any(speech_probs > self.threshold).item()
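A minimal usage sketch for the VAD wrapper above; the Silero ONNX model consumes 512-sample frames at 16 kHz, and the random frame below is only a stand-in for real audio:

# Usage sketch (editor's note, not part of the commit); random samples stand in for audio.
import numpy as np
from whisper_live.vad import VoiceActivityDetector

detector = VoiceActivityDetector(threshold=0.5, frame_rate=16000)
frame = np.random.randn(512).astype(np.float32)   # one 32 ms frame at 16 kHz
print(detector(frame))                            # True when speech probability exceeds the threshold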