Spaces:

langtech-innovation
/

WhisperLiveKitDiarization

Paused

App Files Files Community

qfuxa commited on Feb 19

Commit

d920423

1 Parent(s): f6b6b4a

add parameter to disable transcription (only diarization), add time in output

Browse files

Files changed (1) hide show

whisper_fastapi_online_server.py +54 -30

whisper_fastapi_online_server.py CHANGED Viewed

@@ -3,7 +3,7 @@ import argparse
 import asyncio
 import numpy as np
 import ffmpeg
-from time import time
 from contextlib import asynccontextmanager
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
@@ -12,9 +12,12 @@ from fastapi.middleware.cors import CORSMiddleware
 from src.whisper_streaming.whisper_online import backend_factory, online_factory, add_shared_args
-import subprocess
 import math
 import logging
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
@@ -48,6 +51,12 @@ parser.add_argument(
     help="Whether to enable speaker diarization.",
 )
 add_shared_args(parser)
 args = parser.parse_args()
@@ -68,7 +77,10 @@ if args.diarization:
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     global asr, tokenizer
-    asr, tokenizer = backend_factory(args)
     yield
 app = FastAPI(lifespan=lifespan)
@@ -117,7 +129,7 @@ async def websocket_endpoint(websocket: WebSocket):
     ffmpeg_process = None
     pcm_buffer = bytearray()
-    online = online_factory(args, asr, tokenizer)
     diarization = DiartDiarization(SAMPLE_RATE) if args.diarization else None
     async def restart_ffmpeg():
@@ -130,7 +142,7 @@ async def websocket_endpoint(websocket: WebSocket):
                 logger.warning(f"Error killing FFmpeg process: {e}")
         ffmpeg_process = await start_ffmpeg_decoder()
         pcm_buffer = bytearray()
-        online = online_factory(args, asr, tokenizer)
         if args.diarization:
             diarization = DiartDiarization(SAMPLE_RATE)
         logger.info("FFmpeg process started.")
@@ -142,7 +154,7 @@ async def websocket_endpoint(websocket: WebSocket):
         loop = asyncio.get_event_loop()
         full_transcription = ""
         beg = time()
         chunk_history = []  # Will store dicts: {beg, end, text, speaker}
         while True:
@@ -184,45 +196,57 @@ async def websocket_endpoint(websocket: WebSocket):
                         / 32768.0
                     )
                     pcm_buffer = pcm_buffer[MAX_BYTES_PER_SEC:]
-                    logger.info(f"{len(online.audio_buffer) / online.SAMPLING_RATE} seconds of audio will be processed by the model.")
-                    online.insert_audio_chunk(pcm_array)
-                    transcription = online.process_iter()
-                    if transcription:
                         chunk_history.append({
-                            "beg": transcription.start,
-                            "end": transcription.end,
-                            "text": transcription.text,
-                            "speaker": "0"
                         })
-                    full_transcription += transcription.text if transcription else ""
-                    buffer = online.get_buffer()
-                    if buffer in full_transcription: # With VAC, the buffer is not updated until the next chunk is processed
-                        buffer = ""
-                    lines = [
-                        {
-                            "speaker": "0",
-                            "text": "",
-                        }
-                    ]
                     if args.diarization:
                         await diarization.diarize(pcm_array)
                         diarization.assign_speakers_to_chunks(chunk_history)
                     for ch in chunk_history:
-                        if args.diarization and ch["speaker"] and ch["speaker"][-1] != lines[-1]["speaker"]:
                             lines.append(
                                 {
-                                    "speaker": ch["speaker"][-1],
-                                    "text": ch['text']
                                 }
                             )
                         else:
                             lines[-1]["text"] += ch['text']
                     response = {"lines": lines, "buffer": buffer}
                     await websocket.send_json(response)

 import asyncio
 import numpy as np
 import ffmpeg
+from time import time, sleep
 from contextlib import asynccontextmanager
 from fastapi import FastAPI, WebSocket, WebSocketDisconnect
 from src.whisper_streaming.whisper_online import backend_factory, online_factory, add_shared_args
 import math
 import logging
+from datetime import timedelta
+def format_time(seconds):
+    return str(timedelta(seconds=int(seconds)))
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
     help="Whether to enable speaker diarization.",
 )
+parser.add_argument(
+    "--transcription",
+    type=bool,
+    default=True,
+    help="To disable to only see live diarization results.",
+)
 add_shared_args(parser)
 args = parser.parse_args()
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     global asr, tokenizer
+    if args.transcription:
+        asr, tokenizer = backend_factory(args)
+    else:
+        asr, tokenizer = None, None
     yield
 app = FastAPI(lifespan=lifespan)
     ffmpeg_process = None
     pcm_buffer = bytearray()
+    online = online_factory(args, asr, tokenizer) if args.transcription else None
     diarization = DiartDiarization(SAMPLE_RATE) if args.diarization else None
     async def restart_ffmpeg():
                 logger.warning(f"Error killing FFmpeg process: {e}")
         ffmpeg_process = await start_ffmpeg_decoder()
         pcm_buffer = bytearray()
+        online = online_factory(args, asr, tokenizer) if args.transcription else None
         if args.diarization:
             diarization = DiartDiarization(SAMPLE_RATE)
         logger.info("FFmpeg process started.")
         loop = asyncio.get_event_loop()
         full_transcription = ""
         beg = time()
+        beg_loop = time()
         chunk_history = []  # Will store dicts: {beg, end, text, speaker}
         while True:
                         / 32768.0
                     )
                     pcm_buffer = pcm_buffer[MAX_BYTES_PER_SEC:]
+                    if args.transcription:
+                        logger.info(f"{len(online.audio_buffer) / online.SAMPLING_RATE} seconds of audio will be processed by the model.")
+                        online.insert_audio_chunk(pcm_array)
+                        transcription = online.process_iter()
+                        if transcription.start:
+                            chunk_history.append({
+                                "beg": transcription.start,
+                                "end": transcription.end,
+                                "text": transcription.text,
+                            })
+                        full_transcription += transcription.text if transcription else ""
+                        buffer = online.get_buffer()
+                        if buffer in full_transcription: # With VAC, the buffer is not updated until the next chunk is processed
+                            buffer = ""
+                    else:
                         chunk_history.append({
+                                "beg": time() - beg_loop,
+                                "end": time() - beg_loop + 0.1,
+                                "text": '',
                         })
+                        sleep(0.1)
+                        buffer = ''
                     if args.diarization:
                         await diarization.diarize(pcm_array)
                         diarization.assign_speakers_to_chunks(chunk_history)
+                    current_speaker = -1
+                    lines = [{
+                        "beg": 0,
+                        "end": 0,
+                        "speaker": current_speaker,
+                        "text": ""
+                        }]
                     for ch in chunk_history:
+                        if args.diarization and ch["speaker"] and ch["speaker"] != current_speaker:
+                            new_speaker = ch["speaker"]
                             lines.append(
                                 {
+                                    "speaker": new_speaker,
+                                    "text": ch['text'],
+                                    "beg": format_time(ch['beg']),
+                                    "end": format_time(ch['end']),
                                 }
                             )
+                            current_speaker = new_speaker
                         else:
                             lines[-1]["text"] += ch['text']
+                            lines[-1]["end"] = format_time(ch['end'])
                     response = {"lines": lines, "buffer": buffer}
                     await websocket.send_json(response)