Sofia Casadei committed · bebdee6
1 Parent(s): 8d6b944
up: params
main.py CHANGED
@@ -2,8 +2,6 @@ import os
 import logging
 import json
 import torch
-import asyncio
-import subprocess
 
 import gradio as gr
 import numpy as np
@@ -126,13 +124,19 @@ async def transcribe(audio: tuple[int, np.ndarray]):
     outputs = transcribe_pipeline(
         audio_to_bytes(audio),  # pass bytes
         #audio_array,  # pass numpy array
-        chunk_length_s=
+        chunk_length_s=5,
         batch_size=1,
         generate_kwargs={
-
-
-
-
+            "compression_ratio_threshold": 1.35,
+            "no_speech_threshold": 0.6,
+            "logprob_threshold": -1.0,
+            #"num_beams": 1,
+            #"condition_on_prev_tokens": False,
+            #"temperature": (0.0, 0.2, 0.4, 0.6),
+            "return_timestamps": True,
+            "task": "transcribe",
+            "language": LANGUAGE,
+        }
     )
     yield AdditionalOutputs(outputs["text"].strip())
 
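For context on the new generate_kwargs: these are Whisper's anti-hallucination checks in transformers. A decoded segment is considered failed when its gzip compression ratio exceeds compression_ratio_threshold (repetitive output) or its average log-probability falls below logprob_threshold, and it is treated as silence when the no-speech probability exceeds no_speech_threshold while the log-probability check also fails; the commented-out temperature tuple is the retry schedule those checks would drive. A minimal standalone sketch of an equivalent call follows; the checkpoint, audio file, and language value are placeholder assumptions, not taken from this Space.

from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",  # assumption: any Whisper checkpoint
)

outputs = asr(
    "sample.wav",      # placeholder audio file
    chunk_length_s=5,  # same 5 s chunking as the commit
    batch_size=1,
    generate_kwargs={
        "compression_ratio_threshold": 1.35,  # flag repetitive segments
        "no_speech_threshold": 0.6,           # flag probable silence
        "logprob_threshold": -1.0,            # flag low-confidence segments
        "return_timestamps": True,            # also return segment timestamps
        "task": "transcribe",
        "language": "en",                     # stands in for LANGUAGE
    },
)
print(outputs["text"].strip())

With a single implicit temperature there is nothing to retry with, so the thresholds mostly mark segments; uncommenting the temperature tuple is what arms the full fallback loop.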
@@ -142,25 +146,25 @@ stream = Stream(
     transcribe,
     algo_options=AlgoOptions(
         # Duration in seconds of audio chunks passed to the VAD model (default 0.6)
-        audio_chunk_duration=0.
+        audio_chunk_duration=0.5,
         # If the chunk has more than started_talking_threshold seconds of speech, the user started talking (default 0.2)
         started_talking_threshold=0.1,
         # If, after the user started speaking, there is a chunk with less than speech_threshold seconds of speech, the user stopped speaking (default 0.1)
         speech_threshold=0.1,
         # Max duration of speech chunks before the handler is triggered, even if a pause is not detected by the VAD model (default -inf)
-        max_continuous_speech_s=
+        max_continuous_speech_s=5
     ),
     model_options=SileroVadOptions(
         # Threshold for what is considered speech (default 0.5)
         threshold=0.5,
         # Final speech chunks shorter than min_speech_duration_ms are thrown out (default 250)
-        min_speech_duration_ms=
+        min_speech_duration_ms=200,
         # Max duration of speech chunks; longer chunks are split at the timestamp of the last silence longer than 100 ms (if any) or just before max_speech_duration_s (default float('inf')); used internally by the VAD algorithm to split the audio passed to it
-        max_speech_duration_s=
+        max_speech_duration_s=5,
         # Wait this many ms at the end of each speech chunk before separating it (default 2000)
         min_silence_duration_ms=100,
         # Chunk size for the VAD model; can be 512, 1024, or 1536 for a 16 kHz sample rate (default 1024)
-        window_size_samples=
+        window_size_samples=1024,
         # Final speech chunks are padded by speech_pad_ms on each side (default 400)
         speech_pad_ms=200,
     ),
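The timing values were chosen to agree with one another. A back-of-the-envelope check, assuming the 16 kHz sample rate that the window_size_samples comment refers to (all other numbers are read straight off the diff):

SAMPLE_RATE = 16_000  # assumption: the 16 kHz rate named in the VAD comment

# One VAD decision covers window_size_samples / SAMPLE_RATE seconds.
window_size_samples = 1024
print(window_size_samples / SAMPLE_RATE * 1000)  # 64.0 -> 64 ms per VAD window

# Both caps are pinned to the pipeline's chunk_length_s=5, so the VAD splits
# a long utterance and the handler fires on the same 5 s boundary.
max_speech_duration_s = 5    # SileroVadOptions: split inside the VAD
max_continuous_speech_s = 5  # AlgoOptions: force the handler to trigger
chunk_length_s = 5           # Whisper pipeline chunk size

# Padding adds speech_pad_ms to each side, so a full-length segment comes out
# slightly longer than one Whisper chunk and may still be split once.
speech_pad_ms = 200
print(max_speech_duration_s + 2 * speech_pad_ms / 1000)  # 5.4 (s)

min_speech_duration_ms=200 and speech_pad_ms=200 both sit below their defaults (250 and 400), trading a little context for lower latency, consistent with the rest of the commit.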