Sofia Casadei committed
Commit bebdee6 · 1 Parent(s): 8d6b944

up: params

Files changed (1):
  1. main.py +16 -12
main.py CHANGED
@@ -2,8 +2,6 @@ import os
 import logging
 import json
 import torch
-import asyncio
-import subprocess
 
 import gradio as gr
 import numpy as np
@@ -126,13 +124,19 @@ async def transcribe(audio: tuple[int, np.ndarray]):
     outputs = transcribe_pipeline(
         audio_to_bytes(audio), # pass bytes
         #audio_array, # pass numpy array
-        chunk_length_s=3,
+        chunk_length_s=5,
         batch_size=1,
         generate_kwargs={
-            'task': 'transcribe',
-            'language': LANGUAGE,
-            },
-        #return_timestamps="word"
+            "compression_ratio_threshold": 1.35,
+            "no_speech_threshold": 0.6,
+            "logprob_threshold": -1.0,
+            #"num_beams": 1,
+            #"condition_on_prev_tokens": False,
+            #"temperature": (0.0, 0.2, 0.4, 0.6),
+            "return_timestamps": True,
+            "task": "transcribe",
+            "language": LANGUAGE,
+        }
     )
     yield AdditionalOutputs(outputs["text"].strip())
 
@@ -142,25 +146,25 @@ stream = Stream(
     transcribe,
     algo_options=AlgoOptions(
         # Duration in seconds of audio chunks passed to the VAD model (default 0.6)
-        audio_chunk_duration=0.6,
+        audio_chunk_duration=0.5,
         # If the chunk has more than started_talking_threshold seconds of speech, the user started talking (default 0.2)
         started_talking_threshold=0.1,
         # If, after the user started speaking, there is a chunk with less than speech_threshold seconds of speech, the user stopped speaking. (default 0.1)
         speech_threshold=0.1,
         # Max duration of speech chunks before the handler is triggered, even if a pause is not detected by the VAD model. (default -inf)
-        max_continuous_speech_s=6
+        max_continuous_speech_s=5
     ),
     model_options=SileroVadOptions(
         # Threshold for what is considered speech (default 0.5)
        threshold=0.5,
         # Final speech chunks shorter min_speech_duration_ms are thrown out (default 250)
-        min_speech_duration_ms=250,
+        min_speech_duration_ms=200,
         # Max duration of speech chunks, longer will be split at the timestamp of the last silence that lasts more than 100ms (if any) or just before max_speech_duration_s (default float('inf')) (used internally in the VAD algorithm to split the audio that's passed to the algorithm)
-        max_speech_duration_s=3,
+        max_speech_duration_s=5,
         # Wait for ms at the end of each speech chunk before separating it (default 2000)
         min_silence_duration_ms=100,
         # Chunk size for VAD model. Can be 512, 1024, 1536 for 16k s.r. (default 1024)
-        window_size_samples=512,
+        window_size_samples=1024,
         # Final speech chunks are padded by speech_pad_ms each side (default 400)
         speech_pad_ms=200,
     ),
 
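For reference, the thresholds added to generate_kwargs are Whisper's long-form quality gates: a decoded segment counts as a failure when its compression ratio rises above 1.35 (repetitive output) or its average log-probability falls below -1.0, and it is skipped as silence when the no-speech probability exceeds 0.6. The commented-out temperature tuple would enable retries at higher temperatures when a segment fails those checks. A minimal sketch of how a transcribe_pipeline like this one is typically built and called with the new parameters follows; the checkpoint name, device handling, and LANGUAGE value are illustrative assumptions, not taken from this commit:

    import torch
    from transformers import pipeline

    MODEL_ID = "openai/whisper-large-v3"  # assumption: any Whisper checkpoint works the same way
    LANGUAGE = "it"                       # assumption: main.py defines LANGUAGE elsewhere

    transcribe_pipeline = pipeline(
        "automatic-speech-recognition",
        model=MODEL_ID,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device="cuda" if torch.cuda.is_available() else "cpu",
    )

    outputs = transcribe_pipeline(
        "sample.wav",      # the app passes raw bytes from audio_to_bytes(); a file path works too
        chunk_length_s=5,  # matches the 5 s speech chunks the VAD now produces
        batch_size=1,
        generate_kwargs={
            "compression_ratio_threshold": 1.35,  # fail segments that decode too repetitively
            "no_speech_threshold": 0.6,           # treat segments above this as silence
            "logprob_threshold": -1.0,            # fail segments with low decoder confidence
            "return_timestamps": True,
            "task": "transcribe",
            "language": LANGUAGE,
        },
    )
    print(outputs["text"].strip())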
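On the VAD side, the new values line up with the pipeline change: max_speech_duration_s=5 and max_continuous_speech_s=5 cap each utterance at the same 5 seconds the pipeline now processes per chunk (chunk_length_s=5), so one VAD speech chunk maps onto one Whisper chunk. Raising window_size_samples from 512 to 1024 gives the Silero model twice the audio per speech/no-speech decision at the cost of coarser timing. At the 16 kHz sample rate the comments assume, the valid window sizes work out as follows (a quick check, nothing repo-specific):

    SAMPLE_RATE = 16_000  # sample rate assumed by the "for 16k s.r." comment above

    # Duration of each valid Silero VAD window size at 16 kHz.
    for window_size_samples in (512, 1024, 1536):
        ms = window_size_samples / SAMPLE_RATE * 1000
        print(f"{window_size_samples:>4} samples -> {ms:.0f} ms per VAD decision")

    # Output:
    #  512 samples -> 32 ms per VAD decision
    # 1024 samples -> 64 ms per VAD decision
    # 1536 samples -> 96 ms per VAD decision

Coarser decisions mainly shift where a split lands; the speech_pad_ms=200 padding on each side of every chunk absorbs most of that slack.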