Sofia Casadei committed · bebdee6
1 Parent(s): 8d6b944
up: params
main.py CHANGED
@@ -2,8 +2,6 @@ import os
 import logging
 import json
 import torch
-import asyncio
-import subprocess
 
 import gradio as gr
 import numpy as np
@@ -126,13 +124,19 @@ async def transcribe(audio: tuple[int, np.ndarray]):
     outputs = transcribe_pipeline(
         audio_to_bytes(audio),  # pass bytes
         #audio_array,  # pass numpy array
-        chunk_length_s=
+        chunk_length_s=5,
         batch_size=1,
         generate_kwargs={
-
-
-
-
+            "compression_ratio_threshold": 1.35,
+            "no_speech_threshold": 0.6,
+            "logprob_threshold": -1.0,
+            #"num_beams": 1,
+            #"condition_on_prev_tokens": False,
+            #"temperature": (0.0, 0.2, 0.4, 0.6),
+            "return_timestamps": True,
+            "task": "transcribe",
+            "language": LANGUAGE,
+        }
     )
     yield AdditionalOutputs(outputs["text"].strip())
 
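For context on the new generate_kwargs: these are Whisper's anti-hallucination checks in transformers. A decoded segment is considered failed when its gzip compression ratio exceeds compression_ratio_threshold (repetitive output) or its average log-probability falls below logprob_threshold, and it is treated as silence when the no-speech probability exceeds no_speech_threshold while the log-probability check also fails; the commented-out temperature tuple is the retry schedule those checks would drive. A minimal standalone sketch of an equivalent call follows; the checkpoint, audio file, and language value are placeholder assumptions, not taken from this Space.

from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",  # assumption: any Whisper checkpoint
)

outputs = asr(
    "sample.wav",      # placeholder audio file
    chunk_length_s=5,  # same 5 s chunking as the commit
    batch_size=1,
    generate_kwargs={
        "compression_ratio_threshold": 1.35,  # flag repetitive segments
        "no_speech_threshold": 0.6,           # flag probable silence
        "logprob_threshold": -1.0,            # flag low-confidence segments
        "return_timestamps": True,            # also return segment timestamps
        "task": "transcribe",
        "language": "en",                     # stands in for LANGUAGE
    },
)
print(outputs["text"].strip())

With a single implicit temperature there is nothing to retry with, so the thresholds mostly mark segments; uncommenting the temperature tuple is what arms the full fallback loop.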
@@ -142,25 +146,25 @@ stream = Stream(
     transcribe,
     algo_options=AlgoOptions(
         # Duration in seconds of audio chunks passed to the VAD model (default 0.6)
-        audio_chunk_duration=0.
+        audio_chunk_duration=0.5,
         # If the chunk has more than started_talking_threshold seconds of speech, the user started talking (default 0.2)
         started_talking_threshold=0.1,
         # If, after the user started speaking, there is a chunk with less than speech_threshold seconds of speech, the user stopped speaking (default 0.1)
         speech_threshold=0.1,
         # Max duration of speech chunks before the handler is triggered, even if a pause is not detected by the VAD model (default -inf)
-        max_continuous_speech_s=
+        max_continuous_speech_s=5
     ),
     model_options=SileroVadOptions(
         # Threshold for what is considered speech (default 0.5)
         threshold=0.5,
         # Final speech chunks shorter than min_speech_duration_ms are thrown out (default 250)
-        min_speech_duration_ms=
+        min_speech_duration_ms=200,
         # Max duration of speech chunks; longer chunks are split at the timestamp of the last silence longer than 100 ms (if any) or just before max_speech_duration_s (default float('inf')); used internally by the VAD algorithm to split the audio passed to it
-        max_speech_duration_s=
+        max_speech_duration_s=5,
         # Wait this many ms at the end of each speech chunk before separating it (default 2000)
         min_silence_duration_ms=100,
         # Chunk size for the VAD model; can be 512, 1024, or 1536 for a 16 kHz sample rate (default 1024)
-        window_size_samples=
+        window_size_samples=1024,
         # Final speech chunks are padded by speech_pad_ms on each side (default 400)
         speech_pad_ms=200,
     ),
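The timing values were chosen to agree with one another. A back-of-the-envelope check, assuming the 16 kHz sample rate that the window_size_samples comment refers to (all other numbers are read straight off the diff):

SAMPLE_RATE = 16_000  # assumption: the 16 kHz rate named in the VAD comment

# One VAD decision covers window_size_samples / SAMPLE_RATE seconds.
window_size_samples = 1024
print(window_size_samples / SAMPLE_RATE * 1000)  # 64.0 -> 64 ms per VAD window

# Both caps are pinned to the pipeline's chunk_length_s=5, so the VAD splits
# a long utterance and the handler fires on the same 5 s boundary.
max_speech_duration_s = 5    # SileroVadOptions: split inside the VAD
max_continuous_speech_s = 5  # AlgoOptions: force the handler to trigger
chunk_length_s = 5           # Whisper pipeline chunk size

# Padding adds speech_pad_ms to each side, so a full-length segment comes out
# slightly longer than one Whisper chunk and may still be split once.
speech_pad_ms = 200
print(max_speech_duration_s + 2 * speech_pad_ms / 1000)  # 5.4 (s)

min_speech_duration_ms=200 and speech_pad_ms=200 both sit below their defaults (250 and 400), trading a little context for lower latency, consistent with the rest of the commit.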