Ensure VAD supports language detection
- app.py +3 -2
- src/vad.py +12 -7
app.py

@@ -90,7 +90,8 @@ class WhisperTranscriber:
     def transcribe_file(self, model: whisper.Whisper, audio_path: str, language: str, task: str = None, vad: str = None,
                         vadMergeWindow: float = 5, vadMaxMergeSize: float = 150, vadPadding: float = 1, vadPromptWindow: float = 1, **decodeOptions: dict):
         # Callable for processing an audio file
-        whisperCallable = lambda audio, prompt : model.transcribe(audio, language=language, task=task, initial_prompt=prompt, **decodeOptions)
+        whisperCallable = lambda audio, prompt, detected_language : model.transcribe(audio, \
+            language=language if language else detected_language, task=task, initial_prompt=prompt, **decodeOptions)

         # The results
         if (vad == 'silero-vad'):

@@ -112,7 +113,7 @@ class WhisperTranscriber:
             result = periodic_vad.transcribe(audio_path, whisperCallable)
         else:
             # Default VAD
-            result = whisperCallable(audio_path, None)
+            result = whisperCallable(audio_path, None, None)

         return result
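For context, a minimal sketch of the contract the new three-argument callable follows; make_whisper_callable is illustrative and not part of the change. The user-selected language always wins, and the language detected by the VAD loop is only a fallback.

import whisper

def make_whisper_callable(model: whisper.Whisper, language: str, task: str, **decodeOptions):
    # Illustrative helper mirroring the lambda above: the user-selected
    # language takes priority; detected_language (possibly None) is only
    # used when no language was specified.
    return lambda audio, prompt, detected_language: model.transcribe(
        audio,
        language=language if language else detected_language,
        task=task,
        initial_prompt=prompt,
        **decodeOptions)

# Without VAD there is no prompt and no detection yet, hence:
#   result = whisperCallable(audio_path, None, None)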
src/vad.py

@@ -100,9 +100,9 @@ class AbstractTranscription(ABC):
         audio: str
             The audio file.

-        whisperCallable: Callable[[Union[str, np.ndarray, torch.Tensor], str], dict[str, Union[dict, Any]]]
+        whisperCallable: Callable[[Union[str, np.ndarray, torch.Tensor], str, str], dict[str, Union[dict, Any]]]
             The callback that is used to invoke Whisper on an audio file/buffer. The first parameter is the audio file/buffer,
-            the second parameter is an optional text prompt. The return value is the result of the Whisper call.
+            the second parameter is an optional text prompt, and the last is the current detected language. The return value is the result of the Whisper call.

         Returns
         -------
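A conforming callback would now look roughly like this; the function name and body are illustrative, only the shape matches the documented Callable type.

from typing import Any, Union
import numpy as np
import torch

def example_callable(audio: Union[str, np.ndarray, torch.Tensor],
                     prompt: str, detected_language: str) -> dict[str, Union[dict, Any]]:
    # Illustrative only: any function with this shape can be passed as
    # whisperCallable; both prompt and detected_language may be None.
    return {"text": "", "segments": [], "language": detected_language or ""}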
@@ -145,6 +145,7 @@ class AbstractTranscription(ABC):
             'language': ""
         }
         languageCounter = Counter()
+        detected_language = None

         # For each time segment, run whisper
         for segment in merged:
@@ -163,9 +164,12 @@ class AbstractTranscription(ABC):
             # Previous segments to use as a prompt
             segment_prompt = ' '.join([segment['text'] for segment in prompt_window]) if len(prompt_window) > 0 else None

+            # Detected language
+            detected_language = languageCounter.most_common(1)[0][0] if len(languageCounter) > 0 else None
+
             print("Running whisper from ", format_timestamp(segment_start), " to ", format_timestamp(segment_end), ", duration: ",
-                  segment_duration, "expanded: ", segment_expand_amount, "prompt: ", segment_prompt)
-            segment_result = whisperCallable(segment_audio, segment_prompt)
+                  segment_duration, "expanded: ", segment_expand_amount, "prompt: ", segment_prompt, "language: ", detected_language)
+            segment_result = whisperCallable(segment_audio, segment_prompt, detected_language)

             adjusted_segments = self.adjust_timestamp(segment_result["segments"], adjust_seconds=segment_start, max_source_time=segment_duration)
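The running detection above is a simple majority vote over the languages Whisper has reported for earlier segments. A standalone sketch of the mechanism, with made-up sample data:

from collections import Counter

languageCounter = Counter()
for reported in ["en", "en", "no", "en"]:   # per-segment 'language' values from Whisper
    languageCounter[reported] += 1

# Before each segment, the most frequent language seen so far is used as a hint;
# an empty counter (first segment) yields None.
detected_language = languageCounter.most_common(1)[0][0] if len(languageCounter) > 0 else None
print(detected_language)  # -> 'en'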
@@ -185,13 +189,14 @@ class AbstractTranscription(ABC):
             result['segments'].extend(adjusted_segments)

             # Increment detected language
-            languageCounter[segment_result['language']] += 1
+            if not segment_gap:
+                languageCounter[segment_result['language']] += 1

             # Update prompt window
             self.__update_prompt_window(prompt_window, adjusted_segments, segment_end, segment_gap)

-        if len(languageCounter) > 0:
-            result['language'] = languageCounter.most_common(1)[0][0]
+        if detected_language is not None:
+            result['language'] = detected_language

         return result
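Two details in this last hunk are worth noting (the interpretation is mine, not stated in the diff): gap segments no longer vote, so padding or non-speech audio cannot skew the counter, and if nothing was ever detected the result keeps its initial empty 'language'. The len() guard used when picking the most common language matters because most_common on an empty Counter returns an empty list:

from collections import Counter

empty = Counter()
print(empty.most_common(1))    # -> [] (nothing to rank)
# empty.most_common(1)[0][0]   # would raise IndexError, hence the len() check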