Fedir Zadniprovskyi committed · Commit 2a79f48 · Parent(s): 125092f

refactor

Changed files:
- faster_whisper_server/asr.py +3 -19
- faster_whisper_server/core.py +103 -52
- faster_whisper_server/main.py +5 -3
- faster_whisper_server/server_models.py +11 -65
- faster_whisper_server/transcriber.py +4 -23
- faster_whisper_server/utils.py +0 -14
faster_whisper_server/asr.py
CHANGED
@@ -1,11 +1,10 @@
 import asyncio
-from collections.abc import Iterable
 import time
 
 from faster_whisper import transcribe
 
 from faster_whisper_server.audio import Audio
-from faster_whisper_server.core import Transcription, Word
+from faster_whisper_server.core import Segment, Transcription, Word
 from faster_whisper_server.logger import logger
 
 
@@ -30,7 +29,8 @@ class FasterWhisperASR:
             word_timestamps=True,
             **self.transcribe_opts,
         )
-        words = words_from_whisper_segments(segments)
+        segments = Segment.from_faster_whisper_segments(segments)
+        words = Word.from_segments(segments)
         for word in words:
             word.offset(audio.start)
         transcription = Transcription(words)
@@ -54,19 +54,3 @@ class FasterWhisperASR:
             audio,
             prompt,
         )
-
-
-def words_from_whisper_segments(segments: Iterable[transcribe.Segment]) -> list[Word]:
-    words: list[Word] = []
-    for segment in segments:
-        assert segment.words is not None
-        words.extend(
-            Word(
-                start=word.start,
-                end=word.end,
-                text=word.word,
-                probability=word.probability,
-            )
-            for word in segment.words
-        )
-    return words
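For context, a minimal sketch (not part of the commit) of how the refactored transcription path composes. The model name and audio file below are illustrative placeholders; only the Segment/Word/Transcription calls come from this diff, and the offset-by-audio-start step used in asr.py is omitted here.

# Sketch only: "tiny.en" and "sample.wav" are assumptions, not from the commit.
from faster_whisper import WhisperModel

from faster_whisper_server.core import Segment, Transcription, Word

model = WhisperModel("tiny.en")
raw_segments, _info = model.transcribe("sample.wav", word_timestamps=True)

# faster-whisper segments -> pydantic Segment models -> flat list of Words
segments = Segment.from_faster_whisper_segments(raw_segments)
words = Word.from_segments(segments)
print(Transcription(words).text)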
faster_whisper_server/core.py
CHANGED
@@ -1,43 +1,85 @@
-# TODO: rename module
 from __future__ import annotations
 
-from dataclasses import dataclass
 import re
+from typing import TYPE_CHECKING
+
+from pydantic import BaseModel
 
 from faster_whisper_server.config import config
 
+if TYPE_CHECKING:
+    from collections.abc import Iterable
 
-@dataclass
-class Segment:
-    text: str
-    start: float = 0.0
-    end: float = 0.0
+    import faster_whisper.transcribe
 
-    @property
-    def is_eos(self) -> bool:
-        if self.text.endswith("..."):
-            return False
-        return any(self.text.endswith(punctuation_symbol) for punctuation_symbol in ".?!")
+
+class Word(BaseModel):
+    start: float
+    end: float
+    word: str
+    probability: float
+
+    @classmethod
+    def from_segments(cls, segments: Iterable[Segment]) -> list[Word]:
+        words: list[Word] = []
+        for segment in segments:
+            assert segment.words is not None
+            words.extend(segment.words)
+        return words
 
     def offset(self, seconds: float) -> None:
         self.start += seconds
         self.end += seconds
 
-
-# TODO: use the `Word` from `faster-whisper.transcribe` instead
-@dataclass
-class Word(Segment):
-    probability: float = 0.0
-
     @classmethod
     def common_prefix(cls, a: list[Word], b: list[Word]) -> list[Word]:
         i = 0
-        while i < len(a) and i < len(b) and canonicalize_word(a[i].text) == canonicalize_word(b[i].text):
+        while i < len(a) and i < len(b) and canonicalize_word(a[i].word) == canonicalize_word(b[i].word):
             i += 1
         return a[:i]
 
 
+class Segment(BaseModel):
+    id: int
+    seek: int
+    start: float
+    end: float
+    text: str
+    tokens: list[int]
+    temperature: float
+    avg_logprob: float
+    compression_ratio: float
+    no_speech_prob: float
+    words: list[Word] | None
+
+    @classmethod
+    def from_faster_whisper_segments(cls, segments: Iterable[faster_whisper.transcribe.Segment]) -> Iterable[Segment]:
+        for segment in segments:
+            yield cls(
+                id=segment.id,
+                seek=segment.seek,
+                start=segment.start,
+                end=segment.end,
+                text=segment.text,
+                tokens=segment.tokens,
+                temperature=segment.temperature,
+                avg_logprob=segment.avg_logprob,
+                compression_ratio=segment.compression_ratio,
+                no_speech_prob=segment.no_speech_prob,
+                words=[
+                    Word(
+                        start=word.start,
+                        end=word.end,
+                        word=word.word,
+                        probability=word.probability,
+                    )
+                    for word in segment.words
+                ]
+                if segment.words is not None
+                else None,
+            )
+
+
 class Transcription:
     def __init__(self, words: list[Word] = []) -> None:
         self.words: list[Word] = []
@@ -45,7 +87,7 @@ class Transcription:
 
     @property
     def text(self) -> str:
-        return " ".join(word.text for word in self.words).strip()
+        return " ".join(word.word for word in self.words).strip()
 
     @property
     def start(self) -> float:
@@ -77,48 +119,57 @@ class Transcription:
         raise ValueError(f"Words overlap: {words[i - 1]} and {words[i]}. All words: {words}")
 
 
-def …
-    …
-    …
-    …
-    …
-    …
-    assert not …
-    assert …
+def is_eos(text: str) -> bool:
+    if text.endswith("..."):
+        return False
+    return any(text.endswith(punctuation_symbol) for punctuation_symbol in ".?!")
+
+
+def test_is_eos() -> None:
+    assert not is_eos("Hello")
+    assert not is_eos("Hello...")
+    assert is_eos("Hello.")
+    assert is_eos("Hello!")
+    assert is_eos("Hello?")
+    assert not is_eos("Hello. Yo")
+    assert not is_eos("Hello. Yo...")
+    assert is_eos("Hello. Yo.")
 
 
-def to_full_sentences(words: list[Word]) -> list[Segment]:
-    sentences: list[Segment] = …
+def to_full_sentences(words: list[Word]) -> list[list[Word]]:
+    sentences: list[list[Word]] = [[]]
     for word in words:
-        sentences[-1]…
-        …
-        …
-        …
-        )
-        if word.is_eos:
-            sentences.append(Segment(""))
-    if len(sentences) > 0 and not sentences[-1].is_eos:
+        sentences[-1].append(word)
+        if is_eos(word.word):
+            sentences.append([])
+    if len(sentences[-1]) == 0 or not is_eos(sentences[-1][-1].word):
         sentences.pop()
     return sentences
 
 
 def tests_to_full_sentences() -> None:
+    def word(text: str) -> Word:
+        return Word(word=text, start=0.0, end=0.0, probability=0.0)
+
     assert to_full_sentences([]) == []
-    assert to_full_sentences([…
-    assert to_full_sentences([…
-    assert to_full_sentences([…
-    assert to_full_sentences([…
-        …
+    assert to_full_sentences([word(text="Hello")]) == []
+    assert to_full_sentences([word(text="Hello..."), word(" world")]) == []
+    assert to_full_sentences([word(text="Hello..."), word(" world.")]) == [[word("Hello..."), word(" world.")]]
+    assert to_full_sentences([word(text="Hello..."), word(" world."), word(" How")]) == [
+        [word("Hello..."), word(" world.")],
     ]
 
 
-def …
-    return "".join(word.…
+def word_to_text(words: list[Word]) -> str:
+    return "".join(word.word for word in words)
+
+
+def words_to_text_w_ts(words: list[Word]) -> str:
+    return "".join(f"{word.word}({word.start:.2f}-{word.end:.2f})" for word in words)
 
 
-def …
-    return "".join(…
+def segments_to_text(segments: Iterable[Segment]) -> str:
+    return "".join(segment.text for segment in segments).strip()
 
 
 def canonicalize_word(text: str) -> str:
@@ -136,14 +187,14 @@ def test_canonicalize_word() -> None:
 
 def common_prefix(a: list[Word], b: list[Word]) -> list[Word]:
     i = 0
-    while i < len(a) and i < len(b) and canonicalize_word(a[i].text) == canonicalize_word(b[i].text):
+    while i < len(a) and i < len(b) and canonicalize_word(a[i].word) == canonicalize_word(b[i].word):
         i += 1
     return a[:i]
 
 
 def test_common_prefix() -> None:
     def word(text: str) -> Word:
-        return Word(…
+        return Word(word=text, start=0.0, end=0.0, probability=0.0)
 
     a = [word("a"), word("b"), word("c")]
     b = [word("a"), word("b"), word("c")]
@@ -176,7 +227,7 @@ def test_common_prefix() -> None:
 
 def test_common_prefix_and_canonicalization() -> None:
     def word(text: str) -> Word:
-        return Word(…
+        return Word(word=text, start=0.0, end=0.0, probability=0.0)
 
     a = [word("A...")]
     b = [word("a?"), word("b"), word("c")]
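A rough usage sketch (not part of the commit) of the new sentence helpers; the word values are invented, and the expected results follow the tests in the diff above.

from faster_whisper_server.core import Word, is_eos, to_full_sentences, word_to_text

words = [
    Word(word="Hello...", start=0.0, end=0.4, probability=0.9),
    Word(word=" world.", start=0.4, end=0.8, probability=0.9),
    Word(word=" How", start=0.8, end=1.0, probability=0.9),
]

assert not is_eos("Hello...")  # "..." does not close a sentence
assert is_eos(" world.")       # ".", "!" and "?" do

# Only completed sentences are kept; the trailing " How" is dropped.
sentences = to_full_sentences(words)
assert word_to_text(sentences[0]) == "Hello... world."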
faster_whisper_server/main.py
CHANGED
@@ -24,7 +24,6 @@ from faster_whisper.vad import VadOptions, get_speech_timestamps
 import huggingface_hub
 from pydantic import AfterValidator
 
-from faster_whisper_server import utils
 from faster_whisper_server.asr import FasterWhisperASR
 from faster_whisper_server.audio import AudioStream, audio_samples_from_file
 from faster_whisper_server.config import (
@@ -34,6 +33,7 @@ from faster_whisper_server.config import (
     Task,
     config,
 )
+from faster_whisper_server.core import Segment, segments_to_text
 from faster_whisper_server.logger import logger
 from faster_whisper_server.server_models import (
     ModelListResponse,
@@ -46,7 +46,7 @@ from faster_whisper_server.transcriber import audio_transcriber
 if TYPE_CHECKING:
     from collections.abc import Generator, Iterable
 
-    from faster_whisper.transcribe import …
+    from faster_whisper.transcribe import TranscriptionInfo
     from huggingface_hub.hf_api import ModelInfo
 
 loaded_models: OrderedDict[str, WhisperModel] = OrderedDict()
@@ -157,7 +157,7 @@ def segments_to_response(
 ) -> str | TranscriptionJsonResponse | TranscriptionVerboseJsonResponse:
     segments = list(segments)
     if response_format == ResponseFormat.TEXT:  # noqa: RET503
-        return utils.segments_text(segments)
+        return segments_to_text(segments)
     elif response_format == ResponseFormat.JSON:
         return TranscriptionJsonResponse.from_segments(segments)
     elif response_format == ResponseFormat.VERBOSE_JSON:
@@ -220,6 +220,7 @@ def translate_file(
         temperature=temperature,
         vad_filter=True,
     )
+    segments = Segment.from_faster_whisper_segments(segments)
 
     if stream:
         return segments_to_streaming_response(segments, transcription_info, response_format)
@@ -258,6 +259,7 @@ def transcribe_file(
         vad_filter=True,
         hotwords=hotwords,
     )
+    segments = Segment.from_faster_whisper_segments(segments)
 
     if stream:
         return segments_to_streaming_response(segments, transcription_info, response_format)
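A small sketch (not part of the commit) of what the ResponseFormat.TEXT path now reduces to; the Segment values below are invented purely to show the join-and-strip behavior of segments_to_text.

from faster_whisper_server.core import Segment, segments_to_text

segments = [
    Segment(id=1, seek=0, start=0.0, end=1.2, text=" Hello world.", tokens=[],
            temperature=0.0, avg_logprob=-0.1, compression_ratio=1.0,
            no_speech_prob=0.01, words=None),
    Segment(id=2, seek=0, start=1.2, end=2.4, text=" How are you?", tokens=[],
            temperature=0.0, avg_logprob=-0.1, compression_ratio=1.0,
            no_speech_prob=0.01, words=None),
]

assert segments_to_text(segments) == "Hello world. How are you?"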
faster_whisper_server/server_models.py
CHANGED
@@ -4,12 +4,10 @@ from typing import TYPE_CHECKING, Literal
 
 from pydantic import BaseModel, ConfigDict, Field
 
-from faster_whisper_server import …
+from faster_whisper_server.core import Segment, Transcription, Word, segments_to_text
 
 if TYPE_CHECKING:
-    from faster_whisper.transcribe import …
-
-    from faster_whisper_server.core import Transcription
+    from faster_whisper.transcribe import TranscriptionInfo
 
 
 # https://platform.openai.com/docs/api-reference/audio/json-object
@@ -18,65 +16,21 @@ class TranscriptionJsonResponse(BaseModel):
 
     @classmethod
     def from_segments(cls, segments: list[Segment]) -> TranscriptionJsonResponse:
-        return cls(text=…
+        return cls(text=segments_to_text(segments))
 
     @classmethod
     def from_transcription(cls, transcription: Transcription) -> TranscriptionJsonResponse:
         return cls(text=transcription.text)
 
 
-class WordObject(BaseModel):
-    start: float
-    end: float
-    word: str
-    probability: float
-
-    @classmethod
-    def from_word(cls, word: Word) -> WordObject:
-        return cls(
-            start=word.start,
-            end=word.end,
-            word=word.word,
-            probability=word.probability,
-        )
-
-
-class SegmentObject(BaseModel):
-    id: int
-    seek: int
-    start: float
-    end: float
-    text: str
-    tokens: list[int]
-    temperature: float
-    avg_logprob: float
-    compression_ratio: float
-    no_speech_prob: float
-
-    @classmethod
-    def from_segment(cls, segment: Segment) -> SegmentObject:
-        return cls(
-            id=segment.id,
-            seek=segment.seek,
-            start=segment.start,
-            end=segment.end,
-            text=segment.text,
-            tokens=segment.tokens,
-            temperature=segment.temperature,
-            avg_logprob=segment.avg_logprob,
-            compression_ratio=segment.compression_ratio,
-            no_speech_prob=segment.no_speech_prob,
-        )
-
-
 # https://platform.openai.com/docs/api-reference/audio/verbose-json-object
 class TranscriptionVerboseJsonResponse(BaseModel):
     task: str = "transcribe"
     language: str
     duration: float
     text: str
-    words: list[WordObject]
-    segments: list[SegmentObject]
+    words: list[Word]
+    segments: list[Segment]
 
     @classmethod
     def from_segment(cls, segment: Segment, transcription_info: TranscriptionInfo) -> TranscriptionVerboseJsonResponse:
@@ -84,8 +38,8 @@ class TranscriptionVerboseJsonResponse(BaseModel):
             language=transcription_info.language,
             duration=segment.end - segment.start,
             text=segment.text,
-            words=(…
-            segments=[…
+            words=(segment.words if isinstance(segment.words, list) else []),
+            segments=[segment],
         )
 
     @classmethod
@@ -95,9 +49,9 @@ class TranscriptionVerboseJsonResponse(BaseModel):
         return cls(
             language=transcription_info.language,
             duration=transcription_info.duration,
-            text=…
-            segments=…
-            words=…
+            text=segments_to_text(segments),
+            segments=segments,
+            words=Word.from_segments(segments),
         )
 
     @classmethod
@@ -106,15 +60,7 @@ class TranscriptionVerboseJsonResponse(BaseModel):
             language="english",  # FIX: hardcoded
             duration=transcription.duration,
             text=transcription.text,
-            words=[
-                WordObject(
-                    start=word.start,
-                    end=word.end,
-                    word=word.text,
-                    probability=word.probability,
-                )
-                for word in transcription.words
-            ],
+            words=transcription.words,
             segments=[],  # FIX: hardcoded
         )
 
faster_whisper_server/transcriber.py
CHANGED
@@ -4,12 +4,7 @@ from typing import TYPE_CHECKING
 
 from faster_whisper_server.audio import Audio, AudioStream
 from faster_whisper_server.config import config
-from faster_whisper_server.core import (
-    Transcription,
-    Word,
-    common_prefix,
-    to_full_sentences,
-)
+from faster_whisper_server.core import Transcription, Word, common_prefix, to_full_sentences, word_to_text
 from faster_whisper_server.logger import logger
 
 if TYPE_CHECKING:
@@ -37,30 +32,16 @@ class LocalAgreement:
 
         return prefix
 
-    @classmethod
-    def prompt(cls, confirmed: Transcription) -> str | None:
-        sentences = to_full_sentences(confirmed.words)
-        if len(sentences) == 0:
-            return None
-        return sentences[-1].text
-
-    # TODO: better name
-    @classmethod
-    def needs_audio_after(cls, confirmed: Transcription) -> float:
-        full_sentences = to_full_sentences(confirmed.words)
-        return full_sentences[-1].end if len(full_sentences) > 0 else 0.0
-
 
+# TODO: needs a better name
 def needs_audio_after(confirmed: Transcription) -> float:
     full_sentences = to_full_sentences(confirmed.words)
-    return full_sentences[-1].end if len(full_sentences) > 0 else 0.0
+    return full_sentences[-1][-1].end if len(full_sentences) > 0 else 0.0
 
 
 def prompt(confirmed: Transcription) -> str | None:
     sentences = to_full_sentences(confirmed.words)
-    if len(sentences) == 0:
-        return None
-    return sentences[-1].text
+    return word_to_text(sentences[-1]) if len(sentences) > 0 else None
 
 
 async def audio_transcriber(
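Illustrative sketch only (values invented, and the Transcription constructor is assumed to store the words it is given, as asr.py uses it): with sentences now returned as lists of Words, the module-level helpers read the last confirmed sentence directly.

from faster_whisper_server.core import Transcription, Word
from faster_whisper_server.transcriber import needs_audio_after, prompt

confirmed = Transcription(
    [
        Word(word="Hi.", start=0.0, end=0.5, probability=0.9),
        Word(word=" Bye", start=0.5, end=1.0, probability=0.9),
    ]
)

assert prompt(confirmed) == "Hi."           # word_to_text of the last full sentence
assert needs_audio_after(confirmed) == 0.5  # end time of that sentence's last word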
faster_whisper_server/utils.py
DELETED
@@ -1,14 +0,0 @@
-from faster_whisper.transcribe import Segment, Word
-
-
-def segments_text(segments: list[Segment]) -> str:
-    return "".join(segment.text for segment in segments).strip()
-
-
-def words_from_segments(segments: list[Segment]) -> list[Word]:
-    words = []
-    for segment in segments:
-        if segment.words is None:
-            continue
-        words.extend(segment.words)
-    return words