Spaces:

fedirz
/

faster-whisper-server

Configuration error

App Files Community

Fedir Zadniprovskyi committed on May 23, 2024

Commit

db7bf9a

1 Parent(s): af21424

feat: improve openai compatibility

Browse files

Files changed (2) hide show

speaches/main.py +9 -18
speaches/server_models.py +40 -8

speaches/main.py CHANGED Viewed

@@ -7,14 +7,8 @@ from contextlib import asynccontextmanager
 from io import BytesIO
 from typing import Annotated
-from fastapi import (
-    Depends,
-    FastAPI,
-    Response,
-    UploadFile,
-    WebSocket,
-    WebSocketDisconnect,
-)
 from fastapi.websockets import WebSocketState
 from faster_whisper import WhisperModel
 from faster_whisper.vad import VadOptions, get_speech_timestamps
@@ -24,11 +18,8 @@ from speaches.audio import AudioStream, audio_samples_from_file
 from speaches.config import SAMPLES_PER_SECOND, Language, config
 from speaches.core import Transcription
 from speaches.logger import logger
-from speaches.server_models import (
-    ResponseFormat,
-    TranscriptionResponse,
-    TranscriptionVerboseResponse,
-)
 from speaches.transcriber import audio_transcriber
 whisper: WhisperModel = None  # type: ignore
@@ -132,12 +123,12 @@ def format_transcription(
     if response_format == ResponseFormat.TEXT:
         return transcription.text
     elif response_format == ResponseFormat.JSON:
-        return TranscriptionResponse(text=transcription.text).model_dump_json()
     elif response_format == ResponseFormat.VERBOSE_JSON:
-        return TranscriptionVerboseResponse(
-            duration=transcription.duration,
-            text=transcription.text,
-            words=transcription.words,
         ).model_dump_json()

 from io import BytesIO
 from typing import Annotated
+from fastapi import (Depends, FastAPI, Response, UploadFile, WebSocket,
+                     WebSocketDisconnect)
 from fastapi.websockets import WebSocketState
 from faster_whisper import WhisperModel
 from faster_whisper.vad import VadOptions, get_speech_timestamps
 from speaches.config import SAMPLES_PER_SECOND, Language, config
 from speaches.core import Transcription
 from speaches.logger import logger
+from speaches.server_models import (ResponseFormat, TranscriptionJsonResponse,
+                                    TranscriptionVerboseJsonResponse)
 from speaches.transcriber import audio_transcriber
 whisper: WhisperModel = None  # type: ignore
     if response_format == ResponseFormat.TEXT:
         return transcription.text
     elif response_format == ResponseFormat.JSON:
+        return TranscriptionJsonResponse.from_transcription(
+            transcription
+        ).model_dump_json()
     elif response_format == ResponseFormat.VERBOSE_JSON:
+        return TranscriptionVerboseJsonResponse.from_transcription(
+            transcription
         ).model_dump_json()

speaches/server_models.py CHANGED Viewed

@@ -1,26 +1,58 @@
 import enum
 from pydantic import BaseModel
-from speaches.core import Word
 class ResponseFormat(enum.StrEnum):
-    JSON = "json"
     TEXT = "text"
     VERBOSE_JSON = "verbose_json"
 # https://platform.openai.com/docs/api-reference/audio/json-object
-class TranscriptionResponse(BaseModel):
     text: str
-# Subset of https://platform.openai.com/docs/api-reference/audio/verbose-json-object
-class TranscriptionVerboseResponse(BaseModel):
     task: str = "transcribe"
     duration: float
     text: str
-    words: list[
-        Word
-    ]  # Different from OpenAI's `words`. `Word.text` instead of `Word.word`

+from __future__ import annotations
 import enum
+from faster_whisper.transcribe import Segment, Word
 from pydantic import BaseModel
+from speaches.core import Transcription
+# https://platform.openai.com/docs/api-reference/audio/createTranscription#audio-createtranscription-response_format
 class ResponseFormat(enum.StrEnum):
     TEXT = "text"
+    JSON = "json"
     VERBOSE_JSON = "verbose_json"
+    # VTT = "vtt"
+    # SRT = "srt"
 # https://platform.openai.com/docs/api-reference/audio/json-object
+class TranscriptionJsonResponse(BaseModel):
     text: str
+    @classmethod
+    def from_transcription(
+        cls, transcription: Transcription
+    ) -> TranscriptionJsonResponse:
+        return cls(text=transcription.text)
+# https://platform.openai.com/docs/api-reference/audio/verbose-json-object
+class TranscriptionVerboseJsonResponse(BaseModel):
     task: str = "transcribe"
+    language: str
     duration: float
     text: str
+    words: list[Word]
+    segments: list[Segment]
+    @classmethod
+    def from_transcription(
+        cls, transcription: Transcription
+    ) -> TranscriptionVerboseJsonResponse:
+        return cls(
+            language="english",  # FIX: hardcoded
+            duration=transcription.duration,
+            text=transcription.text,
+            words=[
+                Word(
+                    start=word.start,
+                    end=word.end,
+                    word=word.text,
+                    probability=word.probability,
+                )
+                for word in transcription.words
+            ],
+            segments=[],  # FIX: hardcoded
+        )