Spaces:
Paused
Paused
"""
STT (Speech-to-Text) Interface and Data Models
"""
| from abc import ABC, abstractmethod | |
| from typing import Optional, Dict, Any, AsyncIterator, List | |
| from dataclasses import dataclass | |
| from enum import Enum | |
| import json | |
class STTEngineType(Enum):
    """Identifiers of the speech-to-text engines known to this module.

    Every member's value is a lowercase string, so a member can be looked up
    from its string form, e.g. ``STTEngineType("google")``.
    """

    # NOTE(review): presumably selects "no STT engine configured" -- confirm
    # against the code that consumes this enum.
    NO_STT = "no_stt"
    GOOGLE = "google"
    AZURE = "azure"
    AMAZON = "amazon"
    FLICKER = "flicker"
@dataclass
class STTConfig:
    """STT configuration parameters.

    Every field has a default, so ``STTConfig()`` yields a complete
    configuration and individual settings can be overridden by keyword,
    e.g. ``STTConfig(language="en-US", sample_rate=8000)``.

    How each setting is interpreted is up to the STT provider that receives
    the configuration; this class only carries the values.
    """

    language: str = "tr-TR"
    sample_rate: int = 16000
    encoding: str = "WEBM_OPUS"
    enable_punctuation: bool = True
    enable_word_timestamps: bool = False
    model: str = "latest_long"
    use_enhanced: bool = True
    single_utterance: bool = False
    interim_results: bool = True

    # Voice Activity Detection
    vad_enabled: bool = True
    speech_timeout_ms: int = 2000

    # Noise reduction
    noise_reduction_enabled: bool = True
    # NOTE(review): the valid range/scale of this level is not defined here
    # -- confirm against the noise-reduction implementation.
    noise_reduction_level: int = 2
@dataclass
class TranscriptionResult:
    """Result from STT engine.

    ``text``, ``is_final``, ``confidence`` and ``timestamp`` are required;
    the remaining fields are optional and default to ``None`` / ``False``.

    Attributes:
        text: The transcribed text.
        is_final: Whether this is a final result (as opposed to an interim
            one).
        confidence: Confidence score reported for the transcription.
            NOTE(review): range/scale is provider-defined -- confirm.
        timestamp: Time associated with the result.
            NOTE(review): unit and epoch are not fixed here -- confirm.
        word_timestamps: Optional per-word timing entries; the dict schema
            is not defined in this module.
        language: Optional language code of the transcription.
        is_interrupt: Flag marking the result as an interruption; defaults
            to ``False``.
    """

    text: str
    is_final: bool
    confidence: float
    timestamp: float
    word_timestamps: Optional[List[Dict]] = None
    language: Optional[str] = None
    is_interrupt: bool = False
class STTInterface(ABC):
    """Abstract base class for STT providers.

    All five methods are abstract: a concrete provider must override every
    one of them, otherwise instantiating it raises ``TypeError``.
    """

    @abstractmethod
    async def start_streaming(self, config: "STTConfig") -> None:
        """Start streaming session.

        Args:
            config: Configuration to use for the session.
        """

    @abstractmethod
    async def stream_audio(
        self, audio_chunk: bytes
    ) -> AsyncIterator["TranscriptionResult"]:
        """Stream audio chunk and get transcription results.

        Args:
            audio_chunk: Raw audio bytes to feed to the engine.

        Returns:
            An async iterator of transcription results.

        NOTE(review): the return annotation suggests implementations are
        async generators consumed with ``async for`` -- confirm against the
        callers, since a plain ``async def`` would have to be awaited first.
        """

    @abstractmethod
    async def stop_streaming(self) -> Optional["TranscriptionResult"]:
        """Stop streaming and get final result.

        Returns:
            The final transcription result, or ``None`` if there is none.
        """

    @abstractmethod
    def supports_realtime(self) -> bool:
        """Check if provider supports real-time streaming."""

    @abstractmethod
    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes."""