Spaces:

UcsTurkey
/

flare

Paused

App Files Files Community

ciyidogan committed on Jul 8

Commit

e90d3a0

verified ·

1 Parent(s): c5bf788

Create stt_deepgram.py

Browse files

Files changed (1) hide show

stt/stt_deepgram.py +425 -0

stt/stt_deepgram.py ADDED Viewed

	@@ -0,0 +1,425 @@

+"""
+Deepgram Speech-to-Text Implementation
+"""
+import os
+import asyncio
+import websockets
+import json
+from typing import AsyncIterator, Optional, List, Any, Dict
+from datetime import datetime
+import queue
+import threading
+import time
+import traceback
+import base64
+from urllib.parse import urlencode
+from utils.logger import log_info, log_error, log_debug, log_warning
+from .stt_interface import STTInterface, STTConfig, TranscriptionResult
+class DeepgramSTT(STTInterface):
+    """Deepgram Speech-to-Text implementation with advanced VAD support"""
+    def __init__(self, api_key: str):
+        if not api_key:
+            raise ValueError("Deepgram API key is required")
+        self.api_key = api_key
+        self.websocket = None
+        self.is_streaming = False
+        self.responses_queue = queue.Queue(maxsize=100)
+        self.ws_thread = None
+        self.stop_event = threading.Event()
+        # Session tracking
+        self.session_id = 0
+        self.total_audio_bytes = 0
+        self.total_chunks = 0
+        # VAD tracking
+        self.vad_enabled = False
+        self.last_speech_end_time = None
+        log_info(f"✅ Deepgram STT initialized")
+    def _get_websocket_url(self, config: STTConfig) -> str:
+        """Build Deepgram WebSocket URL with parameters"""
+        base_url = "wss://api.deepgram.com/v1/listen"
+        params = {
+            "language": config.language,
+            "model": "nova-2",  # Use Nova-2 for best performance
+            "punctuate": str(config.enable_punctuation).lower(),
+            "interim_results": str(config.interim_results).lower(),
+            "utterance_end_ms": str(config.speech_timeout_ms),
+            "vad_events": str(config.vad_enabled).lower(),
+            "smart_format": "true",
+            "no_delay": "true",  # Low latency mode
+            "encoding": self._map_encoding(config.encoding),
+            "sample_rate": str(config.sample_rate)
+        }
+        # Add endpointing for VAD support
+        if config.vad_enabled:
+            params["endpointing"] = str(config.speech_timeout_ms)
+        # Single utterance mode
+        if config.single_utterance:
+            params["utterance_end_ms"] = "1000"  # Faster end detection for single utterance
+        query_string = urlencode(params)
+        return f"{base_url}?{query_string}"
+    def _map_encoding(self, encoding: str) -> str:
+        """Map encoding to Deepgram format"""
+        encoding_map = {
+            "WEBM_OPUS": "webm-opus",
+            "LINEAR16": "linear16",
+            "FLAC": "flac",
+            "MP3": "mp3",
+            "OGG_OPUS": "ogg-opus",
+        }
+        return encoding_map.get(encoding, "webm-opus")
+    async def start_streaming(self, config: STTConfig) -> None:
+        """Initialize streaming session with WebSocket"""
+        try:
+            # Stop any existing stream
+            if self.is_streaming or self.ws_thread:
+                log_warning("⚠️ Previous stream still active, stopping it first")
+                await self.stop_streaming()
+                await asyncio.sleep(0.5)
+            # Reset session data
+            self._reset_session_data()
+            log_info(f"🎤 Starting Deepgram STT streaming session #{self.session_id}")
+            log_debug(f"Config: language={config.language}, vad={config.vad_enabled}, interim={config.interim_results}")
+            # Clear stop event
+            self.stop_event.clear()
+            # Store config
+            self.config = config
+            self.vad_enabled = config.vad_enabled
+            # Start WebSocket thread
+            self.is_streaming = True
+            self.ws_thread = threading.Thread(
+                target=self._run_websocket,
+                args=(config,),
+                name=f"DeepgramSTT-Session-{self.session_id}"
+            )
+            self.ws_thread.daemon = True
+            self.ws_thread.start()
+            # Wait a bit for connection
+            await asyncio.sleep(0.5)
+            if not self.is_streaming:
+                raise RuntimeError("Failed to establish WebSocket connection")
+            log_info(f"✅ Deepgram STT streaming session #{self.session_id} started successfully")
+        except Exception as e:
+            log_error(f"❌ Failed to start Deepgram STT streaming", error=str(e))
+            self.is_streaming = False
+            self.websocket = None
+            raise
+    async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
+        """Stream audio chunk and get transcription results"""
+        if not self.is_streaming:
+            raise RuntimeError("Streaming not started. Call start_streaming() first.")
+        try:
+            # Send audio to WebSocket
+            if self.websocket and not self.websocket.closed:
+                # Send as binary data
+                await asyncio.get_event_loop().run_in_executor(
+                    None,
+                    self._send_audio_sync,
+                    audio_chunk
+                )
+                self.total_chunks += 1
+                self.total_bytes += len(audio_chunk)
+                # Log progress
+                if self.total_chunks % 50 == 0:
+                    log_debug(f"📊 Progress: {self.total_chunks} chunks, {self.total_bytes/1024:.1f}KB total")
+            # Check for results
+            while True:
+                try:
+                    result = self.responses_queue.get_nowait()
+                    yield result
+                except queue.Empty:
+                    break
+        except Exception as e:
+            log_error(f"❌ Deepgram STT streaming error", error=str(e))
+            self.is_streaming = False
+            raise
+    def _send_audio_sync(self, audio_chunk: bytes):
+        """Synchronous method to send audio"""
+        if self.websocket and not self.websocket.closed:
+            try:
+                asyncio.run(self.websocket.send(audio_chunk))
+            except Exception as e:
+                log_error(f"❌ Error sending audio chunk: {e}")
+    async def stop_streaming(self) -> Optional[TranscriptionResult]:
+        """Stop streaming and clean up"""
+        if not self.is_streaming and not self.ws_thread:
+            log_debug("Already stopped, nothing to do")
+            return None
+        try:
+            log_info(f"🛑 Stopping Deepgram STT streaming session #{self.session_id}")
+            # Set stop flag
+            self.is_streaming = False
+            self.stop_event.set()
+            # Close WebSocket
+            if self.websocket and not self.websocket.closed:
+                try:
+                    await self.websocket.close()
+                except:
+                    pass
+            # Wait for thread
+            if self.ws_thread and self.ws_thread.is_alive():
+                log_info("⏳ Waiting for WebSocket thread to finish...")
+                self.ws_thread.join(timeout=5.0)
+                if self.ws_thread.is_alive():
+                    log_warning("⚠️ WebSocket thread did not stop gracefully")
+                else:
+                    log_info("✅ WebSocket thread finished")
+            # Get final result
+            final_result = None
+            while not self.responses_queue.empty():
+                try:
+                    result = self.responses_queue.get_nowait()
+                    if result.is_final:
+                        final_result = result
+                except queue.Empty:
+                    break
+            # Reset
+            self.websocket = None
+            self.ws_thread = None
+            self.stop_event.clear()
+            log_info(f"✅ Deepgram STT streaming session #{self.session_id} stopped")
+            return final_result
+        except Exception as e:
+            log_error(f"❌ Error during stop_streaming", error=str(e))
+            self.is_streaming = False
+            self.websocket = None
+            self.ws_thread = None
+            return None
+    def _run_websocket(self, config: STTConfig):
+        """Run WebSocket connection in separate thread"""
+        asyncio.set_event_loop(asyncio.new_event_loop())
+        loop = asyncio.get_event_loop()
+        try:
+            loop.run_until_complete(self._websocket_handler(config))
+        except Exception as e:
+            log_error(f"❌ WebSocket thread error", error=str(e), traceback=traceback.format_exc())
+        finally:
+            loop.close()
+            self.is_streaming = False
+    async def _websocket_handler(self, config: STTConfig):
+        """Handle WebSocket connection and messages"""
+        url = self._get_websocket_url(config)
+        headers = {
+            "Authorization": f"Token {self.api_key}"
+        }
+        try:
+            log_info(f"🔌 Connecting to Deepgram WebSocket...")
+            async with websockets.connect(url, extra_headers=headers) as websocket:
+                self.websocket = websocket
+                log_info(f"✅ Connected to Deepgram WebSocket")
+                # Send keep-alive and receive messages
+                receive_task = asyncio.create_task(self._receive_messages())
+                keepalive_task = asyncio.create_task(self._send_keepalive())
+                # Wait until stop event or connection closes
+                while not self.stop_event.is_set() and not websocket.closed:
+                    await asyncio.sleep(0.1)
+                # Cancel tasks
+                receive_task.cancel()
+                keepalive_task.cancel()
+                try:
+                    await receive_task
+                    await keepalive_task
+                except asyncio.CancelledError:
+                    pass
+        except Exception as e:
+            log_error(f"❌ WebSocket connection error", error=str(e))
+            self.is_streaming = False
+    async def _receive_messages(self):
+        """Receive and process messages from WebSocket"""
+        try:
+            async for message in self.websocket:
+                if self.stop_event.is_set():
+                    break
+                try:
+                    data = json.loads(message)
+                    self._process_deepgram_message(data)
+                except json.JSONDecodeError as e:
+                    log_error(f"❌ Failed to parse message: {e}")
+        except websockets.exceptions.ConnectionClosed:
+            log_info("WebSocket connection closed")
+        except Exception as e:
+            log_error(f"❌ Error receiving messages: {e}")
+    async def _send_keepalive(self):
+        """Send keepalive messages to maintain connection"""
+        try:
+            while not self.stop_event.is_set():
+                if self.websocket and not self.websocket.closed:
+                    await self.websocket.send(json.dumps({"type": "KeepAlive"}))
+                await asyncio.sleep(8)  # Deepgram requires keepalive every 10s
+        except Exception as e:
+            log_debug(f"Keepalive stopped: {e}")
+    def _process_deepgram_message(self, data: Dict[str, Any]):
+        """Process Deepgram response message"""
+        msg_type = data.get("type", "")
+        if msg_type == "Results":
+            # Transcription result
+            channel = data.get("channel", {})
+            alternatives = channel.get("alternatives", [])
+            if alternatives:
+                alt = alternatives[0]
+                transcript = alt.get("transcript", "")
+                confidence = alt.get("confidence", 0.0)
+                is_final = data.get("is_final", False)
+                # Skip empty transcripts unless it's a final result
+                if transcript.strip() or is_final:
+                    result = TranscriptionResult(
+                        text=transcript,
+                        is_final=is_final,
+                        confidence=confidence,
+                        timestamp=datetime.now().timestamp()
+                    )
+                    # Queue result
+                    try:
+                        self.responses_queue.put(result)
+                        if is_final:
+                            log_info(f"🎯 FINAL: '{transcript}'")
+                        else:
+                            log_debug(f"📝 Interim: '{transcript}'")
+                    except queue.Full:
+                        log_warning("⚠️ Response queue full")
+        elif msg_type == "SpeechStarted":
+            # VAD: Speech started
+            log_debug("🎤 VAD: Speech started")
+        elif msg_type == "UtteranceEnd":
+            # VAD: Utterance ended
+            log_debug("🔚 VAD: Utterance ended")
+            self.last_speech_end_time = datetime.now()
+            # For single utterance mode, this signals end
+            if hasattr(self, 'config') and self.config.single_utterance:
+                log_info("✅ Single utterance completed - VAD triggered")
+        elif msg_type == "Error":
+            # Error message
+            error = data.get("error", {})
+            log_error(f"❌ Deepgram error: {error}")
+        elif msg_type == "Metadata":
+            # Connection metadata
+            log_debug(f"Metadata: {data}")
+    def _reset_session_data(self):
+        """Reset session-specific data"""
+        # Clear queue
+        while not self.responses_queue.empty():
+            try:
+                self.responses_queue.get_nowait()
+            except:
+                pass
+        # Reset counters
+        self.total_audio_bytes = 0
+        self.total_chunks = 0
+        self.session_id += 1
+        self.last_speech_end_time = None
+        log_info(f"🔄 Deepgram STT session data reset. New session ID: {self.session_id}")
+    def supports_realtime(self) -> bool:
+        """Deepgram supports real-time streaming"""
+        return True
+    def get_supported_languages(self) -> List[str]:
+        """Get list of supported language codes"""
+        # Deepgram supports 36+ languages with Nova-2
+        return [
+            "tr",      # Turkish
+            "en",      # English
+            "en-US",   # English (US)
+            "en-GB",   # English (UK)
+            "de",      # German
+            "fr",      # French
+            "es",      # Spanish
+            "it",      # Italian
+            "pt",      # Portuguese
+            "ru",      # Russian
+            "ja",      # Japanese
+            "ko",      # Korean
+            "zh",      # Chinese
+            "ar",      # Arabic
+            "nl",      # Dutch
+            "sv",      # Swedish
+            "pl",      # Polish
+            "hi",      # Hindi
+            "cs",      # Czech
+            "da",      # Danish
+            "fi",      # Finnish
+            "el",      # Greek
+            "he",      # Hebrew
+            "hu",      # Hungarian
+            "id",      # Indonesian
+            "ms",      # Malay
+            "no",      # Norwegian
+            "ro",      # Romanian
+            "sk",      # Slovak
+            "th",      # Thai
+            "uk",      # Ukrainian
+            "vi",      # Vietnamese
+        ]
+    def get_provider_name(self) -> str:
+        """Get provider name"""
+        return "deepgram"