Spaces:

UcsTurkey
/

flare

Running

App Files Files Community

ciyidogan committed on Jul 8

Commit

ad5c8be

verified ·

1 Parent(s): e8fb1f7

Update stt/stt_deepgram.py

Browse files

Files changed (1) hide show

stt/stt_deepgram.py +224 -286

stt/stt_deepgram.py CHANGED Viewed

@@ -1,39 +1,37 @@
 """
-Deepgram Speech-to-Text Implementation - Optimized for Voice Agent
 """
-import os
 import asyncio
-import websockets
-import json
-from typing import AsyncIterator, Optional, List, Any, Dict
 from datetime import datetime
 import queue
 import threading
-import time
 import traceback
-import base64
-from urllib.parse import urlencode
 from utils.logger import log_info, log_error, log_debug, log_warning
 from .stt_interface import STTInterface, STTConfig, TranscriptionResult
 class DeepgramSTT(STTInterface):
-    """Deepgram STT - Single utterance mode with VAD"""
     def __init__(self, api_key: str):
         if not api_key:
             raise ValueError("Deepgram API key is required")
-        # Debug için API key'in ilk 5 karakterini logla
-        log_info(f"🔑 Deepgram API key resolved: {api_key[:10]}... (length: {len(api_key)})")
         self.api_key = api_key
-        self.websocket = None
         self.is_streaming = False
         self.responses_queue = queue.Queue(maxsize=100)
-        self.ws_thread = None
-        self.stop_event = threading.Event()
         # Session tracking
         self.session_id = 0
@@ -42,64 +40,34 @@ class DeepgramSTT(STTInterface):
         # Final result tracking
         self.final_result_received = False
-        log_info(f"✅ Deepgram STT initialized for single utterance mode")
-    def _get_websocket_url(self, config: STTConfig) -> str:
-        """Build Deepgram WebSocket URL with optimized parameters"""
-        base_url = "wss://api.deepgram.com/v1/listen"
-        # Manuel olarak optimize edilmiş parametreler
-        """
-        params = {
-            "language": config.language,  # Dil config'den alınır
-            "model": "nova-2",           # En iyi model
-            "punctuate": "true",         # Noktalama işaretleri açık
-            "interim_results": "false",   # ❌ Interim results KAPALI
-            "utterance_end_ms": "1000",   # 1 saniye sessizlik = konuşma sonu
-            "vad_events": "true",         # VAD events AÇIK
-            "smart_format": "true",       # Akıllı formatlama
-            "no_delay": "true",          # Düşük gecikme modu
-            "encoding": "webm-opus",      # WebM Opus encoding
-            "sample_rate": "16000",       # 16kHz sample rate
-            "endpointing": "1000",        # 1 saniye endpointing
-            "diarize": "false",          # Speaker diarization kapalı
-            "multichannel": "false",      # Tek kanal
-            "alternatives": "1",          # Sadece en iyi alternatif
-            "profanity_filter": "false",  # Küfür filtresi kapalı
-            "redact": "false",           # Redaction kapalı
-            "replace": "false",          # Replace kapalı
-            "search": "false",           # Search kapalı
-            "keywords": "false",         # Keywords kapalı
-            "filler_words": "false",     # Filler words algılama kapalı
-            "numerals": "true"           # Sayıları rakam olarak yaz
-        }
-        """
-        params = {
-            "language": "tr",
-            "model": "nova-2",
-            "punctuate": "true",
-            "interim_results": "false",
-            "smart_format": "true",
-            "encoding": "webm-opus",
-            "sample_rate": "16000"
         }
-        # VAD ve endpointing parametreleri
-        if config.single_utterance or True:  # Single utterance mode için
-            params["utterance_end_ms"] = "1000"  # 1 saniye sessizlik
-            params["vad_events"] = "true"        # VAD events
-            params["endpointing"] = "1000"       # Endpointing
-        query_string = urlencode(params)
-        return f"{base_url}?{query_string}"
     async def start_streaming(self, config: STTConfig) -> None:
-        """Initialize streaming session - single utterance mode"""
         try:
             # Stop any existing stream
-            if self.is_streaming or self.ws_thread:
                 log_warning("⚠️ Previous stream still active, stopping it first")
                 await self.stop_streaming()
                 await asyncio.sleep(0.5)
@@ -107,124 +75,204 @@ class DeepgramSTT(STTInterface):
             # Reset session data
             self._reset_session_data()
-            log_info(f"🎤 Starting Deepgram STT - Single Utterance Mode #{self.session_id}")
-            log_debug(f"Language: {config.language}, Sample Rate: 16kHz, Utterance End: 1000ms")
-            # Clear stop event
-            self.stop_event.clear()
-            self.final_result_received = False
-            # Store config
-            self.config = config
-            # Start WebSocket thread
-            self.is_streaming = True
-            self.ws_thread = threading.Thread(
-                target=self._run_websocket,
-                args=(config,),
-                name=f"DeepgramSTT-SingleUtterance-{self.session_id}"
             )
-            self.ws_thread.daemon = True
-            self.ws_thread.start()
-            # Wait for connection
-            await asyncio.sleep(0.5)
-            if not self.is_streaming:
-                raise RuntimeError("Failed to establish WebSocket connection")
-            log_info(f"✅ Deepgram STT ready - Listening for single utterance")
         except Exception as e:
             log_error(f"❌ Failed to start Deepgram STT", error=str(e))
             self.is_streaming = False
-            self.websocket = None
             raise
     async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
-        """Stream audio chunk - only returns final results"""
-        if not self.is_streaming:
             raise RuntimeError("Streaming not started. Call start_streaming() first.")
-        # Eğer final result alındıysa, daha fazla audio kabul etme
         if self.final_result_received:
             log_debug("Final result already received, ignoring audio chunk")
             return
         try:
-            # Send audio to WebSocket
-            if self.websocket and not self.websocket.closed:
-                # Send as binary data
-                await asyncio.get_event_loop().run_in_executor(
-                    None,
-                    self._send_audio_sync,
-                    audio_chunk
-                )
-                self.total_chunks += 1
-                self.total_bytes += len(audio_chunk)
-                # Log progress every 50 chunks
-                if self.total_chunks % 50 == 0:
-                    log_debug(f"📊 Listening... {self.total_chunks} chunks, {self.total_bytes/1024:.1f}KB")
-            # Check for final results only
             while True:
                 try:
                     result = self.responses_queue.get_nowait()
-                    # Sadece final result'ları yield et
                     if result.is_final:
                         yield result
                 except queue.Empty:
                     break
         except Exception as e:
-            log_error(f"❌ Deepgram STT streaming error", error=str(e))
             self.is_streaming = False
             raise
-    def _send_audio_sync(self, audio_chunk: bytes):
-        """Synchronous method to send audio"""
-        if self.websocket and not self.websocket.closed and not self.final_result_received:
-            try:
-                asyncio.run(self.websocket.send(audio_chunk))
-            except Exception as e:
-                log_error(f"❌ Error sending audio chunk: {e}")
     async def stop_streaming(self) -> Optional[TranscriptionResult]:
-        """Stop streaming and dispose"""
-        if not self.is_streaming and not self.ws_thread:
             log_debug("Already stopped, nothing to do")
             return None
         try:
-            log_info(f"🛑 Disposing Deepgram STT session #{self.session_id}")
-            # Set stop flag
             self.is_streaming = False
-            self.stop_event.set()
-            # Close WebSocket with close frame
-            if self.websocket and not self.websocket.closed:
                 try:
-                    # Send close frame to trigger final response
-                    await self.websocket.send(json.dumps({"type": "CloseStream"}))
-                    await asyncio.sleep(0.2)  # Wait for final response
-                    await self.websocket.close()
-                except:
-                    pass
-            # Wait for thread
-            if self.ws_thread and self.ws_thread.is_alive():
-                log_debug("⏳ Waiting for WebSocket thread to finish...")
-                self.ws_thread.join(timeout=3.0)
-                if self.ws_thread.is_alive():
-                    log_warning("⚠️ WebSocket thread did not stop gracefully")
-                else:
-                    log_debug("✅ WebSocket thread finished")
-            # Get the final result
             final_result = None
             while not self.responses_queue.empty():
                 try:
@@ -234,143 +282,21 @@ class DeepgramSTT(STTInterface):
                 except queue.Empty:
                     break
-            # Reset everything
-            self.websocket = None
-            self.ws_thread = None
-            self.stop_event.clear()
             self.final_result_received = False
-            log_info(f"✅ Deepgram STT session #{self.session_id} disposed")
             return final_result
         except Exception as e:
             log_error(f"❌ Error during stop_streaming", error=str(e))
             self.is_streaming = False
-            self.websocket = None
-            self.ws_thread = None
             return None
-    def _run_websocket(self, config: STTConfig):
-        """Run WebSocket connection in separate thread"""
-        asyncio.set_event_loop(asyncio.new_event_loop())
-        loop = asyncio.get_event_loop()
-        try:
-            loop.run_until_complete(self._websocket_handler(config))
-        except Exception as e:
-            log_error(f"❌ WebSocket thread error", error=str(e), traceback=traceback.format_exc())
-        finally:
-            loop.close()
-            self.is_streaming = False
-    async def _websocket_handler(self, config: STTConfig):
-        """Handle WebSocket connection and messages"""
-        url = self._get_websocket_url(config)
-        headers = {
-            "Authorization": f"Token {self.api_key}"
-        }
-        try:
-            log_debug(f"🔌 Connecting to Deepgram WebSocket...")
-            async with websockets.connect(url, extra_headers=headers, ping_interval=5) as websocket:
-                self.websocket = websocket
-                log_info(f"✅ Connected to Deepgram - Ready for speech")
-                # Receive messages task only (no keepalive needed for short sessions)
-                receive_task = asyncio.create_task(self._receive_messages())
-                # Wait until stop event, final result, or connection closes
-                while not self.stop_event.is_set() and not websocket.closed and not self.final_result_received:
-                    await asyncio.sleep(0.1)
-                # Cancel task
-                receive_task.cancel()
-                try:
-                    await receive_task
-                except asyncio.CancelledError:
-                    pass
-        except Exception as e:
-            log_error(f"❌ WebSocket connection error", error=str(e))
-            self.is_streaming = False
-    async def _receive_messages(self):
-        """Receive and process messages from WebSocket"""
-        try:
-            async for message in self.websocket:
-                if self.stop_event.is_set() or self.final_result_received:
-                    break
-                try:
-                    data = json.loads(message)
-                    self._process_deepgram_message(data)
-                except json.JSONDecodeError as e:
-                    log_error(f"❌ Failed to parse message: {e}")
-        except websockets.exceptions.ConnectionClosed:
-            log_info("WebSocket connection closed")
-        except Exception as e:
-            log_error(f"❌ Error receiving messages: {e}")
-    def _process_deepgram_message(self, data: Dict[str, Any]):
-        """Process Deepgram response message"""
-        msg_type = data.get("type", "")
-        if msg_type == "Results":
-            # Transcription result
-            is_final = data.get("is_final", False)
-            # Sadece final result'ları işle
-            if is_final:
-                channel = data.get("channel", {})
-                alternatives = channel.get("alternatives", [])
-                if alternatives:
-                    alt = alternatives[0]
-                    transcript = alt.get("transcript", "")
-                    confidence = alt.get("confidence", 0.0)
-                    # Create final result
-                    result = TranscriptionResult(
-                        text=transcript,
-                        is_final=True,
-                        confidence=confidence,
-                        timestamp=datetime.now().timestamp()
-                    )
-                    # Queue result
-                    try:
-                        self.responses_queue.put(result)
-                        self.final_result_received = True
-                        log_info(f"🎯 FINAL RESULT: '{transcript}' (confidence: {confidence:.2f})")
-                        log_info(f"📊 Session stats: {self.total_chunks} chunks, {self.total_bytes/1024:.1f}KB")
-                    except queue.Full:
-                        log_warning("⚠️ Response queue full")
-        elif msg_type == "SpeechStarted":
-            # VAD: Speech started
-            log_info("🎤 Speech detected - User started speaking")
-        elif msg_type == "UtteranceEnd":
-            # VAD: Utterance ended - kullanıcı konuşmayı bitirdi
-            log_info("🔚 Speech ended - User stopped speaking")
-            # Bu noktada Deepgram final result gönderecek
-        elif msg_type == "Error":
-            # Error message
-            error = data.get("error", {})
-            log_error(f"❌ Deepgram error: {error}")
-        elif msg_type == "Metadata":
-            # Connection metadata
-            request_id = data.get("request_id", "")
-            log_debug(f"📋 Connected with request_id: {request_id}")
     def _reset_session_data(self):
         """Reset session-specific data"""
         # Clear queue
@@ -394,26 +320,38 @@ class DeepgramSTT(STTInterface):
     def get_supported_languages(self) -> List[str]:
         """Get list of supported language codes"""
-        # Deepgram Nova-2 supported languages
         return [
-            "tr",      # Turkish
-            "en",      # English
             "en-US",   # English (US)
             "en-GB",   # English (UK)
-            "de",      # German
-            "fr",      # French
-            "es",      # Spanish
-            "it",      # Italian
-            "pt",      # Portuguese
-            "ru",      # Russian
-            "ja",      # Japanese
-            "ko",      # Korean
-            "zh",      # Chinese
-            "ar",      # Arabic
-            "nl",      # Dutch
-            "sv",      # Swedish
-            "pl",      # Polish
-            "hi",      # Hindi
         ]
     def get_provider_name(self) -> str:

 """
+Deepgram Speech-to-Text Implementation using Deepgram SDK
 """
 import asyncio
+from typing import AsyncIterator, Optional, List, Any
 from datetime import datetime
 import queue
 import threading
 import traceback
+from deepgram import (
+    DeepgramClient,
+    DeepgramClientOptions,
+    LiveTranscriptionEvents,
+    LiveOptions,
+    Microphone,
+)
 from utils.logger import log_info, log_error, log_debug, log_warning
 from .stt_interface import STTInterface, STTConfig, TranscriptionResult
 class DeepgramSTT(STTInterface):
+    """Deepgram STT implementation using official SDK"""
     def __init__(self, api_key: str):
         if not api_key:
             raise ValueError("Deepgram API key is required")
         self.api_key = api_key
+        self.deepgram_client = None
+        self.live_connection = None
         self.is_streaming = False
         self.responses_queue = queue.Queue(maxsize=100)
         # Session tracking
         self.session_id = 0
         # Final result tracking
         self.final_result_received = False
+        self.stop_event = threading.Event()
+        log_info(f"✅ Deepgram STT initialized (SDK version)")
+    def _map_language_code(self, language: str) -> str:
+        """Map language codes to Deepgram format"""
+        language_map = {
+            "tr-TR": "tr",
+            "en-US": "en-US",
+            "en-GB": "en-GB",
+            "de-DE": "de",
+            "fr-FR": "fr",
+            "es-ES": "es",
+            "it-IT": "it",
+            "pt-BR": "pt-BR",
+            "ru-RU": "ru",
+            "ja-JP": "ja",
+            "ko-KR": "ko",
+            "zh-CN": "zh-CN",
+            "ar-SA": "ar",
         }
+        return language_map.get(language, language)
     async def start_streaming(self, config: STTConfig) -> None:
+        """Initialize streaming session using SDK"""
         try:
             # Stop any existing stream
+            if self.is_streaming:
                 log_warning("⚠️ Previous stream still active, stopping it first")
                 await self.stop_streaming()
                 await asyncio.sleep(0.5)
             # Reset session data
             self._reset_session_data()
+            log_info(f"🎤 Starting Deepgram STT (SDK) - Session #{self.session_id}")
+            # Create Deepgram client
+            config_options = DeepgramClientOptions(
+                verbose=False,
+                options={"keepalive": "true"}
+            )
+            self.deepgram_client = DeepgramClient(self.api_key, config=config_options)
+            # Configure live transcription options
+            deepgram_language = self._map_language_code(config.language)
+            options = LiveOptions(
+                language="tr",
+                model="nova-2",
+                punctuate=True,
+                smart_format=True,
+                encoding="webm-opus",
+                sample_rate=16000,
+                channels=1,
+                interim_results=False,  # Only final results
+                utterance_end_ms=1000,  # 1 second silence = end
+                vad_events=True,        # Enable VAD events
+                endpointing=1000,       # Enable endpointing
+                diarize=False,
+                numerals=True,
+                profanity_filter=False,
+                redact=False,
+                filler_words=False,
             )
+            log_debug(f"🔧 Deepgram options: language={deepgram_language}, model=nova-2, utterance_end=1000ms")
+            # Create live connection
+            self.live_connection = self.deepgram_client.listen.live.v("1")
+            # Setup event handlers
+            self._setup_event_handlers()
+            # Start the connection
+            if await self.live_connection.start(options):
+                self.is_streaming = True
+                log_info(f"✅ Deepgram SDK connected - Ready for speech")
+            else:
+                raise RuntimeError("Failed to start Deepgram connection")
         except Exception as e:
             log_error(f"❌ Failed to start Deepgram STT", error=str(e))
             self.is_streaming = False
+            self.live_connection = None
+            self.deepgram_client = None
             raise
+    def _setup_event_handlers(self):
+        """Setup event handlers for Deepgram events"""
+        # Transcript received
+        self.live_connection.on(LiveTranscriptionEvents.Transcript, self._on_transcript)
+        # Speech started
+        self.live_connection.on(LiveTranscriptionEvents.SpeechStarted, self._on_speech_started)
+        # Utterance end
+        self.live_connection.on(LiveTranscriptionEvents.UtteranceEnd, self._on_utterance_end)
+        # Metadata
+        self.live_connection.on(LiveTranscriptionEvents.Metadata, self._on_metadata)
+        # Error
+        self.live_connection.on(LiveTranscriptionEvents.Error, self._on_error)
+        # Connection closed
+        self.live_connection.on(LiveTranscriptionEvents.Close, self._on_close)
+    def _on_transcript(self, *args, **kwargs):
+        """Handle transcript event"""
+        try:
+            result = args[1] if len(args) > 1 else kwargs.get("result", {})
+            # Extract transcript data
+            is_final = result.get("is_final", False)
+            # Only process final results
+            if is_final:
+                channel = result.get("channel", {})
+                alternatives = channel.get("alternatives", [])
+                if alternatives:
+                    alt = alternatives[0]
+                    transcript = alt.get("transcript", "")
+                    confidence = alt.get("confidence", 0.0)
+                    if transcript.strip():  # Only process non-empty transcripts
+                        transcription_result = TranscriptionResult(
+                            text=transcript,
+                            is_final=True,
+                            confidence=confidence,
+                            timestamp=datetime.now().timestamp()
+                        )
+                        # Queue result
+                        try:
+                            self.responses_queue.put(transcription_result)
+                            self.final_result_received = True
+                            log_info(f"🎯 FINAL RESULT: '{transcript}' (confidence: {confidence:.2f})")
+                            log_info(f"📊 Session stats: {self.total_chunks} chunks, {self.total_audio_bytes/1024:.1f}KB")
+                        except queue.Full:
+                            log_warning("⚠️ Response queue full")
+        except Exception as e:
+            log_error(f"❌ Error processing transcript: {e}")
+    def _on_speech_started(self, *args, **kwargs):
+        """Handle speech started event"""
+        log_info("🎤 Speech detected - User started speaking")
+    def _on_utterance_end(self, *args, **kwargs):
+        """Handle utterance end event"""
+        log_info("🔚 Speech ended - User stopped speaking")
+        # Deepgram will send final transcript after this
+    def _on_metadata(self, *args, **kwargs):
+        """Handle metadata event"""
+        metadata = args[1] if len(args) > 1 else kwargs.get("metadata", {})
+        request_id = metadata.get("request_id", "")
+        log_debug(f"📋 Deepgram metadata - Request ID: {request_id}")
+    def _on_error(self, *args, **kwargs):
+        """Handle error event"""
+        error = args[1] if len(args) > 1 else kwargs.get("error", {})
+        log_error(f"❌ Deepgram error: {error}")
+    def _on_close(self, *args, **kwargs):
+        """Handle connection close event"""
+        log_info("🔌 Deepgram connection closed")
+        self.is_streaming = False
     async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
+        """Stream audio chunk and get transcription results"""
+        if not self.is_streaming or not self.live_connection:
             raise RuntimeError("Streaming not started. Call start_streaming() first.")
+        # Don't send audio if final result already received
         if self.final_result_received:
             log_debug("Final result already received, ignoring audio chunk")
             return
         try:
+            # Send audio to Deepgram
+            self.live_connection.send(audio_chunk)
+            self.total_chunks += 1
+            self.total_audio_bytes += len(audio_chunk)
+            # Log progress
+            if self.total_chunks % 50 == 0:
+                log_debug(f"📊 Listening... {self.total_chunks} chunks, {self.total_audio_bytes/1024:.1f}KB")
+            # Check for final results
             while True:
                 try:
                     result = self.responses_queue.get_nowait()
                     if result.is_final:
                         yield result
                 except queue.Empty:
                     break
         except Exception as e:
+            log_error(f"❌ Error streaming audio", error=str(e))
             self.is_streaming = False
             raise
     async def stop_streaming(self) -> Optional[TranscriptionResult]:
+        """Stop streaming and clean up"""
+        if not self.is_streaming:
             log_debug("Already stopped, nothing to do")
             return None
         try:
+            log_info(f"🛑 Stopping Deepgram STT session #{self.session_id}")
             self.is_streaming = False
+            # Finish the stream to get final results
+            if self.live_connection:
                 try:
+                    # Finish the stream - this triggers final transcript
+                    self.live_connection.finish()
+                    # Wait a bit for final result
+                    await asyncio.sleep(0.5)
+                except Exception as e:
+                    log_warning(f"⚠️ Error finishing stream: {e}")
+            # Get final result from queue
             final_result = None
             while not self.responses_queue.empty():
                 try:
                 except queue.Empty:
                     break
+            # Clean up
+            self.live_connection = None
+            self.deepgram_client = None
             self.final_result_received = False
+            log_info(f"✅ Deepgram STT session #{self.session_id} stopped")
             return final_result
         except Exception as e:
             log_error(f"❌ Error during stop_streaming", error=str(e))
             self.is_streaming = False
+            self.live_connection = None
+            self.deepgram_client = None
             return None
     def _reset_session_data(self):
         """Reset session-specific data"""
         # Clear queue
     def get_supported_languages(self) -> List[str]:
         """Get list of supported language codes"""
         return [
+            "tr-TR",   # Turkish
             "en-US",   # English (US)
             "en-GB",   # English (UK)
+            "de-DE",   # German
+            "fr-FR",   # French
+            "es-ES",   # Spanish
+            "it-IT",   # Italian
+            "pt-BR",   # Portuguese (Brazil)
+            "ru-RU",   # Russian
+            "ja-JP",   # Japanese
+            "ko-KR",   # Korean
+            "zh-CN",   # Chinese (Simplified)
+            "ar-SA",   # Arabic
+            "nl-NL",   # Dutch
+            "sv-SE",   # Swedish
+            "pl-PL",   # Polish
+            "hi-IN",   # Hindi
+            "cs-CZ",   # Czech
+            "da-DK",   # Danish
+            "fi-FI",   # Finnish
+            "el-GR",   # Greek
+            "he-IL",   # Hebrew
+            "hu-HU",   # Hungarian
+            "id-ID",   # Indonesian
+            "ms-MY",   # Malay
+            "no-NO",   # Norwegian
+            "ro-RO",   # Romanian
+            "sk-SK",   # Slovak
+            "th-TH",   # Thai
+            "uk-UA",   # Ukrainian
+            "vi-VN",   # Vietnamese
         ]
     def get_provider_name(self) -> str: