Spaces:

UcsTurkey
/

flare

Building

App Files Files Community

ciyidogan committed on Jun 18

Commit

b0a4866

verified ·

1 Parent(s): a847f43

Create stt_google.py

Browse files

Files changed (1) hide show

stt_google.py +143 -0

stt_google.py ADDED Viewed

	@@ -0,0 +1,143 @@

+"""
+Google Cloud Speech-to-Text Implementation
+"""
+import os
+import asyncio
+from typing import AsyncIterator, Optional, List
+from google.cloud import speech_v1p1beta1 as speech
+from google.api_core import exceptions
+from utils import log
+from stt_interface import STTInterface, STTConfig, TranscriptionResult
+class GoogleCloudSTT(STTInterface):
+    """Google Cloud Speech-to-Text implementation.
+
+    Audio chunks passed to ``stream_audio`` are buffered in ``audio_queue``
+    and forwarded to Google through one ``streaming_recognize`` call at a
+    time; transcripts are returned as ``TranscriptionResult`` objects.
+    """
+
+    def __init__(self, credentials_path: str):
+        """Create the async Speech client.
+
+        Args:
+            credentials_path: Path to a Google credentials file. When the path
+                exists it is exported as ``GOOGLE_APPLICATION_CREDENTIALS``;
+                otherwise the client is created with whatever default
+                credentials the environment provides.
+        """
+        if credentials_path and os.path.exists(credentials_path):
+            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
+            log(f"✅ Google credentials set from: {credentials_path}")
+        else:
+            log("⚠️ Google credentials path not found, using default credentials")
+        self.client = speech.SpeechAsyncClient()
+        self.streaming_config = None
+        self.is_streaming = False
+        self.audio_queue = asyncio.Queue()
+        # Queue that an open recognize stream is currently draining, or None
+        # when no stream is open. Used to guarantee a single stream per queue.
+        self._active_queue = None
+
+    async def start_streaming(self, config: STTConfig) -> None:
+        """Prepare a new streaming session.
+
+        Only builds the recognition/streaming configuration and resets the
+        audio buffer; the recognize stream itself is opened by the first
+        ``stream_audio`` call.
+
+        Args:
+            config: Session settings (encoding, sample rate, language, ...).
+
+        Raises:
+            Exception: Anything raised while building the configuration is
+                logged and re-raised.
+        """
+        try:
+            recognition_config = speech.RecognitionConfig(
+                encoding=self._get_encoding(config.encoding),
+                sample_rate_hertz=config.sample_rate,
+                language_code=config.language,
+                enable_automatic_punctuation=config.enable_punctuation,
+                enable_word_time_offsets=config.enable_word_timestamps,
+                model=config.model,
+                use_enhanced=config.use_enhanced,
+                metadata=speech.RecognitionMetadata(
+                    interaction_type=speech.RecognitionMetadata.InteractionType.VOICE_SEARCH,
+                    recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PC,
+                    audio_topic="general"
+                )
+            )
+            streaming_config = speech.StreamingRecognizeConfig(
+                config=recognition_config,
+                interim_results=config.interim_results,
+                single_utterance=config.single_utterance
+            ) if hasattr(speech, "StreamingRecognizeConfig") else speech.StreamingRecognitionConfig(
+                config=recognition_config,
+                interim_results=config.interim_results,
+                single_utterance=config.single_utterance
+            )
+            # A stream left open by a previous session would otherwise wait
+            # forever on its (now abandoned) queue; the None sentinel ends it.
+            if self._active_queue is not None:
+                self._active_queue.put_nowait(None)
+            self.streaming_config = streaming_config
+            # Fresh buffer so stale audio from an earlier session is never
+            # sent with the new configuration.
+            self.audio_queue = asyncio.Queue()
+            self.is_streaming = True
+            log("✅ Google STT streaming session started")
+        except Exception as e:
+            log(f"❌ Failed to start Google STT streaming: {e}")
+            raise
+
+    async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
+        """Queue an audio chunk and yield transcription results.
+
+        The first call of a session opens the recognize stream and keeps
+        yielding results until that stream ends. Calls made while a stream is
+        already open only enqueue their chunk and yield nothing; their audio
+        is transcribed by the open stream.
+
+        Args:
+            audio_chunk: Raw audio bytes in the encoding given to
+                ``start_streaming``.
+
+        Yields:
+            One ``TranscriptionResult`` per recognition result that has at
+            least one alternative (the first alternative is used).
+
+        Raises:
+            Exception: Any error other than ``OutOfRange`` is logged and
+                re-raised. ``OutOfRange`` ends the session quietly.
+        """
+        if not self.is_streaming:
+            log("⚠️ STT streaming not started")
+            return
+        queue = self.audio_queue
+        await queue.put(audio_chunk)
+        if self._active_queue is queue:
+            # A stream is already draining this queue. Opening a second one
+            # would split the audio between two competing recognizers.
+            return
+        self._active_queue = queue
+        streaming_config = self.streaming_config
+
+        async def request_generator():
+            # The async client takes a single request iterator: the first
+            # request must carry the configuration, the rest carry audio.
+            yield speech.StreamingRecognizeRequest(streaming_config=streaming_config)
+            while True:
+                chunk = await queue.get()
+                if chunk is None:
+                    # Sentinel: everything queued before it has been sent,
+                    # so finishing here half-closes the stream cleanly.
+                    return
+                yield speech.StreamingRecognizeRequest(audio_content=chunk)
+
+        try:
+            responses = await self.client.streaming_recognize(
+                requests=request_generator()
+            )
+            async for response in responses:
+                for result in response.results:
+                    if result.alternatives:
+                        best = result.alternatives[0]
+                        yield TranscriptionResult(
+                            text=best.transcript,
+                            is_final=result.is_final,
+                            confidence=best.confidence,
+                            timestamp=asyncio.get_running_loop().time()
+                        )
+        except exceptions.OutOfRange:
+            log("⚠️ Google STT: Exceeded maximum audio duration")
+            self.is_streaming = False
+        except Exception as e:
+            log(f"❌ Google STT streaming error: {e}")
+            raise
+        finally:
+            # Retire this stream's queue. Collect audio that was never sent,
+            # then wake the request generator (if it is still waiting) with
+            # the sentinel so it cannot steal chunks meant for a later stream.
+            unsent = []
+            while not queue.empty():
+                item = queue.get_nowait()
+                if item is not None:
+                    unsent.append(item)
+            queue.put_nowait(None)
+            if self._active_queue is queue:
+                self._active_queue = None
+            if self.audio_queue is queue:
+                self.audio_queue = asyncio.Queue()
+                if self.is_streaming:
+                    # Session continues: keep unsent audio for the next stream.
+                    for item in unsent:
+                        self.audio_queue.put_nowait(item)
+
+    async def stop_streaming(self) -> Optional[TranscriptionResult]:
+        """Stop the streaming session.
+
+        If a recognize stream is open, audio that is already queued is still
+        sent before the stream is closed; the resulting transcripts are
+        delivered through the iterator returned by ``stream_audio``.
+
+        Returns:
+            Always ``None``.
+        """
+        self.is_streaming = False
+        log("🛑 Google STT streaming stopped")
+        if self._active_queue is self.audio_queue:
+            # Sentinel goes behind any pending audio, so the open stream
+            # flushes the remaining chunks and then terminates.
+            self.audio_queue.put_nowait(None)
+        else:
+            # No stream will ever read this queue; drop what is left so it
+            # cannot leak into a later session.
+            while not self.audio_queue.empty():
+                self.audio_queue.get_nowait()
+        return None
+
+    def supports_realtime(self) -> bool:
+        """Google Cloud Speech supports real-time streaming"""
+        return True
+
+    def get_supported_languages(self) -> List[str]:
+        """Return the language codes this implementation advertises."""
+        return [
+            "tr-TR",  # Turkish
+            "en-US",  # English (US)
+            "en-GB",  # English (UK)
+            "de-DE",  # German
+            "fr-FR",  # French
+            "es-ES",  # Spanish
+            "it-IT",  # Italian
+            "pt-BR",  # Portuguese (Brazil)
+            "ru-RU",  # Russian
+            "ja-JP",  # Japanese
+            "ko-KR",  # Korean
+            "zh-CN",  # Chinese (Simplified)
+        ]
+
+    def _get_encoding(self, encoding: str):
+        """Convert an encoding name to the Google Cloud Speech enum value.
+
+        Names that are not in the table fall back to ``WEBM_OPUS``.
+        """
+        encoding_map = {
+            "LINEAR16": speech.RecognitionConfig.AudioEncoding.LINEAR16,
+            "FLAC": speech.RecognitionConfig.AudioEncoding.FLAC,
+            "MULAW": speech.RecognitionConfig.AudioEncoding.MULAW,
+            "AMR": speech.RecognitionConfig.AudioEncoding.AMR,
+            "AMR_WB": speech.RecognitionConfig.AudioEncoding.AMR_WB,
+            "OGG_OPUS": speech.RecognitionConfig.AudioEncoding.OGG_OPUS,
+            "SPEEX_WITH_HEADER_BYTE": speech.RecognitionConfig.AudioEncoding.SPEEX_WITH_HEADER_BYTE,
+            "WEBM_OPUS": speech.RecognitionConfig.AudioEncoding.WEBM_OPUS,
+        }
+        return encoding_map.get(encoding, speech.RecognitionConfig.AudioEncoding.WEBM_OPUS)