""" Google Cloud Speech-to-Text Implementation - Simple Batch Mode """ from typing import Optional, List from datetime import datetime import io import wave from google.cloud import speech from google.cloud.speech import RecognitionConfig, RecognitionAudio from utils.logger import log_info, log_error, log_debug, log_warning from .stt_interface import STTInterface, STTConfig, TranscriptionResult class GoogleSTT(STTInterface): def __init__(self, credentials_path: Optional[str] = None): """ Initialize Google STT Args: credentials_path: Path to service account JSON file (optional if using default credentials) """ try: # Initialize client if credentials_path: self.client = speech.SpeechClient.from_service_account_file(credentials_path) log_info(f"✅ Google STT initialized with service account: {credentials_path}") else: # Use default credentials (ADC) self.client = speech.SpeechClient() log_info("✅ Google STT initialized with default credentials") except Exception as e: log_error(f"❌ Failed to initialize Google STT: {str(e)}") raise def _map_language_code(self, language: str) -> str: """Map language codes to Google format""" # Google uses BCP-47 language codes language_map = { "tr-TR": "tr-TR", "en-US": "en-US", "en-GB": "en-GB", "de-DE": "de-DE", "fr-FR": "fr-FR", "es-ES": "es-ES", "it-IT": "it-IT", "pt-BR": "pt-BR", "ru-RU": "ru-RU", "ja-JP": "ja-JP", "ko-KR": "ko-KR", "zh-CN": "zh-CN", "ar-SA": "ar-SA", } return language_map.get(language, language) async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]: """Transcribe audio data using Google Cloud Speech API""" try: # Check if we have audio to transcribe if not audio_data: log_warning("⚠️ No audio data provided") return None log_info(f"📊 Transcribing {len(audio_data)} bytes of audio") # Convert to WAV format for better compatibility wav_audio = self._convert_to_wav(audio_data, config.sample_rate) # Configure recognition language_code = self._map_language_code(config.language) recognition_config = RecognitionConfig( encoding=RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=config.sample_rate, language_code=language_code, enable_automatic_punctuation=config.enable_punctuation, model=config.model, use_enhanced=config.use_enhanced, enable_word_time_offsets=config.enable_word_timestamps, ) # Create audio object audio = RecognitionAudio(content=wav_audio) # Perform synchronous recognition log_info(f"🔄 Sending audio to Google Cloud Speech API...") response = self.client.recognize(config=recognition_config, audio=audio) # Process results if response.results: result = response.results[0] if result.alternatives: alternative = result.alternatives[0] # Extract word timestamps if available word_timestamps = None if config.enable_word_timestamps and hasattr(alternative, 'words'): word_timestamps = [ { "word": word_info.word, "start_time": word_info.start_time.total_seconds(), "end_time": word_info.end_time.total_seconds() } for word_info in alternative.words ] transcription = TranscriptionResult( text=alternative.transcript, confidence=alternative.confidence, timestamp=datetime.now().timestamp(), language=language_code, word_timestamps=word_timestamps ) log_info(f"✅ Transcription: '{alternative.transcript}' (confidence: {alternative.confidence:.2f})") return transcription log_warning("⚠️ No transcription results") return None except Exception as e: log_error(f"❌ Error during transcription: {str(e)}") import traceback log_error(f"Traceback: {traceback.format_exc()}") return None def _convert_to_wav(self, audio_data: bytes, sample_rate: int) -> bytes: """Convert raw PCM audio to WAV format""" # Create WAV file in memory wav_buffer = io.BytesIO() with wave.open(wav_buffer, 'wb') as wav_file: # Set WAV parameters wav_file.setnchannels(1) # Mono wav_file.setsampwidth(2) # 16-bit wav_file.setframerate(sample_rate) wav_file.writeframes(audio_data) # Get WAV data wav_buffer.seek(0) return wav_buffer.read() def get_supported_languages(self) -> List[str]: """Get list of supported language codes""" # Google Cloud Speech-to-Text supported languages (partial list) return [ "tr-TR", "en-US", "en-GB", "en-AU", "en-CA", "en-IN", "es-ES", "es-MX", "es-AR", "fr-FR", "fr-CA", "de-DE", "it-IT", "pt-BR", "pt-PT", "ru-RU", "ja-JP", "ko-KR", "zh-CN", "zh-TW", "ar-SA", "ar-EG", "hi-IN", "nl-NL", "pl-PL", "sv-SE", "da-DK", "no-NO", "fi-FI", "el-GR", "he-IL", "th-TH", "vi-VN", "id-ID", "ms-MY", "fil-PH" ] def get_provider_name(self) -> str: """Get provider name""" return "google"