File size: 6,416 Bytes
78b5a88
a532986
78b5a88
a532986
78b5a88
a532986
 
78b5a88
a532986
78b5a88
 
 
a532986
78b5a88
3586948
 
 
 
 
 
a532986
3586948
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a532986
 
3586948
a532986
 
 
 
3586948
a532986
3586948
a532986
 
3586948
a532986
3586948
 
9c60cb5
 
a532986
 
 
 
 
 
9c60cb5
 
a532986
 
3586948
a532986
 
 
3586948
a532986
 
 
 
3586948
78b5a88
a532986
 
 
 
 
 
 
 
 
 
 
1e56973
a532986
 
 
 
 
 
 
1e56973
a532986
 
1e56973
a532986
3586948
 
78b5a88
a532986
 
 
3586948
78b5a88
a532986
 
 
 
78b5a88
a532986
 
 
 
 
 
78b5a88
a532986
 
 
78b5a88
3586948
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
"""
Google Cloud Speech-to-Text Implementation - Simple Batch Mode
"""
from typing import Optional, List
from datetime import datetime
import io
import wave
from google.cloud import speech
from google.cloud.speech import RecognitionConfig, RecognitionAudio
from utils.logger import log_info, log_error, log_debug, log_warning
from .stt_interface import STTInterface, STTConfig, TranscriptionResult


class GoogleSTT(STTInterface):
    def __init__(self, credentials_path: Optional[str] = None):
        """
        Create the Google Cloud Speech client used by this provider.

        Args:
            credentials_path: Path to a service account JSON file. When it is
                omitted (or empty) the client is built with default
                credentials instead.

        Raises:
            Exception: Whatever error the client construction raised; it is
                logged first and then re-raised unchanged.
        """
        try:
            if credentials_path:
                # Explicit service account key file
                self.client = speech.SpeechClient.from_service_account_file(credentials_path)
                log_info(f"✅ Google STT initialized with service account: {credentials_path}")
            else:
                # Use default credentials (ADC)
                self.client = speech.SpeechClient()
                log_info("✅ Google STT initialized with default credentials")

        except Exception as e:
            # Log for diagnosis, but let the caller decide how to react
            log_error(f"❌ Failed to initialize Google STT: {str(e)}")
            raise
    
    def _map_language_code(self, language: str) -> str:
        """Map language codes to Google format"""
        # Google uses BCP-47 language codes
        language_map = {
            "tr-TR": "tr-TR",
            "en-US": "en-US", 
            "en-GB": "en-GB",
            "de-DE": "de-DE",
            "fr-FR": "fr-FR",
            "es-ES": "es-ES",
            "it-IT": "it-IT",
            "pt-BR": "pt-BR",
            "ru-RU": "ru-RU",
            "ja-JP": "ja-JP",
            "ko-KR": "ko-KR",
            "zh-CN": "zh-CN",
            "ar-SA": "ar-SA",
        }
        return language_map.get(language, language)
    
    async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
        """
        Transcribe a complete audio buffer using the Google Cloud Speech API.

        The audio is wrapped in a WAV container and sent in one ``recognize``
        request. The request runs in the event loop's default executor so the
        blocking network call does not stall other coroutines. The top
        alternative of every returned result is merged into one transcription.

        Args:
            audio_data: Raw audio bytes; they are written as mono 16-bit
                frames by ``_convert_to_wav``.
            config: Recognition settings. The fields read here are
                ``sample_rate``, ``language``, ``enable_punctuation``,
                ``model``, ``use_enhanced`` and ``enable_word_timestamps``.

        Returns:
            A ``TranscriptionResult`` holding the merged text, the mean
            confidence of the merged alternatives, the current timestamp, the
            language code and (if requested) word timings. ``None`` is
            returned when there is no audio, no usable result, or an error
            occurred; errors are logged and never raised.
        """
        # Function-scope imports keep this method self-contained
        import asyncio
        import functools
        import traceback

        try:
            # Check if we have audio to transcribe
            if not audio_data:
                log_warning("⚠️ No audio data provided")
                return None

            log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")

            # Convert to WAV format for better compatibility
            wav_audio = self._convert_to_wav(audio_data, config.sample_rate)

            # Configure recognition
            language_code = self._map_language_code(config.language)

            recognition_config = RecognitionConfig(
                encoding=RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz=config.sample_rate,
                language_code=language_code,
                enable_automatic_punctuation=config.enable_punctuation,
                model=config.model,
                use_enhanced=config.use_enhanced,
                enable_word_time_offsets=config.enable_word_timestamps,
            )

            # Create audio object
            audio = RecognitionAudio(content=wav_audio)

            # recognize() blocks until the API answers, so run it off the
            # event loop thread and await the outcome.
            log_info("🔄 Sending audio to Google Cloud Speech API...")
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(
                None,
                functools.partial(
                    self.client.recognize,
                    config=recognition_config,
                    audio=audio,
                ),
            )

            # Each result covers one sequential portion of the audio; keep the
            # best (first) alternative of every portion, not only the first.
            top_alternatives = [
                result.alternatives[0]
                for result in response.results
                if result.alternatives
            ]
            if not top_alternatives:
                log_warning("⚠️ No transcription results")
                return None

            # Portions are concatenated exactly as returned, so a single
            # result yields the same text as before.
            text = "".join(alternative.transcript for alternative in top_alternatives)
            confidence = sum(
                alternative.confidence for alternative in top_alternatives
            ) / len(top_alternatives)

            # Extract word timestamps if available
            word_timestamps = None
            if config.enable_word_timestamps:
                timed_alternatives = [
                    alternative
                    for alternative in top_alternatives
                    if hasattr(alternative, 'words')
                ]
                if timed_alternatives:
                    word_timestamps = []
                    for alternative in timed_alternatives:
                        word_timestamps.extend(
                            {
                                "word": word_info.word,
                                "start_time": word_info.start_time.total_seconds(),
                                "end_time": word_info.end_time.total_seconds(),
                            }
                            for word_info in alternative.words
                        )

            transcription = TranscriptionResult(
                text=text,
                confidence=confidence,
                timestamp=datetime.now().timestamp(),
                language=language_code,
                word_timestamps=word_timestamps,
            )

            log_info(f"✅ Transcription: '{text}' (confidence: {confidence:.2f})")
            return transcription

        except Exception as e:
            # Best-effort API: report the failure and signal "no result"
            log_error(f"❌ Error during transcription: {str(e)}")
            log_error(f"Traceback: {traceback.format_exc()}")
            return None
    
    def _convert_to_wav(self, audio_data: bytes, sample_rate: int) -> bytes:
        """Convert raw PCM audio to WAV format"""
        # Create WAV file in memory
        wav_buffer = io.BytesIO()
        
        with wave.open(wav_buffer, 'wb') as wav_file:
            # Set WAV parameters
            wav_file.setnchannels(1)  # Mono
            wav_file.setsampwidth(2)  # 16-bit
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(audio_data)
        
        # Get WAV data
        wav_buffer.seek(0)
        return wav_buffer.read()
    
    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes"""
        # Google Cloud Speech-to-Text supported languages (partial list)
        return [
            "tr-TR", "en-US", "en-GB", "en-AU", "en-CA", "en-IN",
            "es-ES", "es-MX", "es-AR", "fr-FR", "fr-CA", "de-DE",
            "it-IT", "pt-BR", "pt-PT", "ru-RU", "ja-JP", "ko-KR",
            "zh-CN", "zh-TW", "ar-SA", "ar-EG", "hi-IN", "nl-NL",
            "pl-PL", "sv-SE", "da-DK", "no-NO", "fi-FI", "el-GR",
            "he-IL", "th-TH", "vi-VN", "id-ID", "ms-MY", "fil-PH"
        ]
    
    def get_provider_name(self) -> str:
        """Get provider name"""
        return "google"