"""
Google Cloud Speech-to-Text Implementation - Simple Batch Mode
"""
from typing import Optional, List
from datetime import datetime
import io
import struct
import wave
from google.cloud import speech
from google.cloud.speech import RecognitionConfig, RecognitionAudio
from utils.logger import log_info, log_error, log_debug, log_warning
from .stt_interface import STTInterface, STTConfig, TranscriptionResult


class GoogleSTT(STTInterface):
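    """
    Google Cloud Speech-to-Text provider using synchronous (batch) recognition.

    Raw 16-bit mono PCM audio is analyzed for diagnostics, wrapped in a WAV
    container, and sent to the Speech API. Recognition is currently configured
    for Turkish ("tr-TR") with automatic punctuation enabled.
    """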
    def __init__(self, credentials_path: Optional[str] = None):
        """
        Initialize Google STT
        Args:
            credentials_path: Path to service account JSON file (optional if using default credentials)
        """
        try:
            # Initialize client
            if credentials_path:
                self.client = speech.SpeechClient.from_service_account_file(credentials_path)
                log_info(f"✅ Google STT initialized with service account: {credentials_path}")
            else:
                # Use default credentials (ADC)
                self.client = speech.SpeechClient()
                log_info("✅ Google STT initialized with default credentials")
            
        except Exception as e:
            log_error(f"❌ Failed to initialize Google STT: {str(e)}")
            raise
    
    def _map_language_code(self, language: str) -> str:
        """Map language codes to Google format"""
        # Google uses BCP-47 language codes
        language_map = {
            "tr": "tr-TR",
            "tr-TR": "tr-TR",
            "en": "en-US",
            "en-US": "en-US", 
            "en-GB": "en-GB",
            "de": "de-DE",
            "de-DE": "de-DE",
            "fr": "fr-FR",
            "fr-FR": "fr-FR",
            "es": "es-ES",
            "es-ES": "es-ES",
            "it": "it-IT",
            "it-IT": "it-IT",
            "pt": "pt-BR",
            "pt-BR": "pt-BR",
            "ru": "ru-RU",
            "ru-RU": "ru-RU",
            "ja": "ja-JP",
            "ja-JP": "ja-JP",
            "ko": "ko-KR",
            "ko-KR": "ko-KR",
            "zh": "zh-CN",
            "zh-CN": "zh-CN",
            "ar": "ar-SA",
            "ar-SA": "ar-SA",
        }
        
        # Default to the language itself if not in map
        return language_map.get(language, language)
    
    async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
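        """
        Transcribe a chunk of raw 16-bit mono PCM audio.

        The audio is first analyzed (overall amplitude statistics, per-section
        stats, and a simple RMS-based speech-start check) for diagnostics,
        then converted to WAV and sent to Google Cloud Speech for synchronous
        recognition. Returns a TranscriptionResult on success, None otherwise.
        """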
        try:
            if not audio_data:
                log_warning("⚠️ No audio data provided")
                return None
            
            log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")
            
            # ✅ Detailed audio analysis for diagnostics (logged)
            # Interpret the raw bytes as 16-bit PCM samples; drop a trailing
            # odd byte so struct.unpack cannot fail on malformed input.
            sample_count = len(audio_data) // 2
            samples = struct.unpack(f'{sample_count}h', audio_data[:sample_count * 2])
            total_samples = len(samples)
            
            # 1. Overall statistics
            non_zero_samples = [s for s in samples if s != 0]
            zero_count = total_samples - len(non_zero_samples)
            
            if non_zero_samples:
                avg_amplitude = sum(abs(s) for s in non_zero_samples) / len(non_zero_samples)
                max_amplitude = max(abs(s) for s in non_zero_samples)
            else:
                avg_amplitude = 0
                max_amplitude = 0
            
            log_info(f"🔍 Audio stats: {total_samples} total samples, {zero_count} zeros ({zero_count/total_samples:.1%})")
            log_info(f"🔍 Non-zero stats: avg={avg_amplitude:.1f}, max={max_amplitude}")
            
            # 2. Section-based analysis (split into 10 sections)
            section_size = total_samples // 10
            log_info(f"🔍 Section analysis (each {section_size} samples):")
            
            for i in range(10):
                start_idx = i * section_size
                end_idx = (i + 1) * section_size if i < 9 else total_samples
                section = samples[start_idx:end_idx]
                
                section_non_zero = [s for s in section if s != 0]
                section_max = max(abs(s) for s in section_non_zero) if section_non_zero else 0
                section_avg = sum(abs(s) for s in section_non_zero) / len(section_non_zero) if section_non_zero else 0
                zero_ratio = (len(section) - len(section_non_zero)) / len(section)
                
                log_info(f"  Section {i+1}: max={section_max}, avg={section_avg:.1f}, zeros={zero_ratio:.1%}")
            
            # 3. Find where speech first starts
            speech_threshold = 500  # RMS threshold
            speech_start_idx = -1
            
            # Compute RMS over 100-sample windows
            window_size = 100
            for i in range(0, total_samples - window_size, window_size):
                window = samples[i:i + window_size]
                rms = (sum(s * s for s in window) / window_size) ** 0.5
                
                if rms > speech_threshold:
                    speech_start_idx = i
                    break
            
            if speech_start_idx >= 0:
                speech_start_time = speech_start_idx / config.sample_rate
                log_info(f"🎤 Speech detected starting at sample {speech_start_idx} ({speech_start_time:.2f}s)")
            else:
                log_warning("⚠️ No speech detected above threshold in entire audio")
            
            # 4. Check whether the audio is effectively silent
            if max_amplitude < 100:
                log_warning(f"⚠️ Audio appears silent: max_amplitude={max_amplitude}")
                return None
            
            if zero_count / total_samples > 0.95:  # more than 95% zeros
                log_warning(f"⚠️ Audio is mostly zeros: {zero_count/total_samples:.1%}")
                return None
            
            # Convert to WAV format
            wav_audio = self._convert_to_wav(audio_data, config.sample_rate)
            
            # Configure recognition (language is currently fixed to Turkish)
            recognition_config = RecognitionConfig(
                encoding=RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz=config.sample_rate,  # must match the WAV header written above
                language_code="tr-TR",
                audio_channel_count=1,
                enable_separate_recognition_per_channel=False,
                enable_automatic_punctuation=True,
            )
            
            # Create audio object
            audio = RecognitionAudio(content=wav_audio)
            
            # Perform synchronous recognition
            log_info(f"🔄 Sending audio to Google Cloud Speech API...")
            response = self.client.recognize(config=recognition_config, audio=audio)
    
            # ✅ Detailed response analysis
            log_info(f"🔍 Google response details:")
            log_info(f"  - Has results: {bool(response.results)}")
            log_info(f"  - Results count: {len(response.results) if response.results else 0}")
            
            if hasattr(response, 'total_billed_time'):
                if response.total_billed_time and response.total_billed_time.total_seconds() > 0:
                    log_info(f"  - Billed time: {response.total_billed_time.total_seconds()}s")
                else:
                    log_info(f"  - Billed time: 0s (no audio processed)")
            
            # Process results
            if response.results and len(response.results) > 0:
                for i, result in enumerate(response.results):
                    log_info(f"  - Result {i}: {len(result.alternatives)} alternatives")
                    if result.alternatives:
                        for j, alt in enumerate(result.alternatives):
                            log_info(f"    - Alt {j}: '{alt.transcript}' (conf: {alt.confidence:.3f})")
                
                result = response.results[0]
                if result.alternatives and len(result.alternatives) > 0:
                    alternative = result.alternatives[0]
                    
                    transcription = TranscriptionResult(
                        text=alternative.transcript,
                        confidence=alternative.confidence,
                        timestamp=datetime.now().timestamp(),
                        language="tr-TR",
                        word_timestamps=None
                    )
                    
                    log_info(f"✅ Transcription SUCCESS: '{alternative.transcript}' (confidence: {alternative.confidence:.2f})")
                    return transcription
            
            log_warning("⚠️ No transcription results - Google couldn't recognize speech")
            return None
            
        except Exception as e:
            log_error(f"❌ Error during transcription: {str(e)}")
            import traceback
            log_error(f"Traceback: {traceback.format_exc()}")
            return None
    
    def _convert_to_wav(self, audio_data: bytes, sample_rate: int) -> bytes:
        """Convert raw PCM audio to WAV format"""
        # Create WAV file in memory
        wav_buffer = io.BytesIO()
        
        with wave.open(wav_buffer, 'wb') as wav_file:
            # Set WAV parameters
            wav_file.setnchannels(1)  # Mono
            wav_file.setsampwidth(2)  # 16-bit
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(audio_data)
        
        # Get WAV data
        wav_buffer.seek(0)
        return wav_buffer.read()
    
    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes"""
        # Google Cloud Speech-to-Text supported languages (partial list)
        return [
            "tr-TR", "en-US", "en-GB", "en-AU", "en-CA", "en-IN",
            "es-ES", "es-MX", "es-AR", "fr-FR", "fr-CA", "de-DE",
            "it-IT", "pt-BR", "pt-PT", "ru-RU", "ja-JP", "ko-KR",
            "zh-CN", "zh-TW", "ar-SA", "ar-EG", "hi-IN", "nl-NL",
            "pl-PL", "sv-SE", "da-DK", "no-NO", "fi-FI", "el-GR",
            "he-IL", "th-TH", "vi-VN", "id-ID", "ms-MY", "fil-PH"
        ]
    
    def get_provider_name(self) -> str:
        """Get provider name"""
        return "google"