ciyidogan committed on
Commit
308dbba
·
verified ·
1 Parent(s): 98d7635

Update stt/stt_google.py

Browse files
Files changed (1) hide show
  1. stt/stt_google.py +50 -90
stt/stt_google.py CHANGED
@@ -76,124 +76,84 @@ class GoogleSTT(STTInterface):
76
 
77
  log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")
78
 
79
- # ✅ Detaylı audio analizi - logda
80
- import struct
81
- samples = struct.unpack(f'{len(audio_data)//2}h', audio_data)
82
- total_samples = len(samples)
83
-
84
- # 1. Genel istatistikler
85
- non_zero_samples = [s for s in samples if s != 0]
86
- zero_count = total_samples - len(non_zero_samples)
87
-
88
- if non_zero_samples:
89
- avg_amplitude = sum(abs(s) for s in non_zero_samples) / len(non_zero_samples)
90
- max_amplitude = max(abs(s) for s in non_zero_samples)
91
- else:
92
- avg_amplitude = 0
93
- max_amplitude = 0
94
-
95
- log_info(f"🔍 Audio stats: {total_samples} total samples, {zero_count} zeros ({zero_count/total_samples:.1%})")
96
- log_info(f"🔍 Non-zero stats: avg={avg_amplitude:.1f}, max={max_amplitude}")
97
-
98
- # 2. Bölüm bazlı analiz (10 bölüme ayır)
99
- section_size = total_samples // 10
100
- log_info(f"🔍 Section analysis (each {section_size} samples):")
101
-
102
- for i in range(10):
103
- start_idx = i * section_size
104
- end_idx = (i + 1) * section_size if i < 9 else total_samples
105
- section = samples[start_idx:end_idx]
106
 
107
- section_non_zero = [s for s in section if s != 0]
108
- section_max = max(abs(s) for s in section_non_zero) if section_non_zero else 0
109
- section_avg = sum(abs(s) for s in section_non_zero) / len(section_non_zero) if section_non_zero else 0
110
- zero_ratio = (len(section) - len(section_non_zero)) / len(section)
111
-
112
- log_info(f" Section {i+1}: max={section_max}, avg={section_avg:.1f}, zeros={zero_ratio:.1%}")
113
-
114
- # 3. İlk konuşma başlangıcını bul
115
- speech_threshold = 500 # RMS eşiği
116
- speech_start_idx = -1
117
-
118
- # 100 sample'lık pencerelerle RMS hesapla
119
- window_size = 100
120
- for i in range(0, total_samples - window_size, window_size):
121
- window = samples[i:i + window_size]
122
- rms = (sum(s * s for s in window) / window_size) ** 0.5
123
-
124
- if rms > speech_threshold:
125
- speech_start_idx = i
126
- break
127
 
128
- if speech_start_idx >= 0:
129
- speech_start_time = speech_start_idx / config.sample_rate
130
- log_info(f"🎤 Speech detected starting at sample {speech_start_idx} ({speech_start_time:.2f}s)")
131
- else:
132
- log_warning("⚠️ No speech detected above threshold in entire audio")
133
 
134
- # 4. Audio'nun gerçekten boş olup olmadığını kontrol et
135
- if max_amplitude < 100:
136
- log_warning(f"⚠️ Audio appears silent: max_amplitude={max_amplitude}")
137
- return None
138
-
139
- if zero_count / total_samples > 0.95: # %95'den fazla sıfır
140
- log_warning(f"⚠️ Audio is mostly zeros: {zero_count/total_samples:.1%}")
141
- return None
142
-
143
- wav_audio = self._convert_to_wav(audio_data, 16000)
144
-
145
  # Configure recognition
 
 
 
146
  recognition_config = RecognitionConfig(
147
  encoding=RecognitionConfig.AudioEncoding.LINEAR16,
148
  sample_rate_hertz=16000,
149
  language_code="tr-TR",
150
- audio_channel_count=1,
151
  enable_separate_recognition_per_channel=False,
152
- enable_automatic_punctuation=True,
153
  )
154
-
155
- # WAV audio gönder
156
- audio = RecognitionAudio(content=wav_audio)
157
 
158
- # ✅ RAW audio gönder, WAV conversion yapmadan
159
- audio = RecognitionAudio(content=audio_data) # Direkt raw PCM
160
 
161
  # Perform synchronous recognition
162
- log_info(f"🔄 Sending {len(audio_data)} bytes RAW PCM to Google Cloud Speech API...")
163
  response = self.client.recognize(config=recognition_config, audio=audio)
164
 
165
- # ✅ Detaylı response analizi
 
166
  log_info(f"🔍 Google response details:")
167
- log_info(f" - Has results: {bool(response.results)}")
168
- log_info(f" - Results count: {len(response.results) if response.results else 0}")
169
 
170
  if hasattr(response, 'total_billed_time'):
171
- if response.total_billed_time and response.total_billed_time.total_seconds() > 0:
172
- log_info(f" - Billed time: {response.total_billed_time.total_seconds()}s")
173
- else:
174
- log_info(f" - Billed time: 0s (no audio processed)")
175
-
176
  # Process results
177
- if response.results and len(response.results) > 0:
178
- for i, result in enumerate(response.results):
179
- log_info(f" - Result {i}: {len(result.alternatives)} alternatives")
180
- if result.alternatives:
181
- for j, alt in enumerate(result.alternatives):
182
- log_info(f" - Alt {j}: '{alt.transcript}' (conf: {alt.confidence:.3f})")
183
-
184
  result = response.results[0]
185
- if result.alternatives and len(result.alternatives) > 0:
186
  alternative = result.alternatives[0]
187
 
 
 
 
 
 
 
 
 
 
 
 
 
188
  transcription = TranscriptionResult(
189
  text=alternative.transcript,
190
  confidence=alternative.confidence,
191
  timestamp=datetime.now().timestamp(),
192
- language="tr-TR",
193
- word_timestamps=None
194
  )
195
 
196
- log_info(f"✅ Transcription SUCCESS: '{alternative.transcript}' (confidence: {alternative.confidence:.2f})")
197
  return transcription
198
 
199
  log_warning("⚠️ No transcription results - Google couldn't recognize speech")
 
76
 
77
  log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")
78
 
79
+ # ✅ Debug - audio verisi analizi
80
+ if len(audio_data) > 100:
81
+ # İlk ve son 50 byte'ı kontrol et
82
+ first_50 = audio_data[:50]
83
+ last_50 = audio_data[-50:]
84
+ log_debug(f"Audio first 50 bytes: {first_50.hex()}")
85
+ log_debug(f"Audio last 50 bytes: {last_50.hex()}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
+ # Ortalama amplitude kontrolü
88
+ import struct
89
+ samples = struct.unpack(f'{len(audio_data)//2}h', audio_data)
90
+ avg_amplitude = sum(abs(s) for s in samples) / len(samples)
91
+ max_amplitude = max(abs(s) for s in samples)
92
+ log_debug(f"Audio stats: avg_amplitude={avg_amplitude:.1f}, max_amplitude={max_amplitude}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
+ # Convert to WAV format for better compatibility
95
+ wav_audio = self._convert_to_wav(audio_data, config.sample_rate)
96
+ log_info(f"🔧 WAV conversion: {len(audio_data)} PCM {len(wav_audio)} WAV")
 
 
97
 
 
 
 
 
 
 
 
 
 
 
 
98
  # Configure recognition
99
+ language_code = self._map_language_code(config.language)
100
+
101
+ # ✅ WAV audio kullanıyoruz artık
102
  recognition_config = RecognitionConfig(
103
  encoding=RecognitionConfig.AudioEncoding.LINEAR16,
104
  sample_rate_hertz=16000,
105
  language_code="tr-TR",
106
+ audio_channel_count=1, # Frontend mono audio gönderiyor
107
  enable_separate_recognition_per_channel=False,
 
108
  )
109
+
110
+ log_debug(f"Recognition config: language={language_code}, sample_rate={config.sample_rate}, model={config.model}")
 
111
 
112
+ # ✅ Create audio object with WAV data (not raw PCM)
113
+ audio = RecognitionAudio(content=wav_audio) # wav_audio kullan, audio_data değil
114
 
115
  # Perform synchronous recognition
116
+ log_info(f"🔄 Sending {len(wav_audio)} bytes WAV to Google Cloud Speech API...")
117
  response = self.client.recognize(config=recognition_config, audio=audio)
118
 
119
+ # ✅ Debug response
120
+ log_debug(f"API Response: {response}")
121
  log_info(f"🔍 Google response details:")
122
+ log_info(f"- Has results: {bool(response.results)}")
123
+ log_info(f"- Results count: {len(response.results)}")
124
 
125
  if hasattr(response, 'total_billed_time'):
126
+ log_info(f"- Billed time: {response.total_billed_time.total_seconds()}s")
127
+ else:
128
+ log_info(f"- Billed time: 0s (no audio processed)")
129
+
 
130
  # Process results
131
+ if response.results:
 
 
 
 
 
 
132
  result = response.results[0]
133
+ if result.alternatives:
134
  alternative = result.alternatives[0]
135
 
136
+ # Extract word timestamps if available
137
+ word_timestamps = None
138
+ if config.enable_word_timestamps and hasattr(alternative, 'words'):
139
+ word_timestamps = [
140
+ {
141
+ "word": word_info.word,
142
+ "start_time": word_info.start_time.total_seconds(),
143
+ "end_time": word_info.end_time.total_seconds()
144
+ }
145
+ for word_info in alternative.words
146
+ ]
147
+
148
  transcription = TranscriptionResult(
149
  text=alternative.transcript,
150
  confidence=alternative.confidence,
151
  timestamp=datetime.now().timestamp(),
152
+ language=language_code,
153
+ word_timestamps=word_timestamps
154
  )
155
 
156
+ log_info(f"✅ Transcription: '{alternative.transcript}' (confidence: {alternative.confidence:.2f})")
157
  return transcription
158
 
159
  log_warning("⚠️ No transcription results - Google couldn't recognize speech")