Update stt/stt_google.py

stt/stt_google.py (+50 -90)
@@ -76,124 +76,84 @@ class GoogleSTT(STTInterface):
 
         log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")
 
-        # ✅ …
-        …
-        zero_count = total_samples - len(non_zero_samples)
-
-        if non_zero_samples:
-            avg_amplitude = sum(abs(s) for s in non_zero_samples) / len(non_zero_samples)
-            max_amplitude = max(abs(s) for s in non_zero_samples)
-        else:
-            avg_amplitude = 0
-            max_amplitude = 0
-
-        log_info(f"🔍 Audio stats: {total_samples} total samples, {zero_count} zeros ({zero_count/total_samples:.1%})")
-        log_info(f"🔍 Non-zero stats: avg={avg_amplitude:.1f}, max={max_amplitude}")
-
-        # 2. Section-based analysis (split into 10 sections)
-        section_size = total_samples // 10
-        log_info(f"🔍 Section analysis (each {section_size} samples):")
-
-        for i in range(10):
-            start_idx = i * section_size
-            end_idx = (i + 1) * section_size if i < 9 else total_samples
-            section = samples[start_idx:end_idx]
-
-            …
-
-        # 3. Find where speech first starts
-        speech_threshold = 500  # RMS threshold
-        speech_start_idx = -1
-
-        # Compute RMS over 100-sample windows
-        window_size = 100
-        for i in range(0, total_samples - window_size, window_size):
-            window = samples[i:i + window_size]
-            rms = (sum(s * s for s in window) / window_size) ** 0.5
-
-            if rms > speech_threshold:
-                speech_start_idx = i
-                break
-
-        …
-        else:
-            log_warning("⚠️ No speech detected above threshold in entire audio")
-
-        # 4. Check whether the audio is actually empty
-        if max_amplitude < 100:
-            log_warning(f"⚠️ Audio appears silent: max_amplitude={max_amplitude}")
-            return None
-
-        if zero_count / total_samples > 0.95:  # more than 95% zeros
-            log_warning(f"⚠️ Audio is mostly zeros: {zero_count/total_samples:.1%}")
-            return None
-
-        wav_audio = self._convert_to_wav(audio_data, 16000)
-
+        # ✅ Debug - analyze the audio data
+        if len(audio_data) > 100:
+            # Check the first and last 50 bytes
+            first_50 = audio_data[:50]
+            last_50 = audio_data[-50:]
+            log_debug(f"Audio first 50 bytes: {first_50.hex()}")
+            log_debug(f"Audio last 50 bytes: {last_50.hex()}")
+
+            # Average amplitude check
+            import struct
+            samples = struct.unpack(f'{len(audio_data)//2}h', audio_data)
+            avg_amplitude = sum(abs(s) for s in samples) / len(samples)
+            max_amplitude = max(abs(s) for s in samples)
+            log_debug(f"Audio stats: avg_amplitude={avg_amplitude:.1f}, max_amplitude={max_amplitude}")
+
+        # ✅ Convert to WAV format for better compatibility
+        wav_audio = self._convert_to_wav(audio_data, config.sample_rate)
+        log_info(f"🔧 WAV conversion: {len(audio_data)} PCM → {len(wav_audio)} WAV")
+
         # Configure recognition
+        language_code = self._map_language_code(config.language)
+
+        # ✅ Using WAV audio now
         recognition_config = RecognitionConfig(
             encoding=RecognitionConfig.AudioEncoding.LINEAR16,
             sample_rate_hertz=16000,
             language_code="tr-TR",
-            audio_channel_count=1,
+            audio_channel_count=1,  # The frontend sends mono audio
             enable_separate_recognition_per_channel=False,
-            enable_automatic_punctuation=True,
         )
+
+        log_debug(f"Recognition config: language={language_code}, sample_rate={config.sample_rate}, model={config.model}")
 
-
-        audio = RecognitionAudio(content=wav_audio)
+        # ✅ Create audio object with WAV data (not raw PCM)
+        audio = RecognitionAudio(content=wav_audio)  # use wav_audio, not audio_data
 
-        # ✅ …
-        audio = RecognitionAudio(content=…
-
         # Perform synchronous recognition
-        log_info(f"🔄 Sending {len(…
+        log_info(f"🔄 Sending {len(wav_audio)} bytes WAV to Google Cloud Speech API...")
         response = self.client.recognize(config=recognition_config, audio=audio)
 
-        # ✅ …
+        # ✅ Debug response
+        log_debug(f"API Response: {response}")
         log_info(f"🔍 Google response details:")
-        log_info(f"…
-        log_info(f"…
+        log_info(f"- Has results: {bool(response.results)}")
+        log_info(f"- Results count: {len(response.results)}")
 
         if hasattr(response, 'total_billed_time'):
-            …
+            log_info(f"- Billed time: {response.total_billed_time.total_seconds()}s")
+        else:
+            log_info(f"- Billed time: 0s (no audio processed)")
+
         # Process results
-        if response.results…
-            for i, result in enumerate(response.results):
-                log_info(f"  - Result {i}: {len(result.alternatives)} alternatives")
-                if result.alternatives:
-                    for j, alt in enumerate(result.alternatives):
-                        log_info(f"    - Alt {j}: '{alt.transcript}' (conf: {alt.confidence:.3f})")
-
+        if response.results:
             result = response.results[0]
-            if result.alternatives…
+            if result.alternatives:
                 alternative = result.alternatives[0]
 
+                # Extract word timestamps if available
+                word_timestamps = None
+                if config.enable_word_timestamps and hasattr(alternative, 'words'):
+                    word_timestamps = [
+                        {
+                            "word": word_info.word,
+                            "start_time": word_info.start_time.total_seconds(),
+                            "end_time": word_info.end_time.total_seconds()
+                        }
+                        for word_info in alternative.words
+                    ]
+
                 transcription = TranscriptionResult(
                     text=alternative.transcript,
                     confidence=alternative.confidence,
                     timestamp=datetime.now().timestamp(),
-                    language=…
-                    word_timestamps=…
+                    language=language_code,
+                    word_timestamps=word_timestamps
                 )
 
-                log_info(f"✅ Transcription…
+                log_info(f"✅ Transcription: '{alternative.transcript}' (confidence: {alternative.confidence:.2f})")
                 return transcription
 
         log_warning("⚠️ No transcription results - Google couldn't recognize speech")
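The new path wraps the raw PCM in a WAV (RIFF) container before calling the API. The `_convert_to_wav` helper itself sits outside this hunk; a minimal sketch of what such a helper can look like, assuming 16-bit mono little-endian PCM input (an assumption, not the repository's actual implementation):

import io
import wave

def _convert_to_wav(self, pcm_data: bytes, sample_rate: int = 16000) -> bytes:
    """Wrap raw 16-bit mono little-endian PCM in a WAV container.

    Sketch of the helper the diff calls as
    self._convert_to_wav(audio_data, config.sample_rate);
    the real implementation in stt/stt_google.py may differ.
    """
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wav_file:
        wav_file.setnchannels(1)        # mono, matching audio_channel_count=1
        wav_file.setsampwidth(2)        # 2 bytes per sample = LINEAR16
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(pcm_data)  # payload; the RIFF header is generated
    return buffer.getvalue()

Since RecognitionConfig already declares LINEAR16 at 16 kHz, the WAV header mainly makes the audio self-describing, so a mismatched rate or channel count surfaces as an API error rather than silently empty results.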
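One caveat in the amplitude debug block: struct.unpack(f'{len(audio_data)//2}h', audio_data) uses the platform's native byte order and raises struct.error when the buffer holds an odd number of bytes. A hypothetical, slightly more defensive variant (not part of this commit) pins little-endian PCM16 and drops a trailing odd byte:

import struct

def pcm16_amplitude_stats(audio_data: bytes):
    """Average and peak absolute amplitude of little-endian 16-bit PCM.

    Hypothetical helper for illustration; not taken from the repository.
    """
    usable = len(audio_data) - (len(audio_data) % 2)  # whole samples only
    count = usable // 2
    if count == 0:
        return 0.0, 0
    samples = struct.unpack(f"<{count}h", audio_data[:usable])  # "<" = little-endian
    avg = sum(abs(s) for s in samples) / count
    peak = max(abs(s) for s in samples)
    return avg, peak

The diff's own version is guarded by len(audio_data) > 100, so the empty-buffer case cannot occur there, but an odd byte count (e.g. a truncated network chunk) would still raise.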