Spaces:
Building
Building
Update stt/stt_google.py
Browse files- stt/stt_google.py +54 -0
stt/stt_google.py
CHANGED
@@ -125,6 +125,57 @@ class GoogleSTT(STTInterface):
|
|
125 |
|
126 |
except Exception as e:
|
127 |
log_error(f"❌ Error analyzing audio: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
|
129 |
async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
|
130 |
"""Transcribe audio data using Google Cloud Speech API"""
|
@@ -138,6 +189,9 @@ class GoogleSTT(STTInterface):
|
|
138 |
|
139 |
# ✅ Audio analizi
|
140 |
self._analyze_audio_content(audio_data)
|
|
|
|
|
|
|
141 |
|
142 |
# ✅ WAV formatında gönder - Google bu formatı daha iyi tanıyor
|
143 |
wav_audio = self._convert_to_wav_proper(audio_data, config.sample_rate)
|
|
|
125 |
|
126 |
except Exception as e:
|
127 |
log_error(f"❌ Error analyzing audio: {e}")
|
128 |
+
|
129 |
+
def _trim_silence(self, audio_data: bytes, *, sample_rate: int = 16000,
                  silence_threshold: int = 200) -> bytes:
    """Trim leading and trailing silence from raw 16-bit PCM audio.

    The buffer is interpreted as native-endian signed 16-bit samples
    (``struct`` format ``h``). A sample counts as non-silent when its
    absolute value exceeds ``silence_threshold``. The span between the
    first and last non-silent sample is kept, widened by 125 ms of
    padding on each side and clamped to the buffer bounds.

    Args:
        audio_data: Raw sample bytes. A trailing stray byte (odd length)
            is ignored for analysis and dropped from the trimmed output.
        sample_rate: Samples per second; used to size the padding and to
            report the trimmed duration. Defaults to 16000.
        silence_threshold: Absolute amplitude above which a sample is
            treated as audio content. Defaults to 200.

    Returns:
        The trimmed audio bytes. The input is returned unchanged when it
        is shorter than 100 bytes, when no sample exceeds the threshold,
        or when trimming fails for any reason (best effort).
    """
    try:
        # Too short to be worth analysing.
        if len(audio_data) < 100:
            return audio_data

        # unpack_from tolerates a trailing odd byte, unlike unpack, which
        # would raise and make the whole trim silently fall back.
        sample_count = len(audio_data) // 2
        samples = struct.unpack_from(f'{sample_count}h', audio_data)

        # First sample above the threshold; None means the buffer is silent.
        # A sentinel is required here: defaulting to index 0 would make an
        # all-silent buffer indistinguishable from one that starts loud.
        start_idx = next(
            (i for i, sample in enumerate(samples)
             if abs(sample) > silence_threshold),
            None,
        )
        if start_idx is None:
            log_warning("⚠️ No audio content above silence threshold")
            return audio_data

        # Last sample above the threshold. The scan stops at start_idx,
        # which is known to qualify, so next() always finds a value.
        end_idx = next(
            i for i in range(sample_count - 1, start_idx - 1, -1)
            if abs(samples[i]) > silence_threshold
        )

        # 125 ms of padding on each side (2000 samples at 16 kHz) so that
        # quiet speech onsets/offsets are not clipped.
        padding = sample_rate // 8
        start_idx = max(0, start_idx - padding)
        end_idx = min(sample_count - 1, end_idx + padding)

        trimmed_count = end_idx - start_idx + 1
        log_info(f"🔧 Silence trimming: {sample_count} → {trimmed_count} samples")
        log_info(f"🔧 Trimmed duration: {trimmed_count/sample_rate:.2f}s")

        # Slice the original buffer (2 bytes per sample) instead of
        # re-packing the samples; the resulting bytes are identical.
        return bytes(audio_data[start_idx * 2:(end_idx + 1) * 2])

    except Exception as e:
        # Deliberate best effort: trimming must never break transcription.
        log_error(f"❌ Silence trimming failed: {e}")
        return audio_data
|
179 |
|
180 |
async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
|
181 |
"""Transcribe audio data using Google Cloud Speech API"""
|
|
|
189 |
|
190 |
# ✅ Audio analizi
|
191 |
self._analyze_audio_content(audio_data)
|
192 |
+
|
193 |
+
# ✅ Silence trimming ekle
|
194 |
+
trimmed_audio = self._trim_silence(audio_data)
|
195 |
|
196 |
# ✅ WAV formatında gönder - Google bu formatı daha iyi tanıyor
|
197 |
wav_audio = self._convert_to_wav_proper(audio_data, config.sample_rate)
|