Spaces:
Running
Running
File size: 10,158 Bytes
78b5a88 a532986 78b5a88 a532986 78b5a88 a532986 78b5a88 a532986 78b5a88 a532986 78b5a88 3586948 a532986 3586948 5ab11dd 3586948 5ab11dd 3586948 5ab11dd 3586948 5ab11dd 3586948 5ab11dd 3586948 5ab11dd 3586948 5ab11dd 3586948 5ab11dd 3586948 5ab11dd 3586948 5ab11dd 3586948 5ab11dd 3586948 5ab11dd 3586948 5ab11dd 3586948 a532986 3586948 a532986 3586948 a532986 3586948 6abf273 f4b2af6 6abf273 a532986 3586948 a532986 f4b2af6 ee90174 ddfbba9 6abf273 ddfbba9 6abf273 20f6ba5 6abf273 a532986 3586948 a532986 6abf273 a532986 6abf273 a532986 6abf273 3586948 78b5a88 a532986 6abf273 a532986 1e56973 6abf273 a532986 1e56973 6abf273 3586948 78b5a88 a532986 3586948 78b5a88 a532986 78b5a88 a532986 78b5a88 a532986 78b5a88 3586948 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 |
"""
Google Cloud Speech-to-Text Implementation - Simple Batch Mode
"""
import asyncio
import functools
import io
import struct
import traceback
import wave
from datetime import datetime
from typing import Optional, List

from google.cloud import speech
from google.cloud.speech import RecognitionConfig, RecognitionAudio

from utils.logger import log_info, log_error, log_debug, log_warning
from .stt_interface import STTInterface, STTConfig, TranscriptionResult
class GoogleSTT(STTInterface):
    """Google Cloud Speech-to-Text provider using synchronous (batch) recognition.

    Audio is expected as raw 16-bit little-endian mono PCM. Before it is sent
    to Google, the samples are inspected and detailed diagnostics are logged;
    audio that is effectively silent is rejected without an API call.
    """

    # Short and full language codes mapped to the BCP-47 codes Google expects.
    _LANGUAGE_MAP = {
        "tr": "tr-TR",
        "tr-TR": "tr-TR",
        "en": "en-US",
        "en-US": "en-US",
        "en-GB": "en-GB",
        "de": "de-DE",
        "de-DE": "de-DE",
        "fr": "fr-FR",
        "fr-FR": "fr-FR",
        "es": "es-ES",
        "es-ES": "es-ES",
        "it": "it-IT",
        "it-IT": "it-IT",
        "pt": "pt-BR",
        "pt-BR": "pt-BR",
        "ru": "ru-RU",
        "ru-RU": "ru-RU",
        "ja": "ja-JP",
        "ja-JP": "ja-JP",
        "ko": "ko-KR",
        "ko-KR": "ko-KR",
        "zh": "zh-CN",
        "zh-CN": "zh-CN",
        "ar": "ar-SA",
        "ar-SA": "ar-SA",
    }

    # Language used when the config does not carry one (the previous
    # implementation always sent this value).
    _DEFAULT_LANGUAGE = "tr-TR"

    # Audio whose loudest sample is below this is treated as silence.
    _SILENCE_MAX_AMPLITUDE = 100
    # Audio with a larger share of exact-zero samples than this is rejected.
    _MAX_ZERO_RATIO = 0.95
    # RMS level above which a window is considered to contain speech.
    _SPEECH_RMS_THRESHOLD = 500
    # Number of samples per RMS window when searching for the speech start.
    _RMS_WINDOW_SIZE = 100
    # Number of equal sections the audio is split into for diagnostics.
    _ANALYSIS_SECTIONS = 10

    def __init__(self, credentials_path: Optional[str] = None):
        """Create the Google Speech client.

        Args:
            credentials_path: Path to a service account JSON file. When
                omitted, the client is built with default credentials (ADC).

        Raises:
            Exception: Whatever the client constructor raises; the error is
                logged and re-raised unchanged.
        """
        try:
            if credentials_path:
                self.client = speech.SpeechClient.from_service_account_file(credentials_path)
                log_info(f"✅ Google STT initialized with service account: {credentials_path}")
            else:
                # Use default credentials (ADC)
                self.client = speech.SpeechClient()
                log_info("✅ Google STT initialized with default credentials")
        except Exception as e:
            log_error(f"❌ Failed to initialize Google STT: {str(e)}")
            raise

    def _map_language_code(self, language: str) -> str:
        """Map a language code to Google's BCP-47 format.

        Codes that are not in the map are returned unchanged.
        """
        return self._LANGUAGE_MAP.get(language, language)

    def _log_audio_diagnostics(self, samples: tuple, sample_rate: int) -> tuple:
        """Log amplitude statistics for *samples*.

        Args:
            samples: Non-empty sequence of signed 16-bit sample values.
            sample_rate: Samples per second, used only to convert the detected
                speech start index into seconds for the log.

        Returns:
            ``(max_amplitude, zero_ratio)`` where ``max_amplitude`` is the
            largest absolute sample value and ``zero_ratio`` is the fraction
            of samples that are exactly zero.
        """
        total_samples = len(samples)
        magnitudes = [abs(s) for s in samples]

        # 1. Overall statistics (averages ignore exact-zero samples)
        non_zero = [m for m in magnitudes if m]
        zero_count = total_samples - len(non_zero)
        zero_ratio = zero_count / total_samples
        avg_amplitude = sum(non_zero) / len(non_zero) if non_zero else 0
        max_amplitude = max(non_zero, default=0)

        log_info(f"🔍 Audio stats: {total_samples} total samples, {zero_count} zeros ({zero_ratio:.1%})")
        log_info(f"🔍 Non-zero stats: avg={avg_amplitude:.1f}, max={max_amplitude}")

        # 2. Per-section analysis; the last section absorbs the remainder
        sections = self._ANALYSIS_SECTIONS
        section_size = total_samples // sections
        log_info(f"🔍 Section analysis (each {section_size} samples):")
        for i in range(sections):
            start_idx = i * section_size
            end_idx = (i + 1) * section_size if i < sections - 1 else total_samples
            section = magnitudes[start_idx:end_idx]
            if not section:
                # Happens when there are fewer samples than sections; skipping
                # avoids a division by zero below.
                continue
            section_non_zero = [m for m in section if m]
            section_max = max(section_non_zero, default=0)
            section_avg = sum(section_non_zero) / len(section_non_zero) if section_non_zero else 0
            section_zero_ratio = (len(section) - len(section_non_zero)) / len(section)
            log_info(f" Section {i+1}: max={section_max}, avg={section_avg:.1f}, zeros={section_zero_ratio:.1%}")

        # 3. Find where speech first starts, using fixed-size RMS windows.
        #    The "+ 1" makes sure the final full window is examined too.
        window_size = self._RMS_WINDOW_SIZE
        speech_start_idx = -1
        for i in range(0, total_samples - window_size + 1, window_size):
            window = samples[i:i + window_size]
            rms = (sum(s * s for s in window) / window_size) ** 0.5
            if rms > self._SPEECH_RMS_THRESHOLD:
                speech_start_idx = i
                break

        if speech_start_idx >= 0:
            speech_start_time = speech_start_idx / sample_rate
            log_info(f"🎤 Speech detected starting at sample {speech_start_idx} ({speech_start_time:.2f}s)")
        else:
            log_warning("⚠️ No speech detected above threshold in entire audio")

        return max_amplitude, zero_ratio

    def _log_response(self, response) -> None:
        """Log a summary of a recognize response and every alternative in it."""
        log_info("🔍 Google response details:")
        log_info(f" - Has results: {bool(response.results)}")
        log_info(f" - Results count: {len(response.results) if response.results else 0}")
        if hasattr(response, 'total_billed_time'):
            if response.total_billed_time and response.total_billed_time.total_seconds() > 0:
                log_info(f" - Billed time: {response.total_billed_time.total_seconds()}s")
            else:
                log_info(" - Billed time: 0s (no audio processed)")

        for i, result in enumerate(response.results):
            log_info(f" - Result {i}: {len(result.alternatives)} alternatives")
            for j, alt in enumerate(result.alternatives):
                log_info(f" - Alt {j}: '{alt.transcript}' (conf: {alt.confidence:.3f})")

    async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
        """Transcribe a complete audio buffer with one synchronous recognize call.

        Args:
            audio_data: Raw 16-bit little-endian mono PCM bytes.
            config: STT settings. ``config.sample_rate`` is used for both the
                WAV header and the recognition request. ``config.language``
                is used when present (NOTE(review): attribute name assumed,
                confirm against STTConfig); otherwise "tr-TR" is sent.

        Returns:
            A TranscriptionResult built from the top alternative of every
            returned result, or None when the audio is empty or silent,
            nothing was recognized, or any error occurred (errors are logged,
            not raised).
        """
        try:
            if not audio_data:
                log_warning("⚠️ No audio data provided")
                return None

            log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")

            # Samples are 2 bytes each; a dangling odd byte would make
            # struct.unpack fail, so drop it.
            if len(audio_data) % 2:
                log_warning("⚠️ Odd number of audio bytes, dropping trailing byte")
                audio_data = audio_data[:-1]
                if not audio_data:
                    log_warning("⚠️ No audio data provided")
                    return None

            # Detailed audio analysis (log only); "<" pins little-endian PCM.
            samples = struct.unpack(f"<{len(audio_data) // 2}h", audio_data)
            max_amplitude, zero_ratio = self._log_audio_diagnostics(samples, config.sample_rate)

            # 4. Skip the API call when the audio is effectively empty
            if max_amplitude < self._SILENCE_MAX_AMPLITUDE:
                log_warning(f"⚠️ Audio appears silent: max_amplitude={max_amplitude}")
                return None
            if zero_ratio > self._MAX_ZERO_RATIO:
                log_warning(f"⚠️ Audio is mostly zeros: {zero_ratio:.1%}")
                return None

            # Convert to WAV format
            wav_audio = self._convert_to_wav(audio_data, config.sample_rate)

            # The request must agree with the WAV header, so both use
            # config.sample_rate instead of a hard-coded rate.
            language_code = self._map_language_code(
                getattr(config, "language", None) or self._DEFAULT_LANGUAGE
            )
            recognition_config = RecognitionConfig(
                encoding=RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz=config.sample_rate,
                language_code=language_code,
                audio_channel_count=1,
                enable_separate_recognition_per_channel=False,
                enable_automatic_punctuation=True,
            )
            audio = RecognitionAudio(content=wav_audio)

            # recognize() is a blocking network call; run it in the default
            # executor so the event loop keeps serving other tasks.
            log_info("🔄 Sending audio to Google Cloud Speech API...")
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(
                None,
                functools.partial(self.client.recognize, config=recognition_config, audio=audio),
            )

            self._log_response(response)

            # Each result covers a consecutive portion of the audio, so the
            # top alternative of every result is needed for the full text.
            top_alternatives = [r.alternatives[0] for r in response.results if r.alternatives]
            if top_alternatives:
                if len(top_alternatives) == 1:
                    text = top_alternatives[0].transcript
                else:
                    parts = (alt.transcript.strip() for alt in top_alternatives)
                    text = " ".join(part for part in parts if part)
                confidence = sum(alt.confidence for alt in top_alternatives) / len(top_alternatives)

                transcription = TranscriptionResult(
                    text=text,
                    confidence=confidence,
                    timestamp=datetime.now().timestamp(),
                    language=language_code,
                    word_timestamps=None
                )
                log_info(f"✅ Transcription SUCCESS: '{text}' (confidence: {confidence:.2f})")
                return transcription

            log_warning("⚠️ No transcription results - Google couldn't recognize speech")
            return None
        except Exception as e:
            # Best-effort boundary: callers get None instead of an exception.
            log_error(f"❌ Error during transcription: {str(e)}")
            log_error(f"Traceback: {traceback.format_exc()}")
            return None

    def _convert_to_wav(self, audio_data: bytes, sample_rate: int) -> bytes:
        """Wrap raw 16-bit mono PCM bytes in an in-memory WAV container."""
        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, 'wb') as wav_file:
            wav_file.setnchannels(1)  # Mono
            wav_file.setsampwidth(2)  # 16-bit
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(audio_data)
        return wav_buffer.getvalue()

    def get_supported_languages(self) -> List[str]:
        """Return a partial list of language codes supported by Google STT."""
        return [
            "tr-TR", "en-US", "en-GB", "en-AU", "en-CA", "en-IN",
            "es-ES", "es-MX", "es-AR", "fr-FR", "fr-CA", "de-DE",
            "it-IT", "pt-BR", "pt-PT", "ru-RU", "ja-JP", "ko-KR",
            "zh-CN", "zh-TW", "ar-SA", "ar-EG", "hi-IN", "nl-NL",
            "pl-PL", "sv-SE", "da-DK", "no-NO", "fi-FI", "el-GR",
            "he-IL", "th-TH", "vi-VN", "id-ID", "ms-MY", "fil-PH"
        ]

    def get_provider_name(self) -> str:
        """Return the provider identifier."""
        return "google"