Spaces:
Running
Running
File size: 8,169 Bytes
78b5a88 a532986 78b5a88 a532986 78b5a88 a532986 78b5a88 a532986 78b5a88 a532986 78b5a88 3586948 a532986 3586948 5ab11dd 3586948 5ab11dd 3586948 5ab11dd 3586948 5ab11dd 3586948 5ab11dd 3586948 5ab11dd 3586948 5ab11dd 3586948 5ab11dd 3586948 5ab11dd 3586948 5ab11dd 3586948 5ab11dd 3586948 5ab11dd 3586948 5ab11dd 3586948 a532986 3586948 a532986 3586948 a532986 3586948 f4b2af6 a532986 3586948 a532986 3586948 f4b2af6 9c60cb5 a532986 9c60cb5 f4b2af6 ee90174 ddfbba9 20f6ba5 f4b2af6 a532986 3586948 a532986 f4b2af6 a532986 3586948 78b5a88 a532986 1e56973 a532986 1e56973 a532986 1e56973 a532986 3586948 78b5a88 a532986 3586948 78b5a88 a532986 78b5a88 a532986 78b5a88 a532986 78b5a88 3586948 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 |
"""
Google Cloud Speech-to-Text Implementation - Simple Batch Mode
"""
from typing import Optional, List
from datetime import datetime
import io
import wave
from google.cloud import speech
from google.cloud.speech import RecognitionConfig, RecognitionAudio
from utils.logger import log_info, log_error, log_debug, log_warning
from .stt_interface import STTInterface, STTConfig, TranscriptionResult
class GoogleSTT(STTInterface):
    """Google Cloud Speech-to-Text provider (simple batch mode).

    Each ``transcribe`` call wraps the supplied raw PCM bytes in a mono,
    16-bit WAV container and sends them to the API in a single
    ``recognize`` request.
    """

    # Bare language codes mapped to the BCP-47 tags sent to Google.
    # Full tags (e.g. "en-GB") need no entry: unknown keys pass through as-is.
    _LANGUAGE_MAP = {
        "tr": "tr-TR",
        "en": "en-US",
        "de": "de-DE",
        "fr": "fr-FR",
        "es": "es-ES",
        "it": "it-IT",
        "pt": "pt-BR",
        "ru": "ru-RU",
        "ja": "ja-JP",
        "ko": "ko-KR",
        "zh": "zh-CN",
        "ar": "ar-SA",
    }

    def __init__(self, credentials_path: Optional[str] = None):
        """Create the Speech client.

        Args:
            credentials_path: Path to a service account JSON file. When
                omitted, the client is created with default credentials (ADC).

        Raises:
            Exception: Whatever the client constructor raises; it is logged
                and re-raised unchanged.
        """
        try:
            if credentials_path:
                self.client = speech.SpeechClient.from_service_account_file(credentials_path)
                log_info(f"✅ Google STT initialized with service account: {credentials_path}")
            else:
                # Use default credentials (ADC)
                self.client = speech.SpeechClient()
                log_info("✅ Google STT initialized with default credentials")
        except Exception as e:
            log_error(f"❌ Failed to initialize Google STT: {str(e)}")
            raise

    def _map_language_code(self, language: str) -> str:
        """Map a language code to the BCP-47 tag Google expects.

        Codes that are not in the map are returned unchanged.
        """
        return self._LANGUAGE_MAP.get(language, language)

    @staticmethod
    def _log_audio_stats(audio_data: bytes) -> None:
        """Log a hex preview and amplitude statistics of the audio at debug level.

        Buffers of 100 bytes or fewer are skipped.
        """
        if len(audio_data) <= 100:
            return

        log_debug(f"Audio first 50 bytes: {audio_data[:50].hex()}")
        log_debug(f"Audio last 50 bytes: {audio_data[-50:].hex()}")

        import struct

        # unpack_from tolerates a dangling trailing byte, so an odd-length
        # buffer no longer raises and aborts the whole transcription.
        sample_count = len(audio_data) // 2
        samples = struct.unpack_from(f"<{sample_count}h", audio_data)
        magnitudes = [abs(sample) for sample in samples]
        avg_amplitude = sum(magnitudes) / sample_count
        max_amplitude = max(magnitudes)
        log_debug(f"Audio stats: avg_amplitude={avg_amplitude:.1f}, max_amplitude={max_amplitude}")

    @staticmethod
    def _build_recognition_config(config: STTConfig, language_code: str) -> RecognitionConfig:
        """Build the request config from the caller's settings."""
        # NOTE(review): config.model / use_enhanced are not forwarded, matching
        # what the previous live request sent - confirm whether they should be.
        return RecognitionConfig(
            encoding=RecognitionConfig.AudioEncoding.LINEAR16,
            # Must agree with the rate written into the WAV header below.
            sample_rate_hertz=config.sample_rate,
            language_code=language_code,
            audio_channel_count=1,  # The frontend sends mono audio
            enable_separate_recognition_per_channel=False,
            enable_automatic_punctuation=bool(getattr(config, "enable_punctuation", False)),
            # Without this the response carries no word timings to extract.
            enable_word_time_offsets=bool(config.enable_word_timestamps),
        )

    @staticmethod
    def _build_result(response, config: STTConfig, language_code: str) -> Optional[TranscriptionResult]:
        """Combine the top alternative of every result into one transcription.

        Returns None when no result carries an alternative.
        """
        best = [result.alternatives[0] for result in response.results if result.alternatives]
        if not best:
            return None

        if len(best) == 1:
            text = best[0].transcript
        else:
            # Results cover consecutive portions of the audio; keep them all
            # instead of silently dropping everything after the first.
            text = " ".join(filter(None, (alt.transcript.strip() for alt in best)))
        confidence = sum(alt.confidence for alt in best) / len(best)

        word_timestamps = None
        if config.enable_word_timestamps:
            word_timestamps = [
                {
                    "word": word_info.word,
                    "start_time": word_info.start_time.total_seconds(),
                    "end_time": word_info.end_time.total_seconds(),
                }
                for alt in best
                for word_info in getattr(alt, "words", ())
            ]

        log_info(f"✅ Transcription: '{text}' (confidence: {confidence:.2f})")
        return TranscriptionResult(
            text=text,
            confidence=confidence,
            timestamp=datetime.now().timestamp(),
            language=language_code,
            word_timestamps=word_timestamps,
        )

    async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
        """Transcribe audio data using the Google Cloud Speech API.

        Args:
            audio_data: Raw PCM bytes; they are written into a mono, 16-bit
                WAV container before upload.
            config: Supplies the language, sample rate and word-timestamp
                options for the request.

        Returns:
            The transcription, or None when there is no audio, the API
            returns no results, or any error occurs (errors are logged).
        """
        try:
            if not audio_data:
                log_warning("⚠️ No audio data provided")
                return None

            log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")
            self._log_audio_stats(audio_data)

            # Convert to WAV format for better compatibility
            wav_audio = self._convert_to_wav(audio_data, config.sample_rate)

            language_code = self._map_language_code(config.language)
            recognition_config = self._build_recognition_config(config, language_code)
            log_debug(f"Recognition config: language={language_code}, sample_rate={config.sample_rate}, model={config.model}")

            audio = RecognitionAudio(content=wav_audio)

            log_info("🔄 Sending audio to Google Cloud Speech API...")
            # recognize() is a blocking network call; run it in the default
            # executor so this coroutine does not stall the event loop.
            import asyncio
            import functools

            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(
                None,
                functools.partial(self.client.recognize, config=recognition_config, audio=audio),
            )
            log_debug(f"API Response: {response}")

            transcription = self._build_result(response, config, language_code)
            if transcription is None:
                log_warning("⚠️ No transcription results")
            return transcription
        except Exception as e:
            # Best-effort boundary: callers receive None instead of an exception.
            log_error(f"❌ Error during transcription: {str(e)}")
            import traceback

            log_error(f"Traceback: {traceback.format_exc()}")
            return None

    def _convert_to_wav(self, audio_data: bytes, sample_rate: int) -> bytes:
        """Wrap raw PCM bytes in an in-memory mono, 16-bit WAV container."""
        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, 'wb') as wav_file:
            wav_file.setnchannels(1)  # Mono
            wav_file.setsampwidth(2)  # 16-bit
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(audio_data)
        return wav_buffer.getvalue()

    def get_supported_languages(self) -> List[str]:
        """Return a partial list of language tags supported by the service."""
        return [
            "tr-TR", "en-US", "en-GB", "en-AU", "en-CA", "en-IN",
            "es-ES", "es-MX", "es-AR", "fr-FR", "fr-CA", "de-DE",
            "it-IT", "pt-BR", "pt-PT", "ru-RU", "ja-JP", "ko-KR",
            "zh-CN", "zh-TW", "ar-SA", "ar-EG", "hi-IN", "nl-NL",
            "pl-PL", "sv-SE", "da-DK", "no-NO", "fi-FI", "el-GR",
            "he-IL", "th-TH", "vi-VN", "id-ID", "ms-MY", "fil-PH",
        ]

    def get_provider_name(self) -> str:
        """Return the provider identifier."""
        return "google"