# flare/stt/stt_google.py
# Last change: "Update stt/stt_google.py" by ciyidogan (commit a532986, verified); file size 6.42 kB.
"""
Google Cloud Speech-to-Text Implementation - Simple Batch Mode
"""
import asyncio
import io
import traceback
import wave
from datetime import datetime
from typing import Optional, List

from google.cloud import speech
from google.cloud.speech import RecognitionConfig, RecognitionAudio

from utils.logger import log_info, log_error, log_debug, log_warning
from .stt_interface import STTInterface, STTConfig, TranscriptionResult
class GoogleSTT(STTInterface):
    """Speech-to-text provider backed by Google Cloud Speech (batch ``recognize``).

    Each call to :meth:`transcribe` wraps the supplied audio bytes in a mono,
    16-bit WAV container, sends them to the API in a single request, and
    returns one ``TranscriptionResult`` covering every recognized segment.
    """

    # Codes forwarded to Google unchanged. Every entry maps to itself, so the
    # table is currently a pass-through; unknown codes are forwarded as-is too.
    _LANGUAGE_MAP = {
        "tr-TR": "tr-TR",
        "en-US": "en-US",
        "en-GB": "en-GB",
        "de-DE": "de-DE",
        "fr-FR": "fr-FR",
        "es-ES": "es-ES",
        "it-IT": "it-IT",
        "pt-BR": "pt-BR",
        "ru-RU": "ru-RU",
        "ja-JP": "ja-JP",
        "ko-KR": "ko-KR",
        "zh-CN": "zh-CN",
        "ar-SA": "ar-SA",
    }

    # Partial list of language codes advertised by get_supported_languages().
    _SUPPORTED_LANGUAGES = (
        "tr-TR", "en-US", "en-GB", "en-AU", "en-CA", "en-IN",
        "es-ES", "es-MX", "es-AR", "fr-FR", "fr-CA", "de-DE",
        "it-IT", "pt-BR", "pt-PT", "ru-RU", "ja-JP", "ko-KR",
        "zh-CN", "zh-TW", "ar-SA", "ar-EG", "hi-IN", "nl-NL",
        "pl-PL", "sv-SE", "da-DK", "no-NO", "fi-FI", "el-GR",
        "he-IL", "th-TH", "vi-VN", "id-ID", "ms-MY", "fil-PH",
    )

    def __init__(self, credentials_path: Optional[str] = None):
        """Create the Google Speech client.

        Args:
            credentials_path: Path to a service account JSON file. When
                omitted, the client is built with default credentials (ADC).

        Raises:
            Exception: Whatever the client constructor raises; the error is
                logged and re-raised unchanged.
        """
        try:
            if credentials_path:
                self.client = speech.SpeechClient.from_service_account_file(credentials_path)
                log_info(f"✅ Google STT initialized with service account: {credentials_path}")
            else:
                # Use default credentials (ADC)
                self.client = speech.SpeechClient()
                log_info("✅ Google STT initialized with default credentials")
        except Exception as e:
            log_error(f"❌ Failed to initialize Google STT: {e}")
            raise

    def _map_language_code(self, language: str) -> str:
        """Map a language code to the form sent to Google (BCP-47).

        Codes missing from the table are returned unchanged.
        """
        return self._LANGUAGE_MAP.get(language, language)

    async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
        """Transcribe one audio buffer with the Google Cloud Speech API.

        Args:
            audio_data: Audio bytes to recognize. They are written verbatim as
                the frames of a mono 16-bit WAV file before upload.
            config: Recognition settings; this method reads ``sample_rate``,
                ``language``, ``enable_punctuation``, ``model``,
                ``use_enhanced`` and ``enable_word_timestamps``.

        Returns:
            A ``TranscriptionResult`` whose text joins the top alternative of
            every returned segment and whose confidence is the mean over those
            segments, or ``None`` when no audio was given, the API returned
            nothing usable, or any error occurred (errors are logged, not
            raised).
        """
        try:
            if not audio_data:
                log_warning("⚠️ No audio data provided")
                return None

            log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")

            # Convert to WAV format for better compatibility
            wav_audio = self._convert_to_wav(audio_data, config.sample_rate)

            language_code = self._map_language_code(config.language)
            recognition_config = RecognitionConfig(
                encoding=RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz=config.sample_rate,
                language_code=language_code,
                enable_automatic_punctuation=config.enable_punctuation,
                model=config.model,
                use_enhanced=config.use_enhanced,
                enable_word_time_offsets=config.enable_word_timestamps,
            )
            audio = RecognitionAudio(content=wav_audio)

            log_info("🔄 Sending audio to Google Cloud Speech API...")
            # recognize() is a blocking network call; run it in the default
            # executor so this coroutine does not stall the event loop.
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(
                None,
                lambda: self.client.recognize(config=recognition_config, audio=audio),
            )

            # The API returns one result per consecutive portion of the audio;
            # keep the top alternative of each so the transcript is not cut
            # off after the first portion.
            alternatives = [
                result.alternatives[0]
                for result in response.results
                if result.alternatives
            ]
            if not alternatives:
                log_warning("⚠️ No transcription results")
                return None

            pieces = [alt.transcript.strip() for alt in alternatives]
            text = " ".join(piece for piece in pieces if piece)
            confidence = sum(alt.confidence for alt in alternatives) / len(alternatives)

            # Extract word timestamps if requested
            word_timestamps = None
            if config.enable_word_timestamps:
                word_timestamps = [
                    {
                        "word": word_info.word,
                        "start_time": word_info.start_time.total_seconds(),
                        "end_time": word_info.end_time.total_seconds(),
                    }
                    for alt in alternatives
                    for word_info in getattr(alt, "words", ())
                ]

            transcription = TranscriptionResult(
                text=text,
                confidence=confidence,
                timestamp=datetime.now().timestamp(),
                language=language_code,
                word_timestamps=word_timestamps,
            )
            log_info(f"✅ Transcription: '{text}' (confidence: {confidence:.2f})")
            return transcription

        except Exception as e:
            # Best-effort contract: report the failure and signal "no result".
            log_error(f"❌ Error during transcription: {e}")
            log_error(f"Traceback: {traceback.format_exc()}")
            return None

    def _convert_to_wav(self, audio_data: bytes, sample_rate: int) -> bytes:
        """Wrap raw PCM audio in an in-memory WAV container.

        Args:
            audio_data: Sample bytes, written verbatim as the WAV frames.
            sample_rate: Frame rate recorded in the WAV header, in Hz.

        Returns:
            The complete WAV file (header plus frames) as bytes.
        """
        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, 'wb') as wav_file:
            wav_file.setnchannels(1)  # Mono
            wav_file.setsampwidth(2)  # 16-bit
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(audio_data)
        # The header is finalized when the writer closes; the buffer itself
        # stays open, so its full contents can be read back here.
        return wav_buffer.getvalue()

    def get_supported_languages(self) -> List[str]:
        """Return a fresh list of supported language codes (partial list)."""
        return list(self._SUPPORTED_LANGUAGES)

    def get_provider_name(self) -> str:
        """Return the provider identifier, ``"google"``."""
        return "google"