Spaces:
Running
Running
File size: 6,416 Bytes
78b5a88 a532986 78b5a88 a532986 78b5a88 a532986 78b5a88 a532986 78b5a88 a532986 78b5a88 3586948 a532986 3586948 a532986 3586948 a532986 3586948 a532986 3586948 a532986 3586948 a532986 3586948 9c60cb5 a532986 9c60cb5 a532986 3586948 a532986 3586948 a532986 3586948 78b5a88 a532986 1e56973 a532986 1e56973 a532986 1e56973 a532986 3586948 78b5a88 a532986 3586948 78b5a88 a532986 78b5a88 a532986 78b5a88 a532986 78b5a88 3586948 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 |
"""
Google Cloud Speech-to-Text Implementation - Simple Batch Mode
"""
import asyncio
import functools
import io
import traceback
import wave
from datetime import datetime
from typing import Optional, List

from google.cloud import speech
from google.cloud.speech import RecognitionConfig, RecognitionAudio

from utils.logger import log_info, log_error, log_debug, log_warning
from .stt_interface import STTInterface, STTConfig, TranscriptionResult
class GoogleSTT(STTInterface):
    """Speech-to-text provider that sends whole audio buffers to Google Cloud Speech.

    Each call to :meth:`transcribe` issues one ``recognize`` request with the
    complete audio payload (batch mode); nothing is streamed.
    """

    # Google uses BCP-47 language codes. Every entry currently maps to itself;
    # the table is the place to add aliases that need translating.
    _LANGUAGE_MAP = {
        "tr-TR": "tr-TR",
        "en-US": "en-US",
        "en-GB": "en-GB",
        "de-DE": "de-DE",
        "fr-FR": "fr-FR",
        "es-ES": "es-ES",
        "it-IT": "it-IT",
        "pt-BR": "pt-BR",
        "ru-RU": "ru-RU",
        "ja-JP": "ja-JP",
        "ko-KR": "ko-KR",
        "zh-CN": "zh-CN",
        "ar-SA": "ar-SA",
    }

    def __init__(self, credentials_path: Optional[str] = None):
        """Create the Speech client.

        Args:
            credentials_path: Path to a service account JSON file. When it is
                omitted (or empty) the client is created without explicit
                credentials, i.e. with the default credentials (ADC).

        Raises:
            Exception: Whatever the client constructor raises; the error is
                logged and then re-raised unchanged.
        """
        try:
            if credentials_path:
                self.client = speech.SpeechClient.from_service_account_file(credentials_path)
                log_info(f"✅ Google STT initialized with service account: {credentials_path}")
            else:
                # Use default credentials (ADC)
                self.client = speech.SpeechClient()
                log_info("✅ Google STT initialized with default credentials")
        except Exception as e:
            log_error(f"❌ Failed to initialize Google STT: {str(e)}")
            raise

    def _map_language_code(self, language: str) -> str:
        """Return the code to send to Google for ``language``.

        Codes missing from the table are passed through unchanged.
        """
        return self._LANGUAGE_MAP.get(language, language)

    async def transcribe(self, audio_data: bytes, config: STTConfig) -> Optional[TranscriptionResult]:
        """Transcribe one audio buffer with the Google Cloud Speech API.

        Args:
            audio_data: Audio bytes; they are wrapped in a mono 16-bit WAV
                container before being sent.
            config: Recognition settings. The fields read here are
                ``sample_rate``, ``language``, ``enable_punctuation``,
                ``model``, ``use_enhanced`` and ``enable_word_timestamps``.

        Returns:
            A ``TranscriptionResult`` built from the top alternative of every
            result in the response, or ``None`` when ``audio_data`` is empty,
            when the response holds no alternatives, or when any exception is
            raised (the error and its traceback are logged, not propagated).
        """
        try:
            # Check if we have audio to transcribe
            if not audio_data:
                log_warning("⚠️ No audio data provided")
                return None

            # NOTE(review): the leading "π" here and in the "Sending audio"
            # message below is what survives of a mis-decoded emoji in this
            # copy of the file; the original glyph cannot be recovered from
            # here - restore it from version control.
            log_info(f"π Transcribing {len(audio_data)} bytes of audio")

            # Convert to WAV format for better compatibility
            wav_audio = self._convert_to_wav(audio_data, config.sample_rate)

            # Configure recognition
            language_code = self._map_language_code(config.language)
            recognition_config = RecognitionConfig(
                encoding=RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz=config.sample_rate,
                language_code=language_code,
                enable_automatic_punctuation=config.enable_punctuation,
                model=config.model,
                use_enhanced=config.use_enhanced,
                enable_word_time_offsets=config.enable_word_timestamps,
            )
            audio = RecognitionAudio(content=wav_audio)

            log_info("π Sending audio to Google Cloud Speech API...")
            # recognize() is a blocking call; run it on the default executor so
            # the event loop stays responsive while the request is in flight.
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(
                None,
                functools.partial(self.client.recognize, config=recognition_config, audio=audio),
            )

            # The response carries one result per consecutive portion of the
            # audio, so the full transcript needs the top alternative of each.
            best = [result.alternatives[0] for result in response.results if result.alternatives]
            if not best:
                log_warning("⚠️ No transcription results")
                return None

            if len(best) == 1:
                # Single portion: hand back the transcript exactly as received.
                text = best[0].transcript
            else:
                text = " ".join(alternative.transcript.strip() for alternative in best)
            confidence = sum(alternative.confidence for alternative in best) / len(best)

            # Extract word timestamps if requested
            word_timestamps = None
            if config.enable_word_timestamps:
                word_timestamps = []
                for alternative in best:
                    for word_info in getattr(alternative, "words", ()):
                        word_timestamps.append(
                            {
                                "word": word_info.word,
                                "start_time": word_info.start_time.total_seconds(),
                                "end_time": word_info.end_time.total_seconds(),
                            }
                        )

            transcription = TranscriptionResult(
                text=text,
                confidence=confidence,
                timestamp=datetime.now().timestamp(),
                language=language_code,
                word_timestamps=word_timestamps,
            )
            log_info(f"✅ Transcription: '{text}' (confidence: {confidence:.2f})")
            return transcription
        except Exception as e:
            # Best effort by design: callers receive None instead of an exception.
            log_error(f"❌ Error during transcription: {str(e)}")
            log_error(f"Traceback: {traceback.format_exc()}")
            return None

    def _convert_to_wav(self, audio_data: bytes, sample_rate: int) -> bytes:
        """Wrap raw PCM bytes in an in-memory WAV container.

        The header is written for mono, 16-bit samples at ``sample_rate`` Hz;
        ``audio_data`` itself is written as the frame data without conversion.
        """
        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, 'wb') as wav_file:
            wav_file.setnchannels(1)  # Mono
            wav_file.setsampwidth(2)  # 16-bit
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(audio_data)
        # The wave module leaves a caller-supplied buffer open on close, so its
        # contents are still readable here.
        return wav_buffer.getvalue()

    def get_supported_languages(self) -> List[str]:
        """Return the language codes this provider advertises (partial list)."""
        return [
            "tr-TR", "en-US", "en-GB", "en-AU", "en-CA", "en-IN",
            "es-ES", "es-MX", "es-AR", "fr-FR", "fr-CA", "de-DE",
            "it-IT", "pt-BR", "pt-PT", "ru-RU", "ja-JP", "ko-KR",
            "zh-CN", "zh-TW", "ar-SA", "ar-EG", "hi-IN", "nl-NL",
            "pl-PL", "sv-SE", "da-DK", "no-NO", "fi-FI", "el-GR",
            "he-IL", "th-TH", "vi-VN", "id-ID", "ms-MY", "fil-PH",
        ]

    def get_provider_name(self) -> str:
        """Return the provider identifier, ``"google"``."""
        return "google"