Spaces:

UcsTurkey
/

flare

Paused

App Files Files Community

flare / stt_google.py

ciyidogan

Update stt_google.py

8d8ad71 verified 4 months ago

raw

history blame

5.39 kB

	"""
	Google Cloud Speech-to-Text Implementation
	"""
	import os
	import asyncio
	from typing import AsyncIterator, Optional, List
	from datetime import datetime
	import sys
	from logger import log_info, log_error, log_debug, log_warning

	# Import Google Cloud Speech only if available.
	# GOOGLE_SPEECH_AVAILABLE records whether the optional dependency could be
	# imported, so the rest of the module can be loaded without it.
	try:
	    from google.cloud import speech_v1p1beta1 as speech
	    from google.api_core import exceptions
	    GOOGLE_SPEECH_AVAILABLE = True
	except ImportError:
	    GOOGLE_SPEECH_AVAILABLE = False
	    # A missing dependency is a warning condition, not routine information.
	    log_warning("⚠️ Google Cloud Speech library not installed")

	from stt_interface import STTInterface, STTConfig, TranscriptionResult

	class GoogleCloudSTT(STTInterface):
	    """Google Cloud Speech-to-Text implementation.

	    The streaming path is currently a placeholder: ``stream_audio`` buffers
	    each chunk on ``audio_queue`` and finishes without yielding any
	    transcription result, and ``stop_streaming`` always returns ``None``.
	    """

	    def __init__(self, credentials_path: str):
	        """Create the async Speech client.

	        Args:
	            credentials_path: Path to a Google credentials file. When the path
	                is non-empty and exists, it is exported through the
	                ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable;
	                otherwise the environment is left untouched.

	        Raises:
	            ImportError: If the google-cloud-speech library is not installed.
	        """
	        if not GOOGLE_SPEECH_AVAILABLE:
	            raise ImportError("google-cloud-speech library not installed. Run: pip install google-cloud-speech")

	        if credentials_path and os.path.exists(credentials_path):
	            # NOTE: this changes the environment of the whole process.
	            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
	            log_info(f"✅ Google credentials set from: {credentials_path}")
	        else:
	            log_warning("⚠️ Google credentials path not found, using default credentials")

	        self.client = speech.SpeechAsyncClient()
	        self.streaming_config = None
	        self.is_streaming = False
	        self.audio_queue = asyncio.Queue()

	    def _drain_audio_queue(self) -> int:
	        """Discard every chunk currently buffered in ``audio_queue``.

	        Returns:
	            The number of chunks that were dropped.
	        """
	        dropped = 0
	        while True:
	            try:
	                self.audio_queue.get_nowait()
	            except asyncio.QueueEmpty:
	                return dropped
	            dropped += 1

	    async def start_streaming(self, config: STTConfig) -> None:
	        """Initialize streaming session.

	        Builds the recognition and streaming configuration from ``config``,
	        stores the latter in ``self.streaming_config`` and marks the instance
	        as streaming.

	        Args:
	            config: Source of the encoding, sample rate, language, punctuation,
	                word-timestamp, model, enhanced-model, interim-result and
	                single-utterance settings.

	        Raises:
	            Exception: Any error raised while building the configuration is
	                logged and re-raised unchanged.
	        """
	        try:
	            recognition_config = speech.RecognitionConfig(
	                encoding=self._get_encoding(config.encoding),
	                sample_rate_hertz=config.sample_rate,
	                language_code=config.language,
	                enable_automatic_punctuation=config.enable_punctuation,
	                enable_word_time_offsets=config.enable_word_timestamps,
	                model=config.model,
	                use_enhanced=config.use_enhanced,
	            )

	            self.streaming_config = speech.StreamingRecognitionConfig(
	                config=recognition_config,
	                interim_results=config.interim_results,
	                single_utterance=config.single_utterance,
	            )

	            # Never let audio buffered by an earlier session leak into this one.
	            self._drain_audio_queue()

	            self.is_streaming = True
	            log_info("✅ Google STT streaming started")

	        except Exception as e:
	            log_error("❌ Failed to start Google STT streaming", e)
	            raise

	    async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
	        """Stream audio chunk and get transcription results.

	        This is an async generator, so nothing below runs (including the
	        ``is_streaming`` check) until the caller starts iterating it.

	        Placeholder implementation: the chunk is buffered on ``audio_queue``
	        and the generator then ends without yielding a result.

	        Args:
	            audio_chunk: Audio data to buffer.

	        Raises:
	            RuntimeError: If ``start_streaming()`` has not been called.
	            Exception: Any error raised while buffering is logged and
	                re-raised unchanged.
	        """
	        if not self.is_streaming:
	            raise RuntimeError("Streaming not started. Call start_streaming() first.")

	        try:
	            # Add audio to queue
	            await self.audio_queue.put(audio_chunk)

	            # This is a simplified version - actual implementation would need
	            # proper streaming handling with Google's API.
	            # For now, return empty iterator
	            return
	            yield  # Unreachable; its presence makes this an async generator

	        except Exception as e:
	            log_error("❌ Google STT streaming error", e)
	            raise

	    async def stop_streaming(self) -> Optional[TranscriptionResult]:
	        """Stop streaming and get final result.

	        Clears the streaming flag and discards any audio still buffered, so the
	        queue cannot grow across sessions.

	        Returns:
	            Always ``None``; no final result is produced yet.

	        Raises:
	            Exception: Any error raised while stopping is logged and re-raised
	                unchanged.
	        """
	        if not self.is_streaming:
	            return None

	        try:
	            self.is_streaming = False
	            self._drain_audio_queue()
	            log_info("✅ Google STT streaming stopped")

	            # Return final result if any
	            return None

	        except Exception as e:
	            log_error("❌ Failed to stop Google STT streaming", e)
	            raise

	    def supports_realtime(self) -> bool:
	        """Google Cloud STT supports real-time streaming"""
	        return True

	    def get_supported_languages(self) -> List[str]:
	        """Get list of supported language codes"""
	        return [
	            "tr-TR",  # Turkish
	            "en-US",  # English (US)
	            "en-GB",  # English (UK)
	            "de-DE",  # German
	            "fr-FR",  # French
	            "es-ES",  # Spanish
	            "it-IT",  # Italian
	            "pt-BR",  # Portuguese (Brazil)
	            "ru-RU",  # Russian
	            "ja-JP",  # Japanese
	            "ko-KR",  # Korean
	            "zh-CN",  # Chinese (Simplified)
	            "ar-SA",  # Arabic
	        ]

	    def get_provider_name(self) -> str:
	        """Get provider name"""
	        return "google"

	    def _get_encoding(self, encoding_str: str):
	        """Convert encoding string to Google Speech enum.

	        Names are matched case-insensitively and with surrounding whitespace
	        ignored. An unrecognised value falls back to ``WEBM_OPUS`` and a
	        warning is logged so the misconfiguration is visible.

	        Args:
	            encoding_str: One of ``WEBM_OPUS``, ``LINEAR16``, ``FLAC``, ``MP3``
	                or ``OGG_OPUS``.

	        Returns:
	            The matching ``RecognitionConfig.AudioEncoding`` member, or ``None``
	            when the Google Speech library is not available.
	        """
	        if not GOOGLE_SPEECH_AVAILABLE:
	            return None

	        audio_encoding = speech.RecognitionConfig.AudioEncoding
	        encoding_map = {
	            "WEBM_OPUS": audio_encoding.WEBM_OPUS,
	            "LINEAR16": audio_encoding.LINEAR16,
	            "FLAC": audio_encoding.FLAC,
	            "MP3": audio_encoding.MP3,
	            "OGG_OPUS": audio_encoding.OGG_OPUS,
	        }

	        # Normalise only real strings; anything else is looked up unchanged.
	        if isinstance(encoding_str, str):
	            key = encoding_str.strip().upper()
	        else:
	            key = encoding_str

	        if key in encoding_map:
	            return encoding_map[key]

	        log_warning(f"⚠️ Unsupported audio encoding {encoding_str!r}, falling back to WEBM_OPUS")
	        return audio_encoding.WEBM_OPUS