Spaces:
				
			
			
	
			
			
		Paused
		
	
	
	
			
			
	
	
	
	
		
		
		Paused
		
"""
Google Cloud Speech-to-Text Implementation
"""
| import os | |
| import asyncio | |
| from typing import AsyncIterator, Optional, List | |
| from google.cloud import speech_v1p1beta1 as speech | |
| from google.api_core import exceptions | |
| from utils import log | |
| from stt_interface import STTInterface, STTConfig, TranscriptionResult | |
class GoogleCloudSTT(STTInterface):
    """Google Cloud Speech-to-Text implementation of ``STTInterface``.

    Audio chunks passed to :meth:`stream_audio` are buffered in an
    ``asyncio.Queue`` and forwarded to the ``streaming_recognize`` RPC of
    ``speech.SpeechAsyncClient``.

    NOTE(review): the prefixes of the log messages below look mis-encoded
    (probably emoji that were decoded with the wrong charset). They are kept
    unchanged here - confirm the intended characters against the repository.
    """

    # Seconds the request generator waits for new audio before re-checking
    # ``is_streaming``. Bounds how long the RPC stays open after a stop.
    _QUEUE_POLL_INTERVAL = 0.1

    def __init__(self, credentials_path: str):
        """Create the async Speech client.

        Args:
            credentials_path: Path to a Google service-account file. When the
                path is non-empty and exists it is exported through the
                ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable;
                otherwise the client is created with whatever default
                credentials the environment provides.
        """
        if credentials_path and os.path.exists(credentials_path):
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
            log(f"β Google credentials set from: {credentials_path}")
        else:
            log("β οΈ Google credentials path not found, using default credentials")

        self.client = speech.SpeechAsyncClient()
        self.streaming_config = None
        self.is_streaming = False
        self.audio_queue = asyncio.Queue()

    async def start_streaming(self, config: STTConfig) -> None:
        """Build the streaming configuration and mark the session as active.

        Args:
            config: Session settings; the fields read here are ``encoding``,
                ``sample_rate``, ``language``, ``enable_punctuation``,
                ``enable_word_timestamps``, ``model``, ``use_enhanced``,
                ``interim_results`` and ``single_utterance``.

        Raises:
            Exception: Any error raised while building the configuration is
                logged and re-raised unchanged.
        """
        try:
            recognition_config = speech.RecognitionConfig(
                encoding=self._get_encoding(config.encoding),
                sample_rate_hertz=config.sample_rate,
                language_code=config.language,
                enable_automatic_punctuation=config.enable_punctuation,
                enable_word_time_offsets=config.enable_word_timestamps,
                model=config.model,
                use_enhanced=config.use_enhanced,
                metadata=speech.RecognitionMetadata(
                    interaction_type=speech.RecognitionMetadata.InteractionType.VOICE_SEARCH,
                    recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PC,
                    audio_topic="general",
                ),
            )
            self.streaming_config = speech.StreamingRecognitionConfig(
                config=recognition_config,
                interim_results=config.interim_results,
                single_utterance=config.single_utterance,
            )
            # Audio left over from a session that ended abnormally (e.g. the
            # OutOfRange path in stream_audio) must not be sent under the new
            # configuration.
            self._drain_audio_queue()
            self.is_streaming = True
            log("β Google STT streaming session started")
        except Exception as e:
            log(f"β Failed to start Google STT streaming: {e}")
            raise

    async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
        """Queue an audio chunk and yield transcription results.

        Yields nothing (after logging a warning) when no session is active.

        NOTE(review): every call opens its own ``streaming_recognize`` RPC
        that reads from the shared queue. Callers presumably should keep one
        iteration alive per session - confirm against the call sites.

        Args:
            audio_chunk: Raw audio bytes in the encoding given to
                :meth:`start_streaming`.

        Yields:
            A ``TranscriptionResult`` built from the first alternative of
            each result that has at least one alternative.

        Raises:
            Exception: Any error other than ``OutOfRange`` is logged and
                re-raised. ``OutOfRange`` ends the session silently.
        """
        if not self.is_streaming:
            log("β οΈ STT streaming not started")
            return

        try:
            await self.audio_queue.put(audio_chunk)

            # The async client accepts only a request iterator; the
            # ``(config, requests)`` call form exists on the sync client's
            # helper mixin, so the config travels as the first request.
            responses = await self.client.streaming_recognize(
                requests=self._request_stream()
            )

            async for response in responses:
                for result in response.results:
                    if not result.alternatives:
                        continue
                    best = result.alternatives[0]
                    yield TranscriptionResult(
                        text=best.transcript,
                        is_final=result.is_final,
                        confidence=best.confidence,
                        timestamp=asyncio.get_running_loop().time(),
                    )
        except exceptions.OutOfRange:
            log("β οΈ Google STT: Exceeded maximum audio duration")
            self.is_streaming = False
        except Exception as e:
            log(f"β Google STT streaming error: {e}")
            raise

    async def _request_stream(self):
        """Yield the config request followed by queued audio requests.

        The generator finishes once ``is_streaming`` becomes false, which
        closes the request side of the RPC so the response iterator can end.
        """
        # The first message of a streaming call must carry the configuration
        # and no audio.
        yield speech.StreamingRecognizeRequest(streaming_config=self.streaming_config)

        while self.is_streaming:
            try:
                # Wait with a timeout instead of blocking indefinitely so a
                # stop request is noticed even when no more audio arrives.
                chunk = await asyncio.wait_for(
                    self.audio_queue.get(),
                    timeout=self._QUEUE_POLL_INTERVAL,
                )
            except asyncio.TimeoutError:
                continue
            yield speech.StreamingRecognizeRequest(audio_content=chunk)

    def _drain_audio_queue(self) -> int:
        """Remove all buffered chunks and return how many were discarded."""
        dropped = 0
        while True:
            try:
                self.audio_queue.get_nowait()
            except asyncio.QueueEmpty:
                return dropped
            dropped += 1

    async def stop_streaming(self) -> Optional[TranscriptionResult]:
        """Stop the session and discard audio that was never sent.

        Returns:
            Always ``None``; no final result is produced here.
        """
        self.is_streaming = False
        log("π Google STT streaming stopped")

        # TODO: Process remaining audio instead of dropping it. Until then
        # it is discarded so it cannot leak into the next session.
        dropped = self._drain_audio_queue()
        if dropped:
            log(f"Google STT: discarded {dropped} unprocessed audio chunk(s)")

        return None

    def supports_realtime(self) -> bool:
        """Google Cloud Speech supports real-time streaming"""
        return True

    def get_supported_languages(self) -> List[str]:
        """Get list of supported language codes"""
        return [
            "tr-TR",  # Turkish
            "en-US",  # English (US)
            "en-GB",  # English (UK)
            "de-DE",  # German
            "fr-FR",  # French
            "es-ES",  # Spanish
            "it-IT",  # Italian
            "pt-BR",  # Portuguese (Brazil)
            "ru-RU",  # Russian
            "ja-JP",  # Japanese
            "ko-KR",  # Korean
            "zh-CN",  # Chinese (Simplified)
        ]

    def _get_encoding(self, encoding: str):
        """Convert encoding string to Google Cloud Speech encoding.

        Unknown names fall back to ``WEBM_OPUS``.
        """
        audio_encoding = speech.RecognitionConfig.AudioEncoding
        encoding_map = {
            "LINEAR16": audio_encoding.LINEAR16,
            "FLAC": audio_encoding.FLAC,
            "MULAW": audio_encoding.MULAW,
            "AMR": audio_encoding.AMR,
            "AMR_WB": audio_encoding.AMR_WB,
            "OGG_OPUS": audio_encoding.OGG_OPUS,
            "SPEEX_WITH_HEADER_BYTE": audio_encoding.SPEEX_WITH_HEADER_BYTE,
            "WEBM_OPUS": audio_encoding.WEBM_OPUS,
        }
        return encoding_map.get(encoding, audio_encoding.WEBM_OPUS)

