ciyidogan committed
Commit b0a4866 · verified · 1 Parent(s): a847f43

Create stt_google.py

Files changed (1)
  1. stt_google.py +143 -0
stt_google.py ADDED
@@ -0,0 +1,143 @@
+ """
+ Google Cloud Speech-to-Text Implementation
+ """
+
+ import os
+ import asyncio
+ from typing import AsyncIterator, Optional, List
+ from google.cloud import speech_v1p1beta1 as speech
+ from google.api_core import exceptions
+ from utils import log
+ from stt_interface import STTInterface, STTConfig, TranscriptionResult
+
+ class GoogleCloudSTT(STTInterface):
+     """Google Cloud Speech-to-Text implementation"""
+
+     def __init__(self, credentials_path: str):
+         if credentials_path and os.path.exists(credentials_path):
+             os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
+             log(f"✅ Google credentials set from: {credentials_path}")
+         else:
+             log("⚠️ Google credentials path not found, using default credentials")
+
+         self.client = speech.SpeechAsyncClient()
+         self.streaming_config = None
+         self.is_streaming = False
+         self.audio_queue = asyncio.Queue()
+
+     async def start_streaming(self, config: STTConfig) -> None:
+         """Initialize streaming session"""
+         try:
+             recognition_config = speech.RecognitionConfig(
+                 encoding=self._get_encoding(config.encoding),
+                 sample_rate_hertz=config.sample_rate,
+                 language_code=config.language,
+                 enable_automatic_punctuation=config.enable_punctuation,
+                 enable_word_time_offsets=config.enable_word_timestamps,
+                 model=config.model,
+                 use_enhanced=config.use_enhanced,
+                 metadata=speech.RecognitionMetadata(
+                     interaction_type=speech.RecognitionMetadata.InteractionType.VOICE_SEARCH,
+                     recording_device_type=speech.RecognitionMetadata.RecordingDeviceType.PC,
+                     audio_topic="general"
+                 )
+             )
+
+             self.streaming_config = speech.StreamingRecognitionConfig(
+                 config=recognition_config,
+                 interim_results=config.interim_results,
+                 single_utterance=config.single_utterance
+             )
+
+             self.is_streaming = True
+             log("✅ Google STT streaming session started")
+
+         except Exception as e:
+             log(f"❌ Failed to start Google STT streaming: {e}")
+             raise
+
+     async def stream_audio(self, audio_chunk: bytes) -> AsyncIterator[TranscriptionResult]:
+         """Stream audio chunk and get transcription results"""
+         if not self.is_streaming:
+             log("⚠️ STT streaming not started")
+             return
+
+         try:
+             # Queue the incoming audio chunk for the request generator
+             await self.audio_queue.put(audio_chunk)
+
+             # Build the request stream: the first request must carry the
+             # streaming config, all later requests carry raw audio content
+             async def audio_generator():
+                 yield speech.StreamingRecognizeRequest(streaming_config=self.streaming_config)
+                 while self.is_streaming:
+                     chunk = await self.audio_queue.get()
+                     yield speech.StreamingRecognizeRequest(audio_content=chunk)
+
+             # Open the bidirectional stream and consume recognition responses
+             responses = await self.client.streaming_recognize(requests=audio_generator())
+
+             async for response in responses:
+                 for result in response.results:
+                     if result.alternatives:
+                         yield TranscriptionResult(
+                             text=result.alternatives[0].transcript,
+                             is_final=result.is_final,
+                             confidence=result.alternatives[0].confidence,
+                             timestamp=asyncio.get_running_loop().time()
+                         )
+
+         except exceptions.OutOfRange:
+             log("⚠️ Google STT: Exceeded maximum audio duration")
+             self.is_streaming = False
+         except Exception as e:
+             log(f"❌ Google STT streaming error: {e}")
+             raise
+
+     async def stop_streaming(self) -> Optional[TranscriptionResult]:
+         """Stop streaming and get final result"""
+         self.is_streaming = False
+         log("🛑 Google STT streaming stopped")
+
+         # Process any remaining audio in queue
+         if not self.audio_queue.empty():
+             # TODO: Process remaining audio
+             pass
+
+         return None
+
+     def supports_realtime(self) -> bool:
+         """Google Cloud Speech supports real-time streaming"""
+         return True
+
+     def get_supported_languages(self) -> List[str]:
+         """Get list of supported language codes"""
+         return [
+             "tr-TR",  # Turkish
+             "en-US",  # English (US)
+             "en-GB",  # English (UK)
+             "de-DE",  # German
+             "fr-FR",  # French
+             "es-ES",  # Spanish
+             "it-IT",  # Italian
+             "pt-BR",  # Portuguese (Brazil)
+             "ru-RU",  # Russian
+             "ja-JP",  # Japanese
+             "ko-KR",  # Korean
+             "zh-CN",  # Chinese (Simplified)
+         ]
+
+     def _get_encoding(self, encoding: str):
+         """Convert encoding string to Google Cloud Speech encoding"""
+         encoding_map = {
+             "LINEAR16": speech.RecognitionConfig.AudioEncoding.LINEAR16,
+             "FLAC": speech.RecognitionConfig.AudioEncoding.FLAC,
+             "MULAW": speech.RecognitionConfig.AudioEncoding.MULAW,
+             "AMR": speech.RecognitionConfig.AudioEncoding.AMR,
+             "AMR_WB": speech.RecognitionConfig.AudioEncoding.AMR_WB,
+             "OGG_OPUS": speech.RecognitionConfig.AudioEncoding.OGG_OPUS,
+             "SPEEX_WITH_HEADER_BYTE": speech.RecognitionConfig.AudioEncoding.SPEEX_WITH_HEADER_BYTE,
+             "WEBM_OPUS": speech.RecognitionConfig.AudioEncoding.WEBM_OPUS,
+         }
+         return encoding_map.get(encoding, speech.RecognitionConfig.AudioEncoding.WEBM_OPUS)
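
For reference, a minimal usage sketch of this class is below. It is an illustration only: the STTConfig field names (encoding, sample_rate, language, enable_punctuation, enable_word_timestamps, model, use_enhanced, interim_results, single_utterance) are assumed from how GoogleCloudSTT reads them, and the credentials path, model name, and audio bytes are placeholders; adjust them to the actual definitions in stt_interface.py, which is not part of this commit.

import asyncio
import contextlib

from stt_google import GoogleCloudSTT
from stt_interface import STTConfig  # field names below are assumed, not confirmed


async def main():
    # Hypothetical credentials path; the class falls back to default credentials if it is missing
    stt = GoogleCloudSTT(credentials_path="gcp-credentials.json")

    config = STTConfig(
        encoding="WEBM_OPUS",        # mapped to the Google enum by _get_encoding()
        sample_rate=16000,
        language="tr-TR",
        enable_punctuation=True,
        enable_word_timestamps=False,
        model="latest_long",         # assumed Google STT model name
        use_enhanced=True,
        interim_results=True,
        single_utterance=False,
    )
    await stt.start_streaming(config)

    async def consume(first_chunk: bytes):
        # stream_audio opens the gRPC stream and yields results as they arrive
        async for result in stt.stream_audio(first_chunk):
            marker = "final" if result.is_final else "interim"
            print(f"[{marker}] {result.text} (confidence={result.confidence:.2f})")

    # Placeholder audio bytes; in a real pipeline further chunks would be
    # queued continuously (e.g. from a websocket) via stt.audio_queue.put()
    consumer = asyncio.create_task(consume(b"..."))
    await asyncio.sleep(5.0)

    await stt.stop_streaming()  # flips is_streaming so the audio generator can exit
    consumer.cancel()           # the request generator may still be blocked on the queue
    with contextlib.suppress(asyncio.CancelledError):
        await consumer


if __name__ == "__main__":
    asyncio.run(main())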