Update stt/stt_google.py
stt/stt_google.py  +31 -3
CHANGED
@@ -76,12 +76,28 @@ class GoogleSTT(STTInterface):
 
         log_info(f"📊 Transcribing {len(audio_data)} bytes of audio")
 
+        # ✅ Debug - audio data analysis
+        if len(audio_data) > 100:
+            # Inspect the first and last 50 bytes
+            first_50 = audio_data[:50]
+            last_50 = audio_data[-50:]
+            log_debug(f"Audio first 50 bytes: {first_50.hex()}")
+            log_debug(f"Audio last 50 bytes: {last_50.hex()}")
+
+            # Check the average amplitude
+            import struct
+            samples = struct.unpack(f'{len(audio_data)//2}h', audio_data)
+            avg_amplitude = sum(abs(s) for s in samples) / len(samples)
+            max_amplitude = max(abs(s) for s in samples)
+            log_debug(f"Audio stats: avg_amplitude={avg_amplitude:.1f}, max_amplitude={max_amplitude}")
+
         # Convert to WAV format for better compatibility
         wav_audio = self._convert_to_wav(audio_data, config.sample_rate)
 
         # Configure recognition
         language_code = self._map_language_code(config.language)
-
+
+        """
         recognition_config = RecognitionConfig(
             encoding=RecognitionConfig.AudioEncoding.LINEAR16,
             sample_rate_hertz=config.sample_rate,
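
One caveat on the new debug block: struct.unpack(f'{len(audio_data)//2}h', audio_data) uses native byte order and raises struct.error whenever the buffer has an odd byte count. A minimal sketch of a safer variant, assuming 16-bit signed little-endian PCM (the helper name pcm16_stats is illustrative, not part of this commit):

    import struct

    def pcm16_stats(audio_data: bytes) -> tuple[float, int]:
        # Drop a trailing odd byte so unpack cannot fail on misaligned input
        usable = len(audio_data) // 2 * 2
        # '<' pins little-endian regardless of the host platform
        samples = struct.unpack(f'<{usable // 2}h', audio_data[:usable])
        if not samples:
            return 0.0, 0
        avg = sum(abs(s) for s in samples) / len(samples)
        return avg, max(abs(s) for s in samples)
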
@@ -91,14 +107,26 @@ class GoogleSTT(STTInterface):
             use_enhanced=config.use_enhanced,
             enable_word_time_offsets=config.enable_word_timestamps,
         )
-
+        """
+
+        recognition_config = RecognitionConfig(
+            encoding=RecognitionConfig.AudioEncoding.LINEAR16,
+            sample_rate_hertz=16000,  # Fixed
+            language_code="tr-TR",  # Fixed
+            enable_automatic_punctuation=True
+        )
+
+        log_debug(f"Recognition config: language={language_code}, sample_rate={config.sample_rate}, model={config.model}")
+
         # Create audio object
         audio = RecognitionAudio(content=wav_audio)
 
         # Perform synchronous recognition
         log_info(f"🔄 Sending audio to Google Cloud Speech API...")
         response = self.client.recognize(config=recognition_config, audio=audio)
-
+
+        # ✅ Debug response
+        log_debug(f"API Response: {response}")
+
         # Process results
         if response.results:
             result = response.results[0]
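
Note that the pinned values bypass config.language and config.sample_rate entirely, so the log_debug line still reports the mapped language_code even though "tr-TR" is what is actually sent. For context, a self-contained sketch of the same synchronous call against the public google-cloud-speech v1 client (the input file name is illustrative):

    from google.cloud import speech

    client = speech.SpeechClient()

    with open("sample.wav", "rb") as f:  # illustrative input file
        content = f.read()

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,  # mirrors the value pinned in the diff
        language_code="tr-TR",    # mirrors the value pinned in the diff
        enable_automatic_punctuation=True,
    )
    audio = speech.RecognitionAudio(content=content)

    response = client.recognize(config=config, audio=audio)
    for result in response.results:
        best = result.alternatives[0]
        print(best.transcript, best.confidence)
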
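The diff also relies on a _convert_to_wav helper that is not shown in this commit. Assuming it wraps raw 16-bit mono PCM in a WAV container, a stdlib-only sketch could look like this (the standalone name convert_to_wav is hypothetical):

    import io
    import wave

    def convert_to_wav(audio_data: bytes, sample_rate: int) -> bytes:
        # Wrap raw PCM in a WAV header entirely in memory
        buf = io.BytesIO()
        with wave.open(buf, "wb") as wav:
            wav.setnchannels(1)   # mono
            wav.setsampwidth(2)   # 16-bit samples
            wav.setframerate(sample_rate)
            wav.writeframes(audio_data)
        return buf.getvalue()
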
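Likewise, _map_language_code is referenced but not shown; a plausible sketch, assuming a short-code lookup with a pass-through fallback (the table contents are assumptions, not from the commit):

    _LANGUAGE_MAP = {
        "tr": "tr-TR",
        "en": "en-US",
    }

    def map_language_code(language: str) -> str:
        # Fall back to the input unchanged when no mapping exists
        return _LANGUAGE_MAP.get(language, language)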