Omachoko committed
Commit · c7cc357
1 Parent(s): a1492aa

✅ Update speech recognition to use HuggingFace provider pattern
🎙️ Updated transcribe_speech method to use the new InferenceClient pattern:
- provider='hf-inference' with api_key=HF_TOKEN
- Explicit model='openai/whisper-large-v3' parameter
- Enhanced error handling and logging
- Graceful fallback to local speech recognition
🔧 Following the user's exact specification for HF Inference API integration (a minimal sketch of the pattern follows below)
- gaia_system.py +44 -20
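For reference, a minimal self-contained sketch of the provider pattern this commit adopts. It assumes a recent huggingface_hub (>= 0.28, where InferenceClient accepts provider and api_key); the HF_TOKEN environment variable and sample.wav are placeholders, not anything taken from gaia_system.py:

import os
from huggingface_hub import InferenceClient

# Provider-style client: no checkpoint is fixed at construction time;
# the model is passed per call instead.
client = InferenceClient(provider="hf-inference", api_key=os.environ["HF_TOKEN"])

# Transcribe a local audio file with Whisper, choosing the model at call time.
result = client.automatic_speech_recognition("sample.wav", model="openai/whisper-large-v3")
print(result.text)  # the returned output object exposes the transcription as .text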
gaia_system.py
CHANGED
@@ -183,8 +183,11 @@ class UniversalMultimodalToolkit:
         self.clients['image_gen'] = InferenceClient(model="stabilityai/stable-diffusion-xl-base-1.0", token=self.hf_token)
         self.clients['object_detection'] = InferenceClient(model="facebook/detr-resnet-50", token=self.hf_token)

-        # Audio models
-        self.clients['speech_to_text'] = InferenceClient(
+        # Audio models - Updated to use provider pattern for speech recognition
+        self.clients['speech_to_text'] = InferenceClient(
+            provider="hf-inference",
+            api_key=self.hf_token,
+        )
         self.clients['audio_classification'] = InferenceClient(model="facebook/wav2vec2-base-960h", token=self.hf_token)

         # Text generation for multimodal
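A side note on the design choice in this hunk: because the provider-style client binds no model, one client can target different checkpoints per call. A hypothetical illustration (the token and file name are placeholders; openai/whisper-large-v3-turbo is just an example model ID, not something this commit uses):

from huggingface_hub import InferenceClient

stt = InferenceClient(provider="hf-inference", api_key="hf_...")  # placeholder token
# Same client, different Whisper checkpoint per call - no client reconstruction needed.
fast = stt.automatic_speech_recognition("sample.wav", model="openai/whisper-large-v3-turbo")
accurate = stt.automatic_speech_recognition("sample.wav", model="openai/whisper-large-v3")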
@@ -311,30 +314,51 @@ class UniversalMultimodalToolkit:
             return f"❌ Audio analysis failed: {e}"

     def transcribe_speech(self, audio_path: str) -> str:
-        """🎙️ Convert speech to text"""
+        """🎙️ Convert speech to text using Whisper via HuggingFace Inference API"""
         try:
-            [… 12 removed lines of the previous implementation, not preserved in this view …]
+            logger.info(f"🎙️ Transcribing speech from: {audio_path}")
+
+            if self.hf_token and HF_AVAILABLE and 'speech_to_text' in self.clients:
+                # Use Whisper via HuggingFace Inference API with provider pattern
+                try:
+                    result = self.clients['speech_to_text'].automatic_speech_recognition(
+                        audio_path,
+                        model="openai/whisper-large-v3"
+                    )
+
+                    if isinstance(result, dict) and 'text' in result:
+                        transcription = result['text'].strip()
+                    elif isinstance(result, str):
+                        transcription = result.strip()
+                    else:
+                        transcription = str(result).strip()
+
+                    if transcription:
+                        return f"Transcription: {transcription}"
+                    else:
+                        return "❌ No transcription available"
+
+                except Exception as hf_error:
+                    logger.warning(f"⚠️ HuggingFace speech recognition failed: {hf_error}")
+                    # Fall through to local recognition
+
+            # Fallback to local speech recognition if available
+            if SPEECH_AVAILABLE:
+                try:
+                    r = sr.Recognizer()
+                    with sr.AudioFile(audio_path) as source:
+                        audio = r.record(source)
                     text = r.recognize_google(audio)
                     return f"Transcription: {text}"
-            [… 4 removed lines of the previous implementation, not preserved in this view …]
+                except sr.UnknownValueError:
+                    return "❌ Could not understand audio"
+                except sr.RequestError as e:
+                    return f"❌ Speech recognition error: {e}"
             else:
-                return "❌ Speech recognition unavailable"
+                return "❌ Speech recognition unavailable. Need HuggingFace token or speech_recognition library."

         except Exception as e:
+            logger.error(f"❌ Transcription error: {e}")
             return f"❌ Transcription failed: {e}"

     # === IMAGE GENERATION ===
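Finally, a standalone sketch of the local fallback path the new method keeps, using the speech_recognition package the same way the diff does (the file name is a placeholder; recognize_google calls Google's free web API, so it needs network access):

import speech_recognition as sr

r = sr.Recognizer()
try:
    with sr.AudioFile("sample.wav") as source:  # placeholder path; WAV/AIFF/FLAC only
        audio = r.record(source)                # read the entire file into memory
    print(f"Transcription: {r.recognize_google(audio)}")
except sr.UnknownValueError:
    print("❌ Could not understand audio")
except sr.RequestError as e:
    print(f"❌ Speech recognition error: {e}")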