Omachoko committed
Commit c7cc357 · 1 Parent(s): a1492aa

✅ Update speech recognition to use HuggingFace provider pattern


🎙️ Updated the transcribe_speech method to use the new InferenceClient pattern:
- provider='hf-inference' with api_key=HF_TOKEN
- Explicit model='openai/whisper-large-v3' parameter
- Enhanced error handling and logging
- Graceful fallback to local speech recognition
🔧 Following the user's exact specification for the HF Inference API integration
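
For reference, a minimal sketch of the call pattern these bullets describe, using the provider-style client from the diff below. The HF_TOKEN environment variable and sample.wav path are placeholders, and the dict/attribute handling mirrors the defensive parsing in transcribe_speech:

import os
from huggingface_hub import InferenceClient

# Provider-style client: the model is no longer pinned at construction time;
# it is passed per request instead.
client = InferenceClient(provider="hf-inference", api_key=os.environ.get("HF_TOKEN"))

# automatic_speech_recognition accepts a local file path (or raw bytes / a URL).
result = client.automatic_speech_recognition(
    "sample.wav",  # placeholder audio file
    model="openai/whisper-large-v3",
)

# Depending on the huggingface_hub version, the result may be a dict or a
# dataclass with a .text attribute, so handle both, as the updated method does.
text = result["text"] if isinstance(result, dict) else getattr(result, "text", str(result))
print(text)

Passing the model per request rather than at client construction lets one client serve multiple models, which is why the diff drops model= from the speech_to_text constructor.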

Files changed (1)
  1. gaia_system.py +44 -20
gaia_system.py CHANGED
@@ -183,8 +183,11 @@ class UniversalMultimodalToolkit:
         self.clients['image_gen'] = InferenceClient(model="stabilityai/stable-diffusion-xl-base-1.0", token=self.hf_token)
         self.clients['object_detection'] = InferenceClient(model="facebook/detr-resnet-50", token=self.hf_token)
 
-        # Audio models
-        self.clients['speech_to_text'] = InferenceClient(model="openai/whisper-large-v3", token=self.hf_token)
+        # Audio models - Updated to use provider pattern for speech recognition
+        self.clients['speech_to_text'] = InferenceClient(
+            provider="hf-inference",
+            api_key=self.hf_token,
+        )
         self.clients['audio_classification'] = InferenceClient(model="facebook/wav2vec2-base-960h", token=self.hf_token)
 
         # Text generation for multimodal
@@ -311,30 +314,51 @@ class UniversalMultimodalToolkit:
             return f"❌ Audio analysis failed: {e}"
 
     def transcribe_speech(self, audio_path: str) -> str:
-        """🎙️ Convert speech to text"""
+        """🎙️ Convert speech to text using Whisper via HuggingFace Inference API"""
         try:
-            if self.hf_token and 'speech_to_text' in self.clients:
-                # Use Whisper via HuggingFace
-                with open(audio_path, 'rb') as audio_file:
-                    result = self.clients['speech_to_text'].automatic_speech_recognition(audio_file.read())
-                return f"Transcription: {result.get('text', 'No transcription available')}"
-
-            elif SPEECH_AVAILABLE:
-                # Use local speech recognition
-                r = sr.Recognizer()
-                with sr.AudioFile(audio_path) as source:
-                    audio = r.record(source)
-                try:
+            logger.info(f"🎙️ Transcribing speech from: {audio_path}")
+
+            if self.hf_token and HF_AVAILABLE and 'speech_to_text' in self.clients:
+                # Use Whisper via HuggingFace Inference API with provider pattern
+                try:
+                    result = self.clients['speech_to_text'].automatic_speech_recognition(
+                        audio_path,
+                        model="openai/whisper-large-v3"
+                    )
+
+                    if isinstance(result, dict) and 'text' in result:
+                        transcription = result['text'].strip()
+                    elif isinstance(result, str):
+                        transcription = result.strip()
+                    else:
+                        transcription = str(result).strip()
+
+                    if transcription:
+                        return f"Transcription: {transcription}"
+                    else:
+                        return "❌ No transcription available"
+
+                except Exception as hf_error:
+                    logger.warning(f"⚠️ HuggingFace speech recognition failed: {hf_error}")
+                    # Fall through to local recognition
+
+            # Fallback to local speech recognition if available
+            if SPEECH_AVAILABLE:
+                try:
+                    r = sr.Recognizer()
+                    with sr.AudioFile(audio_path) as source:
+                        audio = r.record(source)
                     text = r.recognize_google(audio)
                     return f"Transcription: {text}"
-                except sr.UnknownValueError:
-                    return "❌ Could not understand audio"
-                except sr.RequestError as e:
-                    return f"❌ Speech recognition error: {e}"
+                except sr.UnknownValueError:
+                    return "❌ Could not understand audio"
+                except sr.RequestError as e:
+                    return f"❌ Speech recognition error: {e}"
             else:
-                return "❌ Speech recognition unavailable"
+                return "❌ Speech recognition unavailable. Need HuggingFace token or speech_recognition library."
 
         except Exception as e:
+            logger.error(f"❌ Transcription error: {e}")
             return f"❌ Transcription failed: {e}"
 
     # === IMAGE GENERATION ===