Omachoko committed
Commit c7cc357 · 1 Parent(s): a1492aa

✅ Update speech recognition to use HuggingFace provider pattern


🎙️ Updated the transcribe_speech method to use the new InferenceClient pattern:
- provider='hf-inference' with api_key=HF_TOKEN
- Explicit model='openai/whisper-large-v3' parameter
- Enhanced error handling and logging
- Graceful fallback to local speech recognition
🔧 Following the user's exact specification for the HF Inference API integration
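
For reference, a minimal sketch of the call pattern these bullets describe, using the provider-style client from the diff below. The HF_TOKEN environment variable and sample.wav path are placeholders, and the dict/attribute handling mirrors the defensive parsing in transcribe_speech:

import os
from huggingface_hub import InferenceClient

# Provider-style client: the model is no longer pinned at construction time;
# it is passed per request instead.
client = InferenceClient(provider="hf-inference", api_key=os.environ.get("HF_TOKEN"))

# automatic_speech_recognition accepts a local file path (or raw bytes / a URL).
result = client.automatic_speech_recognition(
    "sample.wav",  # placeholder audio file
    model="openai/whisper-large-v3",
)

# Depending on the huggingface_hub version, the result may be a dict or a
# dataclass with a .text attribute, so handle both, as the updated method does.
text = result["text"] if isinstance(result, dict) else getattr(result, "text", str(result))
print(text)

Passing the model per request rather than at client construction lets one client serve multiple models, which is why the diff drops model= from the speech_to_text constructor.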

Files changed (1)
  1. gaia_system.py +44 -20
gaia_system.py CHANGED
@@ -183,8 +183,11 @@ class UniversalMultimodalToolkit:
         self.clients['image_gen'] = InferenceClient(model="stabilityai/stable-diffusion-xl-base-1.0", token=self.hf_token)
         self.clients['object_detection'] = InferenceClient(model="facebook/detr-resnet-50", token=self.hf_token)
 
-        # Audio models
-        self.clients['speech_to_text'] = InferenceClient(model="openai/whisper-large-v3", token=self.hf_token)
+        # Audio models - Updated to use provider pattern for speech recognition
+        self.clients['speech_to_text'] = InferenceClient(
+            provider="hf-inference",
+            api_key=self.hf_token,
+        )
         self.clients['audio_classification'] = InferenceClient(model="facebook/wav2vec2-base-960h", token=self.hf_token)
 
         # Text generation for multimodal
@@ -311,30 +314,51 @@ class UniversalMultimodalToolkit:
             return f"❌ Audio analysis failed: {e}"
 
     def transcribe_speech(self, audio_path: str) -> str:
-        """🎙️ Convert speech to text"""
+        """🎙️ Convert speech to text using Whisper via HuggingFace Inference API"""
         try:
-            if self.hf_token and 'speech_to_text' in self.clients:
-                # Use Whisper via HuggingFace
-                with open(audio_path, 'rb') as audio_file:
-                    result = self.clients['speech_to_text'].automatic_speech_recognition(audio_file.read())
-                return f"Transcription: {result.get('text', 'No transcription available')}"
-
-            elif SPEECH_AVAILABLE:
-                # Use local speech recognition
-                r = sr.Recognizer()
-                with sr.AudioFile(audio_path) as source:
-                    audio = r.record(source)
-                try:
+            logger.info(f"🎙️ Transcribing speech from: {audio_path}")
+
+            if self.hf_token and HF_AVAILABLE and 'speech_to_text' in self.clients:
+                # Use Whisper via HuggingFace Inference API with provider pattern
+                try:
+                    result = self.clients['speech_to_text'].automatic_speech_recognition(
+                        audio_path,
+                        model="openai/whisper-large-v3"
+                    )
+
+                    if isinstance(result, dict) and 'text' in result:
+                        transcription = result['text'].strip()
+                    elif isinstance(result, str):
+                        transcription = result.strip()
+                    else:
+                        transcription = str(result).strip()
+
+                    if transcription:
+                        return f"Transcription: {transcription}"
+                    else:
+                        return "❌ No transcription available"
+
+                except Exception as hf_error:
+                    logger.warning(f"⚠️ HuggingFace speech recognition failed: {hf_error}")
+                    # Fall through to local recognition
+
+            # Fallback to local speech recognition if available
+            if SPEECH_AVAILABLE:
+                try:
+                    r = sr.Recognizer()
+                    with sr.AudioFile(audio_path) as source:
+                        audio = r.record(source)
                     text = r.recognize_google(audio)
                     return f"Transcription: {text}"
-                except sr.UnknownValueError:
-                    return "❌ Could not understand audio"
-                except sr.RequestError as e:
-                    return f"❌ Speech recognition error: {e}"
+                except sr.UnknownValueError:
+                    return "❌ Could not understand audio"
+                except sr.RequestError as e:
+                    return f"❌ Speech recognition error: {e}"
             else:
-                return "❌ Speech recognition unavailable"
+                return "❌ Speech recognition unavailable. Need HuggingFace token or speech_recognition library."
 
         except Exception as e:
+            logger.error(f"❌ Transcription error: {e}")
             return f"❌ Transcription failed: {e}"
 
     # === IMAGE GENERATION ===