from transformers import pipeline
import librosa  # Or soundfile
import os
from smolagents.tools import Tool  # Added import
from typing import Optional  # Added for type hinting

# Default Whisper checkpoint. Smaller models set up quicker; larger models
# offer better accuracy. Override via get_asr_pipeline(model_name=...).
_DEFAULT_ASR_MODEL = "openai/whisper-tiny.en"

# Module-level singleton so the (expensive) model load happens at most once.
_asr_pipeline_instance = None


def get_asr_pipeline(model_name: str = _DEFAULT_ASR_MODEL):
    """Return the shared ASR pipeline, creating it on first use.

    Args:
        model_name (str): Hugging Face model id loaded on the first call.
            Ignored on subsequent calls once the singleton exists.

    Returns:
        The initialized ASR pipeline, or ``None`` if initialization failed.
    """
    global _asr_pipeline_instance
    if _asr_pipeline_instance is None:
        try:
            _asr_pipeline_instance = pipeline(
                "automatic-speech-recognition",
                model=model_name,
            )
            print("ASR pipeline initialized.")  # For feedback
        except Exception as e:
            # Best-effort: report the failure and leave the singleton as None
            # so callers surface a readable error message instead of crashing.
            print(f"Error initializing ASR pipeline: {e}")
    return _asr_pipeline_instance


# Original transcription function, renamed to be internal
def _transcribe_audio_file(audio_filepath: str, asr_pipeline_instance) -> str:
    """
    Converts speech in an audio file to text using the provided ASR pipeline.

    Args:
        audio_filepath (str): Path to the audio file.
        asr_pipeline_instance: The initialized ASR pipeline.

    Returns:
        str: Transcribed text from the audio or an error message.
    """
    if not asr_pipeline_instance:
        return "Error: ASR pipeline is not available."
    if not os.path.exists(audio_filepath):
        return f"Error: Audio file not found at {audio_filepath}"
    try:
        # Ensure the file can be loaded by librosa (or your chosen audio
        # library); this catches corrupted or unsupported formats early.
        y, sr = librosa.load(audio_filepath, sr=None)  # Load with original sample rate
        if sr != 16000:  # Whisper models expect 16kHz
            y = librosa.resample(y, orig_sr=sr, target_sr=16000)
        # Pass the numpy array to the pipeline.
        transcription_result = asr_pipeline_instance(
            {"raw": y, "sampling_rate": 16000}, return_timestamps=False
        )
        return transcription_result["text"]
    except Exception as e:
        return f"Error during transcription of {audio_filepath}: {e}"


class SpeechToTextTool(Tool):
    """
    Transcribes audio from a given audio file path to text.
    """

    name = "speech_to_text_transcriber"
    description = "Converts speech in an audio file (e.g., .mp3, .wav) to text using speech recognition."
    inputs = {
        "audio_filepath": {"type": "string", "description": "Path to the audio file to transcribe."}
    }
    outputs = {
        "transcribed_text": {
            "type": "string",
            "description": "The transcribed text from the audio, or an error message.",
        }
    }
    output_type = "string"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Initialize or get the shared pipeline; may be None on failure.
        self.asr_pipeline = get_asr_pipeline()
        self.is_initialized = self.asr_pipeline is not None

    def forward(self, audio_filepath: str) -> str:
        """
        Wrapper for the _transcribe_audio_file function.

        Args:
            audio_filepath (str): Path to the audio file to transcribe.

        Returns:
            str: The transcription, or an error message string on failure.
        """
        if not self.is_initialized or not self.asr_pipeline:
            return "Error: SpeechToTextTool was not initialized properly (ASR pipeline missing)."
        return _transcribe_audio_file(audio_filepath, self.asr_pipeline)


# Expose the original function name if needed by other parts of the system (optional)
# transcribe_audio = _transcribe_audio_file  # Would need adjustment: expects the pipeline passed in


# Example usage:
if __name__ == "__main__":
    tool_instance = SpeechToTextTool()

    # Creating a dummy audio file here would require external dependencies
    # (e.g. ffmpeg via pydub), so we assume a sample file exists or skip.
    # Path to a test audio file (replace with an actual .mp3 or .wav file).
    test_audio_file = "./data/downloaded_files/1f975693-876d-457b-a649-393859e79bf3.mp3"  # GAIA example
    # test_audio_file_2 = "./data/downloaded_files/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3"  # GAIA example

    if tool_instance.is_initialized:
        if os.path.exists(test_audio_file):
            print(f"Attempting to transcribe: {test_audio_file}")
            transcribed_text = tool_instance.forward(test_audio_file)
            print(f"Transcription:\n{transcribed_text}")
        else:
            print(
                f"Test audio file not found: {test_audio_file}. Skipping transcription test."
            )
            print("Please place a sample .mp3 or .wav file at that location for testing.")

        # if os.path.exists(test_audio_file_2):
        #     print(f"\nAttempting to transcribe: {test_audio_file_2}")
        #     transcribed_text_2 = tool_instance.forward(test_audio_file_2)
        #     print(f"Transcription 2:\n{transcribed_text_2}")
        # else:
        #     print(f"Test audio file 2 not found: {test_audio_file_2}. Skipping.")
    else:
        print(
            "SpeechToTextTool could not be initialized (ASR pipeline missing). Transcription test skipped."
        )

    # Test with a non-existent file: forward() should return an error string,
    # whether the failure is the missing file or a missing ASR pipeline.
    non_existent_file = "./non_existent_audio.mp3"
    print(f"\nAttempting to transcribe non-existent file: {non_existent_file}")
    error_text = tool_instance.forward(non_existent_file)
    print(f"Result for non-existent file:\n{error_text}")
    assert "Error:" in error_text  # Expect an error message