Yago Bolivar
Refactor speech_to_text.py to implement a singleton ASR pipeline, enhance error handling, and introduce SpeechToTextTool for better integration. Update spreadsheet_tool.py to support querying and improve parsing functionality, including CSV support. Enhance video_processing_tool.py with new tasks for metadata extraction and frame extraction, while improving object detection capabilities and initialization checks.
87aa741
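spreadsheet_tool.py and video_processing_tool.py are not shown in this diff. For orientation only, the CSV-aware querying described in the commit message might look roughly like the sketch below; the function names and the pandas-based approach are assumptions for illustration, not the committed code:

# Hypothetical sketch of CSV-aware spreadsheet querying (not the actual
# spreadsheet_tool.py implementation; names are illustrative assumptions).
import os
import pandas as pd

def load_table(filepath: str) -> pd.DataFrame:
    # Route .csv files to the CSV parser; fall back to Excel for other extensions
    ext = os.path.splitext(filepath)[1].lower()
    if ext == ".csv":
        return pd.read_csv(filepath)
    return pd.read_excel(filepath)

def query_table(filepath: str, query: str) -> str:
    # Apply a pandas query expression (e.g., "price > 10") and return the rows as text
    try:
        df = load_table(filepath)
        return df.query(query).to_string(index=False)
    except Exception as e:
        return f"Error querying {filepath}: {e}"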
from transformers import pipeline
import librosa  # Or soundfile
import os
from smolagents.tools import Tool  # Added import
from typing import Optional  # Added for type hinting

# Initialize the ASR pipeline once (module-level singleton)
_asr_pipeline_instance = None


def get_asr_pipeline():
    global _asr_pipeline_instance
    if _asr_pipeline_instance is None:
        try:
            # A smaller Whisper model keeps setup quick; larger models offer better accuracy
            _asr_pipeline_instance = pipeline(
                "automatic-speech-recognition",
                model="openai/whisper-tiny.en",  # Consider making the model configurable
            )
            print("ASR pipeline initialized.")  # For feedback
        except Exception as e:
            print(f"Error initializing ASR pipeline: {e}")
            # Handle the error appropriately, e.g., raise or log; callers receive None
    return _asr_pipeline_instance
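
# Singleton behavior sketch (illustrative, not part of the committed file):
# every caller shares the same pipeline object, so the model loads only once.
#
#   p1 = get_asr_pipeline()
#   p2 = get_asr_pipeline()
#   assert p1 is p2  # the second call returns the cached instance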
# Original transcription function, renamed to be internal
def _transcribe_audio_file(audio_filepath: str, asr_pipeline_instance) -> str:
    """
    Converts speech in an audio file to text using the provided ASR pipeline.

    Args:
        audio_filepath (str): Path to the audio file.
        asr_pipeline_instance: The initialized ASR pipeline.

    Returns:
        str: Transcribed text from the audio or an error message.
    """
    if not asr_pipeline_instance:
        return "Error: ASR pipeline is not available."
    if not os.path.exists(audio_filepath):
        return f"Error: Audio file not found at {audio_filepath}"
    try:
        # Loading through librosa (or your chosen audio library) catches corrupted
        # or unsupported audio formats early.
        y, sr = librosa.load(audio_filepath, sr=None)  # Load with the original sample rate
        if sr != 16000:  # Whisper models expect 16 kHz input
            y = librosa.resample(y, orig_sr=sr, target_sr=16000)
        # Pass the numpy array to the pipeline
        transcription_result = asr_pipeline_instance(
            {"raw": y, "sampling_rate": 16000},
            return_timestamps=False,  # Timestamps disabled for simplicity
        )
        return transcription_result["text"]
    except Exception as e:
        return f"Error during transcription of {audio_filepath}: {e}"
class SpeechToTextTool(Tool):
    """
    Transcribes audio from a given audio file path to text.
    """

    name = "speech_to_text_transcriber"
    description = "Converts speech in an audio file (e.g., .mp3, .wav) to text using speech recognition."
    inputs = {
        "audio_filepath": {
            "type": "string",
            "description": "Path to the audio file to transcribe.",
        }
    }
    # Note: smolagents reads `output_type`; the `outputs` dict is kept as documentation.
    outputs = {
        "transcribed_text": {
            "type": "string",
            "description": "The transcribed text from the audio, or an error message.",
        }
    }
    output_type = "string"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.asr_pipeline = get_asr_pipeline()  # Initialize or reuse the shared pipeline
        self.is_initialized = self.asr_pipeline is not None
    def forward(self, audio_filepath: str) -> str:
        """
        Wrapper for the _transcribe_audio_file function.
        """
        if not self.is_initialized or not self.asr_pipeline:
            return "Error: SpeechToTextTool was not initialized properly (ASR pipeline missing)."
        return _transcribe_audio_file(audio_filepath, self.asr_pipeline)


# Expose the original function name if needed by other parts of the system (optional)
# transcribe_audio = _transcribe_audio_file  # Callers would then need to pass the pipeline themselves
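
# Integration sketch (an assumption, not part of this commit): a tool defined this
# way can typically be handed to a smolagents agent. Model class names vary by
# smolagents version, so treat the lines below as illustrative pseudocode:
#
#   from smolagents import CodeAgent
#   agent = CodeAgent(tools=[SpeechToTextTool()], model=my_model)  # my_model: any smolagents model
#   agent.run("Transcribe the audio at ./data/sample.mp3 and summarize it.")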
# Example usage:
if __name__ == "__main__":
    tool_instance = SpeechToTextTool()

    # Creating a dummy MP3 here would require external dependencies (e.g., ffmpeg
    # for pydub), so this test assumes a sample file already exists and is skipped
    # otherwise. To run it, download a short sample audio file into your project,
    # e.g., at the path below, or in a `test_data` directory of your own.
    test_audio_file = "./data/downloaded_files/1f975693-876d-457b-a649-393859e79bf3.mp3"  # GAIA example
    # test_audio_file_2 = "./data/downloaded_files/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3"  # GAIA example

    if tool_instance.is_initialized:
        if os.path.exists(test_audio_file):
            print(f"Attempting to transcribe: {test_audio_file}")
            transcribed_text = tool_instance.forward(test_audio_file)
            print(f"Transcription:\n{transcribed_text}")
        else:
            print(f"Test audio file not found: {test_audio_file}. Skipping transcription test.")
            print("Please place a sample .mp3 or .wav file at that location for testing.")
        # if os.path.exists(test_audio_file_2):
        #     print(f"\nAttempting to transcribe: {test_audio_file_2}")
        #     transcribed_text_2 = tool_instance.forward(test_audio_file_2)
        #     print(f"Transcription 2:\n{transcribed_text_2}")
        # else:
        #     print(f"Test audio file 2 not found: {test_audio_file_2}. Skipping.")
    else:
        print("SpeechToTextTool could not be initialized (ASR pipeline missing). Transcription test skipped.")

    # Test with a non-existent file (expects an error message, not an exception)
    non_existent_file = "./non_existent_audio.mp3"
    print(f"\nAttempting to transcribe non-existent file: {non_existent_file}")
    error_text = tool_instance.forward(non_existent_file)
    print(f"Result for non-existent file:\n{error_text}")
    assert "Error:" in error_text  # Expect an error message