from transformers import pipeline
import librosa  # Or soundfile
import os
from smolagents.tools import Tool

# Initialize the ASR pipeline once
_asr_pipeline_instance = None


def get_asr_pipeline():
    global _asr_pipeline_instance
    if _asr_pipeline_instance is None:
        try:
            # Using a smaller Whisper model for quicker setup, but larger models offer better accuracy
            _asr_pipeline_instance = pipeline(
                "automatic-speech-recognition",
                model="openai/whisper-tiny.en",  # Consider making model configurable
            )
            print("ASR pipeline initialized.")  # For feedback
        except Exception as e:
            print(f"Error initializing ASR pipeline: {e}")
            # Handle error appropriately, e.g., raise or log
    return _asr_pipeline_instance
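

# The comment above suggests making the model configurable. Below is a minimal
# sketch of how that could look, kept separate from the singleton the tool
# relies on. `get_asr_pipeline_for_model` and `_asr_pipelines_by_model` are
# illustrative names introduced here, not part of the existing interface.
_asr_pipelines_by_model = {}


def get_asr_pipeline_for_model(model_name: str = "openai/whisper-tiny.en"):
    """Sketch: cache one ASR pipeline per requested Whisper checkpoint."""
    if model_name not in _asr_pipelines_by_model:
        _asr_pipelines_by_model[model_name] = pipeline(
            "automatic-speech-recognition", model=model_name
        )
    return _asr_pipelines_by_model[model_name]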


# Original transcription function, renamed to be internal
def _transcribe_audio_file(audio_filepath: str, asr_pipeline_instance) -> str:
    """
    Converts speech in an audio file to text using the provided ASR pipeline.

    Args:
        audio_filepath (str): Path to the audio file.
        asr_pipeline_instance: The initialized ASR pipeline.

    Returns:
        str: Transcribed text from the audio or an error message.
    """
    if not asr_pipeline_instance:
        return "Error: ASR pipeline is not available."
    if not os.path.exists(audio_filepath):
        return f"Error: Audio file not found at {audio_filepath}"
    try:
        # Ensure the file can be loaded by librosa (or your chosen audio library).
        # This step can help catch corrupted or unsupported audio formats early.
        y, sr = librosa.load(audio_filepath, sr=None)  # Load with original sample rate
        if sr != 16000:  # Whisper models expect 16 kHz
            y = librosa.resample(y, orig_sr=sr, target_sr=16000)
        # Pass the numpy array to the pipeline
        transcription_result = asr_pipeline_instance(
            {"raw": y, "sampling_rate": 16000},
            return_timestamps=False,  # Timestamps disabled for simplicity
        )
        return transcription_result["text"]
    except Exception as e:
        return f"Error during transcription of {audio_filepath}: {e}"


class SpeechToTextTool(Tool):
    """
    Transcribes audio from a given audio file path to text.
    """

    name = "speech_to_text_transcriber"
    description = "Converts speech in an audio file (e.g., .mp3, .wav) to text using speech recognition."
    inputs = {
        "audio_filepath": {"type": "string", "description": "Path to the audio file to transcribe."}
    }
    outputs = {
        "transcribed_text": {
            "type": "string",
            "description": "The transcribed text from the audio, or an error message.",
        }
    }
    output_type = "string"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.asr_pipeline = get_asr_pipeline()  # Initialize or get the shared pipeline
        self.is_initialized = self.asr_pipeline is not None

    def forward(self, audio_filepath: str) -> str:
        """
        Wrapper for the _transcribe_audio_file function.
        """
        if not self.is_initialized or not self.asr_pipeline:
            return "Error: SpeechToTextTool was not initialized properly (ASR pipeline missing)."
        return _transcribe_audio_file(audio_filepath, self.asr_pipeline)


# Expose the original function name if needed by other parts of the system (optional)
# transcribe_audio = _transcribe_audio_file # This would need adjustment if it expects the pipeline passed in
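
# Illustrative integration sketch (not executed here): how the tool might be
# registered with a smolagents agent elsewhere in the project. The agent and
# model class names below are assumptions and vary across smolagents versions.
#
#   from smolagents import CodeAgent, InferenceClientModel
#
#   agent = CodeAgent(tools=[SpeechToTextTool()], model=InferenceClientModel())
#   agent.run("Transcribe ./data/downloaded_files/<task_id>.mp3 and summarize it.")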


# Example usage:
if __name__ == "__main__":
    tool_instance = SpeechToTextTool()

    # A short sample audio file is needed for a real transcription test. Creating
    # one on the fly would pull in extra dependencies (e.g., pydub plus ffmpeg),
    # so the script looks for an existing file and skips the test if it is
    # missing (an optional fallback sketch below can synthesize a test tone).
    # To test locally, place a short .mp3 or .wav somewhere and point
    # test_audio_file at it, e.g., `test_data/sample.mp3`.
    test_audio_file = "./data/downloaded_files/1f975693-876d-457b-a649-393859e79bf3.mp3"  # GAIA example
    # test_audio_file_2 = "./data/downloaded_files/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3"  # GAIA example
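
    # Optional fallback (a sketch, not part of the original test flow): if the
    # GAIA sample is absent, synthesize a one-second 440 Hz tone so the full
    # pipeline can still be exercised. Assumes the `soundfile` package (installed
    # alongside librosa) is available; the tone will not produce meaningful text.
    if not os.path.exists(test_audio_file):
        try:
            import numpy as np
            import soundfile as sf

            fallback_audio_file = "./test_data/generated_tone.wav"  # hypothetical path
            os.makedirs(os.path.dirname(fallback_audio_file), exist_ok=True)
            tone = 0.1 * np.sin(2 * np.pi * 440.0 * np.arange(16000) / 16000.0)
            sf.write(fallback_audio_file, tone, 16000)
            test_audio_file = fallback_audio_file
        except ImportError:
            pass  # Fall through to the "file not found" branch below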

    if tool_instance.is_initialized:
        if os.path.exists(test_audio_file):
            print(f"Attempting to transcribe: {test_audio_file}")
            transcribed_text = tool_instance.forward(test_audio_file)
            print(f"Transcription:\n{transcribed_text}")
        else:
            print(
                f"Test audio file not found: {test_audio_file}. Skipping transcription test."
            )
            print("Please place a sample .mp3 or .wav file at that location for testing.")

        # if os.path.exists(test_audio_file_2):
        #     print(f"\nAttempting to transcribe: {test_audio_file_2}")
        #     transcribed_text_2 = tool_instance.forward(test_audio_file_2)
        #     print(f"Transcription 2:\n{transcribed_text_2}")
        # else:
        #     print(f"Test audio file 2 not found: {test_audio_file_2}. Skipping.")
    else:
        print(
            "SpeechToTextTool could not be initialized (ASR pipeline missing). Transcription test skipped."
        )

    # Test with a non-existent file
    non_existent_file = "./non_existent_audio.mp3"
    print(f"\nAttempting to transcribe non-existent file: {non_existent_file}")
    error_text = tool_instance.forward(non_existent_file)
    print(f"Result for non-existent file:\n{error_text}")
    assert "Error:" in error_text  # Expect an error message