Yago Bolivar
Refactor speech_to_text.py to implement a singleton ASR pipeline, enhance error handling, and introduce SpeechToTextTool for better integration. Update spreadsheet_tool.py to support querying and improve parsing functionality, including CSV support. Enhance video_processing_tool.py with new tasks for metadata extraction and frame extraction, while improving object detection capabilities and initialization checks.
87aa741
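spreadsheet_tool.py and video_processing_tool.py are not shown in this diff. For orientation only, the CSV-aware querying described in the commit message might look roughly like the sketch below; the function names and the pandas-based approach are assumptions for illustration, not the committed code:

# Hypothetical sketch of CSV-aware spreadsheet querying (not the actual
# spreadsheet_tool.py implementation; names are illustrative assumptions).
import os
import pandas as pd

def load_table(filepath: str) -> pd.DataFrame:
    # Route .csv files to the CSV parser; fall back to Excel for other extensions
    ext = os.path.splitext(filepath)[1].lower()
    if ext == ".csv":
        return pd.read_csv(filepath)
    return pd.read_excel(filepath)

def query_table(filepath: str, query: str) -> str:
    # Apply a pandas query expression (e.g., "price > 10") and return the rows as text
    try:
        df = load_table(filepath)
        return df.query(query).to_string(index=False)
    except Exception as e:
        return f"Error querying {filepath}: {e}"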
from transformers import pipeline
import librosa  # Or soundfile
import os
from smolagents.tools import Tool  # Added import
from typing import Optional  # Added for type hinting

# Initialize the ASR pipeline once (module-level singleton)
_asr_pipeline_instance = None


def get_asr_pipeline():
    global _asr_pipeline_instance
    if _asr_pipeline_instance is None:
        try:
            # A smaller Whisper model keeps setup quick; larger models offer better accuracy
            _asr_pipeline_instance = pipeline(
                "automatic-speech-recognition",
                model="openai/whisper-tiny.en",  # Consider making the model configurable
            )
            print("ASR pipeline initialized.")  # For feedback
        except Exception as e:
            print(f"Error initializing ASR pipeline: {e}")
            # Handle the error appropriately, e.g., raise or log; callers receive None
    return _asr_pipeline_instance
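
# Singleton behavior sketch (illustrative, not part of the committed file):
# every caller shares the same pipeline object, so the model loads only once.
#
#   p1 = get_asr_pipeline()
#   p2 = get_asr_pipeline()
#   assert p1 is p2  # the second call returns the cached instance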
# Original transcription function, renamed to be internal
def _transcribe_audio_file(audio_filepath: str, asr_pipeline_instance) -> str:
    """
    Converts speech in an audio file to text using the provided ASR pipeline.

    Args:
        audio_filepath (str): Path to the audio file.
        asr_pipeline_instance: The initialized ASR pipeline.

    Returns:
        str: Transcribed text from the audio or an error message.
    """
    if not asr_pipeline_instance:
        return "Error: ASR pipeline is not available."
    if not os.path.exists(audio_filepath):
        return f"Error: Audio file not found at {audio_filepath}"
    try:
        # Loading through librosa (or your chosen audio library) catches corrupted
        # or unsupported audio formats early.
        y, sr = librosa.load(audio_filepath, sr=None)  # Load with the original sample rate
        if sr != 16000:  # Whisper models expect 16 kHz input
            y = librosa.resample(y, orig_sr=sr, target_sr=16000)
        # Pass the numpy array to the pipeline
        transcription_result = asr_pipeline_instance(
            {"raw": y, "sampling_rate": 16000},
            return_timestamps=False,  # Timestamps disabled for simplicity
        )
        return transcription_result["text"]
    except Exception as e:
        return f"Error during transcription of {audio_filepath}: {e}"
class SpeechToTextTool(Tool):
    """
    Transcribes audio from a given audio file path to text.
    """

    name = "speech_to_text_transcriber"
    description = "Converts speech in an audio file (e.g., .mp3, .wav) to text using speech recognition."
    inputs = {
        "audio_filepath": {
            "type": "string",
            "description": "Path to the audio file to transcribe.",
        }
    }
    # Note: smolagents reads `output_type`; the `outputs` dict is kept as documentation.
    outputs = {
        "transcribed_text": {
            "type": "string",
            "description": "The transcribed text from the audio, or an error message.",
        }
    }
    output_type = "string"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.asr_pipeline = get_asr_pipeline()  # Initialize or reuse the shared pipeline
        self.is_initialized = self.asr_pipeline is not None
    def forward(self, audio_filepath: str) -> str:
        """
        Wrapper for the _transcribe_audio_file function.
        """
        if not self.is_initialized or not self.asr_pipeline:
            return "Error: SpeechToTextTool was not initialized properly (ASR pipeline missing)."
        return _transcribe_audio_file(audio_filepath, self.asr_pipeline)


# Expose the original function name if needed by other parts of the system (optional)
# transcribe_audio = _transcribe_audio_file  # Callers would then need to pass the pipeline themselves
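
# Integration sketch (an assumption, not part of this commit): a tool defined this
# way can typically be handed to a smolagents agent. Model class names vary by
# smolagents version, so treat the lines below as illustrative pseudocode:
#
#   from smolagents import CodeAgent
#   agent = CodeAgent(tools=[SpeechToTextTool()], model=my_model)  # my_model: any smolagents model
#   agent.run("Transcribe the audio at ./data/sample.mp3 and summarize it.")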
# Example usage:
if __name__ == "__main__":
    tool_instance = SpeechToTextTool()

    # Creating a dummy MP3 here would require external dependencies (e.g., ffmpeg
    # for pydub), so this test assumes a sample file already exists and is skipped
    # otherwise. To run it, download a short sample audio file into your project,
    # e.g., at the path below, or in a `test_data` directory of your own.
    test_audio_file = "./data/downloaded_files/1f975693-876d-457b-a649-393859e79bf3.mp3"  # GAIA example
    # test_audio_file_2 = "./data/downloaded_files/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3"  # GAIA example

    if tool_instance.is_initialized:
        if os.path.exists(test_audio_file):
            print(f"Attempting to transcribe: {test_audio_file}")
            transcribed_text = tool_instance.forward(test_audio_file)
            print(f"Transcription:\n{transcribed_text}")
        else:
            print(f"Test audio file not found: {test_audio_file}. Skipping transcription test.")
            print("Please place a sample .mp3 or .wav file at that location for testing.")
        # if os.path.exists(test_audio_file_2):
        #     print(f"\nAttempting to transcribe: {test_audio_file_2}")
        #     transcribed_text_2 = tool_instance.forward(test_audio_file_2)
        #     print(f"Transcription 2:\n{transcribed_text_2}")
        # else:
        #     print(f"Test audio file 2 not found: {test_audio_file_2}. Skipping.")
    else:
        print("SpeechToTextTool could not be initialized (ASR pipeline missing). Transcription test skipped.")

    # Test with a non-existent file (expects an error message, not an exception)
    non_existent_file = "./non_existent_audio.mp3"
    print(f"\nAttempting to transcribe non-existent file: {non_existent_file}")
    error_text = tool_instance.forward(non_existent_file)
    print(f"Result for non-existent file:\n{error_text}")
    assert "Error:" in error_text  # Expect an error message