import os

import requests
from smolagents import Tool


class AudioTranscriptionTool(Tool):
    """Agent tool that transcribes an audio file through a hosted Whisper model.

    The file's raw bytes are POSTed to the Hugging Face Inference API endpoint
    stored in ``self.api_url``. The API token is read from the
    ``HF_API_TOKEN`` environment variable when the tool is constructed.
    """

    name = "audio_transcriber"
    description = "Transcribe a given audio file in mp3 or wav format to text using Whisper via Hugging Face API."
    inputs = {
        "file_path": {
            "type": "string",
            "description": "Path to the audio file (must be .mp3 or .wav)",
        }
    }
    output_type = "string"

    def __init__(self):
        """Set up the endpoint URL and the request headers."""
        super().__init__()
        self.api_url = "https://api-inference.huggingface.co/models/openai/whisper-large"
        # Only attach credentials when a token is actually configured;
        # formatting a missing token would send the literal "Bearer None".
        token = os.getenv("HF_API_TOKEN")
        self.headers = {"Authorization": f"Bearer {token}"} if token else {}

    def forward(self, file_path: str) -> str:
        """Transcribe the audio file at *file_path*.

        Args:
            file_path: Path to the audio file to upload.

        Returns:
            The stripped transcription text on success. On any failure
            (unreadable file, network error, non-200 status, response
            without a ``text`` field) a human-readable string starting with
            ``"Error"`` is returned instead of raising.
        """
        # Failures are reported to the caller as strings rather than raised,
        # so the broad catch at this boundary is deliberate.
        try:
            with open(file_path, "rb") as audio_file:
                audio_bytes = audio_file.read()

            response = requests.post(
                self.api_url,
                headers=self.headers,
                data=audio_bytes,
                timeout=60,
            )
            if response.status_code != 200:
                return f"Error transcribing audio: {response.status_code} {response.text}"

            result = response.json()
            # The exact key depends on the model; usually 'text' for whisper.
            # Guard against a non-object JSON body (e.g. a list), which has
            # no .get() and would otherwise surface as an AttributeError.
            transcription = result.get("text") if isinstance(result, dict) else None
            if not transcription:
                return "Error: No transcription found in the response."
            return transcription.strip()
        except Exception as e:
            return f"Error transcribing audio: {e}"