import os
import base64
import requests

from smolagents import Tool


class ImageAnalysisTool(Tool):
    name = "image_analysis"
    description = (
        "Analyze the content of an image and answer a specific question about it "
        "using the Hugging Face Inference API."
    )
    inputs = {
        "image_path": {
            "type": "string",
            "description": "Path to the image file (jpg, png, etc.)"
        },
        "question": {
            "type": "string",
            "description": "A question about the image content"
        }
    }
    output_type = "string"

    def __init__(self):
        super().__init__()
        api_token = os.getenv("HF_API_TOKEN")
        if not api_token:
            raise EnvironmentError("HF_API_TOKEN not found in environment variables.")
        self.api_url = "https://api-inference.huggingface.co/models/microsoft/git-base-captioning"
        self.headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json"
        }

    def forward(self, image_path: str, question: str) -> str:
        try:
            with open(image_path, "rb") as img_file:
                image_bytes = img_file.read()

            # Encode the image as a base64 string
            img_b64 = base64.b64encode(image_bytes).decode("utf-8")

            # Prepare the JSON payload - the exact structure depends on the model's capabilities.
            # Here we send just the image for captioning.
            payload = {
                "inputs": img_b64
            }

            response = requests.post(
                self.api_url,
                headers=self.headers,
                json=payload,
                timeout=60
            )

            if response.status_code == 200:
                result = response.json()
                caption = None

                # Try common keys for the caption output
                if isinstance(result, dict):
                    caption = result.get("generated_text") or result.get("caption") or result.get("text")
                elif isinstance(result, list) and len(result) > 0 and isinstance(result[0], dict):
                    caption = result[0].get("generated_text") or result[0].get("caption") or result[0].get("text")

                if not caption:
                    return "Error: No caption found in model response."

                # Combine the caption with the question to form a simple answer
                answer = f"Caption: {caption}\nAnswer to question '{question}': {caption}"
                return answer.strip()
            else:
                return f"Error analyzing image: {response.status_code} {response.text}"
        except Exception as e:
            return f"Error analyzing image: {e}"
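

# Minimal usage sketch (not part of the original tool): it assumes HF_API_TOKEN is set
# in the environment and that "example.jpg" is a local image file; both names are
# illustrative placeholders.
if __name__ == "__main__":
    tool = ImageAnalysisTool()
    # forward() returns the model caption plus a simple answer string, or an error message.
    print(tool.forward("example.jpg", "What objects are visible in the image?"))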