import base64
import os

import requests
from smolagents import Tool


class ImageAnalysisTool(Tool):
    """Tool that captions an image through the Hugging Face Inference API.

    The image file is read from disk and its raw bytes are POSTed to the
    configured model endpoint. The caption found in the response is returned
    together with the caller's question.

    Note that the question is NOT sent to the model: the configured endpoint
    is treated as a captioning model, so the "answer" is simply the caption
    repeated next to the question text.
    """

    name = "image_analysis"
    description = "Analyze the content of an image and answer a specific question about it using HF Inference API."
    inputs = {
        "image_path": {
            "type": "string",
            "description": "Path to the image file (jpg, png, etc.)",
        },
        "question": {
            "type": "string",
            "description": "A question about the image content",
        },
    }
    output_type = "string"

    def __init__(self):
        """Configure the endpoint URL and the request headers.

        The API token is read from the ``HF_API_TOKEN`` environment variable
        once, at construction time.
        """
        super().__init__()
        # You can replace this with any vision model capable of VQA or image captioning
        self.api_url = "https://api-inference.huggingface.co/models/microsoft/git-base-captioning"

        # Only attach the Authorization header when a token is configured;
        # otherwise the literal string "Bearer None" would be sent.
        token = os.getenv("HF_API_TOKEN")
        self.headers = {"Authorization": f"Bearer {token}"} if token else {}

    def forward(self, image_path: str, question: str) -> str:
        """Caption the image at ``image_path`` and pair it with ``question``.

        Args:
            image_path: Path of the image file to read (opened in binary mode).
            question: Question about the image; it is echoed in the output
                but not transmitted to the model.

        Returns:
            ``"Caption: <caption>\\nAnswer to question '<question>': <caption>"``
            on success, or a string starting with ``"Error"`` describing the
            failure. This method never raises: every failure is reported
            through the returned string.
        """
        try:
            with open(image_path, "rb") as img_file:
                image_bytes = img_file.read()

            # The raw image bytes are the whole request body; no question or
            # other parameters are sent along with them.
            response = requests.post(
                self.api_url,
                headers=self.headers,
                data=image_bytes,
                timeout=60,
            )

            if response.status_code != 200:
                return f"Error analyzing image: {response.status_code} {response.text}"

            result = response.json()

            # The payload may be a single object or a list of objects; in the
            # list case only the first element is inspected.
            candidate = result[0] if isinstance(result, list) and result else result

            caption = None
            # Guard against non-dict payloads (e.g. a list of strings) so they
            # produce the "no caption" error instead of an AttributeError.
            if isinstance(candidate, dict):
                caption = candidate.get("generated_text") or candidate.get("caption")

            if not caption:
                return "Error: No caption found in model response."

            # Simple approach: combine caption + question to produce answer prompt
            # If you want a deeper answer, you could chain a chat model here.
            answer = f"Caption: {caption}\nAnswer to question '{question}': {caption}"
            return answer.strip()
        except Exception as e:
            # Deliberately broad: this is the tool boundary, and callers get
            # every failure (file, network, JSON decoding) as a string.
            return f"Error analyzing image: {e}"