File size: 4,206 Bytes
a225ae4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
import base64
import mimetypes
import os

from openai import OpenAI
from smolagents import Tool
# Module-level OpenAI client, created once at import time and used by
# DescribeImageTool.forward for every chat-completions request.
# NOTE(review): OpenAI() is called with no arguments, so credentials are
# presumably read from the environment (e.g. OPENAI_API_KEY) — confirm.
client = OpenAI()
class DescribeImageTool(Tool):
    """
    Tool that sends a local image to the OpenAI chat-completions API
    (model "gpt-4.1") and returns the model's textual description.

    Args:
        image_path (str): Path to the image file.
        description_type (str): Type of description to generate. Matching is
            case-insensitive and ignores surrounding whitespace. Options:
            - "general": General description of the image
            - "detailed": Detailed analysis of the image
            - "chess": Analysis of a chess position
            - "text": Extract and describe text from the image
            - "custom": Custom description based on user prompt
            A missing or unrecognised value is treated as "general".
        custom_prompt (str): Prompt sent verbatim when description_type is
            "custom". If it is missing or blank, the "general" prompt is
            used instead.

    Returns:
        str: Description of the image based on the requested type, or a
        message starting with "Error" if the file is missing or the request
        fails. Exceptions are never propagated to the caller.
    """

    name = "describe_image"
    description = "Analyzes and describes images using GPT-4 Vision API"
    inputs = {
        "image_path": {"type": "string", "description": "Path to the image file"},
        "description_type": {
            "type": "string",
            "description": "Type of description to generate (general, detailed, chess, text, custom)",
            "nullable": True,
        },
        "custom_prompt": {
            "type": "string",
            "description": "Custom prompt for description (only used when description_type is 'custom')",
            "nullable": True,
        },
    }
    output_type = "string"

    def encode_image(self, image_path: str) -> str:
        """Read the file at ``image_path`` and return its bytes as base64 text.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def get_prompt(self, description_type: str, custom_prompt: str = None) -> str:
        """Return the instruction text to send alongside the image.

        Args:
            description_type: One of "general", "detailed", "chess", "text"
                or "custom" (case-insensitive). ``None`` and unknown values
                select the "general" prompt.
            custom_prompt: Returned unchanged when ``description_type`` is
                "custom" and the prompt is not blank.

        Returns:
            The prompt string; always a ``str``, never ``None``.
        """
        prompts = {
            "general": "Provide a general description of this image. Focus on the main subjects, colors, and overall scene.",
            "detailed": (
                "Analyze this image in detail. Include:\n"
                "1. Main subjects and their relationships\n"
                "2. Colors, lighting, and composition\n"
                "3. Any text or symbols present\n"
                "4. Context or possible meaning\n"
                "5. Notable details or interesting elements"
            ),
            "chess": (
                "Analyze this chess position and provide a detailed description including:\n"
                "1. List of pieces on the board for both white and black\n"
                "2. Whose turn it is to move\n"
                "3. Basic evaluation of the position\n"
                "4. Any immediate tactical opportunities or threats\n"
                "5. Suggested next moves with brief explanations"
            ),
            "text": "Extract and describe any text present in this image. If there are multiple pieces of text, organize them clearly.",
        }

        # The inputs schema marks description_type as nullable, so tolerate
        # None as well as stray casing/whitespace from the caller.
        kind = (description_type or "general").strip().lower()

        # Only honour "custom" when there is actually a prompt to send;
        # otherwise "custom" is not a key in `prompts` and the lookup below
        # falls through to the general prompt instead of returning None.
        if kind == "custom" and custom_prompt and custom_prompt.strip():
            return custom_prompt
        return prompts.get(kind, prompts["general"])

    def forward(
        self,
        image_path: str,
        description_type: str = "general",
        custom_prompt: str = None,
    ) -> str:
        """Describe the image at ``image_path`` using the selected prompt.

        Returns the model's text on success. Every failure (missing file,
        unreadable file, API error, empty reply) is reported as a string
        starting with "Error" rather than raised, so the calling agent
        always receives a string.
        """
        try:
            # isfile (not exists) so that a directory path is rejected here
            # instead of failing later inside open().
            if not os.path.isfile(image_path):
                return f"Error: Image file not found at {image_path}"

            base64_image = self.encode_image(image_path)
            prompt = self.get_prompt(description_type, custom_prompt)

            # Declare the real media type in the data URL; fall back to JPEG
            # when the extension is unknown or does not map to an image type.
            mime_type, _ = mimetypes.guess_type(image_path)
            if not mime_type or not mime_type.startswith("image/"):
                mime_type = "image/jpeg"

            response = client.chat.completions.create(
                model="gpt-4.1",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:{mime_type};base64,{base64_image}"
                                },
                            },
                        ],
                    }
                ],
                max_tokens=1000,
            )

            content = response.choices[0].message.content
            if content is None:
                # output_type is "string", so never hand None back.
                return "Error analyzing image: the model returned no text content"
            return content
        except Exception as e:
            # Deliberate catch-all at the tool boundary: failures are
            # reported to the agent as text instead of aborting its run.
            return f"Error analyzing image: {str(e)}"
|