File size: 2,745 Bytes
9571928
02840f8
5c5f32d
0b0ce33
 
02840f8
 
0a0ae08
0b0ce33
9571928
0b0ce33
9571928
0b0ce33
 
 
9571928
0b0ce33
 
 
 
9571928
 
0a0ae08
 
 
9571928
 
0a0ae08
 
9571928
 
 
02840f8
9571928
 
 
0a0ae08
 
 
 
 
 
 
 
9571928
 
 
 
0a0ae08
9571928
0b0ce33
9571928
 
 
0a0ae08
9571928
0a0ae08
9571928
0a0ae08
 
 
9571928
 
 
 
0a0ae08
9571928
 
 
 
 
 
02840f8
 
 
0b0ce33
5c5f32d
0a0ae08
0b0ce33
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import os
import base64
import requests
from smolagents import Tool

class ImageAnalysisTool(Tool):
    """Tool that captions an image through a hosted Hugging Face model.

    The image is base64-encoded and posted as JSON to the configured
    inference endpoint. Only the image is sent; the question is echoed
    back next to the returned caption rather than forwarded to the model.
    """

    name = "image_analysis"
    description = "Analyze the content of an image and answer a specific question about it using Hugging Face Inference API."
    inputs = {
        "image_path": {
            "type": "string",
            "description": "Path to the image file (jpg, png, etc.)"
        },
        "question": {
            "type": "string",
            "description": "A question about the image content"
        }
    }
    output_type = "string"

    def __init__(self):
        """Read the API token from the environment and prepare the request settings.

        Raises:
            EnvironmentError: If ``HF_API_TOKEN`` is unset or empty.
        """
        super().__init__()
        token = os.getenv("HF_API_TOKEN")
        if not token:
            raise EnvironmentError("HF_API_TOKEN not found in environment variables.")
        self.api_url = "https://api-inference.huggingface.co/models/microsoft/git-base-captioning"
        self.headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json"
        }

    def forward(self, image_path: str, question: str) -> str:
        """Caption the image at ``image_path`` and pair the caption with ``question``.

        Args:
            image_path: Path of the image file, read in binary mode.
            question: Question text; it is only interpolated into the
                returned string and is not part of the request payload.

        Returns:
            A two-line string with the caption and the question, or a
            message starting with ``"Error"`` when the file cannot be read,
            the request fails, the status is not 200, or no caption is found.
        """
        # Every failure is reported as a string so the caller never sees a raise.
        try:
            with open(image_path, "rb") as handle:
                encoded_image = base64.b64encode(handle.read()).decode("utf-8")

            # The payload carries the image alone (captioning request).
            response = requests.post(
                self.api_url,
                headers=self.headers,
                json={"inputs": encoded_image},
                timeout=60
            )

            if response.status_code != 200:
                return f"Error analyzing image: {response.status_code} {response.text}"

            body = response.json()

            # Accept either a single object or a list whose first item is an object.
            if isinstance(body, dict):
                record = body
            elif isinstance(body, list) and len(body) > 0 and isinstance(body[0], dict):
                record = body[0]
            else:
                record = {}

            # Probe the keys the caption may be stored under, first truthy wins.
            caption = record.get("generated_text") or record.get("caption") or record.get("text")
            if not caption:
                return "Error: No caption found in model response."

            combined = f"Caption: {caption}\nAnswer to question '{question}': {caption}"
            return combined.strip()

        except Exception as e:
            return f"Error analyzing image: {e}"