dlaima committed
Commit 0a0ae08 · verified · 1 Parent(s): 593012b

Update image_analyzer.py

Files changed (1):
  1. image_analyzer.py +22 -14
image_analyzer.py CHANGED
@@ -5,7 +5,7 @@ from smolagents import Tool
 
 class ImageAnalysisTool(Tool):
     name = "image_analysis"
-    description = "Analyze the content of an image and answer a specific question about it using HF Inference API."
+    description = "Analyze the content of an image and answer a specific question about it using Hugging Face Inference API."
     inputs = {
         "image_path": {
             "type": "string",
@@ -20,10 +20,13 @@ class ImageAnalysisTool(Tool):
 
     def __init__(self):
         super().__init__()
-        # You can replace this with any vision model capable of VQA or image captioning
+        api_token = os.getenv("HF_API_TOKEN")
+        if not api_token:
+            raise EnvironmentError("HF_API_TOKEN not found in environment variables.")
         self.api_url = "https://api-inference.huggingface.co/models/microsoft/git-base-captioning"
         self.headers = {
-            "Authorization": f"Bearer {os.getenv('HF_API_TOKEN')}"
+            "Authorization": f"Bearer {api_token}",
+            "Content-Type": "application/json"
         }
 
     def forward(self, image_path: str, question: str) -> str:
@@ -31,32 +34,36 @@ class ImageAnalysisTool(Tool):
         with open(image_path, "rb") as img_file:
             image_bytes = img_file.read()
 
-        # Prepare the payload depending on the model API.
-        # Some models accept just the image bytes and return captions,
-        # some support multimodal input with text question + image.
-        # For this example, we'll assume a captioning model and append question manually.
+        # Encode image to base64 string
+        img_b64 = base64.b64encode(image_bytes).decode("utf-8")
+
+        # Prepare JSON payload - the exact structure depends on the model capabilities
+        # Here we send just the image for captioning
+        payload = {
+            "inputs": img_b64
+        }
 
         response = requests.post(
             self.api_url,
             headers=self.headers,
-            data=image_bytes,
+            json=payload,
            timeout=60
         )
 
         if response.status_code == 200:
             result = response.json()
+
             caption = None
-            # The format depends on the model; check keys like 'generated_text' or 'caption'
+            # Try common keys for caption output
             if isinstance(result, dict):
-                caption = result.get("generated_text") or result.get("caption")
-            elif isinstance(result, list) and len(result) > 0:
-                caption = result[0].get("generated_text") if "generated_text" in result[0] else None
+                caption = result.get("generated_text") or result.get("caption") or result.get("text")
+            elif isinstance(result, list) and len(result) > 0 and isinstance(result[0], dict):
+                caption = result[0].get("generated_text") or result[0].get("caption") or result[0].get("text")
 
             if not caption:
                 return "Error: No caption found in model response."
 
-            # Simple approach: combine caption + question to produce answer prompt
-            # If you want a deeper answer, you could chain a chat model here.
+            # Combine caption with the question to form a simple answer
             answer = f"Caption: {caption}\nAnswer to question '{question}': {caption}"
             return answer.strip()
 
@@ -68,4 +75,5 @@ class ImageAnalysisTool(Tool):
 
 
 
 
+
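
For reference, a minimal usage sketch of the updated tool follows. It is not part of the commit: it assumes HF_API_TOKEN is already exported in the environment, that the top of image_analyzer.py imports os, requests, and base64 (the import block sits outside the hunks shown above), and that "sample.jpg" is a placeholder path to a local image; forward() is called directly here rather than through a smolagents agent.

# Minimal usage sketch (assumptions: HF_API_TOKEN is set, "sample.jpg" exists locally).
from image_analyzer import ImageAnalysisTool

tool = ImageAnalysisTool()  # raises EnvironmentError if HF_API_TOKEN is missing
answer = tool.forward(
    image_path="sample.jpg",  # placeholder image path
    question="What objects are visible in the image?",
)
print(answer)  # e.g. "Caption: ...\nAnswer to question '...': ..."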