dlaima committed on
Commit
9571928
·
verified ·
1 Parent(s): ba3b3ff

Update image_analyzer.py

Browse files
Files changed (1) hide show
  1. image_analyzer.py +48 -36
image_analyzer.py CHANGED
@@ -1,59 +1,71 @@
 
1
  import base64
2
  import requests
3
- import openai
4
  from smolagents import Tool
5
 
6
  class ImageAnalysisTool(Tool):
7
  name = "image_analysis"
8
- description = "Analyze the content of an image and answer a specific question about it."
9
  inputs = {
10
- "url": {
11
  "type": "string",
12
- "description": "URL to the image"
13
  },
14
  "question": {
15
  "type": "string",
16
- "description": "Question about the image content"
17
  }
18
  }
19
  output_type = "string"
20
 
21
- def forward(self, url: str, question: str) -> str:
 
 
 
 
 
 
 
 
22
  try:
23
- # Download image
24
- image_path = "/tmp/image_input.jpg"
25
- r = requests.get(url)
26
- with open(image_path, "wb") as f:
27
- f.write(r.content)
28
-
29
- # Encode & analyze
30
- base64_image = self.encode_image(image_path)
31
- response = openai.ChatCompletion.create(
32
- model="gpt-4-turbo",
33
- messages=[
34
- {
35
- "role": "user",
36
- "content": [
37
- {"type": "text", "text": question},
38
- {
39
- "type": "image_url",
40
- "image_url": {
41
- "url": f"data:image/jpeg;base64,{base64_image}"
42
- }
43
- }
44
- ]
45
- }
46
- ],
47
- max_tokens=300
48
  )
49
- return response["choices"][0]["message"]["content"].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  except Exception as e:
51
  return f"Error analyzing image: {e}"
52
 
53
- def encode_image(self, image_path):
54
- with open(image_path, "rb") as image_file:
55
- return base64.b64encode(image_file.read()).decode("utf-8")
56
-
57
 
58
 
59
 
 
1
+ import os
2
  import base64
3
  import requests
 
4
  from smolagents import Tool
5
 
6
  class ImageAnalysisTool(Tool):
7
  name = "image_analysis"
8
+ description = "Analyze the content of an image and answer a specific question about it using HF Inference API."
9
  inputs = {
10
+ "image_path": {
11
  "type": "string",
12
+ "description": "Path to the image file (jpg, png, etc.)"
13
  },
14
  "question": {
15
  "type": "string",
16
+ "description": "A question about the image content"
17
  }
18
  }
19
  output_type = "string"
20
 
21
+ def __init__(self):
22
+ super().__init__()
23
+ # You can replace this with any vision model capable of VQA or image captioning
24
+ self.api_url = "https://api-inference.huggingface.co/models/microsoft/git-base-captioning"
25
+ self.headers = {
26
+ "Authorization": f"Bearer {os.getenv('HF_API_TOKEN')}"
27
+ }
28
+
29
+ def forward(self, image_path: str, question: str) -> str:
30
  try:
31
+ with open(image_path, "rb") as img_file:
32
+ image_bytes = img_file.read()
33
+
34
+ # Prepare the payload depending on the model API.
35
+ # Some models accept just the image bytes and return captions,
36
+ # some support multimodal input with text question + image.
37
+ # For this example, we'll assume a captioning model and append question manually.
38
+
39
+ response = requests.post(
40
+ self.api_url,
41
+ headers=self.headers,
42
+ data=image_bytes,
43
+ timeout=60
 
 
 
 
 
 
 
 
 
 
 
 
44
  )
45
+
46
+ if response.status_code == 200:
47
+ result = response.json()
48
+ caption = None
49
+ # The format depends on the model; check keys like 'generated_text' or 'caption'
50
+ if isinstance(result, dict):
51
+ caption = result.get("generated_text") or result.get("caption")
52
+ elif isinstance(result, list) and len(result) > 0:
53
+ caption = result[0].get("generated_text") if "generated_text" in result[0] else None
54
+
55
+ if not caption:
56
+ return "Error: No caption found in model response."
57
+
58
+ # Simple approach: combine caption + question to produce answer prompt
59
+ # If you want a deeper answer, you could chain a chat model here.
60
+ answer = f"Caption: {caption}\nAnswer to question '{question}': {caption}"
61
+ return answer.strip()
62
+
63
+ else:
64
+ return f"Error analyzing image: {response.status_code} {response.text}"
65
+
66
  except Exception as e:
67
  return f"Error analyzing image: {e}"
68
 
 
 
 
 
69
 
70
 
71