stzhao commited on
Commit
ecd3503
·
verified ·
1 Parent(s): 8bfa5ee

Update vis_python_exe.py

Browse files
Files changed (1) hide show
  1. vis_python_exe.py +737 -393
vis_python_exe.py CHANGED
@@ -1,439 +1,783 @@
 
1
  import os
2
- import io
3
- import regex
4
- import pickle
5
- import traceback
6
- import copy
7
- import datetime
8
- import dateutil.relativedelta
9
- import multiprocess
10
- from multiprocess import Pool
11
- from typing import Any, Dict, Optional, Tuple, List, Union
12
- from pebble import ProcessPool
13
- from tqdm import tqdm
14
- from concurrent.futures import TimeoutError
15
- from functools import partial
16
- from timeout_decorator import timeout
17
- from contextlib import redirect_stdout
18
  import base64
19
  from io import BytesIO
20
  from PIL import Image
21
- import pdb
 
 
 
22
 
23
- def encode_image(image_path):
24
- with open(image_path, "rb") as image_file:
25
- return base64.b64encode(image_file.read()).decode('utf-8')
26
-
27
- def base64_to_image(
28
- base64_str: str,
29
- remove_prefix: bool = True,
30
- convert_mode: Optional[str] = "RGB"
31
- ) -> Union[Image.Image, None]:
32
  """
33
- 将Base64编码的图片字符串转换为PIL Image对象
34
 
35
  Args:
36
- base64_str: Base64编码的图片字符串(可带data:前缀)
37
- remove_prefix: 是否自动去除"data:image/..."前缀(默认True)
38
- convert_mode: 转换为指定模式(如"RGB"/"RGBA",None表示不转换)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
 
 
40
  Returns:
41
- PIL.Image.Image 对象,解码失败时返回None
42
-
43
- Examples:
44
- >>> img = base64_to_image("data:image/png;base64,iVBORw0KGg...")
45
- >>> img = base64_to_image("iVBORw0KGg...", remove_prefix=False)
46
  """
47
- try:
48
- # 1. 处理Base64前缀
49
- if remove_prefix and "," in base64_str:
50
- base64_str = base64_str.split(",")[1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
- # 2. 解码Base64
53
- image_data = base64.b64decode(base64_str)
54
-
55
- # 3. 转换为PIL Image
56
- image = Image.open(BytesIO(image_data))
57
-
58
- # 4. 可选模式转换
59
- if convert_mode:
60
- image = image.convert(convert_mode)
 
 
 
 
 
 
 
 
 
 
 
61
 
62
- return image
63
-
64
- except (base64.binascii.Error, OSError, Exception) as e:
65
- print(f"Base64解码失败: {str(e)}")
66
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
 
 
 
 
 
 
68
 
69
- class GenericRuntime:
70
- GLOBAL_DICT = {}
71
- LOCAL_DICT = None
72
- HEADERS = []
73
 
74
- def __init__(self):
75
- self._global_vars = copy.copy(self.GLOBAL_DICT)
76
- self._local_vars = copy.copy(self.LOCAL_DICT) if self.LOCAL_DICT else None
77
- self._captured_figures = []
 
 
 
 
 
78
 
79
- for c in self.HEADERS:
80
- self.exec_code(c)
 
 
 
 
81
 
82
- def exec_code(self, code_piece: str) -> None:
83
- if regex.search(r"(\s|^)?input\(", code_piece) or regex.search(
84
- r"(\s|^)?os.system\(", code_piece
85
- ):
86
- raise RuntimeError("Forbidden function calls detected")
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
 
 
 
 
 
 
 
89
 
90
- # 检测并修改plt.show()调用
91
- if "plt.show()" in code_piece:
92
- modified_code = code_piece.replace("plt.show()", """
93
- # 捕获当前图像
94
- buf = io.BytesIO()
95
- plt.savefig(buf, format='png')
96
- buf.seek(0)
97
- _captured_image = base64.b64encode(buf.read()).decode('utf-8')
98
- _captured_figures.append(_captured_image)
99
- plt.close()
100
- """)
101
- # 确保_captured_figures变量存在
102
- if "_captured_figures" not in self._global_vars:
103
- self._global_vars["_captured_figures"] = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- exec(modified_code, self._global_vars)
 
106
  else:
107
- print("###################################### I am excuting code. ##############################################")
108
- exec(code_piece, self._global_vars)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
- def eval_code(self, expr: str) -> Any:
111
- return eval(expr, self._global_vars)
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
- def inject(self, var_dict: Dict[str, Any]) -> None:
114
- for k, v in var_dict.items():
115
- self._global_vars[k] = v
116
 
117
- @property
118
- def answer(self):
119
- return self._global_vars.get("answer", None)
120
-
121
- @property
122
- def captured_figures(self):
123
- return self._global_vars.get("_captured_figures", [])
 
124
 
 
 
 
 
 
125
 
126
- class ImageRuntime(GenericRuntime):
127
- """支持图像处理的运行时环境"""
128
- GLOBAL_DICT = {} # 不预加载模块,避免序列化问题
129
- LOCAL_DICT = None
 
 
 
 
 
 
 
130
 
131
- HEADERS = [
132
- "import matplotlib",
133
- "matplotlib.use('Agg')", # 使用非交互式后端
134
- "import matplotlib.pyplot as plt",
135
- "from PIL import Image",
136
- "import io",
137
- "import base64",
138
- "import numpy as np",
139
- "_captured_figures = []", # 初始化图像捕获列表
140
- ]
141
-
142
- def __init__(self, messages):
143
- super().__init__()
144
-
145
- image_var_dict = {}
146
- image_var_idx = 0
147
- for message_item in messages:
148
- content = message_item['content'] # {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
149
- for item in content:
150
- item_type = item['type']
151
- if item_type == "image_url":
152
- item_image_url = item['image_url']['url']
153
- image = base64_to_image(item_image_url)
154
- image_var_dict[f"image_clue_{image_var_idx}"] = image
155
- image_var_idx += 1
156
-
157
- self.inject(image_var_dict)
158
- print("##################### Initialized ImageRuntime. ##########################")
159
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
 
161
- class DateRuntime(GenericRuntime):
162
- GLOBAL_DICT = {}
163
- HEADERS = [
164
- "import datetime",
165
- "from dateutil.relativedelta import relativedelta",
166
- "timedelta = relativedelta"
167
- ]
168
-
169
-
170
- class CustomDict(dict):
171
- def __iter__(self):
172
- return list(super().__iter__()).__iter__()
173
-
174
-
175
- class ColorObjectRuntime(GenericRuntime):
176
- GLOBAL_DICT = {"dict": CustomDict}
177
-
178
-
179
- class PythonExecutor:
180
- def __init__(
181
- self,
182
- runtime_class=None,
183
- get_answer_symbol: Optional[str] = None,
184
- get_answer_expr: Optional[str] = None,
185
- get_answer_from_stdout: bool = True,
186
- timeout_length: int = 20,
187
- ) -> None:
188
- print(f"#################### When Init PythonExcutor, RunTime typel:, TimeOut Length: {timeout_length} #############################")
189
- self.runtime_class = runtime_class if runtime_class else ImageRuntime
190
- print(self.runtime_class)
191
- self.answer_symbol = get_answer_symbol
192
- self.answer_expr = get_answer_expr
193
- self.get_answer_from_stdout = get_answer_from_stdout
194
- self.pool = Pool(multiprocess.cpu_count())
195
- self.timeout_length = timeout_length
196
-
197
- def process_generation_to_code(self, gens: str):
198
- return [g.split("\n") for g in gens]
199
-
200
- @staticmethod
201
- def execute(
202
- code,
203
- messages,
204
- get_answer_from_stdout=True,
205
- runtime_class=None,
206
- answer_symbol=None,
207
- answer_expr=None,
208
- timeout_length=20,
209
- ) -> Tuple[Union[str, Dict[str, Any]], str]:
210
- # print("dome")
211
- try:
212
- # 在每个进程中创建新的运行时实例
213
- runtime = runtime_class(messages)
214
 
215
- if get_answer_from_stdout:
216
- program_io = io.StringIO()
217
- with redirect_stdout(program_io):
218
- timeout(timeout_length)(runtime.exec_code)("\n".join(code))
219
- program_io.seek(0)
220
- result = program_io.read()
221
- elif answer_symbol:
222
- timeout(timeout_length)(runtime.exec_code)("\n".join(code))
223
- result = runtime._global_vars.get(answer_symbol, "")
224
- elif answer_expr:
225
- timeout(timeout_length)(runtime.exec_code)("\n".join(code))
226
- result = timeout(timeout_length)(runtime.eval_code)(answer_expr)
227
  else:
228
- if len(code) > 1:
229
- timeout(timeout_length)(runtime.exec_code)("\n".join(code[:-1]))
230
- result = timeout(timeout_length)(runtime.eval_code)(code[-1])
 
 
 
 
 
 
 
 
 
 
231
  else:
232
- timeout(timeout_length)(runtime.exec_code)("\n".join(code))
233
- result = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
 
235
- # 检查是否有捕获的图像
236
- captured_figures = runtime._global_vars.get("_captured_figures", [])
237
- if captured_figures:
238
- # 如果有文本输出和图像,将它们组合
239
- if result:
240
- result = {
241
- 'text': result,
242
- 'images': captured_figures
243
- }
 
 
 
 
 
 
 
 
 
 
244
  else:
245
- result = {'images': captured_figures}
 
 
 
 
 
246
 
247
- report = "Done"
248
- except Exception as e:
249
- result = ""
250
- report = f"Error: {str(e)}\n{traceback.format_exc()}"
251
-
252
- # 确保结果可序列化
253
- try:
254
- pickle.dumps(result)
255
- except Exception as e:
256
- result = f"Result serialization error: {str(e)}"
257
- report = f"Serialization Error: {str(e)}"
258
 
259
- return result, report
260
-
261
- def apply(self, code, messages):
262
- return self.batch_apply([code], messages)[0]
263
-
264
- @staticmethod
265
- def truncate(s, max_length=400):
266
- if isinstance(s, dict):
267
- # 如果是字典(包含图像),只截断文本部分
268
- if 'text' in s:
269
- half = max_length // 2
270
- if len(s['text']) > max_length:
271
- s['text'] = s['text'][:half] + "..." + s['text'][-half:]
272
- return s
273
- else:
274
- half = max_length // 2
275
- if isinstance(s, str) and len(s) > max_length:
276
- s = s[:half] + "..." + s[-half:]
277
- return s
278
-
279
- def batch_apply(self, batch_code, messages):
280
- all_code_snippets = self.process_generation_to_code(batch_code)
281
-
282
- timeout_cnt = 0
283
- all_exec_results = []
284
- print(f"################################### num of cpu: {os.cpu_count()} ; len of code: {len(all_code_snippets)} ######################################")
285
- with ProcessPool(
286
- max_workers=min(len(all_code_snippets), os.cpu_count())
287
- ) as pool:
288
- executor = partial(
289
- self.execute,
290
- get_answer_from_stdout=self.get_answer_from_stdout,
291
- runtime_class=self.runtime_class,
292
- answer_symbol=self.answer_symbol,
293
- answer_expr=self.answer_expr,
294
- timeout_length=self.timeout_length,
295
  )
296
- future = pool.map(executor, all_code_snippets, [messages], timeout=self.timeout_length)
297
- iterator = future.result()
298
 
299
- if len(all_code_snippets) > 100:
300
- progress_bar = tqdm(total=len(all_code_snippets), desc="Execute")
301
- else:
302
- progress_bar = None
303
-
304
- while True:
305
- try:
306
- result = next(iterator)
307
- all_exec_results.append(result)
308
- except StopIteration:
309
- break
310
- except TimeoutError as error:
311
- print(error)
312
- all_exec_results.append(("", "Timeout Error"))
313
- timeout_cnt += 1
314
- except Exception as error:
315
- print(f"Error in batch_apply: {error}")
316
- all_exec_results.append(("", f"Error: {str(error)}"))
317
- if progress_bar is not None:
318
- progress_bar.update(1)
319
-
320
- if progress_bar is not None:
321
- progress_bar.close()
322
-
323
- batch_results = []
324
- for code, (res, report) in zip(all_code_snippets, all_exec_results):
325
- # 处理结果
326
- if isinstance(res, dict):
327
- # 如果结果包含图像,特殊处理
328
- if 'text' in res:
329
- res['text'] = str(res['text']).strip()
330
- res['text'] = self.truncate(res['text'])
331
- report = str(report).strip()
332
- report = self.truncate(report)
333
- else:
334
- # 普通文本结果
335
- res = str(res).strip()
336
- res = self.truncate(res)
337
- report = str(report).strip()
338
- report = self.truncate(report)
339
- batch_results.append((res, report))
340
- return batch_results
341
-
342
-
343
- def _test():
344
- image_path = "/mnt/petrelfs/zhaoshitian/vis_tool_inference_engine/test_data/0.JPG"
345
- image_base64 = encode_image(image_path)
346
- messages = [
347
- {
348
- "role": "user",
349
- "content": [{"type": "text", "text": "From the information on that advertising board, what is the type of this shop?"}]
350
- },
351
- {
352
- "role": "user",
353
- "content": [{"type": "text", "text": "image_clue_0"}] + [{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}}]
354
- }
355
- ]
356
- # 测试普通计算
357
- math_code ="""
358
- a = 1
359
- b = 2
360
- c = a + b
361
- print(c)
362
- """
363
-
364
- batch_code = [math_code]
365
-
366
- executor = PythonExecutor()
367
- predictions = executor.apply(batch_code[0], messages)
368
- print("数学计算结果:", predictions)
369
 
370
- # 测试图像显示
371
- image_code = """
372
- import matplotlib.pyplot as plt
373
- import numpy as np
374
- from PIL import Image
375
- import io
376
-
377
- # 创建一个简单的图像
378
- x = np.linspace(0, 10, 100)
379
- y = np.sin(x)
380
-
381
- plt.figure(figsize=(8, 6))
382
- plt.plot(x, y, 'r-', linewidth=2)
383
- plt.title('Sine Wave')
384
- plt.grid(True)
385
- plt.show()
386
-
387
- # 也可以显示一个简单的图像
388
- # 创建一个彩色渐变图像
389
- arr = np.zeros((100, 100, 3), dtype=np.uint8)
390
- for i in range(100):
391
- for j in range(100):
392
- arr[i, j, 0] = i # 红色通道
393
- arr[i, j, 1] = j # 绿色通道
394
- arr[i, j, 2] = 100 # 蓝色通道
395
-
396
- img = Image.fromarray(arr)
397
- plt.figure()
398
- plt.imshow(img)
399
- plt.title('Gradient Image')
400
- plt.show()
401
-
402
- print("图像生成完成")
403
- """
 
 
 
 
 
 
 
 
 
 
404
 
405
- image_code = """
406
- import matplotlib.pyplot as plt
407
- import numpy as np
408
- from PIL import Image
409
- import io
 
 
 
 
 
 
 
 
410
 
411
- plt.imshow(image_clue_0)
412
- plt.title("Original Image - Locate Advertising Board")
413
- plt.show()
414
- """
 
 
 
 
 
 
 
 
 
415
 
416
- image_result = executor.apply(image_code, messages)
417
- print("\n图像结果类型:", type(image_result[0]))
418
- if isinstance(image_result[0], dict) and 'images' in image_result[0]:
419
- print(f"捕获到 {len(image_result[0]['images'])} 个图像")
420
- print("第一个图像的base64编码前20个字符:", image_result[0]['images'][0][:20])
 
 
 
 
 
 
421
 
422
- # 可选:保存图像到文件
423
- for i, img_data in enumerate(image_result[0]['images']):
424
- img_bytes = base64.b64decode(img_data)
425
- with open(f"captured_image_{i}.png", "wb") as f:
426
- f.write(img_bytes)
427
- print(f"图像已保存为 captured_image_{i}.png")
428
-
429
- if 'text' in image_result[0]:
430
- print("文本输出:", image_result[0]['text'])
431
- else:
432
- print("未捕获到图像")
433
- print("结果:", image_result[0])
434
-
435
- print("\n执行状态:", image_result[1])
436
 
 
 
 
 
437
 
438
- if __name__ == "__main__":
439
- _test()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
  import os
3
+ import re
4
+ import json
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import base64
6
  from io import BytesIO
7
  from PIL import Image
8
+ import argparse
9
+ from inference_engine.safe_persis_shared_vis_python_exe import PythonExecutor, ImageRuntime
10
+ from openai import OpenAI
11
+ import anthropic
12
 
13
+ def encode_image(image):
 
 
 
 
 
 
 
 
14
  """
15
+ Convert a PIL.Image object or image file path to base64-encoded string, and get resolution info.
16
 
17
  Args:
18
+ image: Can be a PIL.Image object or image file path.
19
+ Returns:
20
+ dict with keys:
21
+ - 'base64': base64-encoded string
22
+ - 'width': width in pixels
23
+ - 'height': height in pixels
24
+ - 'resolution': string "widthxheight"
25
+ """
26
+ img_obj = None
27
+
28
+ if isinstance(image, str):
29
+ # Handle file path
30
+ img_obj = Image.open(image)
31
+ with open(image, "rb") as image_file:
32
+ base64_str = base64.b64encode(image_file.read()).decode('utf-8')
33
+ else:
34
+ # Handle PIL.Image object
35
+ img_obj = image
36
+ buffered = BytesIO()
37
+ image.save(buffered, format='PNG')
38
+ base64_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
39
+
40
+ width, height = img_obj.size
41
+
42
+ return {
43
+ 'base64': base64_str,
44
+ 'width': width,
45
+ 'height': height
46
+ }
47
+
48
+ def encode_image_with_resize(image):
49
+ """
50
+ Convert a PIL.Image object or image file path to base64-encoded string, get resolution info.
51
+ If resolution > 1024x1024, resize to half.
52
 
53
+ Args:
54
+ image: Can be a PIL.Image object or image file path
55
  Returns:
56
+ dict with keys:
57
+ - 'base64': base64-encoded string
58
+ - 'width': width in pixels
59
+ - 'height': height in pixels
60
+ - 'resolution': string "widthxheight"
61
  """
62
+ img_obj = None
63
+
64
+ if isinstance(image, str):
65
+ img_obj = Image.open(image)
66
+ else:
67
+ img_obj = image
68
+
69
+ # Resize if larger than 1024x1024
70
+ width, height = img_obj.size
71
+ if width > 1024 or height > 1024:
72
+ new_size = (width // 2, height // 2)
73
+ img_obj = img_obj.resize(new_size, Image.LANCZOS)
74
+ width, height = img_obj.size
75
+
76
+ buffered = BytesIO()
77
+ img_obj.save(buffered, format='PNG')
78
+ base64_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
79
+
80
+ return {
81
+ 'base64': base64_str,
82
+ 'width': width,
83
+ 'height': height,
84
+ 'resolution': f"{width}x{height}"
85
+ }
86
+
87
+ def check(evaluator, pred_ans, real_ans):
88
+ if len(pred_ans) == 0:
89
+ return []
90
+ correctness = evaluator.score(pred_ans, real_ans)
91
+ return correctness
92
+
93
+ def execute_codes(codes, messages, executor: PythonExecutor):
94
+ no_code_idx = []
95
+ codes_use = []
96
+ for i, code in enumerate(codes):
97
+ if code == "":
98
+ no_code_idx.append(i)
99
+ else:
100
+ codes_use.append(code)
101
+ batch_results = executor.batch_apply(codes_use, messages)
102
+ return batch_results, no_code_idx
103
 
104
+ def process_prompt_init(question, image_path_list, prompt_template, prompt_type, api_name):
105
+ with open(prompt_template, "r") as fin:
106
+ sys = json.load(fin)
107
+ prompt_prefix = sys[prompt_type]
108
+
109
+ image_path = image_path_list[0]
110
+
111
+ if "<IMAGE_PLACE_HOLDER_0>" in question:
112
+ if "no_tool" in prompt_type:
113
+
114
+ if "claude" in api_name:
115
+ img_result = encode_image_with_resize(image_path)
116
+ else:
117
+ img_result = encode_image(image_path)
118
+ image_base64 = img_result['base64']
119
+ question_with_options = question
120
+ question = prompt_prefix.format(query=question_with_options)
121
+
122
+ parts = question.split("<IMAGE_PLACE_HOLDER_0>")
123
+ content = []
124
 
125
+ # Add text before image (if any)
126
+ if parts[0].strip():
127
+ content.append({"type": "text", "text": parts[0].strip()})
128
+ # Add image
129
+ content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}})
130
+
131
+ # Add text after image (if any)
132
+ if len(parts) > 1 and parts[1].strip():
133
+ content.append({"type": "text", "text": parts[1].strip()})
134
+
135
+ messages = [
136
+ {
137
+ "role": "user",
138
+ "content": content
139
+ }
140
+ ]
141
+
142
+ return messages
143
+
144
+ else:
145
+ if "claude" in api_name:
146
+ img_result = encode_image_with_resize(image_path)
147
+ else:
148
+ img_result = encode_image(image_path)
149
+ image_base64 = img_result['base64']
150
+ width = img_result['width']
151
+ height = img_result['height']
152
+ question_with_options = question
153
+ question = prompt_prefix.format(query=question_with_options, width=str(width), height=str(height))
154
+
155
+ # Split question into parts
156
+ parts = question.split("<IMAGE_PLACE_HOLDER_0>")
157
+ # Build message with image_clue tags
158
+ content = []
159
+
160
+ # Add text before image (if any)
161
+ if parts[0].strip():
162
+ content.append({"type": "text", "text": parts[0].strip()})
163
+
164
+ # Add image with tags
165
+ content.append({"type": "text", "text": "<image_clue_0>"})
166
+ content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}})
167
+ content.append({"type": "text", "text": "</image_clue_0>\n\n"})
168
+
169
+ # Add text after image (if any)
170
+ if len(parts) > 1 and parts[1].strip():
171
+ content.append({"type": "text", "text": parts[1].strip()})
172
 
173
+ messages = [
174
+ {
175
+ "role": "user",
176
+ "content": content
177
+ }
178
+ ]
179
 
180
+ return messages
 
 
 
181
 
182
+ else:
183
+ if "no_tool" in prompt_type:
184
+
185
+ if "claude" in api_name:
186
+ img_result = encode_image_with_resize(image_path)
187
+ else:
188
+ img_result = encode_image(image_path)
189
+ image_base64 = img_result['base64']
190
+ question_with_options = question
191
 
192
+ messages = [
193
+ {
194
+ "role": "user",
195
+ "content": [{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}}] + [{"type": "text", "text": prompt_prefix.format(query=question_with_options)}]
196
+ }
197
+ ]
198
 
199
+ return messages
 
 
 
 
200
 
201
+ else:
202
+ if "claude" in api_name:
203
+ img_result = encode_image_with_resize(image_path)
204
+ else:
205
+ img_result = encode_image(image_path)
206
+ image_base64 = img_result['base64']
207
+ width = img_result['width']
208
+ height = img_result['height']
209
+ question_with_options = question
210
+
211
+ messages = [
212
+ {
213
+ "role": "user",
214
+ "content": [{"type": "text", "text": "<image_clue_0>"}] + [{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}}] + [{"type": "text", "text": "</image_clue_0>\n\n"}] + [{"type": "text", "text": prompt_prefix.format(query=question_with_options, width=str(width), height=str(height))}]
215
+ }
216
+ ]
217
+
218
+ return messages
219
+
220
+ def process_prompt_init_multi_images(question, image_path_list, prompt_template, prompt_type, api_name):
221
+ with open(prompt_template, "r") as fin:
222
+ sys = json.load(fin)
223
+ prompt_prefix = sys[prompt_type]
224
+
225
+ # Prepare image data
226
+ image_data = []
227
+ image_information = ""
228
+
229
+ for i, image_path in enumerate(image_path_list):
230
+ if "claude" in api_name:
231
+ img_result = encode_image_with_resize(image_path)
232
+ else:
233
+ img_result = encode_image(image_path)
234
+ image_base64 = img_result['base64']
235
+ width = img_result['width']
236
+ height = img_result['height']
237
 
238
+ image_data.append({
239
+ "index": i,
240
+ "base64": image_base64,
241
+ "width": width,
242
+ "height": height,
243
+ "placeholder": f"<IMAGE_PLACE_HOLDER_{i}>"
244
+ })
245
 
246
+ image_information += f"width of image_clue_{i}: {width}, height of image_clue_{i}: {height}\n"
247
+
248
+ # Format question
249
+ formatted_question = prompt_prefix.format(query=question, image_information=image_information)
250
+
251
+ # Check if placeholder exists
252
+ has_placeholders = any(f"<IMAGE_PLACE_HOLDER_{i}>" in formatted_question for i in range(len(image_path_list)))
253
+
254
+ if has_placeholders:
255
+ # Insert images at placeholder positions
256
+ if "no_tool" in prompt_type:
257
+ content = []
258
+ remaining_text = formatted_question
259
+
260
+ for img_data in image_data:
261
+ placeholder = img_data["placeholder"]
262
+ if placeholder in remaining_text:
263
+ parts = remaining_text.split(placeholder, 1)
264
+
265
+ if parts[0]:
266
+ content.append({"type": "text", "text": parts[0]})
267
+
268
+ content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_data['base64']}"}})
269
+
270
+ remaining_text = parts[1]
271
+
272
+ if remaining_text:
273
+ content.append({"type": "text", "text": remaining_text})
274
 
275
+ messages = [{"role": "user", "content": content}]
276
+ return messages
277
  else:
278
+ content = []
279
+ remaining_text = formatted_question
280
+
281
+ for img_data in image_data:
282
+ placeholder = img_data["placeholder"]
283
+ if placeholder in remaining_text:
284
+ parts = remaining_text.split(placeholder, 1)
285
+
286
+ if parts[0]:
287
+ content.append({"type": "text", "text": parts[0]})
288
+
289
+ i = img_data["index"]
290
+ content.append({"type": "text", "text": f"<image_clue_{i}>"})
291
+ content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_data['base64']}"}})
292
+ content.append({"type": "text", "text": f"</image_clue_{i}>\n\n"})
293
+
294
+ remaining_text = parts[1]
295
+
296
+ if remaining_text:
297
+ content.append({"type": "text", "text": remaining_text})
298
+
299
+ messages = [{"role": "user", "content": content}]
300
+ return messages
301
+ else:
302
+ # Handle as usual if no placeholder
303
+ if "no_tool" in prompt_type:
304
+ content = []
305
+
306
+ for i, img_data in enumerate(image_data):
307
+ content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_data['base64']}"}})
308
+
309
+ content.append({"type": "text", "text": formatted_question})
310
+
311
+ messages = [{"role": "user", "content": content}]
312
+ return messages
313
+ else:
314
+ content = []
315
+
316
+ for i, img_data in enumerate(image_data):
317
+ content.append({"type": "text", "text": f"<image_clue_{i}>"})
318
+ content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_data['base64']}"}})
319
+ content.append({"type": "text", "text": f"</image_clue_{i}>\n\n"})
320
+
321
+ content.append({"type": "text", "text": formatted_question})
322
+
323
+ messages = [{"role": "user", "content": content}]
324
+ return messages
325
+
326
+
327
+ def update_messages_with_execute_content(image_nums_in_input, messages, images_result, text_result, error_result, image_clue_idx):
328
+ if error_result is None:
329
+ new_messages = []
330
+ image_content = []
331
+ for message_item in messages[:-1]:
332
+ new_messages.append(message_item)
333
+
334
+ assistant_message_item = messages[-1]['content']
335
+ interpreter_message_text_prefix = [{"type": "text", "text": f"<interpreter>\nText Result:\n{text_result}\nImage Result:\n"}]
336
+ if images_result is not None:
337
+ print(f"#### image_clue_index: {image_clue_idx},Image_nums_in_input: {image_nums_in_input}, len of images_result: {len(images_result)}")
338
+ # for image_base64_item in images_result[image_clue_idx-image_nums_in_input:]:
339
+ for image_base64_item in images_result:
340
+ interpreter_message_images = [{"type": "text", "text": f"<image_clue_{image_clue_idx}>"}] + [{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64_item}"}}] + [{"type": "text", "text": f"</image_clue_{image_clue_idx}>"}]
341
+ image_content += interpreter_message_images
342
+ image_clue_idx += 1
343
+ else:
344
+ image_content = [{"type": "text", "text": "None"}]
345
+ interpreter_message_text_profill = [{"type": "text", "text": "</interpreter>\n"}]
346
 
347
+ interpreter_message_item = interpreter_message_text_prefix + image_content + interpreter_message_text_profill
348
+ new_messages.append({"role": "assistant", "content": assistant_message_item})
349
+ new_messages.append({"role": "user", "content": interpreter_message_item})
350
+ else:
351
+ new_messages = []
352
+ for message_item in messages[:-1]:
353
+ new_messages.append(message_item)
354
+
355
+ assistant_message_item = messages[-1]['content']
356
+ interpreter_message_text_prefix = [{"type": "text", "text": f"<interpreter>{error_result}"}]
357
+ interpreter_message_text_profill = [{"type": "text", "text": "</interpreter>\n"}]
358
+
359
+ interpreter_message_item = interpreter_message_text_prefix + interpreter_message_text_profill
360
+ new_messages.append({"role": "assistant", "content": assistant_message_item})
361
+ new_messages.append({"role": "user", "content": interpreter_message_item})
362
 
363
+ return new_messages, image_clue_idx
 
 
364
 
365
+ def update_messages_with_code(messages, generated_content):
366
+ message_item = {
367
+ "role": "assistant",
368
+ "content": [{"type": "text", "text": f"{generated_content}</code>\n"}]
369
+ }
370
+
371
+ messages.append(message_item)
372
+ return messages
373
 
374
+ def update_messages_with_text(messages, generated_content):
375
+ message_item = {
376
+ "role": "assistant",
377
+ "content": [{"type": "text", "text": f"{generated_content}"}]
378
+ }
379
 
380
+ messages.append(message_item)
381
+ return messages
382
+
383
+ def call_chatgpt_api(args, messages, client, max_tokens=10000, stop=None, temperature=0.6):
384
+ """Call ChatGPT API with the given messages"""
385
+ try:
386
+ client_type = args.client_type
387
+ api_name = args.api_name
388
+ except:
389
+ client_type = args['client_type']
390
+ api_name = args['api_name']
391
 
392
+ if client_type == "openai" or client_type == "azure":
393
+ response = client.chat.completions.create(
394
+ model=api_name,
395
+ messages=messages,
396
+ max_tokens=max_tokens,
397
+ temperature=temperature,
398
+ top_p=1.0,
399
+ stop=stop,
400
+ timeout=300
401
+ )
402
+ response_text = response.choices[0].message.content
403
+ elif client_type == "anthropic":
404
+ message = client.messages.create(
405
+ model=api_name,
406
+ max_tokens=max_tokens,
407
+ messages=messages,
408
+ temperature=temperature,
409
+ top_p=1.0,
410
+ stop_sequences=stop
411
+ )
412
+ response_text = message.content[0].text if isinstance(message.content, list) else message.content
413
+ elif client_type == "vllm":
414
+ response = client.chat.completions.create(
415
+ model=api_name,
416
+ messages=messages,
417
+ max_tokens=max_tokens,
418
+ temperature=temperature,
419
+ top_p=1.0,
420
+ stop=stop
421
+ )
422
+ response_text = response.choices[0].message.content
423
+ else:
424
+ print("Your args.client_type must be one of openai, azure, anthropic and vllm.")
425
+ return None, None
426
+
427
+ # Check if stop sequence is encountered
428
+ stop_reason = None
429
+ if stop and any(s in response_text for s in stop):
430
+ for s in stop:
431
+ if s in response_text:
432
+ stop_reason = s
433
+ break
434
+ else:
435
+ if client_type in ["openai", "azure", "vllm"]:
436
+ stop_reason = response.choices[0].finish_reason
437
+ else:
438
+ stop_reason = "stop"
439
+
440
+ if "<code>" in response_text:
441
+ stop_reason = "</code>"
442
+
443
+ return response_text, stop_reason
444
 
445
+ def evaluate_single_data(args, data, client, executor):
446
+ try:
447
+ prompt_template = args.prompt_template
448
+ prompt = args.prompt
449
+ exe_code = args.exe_code
450
+ max_tokens = args.max_tokens
451
+ temperature = args.temperature
452
+ api_name = args.api_name
453
+ except:
454
+ prompt_template = args['prompt_template']
455
+ prompt = args['prompt']
456
+ exe_code = args['exe_code']
457
+ max_tokens = args['max_tokens']
458
+ temperature = args['temperature']
459
+ api_name = args['api_name']
460
+
461
+ image_path_list = data['image_path_list']
462
+
463
+ if "no_tool" in prompt:
464
+ if len(image_path_list) == 1:
465
+ messages = process_prompt_init(data["question"], image_path_list, prompt_template, prompt, api_name)
466
+ elif len(image_path_list) >= 2:
467
+ messages = process_prompt_init_multi_images(data["question"], image_path_list, prompt_template, prompt, api_name)
468
+ else:
469
+ if len(image_path_list) == 1:
470
+ prompt = "vistool_with_img_info_v2"
471
+ messages = process_prompt_init(data["question"], image_path_list, prompt_template, prompt, api_name)
472
+ elif len(image_path_list) >= 2:
473
+ prompt = "vistool_with_img_info_multi_image"
474
+ messages = process_prompt_init_multi_images(data["question"], image_path_list, prompt_template, prompt, api_name)
475
+
476
+ # Generate initial response
477
+ response_text, pred_stop_reason = call_chatgpt_api(
478
+ args,
479
+ messages,
480
+ client,
481
+ max_tokens=max_tokens,
482
+ stop=["</code>"] if exe_code else None,
483
+ temperature=temperature
484
+ )
485
+
486
+ # Handle response
487
+ final_response = response_text
488
+ code_execution_count = 0
489
+ image_clue_idx = len(image_path_list)
490
+
491
+ while True:
492
+ # Check if code execution is needed
493
+ if exe_code and pred_stop_reason == "</code>":
494
+ # Extract code to execute
495
+ messages = update_messages_with_code(messages, response_text)
496
+ code_to_execute = response_text.split("```python")[-1].split("```")[0].strip()
 
497
 
498
+ # Execute code
499
+ exe_result = execute_codes([code_to_execute], messages, executor)[0][0]
500
+ if exe_result is None:
501
+ text_result = "None"
502
+ images_result = None
 
 
 
 
 
 
 
503
  else:
504
+ output, report = exe_result
505
+ if report == "Done":
506
+ error_result = None
507
+ try:
508
+ text_result = exe_result[0]['text']
509
+ except:
510
+ text_result = None
511
+ print("text result is none.")
512
+ try:
513
+ images_result = exe_result[0]['images']
514
+ except:
515
+ images_result = None
516
+ print("image result is none.")
517
  else:
518
+ error_result = report
519
+ text_result = None
520
+ images_result = None
521
+
522
+ messages, new_image_clue_idx = update_messages_with_execute_content(len(image_path_list), messages, images_result, text_result, error_result, image_clue_idx)
523
+ image_clue_idx = new_image_clue_idx
524
+
525
+ code_execution_count += 1
526
+
527
+ # Generate next response part
528
+ response_text, pred_stop_reason = call_chatgpt_api(
529
+ args,
530
+ messages,
531
+ client,
532
+ max_tokens=max_tokens,
533
+ stop=["</code>"] if exe_code else None,
534
+ temperature=temperature
535
+ )
536
+
537
+ else:
538
+ final_response = response_text
539
+ messages = update_messages_with_text(messages, response_text)
540
+ break
541
+
542
+ return messages, final_response
543
+
544
+
545
def evaluate_single_data_multi_images(args, data, client, executor):
    """Run the generate/execute loop for one multi-image question.

    The model is prompted with the question plus its images; whenever
    generation stops at the ``</code>`` marker, the emitted ```python``
    code is executed and its text/image output is appended to the
    conversation, then generation resumes.  The loop ends on the first
    response that does not request code execution.

    Args:
        args: Run configuration providing ``prompt_template``, ``prompt``,
            ``exe_code`` and ``max_tokens`` (attribute- or dict-style).
        data: Item dict with ``question``, ``image_path_list`` and
            ``image_nums_in_input``.
        client: API client forwarded to ``call_chatgpt_api``.
        executor: Code executor forwarded to ``execute_codes``.

    Returns:
        Tuple ``(messages, final_response)``: the full message history and
        the model's final (non-code) response text.
    """
    # Support both namespace-style (argparse) and dict-style configs.
    try:
        prompt_template = args.prompt_template
        prompt = args.prompt
        exe_code = args.exe_code
        max_tokens = args.max_tokens
    except AttributeError:
        prompt_template = args['prompt_template']
        prompt = args['prompt']
        exe_code = args['exe_code']
        max_tokens = args['max_tokens']

    messages = process_prompt_init_multi_images(data["question"], data['image_path_list'], prompt_template, prompt)

    # Generate initial response; stop at </code> so the code can be run.
    response_text, pred_stop_reason = call_chatgpt_api(
        args,
        messages,
        client,
        max_tokens=max_tokens,
        stop=["</code>"] if exe_code else None
    )

    # Handle response
    final_response = response_text
    code_execution_count = 0
    # Index of the next "image clue" slot; starts after the input images.
    image_clue_idx = data['image_nums_in_input']

    # NOTE(review): there is no upper bound on execution rounds, so a model
    # that always emits code loops forever — consider capping
    # code_execution_count.
    while True:
        # Check if code execution is needed
        if exe_code and pred_stop_reason == "</code>":
            # Record the code turn, then extract the last ```python fence.
            messages = update_messages_with_code(messages, response_text)
            code_to_execute = response_text.split("```python")[-1].split("```")[0].strip()

            # Execute code
            exe_result = execute_codes([code_to_execute], messages, executor)[0][0]
            if exe_result is None:
                # BUGFIX: error_result must be bound on this path too,
                # otherwise the update call below raised NameError.
                error_result = None
                text_result = "None"
                images_result = None
            else:
                output, report = exe_result
                if report == "Done":
                    error_result = None
                    # `output` may lack 'text'/'images'; fall back to None.
                    try:
                        text_result = output['text']
                    except Exception:
                        text_result = None
                        print("text result is none.")
                    try:
                        images_result = output['images']
                    except Exception:
                        images_result = None
                        print("image result is none.")
                else:
                    # Execution failed: surface the report as the error.
                    error_result = report
                    text_result = None
                    images_result = None

            messages, new_image_clue_idx = update_messages_with_execute_content(
                data['image_nums_in_input'], messages, images_result,
                text_result, error_result, image_clue_idx
            )
            image_clue_idx = new_image_clue_idx

            code_execution_count += 1

            # Generate next response part
            response_text, pred_stop_reason = call_chatgpt_api(
                args,
                messages,
                client,
                max_tokens=max_tokens,
                stop=["</code>"] if exe_code else None
            )

        else:
            final_response = response_text
            messages = update_messages_with_text(messages, response_text)
            break

    return messages, final_response
624
+
625
def evaluate_single_data_video(args, data, client, executor):
    """Run the generate/execute loop for one video question.

    Video frames arrive as ``data['image_path_list']`` and are handled via
    the same multi-image prompt builder as still images.  Whenever the model
    stops at ``</code>``, the emitted code is executed and its text/image
    output is appended to the conversation before generation resumes.

    Args:
        args: Run configuration providing ``prompt_template``, ``prompt``,
            ``exe_code`` and ``max_tokens`` (attribute- or dict-style).
        data: Item dict with ``question``, ``image_path_list`` (frames) and
            ``image_nums_in_input``.
        client: API client forwarded to ``call_chatgpt_api``.
        executor: Code executor forwarded to ``execute_codes``.

    Returns:
        Tuple ``(messages, final_response)``: the full message history and
        the model's final (non-code) response text.
    """
    # Support both namespace-style (argparse) and dict-style configs.
    try:
        prompt_template = args.prompt_template
        prompt = args.prompt
        exe_code = args.exe_code
        max_tokens = args.max_tokens
    except AttributeError:
        prompt_template = args['prompt_template']
        prompt = args['prompt']
        exe_code = args['exe_code']
        max_tokens = args['max_tokens']

    messages = process_prompt_init_multi_images(data["question"], data['image_path_list'], prompt_template, prompt)

    # Generate initial response; stop at </code> so the code can be run.
    response_text, pred_stop_reason = call_chatgpt_api(
        args,
        messages,
        client,
        max_tokens=max_tokens,
        stop=["</code>"] if exe_code else None
    )

    # Handle response
    final_response = response_text
    code_execution_count = 0
    # Index of the next "image clue" slot; starts after the input frames.
    image_clue_idx = data['image_nums_in_input']

    # NOTE(review): no upper bound on execution rounds — consider capping
    # code_execution_count.
    while True:
        # Check if code execution is needed
        if exe_code and pred_stop_reason == "</code>":
            # Record the code turn, then extract the last ```python fence.
            messages = update_messages_with_code(messages, response_text)
            code_to_execute = response_text.split("```python")[-1].split("```")[0].strip()

            # Execute code
            exe_result = execute_codes([code_to_execute], messages, executor)[0][0]
            if exe_result is None:
                # BUGFIX: error_result must be bound on this path too,
                # otherwise the update call below raised NameError.
                error_result = None
                text_result = "None"
                images_result = None
            else:
                output, report = exe_result
                if report == "Done":
                    error_result = None
                    # `output` may lack 'text'/'images'; fall back to None.
                    try:
                        text_result = output['text']
                    except Exception:
                        text_result = None
                        print("text result is none.")
                    try:
                        images_result = output['images']
                    except Exception:
                        images_result = None
                        print("image result is none.")
                else:
                    # Execution failed: surface the report as the error.
                    error_result = report
                    text_result = None
                    images_result = None

            messages, new_image_clue_idx = update_messages_with_execute_content(
                data['image_nums_in_input'], messages, images_result,
                text_result, error_result, image_clue_idx
            )
            image_clue_idx = new_image_clue_idx

            code_execution_count += 1

            # Generate next response part
            response_text, pred_stop_reason = call_chatgpt_api(
                args,
                messages,
                client,
                max_tokens=max_tokens,
                stop=["</code>"] if exe_code else None
            )

        else:
            final_response = response_text
            messages = update_messages_with_text(messages, response_text)
            break

    return messages, final_response
704
+
705
+
706
+ # New wrapper functions for safe execution with cleanup
707
def evaluate_batch_with_cleanup(args, data_list, client):
    """Evaluate every item in ``data_list`` with one shared executor.

    A single process-isolated ``PythonExecutor`` is created up front,
    reset between items so interpreter state does not leak across them,
    and released when the batch finishes.  A failure on one item is
    recorded as an error tuple and does not abort the rest of the batch.
    """
    executor = PythonExecutor(use_process_isolation=True)
    try:
        outcomes = []
        for item in data_list:
            try:
                outcomes.append(evaluate_single_data(args, item, client, executor))
            except Exception as e:
                print(f"Error processing data item: {str(e)}")
                outcomes.append((None, f"Error: {str(e)}"))
            # Clear executor state before the next item.
            executor.reset()
        return outcomes
    finally:
        # Drop the reference so the persistent worker can be torn down.
        del executor
 
 
 
 
 
 
 
 
 
 
728
 
729
def evaluate_single_with_cleanup(args, data, client):
    """Evaluate one item with a throwaway process-isolated executor.

    The executor is created just for this call and released afterwards,
    even when evaluation raises.
    """
    executor = PythonExecutor(use_process_isolation=True)
    try:
        return evaluate_single_data(args, data, client, executor)
    finally:
        # Release the persistent worker regardless of success or failure.
        del executor
740
+
741
def evaluate_multi_images_with_cleanup(args, data_list, client):
    """Evaluate a batch of multi-image items with one shared executor.

    One process-isolated ``PythonExecutor`` serves the whole batch and is
    reset between items; per-item failures are recorded as error tuples so
    the remaining items still run.  The executor is released at the end.
    """
    executor = PythonExecutor(use_process_isolation=True)
    try:
        outcomes = []
        for item in data_list:
            try:
                outcomes.append(evaluate_single_data_multi_images(args, item, client, executor))
            except Exception as e:
                print(f"Error processing multi-image data: {str(e)}")
                outcomes.append((None, f"Error: {str(e)}"))
            # Clear executor state before the next item.
            executor.reset()
        return outcomes
    finally:
        # Drop the reference so the persistent worker can be torn down.
        del executor
762
+
763
def evaluate_video_with_cleanup(args, data_list, client):
    """Evaluate a batch of video items with one shared executor.

    One process-isolated ``PythonExecutor`` serves the whole batch and is
    reset between items; per-item failures are recorded as error tuples so
    the remaining items still run.  The executor is released at the end.
    """
    executor = PythonExecutor(use_process_isolation=True)
    try:
        outcomes = []
        for item in data_list:
            try:
                outcomes.append(evaluate_single_data_video(args, item, client, executor))
            except Exception as e:
                print(f"Error processing video data: {str(e)}")
                outcomes.append((None, f"Error: {str(e)}"))
            # Clear executor state before the next item.
            executor.reset()
        return outcomes
    finally:
        # Drop the reference so the persistent worker can be torn down.
        del executor