Haozhe committed on
Commit f939456 · 2 Parent(s): 311548d f1bf896

Merge branch 'main' of https://huggingface.co/spaces/TIGER-Lab/Pixel-Reasoner

Files changed (3)
  1. .gitattributes +1 -1
  2. example_images/1.jpg → 1.jpg +0 -0
  3. app.py +14 -30
.gitattributes CHANGED
@@ -33,4 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-example_images/1.jpg filter=lfs diff=lfs merge=lfs -text
+1.jpg filter=lfs diff=lfs merge=lfs -text
example_images/1.jpg → 1.jpg RENAMED
File without changes
app.py CHANGED
@@ -11,15 +11,12 @@ import spaces
 import os
 from serve_constants import html_header, bibtext, learn_more_markdown, tos_markdown
 
+cur_dir = os.path.dirname(os.path.abspath(__file__))
+
 MODEL_ID = "TIGER-Lab/PixelReasoner-RL-v1"
-# MODEL_ID = "/home/ma-user/work/haozhe/workspace/lmm-r1/toolckpts/pix17K0506wt-NormalizedPenalizedFixedReweightCont-256-lossvernone-samplevernone-fmtnone-group-n8-ml10000-lr10-sysvcot-8node/global_step24_hf_evalbest"
-example_image = "example_images/1.jpg" # /home/ma-user/work/haozhe/workspace/vlspaces/
-# example_image = "/home/ma-user/work/haozhe/workspace/vlspaces/example_images/1.jpg"
-example_text = "What kind of restaurant is it?"
+
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True,
-# min_pixels=min_pixels,
-max_pixels=512*28*28,
-)
+max_pixels=512*28*28)
 model = AutoModelForImageTextToText.from_pretrained(
 MODEL_ID,
 trust_remote_code=True,
@@ -161,15 +158,6 @@ def model_inference(input_dict, history):
 if val[1]:
 messages.append({"role": "assistant", "content": val[1]})
 
-current_path = os.getcwd()
-print(f"Current running path: {current_path}")
-
-# Define the folder name to check
-folder_to_find = "example_images"
-
-# Create the full path to the folder
-folder_path = os.path.join(current_path, folder_to_find)
-print('files', files)
 imagelist = rawimagelist = current_message_images = [load_image(image) for image in files]
 all_images += current_message_images
 messages.append({
@@ -181,7 +169,7 @@ def model_inference(input_dict, history):
 })
 
 print(messages)
-# complete_assistant_response_for_gradio = ""
+
 complete_assistant_response_for_gradio = []
 while True:
 """
@@ -197,15 +185,9 @@ def model_inference(input_dict, history):
 
 streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=False)
 generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024, temperature=0.1, top_p=0.95, top_k=50)
-# import pdb; pdb.set_trace()
 thread = Thread(target=model.generate, kwargs=generation_kwargs)
 thread.start()
 
-# buffer = ""
-# for new_text in streamer:
-#     buffer += new_text
-#     yield buffer
-# print(buffer)
 current_model_output_segment = "" # Text generated in this specific model call
 toolflag = False
 for new_text_chunk in streamer:
@@ -224,18 +206,14 @@ def model_inference(input_dict, history):
 processed_segment = current_model_output_segment.split("<|im_end|>", 1)[0] if "<|im_end|>" in current_model_output_segment else current_model_output_segment
 
 # Append this processed segment to the cumulative display string for Gradio
-# complete_assistant_response_for_gradio += processed_segment + "\n\n"
 complete_assistant_response_for_gradio += [processed_segment + "\n\n"]
-# print(f"this one: {complete_assistant_response_for_gradio}")
 yield complete_assistant_response_for_gradio # Ensure the fully processed segment is yielded to Gradio
 
 
 # Check for tool call in the *just generated* segment
 qatext_for_tool_check = processed_segment
 require_tool = tool_end in qatext_for_tool_check and tool_start in qatext_for_tool_check
-
-# print(f"Segment from model: \"{qatext_for_tool_check[:200]}...\", Requires tool: {require_tool}")
-
+
 if require_tool:
 
 tool_params = parse_last_tool(qatext_for_tool_check)
@@ -261,7 +239,6 @@ def model_inference(input_dict, history):
 ]
 )
 messages.append(new_piece)
-# print(messages)
 # complete_assistant_response_for_gradio += f"\n<b>Analyzing Operation Result ...</b> @region(size={proc_img.size[0]}x{proc_img.size[1]})\n\n"
 complete_assistant_response_for_gradio += [f"\n<b>Analyzing Operation Result ...</b> @region(size={proc_img.size[0]}x{proc_img.size[1]})\n\n"]
 yield complete_assistant_response_for_gradio # Update Gradio display
@@ -272,7 +249,14 @@
 
 with gr.Blocks() as demo:
 examples = [
-[{"text": example_text, "files": [example_image]}]
+[
+{
+"text": "What kind of restaurant is it?",
+"files": [
+"1.jpg"
+]
+}
+]
 ]
 
  gr.HTML(html_header)
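
For context, the generation path touched by this commit streams tokens off a background thread via TextIteratorStreamer and yields partial segments to Gradio. Below is a minimal sketch of that pattern, not the actual app.py code: the stream_reply helper and its text-only preprocessing are illustrative assumptions, while MODEL_ID, the max_pixels setting, the streamer arguments, and the sampling parameters are taken from the diff above.

# Minimal sketch (not the actual app.py): how the streamed generation wired in
# this commit works. model.generate runs on a worker thread; the main thread
# drains the streamer and yields incremental text to the UI.
from threading import Thread

from transformers import (
    AutoModelForImageTextToText,
    AutoProcessor,
    TextIteratorStreamer,
)

MODEL_ID = "TIGER-Lab/PixelReasoner-RL-v1"  # as in the diff
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True,
                                          max_pixels=512*28*28)
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID, trust_remote_code=True)


def stream_reply(messages):
    # Illustrative preprocessing; app.py builds multimodal inputs from the chat
    # history and uploaded images rather than a text-only prompt.
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    inputs = processor(text=[prompt], return_tensors="pt").to(model.device)

    # Same streamer and sampling settings as the diff above.
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=False)
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024,
                             temperature=0.1, top_p=0.95, top_k=50)

    # Generation blocks, so it runs on a background thread while this generator
    # reads chunks from the streamer and yields the growing segment.
    Thread(target=model.generate, kwargs=generation_kwargs).start()

    segment = ""
    for chunk in streamer:
        segment += chunk
        yield segment

    # app.py additionally trims the segment at "<|im_end|>" and, when the segment
    # contains a tool call (tool_start/tool_end markers), appends the tool result
    # to `messages` and re-enters generation in its `while True:` loop; that part
    # is omitted here.

The examples change at the bottom of the diff only points the default Gradio example at the renamed 1.jpg in the repository root, matching the .gitattributes and rename changes above.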