abiyyufahri committed
Commit 2ee69d3 · verified · 1 Parent(s): c3c9d97

Update app.py

Files changed (1)
app.py +49 -6
app.py CHANGED
@@ -168,6 +168,7 @@ def extract_coordinates(text):
 
 def cpu_inference(conversation, model, tokenizer, processor):
     try:
+        # Apply chat template
         prompt = processor.apply_chat_template(
             conversation,
             tokenize=False,
@@ -175,26 +176,58 @@ def cpu_inference(conversation, model, tokenizer, processor):
         )
 
         image = conversation[1]["content"][0]["image"]
+
+        # Process inputs with explicit padding and proper tensor handling
         inputs = processor(
             text=[prompt],
             images=[image],
             return_tensors="pt",
-            padding=True,
+            padding=True,  # Ensure padding is enabled
             truncation=True,
-            max_length=512
+            max_length=2048  # Increased max length for vision-language models
         )
-
+
+        # Debug logging
+        logger.info(f"Input tensor shapes: {[(k, v.shape if hasattr(v, 'shape') else type(v)) for k, v in inputs.items()]}")
+
+        # Ensure all tensors are properly formatted
+        for key, value in inputs.items():
+            if isinstance(value, torch.Tensor):
+                logger.info(f"{key} shape: {value.shape}, dtype: {value.dtype}")
+
+        # Set pad token if not already set
+        if tokenizer.pad_token_id is None:
+            if tokenizer.eos_token_id is not None:
+                tokenizer.pad_token_id = tokenizer.eos_token_id
+            else:
+                tokenizer.pad_token_id = 0
+
+        # Generate with proper attention mask handling
         with torch.no_grad():
+            # Ensure attention mask is present
+            if 'attention_mask' not in inputs and 'input_ids' in inputs:
+                inputs['attention_mask'] = torch.ones_like(inputs['input_ids'])
+
             outputs = model.generate(
                 **inputs,
                 max_new_tokens=256,
                 do_sample=True,
                 temperature=0.3,
                 top_p=0.8,
-                pad_token_id=tokenizer.eos_token_id or tokenizer.pad_token_id or 0
+                pad_token_id=tokenizer.pad_token_id,
+                eos_token_id=tokenizer.eos_token_id,
+                use_cache=True,
+                # Add these parameters for better stability
+                repetition_penalty=1.1,
+                length_penalty=1.0
             )
 
-        generated_ids = outputs[0][inputs["input_ids"].shape[1]:]
+            # Handle batch dimension properly
+            if outputs.dim() > 1:
+                generated_ids = outputs[0][inputs["input_ids"].shape[1]:]
+            else:
+                generated_ids = outputs[inputs["input_ids"].shape[1]:]
+
         response = tokenizer.decode(generated_ids, skip_special_tokens=True)
         coordinates = extract_coordinates(response)
 
@@ -206,6 +239,10 @@ def cpu_inference(conversation, model, tokenizer, processor):
 
     except Exception as e:
         logger.error(f"Inference error: {e}")
+        logger.error(f"Error type: {type(e).__name__}")
+        import traceback
+        logger.error(f"Full traceback: {traceback.format_exc()}")
+
        return {
            "topk_points": [(0.5, 0.5)],
            "response": f"Error during inference: {str(e)}",
@@ -242,6 +279,7 @@ async def predict_click_base64(data: Base64Request):
 
    try:
        pil_image = Image.open(BytesIO(image_data)).convert("RGB")
+       logger.info(f"Image loaded successfully: {pil_image.size}")
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid image format: {e}")
 
@@ -311,5 +349,10 @@ async def debug_info():
        "processor_type": type(processor).__name__ if processor else None,
        "model_type": type(model).__name__ if model else None,
        "available_qwen_classes": available_classes,
-       "transformers_version": transformers.__version__
+       "transformers_version": transformers.__version__,
+       "tokenizer_info": {
+           "pad_token_id": tokenizer.pad_token_id if tokenizer else None,
+           "eos_token_id": tokenizer.eos_token_id if tokenizer else None,
+           "vocab_size": tokenizer.vocab_size if tokenizer else None
+       } if tokenizer else None
    }
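
For context, the sketch below shows one way the updated cpu_inference could be exercised. It is a minimal illustration, assuming model, tokenizer, and processor are the module-level objects loaded elsewhere in app.py and that the processor uses a Qwen2-VL-style chat template; build_conversation, the system prompt text, and screenshot.png are hypothetical names introduced here only to match the index-1 image access (conversation[1]["content"][0]["image"]) that cpu_inference relies on.

```python
# Hedged usage sketch; not part of the commit. Assumes app.py exposes
# cpu_inference, model, tokenizer, and processor at module level.
from PIL import Image

def build_conversation(pil_image, instruction):
    # Hypothetical helper: system turn at index 0, user turn at index 1 whose
    # first content item carries the image, matching cpu_inference's
    # conversation[1]["content"][0]["image"] access.
    return [
        {"role": "system", "content": [{"type": "text", "text": "You are a GUI grounding assistant."}]},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": pil_image},
                {"type": "text", "text": instruction},
            ],
        },
    ]

if __name__ == "__main__":
    image = Image.open("screenshot.png").convert("RGB")  # placeholder path
    conversation = build_conversation(image, "Click the Submit button")
    result = cpu_inference(conversation, model, tokenizer, processor)
    print(result["topk_points"], result["response"])
```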