abiyyufahri committed
Commit 2ee69d3 · verified · 1 Parent(s): c3c9d97

Update app.py

Files changed (1)
app.py +49 -6
app.py CHANGED
@@ -168,6 +168,7 @@ def extract_coordinates(text):
 
 def cpu_inference(conversation, model, tokenizer, processor):
     try:
+        # Apply chat template
         prompt = processor.apply_chat_template(
             conversation,
             tokenize=False,
@@ -175,26 +176,58 @@ def cpu_inference(conversation, model, tokenizer, processor):
         )
 
         image = conversation[1]["content"][0]["image"]
+
+        # Process inputs with explicit padding and proper tensor handling
         inputs = processor(
             text=[prompt],
             images=[image],
             return_tensors="pt",
-            padding=True,
+            padding=True,  # Ensure padding is enabled
             truncation=True,
-            max_length=512
+            max_length=2048  # Increased max length for vision-language models
         )
-
+
+        # Debug logging
+        logger.info(f"Input tensor shapes: {[(k, v.shape if hasattr(v, 'shape') else type(v)) for k, v in inputs.items()]}")
+
+        # Ensure all tensors are properly formatted
+        for key, value in inputs.items():
+            if isinstance(value, torch.Tensor):
+                logger.info(f"{key} shape: {value.shape}, dtype: {value.dtype}")
+
+        # Set pad token if not already set
+        if tokenizer.pad_token_id is None:
+            if tokenizer.eos_token_id is not None:
+                tokenizer.pad_token_id = tokenizer.eos_token_id
+            else:
+                tokenizer.pad_token_id = 0
+
+        # Generate with proper attention mask handling
         with torch.no_grad():
+            # Ensure attention mask is present
+            if 'attention_mask' not in inputs and 'input_ids' in inputs:
+                inputs['attention_mask'] = torch.ones_like(inputs['input_ids'])
+
             outputs = model.generate(
                 **inputs,
                 max_new_tokens=256,
                 do_sample=True,
                 temperature=0.3,
                 top_p=0.8,
-                pad_token_id=tokenizer.eos_token_id or tokenizer.pad_token_id or 0
+                pad_token_id=tokenizer.pad_token_id,
+                eos_token_id=tokenizer.eos_token_id,
+                use_cache=True,
+                # Add these parameters for better stability
+                repetition_penalty=1.1,
+                length_penalty=1.0
             )
 
-        generated_ids = outputs[0][inputs["input_ids"].shape[1]:]
+            # Handle batch dimension properly
+            if outputs.dim() > 1:
+                generated_ids = outputs[0][inputs["input_ids"].shape[1]:]
+            else:
+                generated_ids = outputs[inputs["input_ids"].shape[1]:]
+
         response = tokenizer.decode(generated_ids, skip_special_tokens=True)
         coordinates = extract_coordinates(response)
 
@@ -206,6 +239,10 @@ def cpu_inference(conversation, model, tokenizer, processor):
 
     except Exception as e:
         logger.error(f"Inference error: {e}")
+        logger.error(f"Error type: {type(e).__name__}")
+        import traceback
+        logger.error(f"Full traceback: {traceback.format_exc()}")
+
        return {
            "topk_points": [(0.5, 0.5)],
            "response": f"Error during inference: {str(e)}",
@@ -242,6 +279,7 @@ async def predict_click_base64(data: Base64Request):
 
    try:
        pil_image = Image.open(BytesIO(image_data)).convert("RGB")
+       logger.info(f"Image loaded successfully: {pil_image.size}")
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid image format: {e}")
 
@@ -311,5 +349,10 @@ async def debug_info():
        "processor_type": type(processor).__name__ if processor else None,
        "model_type": type(model).__name__ if model else None,
        "available_qwen_classes": available_classes,
-       "transformers_version": transformers.__version__
+       "transformers_version": transformers.__version__,
+       "tokenizer_info": {
+           "pad_token_id": tokenizer.pad_token_id if tokenizer else None,
+           "eos_token_id": tokenizer.eos_token_id if tokenizer else None,
+           "vocab_size": tokenizer.vocab_size if tokenizer else None
+       } if tokenizer else None
    }
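
For context, the sketch below shows one way the updated cpu_inference could be exercised. It is a minimal illustration, assuming model, tokenizer, and processor are the module-level objects loaded elsewhere in app.py and that the processor uses a Qwen2-VL-style chat template; build_conversation, the system prompt text, and screenshot.png are hypothetical names introduced here only to match the index-1 image access (conversation[1]["content"][0]["image"]) that cpu_inference relies on.

```python
# Hedged usage sketch; not part of the commit. Assumes app.py exposes
# cpu_inference, model, tokenizer, and processor at module level.
from PIL import Image

def build_conversation(pil_image, instruction):
    # Hypothetical helper: system turn at index 0, user turn at index 1 whose
    # first content item carries the image, matching cpu_inference's
    # conversation[1]["content"][0]["image"] access.
    return [
        {"role": "system", "content": [{"type": "text", "text": "You are a GUI grounding assistant."}]},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": pil_image},
                {"type": "text", "text": instruction},
            ],
        },
    ]

if __name__ == "__main__":
    image = Image.open("screenshot.png").convert("RGB")  # placeholder path
    conversation = build_conversation(image, "Click the Submit button")
    result = cpu_inference(conversation, model, tokenizer, processor)
    print(result["topk_points"], result["response"])
```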