Update app.py
app.py
CHANGED
@@ -123,42 +123,140 @@ def pil_to_binary_mask(pil_image, threshold=0):
     return output_mask
 
 @spaces.GPU
-def start_tryon_full_body(
-    mask_bottoms_np = np.array(mask_bottoms)
-    return combined_image_pil
+def start_tryon_full_body(tops_img, bottoms_img, model_parse_tops, model_parse_bottoms, is_checked, is_checked_crop, denoise_steps, seed):
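+    # Move the OpenPose body model, the try-on pipeline, and its UNet encoder onto the GPU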
+    device = "cuda"
+    openpose_model.preprocessor.body_estimation.model.to(device)
+    pipe.to(device)
+    pipe.unet_encoder.to(device)
 
+    # Convert and resize images
+    tops_img = tops_img.convert("RGB").resize((768, 1024))
+    bottoms_img = bottoms_img.convert("RGB").resize((768, 1024))
 
+    # Process tops image
+    human_img_orig_tops = model_parse_tops.convert("RGB").resize((768, 1024))
+
+    if is_checked:
+        # Automatic mask generation for the tops
+        mask_tops, _ = get_mask_location('hd', "upper_body", model_parse_tops, {})
+        mask_tops = mask_tops.resize((768, 1024))
+    else:
+        mask_tops = pil_to_binary_mask(model_parse_tops.convert("RGB").resize((768, 1024)))
+
+    mask_gray_tops = (1 - transforms.ToTensor()(mask_tops)) * tensor_transfrom(human_img_orig_tops)
+    mask_gray_tops = to_pil_image((mask_gray_tops + 1.0) / 2.0)
 
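+    # Estimate a DensePose map with detectron2's apply_net to condition the try-on pass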
+    human_img_arg_tops = _apply_exif_orientation(human_img_orig_tops.resize((384, 512)))
+    human_img_arg_tops = convert_PIL_to_numpy(human_img_arg_tops, format="BGR")
 
+    args = apply_net.create_argument_parser().parse_args(('show', './configs/densepose_rcnn_R_50_FPN_s1x.yaml', './ckpt/densepose/model_final_162be9.pkl', 'dp_segm', '-v', '--opts', 'MODEL.DEVICE', 'cuda'))
+    pose_img_tops = args.func(args, human_img_arg_tops)
+    pose_img_tops = pose_img_tops[:, :, ::-1]  # BGR -> RGB
+    pose_img_tops = Image.fromarray(pose_img_tops).resize((768, 1024))
 
+    # Process bottoms image
+    human_img_orig_bottoms = model_parse_bottoms.convert("RGB").resize((768, 1024))
+
+    if is_checked:
+        # Automatic mask generation for the bottoms
+        mask_bottoms, _ = get_mask_location('hd', "lower_body", model_parse_bottoms, {})
+        mask_bottoms = mask_bottoms.resize((768, 1024))
+    else:
+        mask_bottoms = pil_to_binary_mask(model_parse_bottoms.convert("RGB").resize((768, 1024)))
 
+    mask_gray_bottoms = (1 - transforms.ToTensor()(mask_bottoms)) * tensor_transfrom(human_img_orig_bottoms)
+    mask_gray_bottoms = to_pil_image((mask_gray_bottoms + 1.0) / 2.0)
 
+    human_img_arg_bottoms = _apply_exif_orientation(human_img_orig_bottoms.resize((384, 512)))
+    human_img_arg_bottoms = convert_PIL_to_numpy(human_img_arg_bottoms, format="BGR")
 
+    pose_img_bottoms = args.func(args, human_img_arg_bottoms)
+    pose_img_bottoms = pose_img_bottoms[:, :, ::-1]  # BGR -> RGB
+    pose_img_bottoms = Image.fromarray(pose_img_bottoms).resize((768, 1024))
 
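+    # Run both try-on passes without gradients, in mixed precision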
+    with torch.no_grad():
+        with torch.cuda.amp.autocast():
+            prompt_tops = "model is wearing tops"
+            negative_prompt_tops = "monochrome, lowres, bad anatomy, worst quality, low quality"
+            prompt_bottoms = "model is wearing bottoms"
+            negative_prompt_bottoms = "monochrome, lowres, bad anatomy, worst quality, low quality"
+
+            # Encode prompts
+            (
+                prompt_embeds_tops,
+                negative_prompt_embeds_tops,
+                pooled_prompt_embeds_tops,
+                negative_pooled_prompt_embeds_tops,
+            ) = pipe.encode_prompt(
+                prompt_tops,
+                num_images_per_prompt=1,
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt_tops,
+            )
+
+            (
+                prompt_embeds_bottoms,
+                negative_prompt_embeds_bottoms,
+                pooled_prompt_embeds_bottoms,
+                negative_pooled_prompt_embeds_bottoms,
+            ) = pipe.encode_prompt(
+                prompt_bottoms,
+                num_images_per_prompt=1,
+                do_classifier_free_guidance=True,
+                negative_prompt=negative_prompt_bottoms,
+            )
+
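+            # Batch the pose maps and garment images as fp16 tensors on the GPU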
+            pose_img_tops = tensor_transfrom(pose_img_tops).unsqueeze(0).to(device, torch.float16)
+            pose_img_bottoms = tensor_transfrom(pose_img_bottoms).unsqueeze(0).to(device, torch.float16)
+            garm_tensor_tops = tensor_transfrom(tops_img).unsqueeze(0).to(device, torch.float16)
+            garm_tensor_bottoms = tensor_transfrom(bottoms_img).unsqueeze(0).to(device, torch.float16)
+
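+            # Seed once so both passes draw from the same generator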
+            generator = torch.Generator(device).manual_seed(seed) if seed is not None else None
+            images_tops = pipe(
+                prompt_embeds=prompt_embeds_tops.to(device, torch.float16),
+                negative_prompt_embeds=negative_prompt_embeds_tops.to(device, torch.float16),
+                pooled_prompt_embeds=pooled_prompt_embeds_tops.to(device, torch.float16),
+                negative_pooled_prompt_embeds=negative_pooled_prompt_embeds_tops.to(device, torch.float16),
+                num_inference_steps=denoise_steps,
+                generator=generator,
+                strength=1.0,
+                pose_img=pose_img_tops.to(device, torch.float16),
+                text_embeds_cloth=prompt_embeds_tops.to(device, torch.float16),
+                cloth=garm_tensor_tops.to(device, torch.float16),
+                mask_image=mask_tops,
+                image=human_img_orig_tops,
+                height=1024,
+                width=768,
+                ip_adapter_image=tops_img.resize((768, 1024)),
+                guidance_scale=2.0,
+            )[0]
+
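+            # Second pass: same pipeline, conditioned on the bottoms garment and lower-body mask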
+            images_bottoms = pipe(
+                prompt_embeds=prompt_embeds_bottoms.to(device, torch.float16),
+                negative_prompt_embeds=negative_prompt_embeds_bottoms.to(device, torch.float16),
+                pooled_prompt_embeds=pooled_prompt_embeds_bottoms.to(device, torch.float16),
+                negative_pooled_prompt_embeds=negative_pooled_prompt_embeds_bottoms.to(device, torch.float16),
+                num_inference_steps=denoise_steps,
+                generator=generator,
+                strength=1.0,
+                pose_img=pose_img_bottoms.to(device, torch.float16),
+                text_embeds_cloth=prompt_embeds_bottoms.to(device, torch.float16),
+                cloth=garm_tensor_bottoms.to(device, torch.float16),
+                mask_image=mask_bottoms,
+                image=human_img_orig_bottoms,
+                height=1024,
+                width=768,
+                ip_adapter_image=bottoms_img.resize((768, 1024)),
+                guidance_scale=2.0,
+            )[0]
+
+    # Combine images (the pipeline returns a list of PIL images; take the first of each)
+    combined_img = Image.new("RGB", (768, 2048))  # Height is 2x the original to accommodate both images
+    combined_img.paste(images_tops[0], (0, 0))
+    combined_img.paste(images_bottoms[0], (0, 1024))  # Paste bottoms image below tops
+
+    return combined_img, mask_gray_tops  # Or another mask, depending on your needs
 
 
 @spaces.GPU
@@ -313,33 +411,40 @@ def tryon():
 def tryon_full():
     data = request.json
 
-    # Decode
+    # Decode images
     tops_image = decode_image_from_base64(data['tops_image'])
     bottoms_image = decode_image_from_base64(data['bottoms_image'])
     model_parse_tops = decode_image_from_base64(data['model_parse_tops'])
     model_parse_bottoms = decode_image_from_base64(data['model_parse_bottoms'])
 
+    # Retrieve additional parameters
+    is_checked = data.get('use_auto_mask', True)
+    is_checked_crop = data.get('use_auto_crop', False)
+    denoise_steps = int(data.get('denoise_steps', 30))
+    seed = int(data.get('seed', 42))
 
-    # Call the
-    output_image = start_tryon_full_body(
+    # Call the start_tryon_full_body function
+    output_image, mask_image = start_tryon_full_body(
         tops_image,
         bottoms_image,
         model_parse_tops,
         model_parse_bottoms,
+        is_checked,
+        is_checked_crop,
+        denoise_steps,
+        seed
     )
 
     # Convert output image to base64
     output_base64 = encode_image_to_base64(output_image)
+    mask_base64 = encode_image_to_base64(mask_image)
 
     return jsonify({
-        'output_image': output_base64
+        'output_image': output_base64,
+        'mask_image': mask_base64
     })
 
+
 if __name__ == "__main__":
     app.run(debug=True, host="0.0.0.0", port=7860)
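
For reference, a minimal client for this endpoint might look like the sketch below. The route path and host are assumptions (the @app.route decorator is outside this diff), and the file names are placeholders; the JSON keys match the handler above.

import base64
import requests

def encode_file(path):
    # The endpoint expects base64-encoded image strings
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")

payload = {
    "tops_image": encode_file("tops.jpg"),
    "bottoms_image": encode_file("bottoms.jpg"),
    "model_parse_tops": encode_file("person.jpg"),
    "model_parse_bottoms": encode_file("person.jpg"),
    "use_auto_mask": True,
    "denoise_steps": 30,
    "seed": 42,
}

resp = requests.post("http://localhost:7860/tryon_full", json=payload)  # assumed route
result = resp.json()
with open("output.png", "wb") as f:
    f.write(base64.b64decode(result["output_image"]))
with open("mask.png", "wb") as f:
    f.write(base64.b64decode(result["mask_image"]))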