Spaces:

jamino30
/

salient-style-transfer

Running

App Files Files Community

jamino30 committed on Nov 22, 2024

Commit

28ac920

verified ·

1 Parent(s): ecf0440

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

app.py +3 -1
inference.py +31 -54
u2net/inference.py +11 -7

app.py CHANGED Viewed

@@ -30,7 +30,8 @@ def load_model_without_module(model, model_path):
         name = k[7:] if k.startswith('module.') else k
         new_state_dict[name] = v
     model.load_state_dict(new_state_dict)
 model = VGG_19().to(device).eval()
 for param in model.parameters():
     param.requires_grad = False
@@ -43,6 +44,7 @@ style_options = {' '.join(style_file.split('.')[0].split('_')): f'./style_images
 lrs = np.logspace(np.log10(0.001), np.log10(0.1), 10).tolist()
 img_size = 512
 cached_style_features = {}
 for style_name, style_img_path in style_options.items():
     style_img = preprocess_img_from_path(style_img_path, img_size)[0].to(device)

         name = k[7:] if k.startswith('module.') else k
         new_state_dict[name] = v
     model.load_state_dict(new_state_dict)
+# load models
 model = VGG_19().to(device).eval()
 for param in model.parameters():
     param.requires_grad = False
 lrs = np.logspace(np.log10(0.001), np.log10(0.1), 10).tolist()
 img_size = 512
+# store style(s) features
 cached_style_features = {}
 for style_name, style_img_path in style_options.items():
     style_img = preprocess_img_from_path(style_img_path, img_size)[0].to(device)

inference.py CHANGED Viewed

@@ -1,37 +1,24 @@
-import os
-from tqdm import tqdm
 import torch
 import torch.optim as optim
 import torch.nn.functional as F
 from torchvision.transforms.functional import gaussian_blur
-def _gram_matrix(feature):
-    batch_size, n_feature_maps, height, width = feature.size()
-    new_feature = feature.view(batch_size * n_feature_maps, height * width)
-    return torch.mm(new_feature, new_feature.t())
-def _compute_loss(generated_features, content_features, style_features, resized_bg_masks, alpha, beta):
-    content_loss = 0
-    style_loss = 0
-    w_l = 1 / len(generated_features)
-    for i, (gf, cf, sf) in enumerate(zip(generated_features, content_features, style_features)):
-        content_loss += F.mse_loss(gf, cf)
-        if resized_bg_masks:
-            blurred_bg_mask = gaussian_blur(resized_bg_masks[i], kernel_size=5)
-            masked_gf = gf * blurred_bg_mask
-            masked_sf = sf * blurred_bg_mask
-            G = _gram_matrix(masked_gf)
-            A = _gram_matrix(masked_sf)
-        else:
-            G = _gram_matrix(gf)
-            A = _gram_matrix(sf)
-        style_loss += w_l * F.mse_loss(G, A)
-    total_loss = alpha * content_loss + beta * style_loss
-    return content_loss, style_loss, total_loss
 def inference(
     *,
@@ -41,7 +28,7 @@ def inference(
     content_image_norm,
     style_features,
     apply_to_background,
-    lr,
     iterations=101,
     optim_caller=optim.AdamW,
     alpha=1,
@@ -49,43 +36,33 @@ def inference(
 ):
     generated_image = content_image.clone().requires_grad_(True)
     optimizer = optim_caller([generated_image], lr=lr)
-    min_losses = [float('inf')] * iterations
     with torch.no_grad():
         content_features = model(content_image)
-        resized_bg_masks = []
         if apply_to_background:
-            segmentation_output = sod_model(content_image_norm)[0]
-            segmentation_output = torch.sigmoid(segmentation_output)
-            segmentation_mask = (segmentation_output > 0.7).float()
-            background_mask = (segmentation_mask == 0).float()
-            foreground_mask = 1 - background_mask
-            for cf in content_features:
-                _, _, h_i, w_i = cf.shape
-                bg_mask = F.interpolate(background_mask.unsqueeze(1), size=(h_i, w_i), mode='bilinear', align_corners=False)
-                resized_bg_masks.append(bg_mask)
-    def closure(iter):
         optimizer.zero_grad()
         generated_features = model(generated_image)
-        content_loss, style_loss, total_loss = _compute_loss(
-            generated_features, content_features, style_features, resized_bg_masks, alpha, beta
         )
         total_loss.backward()
-        # log loss
-        min_losses[iter] = min(min_losses[iter], total_loss.item())
         return total_loss
-    for iter in tqdm(range(iterations)):
-        optimizer.step(lambda: closure(iter))
         if apply_to_background:
             with torch.no_grad():
-                foreground_mask_resized = F.interpolate(foreground_mask.unsqueeze(1), size=generated_image.shape[2:], mode='nearest')
-                generated_image.data = generated_image.data * (1 - foreground_mask_resized) + content_image.data * foreground_mask_resized
     return generated_image

 import torch
 import torch.optim as optim
 import torch.nn.functional as F
 from torchvision.transforms.functional import gaussian_blur
+from tqdm import tqdm
def gram_matrix(feature):
    """Return the Gram matrix of a batch of feature maps.

    The input is flattened to ``(b * c, h * w)`` -- one row per feature map
    across the whole batch -- and multiplied by its own transpose. No
    normalisation is applied to the result.

    Args:
        feature: 4-D tensor whose size unpacks as ``(b, c, h, w)``.

    Returns:
        Tensor of shape ``(b * c, b * c)`` holding the inner products
        between every pair of flattened feature maps.
    """
    b, c, h, w = feature.size()
    # reshape() instead of view(): view() raises on non-contiguous input
    # (e.g. a permuted or sliced tensor), while reshape() copies only when
    # it has to and is otherwise identical.
    flat = feature.reshape(b * c, h * w)
    return flat @ flat.t()
def compute_loss(generated, content, style, bg_masks, alpha, beta):
    """Compute the weighted content, style and total losses.

    Content loss is the sum over layers of the MSE between ``generated``
    and ``content`` features. Style loss is the sum over layers of the MSE
    between the Gram matrices of ``generated`` and ``style`` features, each
    term divided by ``len(generated)``. When ``bg_masks`` is truthy, both
    the generated and the style features of a layer are multiplied by that
    layer's mask before the Gram matrix is taken.

    NOTE(review): the previous revision of this function passed each mask
    through ``gaussian_blur(..., kernel_size=5)`` before applying it; this
    version uses the mask as given -- confirm that was intentional.

    Args:
        generated: per-layer features of the image being optimised.
        content: per-layer features of the content image.
        style: per-layer features of the style image.
        bg_masks: per-layer masks, or a falsy value to disable masking.
        alpha: weight applied to the content loss.
        beta: weight applied to the style loss.

    Returns:
        Tuple ``(alpha * content_loss, beta * style_loss, total)`` where
        ``total`` is the sum of the two weighted terms.
    """
    n_layers = len(generated)
    # A falsy bg_masks means "no masking": pair every layer with None.
    layer_masks = bg_masks or [None] * n_layers

    content_loss = sum(F.mse_loss(gf, cf) for gf, cf in zip(generated, content))

    style_loss = 0
    for gen_feat, style_feat, mask in zip(generated, style, layer_masks):
        if mask is not None:
            gen_feat = gen_feat * mask
            style_feat = style_feat * mask
        layer_term = F.mse_loss(gram_matrix(gen_feat), gram_matrix(style_feat))
        style_loss = style_loss + layer_term / n_layers

    weighted_content = alpha * content_loss
    weighted_style = beta * style_loss
    return weighted_content, weighted_style, weighted_content + weighted_style
 def inference(
     *,
     content_image_norm,
     style_features,
     apply_to_background,
+    lr=5e-2,
     iterations=101,
     optim_caller=optim.AdamW,
     alpha=1,
 ):
     generated_image = content_image.clone().requires_grad_(True)
     optimizer = optim_caller([generated_image], lr=lr)
     with torch.no_grad():
         content_features = model(content_image)
+        bg_masks = None
         if apply_to_background:
+            seg_output = torch.sigmoid(sod_model(content_image_norm)[0])
+            bg_mask = (seg_output <= 0.7).float()
+            bg_masks = [
+                F.interpolate(bg_mask.unsqueeze(1), size=cf.shape[2:], mode='bilinear', align_corners=False)
+                for cf in content_features
+            ]
+    def closure():
         optimizer.zero_grad()
         generated_features = model(generated_image)
+        content_loss, style_loss, total_loss = compute_loss(
+            generated_features, content_features, style_features, bg_masks, alpha, beta
         )
         total_loss.backward()
         return total_loss
+    for _ in tqdm(range(iterations)):
+        optimizer.step(closure)
         if apply_to_background:
             with torch.no_grad():
+                fg_mask = F.interpolate(1 - bg_masks[0], size=generated_image.shape[2:], mode='nearest')
+                generated_image.data.mul_(1 - fg_mask).add_(content_image.data * fg_mask)
     return generated_image

u2net/inference.py CHANGED Viewed

@@ -9,19 +9,22 @@ from matplotlib.gridspec import GridSpec
 from model import U2Net
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 def preprocess_image(image_path):
     img = Image.open(image_path).convert('RGB')
     preprocess = transforms.Compose([
-        transforms.Resize((512, 512)),
         transforms.ToTensor(),
         transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
     ])
     img = preprocess(img).unsqueeze(0).to(device)
     return img
-def run_inference(model, image_path, threshold=0.5):
     input_img = preprocess_image(image_path)
     with torch.no_grad():
         d1, *_ = model(input_img)
@@ -47,15 +50,16 @@ def overlay_segmentation(original_image, binary_mask, alpha=0.5):
 if __name__ == '__main__':
     # ---
-    model_path = 'results/u2net-duts-msra.safetensors'
-    image_path = 'images/ladies.jpg'
     # ---
     model = U2Net().to(device)
     model = nn.DataParallel(model)
     model.load_state_dict(load_file(model_path, device=device.type))
     model.eval()
-    mask = run_inference(model, image_path, threshold=None)
     mask_with_threshold = run_inference(model, image_path, threshold=0.7)
     fig = plt.figure(figsize=(10, 10))
@@ -74,4 +78,4 @@ if __name__ == '__main__':
         ax.axis('off')
     plt.subplots_adjust(left=0, right=1, top=1, bottom=0)
-    plt.savefig('inference-output.jpg', format='jpg', bbox_inches='tight', pad_inches=0)

 from model import U2Net
def _pick_device():
    """Return the preferred torch.device: CUDA, then Apple MPS, then CPU."""
    if torch.cuda.is_available():
        return torch.device('cuda')
    # Older torch builds have no `torch.backends.mps`; treat a missing
    # backend as "not available" instead of raising AttributeError.
    mps_backend = getattr(torch.backends, 'mps', None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device('mps')
    return torch.device('cpu')


# Module-level device used by the preprocessing and model-loading code.
device = _pick_device()
 def preprocess_image(image_path):
     """Load an image file and convert it into a normalised input batch.

     The image is converted to RGB, resized to 512x512 with bilinear
     interpolation, turned into a tensor, normalised per channel with the
     mean/std listed below, given a leading batch dimension and moved to
     the module-level ``device``.

     Args:
         image_path: path of the image file to open.

     Returns:
         Tensor with a leading batch dimension of 1, located on ``device``.
     """
     # Image.open keeps the underlying file open; the context manager closes
     # it as soon as convert() has produced an independent in-memory copy.
     with Image.open(image_path) as source:
         img = source.convert('RGB')
     preprocess = transforms.Compose([
         transforms.Resize((512, 512), interpolation=transforms.InterpolationMode.BILINEAR),
         transforms.ToTensor(),
         transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
     ])
     # unsqueeze(0) adds the batch dimension the model call expects.
     return preprocess(img).unsqueeze(0).to(device)
+def run_inference(model, image_path, threshold=None):
     input_img = preprocess_image(image_path)
     with torch.no_grad():
         d1, *_ = model(input_img)
 if __name__ == '__main__':
     # ---
+    model_path = '../testing/u2net-duts-msra.safetensors'
+    filename = input('Filename: ')
+    image_path = f'../content_images/{filename}'
     # ---
     model = U2Net().to(device)
     model = nn.DataParallel(model)
     model.load_state_dict(load_file(model_path, device=device.type))
     model.eval()
+    mask = run_inference(model, image_path)
     mask_with_threshold = run_inference(model, image_path, threshold=0.7)
     fig = plt.figure(figsize=(10, 10))
         ax.axis('off')
     plt.subplots_adjust(left=0, right=1, top=1, bottom=0)
+    plt.savefig('../testing/inference-output.jpg', format='jpg', bbox_inches='tight', pad_inches=0)