import torch
from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation, DPTImageProcessor, DPTForDepthEstimation
from PIL import Image, ImageFilter
import numpy as np
import gradio as gr

# Load pre-trained models and image processors
# (SegformerImageProcessor/DPTImageProcessor are the current names for the
# deprecated SegformerFeatureExtractor/DPTFeatureExtractor classes)
seg_processor = SegformerImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
seg_model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
depth_processor = DPTImageProcessor.from_pretrained("Intel/dpt-large")
depth_model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")
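# Note: both models run on CPU by default; call .to("cuda") on the models
# (and move the processor outputs accordingly) if a GPU is available.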

def process_image(image):
    # Normalize the input: force RGB (drops alpha / grayscale) and a fixed working size
    image = image.convert("RGB").resize((512, 512))
    
    # Perform semantic segmentation
    seg_inputs = seg_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        seg_outputs = seg_model(**seg_inputs)
    # SegFormer logits come out at 1/4 the input resolution (128x128 here),
    # so upsample them to the working size before taking the per-pixel argmax
    seg_logits = torch.nn.functional.interpolate(
        seg_outputs.logits, size=image.size[::-1], mode="bilinear", align_corners=False
    )
    segmentation = seg_logits.argmax(dim=1)[0].numpy()
    
    # Create binary mask for the 'person' class (index 12 in the ADE20K label map)
    person_class_index = 12
    binary_mask = (segmentation == person_class_index).astype(np.uint8) * 255
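    # The mask is returned for visualization only; the blur composite below
    # is driven purely by the depth map, not by the segmentation result.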
    
    # Perform depth estimation
    depth_inputs = depth_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        depth_outputs = depth_model(**depth_inputs)
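    # DPT predicts relative inverse depth: larger values mean nearer the camera.
    # Inverting the normalized map therefore gives background pixels the highest blur weight.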
    predicted_depth = depth_outputs.predicted_depth[0].cpu().numpy()
    # Normalize to [0, 1]; the small epsilon guards against a constant depth map
    normalized_depth = (predicted_depth - predicted_depth.min()) / (predicted_depth.max() - predicted_depth.min() + 1e-8)
    inverted_depth = 1 - normalized_depth
    # DPT's output resolution need not match the input, so resize the weight map back to 512x512
    depth_weight_resized = np.array(Image.fromarray((inverted_depth * 255).astype(np.uint8)).resize((512, 512))) / 255.0
    depth_weight_resized = depth_weight_resized[:, :, np.newaxis]  # add a channel axis to broadcast against HxWx3 images
    
    # Create blurred background effect
    blurred_image = image.filter(ImageFilter.GaussianBlur(radius=15))
    original_np = np.array(image).astype(np.float32)
    blurred_np = np.array(blurred_image).astype(np.float32)
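    # Per-pixel alpha blend: weight 0 keeps the sharp original, weight 1 takes the fully blurred frame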
    composite_np = (1 - depth_weight_resized) * original_np + depth_weight_resized * blurred_np
    composite_image = Image.fromarray(np.clip(composite_np, 0, 255).astype(np.uint8))
    
    # Return results
    binary_mask_image = Image.fromarray(binary_mask)
    depth_map_image = Image.fromarray((normalized_depth * 255).astype(np.uint8))
    return image, binary_mask_image, depth_map_image, composite_image
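
# A possible refinement (a sketch, not wired into process_image above): keep
# detected person pixels sharp regardless of depth by zeroing their blur weight
# before compositing, e.g.:
#     person = (segmentation == person_class_index)[:, :, np.newaxis]
#     depth_weight_resized = np.where(person, 0.0, depth_weight_resized)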

# Create Gradio interface
interface = gr.Interface(
    fn=process_image,
    # gr.inputs / gr.outputs were removed in Gradio 3.x; gr.Image is the current API
    inputs=gr.Image(type="pil", label="Input Image"),
    outputs=[
        gr.Image(type="pil", label="Original Image"),
        gr.Image(type="pil", label="Segmentation Mask"),
        gr.Image(type="pil", label="Depth Map"),
        gr.Image(type="pil", label="Blurred Background Effect"),
    ],
    title="Semantic Segmentation and Depth Estimation",
    description="Upload an image to generate a person segmentation mask, a depth map, and a depth-guided background-blur effect."
)

# Launch the interface
if __name__ == "__main__":
    interface.launch()