import torch
import numpy as np
import gradio as gr
from PIL import Image, ImageFilter
from transformers import (
    SegformerImageProcessor,
    SegformerForSemanticSegmentation,
    DPTImageProcessor,
    DPTForDepthEstimation,
)
# Load the pre-trained models and their image processors
# (SegformerImageProcessor/DPTImageProcessor are the current names for the
# deprecated *FeatureExtractor classes in recent transformers releases)
seg_processor = SegformerImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
seg_model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
depth_processor = DPTImageProcessor.from_pretrained("Intel/dpt-large")
depth_model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")
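# Optional sketch (an assumption, not part of the original Space): to run on
# GPU, move both models to CUDA once at startup and send each processor's
# output tensors to the same device before the forward passes, e.g.:
#
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   seg_model.to(device)
#   depth_model.to(device)
#   # ...then inside process_image: seg_inputs = seg_inputs.to(device),
#   # and likewise for depth_inputs (BatchFeature supports .to(device))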
def process_image(image):
    # Preprocess the input image: force RGB (uploads may be RGBA or grayscale)
    # and match the 512x512 resolution the SegFormer checkpoint was trained on
    image = image.convert("RGB").resize((512, 512))
    # Perform semantic segmentation
    seg_inputs = seg_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        seg_outputs = seg_model(**seg_inputs)
    # SegFormer emits logits at 1/4 of the input resolution; upsample them
    # back to 512x512 so the mask lines up with the displayed image
    seg_logits = torch.nn.functional.interpolate(
        seg_outputs.logits, size=image.size[::-1], mode="bilinear", align_corners=False
    )
    segmentation = torch.argmax(seg_logits, dim=1)[0].numpy()

    # Create a binary mask for the 'person' class (index 12 in ADE20K)
    person_class_index = 12
    binary_mask = (segmentation == person_class_index).astype(np.uint8) * 255
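    # Sanity check (optional): the id-to-label mapping ships with the model
    # config, so seg_model.config.id2label[person_class_index] should read
    # "person" for this ADE20K checkpoint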
    # Perform depth estimation
    depth_inputs = depth_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        depth_outputs = depth_model(**depth_inputs)
    predicted_depth = depth_outputs.predicted_depth[0].cpu().numpy()

    # DPT predicts relative inverse depth (larger values = closer), so after
    # normalizing to [0, 1] we invert it to get a weight that grows with distance
    normalized_depth = (predicted_depth - predicted_depth.min()) / (predicted_depth.max() - predicted_depth.min())
    inverted_depth = 1 - normalized_depth

    # Resize the weight map to the image size and add a channel axis so it
    # broadcasts against the (H, W, 3) image arrays below
    depth_weight_resized = np.array(
        Image.fromarray((inverted_depth * 255).astype(np.uint8)).resize((512, 512))
    ) / 255.0
    depth_weight_resized = depth_weight_resized[:, :, np.newaxis]
    # Create the blurred background effect: a per-pixel blend between the sharp
    # original and a heavily blurred copy, weighted by estimated distance
    blurred_image = image.filter(ImageFilter.GaussianBlur(radius=15))
    original_np = np.array(image).astype(np.float32)
    blurred_np = np.array(blurred_image).astype(np.float32)
    composite_np = (1 - depth_weight_resized) * original_np + depth_weight_resized * blurred_np
    composite_image = Image.fromarray(np.clip(composite_np, 0, 255).astype(np.uint8))

    # Return results as PIL images
    binary_mask_image = Image.fromarray(binary_mask)
    depth_map_image = Image.fromarray((normalized_depth * 255).astype(np.uint8))
    return image, binary_mask_image, depth_map_image, composite_image
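# Quick local smoke test (hypothetical file names, not part of the Space):
#
#   img = Image.open("portrait.jpg")
#   original, mask, depth, bokeh = process_image(img)
#   bokeh.save("portrait_bokeh.jpg")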
# Create the Gradio interface
# (gr.Image replaces the gr.inputs/gr.outputs namespaces removed in Gradio 4)
interface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Image(type="pil", label="Original Image"),
        gr.Image(type="pil", label="Segmentation Mask"),
        gr.Image(type="pil", label="Depth Map"),
        gr.Image(type="pil", label="Blurred Background Effect"),
    ],
    title="Semantic Segmentation and Depth Estimation",
    description="Upload an image to generate a segmentation mask, depth map, and blurred background effect.",
)
# Launch the interface
if __name__ == "__main__":
    interface.launch()
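# To run locally (assumed dependencies, exact versions untested):
#   pip install torch transformers gradio pillow numpy
#   python app.py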