# HuggingFace Space script: real-estate image analysis (CLIP zero-shot
# classification + BLIP visual question answering), served with Gradio.
from PIL import Image
import json
import gradio as gr
import requests
from transformers import CLIPProcessor, CLIPModel, pipeline, BlipProcessor, BlipForConditionalGeneration
# Load the CLIP model/processor and a visual-question-answering pipeline from
# local directories ("model", "tokenizer", "vqa").
# NOTE(review): the processor is loaded from a directory named "tokenizer" --
# confirm it actually contains a full CLIPProcessor (image processor +
# tokenizer), not just tokenizer files.
model = CLIPModel.from_pretrained("model")
processor = CLIPProcessor.from_pretrained("tokenizer")
vqa_pipeline = pipeline("visual-question-answering",model="vqa")
# --- Zero-shot / VQA prompt configuration -----------------------------------

# Candidate room/space categories scored by CLIP zero-shot classification.
space_type_labels = ["living room", "bedroom", "kitchen", "terrace", "closet", "bathroom",
                     "dining room", "office", "garage", "garden",
                     "balcony", "attic", "hallway", "gym", "playroom", "storage room",
                     "studio", "is_exterior", "swimming pool", "others"]

# Yes/no questions put to the BLIP VQA model when the space is a terrace.
equipment_questions = [
    "Does the image show outdoor furniture?",
    "Does the image show a parasol?",
    "Does the image show a pergola?",
    "Does the image show a grill?",
    "Does the image show a heater?",
    "Does the image show outdoor lighting?",
    "Does the image show planters?",
    "Does the image show water features?",
    "Does the image show floor coverings?",
    "Does the image show decorative items?",
    "Does the image show entertainment equipment?",
    "Does the image show protective materials?"
]

# Contribution of each affirmative answer to the equipment score (sums to 1.0).
weights = {
    "Does the image show outdoor furniture?": 0.15,
    "Does the image show a parasol?": 0.05,
    "Does the image show a pergola?": 0.1,
    "Does the image show a grill?": 0.15,
    "Does the image show a heater?": 0.1,
    "Does the image show outdoor lighting?": 0.1,
    "Does the image show planters?": 0.05,
    "Does the image show water features?": 0.1,
    "Does the image show floor coverings?": 0.05,
    "Does the image show decorative items?": 0.05,
    "Does the image show entertainment equipment?": 0.05,
    "Does the image show protective materials?": 0.05
}

# CLIP prompts describing lighting conditions, and the short labels reported
# for them.  The two lists are zipped positionally, so they MUST stay in the
# same order.
luminosity_classes = [
    "A room filled with natural daylight.",
    "A room lit by artificial lights.",
    "A dark room with no lights."
]
# BUG FIX: was ['natural_light', 'no_light', 'artificial_light'] -- the
# artificial-light and no-light labels were swapped relative to the prompt
# order above, mislabelling those two probabilities.  Downstream code only
# reads 'natural_light' (index 0), which is unaffected by this fix.
luminosity_labels = ['natural_light', 'artificial_light', 'no_light']

# CLIP prompts describing what is visible through a window, plus the short
# labels they are reported under (zipped positionally with view_labels).
view_questions = [
    "This is a city view, showing buildings, streets, and urban infrastructure.",
    "This is a view of greenery, focusing on trees, parks, gardens, and other vegetative elements.",
    "This is a mountain view, showing mountains, hills, and rocky landscapes.",
    "This is a view of the sea, focusing on oceans, beaches, and large bodies of water.",
    "There is no window to see outside"  # fixed typo: was "There is not window ..."
]
view_labels = ['city', 'greenery', 'mountain', 'sea', 'not_clear']

# Prompts used to estimate how trustworthy the view classification is; the
# first entry (unobstructed outward view) is the one consumed downstream.
certainty_classes = [
    'Windows, balconies, or terraces with an unobstructed outward view',
    'exterior view of a building or appearance of a house or apartment',
    'Artificial or fake view of any city or sea',
    'View obstructed by objects such as buildings, trees, or other structures',
    'Hallway or interior view with no outdoor visibility'
]

# Prompts separating real photographs from computer-generated renders.
render_classes = [
    "This is a realistic photo of an interior.",
    "This is a computer-generated render of an interior.",
    "This is a realistic photo of an exterior.",
    "This is a computer-generated render of an exterior."
]

# Minimum probability required to accept the top space-type prediction.
# 0 effectively disables the gate (softmax probabilities are always >= 0).
threshold = 0
def calculate_equipment_score(image_results, weights):
    """Sum the weights of every question answered affirmatively.

    image_results maps question text -> bool (VQA answered "yes");
    weights maps the same questions -> their score contribution.
    """
    total = 0
    for question, present in image_results.items():
        if present:
            total += weights[question]
    return total
def calculate_luminosity_score(processed_image):
    """CLIP zero-shot classification of the image over the luminosity prompts.

    Returns a dict mapping each entry of luminosity_labels to its softmax
    probability (prompt order in luminosity_classes defines the pairing).
    """
    encoded = processor(text=luminosity_classes, images=processed_image,
                        return_tensors="pt", padding=True)
    distribution = model(**encoded).logits_per_image.softmax(dim=1)
    return dict(zip(luminosity_labels, distribution.squeeze().tolist()))
def calculate_space_type(processed_image):
    """CLIP zero-shot classification over space_type_labels.

    Returns a dict mapping each space-type label to its softmax probability.
    """
    encoded = processor(text=space_type_labels, images=processed_image,
                        return_tensors="pt", padding=True)
    logits = model(**encoded).logits_per_image
    probs = logits.softmax(dim=1).squeeze().tolist()
    return {label: p for label, p in zip(space_type_labels, probs)}
def certainty(processed_image):
    """Score the image against the certainty prompts with CLIP.

    Returns {prompt: probability}; the first entry corresponds to an
    unobstructed outward view, which is what callers read.
    """
    encoded = processor(text=certainty_classes, images=processed_image,
                        return_tensors="pt", padding=True)
    distribution = model(**encoded).logits_per_image.softmax(dim=1)
    return dict(zip(certainty_classes, distribution.squeeze().tolist()))
def views(processed_image):
    """CLIP zero-shot scores of the window-view prompts.

    Returns a dict keyed by the short view_labels, zipped positionally with
    the probabilities of the view_questions prompts.
    """
    encoded = processor(text=view_questions, images=processed_image,
                        return_tensors="pt", padding=True)
    probs = model(**encoded).logits_per_image.softmax(dim=1).squeeze().tolist()
    return dict(zip(view_labels, probs))
def calculate_is_render(processed_image):
    """CLIP zero-shot score over the photo-vs-render prompts.

    Returns the combined probability of the two "realistic photo" prompts.
    NOTE(review): despite the name, a HIGHER value therefore means the image
    looks like a real photograph, not a render -- confirm that consumers of
    image_data["is_render"] interpret it this way.
    """
    encoded = processor(text=render_classes, images=processed_image,
                        return_tensors="pt", padding=True)
    probabilities = model(**encoded).logits_per_image.softmax(dim=1).squeeze().tolist()
    scores = dict(zip(render_classes, probabilities))
    return (scores["This is a realistic photo of an interior."]
            + scores["This is a realistic photo of an exterior."])
def generate_answer(image):
    # Full analysis pipeline for one uploaded PIL image; returns a
    # pretty-printed JSON string with space-type distribution, terrace
    # equipment score, luminosity, window/view data and a photo-vs-render
    # score.
    processed_image = image
    # Downscale before inference (the model preprocessors resize again).
    processed_image = processed_image.resize((256, 256))
    # Result skeleton; fields stay None unless the relevant branch runs.
    image_data = {
        "image_context": None,
        "equipment_score": None,
        "luminosity_score": {"score": None,
                             "has_window": {
                                 "score": None,
                                 "answer": None
                             }},
        "view_type": {"views": None, "certainty_score": None}
    }
    space_type_score = calculate_space_type(processed_image)
    max_space_type = max(space_type_score, key=space_type_score.get)
    # NOTE(review): threshold is 0 and softmax probabilities are >= 0, so this
    # gate always passes; space_type itself is never read afterwards, and
    # "patio" is not in space_type_labels, so the remap below is dead code.
    if space_type_score[max_space_type] >= threshold:
        space_type = max_space_type.lower()
        if space_type == "patio":
            space_type = "terrace"
    # The full probability distribution (not just the winner) is reported.
    image_data["image_context"] = space_type_score
    image_results = {}
    if max_space_type == "terrace":
        # Terrace: ask the VQA model each yes/no equipment question and fold
        # the affirmative answers into a weighted score.
        for question in equipment_questions:
            result = vqa_pipeline(processed_image, question, top_k=1)
            answer = result[0]['answer'].lower() == "yes"
            image_results[question] = answer
        equipment_score = calculate_equipment_score(image_results, weights)
        image_data["equipment_score"] = equipment_score
    if max_space_type in ["bedroom", "living room", "kitchen"]:
        # Interior rooms: natural-light score, window check, view analysis.
        luminosity_score = calculate_luminosity_score(processed_image)
        image_data["luminosity_score"]['score'] = luminosity_score['natural_light']
        result = vqa_pipeline(processed_image, "Is there a real window?", top_k=1)
        # has_window is the raw top-1 VQA dict (answer + score); it replaces
        # the nested placeholder built above.
        has_window = result[0]
        image_data["luminosity_score"]["has_window"] = has_window
        view = views(processed_image)
        image_data["view_type"]["views"] = view
        certainty_score = certainty(processed_image)
        # NOTE(review): keeps only the first certainty class ("unobstructed
        # outward view") -- relies on dict insertion order.
        certainty_score = list(certainty_score.values())[0]
        image_data["view_type"]["certainty_score"] = certainty_score
    is_render = calculate_is_render(processed_image)
    image_data["is_render"] = is_render
    return json.dumps(image_data, indent=4)
# --- Gradio UI ---------------------------------------------------------------
# Single image input, JSON-string output; fixes a stray trailing "|" artifact
# that made the launch line a syntax error.
image_input = gr.Image(type="pil", label="Upload Image")
iface = gr.Interface(
    fn=generate_answer,
    inputs=[image_input],
    outputs="text",
    title="Vision intelligence",
    description="Upload an image"
)
iface.launch()