File size: 8,671 Bytes
7ea81c0
3df1923
7ea81c0
 
 
 
c73a989
7ea81c0
8522307
7ea81c0
 
10b78eb
7ea81c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8fc2390
 
 
7ea81c0
991ef04
8fc2390
 
 
 
 
 
7ea81c0
 
bcca343
bb5e3d2
 
 
 
 
bcca343
 
bb5e3d2
56127c0
 
 
 
92a1835
bb5e3d2
 
fe38dd5
991ef04
 
10b78eb
 
 
 
 
7ea81c0
 
991ef04
7ea81c0
991ef04
 
 
 
 
 
7ea81c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
991ef04
7ea81c0
 
 
 
 
 
 
 
8fc2390
7ea81c0
8fc2390
7ea81c0
 
 
 
fb7055a
8fc2390
10b78eb
 
 
991ef04
7ea81c0
 
10b78eb
8fc2390
 
 
 
 
 
 
 
 
 
 
 
 
5b81acc
8fc2390
 
 
 
 
 
 
 
 
7ea81c0
e40ee10
 
7ea81c0
46194a0
7ea81c0
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
from PIL import Image
import json
import gradio as gr
import requests
from transformers import CLIPProcessor, CLIPModel, pipeline, BlipProcessor, BlipForConditionalGeneration

# Models are loaded from local directories relative to the working dir:
# "model" (CLIP weights), "tokenizer" (CLIP processor), "vqa" (VQA pipeline).
model = CLIPModel.from_pretrained("model")
processor = CLIPProcessor.from_pretrained("tokenizer")
vqa_pipeline = pipeline("visual-question-answering",model="vqa")

# Candidate labels for zero-shot space-type classification with CLIP.
# NOTE(review): "is_exterior" sits among room names — presumably a catch-all
# exterior bucket; confirm it behaves as intended as a CLIP text prompt.
space_type_labels = ["living room", "bedroom", "kitchen", "terrace", "closet","bathroom", "dining room", "office", "garage", "garden",
    "balcony", "attic", "hallway","gym", "playroom", "storage room", "studio","is_exterior","swimming pool","others"]

# Yes/no questions asked of the VQA model when the image is a terrace.
# Each question doubles as a key into `weights` below, so the strings
# must match exactly.
equipment_questions = [
    "Does the image show outdoor furniture?",
    "Does the image show a parasol?",
    "Does the image show a pergola?",
    "Does the image show a grill?",
    "Does the image show a heater?",
    "Does the image show outdoor lighting?",
    "Does the image show planters?",
    "Does the image show water features?",
    "Does the image show floor coverings?",
    "Does the image show decorative items?",
    "Does the image show entertainment equipment?",
    "Does the image show protective materials?"
]

# Per-question contribution to the terrace equipment score (sums to 1.0
# when every item is present).
weights = {
    "Does the image show outdoor furniture?": 0.15,
    "Does the image show a parasol?": 0.05,
    "Does the image show a pergola?": 0.1,
    "Does the image show a grill?": 0.15,
    "Does the image show a heater?": 0.1,
    "Does the image show outdoor lighting?": 0.1,
    "Does the image show planters?": 0.05,
    "Does the image show water features?": 0.1,
    "Does the image show floor coverings?": 0.05,
    "Does the image show decorative items?": 0.05,
    "Does the image show entertainment equipment?": 0.05,
    "Does the image show protective materials?": 0.05
}

# CLIP prompt sentences for lighting classification; order must stay in
# sync with `luminosity_labels` below (zipped positionally).
luminosity_classes = [
    'A well-lit room with abundant natural light, showcasing windows or a balcony through which sunlight passes unobstructed.',
    'A room depicted in darkness, where there is minimal or no visible light source.',
    'A room illuminated by artificial light sources such as lamps or ceiling lights.'
]

# Earlier, shorter prompt wording kept for reference.
#luminosity_classes = [
#    "A room filled with natural daylight.",
#    "A room lit by artificial lights.",
#    "A dark room with no lights."
#]

# Short labels paired 1:1 with `luminosity_classes` above.
luminosity_labels = ['natural_light', 'no_light', 'artificial_light']

# Earlier question-style prompts kept for reference.
#view_questions = [
    #"Is this a panoramic view?",
#    "Is this a city view?",
#    "Is this a view of greenery?",
#    "Is this a mountain view?",
#    "Is this a view of the sea?"
#]

# CLIP prompt sentences for view classification; order must stay in sync
# with `view_labels` below (zipped positionally). The panoramic prompt is
# intentionally disabled.
view_questions = [
   # "This is a panoramic view, showing a wide expanse of the surroundings.",
    "This is a city view, showing buildings, streets, and urban areas.",
    "This is a view of greenery, including trees, parks, or gardens.",
    "This is a mountain view, showing mountains and hilly landscapes.",
    "This is a view of the sea"
]

# Short labels paired 1:1 with `view_questions` above.
view_labels = ['city', 'greenery', 'mountain', 'sea']

# CLIP prompts for judging how trustworthy the detected view is. The FIRST
# entry ("unobstructed outward view") is the one whose probability is
# reported as the certainty score in `generate_answer`.
certainty_classes = [
    'Windows, balconies, or terraces with an unobstructed outward view',
    'exterior view of a building or appearance of a house or apartment',
    'Artificial or fake view of any city or sea',
    'View obstructed by objects such as buildings, trees, or other structures',
    'Hallway or interior view with no outdoor visibility'
]

# Earlier wording of the certainty prompts kept for reference.
#certainty_classes = ['Windows, balconies, or terraces with an unobstructed outward view','Exterior view appearance of a house or apartment','unreal picture or fake of any city or sea view','view unfree from any obstructive objects such as buildings, trees, or other structures, and ideally seen through windows, balconies, or terraces','hallway']

# CLIP prompts for distinguishing real photos from CG renders, used by
# `calculate_is_render` (which indexes this list by exact string).
render_classes = [
    "This is a realistic photo of an interior.",
    "This is a computer-generated render of an interior.",
    "This is a realistic photo of an exterior.",
    "This is a computer-generated render of an exterior."
]

# NOTE(review): `threshold` is never read anywhere in this file — confirm
# whether it is dead or meant to gate one of the scores.
threshold = 0

def calculate_equipment_score(image_results, weights):
    """Total the weights of every equipment item detected in the image.

    Args:
        image_results: mapping of question text -> bool (True when the VQA
            model answered "yes" for that item).
        weights: mapping of question text -> numeric weight.

    Returns:
        The sum of weights for all present items (0 when none are present).
    """
    total = 0
    for question, present in image_results.items():
        if present:
            total += weights[question]
    return total

def calculate_luminosity_score(processed_image):
    """Zero-shot CLIP classification of the image's lighting.

    Scores the image against the three `luminosity_classes` prompts and
    maps the resulting softmax probabilities onto `luminosity_labels`.

    Args:
        processed_image: PIL image to classify.

    Returns:
        dict mapping 'natural_light' / 'no_light' / 'artificial_light'
        to their probabilities.
    """
    clip_inputs = processor(
        text=luminosity_classes,
        images=processed_image,
        return_tensors="pt",
        padding=True,
    )
    clip_outputs = model(**clip_inputs)
    probabilities = clip_outputs.logits_per_image.softmax(dim=1).squeeze().tolist()
    return dict(zip(luminosity_labels, probabilities))

def calculate_space_type(processed_image):
    """Zero-shot CLIP classification of the room/space type.

    Scores the image against every label in `space_type_labels` and
    returns the full probability distribution.

    Args:
        processed_image: PIL image to classify.

    Returns:
        dict mapping each space-type label to its softmax probability.
    """
    clip_inputs = processor(
        text=space_type_labels,
        images=processed_image,
        return_tensors="pt",
        padding=True,
    )
    clip_outputs = model(**clip_inputs)
    probabilities = clip_outputs.logits_per_image.softmax(dim=1).squeeze().tolist()
    return dict(zip(space_type_labels, probabilities))

def certainty(processed_image):
    """Zero-shot CLIP scoring of how trustworthy the outward view is.

    Scores the image against the `certainty_classes` prompts (unobstructed
    view, exterior shot, fake view, obstructed view, no outdoor visibility).

    Args:
        processed_image: PIL image to classify.

    Returns:
        dict mapping each certainty prompt sentence to its probability,
        in the same order as `certainty_classes`.
    """
    clip_inputs = processor(
        text=certainty_classes,
        images=processed_image,
        return_tensors="pt",
        padding=True,
    )
    clip_outputs = model(**clip_inputs)
    probabilities = clip_outputs.logits_per_image.softmax(dim=1).squeeze().tolist()
    return dict(zip(certainty_classes, probabilities))

def views(processed_image):
    """Zero-shot CLIP classification of the type of view from the image.

    Scores the image against the `view_questions` prompts and maps the
    probabilities onto the short names in `view_labels`.

    Args:
        processed_image: PIL image to classify.

    Returns:
        dict mapping 'city' / 'greenery' / 'mountain' / 'sea' to their
        probabilities.
    """
    clip_inputs = processor(
        text=view_questions,
        images=processed_image,
        return_tensors="pt",
        padding=True,
    )
    clip_outputs = model(**clip_inputs)
    probabilities = clip_outputs.logits_per_image.softmax(dim=1).squeeze().tolist()
    return dict(zip(view_labels, probabilities))

def calculate_is_render(processed_image):
    """Estimate the probability that the image is a computer-generated render.

    Scores the image against the four `render_classes` prompts with CLIP and
    returns the combined probability of the two "computer-generated render"
    classes (interior + exterior).

    Fix: the previous version summed the two "realistic photo" classes, so
    a value named `is_render_prob` actually held P(real photo) — the exact
    complement of what the name (and the `image_data["is_render"]` caller)
    implies. It now sums the render classes.

    Args:
        processed_image: PIL image to classify.

    Returns:
        float probability in [0, 1] that the image is a render.
    """
    render_inputs = processor(text=render_classes, images=processed_image, return_tensors="pt", padding=True)
    render_outputs = model(**render_inputs)
    render_probs = render_outputs.logits_per_image.softmax(dim=1)
    render_probabilities_list = render_probs.squeeze().tolist()
    render_score = dict(zip(render_classes, render_probabilities_list))
    # Probability mass assigned to the CG prompts, not the realistic ones.
    is_render_prob = (
        render_score["This is a computer-generated render of an interior."]
        + render_score["This is a computer-generated render of an exterior."]
    )
    return is_render_prob

def generate_answer(image):
    """Run the full image-analysis pipeline and return a JSON report.

    Pipeline:
      1. CLIP zero-shot classification of the space type (full distribution
         is reported under "image_context").
      2. If the top space type is "terrace": weighted VQA equipment score.
      3. VQA window check ("Is there a real window?") used both as the
         "validation" flag and as a gate for step 4.
      4. For bedrooms / living rooms / kitchens with a confident window:
         luminosity score, view classification, and view-certainty score.

    Cleanup vs. the previous version: removed a dead branch — the condition
    `space_type_score[max_space_type] >= 0` is always true for softmax
    probabilities, the `space_type` variable it assigned was never read,
    and the "patio" remap was unreachable ("patio" is not a label).

    Args:
        image: PIL image supplied by the Gradio input component.

    Returns:
        str: pretty-printed JSON with keys "image_context", "validation",
        "equipment_score", "luminosity_score" and "view_type".
    """
    processed_image = image

    image_data = {
        "image_context": None,
        "validation": None,
        "equipment_score": None,
        "luminosity_score": {"score": None},
        "view_type": {"views": None, "certainty_score": None},
    }

    # Space-type classification; the full distribution goes into the report.
    space_type_score = calculate_space_type(processed_image)
    max_space_type = max(space_type_score, key=space_type_score.get)
    image_data["image_context"] = space_type_score

    # Terrace images get a weighted equipment score from yes/no VQA answers.
    if max_space_type == "terrace":
        image_results = {}
        for question in equipment_questions:
            result = vqa_pipeline(processed_image, question, top_k=1)
            image_results[question] = result[0]["answer"].lower() == "yes"
        image_data["equipment_score"] = calculate_equipment_score(image_results, weights)

    # Window check: the top VQA answer's confidence drives validation.
    result = vqa_pipeline(processed_image, "Is there a real window?", top_k=1)
    has_window = result[0]
    image_data["validation"] = "pass validation" if has_window["score"] > 0.9 else "No candidate"
    window_exists = has_window["answer"].lower() == "yes" and has_window["score"] > 0.9

    if max_space_type in ["bedroom", "living room", "kitchen"] and window_exists:
        luminosity_score = calculate_luminosity_score(processed_image)
        image_data["luminosity_score"]["score"] = luminosity_score["natural_light"]

        image_data["view_type"]["views"] = views(processed_image)

        # Certainty = probability of the first class (unobstructed outward
        # view); relies on dict preserving the certainty_classes order.
        certainty_score = certainty(processed_image)
        image_data["view_type"]["certainty_score"] = list(certainty_score.values())[0]

    #is_render = calculate_is_render(processed_image)
    #image_data["is_render"] = is_render

    return json.dumps(image_data, indent=4)


# Gradio UI: a single PIL-image input wired to `generate_answer`, which
# returns the JSON report as plain text.
image_input = gr.Image(type="pil", label="Upload Image")

iface = gr.Interface(
    fn=generate_answer, 
    inputs=[image_input], 
    outputs="text",
    title="Vision intelligence",
    description="Upload an image"
)

# Start the Gradio web server (blocking call).
iface.launch()