gemini_vision_objects

Running

File size: 3,495 Bytes

import os
import re
import io
import streamlit as st
from PIL import Image, ImageDraw
from google import genai
from google.genai import types

# Helper functions
# One number: digits with an optional fractional part, or a bare ".5".
# Anything this matches is guaranteed to be accepted by float().
_BOX_NUMBER = r'(\d+(?:\.\d*)?|\.\d+)'
# Exactly four comma-separated numbers inside one pair of square brackets,
# tolerating whitespace after "[", around the commas and before "]".
_BOX_PATTERN = re.compile(
    r'\[\s*' + r'\s*,\s*'.join([_BOX_NUMBER] * 4) + r'\s*\]'
)

def parse_list_boxes(text):
    """Extract bounding boxes from a model response.

    Every ``[a, b, c, d]`` group of four non-negative numbers found in
    ``text`` becomes one box; surrounding prose, JSON wrappers and outer
    list brackets are ignored.

    Args:
        text: Response text to scan. ``None`` or an empty string is treated
            as containing no boxes.

    Returns:
        A list of boxes, each a list of four floats in the order they
        appear in the text. Empty when nothing matches.
    """
    if not text:
        return []
    return [
        [float(value) for value in match]
        for match in _BOX_PATTERN.findall(text)
    ]

def draw_bounding_boxes(image, boxes):
    """Draw a red outline for every bounding box onto ``image``.

    Args:
        image: PIL image to annotate. It is modified in place.
        boxes: Iterable of ``[ymin, xmin, ymax, xmax]`` boxes, relative to the
            image size. Values may be fractions in ``[0, 1]`` or lie on the
            ``[0, 1000]`` scale that Gemini uses for box coordinates.

    Returns:
        The same ``image`` object, with one rectangle drawn per box.
    """
    boxes = list(boxes)
    draw = ImageDraw.Draw(image)
    width, height = image.size

    # A coordinate above 1 cannot be a fraction of the image, so the whole
    # batch is read as 0-1000 normalised values. Heuristic: a batch whose
    # values are all <= 1 is always treated as fractions.
    scale = 1000.0 if any(c > 1 for box in boxes for c in box) else 1.0

    for box in boxes:
        ymin, xmin, ymax, xmax = (c / scale for c in box)
        # Order the corners: the model may return them swapped, and newer
        # Pillow releases appear to reject a rectangle whose second corner
        # lies before the first.
        left, right = sorted((xmin * width, xmax * width))
        top, bottom = sorted((ymin * height, ymax * height))
        draw.rectangle([left, top, right, bottom], outline="red", width=3)
    return image

# Streamlit UI
st.title("Bildanalyse mit Gemini")
col1, col2 = st.columns(2)

# Both requests use the same model. The detection call previously targeted
# "gemini-1.0-pro-vision", which has been retired.
gemini_model = "gemini-2.0-flash-exp"

with col1:
    uploaded_file = st.file_uploader("Bild hochladen", type=["jpg", "png", "jpeg"])
    object_name = st.text_input("Objekt zur Erkennung", placeholder="z.B. 'Auto', 'Person'")

    if uploaded_file and object_name:
        image = Image.open(uploaded_file)
        st.image(image, caption="Hochgeladenes Bild", use_container_width=True)

        if st.button("Analysieren"):
            with st.spinner("Analysiere Bild..."):
                try:
                    # Image preparation: send the uploaded bytes unchanged.
                    # Re-encoding through PIL is lossy for JPEG, and PIL's
                    # format name is not always a valid MIME subtype
                    # (e.g. "MPO" for some camera JPEGs).
                    image_part = types.Part.from_bytes(
                        data=uploaded_file.getvalue(),
                        mime_type=uploaded_file.type or f"image/{image.format.lower()}",
                    )

                    # API client
                    client = genai.Client(api_key=os.getenv("KEY"))

                    # Image description
                    desc_response = client.models.generate_content(
                        model=gemini_model,
                        contents=["Beschreibe dieses Bild detailliert.", image_part]
                    )

                    # Object detection
                    detection_prompt = (
                        f"Gib alle Bounding Boxes für {object_name} im Format "
                        "[ymin, xmin, ymax, xmax] als Liste. Nur die Liste zurückgeben!"
                    )
                    box_response = client.models.generate_content(
                        model=gemini_model,
                        contents=[detection_prompt, image_part]
                    )

                    # Processing: a response without text parts yields None,
                    # which is treated as "no boxes" rather than an error.
                    boxes = parse_list_boxes(box_response.text or "")
                    annotated_image = image.copy()

                    if boxes:
                        annotated_image = draw_bounding_boxes(annotated_image, boxes)
                        result_text = f"{len(boxes)} {object_name} erkannt"
                    else:
                        result_text = "Keine Objekte gefunden"

                    # Show the results in the right-hand column
                    with col2:
                        st.write("## Beschreibung:")
                        st.write(desc_response.text)

                        st.write("## Objekterkennung:")
                        st.write(result_text)
                        # use_container_width matches the first st.image call;
                        # use_column_width is deprecated.
                        st.image(annotated_image, caption="Erkannte Objekte", use_container_width=True)

                # Top-level UI boundary: report any failure to the user
                # instead of crashing the app.
                except Exception as e:
                    st.error(f"Fehler: {e}")