import os
import re
import io
import streamlit as st
from PIL import Image, ImageDraw
from google import genai
from google.genai import types

# Helper functions
def parse_list_boxes(text):
    """Extrahiert Bounding Boxes aus dem Antworttext"""
    pattern = r'\[([\d\.]+),\s*([\d\.]+),\s*([\d\.]+),\s*([\d\.]+)\]'
    matches = re.findall(pattern, text)
    return [[float(m) for m in match] for match in matches]

def draw_bounding_boxes(image, boxes):
    """Zeichnet Bounding Boxes auf das Bild"""
    draw = ImageDraw.Draw(image)
    width, height = image.size
    
    for box in boxes:
        # Clamp all coordinates to the range 0-1
        ymin = max(0.0, min(1.0, box[0]))
        xmin = max(0.0, min(1.0, box[1]))
        ymax = max(0.0, min(1.0, box[2]))
        xmax = max(0.0, min(1.0, box[3]))
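        # Assumption: if the model returns coordinates on a 0-1000 scale instead of
        # 0-1 (some Gemini detection examples do), clamping alone collapses the box;
        # in that case the values would need to be divided by 1000 first.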

        # Draw the box outline
        draw.rectangle([
            xmin * width,
            ymin * height,
            xmax * width,
            ymax * height
        ], outline="#00FF00", width=7)  # Neon green mit dicken Linien
    return image

# Streamlit UI
st.title("Objekterkennung mit Gemini")
col1, col2 = st.columns(2)

with col1:
    uploaded_file = st.file_uploader("Bild hochladen", type=["jpg", "png", "jpeg"])
    object_name = st.text_input("Objekt zur Erkennung", placeholder="z.B. 'Auto', 'Person'")

    if uploaded_file and object_name:
        image = Image.open(uploaded_file)
        width, height = image.size
        st.image(image, caption="Hochgeladenes Bild", use_container_width=True)

        if st.button("Analysieren"):
            with st.spinner("Analysiere Bild..."):
                try:
                    # Prepare the image for the API request
                    image_bytes = io.BytesIO()
                    image.save(image_bytes, format=image.format)
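                    # image.format is set by PIL for files opened from an upload,
                    # so it can be reused to derive the MIME type below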
                    image_part = types.Part.from_bytes(
                        data=image_bytes.getvalue(),
                        mime_type=f"image/{image.format.lower()}"
                    )

                    # API client
                    client = genai.Client(api_key=os.getenv("KEY"))
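                    # The API key is read from the environment variable "KEY"; if no
                    # api_key were passed, genai.Client() would typically fall back to
                    # GOOGLE_API_KEY (assumption about the SDK default).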

                    # Image description
                    desc_response = client.models.generate_content(
                        model="gemini-2.0-flash-exp",
                        contents=["Beschreibe dieses Bild detailliert.", image_part]
                    )

                    # Object detection
                    detection_prompt = (
                        f"Gib exakt 4 Dezimalzahlen pro Box für alle {object_name} im Format "
                        "[ymin, xmin, ymax, xmax] als reine Python-Liste ohne weiteren Text. "
                        "Beispiel: [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]]"
                    )
                    box_response = client.models.generate_content(
                        model="gemini-2.0-flash-exp",
                        contents=[detection_prompt, image_part]
                    )
                    
                    # Parse the detection response
                    try:
                        boxes = parse_list_boxes(box_response.text)
                        st.write("**Parsed Boxes:**", boxes)
                    except Exception as e:
                        st.error(f"Parsing Error: {str(e)}")
                        boxes = []

                    annotated_image = image.copy()
                    
                    if boxes:
                        annotated_image = draw_bounding_boxes(annotated_image, boxes)
                        result_text = f"{len(boxes)} {object_name} erkannt"

                        # Zoom in on the first detected box
                        ymin, xmin, ymax, xmax = boxes[0]
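                        # Convert normalized coordinates to pixels and add a 50 px
                        # margin, clamped to the image bounds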
                        zoom_area = (
                            max(0, int(xmin * width - 50)),
                            max(0, int(ymin * height - 50)),
                            min(width, int(xmax * width + 50)),
                            min(height, int(ymax * height + 50))
                        )
                        zoomed_image = annotated_image.crop(zoom_area)

                    else:
                        result_text = "Keine Objekte gefunden"
                        zoomed_image = None

                    # Display results
                    with col2:
                        
                        st.write("## Objekterkennung:")
                        st.write(result_text)
                        
                        if boxes:
                            st.image(
                                [annotated_image, zoomed_image],
                                caption=["Gesamtbild", "Zoom auf Erkennung"],
                                width=400
                            )
                        else:
                            st.image(annotated_image, caption="Keine Objekte erkannt", width=400)
                        
                        st.write("## Beschreibung:")
                        st.write(desc_response.text)
                except Exception as e:
                    st.error(f"Fehler: {str(e)}")