import os
import re
import io

import streamlit as st
from PIL import Image, ImageDraw
from google import genai
from google.genai import types


# Helper functions
def parse_list_boxes(text):
    """Extract bounding boxes from the response text."""
    pattern = r'\[([\d\.]+),\s*([\d\.]+),\s*([\d\.]+),\s*([\d\.]+)\]'
    matches = re.findall(pattern, text)
    return [[float(m) for m in match] for match in matches]


def draw_bounding_boxes(image, boxes):
    """Draw bounding boxes onto the image."""
    draw = ImageDraw.Draw(image)
    width, height = image.size

    for box in boxes:
        # Clamp all values to the 0-1 range
        ymin = max(0.0, min(1.0, box[0]))
        xmin = max(0.0, min(1.0, box[1]))
        ymax = max(0.0, min(1.0, box[2]))
        xmax = max(0.0, min(1.0, box[3]))

        # Draw the frame
        draw.rectangle(
            [xmin * width, ymin * height, xmax * width, ymax * height],
            outline="#00FF00",  # neon green with thick lines
            width=7,
        )
    return image


# Streamlit UI
st.title("Object Detection with Gemini")
col1, col2 = st.columns(2)

with col1:
    uploaded_file = st.file_uploader("Upload image", type=["jpg", "png", "jpeg"])
    object_name = st.text_input("Object to detect", placeholder="e.g. 'car', 'person'")

    if uploaded_file and object_name:
        image = Image.open(uploaded_file)
        width, height = image.size
        st.image(image, caption="Uploaded image", use_container_width=True)

        if st.button("Analyze"):
            with st.spinner("Analyzing image..."):
                try:
                    # Prepare the image; fall back to PNG if PIL reports no format
                    img_format = image.format or "PNG"
                    image_bytes = io.BytesIO()
                    image.save(image_bytes, format=img_format)
                    image_part = types.Part.from_bytes(
                        data=image_bytes.getvalue(),
                        mime_type=f"image/{img_format.lower()}",
                    )

                    # API client
                    client = genai.Client(api_key=os.getenv("KEY"))

                    # Image description
                    desc_response = client.models.generate_content(
                        model="gemini-2.0-flash-exp",
                        contents=["Describe this image in detail.", image_part],
                    )

                    # Object detection
                    detection_prompt = (
                        f"Return exactly 4 decimal numbers per box for every {object_name} "
                        "in the format [ymin, xmin, ymax, xmax] as a plain Python list "
                        "with no additional text. "
                        "Example: [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]]"
                    )
                    box_response = client.models.generate_content(
                        model="gemini-2.0-flash-exp",
                        contents=[detection_prompt, image_part],
                    )

                    # Parse the response
                    try:
                        boxes = parse_list_boxes(box_response.text)
                        st.write("**Parsed Boxes:**", boxes)
                    except Exception as e:
                        st.error(f"Parsing error: {str(e)}")
                        boxes = []

                    annotated_image = image.copy()
                    if boxes:
                        annotated_image = draw_bounding_boxes(annotated_image, boxes)
                        result_text = f"{len(boxes)} {object_name} detected"

                        # Zoom in on the first box with a 50 px margin
                        ymin, xmin, ymax, xmax = boxes[0]
                        zoom_area = (
                            max(0, int(xmin * width - 50)),
                            max(0, int(ymin * height - 50)),
                            min(width, int(xmax * width + 50)),
                            min(height, int(ymax * height + 50)),
                        )
                        zoomed_image = annotated_image.crop(zoom_area)
                    else:
                        result_text = "No objects found"
                        zoomed_image = None

                    # Display the results
                    with col2:
                        st.write("## Object detection:")
                        st.write(result_text)
                        if boxes:
                            st.image(
                                [annotated_image, zoomed_image],
                                caption=["Full image", "Zoom on detection"],
                                width=400,
                            )
                        else:
                            st.image(annotated_image, caption="No objects detected", width=400)

                        st.write("## Description:")
                        st.write(desc_response.text)

                except Exception as e:
                    st.error(f"Error: {str(e)}")
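
# --- Usage sketch (assumptions labeled): the file name app.py is hypothetical,
# and the KEY environment variable is whatever os.getenv("KEY") above expects.
# A minimal way to run this app locally might look like:
#
#   pip install streamlit pillow google-genai
#   export KEY="your-gemini-api-key"
#   streamlit run app.py
#
# For reference, parse_list_boxes() expects the model reply to contain literal
# bracketed quadruples, e.g. (hypothetical reply):
#
#   parse_list_boxes("[[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]]")
#   # -> [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8]]
#
# Replies in any other coordinate convention (e.g. integers on a 0-1000 scale)
# would be clamped to the image edge by draw_bounding_boxes().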