Spaces:

orasul
/

deki

Running

App Files Community

orasul committed on Jun 22

Commit

fe57b03

1 Parent(s): 02544bc

Fix bugs

Browse files

Files changed (1) hide show

app.py +44 -9

app.py CHANGED Viewed

@@ -136,17 +136,47 @@ def handle_action(openai_key, image, prompt):
             image_description = run_wrapper(original_image_path, temp_dir, skip_ocr=False, skip_spell=True, json_mini=True)
             with open(yolo_updated_image_path, "rb") as f:
                 yolo_updated_img_bytes = f.read()
             _, new_b64 = preprocess_image(yolo_updated_img_bytes, threshold=2000, scale=0.5, fmt="png")
         base64_image_url = f"data:image/png;base64,{new_b64}"
-        prompt_text = f"""You are an AI agent... (rest of your long prompt)
-        The user said: "{prompt}"
-        Description: "{image_description}" """
-        messages = [{"role": "user", "content": [{"type": "text", "text": prompt_text}, {"type": "image_url", "image_url": {"url": base64_image_url, "detail": "high"}}]}]
         response = llm_client.chat.completions.create(model="gpt-4.1", messages=messages, temperature=0.2)
         return response.choices[0].message.content.strip()
@@ -164,7 +194,7 @@ def handle_analyze(image, output_style):
             image_path = os.path.join(temp_dir, "image_to_analyze.png")
             save_base64_image(image_b64, image_path)
-            is_mini = (output_style == "Mini JSON")
             description_str = run_wrapper(image_path=image_path, output_dir=temp_dir, json_mini=is_mini)
             parsed_json = json.loads(description_str)
@@ -185,7 +215,7 @@ def handle_analyze_yolo(image, output_style):
             yolo_image_path = os.path.join(temp_dir, f"{request_id}_yolo_updated.png")
             save_base64_image(image_b64, image_path)
-            is_mini = (output_style == "Mini JSON")
             description_str = run_wrapper(image_path=image_path, output_dir=temp_dir, json_mini=is_mini)
             parsed_json = json.loads(description_str)
@@ -215,15 +245,20 @@ def handle_generate(openai_key, image, prompt):
             image_description = run_wrapper(image_path=original_image_path, output_dir=temp_dir, json_mini=False)
             with open(yolo_updated_image_path, "rb") as f:
                 yolo_updated_img_bytes = f.read()
             _, new_b64 = preprocess_image(yolo_updated_img_bytes, threshold=1500, scale=0.5, fmt="png")
         base64_image_url = f"data:image/png;base64,{new_b64}"
         messages = [
             {"role": "user", "content": [
-                {"type": "text", "text": f'"Prompt: {prompt}"\nImage description:\n"{image_description}"'},
                 {"type": "image_url", "image_url": {"url": base64_image_url, "detail": "high"}}
             ]}
         ]
@@ -272,7 +307,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             with gr.Row():
                 image_input_analyze = gr.Image(type="pil", label="Upload Screen Image")
                 with gr.Column():
-                    output_style_analyze = gr.Radio(["Standard JSON", "Mini JSON"], label="Output Format", value="Standard JSON")
                     analyze_button = gr.Button("Analyze Image", variant="primary")
             analyze_output = gr.JSON(label="JSON Description")
             with gr.Row():
@@ -284,7 +319,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             with gr.Row():
                 image_input_yolo = gr.Image(type="pil", label="Upload Screen Image")
                 with gr.Column():
-                     output_style_yolo = gr.Radio(["Standard JSON", "Mini JSON"], label="Output Format", value="Standard JSON")
                      yolo_button = gr.Button("Analyze and Visualize", variant="primary")
             with gr.Row():
                 yolo_image_output = gr.Image(label="YOLO Annotated Image")

             image_description = run_wrapper(original_image_path, temp_dir, skip_ocr=False, skip_spell=True, json_mini=True)
+            if not os.path.exists(yolo_updated_image_path):
+                raise FileNotFoundError(f"YOLO updated image not found at {yolo_updated_image_path}")
             with open(yolo_updated_image_path, "rb") as f:
                 yolo_updated_img_bytes = f.read()
             _, new_b64 = preprocess_image(yolo_updated_img_bytes, threshold=2000, scale=0.5, fmt="png")
         base64_image_url = f"data:image/png;base64,{new_b64}"
+        prompt_text = f"""You are an AI agent that controls a mobile device and sees the content of screen.
+User can ask you about some information or to do some task and you need to do these tasks.
+You can only respond with one of these commands (in quotes) but some variables are dynamic
+and can be changed based on the context:
+1. "Swipe left. From start coordinates 300, 400" (or other coordinates) (Goes right)
+2. "Swipe right. From start coordinates 500, 650" (or other coordinates) (Goes left)
+3. "Swipe top. From start coordinates 600, 510" (or other coordinates) (Goes bottom)
+4. "Swipe bottom. From start coordinates 640, 500" (or other coordinates) (Goes top)
+5. "Go home"
+6. "Go back"
+8. "Open com.whatsapp" (or other app)
+9. "Tap coordinates 160, 820" (or other coordinates)
+10. "Insert text 210, 820:Hello world" (or other coordinates and text)
+11. "Screen is in a loading state. Try again" (send image again)
+12. "Answer: There are no new important mails today" (or other answer)
+13. "Finished" (task is finished)
+14. "Can't proceed" (can't understand what to do or image has problem etc.)
+The user said: "{prompt}"
+I will share the screenshot of the current state of the phone (with UI elements highlighted and the corresponding
+index of these UI elements) and the description (sizes, coordinates and indexes) of UI elements.
+Description:
+"{image_description}" """
+        messages = [
+            {"role": "user", "content": [
+                {"type": "text", "text": prompt_text},
+                # We are correctly sending the YOLO-annotated image here
+                {"type": "image_url", "image_url": {"url": base64_image_url, "detail": "high"}}
+            ]}
+        ]
         response = llm_client.chat.completions.create(model="gpt-4.1", messages=messages, temperature=0.2)
         return response.choices[0].message.content.strip()
             image_path = os.path.join(temp_dir, "image_to_analyze.png")
             save_base64_image(image_b64, image_path)
+            is_mini = (output_style == "mini JSON")
             description_str = run_wrapper(image_path=image_path, output_dir=temp_dir, json_mini=is_mini)
             parsed_json = json.loads(description_str)
             yolo_image_path = os.path.join(temp_dir, f"{request_id}_yolo_updated.png")
             save_base64_image(image_b64, image_path)
+            is_mini = (output_style == "mini JSON")
             description_str = run_wrapper(image_path=image_path, output_dir=temp_dir, json_mini=is_mini)
             parsed_json = json.loads(description_str)
             image_description = run_wrapper(image_path=original_image_path, output_dir=temp_dir, json_mini=False)
+            if not os.path.exists(yolo_updated_image_path):
+                raise FileNotFoundError(f"YOLO updated image not found at {yolo_updated_image_path}")
             with open(yolo_updated_image_path, "rb") as f:
                 yolo_updated_img_bytes = f.read()
             _, new_b64 = preprocess_image(yolo_updated_img_bytes, threshold=1500, scale=0.5, fmt="png")
         base64_image_url = f"data:image/png;base64,{new_b64}"
+        prompt_text = f'"Prompt: {prompt}"\nImage description:\n"{image_description}"'
         messages = [
             {"role": "user", "content": [
+                {"type": "text", "text": prompt_text},
                 {"type": "image_url", "image_url": {"url": base64_image_url, "detail": "high"}}
             ]}
         ]
             with gr.Row():
                 image_input_analyze = gr.Image(type="pil", label="Upload Screen Image")
                 with gr.Column():
+                    output_style_analyze = gr.Radio(["Standard JSON", "mini JSON"], label="Output Format", value="Standard JSON")
                     analyze_button = gr.Button("Analyze Image", variant="primary")
             analyze_output = gr.JSON(label="JSON Description")
             with gr.Row():
             with gr.Row():
                 image_input_yolo = gr.Image(type="pil", label="Upload Screen Image")
                 with gr.Column():
+                     output_style_yolo = gr.Radio(["Standard JSON", "mini JSON"], label="Output Format", value="Standard JSON")
                      yolo_button = gr.Button("Analyze and Visualize", variant="primary")
             with gr.Row():
                 yolo_image_output = gr.Image(label="YOLO Annotated Image")