Fix bugs
Browse files
app.py
CHANGED
@@ -136,17 +136,47 @@ def handle_action(openai_key, image, prompt):
|
|
136 |
|
137 |
image_description = run_wrapper(original_image_path, temp_dir, skip_ocr=False, skip_spell=True, json_mini=True)
|
138 |
|
|
|
|
|
139 |
with open(yolo_updated_image_path, "rb") as f:
|
140 |
yolo_updated_img_bytes = f.read()
|
141 |
|
142 |
_, new_b64 = preprocess_image(yolo_updated_img_bytes, threshold=2000, scale=0.5, fmt="png")
|
143 |
|
144 |
base64_image_url = f"data:image/png;base64,{new_b64}"
|
145 |
-
|
146 |
-
|
147 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
148 |
|
149 |
-
messages = [
|
|
|
|
|
|
|
|
|
|
|
|
|
150 |
|
151 |
response = llm_client.chat.completions.create(model="gpt-4.1", messages=messages, temperature=0.2)
|
152 |
return response.choices[0].message.content.strip()
|
@@ -164,7 +194,7 @@ def handle_analyze(image, output_style):
|
|
164 |
image_path = os.path.join(temp_dir, "image_to_analyze.png")
|
165 |
save_base64_image(image_b64, image_path)
|
166 |
|
167 |
-
is_mini = (output_style == "
|
168 |
description_str = run_wrapper(image_path=image_path, output_dir=temp_dir, json_mini=is_mini)
|
169 |
|
170 |
parsed_json = json.loads(description_str)
|
@@ -185,7 +215,7 @@ def handle_analyze_yolo(image, output_style):
|
|
185 |
yolo_image_path = os.path.join(temp_dir, f"{request_id}_yolo_updated.png")
|
186 |
save_base64_image(image_b64, image_path)
|
187 |
|
188 |
-
is_mini = (output_style == "
|
189 |
description_str = run_wrapper(image_path=image_path, output_dir=temp_dir, json_mini=is_mini)
|
190 |
|
191 |
parsed_json = json.loads(description_str)
|
@@ -215,15 +245,20 @@ def handle_generate(openai_key, image, prompt):
|
|
215 |
|
216 |
image_description = run_wrapper(image_path=original_image_path, output_dir=temp_dir, json_mini=False)
|
217 |
|
|
|
|
|
218 |
with open(yolo_updated_image_path, "rb") as f:
|
219 |
yolo_updated_img_bytes = f.read()
|
220 |
|
221 |
_, new_b64 = preprocess_image(yolo_updated_img_bytes, threshold=1500, scale=0.5, fmt="png")
|
222 |
|
223 |
base64_image_url = f"data:image/png;base64,{new_b64}"
|
|
|
|
|
|
|
224 |
messages = [
|
225 |
{"role": "user", "content": [
|
226 |
-
{"type": "text", "text":
|
227 |
{"type": "image_url", "image_url": {"url": base64_image_url, "detail": "high"}}
|
228 |
]}
|
229 |
]
|
@@ -272,7 +307,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
272 |
with gr.Row():
|
273 |
image_input_analyze = gr.Image(type="pil", label="Upload Screen Image")
|
274 |
with gr.Column():
|
275 |
-
output_style_analyze = gr.Radio(["Standard JSON", "
|
276 |
analyze_button = gr.Button("Analyze Image", variant="primary")
|
277 |
analyze_output = gr.JSON(label="JSON Description")
|
278 |
with gr.Row():
|
@@ -284,7 +319,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
284 |
with gr.Row():
|
285 |
image_input_yolo = gr.Image(type="pil", label="Upload Screen Image")
|
286 |
with gr.Column():
|
287 |
-
output_style_yolo = gr.Radio(["Standard JSON", "
|
288 |
yolo_button = gr.Button("Analyze and Visualize", variant="primary")
|
289 |
with gr.Row():
|
290 |
yolo_image_output = gr.Image(label="YOLO Annotated Image")
|
|
|
136 |
|
137 |
image_description = run_wrapper(original_image_path, temp_dir, skip_ocr=False, skip_spell=True, json_mini=True)
|
138 |
|
139 |
+
if not os.path.exists(yolo_updated_image_path):
|
140 |
+
raise FileNotFoundError(f"YOLO updated image not found at {yolo_updated_image_path}")
|
141 |
with open(yolo_updated_image_path, "rb") as f:
|
142 |
yolo_updated_img_bytes = f.read()
|
143 |
|
144 |
_, new_b64 = preprocess_image(yolo_updated_img_bytes, threshold=2000, scale=0.5, fmt="png")
|
145 |
|
146 |
base64_image_url = f"data:image/png;base64,{new_b64}"
|
147 |
+
|
148 |
+
prompt_text = f"""You are an AI agent that controls a mobile device and sees the content of screen.
|
149 |
+
User can ask you about some information or to do some task and you need to do these tasks.
|
150 |
+
You can only respond with one of these commands (in quotes) but some variables are dynamic
|
151 |
+
and can be changed based on the context:
|
152 |
+
1. "Swipe left. From start coordinates 300, 400" (or other coordinates) (Goes right)
|
153 |
+
2. "Swipe right. From start coordinates 500, 650" (or other coordinates) (Goes left)
|
154 |
+
3. "Swipe top. From start coordinates 600, 510" (or other coordinates) (Goes bottom)
|
155 |
+
4. "Swipe bottom. From start coordinates 640, 500" (or other coordinates) (Goes top)
|
156 |
+
5. "Go home"
|
157 |
+
6. "Go back"
|
158 |
+
8. "Open com.whatsapp" (or other app)
|
159 |
+
9. "Tap coordinates 160, 820" (or other coordinates)
|
160 |
+
10. "Insert text 210, 820:Hello world" (or other coordinates and text)
|
161 |
+
11. "Screen is in a loading state. Try again" (send image again)
|
162 |
+
12. "Answer: There are no new important mails today" (or other answer)
|
163 |
+
13. "Finished" (task is finished)
|
164 |
+
14. "Can't proceed" (can't understand what to do or image has problem etc.)
|
165 |
+
|
166 |
+
The user said: "{prompt}"
|
167 |
+
|
168 |
+
I will share the screenshot of the current state of the phone (with UI elements highlighted and the corresponding
|
169 |
+
index of these UI elements) and the description (sizes, coordinates and indexes) of UI elements.
|
170 |
+
Description:
|
171 |
+
"{image_description}" """
|
172 |
|
173 |
+
messages = [
|
174 |
+
{"role": "user", "content": [
|
175 |
+
{"type": "text", "text": prompt_text},
|
176 |
+
# We are correctly sending the YOLO-annotated image here
|
177 |
+
{"type": "image_url", "image_url": {"url": base64_image_url, "detail": "high"}}
|
178 |
+
]}
|
179 |
+
]
|
180 |
|
181 |
response = llm_client.chat.completions.create(model="gpt-4.1", messages=messages, temperature=0.2)
|
182 |
return response.choices[0].message.content.strip()
|
|
|
194 |
image_path = os.path.join(temp_dir, "image_to_analyze.png")
|
195 |
save_base64_image(image_b64, image_path)
|
196 |
|
197 |
+
is_mini = (output_style == "mini JSON")
|
198 |
description_str = run_wrapper(image_path=image_path, output_dir=temp_dir, json_mini=is_mini)
|
199 |
|
200 |
parsed_json = json.loads(description_str)
|
|
|
215 |
yolo_image_path = os.path.join(temp_dir, f"{request_id}_yolo_updated.png")
|
216 |
save_base64_image(image_b64, image_path)
|
217 |
|
218 |
+
is_mini = (output_style == "mini JSON")
|
219 |
description_str = run_wrapper(image_path=image_path, output_dir=temp_dir, json_mini=is_mini)
|
220 |
|
221 |
parsed_json = json.loads(description_str)
|
|
|
245 |
|
246 |
image_description = run_wrapper(image_path=original_image_path, output_dir=temp_dir, json_mini=False)
|
247 |
|
248 |
+
if not os.path.exists(yolo_updated_image_path):
|
249 |
+
raise FileNotFoundError(f"YOLO updated image not found at {yolo_updated_image_path}")
|
250 |
with open(yolo_updated_image_path, "rb") as f:
|
251 |
yolo_updated_img_bytes = f.read()
|
252 |
|
253 |
_, new_b64 = preprocess_image(yolo_updated_img_bytes, threshold=1500, scale=0.5, fmt="png")
|
254 |
|
255 |
base64_image_url = f"data:image/png;base64,{new_b64}"
|
256 |
+
|
257 |
+
prompt_text = f'"Prompt: {prompt}"\nImage description:\n"{image_description}"'
|
258 |
+
|
259 |
messages = [
|
260 |
{"role": "user", "content": [
|
261 |
+
{"type": "text", "text": prompt_text},
|
262 |
{"type": "image_url", "image_url": {"url": base64_image_url, "detail": "high"}}
|
263 |
]}
|
264 |
]
|
|
|
307 |
with gr.Row():
|
308 |
image_input_analyze = gr.Image(type="pil", label="Upload Screen Image")
|
309 |
with gr.Column():
|
310 |
+
output_style_analyze = gr.Radio(["Standard JSON", "mini JSON"], label="Output Format", value="Standard JSON")
|
311 |
analyze_button = gr.Button("Analyze Image", variant="primary")
|
312 |
analyze_output = gr.JSON(label="JSON Description")
|
313 |
with gr.Row():
|
|
|
319 |
with gr.Row():
|
320 |
image_input_yolo = gr.Image(type="pil", label="Upload Screen Image")
|
321 |
with gr.Column():
|
322 |
+
output_style_yolo = gr.Radio(["Standard JSON", "mini JSON"], label="Output Format", value="Standard JSON")
|
323 |
yolo_button = gr.Button("Analyze and Visualize", variant="primary")
|
324 |
with gr.Row():
|
325 |
yolo_image_output = gr.Image(label="YOLO Annotated Image")
|