orasul committed on
Commit
fe57b03
·
1 Parent(s): 02544bc
Files changed (1) hide show
  1. app.py +44 -9
app.py CHANGED
@@ -136,17 +136,47 @@ def handle_action(openai_key, image, prompt):
136
 
137
  image_description = run_wrapper(original_image_path, temp_dir, skip_ocr=False, skip_spell=True, json_mini=True)
138
 
 
 
139
  with open(yolo_updated_image_path, "rb") as f:
140
  yolo_updated_img_bytes = f.read()
141
 
142
  _, new_b64 = preprocess_image(yolo_updated_img_bytes, threshold=2000, scale=0.5, fmt="png")
143
 
144
  base64_image_url = f"data:image/png;base64,{new_b64}"
145
- prompt_text = f"""You are an AI agent... (rest of your long prompt)
146
- The user said: "{prompt}"
147
- Description: "{image_description}" """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
- messages = [{"role": "user", "content": [{"type": "text", "text": prompt_text}, {"type": "image_url", "image_url": {"url": base64_image_url, "detail": "high"}}]}]
 
 
 
 
 
 
150
 
151
  response = llm_client.chat.completions.create(model="gpt-4.1", messages=messages, temperature=0.2)
152
  return response.choices[0].message.content.strip()
@@ -164,7 +194,7 @@ def handle_analyze(image, output_style):
164
  image_path = os.path.join(temp_dir, "image_to_analyze.png")
165
  save_base64_image(image_b64, image_path)
166
 
167
- is_mini = (output_style == "Mini JSON")
168
  description_str = run_wrapper(image_path=image_path, output_dir=temp_dir, json_mini=is_mini)
169
 
170
  parsed_json = json.loads(description_str)
@@ -185,7 +215,7 @@ def handle_analyze_yolo(image, output_style):
185
  yolo_image_path = os.path.join(temp_dir, f"{request_id}_yolo_updated.png")
186
  save_base64_image(image_b64, image_path)
187
 
188
- is_mini = (output_style == "Mini JSON")
189
  description_str = run_wrapper(image_path=image_path, output_dir=temp_dir, json_mini=is_mini)
190
 
191
  parsed_json = json.loads(description_str)
@@ -215,15 +245,20 @@ def handle_generate(openai_key, image, prompt):
215
 
216
  image_description = run_wrapper(image_path=original_image_path, output_dir=temp_dir, json_mini=False)
217
 
 
 
218
  with open(yolo_updated_image_path, "rb") as f:
219
  yolo_updated_img_bytes = f.read()
220
 
221
  _, new_b64 = preprocess_image(yolo_updated_img_bytes, threshold=1500, scale=0.5, fmt="png")
222
 
223
  base64_image_url = f"data:image/png;base64,{new_b64}"
 
 
 
224
  messages = [
225
  {"role": "user", "content": [
226
- {"type": "text", "text": f'"Prompt: {prompt}"\nImage description:\n"{image_description}"'},
227
  {"type": "image_url", "image_url": {"url": base64_image_url, "detail": "high"}}
228
  ]}
229
  ]
@@ -272,7 +307,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
272
  with gr.Row():
273
  image_input_analyze = gr.Image(type="pil", label="Upload Screen Image")
274
  with gr.Column():
275
- output_style_analyze = gr.Radio(["Standard JSON", "Mini JSON"], label="Output Format", value="Standard JSON")
276
  analyze_button = gr.Button("Analyze Image", variant="primary")
277
  analyze_output = gr.JSON(label="JSON Description")
278
  with gr.Row():
@@ -284,7 +319,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
284
  with gr.Row():
285
  image_input_yolo = gr.Image(type="pil", label="Upload Screen Image")
286
  with gr.Column():
287
- output_style_yolo = gr.Radio(["Standard JSON", "Mini JSON"], label="Output Format", value="Standard JSON")
288
  yolo_button = gr.Button("Analyze and Visualize", variant="primary")
289
  with gr.Row():
290
  yolo_image_output = gr.Image(label="YOLO Annotated Image")
 
136
 
137
  image_description = run_wrapper(original_image_path, temp_dir, skip_ocr=False, skip_spell=True, json_mini=True)
138
 
139
+ if not os.path.exists(yolo_updated_image_path):
140
+ raise FileNotFoundError(f"YOLO updated image not found at {yolo_updated_image_path}")
141
  with open(yolo_updated_image_path, "rb") as f:
142
  yolo_updated_img_bytes = f.read()
143
 
144
  _, new_b64 = preprocess_image(yolo_updated_img_bytes, threshold=2000, scale=0.5, fmt="png")
145
 
146
  base64_image_url = f"data:image/png;base64,{new_b64}"
147
+
148
+ prompt_text = f"""You are an AI agent that controls a mobile device and sees the content of screen.
149
+ User can ask you about some information or to do some task and you need to do these tasks.
150
+ You can only respond with one of these commands (in quotes) but some variables are dynamic
151
+ and can be changed based on the context:
152
+ 1. "Swipe left. From start coordinates 300, 400" (or other coordinates) (Goes right)
153
+ 2. "Swipe right. From start coordinates 500, 650" (or other coordinates) (Goes left)
154
+ 3. "Swipe top. From start coordinates 600, 510" (or other coordinates) (Goes bottom)
155
+ 4. "Swipe bottom. From start coordinates 640, 500" (or other coordinates) (Goes top)
156
+ 5. "Go home"
157
+ 6. "Go back"
158
+ 8. "Open com.whatsapp" (or other app)
159
+ 9. "Tap coordinates 160, 820" (or other coordinates)
160
+ 10. "Insert text 210, 820:Hello world" (or other coordinates and text)
161
+ 11. "Screen is in a loading state. Try again" (send image again)
162
+ 12. "Answer: There are no new important mails today" (or other answer)
163
+ 13. "Finished" (task is finished)
164
+ 14. "Can't proceed" (can't understand what to do or image has problem etc.)
165
+
166
+ The user said: "{prompt}"
167
+
168
+ I will share the screenshot of the current state of the phone (with UI elements highlighted and the corresponding
169
+ index of these UI elements) and the description (sizes, coordinates and indexes) of UI elements.
170
+ Description:
171
+ "{image_description}" """
172
 
173
+ messages = [
174
+ {"role": "user", "content": [
175
+ {"type": "text", "text": prompt_text},
176
+ # We are correctly sending the YOLO-annotated image here
177
+ {"type": "image_url", "image_url": {"url": base64_image_url, "detail": "high"}}
178
+ ]}
179
+ ]
180
 
181
  response = llm_client.chat.completions.create(model="gpt-4.1", messages=messages, temperature=0.2)
182
  return response.choices[0].message.content.strip()
 
194
  image_path = os.path.join(temp_dir, "image_to_analyze.png")
195
  save_base64_image(image_b64, image_path)
196
 
197
+ is_mini = (output_style == "mini JSON")
198
  description_str = run_wrapper(image_path=image_path, output_dir=temp_dir, json_mini=is_mini)
199
 
200
  parsed_json = json.loads(description_str)
 
215
  yolo_image_path = os.path.join(temp_dir, f"{request_id}_yolo_updated.png")
216
  save_base64_image(image_b64, image_path)
217
 
218
+ is_mini = (output_style == "mini JSON")
219
  description_str = run_wrapper(image_path=image_path, output_dir=temp_dir, json_mini=is_mini)
220
 
221
  parsed_json = json.loads(description_str)
 
245
 
246
  image_description = run_wrapper(image_path=original_image_path, output_dir=temp_dir, json_mini=False)
247
 
248
+ if not os.path.exists(yolo_updated_image_path):
249
+ raise FileNotFoundError(f"YOLO updated image not found at {yolo_updated_image_path}")
250
  with open(yolo_updated_image_path, "rb") as f:
251
  yolo_updated_img_bytes = f.read()
252
 
253
  _, new_b64 = preprocess_image(yolo_updated_img_bytes, threshold=1500, scale=0.5, fmt="png")
254
 
255
  base64_image_url = f"data:image/png;base64,{new_b64}"
256
+
257
+ prompt_text = f'"Prompt: {prompt}"\nImage description:\n"{image_description}"'
258
+
259
  messages = [
260
  {"role": "user", "content": [
261
+ {"type": "text", "text": prompt_text},
262
  {"type": "image_url", "image_url": {"url": base64_image_url, "detail": "high"}}
263
  ]}
264
  ]
 
307
  with gr.Row():
308
  image_input_analyze = gr.Image(type="pil", label="Upload Screen Image")
309
  with gr.Column():
310
+ output_style_analyze = gr.Radio(["Standard JSON", "mini JSON"], label="Output Format", value="Standard JSON")
311
  analyze_button = gr.Button("Analyze Image", variant="primary")
312
  analyze_output = gr.JSON(label="JSON Description")
313
  with gr.Row():
 
319
  with gr.Row():
320
  image_input_yolo = gr.Image(type="pil", label="Upload Screen Image")
321
  with gr.Column():
322
+ output_style_yolo = gr.Radio(["Standard JSON", "mini JSON"], label="Output Format", value="Standard JSON")
323
  yolo_button = gr.Button("Analyze and Visualize", variant="primary")
324
  with gr.Row():
325
  yolo_image_output = gr.Image(label="YOLO Annotated Image")