prthm11 committed on
Commit
0931b84
·
verified ·
1 Parent(s): 25c097f

Update app_main.py

Browse files
Files changed (1) hide show
  1. app_main.py +500 -499
app_main.py CHANGED
@@ -1,499 +1,500 @@
1
- from flask import Flask, render_template, Response, flash, redirect, url_for, request, jsonify
2
- import cv2
3
- import numpy as np
4
- from unstructured.partition.pdf import partition_pdf
5
- import json
6
- import base64
7
- import io
8
- import os
9
- from PIL import Image, ImageEnhance, ImageDraw
10
- from imutils.perspective import four_point_transform
11
- from dotenv import load_dotenv
12
- import pytesseract
13
- from transformers import AutoProcessor, AutoModelForImageTextToText, AutoModelForVision2Seq
14
- from langchain_community.document_loaders.image_captions import ImageCaptionLoader
15
- from werkzeug.utils import secure_filename
16
- import tempfile
17
- import torch
18
- from langchain_groq import ChatGroq
19
- from langgraph.prebuilt import create_react_agent
20
- import logging
21
-
22
- # Configure logging
23
- logging.basicConfig(
24
- level=logging.DEBUG, # Use INFO or ERROR in production
25
- format="%(asctime)s [%(levelname)s] %(message)s",
26
- handlers=[
27
- logging.FileHandler("app.log"),
28
- logging.StreamHandler()
29
- ]
30
- )
31
-
32
- logger = logging.getLogger(__name__)
33
-
34
- load_dotenv()
35
- # os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
36
- groq_api_key = os.getenv("GROQ_API_KEY")
37
-
38
- llm = ChatGroq(
39
- model="meta-llama/llama-4-maverick-17b-128e-instruct",
40
- temperature=0,
41
- max_tokens=None,
42
- )
43
-
44
- app = Flask(__name__)
45
-
46
- pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
47
- poppler_path = r"C:\poppler-23.11.0\Library\bin"
48
-
49
- count = 0
50
- PDF_GET = r"E:\Pratham\2025\Harsh Sir\Scratch Vision\images\scratch_crab.pdf"
51
-
52
- OUTPUT_FOLDER = "OUTPUTS"
53
- DETECTED_IMAGE_FOLDER_PATH = os.path.join(OUTPUT_FOLDER, "DETECTED_IMAGE")
54
- IMAGE_FOLDER_PATH = os.path.join(OUTPUT_FOLDER, "SCANNED_IMAGE")
55
- JSON_FOLDER_PATH = os.path.join(OUTPUT_FOLDER, "EXTRACTED_JSON")
56
-
57
- for path in [OUTPUT_FOLDER, IMAGE_FOLDER_PATH, DETECTED_IMAGE_FOLDER_PATH, JSON_FOLDER_PATH]:
58
- os.makedirs(path, exist_ok=True)
59
-
60
- # Model Initialization
61
- try:
62
- smolvlm256m_processor = AutoProcessor.from_pretrained(
63
- "HuggingFaceTB/SmolVLM-256M-Instruct")
64
- # smolvlm256m_model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM-256M-Instruct").to("cpu")
65
- smolvlm256m_model = AutoModelForVision2Seq.from_pretrained(
66
- "HuggingFaceTB/SmolVLM-256M-Instruct",
67
- torch_dtype=torch.bfloat16 if hasattr(
68
- torch, "bfloat16") else torch.float32,
69
- _attn_implementation="eager"
70
- ).to("cpu")
71
- except Exception as e:
72
- raise RuntimeError(f"❌ Failed to load SmolVLM model: {str(e)}")
73
-
74
- # SmolVLM image captioning function
75
-
76
-
77
def get_smolvlm_caption(image: Image.Image, prompt: str = "") -> str:
    """Generate a caption for ``image`` with the module-level SmolVLM pair.

    The prompt has to carry exactly one ``<image>`` placeholder. When it has
    none, one is prepended to the stripped prompt text.

    Args:
        image: Image handed to ``smolvlm256m_processor``.
        prompt: Instruction text, with or without an ``<image>`` token.

    Returns:
        The decoded first generated sequence (special tokens skipped). Any
        failure, including a prompt with more than one ``<image>`` token, is
        reported as a string starting with "❌ Error during caption
        generation:" instead of being raised.
    """
    image_token = "<image>"
    try:
        # Inject the placeholder only when the caller did not supply one.
        if image_token not in prompt:
            prompt = f"<image> {prompt.strip()}"

        token_total = prompt.count(image_token)
        if token_total != 1:
            raise ValueError(
                f"Prompt must contain exactly 1 <image> token. Found {token_total}")

        model_inputs = smolvlm256m_processor(
            images=[image],
            text=[prompt],
            return_tensors="pt",
        )
        model_inputs = model_inputs.to("cpu")
        generated = smolvlm256m_model.generate(
            **model_inputs, max_new_tokens=100)
        first_sequence = generated[0]
        return smolvlm256m_processor.decode(
            first_sequence, skip_special_tokens=True)
    except Exception as e:
        return f"❌ Error during caption generation: {str(e)}"
94
-
95
- # --- FUNCTION: Extract images from saved PDF ---
96
-
97
-
98
def _ask_agent_about_image(agent, instruction, png_base64):
    """Send one instruction plus one base64 PNG to the agent; return its reply text."""
    content = [
        {"type": "text", "text": instruction},
        {
            "type": "image_url",
            # The payload is read back from a saved .png file, so label it as PNG.
            "image_url": {"url": f"data:image/png;base64,{png_base64}"},
        },
    ]
    response = agent.invoke(
        {"messages": [{"role": "user", "content": content}]})
    return response["messages"][-1].content


def extract_images_from_pdf(pdf_path, output_json_path):
    """Extract images from a PDF and generate structured sprite JSON.

    The PDF is partitioned with ``partition_pdf`` (``hi_res`` strategy, image
    blocks returned as base64 payloads). The raw elements are dumped to
    ``extracted.json``; every element carrying ``image_base64`` is saved as
    ``Sprite_<n>.png`` and sent twice to the LLM agent (once for a
    description, once for a name). The collected entries are written to
    ``extracted_sprites.json``.

    Args:
        pdf_path: Path of the PDF to process.
        output_json_path: Ignored; kept for backward compatibility. Output
            locations are always derived from the PDF file name under
            ``JSON_FOLDER_PATH``.

    Returns:
        Tuple ``(final_json_path, manipulated_json)`` where
        ``manipulated_json`` maps ``"Sprite <n>"`` to a dict with the keys
        ``name``, ``base64``, ``file-path`` and ``description``.

    Raises:
        RuntimeError: If partitioning, writing ``extracted.json`` or any other
            step outside the per-image loop fails. A failure on a single image
            is logged and that image is skipped.
    """
    try:
        pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]  # e.g., "scratch_crab"
        # normpath still yields backslashes on Windows but no longer mangles
        # POSIX paths the way a blanket "/" -> "\\" replacement did.
        pdf_dir = os.path.dirname(pdf_path)
        pdf_dir_path = os.path.normpath(pdf_dir) if pdf_dir else pdf_dir

        # Create subfolders
        extracted_image_subdir = os.path.join(
            DETECTED_IMAGE_FOLDER_PATH, pdf_filename)
        json_subdir = os.path.join(JSON_FOLDER_PATH, pdf_filename)
        os.makedirs(extracted_image_subdir, exist_ok=True)
        os.makedirs(json_subdir, exist_ok=True)

        # Output paths
        output_json_path = os.path.join(json_subdir, "extracted.json")
        final_json_path = os.path.join(json_subdir, "extracted_sprites.json")

        try:
            elements = partition_pdf(
                filename=pdf_path,
                strategy="hi_res",
                extract_image_block_types=["Image"],
                extract_image_block_to_payload=True,  # base64 in the output
            )
        except Exception as e:
            raise RuntimeError(
                f"❌ Failed to extract images from PDF: {str(e)}") from e

        # Serialise once and keep the dicts in memory; re-reading the file we
        # have just written adds nothing.
        file_elements = [element.to_dict() for element in elements]
        try:
            with open(output_json_path, "w") as f:
                json.dump(file_elements, f, indent=4)
        except Exception as e:
            raise RuntimeError(
                f"❌ Failed to write extracted.json: {str(e)}") from e

        # Prepare manipulated sprite JSON structure
        manipulated_json = {}

        system_prompt = """
        You are an expert in visual scene understanding.
        Your Job is to analyze an image and respond acoording if asked for name give simple name by analyzing it and if ask for descrption generate a short description covering its elements.

        Guidelines:
        - Focus only the images given in Square Shape.
        - Don't Consider Blank areas in Image as.
        - Don't include generic summary or explanation outside the fields.
        Return only string.
        """

        agent = create_react_agent(
            model=llm,
            tools=[],
            prompt=system_prompt
        )

        # If a sprite JSON already exists, continue numbering after its
        # highest "Sprite <n>" key.
        # NOTE(review): the existing entries are only used for numbering; the
        # file is overwritten below with the new entries alone. Confirm
        # whether the old entries were meant to be merged.
        start_count = 1
        if os.path.exists(final_json_path):
            with open(final_json_path, "r") as existing_file:
                manipulated = json.load(existing_file)
            existing_keys = [int(k.replace("Sprite ", ""))
                             for k in manipulated]
            start_count = max(existing_keys, default=0) + 1

        prompt_description = "Give a brief Captioning."
        prompt_name = "give a short name caption of this Image."

        sprite_count = start_count
        for i, element in enumerate(file_elements):
            if "image_base64" not in element["metadata"]:
                continue
            try:
                image_data = base64.b64decode(
                    element["metadata"]["image_base64"])
                image = Image.open(io.BytesIO(image_data)).convert("RGB")
                # No image.show() here: it launches a desktop viewer, which
                # has no place inside a web request.
                image_path = os.path.join(
                    extracted_image_subdir, f"Sprite_{i+1}.png")
                image.save(image_path)
                with open(image_path, "rb") as image_file:
                    img_base64 = base64.b64encode(
                        image_file.read()).decode("utf-8")

                description = _ask_agent_about_image(
                    agent, prompt_description, img_base64)
                name = _ask_agent_about_image(agent, prompt_name, img_base64)
                logger.debug("Sprite %s captioned: name=%r description=%r",
                             i + 1, name, description)

                manipulated_json[f"Sprite {sprite_count}"] = {
                    "name": name,
                    "base64": element["metadata"]["image_base64"],
                    "file-path": pdf_dir_path,
                    "description": description
                }
                sprite_count += 1
            except Exception as e:
                # Best effort: one bad image must not abort the whole PDF.
                logger.warning("⚠️ Error processing Sprite %s: %s", i + 1, e)

        # Save manipulated JSON
        with open(final_json_path, "w") as sprite_file:
            json.dump(manipulated_json, sprite_file, indent=4)

        logger.info("✅ Manipulated sprite JSON saved: %s", final_json_path)
        return final_json_path, manipulated_json

    except Exception as e:
        raise RuntimeError(
            f"❌ Error in extract_images_from_pdf: {str(e)}") from e
259
-
260
-
261
def _copy_folder_assets(source_folder, destination_folder, excluded_names):
    """Copy each regular file of ``source_folder`` whose name is not excluded."""
    import shutil

    for fname in os.listdir(source_folder):
        source_path = os.path.join(source_folder, fname)
        # copy2 cannot copy a directory, so anything that is not a plain file
        # is skipped instead of aborting the whole match.
        if fname in excluded_names or not os.path.isfile(source_path):
            continue
        shutil.copy2(source_path, destination_folder)


def similarity_matching(input_json_path: str) -> str:
    """Match extracted sprites to reference images and assemble a project.json.

    Each sprite image from ``input_json_path`` is embedded with
    ``OpenCLIPEmbeddings`` and compared (dot product) with the reference
    embeddings stored in ``<OUTPUT_FOLDER>/embeddings.json``. For the best
    match of every sprite, the matched folder's ``sprite.json`` (and, for
    matches under the backdrop folder, the stage target of its
    ``project.json``) is collected and the folder's other files are copied
    into a new ``outputs/project_<uuid>`` folder.

    Args:
        input_json_path: Path of a JSON object whose values each carry a
            ``"base64"`` image payload.

    Returns:
        Path of the written ``project.json``.

    Raises:
        OSError: If the input JSON or ``embeddings.json`` cannot be read.
    """
    import shutil
    import tempfile
    import uuid
    from io import BytesIO
    from langchain_experimental.open_clip.open_clip import OpenCLIPEmbeddings

    logger.info("🔍 Running similarity matching...")

    # The reference library location can be overridden through the
    # environment; the default is the original hard-coded path.
    backdrop_images_path = os.getenv(
        "BACKDROP_FOLDER_PATH",
        r"E:\Pratham\2025\Harsh Sir\Scratch Vision\images\Backdrops")

    # Fresh, uniquely named project folder.
    # NOTE(review): this uses "outputs" while OUTPUT_FOLDER is "OUTPUTS";
    # on a case-sensitive filesystem these are two directories - confirm.
    random_id = str(uuid.uuid4()).replace('-', '')
    project_folder = os.path.join("outputs", f"project_{random_id}")
    os.makedirs(project_folder, exist_ok=True)
    project_json_path = os.path.join(project_folder, "project.json")

    # Read sprite metadata
    with open(input_json_path, 'r') as f:
        sprites_data = json.load(f)
    sprite_base64 = [sprite["base64"] for sprite in sprites_data.values()]

    clip_embd = OpenCLIPEmbeddings()

    # Decode the sprites to temporary PNG files because embed_image takes
    # paths; the directory is removed once the embeddings exist.
    temp_dir = tempfile.mkdtemp()
    try:
        sprite_image_paths = []
        for idx, b64 in enumerate(sprite_base64):
            # Tolerate an optional "data:...;base64," prefix.
            image_data = base64.b64decode(b64.split(",")[-1])
            img = Image.open(BytesIO(image_data)).convert("RGB")
            temp_path = os.path.join(temp_dir, f"sprite_{idx}.png")
            img.save(temp_path)
            sprite_image_paths.append(temp_path)

        sprite_features = (
            clip_embd.embed_image(sprite_image_paths)
            if sprite_image_paths else []
        )
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)

    # Reference embeddings: a list of entries with "file-path" and
    # "embeddings" keys.
    with open(f"{OUTPUT_FOLDER}/embeddings.json", "r") as f:
        embedding_json = json.load(f)

    if len(sprite_features) and embedding_json:
        img_matrix = np.array([img["embeddings"] for img in embedding_json])
        sprite_matrix = np.array(sprite_features)
        similarity = np.matmul(sprite_matrix, img_matrix.T)
        most_similar_indices = np.argmax(similarity, axis=1)
    else:
        # Nothing to compare: fall through and write an empty project
        # instead of failing inside matmul on mismatched shapes.
        logger.warning("No sprites or no reference embeddings; nothing to match.")
        most_similar_indices = []

    # ============= Match and copy ================
    project_data, backdrop_data = [], []
    copied_folders = set()
    # The trailing separator keeps sibling folders that merely share the
    # prefix (e.g. "Backdrops_old") from counting as backdrops.
    backdrop_prefix = os.path.join(os.path.normpath(backdrop_images_path), "")
    for matched_idx in most_similar_indices:
        matched_entry = embedding_json[matched_idx]
        matched_image_path = os.path.normpath(matched_entry["file-path"])
        matched_folder = os.path.dirname(matched_image_path)
        if matched_folder in copied_folders:
            continue
        copied_folders.add(matched_folder)
        matched_image_name = os.path.basename(matched_image_path)

        # Sprite
        sprite_json_path = os.path.join(matched_folder, 'sprite.json')
        if os.path.exists(sprite_json_path):
            with open(sprite_json_path, 'r') as f:
                project_data.append(json.load(f))
            _copy_folder_assets(
                matched_folder, project_folder,
                {matched_image_name, 'sprite.json'})

        # Backdrop
        if matched_image_path.startswith(backdrop_prefix):
            backdrop_json_path = os.path.join(matched_folder, 'project.json')
            if os.path.exists(backdrop_json_path):
                with open(backdrop_json_path, 'r') as f:
                    backdrop_json_data = json.load(f)
                backdrop_data.extend(
                    target
                    for target in backdrop_json_data.get("targets", [])
                    if target.get("isStage")
                )
                _copy_folder_assets(
                    matched_folder, project_folder,
                    {matched_image_name, 'project.json'})

    # Merge JSON structure
    final_project = {
        "targets": [],
        "monitors": [],
        "extensions": [],
        "meta": {
            "semver": "3.0.0",
            "vm": "11.3.0",
            "agent": "OpenAI ScratchVision Agent"
        }
    }

    final_project["targets"].extend(
        sprite for sprite in project_data if not sprite.get("isStage", False))

    if backdrop_data:
        # One merged stage: costumes from every matched backdrop, sounds
        # from the first one only.
        all_costumes = []
        for bd in backdrop_data:
            all_costumes.extend(bd.get("costumes", []))
        sounds = backdrop_data[0].get("sounds", [])
        final_project["targets"].append({
            "isStage": True,
            "name": "Stage",
            "variables": {},
            "lists": {},
            "broadcasts": {},
            "blocks": {},
            "comments": {},
            "currentCostume": 1 if len(all_costumes) > 1 else 0,
            "costumes": all_costumes,
            "sounds": sounds,
            "volume": 100,
            "layerOrder": 0,
            "tempo": 60,
            "videoTransparency": 50,
            "videoState": "on",
            "textToSpeechLanguage": None
        })

    with open(project_json_path, 'w') as f:
        json.dump(final_project, f, indent=2)

    logger.info("🎉 Final project saved: %s", project_json_path)
    return project_json_path
450
-
451
-
452
@app.route('/')
def index():
    """Render the landing page template for the root URL."""
    landing_page = render_template('app_index.html')
    return landing_page
455
-
456
- # API endpoint
457
-
458
-
459
@app.route('/process_pdf', methods=['POST'])
def process_pdf():
    """Handle a PDF upload: extract its sprites and build a matching project.

    Expects multipart form-data with the PDF under the key ``pdf_file``. The
    upload is stored in a temporary directory that is removed when the
    request finishes, then passed through ``extract_images_from_pdf`` and
    ``similarity_matching``.

    Returns:
        JSON with ``message``, ``output_json``, ``sprites`` and
        ``project_output_json`` on success; a JSON ``error`` with status 400
        for a missing file or empty filename, or status 500 when processing
        fails.
    """
    import shutil

    temp_dir = None
    try:
        logger.info("Received request to process PDF.")
        if 'pdf_file' not in request.files:
            logger.warning("No PDF file found in request.")
            return jsonify({"error": "Missing PDF file in form-data with key 'pdf_file'"}), 400

        pdf_file = request.files['pdf_file']
        if pdf_file.filename == '':
            return jsonify({"error": "Empty filename"}), 400

        # secure_filename can strip a name down to "" (e.g. a name with no
        # ASCII characters); fall back to a fixed name so the save target is
        # still a file path and not the directory itself.
        filename = secure_filename(pdf_file.filename) or "upload.pdf"

        # Save the uploaded PDF temporarily
        temp_dir = tempfile.mkdtemp()
        saved_pdf_path = os.path.join(temp_dir, filename)
        pdf_file.save(saved_pdf_path)
        logger.info("Saved uploaded PDF to: %s", saved_pdf_path)

        # Extract & process (the second argument is ignored by the callee)
        output_path, result = extract_images_from_pdf(saved_pdf_path, None)
        project_output = similarity_matching(output_path)
        logger.info("Finished processing PDF.")

        return jsonify({
            "message": "✅ PDF processed successfully",
            "output_json": output_path,
            "sprites": result,
            "project_output_json": project_output
        })
    except Exception as e:
        logger.exception("❌ Failed to process PDF")
        return jsonify({"error": f"❌ Failed to process PDF: {str(e)}"}), 500
    finally:
        # mkdtemp leaves cleanup to the caller; without this every upload
        # would stay on disk.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
496
-
497
-
498
- if __name__ == '__main__':
499
- app.run(host='0.0.0.0', port=7860, debug=True)
 
 
1
+ from flask import Flask, render_template, Response, flash, redirect, url_for, request, jsonify
2
+ import cv2
3
+ import numpy as np
4
+ from unstructured.partition.pdf import partition_pdf
5
+ import json
6
+ import base64
7
+ import io
8
+ import os
9
+ from PIL import Image, ImageEnhance, ImageDraw
10
+ from imutils.perspective import four_point_transform
11
+ from dotenv import load_dotenv
12
+ import pytesseract
13
+ from transformers import AutoProcessor, AutoModelForImageTextToText, AutoModelForVision2Seq
14
+ from langchain_community.document_loaders.image_captions import ImageCaptionLoader
15
+ from werkzeug.utils import secure_filename
16
+ import tempfile
17
+ import torch
18
+ from langchain_groq import ChatGroq
19
+ from langgraph.prebuilt import create_react_agent
20
+ import logging
21
+
22
+ # Configure logging
23
+ logging.basicConfig(
24
+ level=logging.DEBUG, # Use INFO or ERROR in production
25
+ format="%(asctime)s [%(levelname)s] %(message)s",
26
+ handlers=[
27
+ logging.FileHandler("app.log"),
28
+ logging.StreamHandler()
29
+ ]
30
+ )
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+ load_dotenv()
35
+ # os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
36
+ groq_api_key = os.getenv("GROQ_API_KEY")
37
+
38
+ llm = ChatGroq(
39
+ model="meta-llama/llama-4-maverick-17b-128e-instruct",
40
+ temperature=0,
41
+ max_tokens=None,
42
+ )
43
+
44
+ app = Flask(__name__)
45
+
46
+ pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
47
+ poppler_path = r"C:\poppler-23.11.0\Library\bin"
48
+
49
+ count = 0
50
+ PDF_GET = r"E:\Pratham\2025\Harsh Sir\Scratch Vision\images\scratch_crab.pdf"
51
+
52
+ OUTPUT_FOLDER = "OUTPUTS"
53
+ DETECTED_IMAGE_FOLDER_PATH = os.path.join(OUTPUT_FOLDER, "DETECTED_IMAGE")
54
+ IMAGE_FOLDER_PATH = os.path.join(OUTPUT_FOLDER, "SCANNED_IMAGE")
55
+ JSON_FOLDER_PATH = os.path.join(OUTPUT_FOLDER, "EXTRACTED_JSON")
56
+
57
+ for path in [OUTPUT_FOLDER, IMAGE_FOLDER_PATH, DETECTED_IMAGE_FOLDER_PATH, JSON_FOLDER_PATH]:
58
+ os.makedirs(path, exist_ok=True)
59
+
60
+ # Model Initialization
61
+ try:
62
+ smolvlm256m_processor = AutoProcessor.from_pretrained(
63
+ "HuggingFaceTB/SmolVLM-256M-Instruct")
64
+ # smolvlm256m_model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM-256M-Instruct").to("cpu")
65
+ smolvlm256m_model = AutoModelForVision2Seq.from_pretrained(
66
+ "HuggingFaceTB/SmolVLM-256M-Instruct",
67
+ torch_dtype=torch.bfloat16 if hasattr(
68
+ torch, "bfloat16") else torch.float32,
69
+ _attn_implementation="eager"
70
+ ).to("cpu")
71
+ except Exception as e:
72
+ raise RuntimeError(f"❌ Failed to load SmolVLM model: {str(e)}")
73
+
74
+ # SmolVLM image captioning function
75
+
76
+
77
def get_smolvlm_caption(image: Image.Image, prompt: str = "") -> str:
    """Generate a caption for ``image`` with the module-level SmolVLM pair.

    The prompt has to carry exactly one ``<image>`` placeholder. When it has
    none, one is prepended to the stripped prompt text.

    Args:
        image: Image handed to ``smolvlm256m_processor``.
        prompt: Instruction text, with or without an ``<image>`` token.

    Returns:
        The decoded first generated sequence (special tokens skipped). Any
        failure, including a prompt with more than one ``<image>`` token, is
        reported as a string starting with "❌ Error during caption
        generation:" instead of being raised.
    """
    image_token = "<image>"
    try:
        # Inject the placeholder only when the caller did not supply one.
        if image_token not in prompt:
            prompt = f"<image> {prompt.strip()}"

        token_total = prompt.count(image_token)
        if token_total != 1:
            raise ValueError(
                f"Prompt must contain exactly 1 <image> token. Found {token_total}")

        model_inputs = smolvlm256m_processor(
            images=[image],
            text=[prompt],
            return_tensors="pt",
        )
        model_inputs = model_inputs.to("cpu")
        generated = smolvlm256m_model.generate(
            **model_inputs, max_new_tokens=100)
        first_sequence = generated[0]
        return smolvlm256m_processor.decode(
            first_sequence, skip_special_tokens=True)
    except Exception as e:
        return f"❌ Error during caption generation: {str(e)}"
94
+
95
+ # --- FUNCTION: Extract images from saved PDF ---
96
+
97
+
98
def _ask_agent_about_image(agent, instruction, png_base64):
    """Send one instruction plus one base64 PNG to the agent; return its reply text."""
    content = [
        {"type": "text", "text": instruction},
        {
            "type": "image_url",
            # The payload is read back from a saved .png file, so label it as PNG.
            "image_url": {"url": f"data:image/png;base64,{png_base64}"},
        },
    ]
    response = agent.invoke(
        {"messages": [{"role": "user", "content": content}]})
    return response["messages"][-1].content


def extract_images_from_pdf(pdf_path, output_json_path):
    """Extract images from a PDF and generate structured sprite JSON.

    The PDF is partitioned with ``partition_pdf`` (``hi_res`` strategy, image
    blocks returned as base64 payloads). The raw elements are dumped to
    ``extracted.json``; every element carrying ``image_base64`` is saved as
    ``Sprite_<n>.png`` and sent twice to the LLM agent (once for a
    description, once for a name). The collected entries are written to
    ``extracted_sprites.json``.

    Args:
        pdf_path: Path of the PDF to process.
        output_json_path: Ignored; kept for backward compatibility. Output
            locations are always derived from the PDF file name under
            ``JSON_FOLDER_PATH``.

    Returns:
        Tuple ``(final_json_path, manipulated_json)`` where
        ``manipulated_json`` maps ``"Sprite <n>"`` to a dict with the keys
        ``name``, ``base64``, ``file-path`` and ``description``.

    Raises:
        RuntimeError: If partitioning, writing ``extracted.json`` or any other
            step outside the per-image loop fails. A failure on a single image
            is logged and that image is skipped.
    """
    try:
        pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]  # e.g., "scratch_crab"
        # normpath still yields backslashes on Windows but no longer mangles
        # POSIX paths the way a blanket "/" -> "\\" replacement did.
        pdf_dir = os.path.dirname(pdf_path)
        pdf_dir_path = os.path.normpath(pdf_dir) if pdf_dir else pdf_dir

        # Create subfolders
        extracted_image_subdir = os.path.join(
            DETECTED_IMAGE_FOLDER_PATH, pdf_filename)
        json_subdir = os.path.join(JSON_FOLDER_PATH, pdf_filename)
        os.makedirs(extracted_image_subdir, exist_ok=True)
        os.makedirs(json_subdir, exist_ok=True)

        # Output paths
        output_json_path = os.path.join(json_subdir, "extracted.json")
        final_json_path = os.path.join(json_subdir, "extracted_sprites.json")

        try:
            elements = partition_pdf(
                filename=pdf_path,
                strategy="hi_res",
                extract_image_block_types=["Image"],
                extract_image_block_to_payload=True,  # base64 in the output
            )
        except Exception as e:
            raise RuntimeError(
                f"❌ Failed to extract images from PDF: {str(e)}") from e

        # Serialise once and keep the dicts in memory; re-reading the file we
        # have just written adds nothing.
        file_elements = [element.to_dict() for element in elements]
        try:
            with open(output_json_path, "w") as f:
                json.dump(file_elements, f, indent=4)
        except Exception as e:
            raise RuntimeError(
                f"❌ Failed to write extracted.json: {str(e)}") from e

        # Prepare manipulated sprite JSON structure
        manipulated_json = {}

        system_prompt = """
        You are an expert in visual scene understanding.
        Your Job is to analyze an image and respond acoording if asked for name give simple name by analyzing it and if ask for descrption generate a short description covering its elements.

        Guidelines:
        - Focus only the images given in Square Shape.
        - Don't Consider Blank areas in Image as.
        - Don't include generic summary or explanation outside the fields.
        Return only string.
        """

        agent = create_react_agent(
            model=llm,
            tools=[],
            prompt=system_prompt
        )

        # If a sprite JSON already exists, continue numbering after its
        # highest "Sprite <n>" key.
        # NOTE(review): the existing entries are only used for numbering; the
        # file is overwritten below with the new entries alone. Confirm
        # whether the old entries were meant to be merged.
        start_count = 1
        if os.path.exists(final_json_path):
            with open(final_json_path, "r") as existing_file:
                manipulated = json.load(existing_file)
            existing_keys = [int(k.replace("Sprite ", ""))
                             for k in manipulated]
            start_count = max(existing_keys, default=0) + 1

        prompt_description = "Give a brief Captioning."
        prompt_name = "give a short name caption of this Image."

        sprite_count = start_count
        for i, element in enumerate(file_elements):
            if "image_base64" not in element["metadata"]:
                continue
            try:
                image_data = base64.b64decode(
                    element["metadata"]["image_base64"])
                image = Image.open(io.BytesIO(image_data)).convert("RGB")
                # No image.show() here: it launches a desktop viewer, which
                # has no place inside a web request.
                image_path = os.path.join(
                    extracted_image_subdir, f"Sprite_{i+1}.png")
                image.save(image_path)
                with open(image_path, "rb") as image_file:
                    img_base64 = base64.b64encode(
                        image_file.read()).decode("utf-8")

                description = _ask_agent_about_image(
                    agent, prompt_description, img_base64)
                name = _ask_agent_about_image(agent, prompt_name, img_base64)
                logger.debug("Sprite %s captioned: name=%r description=%r",
                             i + 1, name, description)

                manipulated_json[f"Sprite {sprite_count}"] = {
                    "name": name,
                    "base64": element["metadata"]["image_base64"],
                    "file-path": pdf_dir_path,
                    "description": description
                }
                sprite_count += 1
            except Exception as e:
                # Best effort: one bad image must not abort the whole PDF.
                logger.warning("⚠️ Error processing Sprite %s: %s", i + 1, e)

        # Save manipulated JSON
        with open(final_json_path, "w") as sprite_file:
            json.dump(manipulated_json, sprite_file, indent=4)

        logger.info("✅ Manipulated sprite JSON saved: %s", final_json_path)
        return final_json_path, manipulated_json

    except Exception as e:
        raise RuntimeError(
            f"❌ Error in extract_images_from_pdf: {str(e)}") from e
259
+
260
+
261
+ def similarity_matching(input_json_path: str) -> str:
262
+ import uuid
263
+ import shutil
264
+ import tempfile
265
+ from langchain_experimental.open_clip.open_clip import OpenCLIPEmbeddings
266
+ from matplotlib.offsetbox import OffsetImage, AnnotationBbox
267
+ from io import BytesIO
268
+
269
+ logger.info("🔍 Running similarity matching...")
270
+
271
+ # ============================== #
272
+ # DEFINE PATHS #
273
+ # ============================== #
274
+ backdrop_images_path = os.getenv("BACKDROP_FOLDER_PATH", "/app/reference/backdrops")
275
+ sprite_images_path = os.getenv("SPRITE_FOLDER_PATH", "/app/reference/sprites")
276
+ image_dirs = [backdrop_images_path, sprite_images_path]
277
+
278
+ # ================================================= #
279
+ # Generate Random UUID for project folder name #
280
+ # ================================================= #
281
+ random_id = str(uuid.uuid4()).replace('-', '')
282
+ project_folder = os.path.join("outputs", f"project_{random_id}")
283
+
284
+ # =========================================================================== #
285
+ # Create empty json in project_{random_id} folder #
286
+ # =========================================================================== #
287
+ os.makedirs(project_folder, exist_ok=True)
288
+ project_json_path = os.path.join(project_folder, "project.json")
289
+
290
+ # ============================== #
291
+ # READ SPRITE METADATA #
292
+ # ============================== #
293
+ with open(input_json_path, 'r') as f:
294
+ sprites_data = json.load(f)
295
+
296
+ sprite_ids, texts, sprite_base64 = [], [], []
297
+ for sid, sprite in sprites_data.items():
298
+ sprite_ids.append(sid)
299
+ texts.append(
300
+ "This is " + sprite.get("description", sprite.get("name", "")))
301
+ sprite_base64.append(sprite["base64"])
302
+
303
+ # ============================== #
304
+ # INITIALIZE CLIP EMBEDDER #
305
+ # ============================== #
306
+ clip_embd = OpenCLIPEmbeddings()
307
+
308
+ # ========================================= #
309
+ # Walk folders to collect all image paths #
310
+ # ========================================= #
311
+ folder_image_paths = []
312
+ for image_dir in image_dirs:
313
+ for root, _, files in os.walk(image_dir):
314
+ for fname in files:
315
+ if fname.lower().endswith((".png", ".jpg", ".jpeg")):
316
+ folder_image_paths.append(os.path.join(root, fname))
317
+
318
+ # ============================== #
319
+ # EMBED FOLDER IMAGES (REF) #
320
+ # ============================== #
321
+ img_features = clip_embd.embed_image(folder_image_paths)
322
+
323
+ # ============================== #
324
+ # Store image embeddings #
325
+ # ============================== #
326
+ embedding_json = []
327
+ for i, path in enumerate(folder_image_paths):
328
+ embedding_json.append({
329
+ "name":os.path.basename(path),
330
+ "file-path": path,
331
+ "embeddings": list(img_features[i])
332
+ })
333
+
334
+ # Save to embeddings.json
335
+ with open(f"{OUTPUT_FOLDER}/embeddings.json", "w") as f:
336
+ json.dump(embedding_json, f, indent=2)
337
+
338
+ # ============================== #
339
+ # DECODE SPRITE IMAGES #
340
+ # ============================== #
341
+ temp_dir = tempfile.mkdtemp()
342
+ sprite_image_paths = []
343
+ for idx, b64 in enumerate(sprite_base64):
344
+ image_data = base64.b64decode(b64.split(",")[-1])
345
+ img = Image.open(BytesIO(image_data)).convert("RGB")
346
+ temp_path = os.path.join(temp_dir, f"sprite_{idx}.png")
347
+ img.save(temp_path)
348
+ sprite_image_paths.append(temp_path)
349
+
350
+ # ============================== #
351
+ # EMBED SPRITE IMAGES #
352
+ # ============================== #
353
+ sprite_features = clip_embd.embed_image(sprite_image_paths)
354
+
355
+ # ============================== #
356
+ # COMPUTE SIMILARITIES #
357
+ # ============================== #
358
+ # with open(f"{OUTPUT_FOLDER}/embeddings.json", "r") as f:
359
+ # embedding_json = json.load(f)
360
+
361
+ img_matrix = np.array([img["embeddings"] for img in embedding_json])
362
+ sprite_matrix = np.array(sprite_features)
363
+
364
+ similarity = np.matmul(sprite_matrix, img_matrix.T)
365
+ most_similar_indices = np.argmax(similarity, axis=1)
366
+
367
+ # ============= Match and copy ================
368
+ project_data, backdrop_data = [], []
369
+ copied_folders = set()
370
+ for sprite_idx, matched_idx in enumerate(most_similar_indices):
371
+ matched_entry = embedding_json[matched_idx]
372
+ # matched_image_path = os.path.normpath(folder_image_paths[matched_idx])
373
+ matched_image_path = os.path.normpath(matched_entry["file-path"])
374
+ matched_folder = os.path.dirname(matched_image_path)
375
+ if matched_folder in copied_folders:
376
+ continue
377
+ copied_folders.add(matched_folder)
378
+
379
+ # Sprite
380
+ sprite_json_path = os.path.join(matched_folder, 'sprite.json')
381
+ if os.path.exists(sprite_json_path):
382
+ with open(sprite_json_path, 'r') as f:
383
+ sprite_data = json.load(f)
384
+ project_data.append(sprite_data)
385
+
386
+ for fname in os.listdir(matched_folder):
387
+ if fname not in {os.path.basename(matched_image_path), 'sprite.json'}:
388
+ shutil.copy2(os.path.join(
389
+ matched_folder, fname), project_folder)
390
+
391
+ # Backdrop
392
+ if matched_image_path.startswith(os.path.normpath(backdrop_images_path)):
393
+ backdrop_json_path = os.path.join(matched_folder, 'project.json')
394
+ if os.path.exists(backdrop_json_path):
395
+ with open(backdrop_json_path, 'r') as f:
396
+ backdrop_json_data = json.load(f)
397
+ for target in backdrop_json_data.get("targets", []):
398
+ if target.get("isStage"):
399
+ backdrop_data.append(target)
400
+ for fname in os.listdir(matched_folder):
401
+ if fname not in {os.path.basename(matched_image_path), 'project.json'}:
402
+ shutil.copy2(os.path.join(
403
+ matched_folder, fname), project_folder)
404
+
405
+ # Merge JSON structure
406
+ final_project = {
407
+ "targets": [],
408
+ "monitors": [],
409
+ "extensions": [],
410
+ "meta": {
411
+ "semver": "3.0.0",
412
+ "vm": "11.3.0",
413
+ "agent": "OpenAI ScratchVision Agent"
414
+ }
415
+ }
416
+
417
+ for sprite in project_data:
418
+ if not sprite.get("isStage", False):
419
+ final_project["targets"].append(sprite)
420
+
421
+ if backdrop_data:
422
+ all_costumes, sounds = [], []
423
+ for idx, bd in enumerate(backdrop_data):
424
+ all_costumes.extend(bd.get("costumes", []))
425
+ if idx == 0 and "sounds" in bd:
426
+ sounds = bd["sounds"]
427
+ final_project["targets"].append({
428
+ "isStage": True,
429
+ "name": "Stage",
430
+ "variables": {},
431
+ "lists": {},
432
+ "broadcasts": {},
433
+ "blocks": {},
434
+ "comments": {},
435
+ "currentCostume": 1 if len(all_costumes) > 1 else 0,
436
+ "costumes": all_costumes,
437
+ "sounds": sounds,
438
+ "volume": 100,
439
+ "layerOrder": 0,
440
+ "tempo": 60,
441
+ "videoTransparency": 50,
442
+ "videoState": "on",
443
+ "textToSpeechLanguage": None
444
+ })
445
+
446
+ with open(project_json_path, 'w') as f:
447
+ json.dump(final_project, f, indent=2)
448
+
449
+ logger.info(f"🎉 Final project saved: {project_json_path}")
450
+ return project_json_path
451
+
452
+
453
@app.route('/')
def index():
    """Serve the root URL by rendering the ``app_index.html`` template."""
    landing_page = render_template('app_index.html')
    return landing_page
456
+
457
+ # API endpoint
458
+
459
+
460
@app.route('/process_pdf', methods=['POST'])
def process_pdf():
    """Handle a PDF upload, extract its content and build the project JSON.

    Expects a multipart form-data POST carrying the PDF under the key
    ``pdf_file``. The upload is written to a temporary directory, passed to
    ``extract_images_from_pdf`` and the resulting output path is passed to
    ``similarity_matching``.

    Returns:
        A JSON response. On success it contains ``message``, ``output_json``,
        ``sprites`` and ``project_output_json``. Responds with 400 when the
        file part is missing or its filename is empty/unusable, and with 500
        when any processing step raises.
    """
    # Imported locally so this handler does not rely on a module-level import
    # that is outside the visible part of the file.
    import shutil

    try:
        logger.info("Received request to process PDF.")
        if 'pdf_file' not in request.files:
            logger.warning("No PDF file found in request.")
            return jsonify({"error": "Missing PDF file in form-data with key 'pdf_file'"}), 400

        pdf_file = request.files['pdf_file']
        # `not` also rejects a missing (None) filename, not just ''.
        if not pdf_file.filename:
            return jsonify({"error": "Empty filename"}), 400

        # secure_filename() can strip every character (e.g. "../" or a name
        # made only of unsupported characters) and return ''. Saving to
        # os.path.join(temp_dir, '') would target the directory itself and
        # surface as a 500, so reject it as a client error instead.
        filename = secure_filename(pdf_file.filename)
        if not filename:
            logger.warning("Uploaded filename is empty after sanitisation.")
            return jsonify({"error": "Invalid filename"}), 400

        # Save the uploaded PDF temporarily
        temp_dir = tempfile.mkdtemp()
        try:
            saved_pdf_path = os.path.join(temp_dir, filename)
            pdf_file.save(saved_pdf_path)

            logger.info(f"Saved uploaded PDF to: {saved_pdf_path}")

            # Extract & process
            json_path = None
            output_path, result = extract_images_from_pdf(
                saved_pdf_path, json_path)

            project_output = similarity_matching(output_path)
        finally:
            # Remove the per-request upload directory so it does not leak.
            # NOTE(review): assumes extract_images_from_pdf writes its outputs
            # outside temp_dir — confirm against that function.
            shutil.rmtree(temp_dir, ignore_errors=True)

        logger.info("PDF processed successfully.")

        return jsonify({
            "message": "✅ PDF processed successfully",
            "output_json": output_path,
            "sprites": result,
            "project_output_json": project_output
        })
    except Exception as e:
        # Top-level request boundary: log the full traceback and report a 500.
        logger.exception("❌ Failed to process PDF")
        return jsonify({"error": f"❌ Failed to process PDF: {str(e)}"}), 500
497
+
498
+
499
if __name__ == '__main__':
    # The Werkzeug debugger lets anyone who can reach the server execute
    # arbitrary Python, so it must not be switched on unconditionally while
    # binding to every interface. Opt in with FLASK_DEBUG=1 (or "true") for
    # local development; it is off by default.
    debug_enabled = os.getenv("FLASK_DEBUG", "0").strip().lower() in {"1", "true"}
    app.run(host='0.0.0.0', port=7860, debug=debug_enabled)