prthm11 committed on
Commit
25c097f
·
verified ·
1 Parent(s): b8e8cb6

Upload app_main.py

Browse files
Files changed (1) hide show
  1. app_main.py +499 -455
app_main.py CHANGED
@@ -1,455 +1,499 @@
1
- import os
2
- os.environ["NLTK_DATA"] = "/app/nltk_data"
3
-
4
- from flask import Flask, render_template, Response, flash, redirect, url_for, request, jsonify
5
- import cv2
6
- import numpy as np
7
- from unstructured.partition.pdf import partition_pdf
8
- import json, base64, io
9
- from PIL import Image, ImageEnhance, ImageDraw
10
- from imutils.perspective import four_point_transform
11
- from dotenv import load_dotenv
12
- import pytesseract
13
- from transformers import AutoProcessor, AutoModelForImageTextToText, AutoModelForVision2Seq
14
- from langchain_community.document_loaders.image_captions import ImageCaptionLoader
15
- from werkzeug.utils import secure_filename
16
- import tempfile
17
- import torch
18
- from langchain_groq import ChatGroq
19
- from langgraph.prebuilt import create_react_agent
20
- import logging
21
-
22
- # Configure logging
23
- logging.basicConfig(
24
- level=logging.DEBUG, # Use INFO or ERROR in production
25
- format="%(asctime)s [%(levelname)s] %(message)s",
26
- handlers=[
27
- logging.FileHandler("app.log"),
28
- logging.StreamHandler()
29
- ]
30
- )
31
-
32
- logger = logging.getLogger(__name__)
33
-
34
- load_dotenv()
35
- # os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
36
- groq_api_key = os.getenv("GROQ_API_KEY")
37
-
38
- llm = ChatGroq(
39
- model="meta-llama/llama-4-maverick-17b-128e-instruct",
40
- temperature=0,
41
- max_tokens=None,
42
- )
43
-
44
- app = Flask(__name__)
45
-
46
- pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
47
- poppler_path=r"C:\poppler-23.11.0\Library\bin"
48
-
49
- count = 0
50
- PDF_GET = r"E:\Pratham\2025\Harsh Sir\Scratch Vision\images\scratch_crab.pdf"
51
-
52
- OUTPUT_FOLDER = "OUTPUTS"
53
- DETECTED_IMAGE_FOLDER_PATH = os.path.join(OUTPUT_FOLDER,"DETECTED_IMAGE")
54
- IMAGE_FOLDER_PATH = os.path.join(OUTPUT_FOLDER, "SCANNED_IMAGE")
55
- JSON_FOLDER_PATH = os.path.join(OUTPUT_FOLDER, "EXTRACTED_JSON")
56
-
57
- for path in [OUTPUT_FOLDER, IMAGE_FOLDER_PATH, DETECTED_IMAGE_FOLDER_PATH, JSON_FOLDER_PATH]:
58
- os.makedirs(path, exist_ok=True)
59
-
60
- # Model Initialization
61
- try:
62
- smolvlm256m_processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-256M-Instruct")
63
- # smolvlm256m_model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM-256M-Instruct").to("cpu")
64
- smolvlm256m_model = AutoModelForVision2Seq.from_pretrained(
65
- "HuggingFaceTB/SmolVLM-256M-Instruct",
66
- torch_dtype=torch.bfloat16 if hasattr(torch, "bfloat16") else torch.float32,
67
- _attn_implementation="eager"
68
- ).to("cpu")
69
- except Exception as e:
70
- raise RuntimeError(f"❌ Failed to load SmolVLM model: {str(e)}")
71
-
72
- # SmolVLM Image Captioning functioning
73
- def get_smolvlm_caption(image: Image.Image, prompt: str = "") -> str:
74
- try:
75
- # Ensure exactly one <image> token
76
- if "<image>" not in prompt:
77
- prompt = f"<image> {prompt.strip()}"
78
-
79
- num_image_tokens = prompt.count("<image>")
80
- if num_image_tokens != 1:
81
- raise ValueError(f"Prompt must contain exactly 1 <image> token. Found {num_image_tokens}")
82
-
83
- inputs = smolvlm256m_processor(images=[image], text=[prompt], return_tensors="pt").to("cpu")
84
- output_ids = smolvlm256m_model.generate(**inputs, max_new_tokens=100)
85
- return smolvlm256m_processor.decode(output_ids[0], skip_special_tokens=True)
86
- except Exception as e:
87
- return f"❌ Error during caption generation: {str(e)}"
88
-
89
- # --- FUNCTION: Extract images from saved PDF ---
90
- def extract_images_from_pdf(pdf_path, output_json_path):
91
- ''' Extract images from PDF and generate structured sprite JSON '''
92
-
93
- try:
94
- pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0] # e.g., "scratch_crab"
95
- pdf_dir_path = os.path.dirname(pdf_path).replace("/", "\\")
96
-
97
- # Create subfolders
98
- extracted_image_subdir = os.path.join(DETECTED_IMAGE_FOLDER_PATH, pdf_filename)
99
- json_subdir = os.path.join(JSON_FOLDER_PATH, pdf_filename)
100
- os.makedirs(extracted_image_subdir, exist_ok=True)
101
- os.makedirs(json_subdir, exist_ok=True)
102
-
103
- # Output paths
104
- output_json_path = os.path.join(json_subdir, "extracted.json")
105
- final_json_path = os.path.join(json_subdir, "extracted_sprites.json")
106
-
107
- try:
108
- elements = partition_pdf(
109
- filename=pdf_path,
110
- strategy="hi_res",
111
- extract_image_block_types=["Image"],
112
- extract_image_block_to_payload=True, # Set to True to get base64 in output
113
- )
114
- except Exception as e:
115
- raise RuntimeError(f"❌ Failed to extract images from PDF: {str(e)}")
116
-
117
- try:
118
- with open(output_json_path, "w") as f:
119
- json.dump([element.to_dict() for element in elements], f, indent=4)
120
- except Exception as e:
121
- raise RuntimeError(f"❌ Failed to write extracted.json: {str(e)}")
122
-
123
- try:
124
- # Display extracted images
125
- with open(output_json_path, 'r') as file:
126
- file_elements = json.load(file)
127
- except Exception as e:
128
- raise RuntimeError(f"❌ Failed to read extracted.json: {str(e)}")
129
-
130
- # Prepare manipulated sprite JSON structure
131
- manipulated_json = {}
132
-
133
- # SET A SYSTEM PROMPT
134
- system_prompt = """
135
- You are an expert in visual scene understanding.
136
- Your Job is to analyze an image and respond acoording if asked for name give simple name by analyzing it and if ask for descrption generate a short description covering its elements.
137
-
138
- Guidelines:
139
- - Focus only the images given in Square Shape.
140
- - Don't Consider Blank areas in Image as.
141
- - Don't include generic summary or explanation outside the fields.
142
- Return only string.
143
- """
144
-
145
- agent = create_react_agent(
146
- model = llm,
147
- tools = [],
148
- prompt = system_prompt
149
- )
150
-
151
- # If JSON already exists, load it and find the next available Sprite number
152
- if os.path.exists(final_json_path):
153
- with open(final_json_path, "r") as existing_file:
154
- manipulated = json.load(existing_file)
155
- # Determine the next available index (e.g., Sprite 4 if 1–3 already exist)
156
- existing_keys = [int(k.replace("Sprite ", "")) for k in manipulated.keys()]
157
- start_count = max(existing_keys, default=0) + 1
158
- else:
159
- start_count = 1
160
-
161
- sprite_count = start_count
162
- for i,element in enumerate(file_elements):
163
- if "image_base64" in element["metadata"]:
164
- try:
165
- image_data = base64.b64decode(element["metadata"]["image_base64"])
166
- image = Image.open(io.BytesIO(image_data)).convert("RGB")
167
- image.show(title=f"Extracted Image {i+1}")
168
- image_path = os.path.join(extracted_image_subdir, f"Sprite_{i+1}.png")
169
- image.save(image_path)
170
- with open(image_path, "rb") as image_file:
171
- image_bytes = image_file.read()
172
- img_base64 = base64.b64encode(image_bytes).decode("utf-8")
173
- # description = get_smolvlm_caption(image, prompt="Give a brief Description")
174
- # name = get_smolvlm_caption(image, prompt="give a short name/title of this Image.")
175
- def clean_caption_output(raw_output: str, prompt: str) -> str:
176
- answer = raw_output.replace(prompt, '').replace("<image>", '').strip(" :-\n")
177
- return answer
178
-
179
- prompt_description = "Give a brief Captioning."
180
- prompt_name = "give a short name caption of this Image."
181
-
182
- content1 = [
183
- {
184
- "type": "text",
185
- "text": f"{prompt_description}"
186
- },
187
- {
188
- "type": "image_url",
189
- "image_url": {
190
- "url": f"data:image/jpeg;base64,{img_base64}"
191
- }
192
- }
193
- ]
194
- response1 = agent.invoke({"messages": [{"role": "user", "content":content1}]})
195
- print(response1)
196
- description = response1["messages"][-1].content
197
-
198
- content2 = [
199
- {
200
- "type": "text",
201
- "text": f"{prompt_name}"
202
- },
203
- {
204
- "type": "image_url",
205
- "image_url": {
206
- "url": f"data:image/jpeg;base64,{img_base64}"
207
- }
208
- }
209
- ]
210
-
211
- response2 = agent.invoke({"messages": [{"role": "user", "content":content2}]})
212
- print(response2)
213
- name = response2["messages"][-1].content
214
-
215
- #raw_description = get_smolvlm_caption(image, prompt=prompt_description)
216
- #raw_name = get_smolvlm_caption(image, prompt=prompt_name)
217
-
218
- #description = clean_caption_output(raw_description, prompt_description)
219
- #name = clean_caption_output(raw_name, prompt_name)
220
-
221
- manipulated_json[f"Sprite {sprite_count}"] = {
222
- "name": name,
223
- "base64": element["metadata"]["image_base64"],
224
- "file-path": pdf_dir_path,
225
- "description":description
226
- }
227
- sprite_count += 1
228
- except Exception as e:
229
- print(f"⚠️ Error processing Sprite {i+1}: {str(e)}")
230
-
231
- # Save manipulated JSON
232
- with open(final_json_path, "w") as sprite_file:
233
- json.dump(manipulated_json, sprite_file, indent=4)
234
-
235
- print(f"✅ Manipulated sprite JSON saved: {final_json_path}")
236
- return final_json_path, manipulated_json
237
-
238
- except Exception as e:
239
- raise RuntimeError(f"❌ Error in extract_images_from_pdf: {str(e)}")
240
-
241
- os.environ["TRANSFORMERS_CACHE"] = "/app/cache"
242
- os.environ["HF_HOME"] = "/app/cache"
243
-
244
- def similarity_matching(input_json_path: str) -> str:
245
- import uuid, shutil, tempfile
246
- from langchain_experimental.open_clip.open_clip import OpenCLIPEmbeddings
247
- from matplotlib.offsetbox import OffsetImage, AnnotationBbox
248
- from io import BytesIO
249
-
250
- logger.info("🔍 Running similarity matching...")
251
-
252
- # ============================== #
253
- # DEFINE PATHS #
254
- # ============================== #
255
- backdrop_images_path = os.getenv("BACKDROP_FOLDER_PATH", "/app/reference/backdrops")
256
- sprite_images_path = os.getenv("SPRITE_FOLDER_PATH", "/app/reference/sprites")
257
-
258
- image_dirs = [backdrop_images_path, sprite_images_path]
259
-
260
-
261
- # ================================================= #
262
- # Generate Random UUID for project folder name #
263
- # ================================================= #
264
- random_id = str(uuid.uuid4()).replace('-', '')
265
- project_folder = os.path.join("outputs", f"project_{random_id}")
266
-
267
- # =========================================================================== #
268
- # Create empty json in project_{random_id} folder #
269
- # =========================================================================== #
270
- os.makedirs(project_folder, exist_ok=True)
271
- project_json_path = os.path.join(project_folder, "project.json")
272
-
273
- # ============================== #
274
- # READ SPRITE METADATA #
275
- # ============================== #
276
- with open(input_json_path, 'r') as f:
277
- sprites_data = json.load(f)
278
-
279
- sprite_ids, texts, sprite_base64 = [], [], []
280
- for sid, sprite in sprites_data.items():
281
- sprite_ids.append(sid)
282
- texts.append("This is " + sprite.get("description", sprite.get("name", "")))
283
- sprite_base64.append(sprite["base64"])
284
-
285
- # ========================================= #
286
- # Walk folders to collect all image paths #
287
- # ========================================= #
288
- folder_image_paths = []
289
- for image_dir in image_dirs:
290
- for root, dirs, files in os.walk(image_dir):
291
- for fname in files:
292
- if fname.lower().endswith((".png", ".jpg", ".jpeg")):
293
- folder_image_paths.append(os.path.join(root, fname))
294
-
295
- # ============================== #
296
- # INITIALIZE CLIP EMBEDDER #
297
- # ============================== #
298
- clip_embd = OpenCLIPEmbeddings()
299
-
300
- # ============================== #
301
- # EMBED FOLDER IMAGES (REF) #
302
- # ============================== #
303
- img_features = clip_embd.embed_image(folder_image_paths)
304
-
305
- # ============================== #
306
- # DECODE SPRITE IMAGES #
307
- # ============================== #
308
- temp_dir = tempfile.mkdtemp()
309
- sprite_image_paths = []
310
- for idx, b64 in enumerate(sprite_base64):
311
- image_data = base64.b64decode(b64.split(",")[-1])
312
- img = Image.open(BytesIO(image_data)).convert("RGB")
313
- temp_path = os.path.join(temp_dir, f"sprite_{idx}.png")
314
- img.save(temp_path)
315
- sprite_image_paths.append(temp_path)
316
-
317
- # ============================== #
318
- # EMBED SPRITE IMAGES #
319
- # ============================== #
320
- sprite_features = clip_embd.embed_image(sprite_image_paths)
321
-
322
- # ============================== #
323
- # COMPUTE SIMILARITIES #
324
- # ============================== #
325
- if not sprite_features or not img_features:
326
- raise ValueError("❌ Embedding generation failed: One of the image feature lists is empty.")
327
- similarity = np.matmul(np.array(sprite_features), np.array(img_features).T)
328
- most_similar_indices = np.argmax(similarity, axis=1)
329
-
330
- # ============= Match and copy ================
331
- project_data, backdrop_data = [], []
332
- copied_folders = set()
333
- for sprite_idx, matched_idx in enumerate(most_similar_indices):
334
- matched_image_path = os.path.normpath(folder_image_paths[matched_idx])
335
- matched_folder = os.path.dirname(matched_image_path)
336
- if matched_folder in copied_folders:
337
- continue
338
- copied_folders.add(matched_folder)
339
-
340
- # Sprite
341
- sprite_json_path = os.path.join(matched_folder, 'sprite.json')
342
- if os.path.exists(sprite_json_path):
343
- with open(sprite_json_path, 'r') as f:
344
- sprite_data = json.load(f)
345
- project_data.append(sprite_data)
346
-
347
- for fname in os.listdir(matched_folder):
348
- if fname not in {os.path.basename(matched_image_path), 'sprite.json'}:
349
- shutil.copy2(os.path.join(matched_folder, fname), project_folder)
350
-
351
- # Backdrop
352
- if matched_image_path.startswith(os.path.normpath(backdrop_images_path)):
353
- backdrop_json_path = os.path.join(matched_folder, 'project.json')
354
- if os.path.exists(backdrop_json_path):
355
- with open(backdrop_json_path, 'r') as f:
356
- backdrop_json_data = json.load(f)
357
- for target in backdrop_json_data.get("targets", []):
358
- if target.get("isStage"):
359
- backdrop_data.append(target)
360
- for fname in os.listdir(matched_folder):
361
- if fname not in {os.path.basename(matched_image_path), 'project.json'}:
362
- shutil.copy2(os.path.join(matched_folder, fname), project_folder)
363
-
364
- # Merge JSON structure
365
- final_project = {
366
- "targets": [],
367
- "monitors": [],
368
- "extensions": [],
369
- "meta": {
370
- "semver": "3.0.0",
371
- "vm": "11.3.0",
372
- "agent": "OpenAI ScratchVision Agent"
373
- }
374
- }
375
-
376
- for sprite in project_data:
377
- if not sprite.get("isStage", False):
378
- final_project["targets"].append(sprite)
379
-
380
- if backdrop_data:
381
- all_costumes, sounds = [], []
382
- for idx, bd in enumerate(backdrop_data):
383
- all_costumes.extend(bd.get("costumes", []))
384
- if idx == 0 and "sounds" in bd:
385
- sounds = bd["sounds"]
386
- final_project["targets"].append({
387
- "isStage": True,
388
- "name": "Stage",
389
- "variables": {},
390
- "lists": {},
391
- "broadcasts": {},
392
- "blocks": {},
393
- "comments": {},
394
- "currentCostume": 1 if len(all_costumes) > 1 else 0,
395
- "costumes": all_costumes,
396
- "sounds": sounds,
397
- "volume": 100,
398
- "layerOrder": 0,
399
- "tempo": 60,
400
- "videoTransparency": 50,
401
- "videoState": "on",
402
- "textToSpeechLanguage": None
403
- })
404
-
405
- with open(project_json_path, 'w') as f:
406
- json.dump(final_project, f, indent=2)
407
-
408
- logger.info(f"🎉 Final project saved: {project_json_path}")
409
- return project_json_path
410
-
411
-
412
- @app.route('/')
413
- def index():
414
- return render_template('app_index.html')
415
-
416
- # API endpoint
417
- @app.route('/process_pdf', methods=['POST'])
418
- def process_pdf():
419
- try:
420
- logger.info("Received request to process PDF.")
421
- if 'pdf_file' not in request.files:
422
- logger.warning("No PDF file found in request.")
423
- return jsonify({"error": "Missing PDF file in form-data with key 'pdf_file'"}), 400
424
-
425
- pdf_file = request.files['pdf_file']
426
- if pdf_file.filename == '':
427
- return jsonify({"error": "Empty filename"}), 400
428
-
429
- # Save the uploaded PDF temporarily
430
- filename = secure_filename(pdf_file.filename)
431
- temp_dir = tempfile.mkdtemp()
432
- saved_pdf_path = os.path.join(temp_dir, filename)
433
- pdf_file.save(saved_pdf_path)
434
-
435
- logger.info(f"Saved uploaded PDF to: {saved_pdf_path}")
436
-
437
- # Extract & process
438
- json_path = None
439
- output_path, result = extract_images_from_pdf(saved_pdf_path, json_path)
440
-
441
- project_output = similarity_matching(output_path)
442
- logger.info("Received request to process PDF.")
443
-
444
- return jsonify({
445
- "message": "✅ PDF processed successfully",
446
- "output_json": output_path,
447
- "sprites": result,
448
- "project_output_json": project_output
449
- })
450
- except Exception as e:
451
- logger.exception("❌ Failed to process PDF")
452
- return jsonify({"error": f"❌ Failed to process PDF: {str(e)}"}), 500
453
-
454
- if __name__ == '__main__':
455
- app.run(host='0.0.0.0', port=7860, debug=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, render_template, Response, flash, redirect, url_for, request, jsonify
2
+ import cv2
3
+ import numpy as np
4
+ from unstructured.partition.pdf import partition_pdf
5
+ import json
6
+ import base64
7
+ import io
8
+ import os
9
+ from PIL import Image, ImageEnhance, ImageDraw
10
+ from imutils.perspective import four_point_transform
11
+ from dotenv import load_dotenv
12
+ import pytesseract
13
+ from transformers import AutoProcessor, AutoModelForImageTextToText, AutoModelForVision2Seq
14
+ from langchain_community.document_loaders.image_captions import ImageCaptionLoader
15
+ from werkzeug.utils import secure_filename
16
+ import tempfile
17
+ import torch
18
+ from langchain_groq import ChatGroq
19
+ from langgraph.prebuilt import create_react_agent
20
+ import logging
21
+
22
+ # Configure logging
23
+ logging.basicConfig(
24
+ level=logging.DEBUG, # Use INFO or ERROR in production
25
+ format="%(asctime)s [%(levelname)s] %(message)s",
26
+ handlers=[
27
+ logging.FileHandler("app.log"),
28
+ logging.StreamHandler()
29
+ ]
30
+ )
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+ load_dotenv()
35
+ # os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
36
+ groq_api_key = os.getenv("GROQ_API_KEY")
37
+
38
+ llm = ChatGroq(
39
+ model="meta-llama/llama-4-maverick-17b-128e-instruct",
40
+ temperature=0,
41
+ max_tokens=None,
42
+ )
43
+
44
+ app = Flask(__name__)
45
+
46
+ pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
47
+ poppler_path = r"C:\poppler-23.11.0\Library\bin"
48
+
49
+ count = 0
50
+ PDF_GET = r"E:\Pratham\2025\Harsh Sir\Scratch Vision\images\scratch_crab.pdf"
51
+
52
+ OUTPUT_FOLDER = "OUTPUTS"
53
+ DETECTED_IMAGE_FOLDER_PATH = os.path.join(OUTPUT_FOLDER, "DETECTED_IMAGE")
54
+ IMAGE_FOLDER_PATH = os.path.join(OUTPUT_FOLDER, "SCANNED_IMAGE")
55
+ JSON_FOLDER_PATH = os.path.join(OUTPUT_FOLDER, "EXTRACTED_JSON")
56
+
57
+ for path in [OUTPUT_FOLDER, IMAGE_FOLDER_PATH, DETECTED_IMAGE_FOLDER_PATH, JSON_FOLDER_PATH]:
58
+ os.makedirs(path, exist_ok=True)
59
+
60
+ # Model Initialization
61
+ try:
62
+ smolvlm256m_processor = AutoProcessor.from_pretrained(
63
+ "HuggingFaceTB/SmolVLM-256M-Instruct")
64
+ # smolvlm256m_model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM-256M-Instruct").to("cpu")
65
+ smolvlm256m_model = AutoModelForVision2Seq.from_pretrained(
66
+ "HuggingFaceTB/SmolVLM-256M-Instruct",
67
+ torch_dtype=torch.bfloat16 if hasattr(
68
+ torch, "bfloat16") else torch.float32,
69
+ _attn_implementation="eager"
70
+ ).to("cpu")
71
+ except Exception as e:
72
+ raise RuntimeError(f"❌ Failed to load SmolVLM model: {str(e)}")
73
+
74
+ # SmolVLM Image Captioning functioning
75
+
76
+
77
+ def get_smolvlm_caption(image: Image.Image, prompt: str = "") -> str:
78
+ try:
79
+ # Ensure exactly one <image> token
80
+ if "<image>" not in prompt:
81
+ prompt = f"<image> {prompt.strip()}"
82
+
83
+ num_image_tokens = prompt.count("<image>")
84
+ if num_image_tokens != 1:
85
+ raise ValueError(
86
+ f"Prompt must contain exactly 1 <image> token. Found {num_image_tokens}")
87
+
88
+ inputs = smolvlm256m_processor(
89
+ images=[image], text=[prompt], return_tensors="pt").to("cpu")
90
+ output_ids = smolvlm256m_model.generate(**inputs, max_new_tokens=100)
91
+ return smolvlm256m_processor.decode(output_ids[0], skip_special_tokens=True)
92
+ except Exception as e:
93
+ return f"❌ Error during caption generation: {str(e)}"
94
+
95
+ # --- FUNCTION: Extract images from saved PDF ---
96
+
97
+
98
+ def extract_images_from_pdf(pdf_path, output_json_path):
99
+ ''' Extract images from PDF and generate structured sprite JSON '''
100
+
101
+ try:
102
+ pdf_filename = os.path.splitext(os.path.basename(pdf_path))[
103
+ 0] # e.g., "scratch_crab"
104
+ pdf_dir_path = os.path.dirname(pdf_path).replace("/", "\\")
105
+
106
+ # Create subfolders
107
+ extracted_image_subdir = os.path.join(
108
+ DETECTED_IMAGE_FOLDER_PATH, pdf_filename)
109
+ json_subdir = os.path.join(JSON_FOLDER_PATH, pdf_filename)
110
+ os.makedirs(extracted_image_subdir, exist_ok=True)
111
+ os.makedirs(json_subdir, exist_ok=True)
112
+
113
+ # Output paths
114
+ output_json_path = os.path.join(json_subdir, "extracted.json")
115
+ final_json_path = os.path.join(json_subdir, "extracted_sprites.json")
116
+
117
+ try:
118
+ elements = partition_pdf(
119
+ filename=pdf_path,
120
+ strategy="hi_res",
121
+ extract_image_block_types=["Image"],
122
+ extract_image_block_to_payload=True, # Set to True to get base64 in output
123
+ )
124
+ except Exception as e:
125
+ raise RuntimeError(
126
+ f"❌ Failed to extract images from PDF: {str(e)}")
127
+
128
+ try:
129
+ with open(output_json_path, "w") as f:
130
+ json.dump([element.to_dict()
131
+ for element in elements], f, indent=4)
132
+ except Exception as e:
133
+ raise RuntimeError(f"❌ Failed to write extracted.json: {str(e)}")
134
+
135
+ try:
136
+ # Display extracted images
137
+ with open(output_json_path, 'r') as file:
138
+ file_elements = json.load(file)
139
+ except Exception as e:
140
+ raise RuntimeError(f"❌ Failed to read extracted.json: {str(e)}")
141
+
142
+ # Prepare manipulated sprite JSON structure
143
+ manipulated_json = {}
144
+
145
+ # SET A SYSTEM PROMPT
146
+ system_prompt = """
147
+ You are an expert in visual scene understanding.
148
+ Your Job is to analyze an image and respond acoording if asked for name give simple name by analyzing it and if ask for descrption generate a short description covering its elements.
149
+
150
+ Guidelines:
151
+ - Focus only the images given in Square Shape.
152
+ - Don't Consider Blank areas in Image as.
153
+ - Don't include generic summary or explanation outside the fields.
154
+ Return only string.
155
+ """
156
+
157
+ agent = create_react_agent(
158
+ model=llm,
159
+ tools=[],
160
+ prompt=system_prompt
161
+ )
162
+
163
+ # If JSON already exists, load it and find the next available Sprite number
164
+ if os.path.exists(final_json_path):
165
+ with open(final_json_path, "r") as existing_file:
166
+ manipulated = json.load(existing_file)
167
+ # Determine the next available index (e.g., Sprite 4 if 1–3 already exist)
168
+ existing_keys = [int(k.replace("Sprite ", ""))
169
+ for k in manipulated.keys()]
170
+ start_count = max(existing_keys, default=0) + 1
171
+ else:
172
+ start_count = 1
173
+
174
+ sprite_count = start_count
175
+ for i, element in enumerate(file_elements):
176
+ if "image_base64" in element["metadata"]:
177
+ try:
178
+ image_data = base64.b64decode(
179
+ element["metadata"]["image_base64"])
180
+ image = Image.open(io.BytesIO(image_data)).convert("RGB")
181
+ image.show(title=f"Extracted Image {i+1}")
182
+ image_path = os.path.join(
183
+ extracted_image_subdir, f"Sprite_{i+1}.png")
184
+ image.save(image_path)
185
+ with open(image_path, "rb") as image_file:
186
+ image_bytes = image_file.read()
187
+ img_base64 = base64.b64encode(image_bytes).decode("utf-8")
188
+ # description = get_smolvlm_caption(image, prompt="Give a brief Description")
189
+ # name = get_smolvlm_caption(image, prompt="give a short name/title of this Image.")
190
+
191
+ def clean_caption_output(raw_output: str, prompt: str) -> str:
192
+ answer = raw_output.replace(prompt, '').replace(
193
+ "<image>", '').strip(" :-\n")
194
+ return answer
195
+
196
+ prompt_description = "Give a brief Captioning."
197
+ prompt_name = "give a short name caption of this Image."
198
+
199
+ content1 = [
200
+ {
201
+ "type": "text",
202
+ "text": f"{prompt_description}"
203
+ },
204
+ {
205
+ "type": "image_url",
206
+ "image_url": {
207
+ "url": f"data:image/jpeg;base64,{img_base64}"
208
+ }
209
+ }
210
+ ]
211
+ response1 = agent.invoke(
212
+ {"messages": [{"role": "user", "content": content1}]})
213
+ print(response1)
214
+ description = response1["messages"][-1].content
215
+
216
+ content2 = [
217
+ {
218
+ "type": "text",
219
+ "text": f"{prompt_name}"
220
+ },
221
+ {
222
+ "type": "image_url",
223
+ "image_url": {
224
+ "url": f"data:image/jpeg;base64,{img_base64}"
225
+ }
226
+ }
227
+ ]
228
+
229
+ response2 = agent.invoke(
230
+ {"messages": [{"role": "user", "content": content2}]})
231
+ print(response2)
232
+ name = response2["messages"][-1].content
233
+
234
+ # raw_description = get_smolvlm_caption(image, prompt=prompt_description)
235
+ # raw_name = get_smolvlm_caption(image, prompt=prompt_name)
236
+
237
+ # description = clean_caption_output(raw_description, prompt_description)
238
+ # name = clean_caption_output(raw_name, prompt_name)
239
+
240
+ manipulated_json[f"Sprite {sprite_count}"] = {
241
+ "name": name,
242
+ "base64": element["metadata"]["image_base64"],
243
+ "file-path": pdf_dir_path,
244
+ "description": description
245
+ }
246
+ sprite_count += 1
247
+ except Exception as e:
248
+ print(f"⚠️ Error processing Sprite {i+1}: {str(e)}")
249
+
250
+ # Save manipulated JSON
251
+ with open(final_json_path, "w") as sprite_file:
252
+ json.dump(manipulated_json, sprite_file, indent=4)
253
+
254
+ print(f"✅ Manipulated sprite JSON saved: {final_json_path}")
255
+ return final_json_path, manipulated_json
256
+
257
+ except Exception as e:
258
+ raise RuntimeError(f"❌ Error in extract_images_from_pdf: {str(e)}")
259
+
260
+
261
+ def similarity_matching(input_json_path: str) -> str:
262
+ import uuid
263
+ import shutil
264
+ import tempfile
265
+ from langchain_experimental.open_clip.open_clip import OpenCLIPEmbeddings
266
+ from matplotlib.offsetbox import OffsetImage, AnnotationBbox
267
+ from io import BytesIO
268
+
269
+ logger.info("🔍 Running similarity matching...")
270
+
271
+ # ============================== #
272
+ # DEFINE PATHS #
273
+ # ============================== #
274
+ backdrop_images_path = r"E:\Pratham\2025\Harsh Sir\Scratch Vision\images\Backdrops"
275
+ sprite_images_path = r"E:\Pratham\2025\Harsh Sir\Scratch Vision\images\sprites"
276
+ image_dirs = [backdrop_images_path, sprite_images_path]
277
+
278
+ # ================================================= #
279
+ # Generate Random UUID for project folder name #
280
+ # ================================================= #
281
+ random_id = str(uuid.uuid4()).replace('-', '')
282
+ project_folder = os.path.join("outputs", f"project_{random_id}")
283
+
284
+ # =========================================================================== #
285
+ # Create empty json in project_{random_id} folder #
286
+ # =========================================================================== #
287
+ os.makedirs(project_folder, exist_ok=True)
288
+ project_json_path = os.path.join(project_folder, "project.json")
289
+
290
+ # ============================== #
291
+ # READ SPRITE METADATA #
292
+ # ============================== #
293
+ with open(input_json_path, 'r') as f:
294
+ sprites_data = json.load(f)
295
+
296
+ sprite_ids, texts, sprite_base64 = [], [], []
297
+ for sid, sprite in sprites_data.items():
298
+ sprite_ids.append(sid)
299
+ texts.append(
300
+ "This is " + sprite.get("description", sprite.get("name", "")))
301
+ sprite_base64.append(sprite["base64"])
302
+
303
+ # ============================== #
304
+ # INITIALIZE CLIP EMBEDDER #
305
+ # ============================== #
306
+ clip_embd = OpenCLIPEmbeddings()
307
+
308
+ # ========================================= #
309
+ # Walk folders to collect all image paths #
310
+ # ========================================= #
311
+ # folder_image_paths = []
312
+ # for image_dir in image_dirs:
313
+ # for root, _, files in os.walk(image_dir):
314
+ # for fname in files:
315
+ # if fname.lower().endswith((".png", ".jpg", ".jpeg")):
316
+ # folder_image_paths.append(os.path.join(root, fname))
317
+
318
+ # # ============================== #
319
+ # # EMBED FOLDER IMAGES (REF) #
320
+ # # ============================== #
321
+ # img_features = clip_embd.embed_image(folder_image_paths)
322
+
323
+ # # ============================== #
324
+ # # Store image embeddings #
325
+ # # ============================== #
326
+ # embedding_json = []
327
+ # for i, path in enumerate(folder_image_paths):
328
+ # embedding_json.append({
329
+ # "name":os.path.basename(path),
330
+ # "file-path": path,
331
+ # "embeddings": list(img_features[i])
332
+ # })
333
+
334
+ # # Save to embeddings.json
335
+ # with open(f"{OUTPUT_FOLDER}/embeddings.json", "w") as f:
336
+ # json.dump(embedding_json, f, indent=2)
337
+ # ============================== #
338
+ # DECODE SPRITE IMAGES #
339
+ # ============================== #
340
+ temp_dir = tempfile.mkdtemp()
341
+ sprite_image_paths = []
342
+ for idx, b64 in enumerate(sprite_base64):
343
+ image_data = base64.b64decode(b64.split(",")[-1])
344
+ img = Image.open(BytesIO(image_data)).convert("RGB")
345
+ temp_path = os.path.join(temp_dir, f"sprite_{idx}.png")
346
+ img.save(temp_path)
347
+ sprite_image_paths.append(temp_path)
348
+
349
+ # ============================== #
350
+ # EMBED SPRITE IMAGES #
351
+ # ============================== #
352
+ sprite_features = clip_embd.embed_image(sprite_image_paths)
353
+
354
+ # ============================== #
355
+ # COMPUTE SIMILARITIES #
356
+ # ============================== #
357
+ with open(f"{OUTPUT_FOLDER}/embeddings.json", "r") as f:
358
+ embedding_json = json.load(f)
359
+
360
+ img_matrix = np.array([img["embeddings"] for img in embedding_json])
361
+ sprite_matrix = np.array(sprite_features)
362
+
363
+ similarity = np.matmul(sprite_matrix, img_matrix.T)
364
+ most_similar_indices = np.argmax(similarity, axis=1)
365
+
366
+ # ============= Match and copy ================
367
+ project_data, backdrop_data = [], []
368
+ copied_folders = set()
369
+ for sprite_idx, matched_idx in enumerate(most_similar_indices):
370
+ matched_entry = embedding_json[matched_idx]
371
+ # matched_image_path = os.path.normpath(folder_image_paths[matched_idx])
372
+ matched_image_path = os.path.normpath(matched_entry["file-path"])
373
+ matched_folder = os.path.dirname(matched_image_path)
374
+ if matched_folder in copied_folders:
375
+ continue
376
+ copied_folders.add(matched_folder)
377
+
378
+ # Sprite
379
+ sprite_json_path = os.path.join(matched_folder, 'sprite.json')
380
+ if os.path.exists(sprite_json_path):
381
+ with open(sprite_json_path, 'r') as f:
382
+ sprite_data = json.load(f)
383
+ project_data.append(sprite_data)
384
+
385
+ for fname in os.listdir(matched_folder):
386
+ if fname not in {os.path.basename(matched_image_path), 'sprite.json'}:
387
+ shutil.copy2(os.path.join(
388
+ matched_folder, fname), project_folder)
389
+
390
+ # Backdrop
391
+ if matched_image_path.startswith(os.path.normpath(backdrop_images_path)):
392
+ backdrop_json_path = os.path.join(matched_folder, 'project.json')
393
+ if os.path.exists(backdrop_json_path):
394
+ with open(backdrop_json_path, 'r') as f:
395
+ backdrop_json_data = json.load(f)
396
+ for target in backdrop_json_data.get("targets", []):
397
+ if target.get("isStage"):
398
+ backdrop_data.append(target)
399
+ for fname in os.listdir(matched_folder):
400
+ if fname not in {os.path.basename(matched_image_path), 'project.json'}:
401
+ shutil.copy2(os.path.join(
402
+ matched_folder, fname), project_folder)
403
+
404
+ # Merge JSON structure
405
+ final_project = {
406
+ "targets": [],
407
+ "monitors": [],
408
+ "extensions": [],
409
+ "meta": {
410
+ "semver": "3.0.0",
411
+ "vm": "11.3.0",
412
+ "agent": "OpenAI ScratchVision Agent"
413
+ }
414
+ }
415
+
416
+ for sprite in project_data:
417
+ if not sprite.get("isStage", False):
418
+ final_project["targets"].append(sprite)
419
+
420
+ if backdrop_data:
421
+ all_costumes, sounds = [], []
422
+ for idx, bd in enumerate(backdrop_data):
423
+ all_costumes.extend(bd.get("costumes", []))
424
+ if idx == 0 and "sounds" in bd:
425
+ sounds = bd["sounds"]
426
+ final_project["targets"].append({
427
+ "isStage": True,
428
+ "name": "Stage",
429
+ "variables": {},
430
+ "lists": {},
431
+ "broadcasts": {},
432
+ "blocks": {},
433
+ "comments": {},
434
+ "currentCostume": 1 if len(all_costumes) > 1 else 0,
435
+ "costumes": all_costumes,
436
+ "sounds": sounds,
437
+ "volume": 100,
438
+ "layerOrder": 0,
439
+ "tempo": 60,
440
+ "videoTransparency": 50,
441
+ "videoState": "on",
442
+ "textToSpeechLanguage": None
443
+ })
444
+
445
+ with open(project_json_path, 'w') as f:
446
+ json.dump(final_project, f, indent=2)
447
+
448
+ logger.info(f"🎉 Final project saved: {project_json_path}")
449
+ return project_json_path
450
+
451
+
452
+ @app.route('/')
453
+ def index():
454
+ return render_template('app_index.html')
455
+
456
+ # API endpoint
457
+
458
+
459
+ @app.route('/process_pdf', methods=['POST'])
460
+ def process_pdf():
461
+ try:
462
+ logger.info("Received request to process PDF.")
463
+ if 'pdf_file' not in request.files:
464
+ logger.warning("No PDF file found in request.")
465
+ return jsonify({"error": "Missing PDF file in form-data with key 'pdf_file'"}), 400
466
+
467
+ pdf_file = request.files['pdf_file']
468
+ if pdf_file.filename == '':
469
+ return jsonify({"error": "Empty filename"}), 400
470
+
471
+ # Save the uploaded PDF temporarily
472
+ filename = secure_filename(pdf_file.filename)
473
+ temp_dir = tempfile.mkdtemp()
474
+ saved_pdf_path = os.path.join(temp_dir, filename)
475
+ pdf_file.save(saved_pdf_path)
476
+
477
+ logger.info(f"Saved uploaded PDF to: {saved_pdf_path}")
478
+
479
+ # Extract & process
480
+ json_path = None
481
+ output_path, result = extract_images_from_pdf(
482
+ saved_pdf_path, json_path)
483
+
484
+ project_output = similarity_matching(output_path)
485
+ logger.info("Received request to process PDF.")
486
+
487
+ return jsonify({
488
+ "message": "✅ PDF processed successfully",
489
+ "output_json": output_path,
490
+ "sprites": result,
491
+ "project_output_json": project_output
492
+ })
493
+ except Exception as e:
494
+ logger.exception("❌ Failed to process PDF")
495
+ return jsonify({"error": f"❌ Failed to process PDF: {str(e)}"}), 500
496
+
497
+
498
+ if __name__ == '__main__':
499
+ app.run(host='0.0.0.0', port=7860, debug=True)