Scratch_Vision_Game_dup

Sleeping

App Files Files Community

Scratch_Vision_Game_dup / app_main.py

prthm11

Update app_main.py

75c46c8 verified about 1 month ago

raw

history blame

6.17 kB

	from flask import Flask, render_template, Response, flash, redirect, url_for, request, jsonify
	import cv2
	import numpy as np
	from unstructured.partition.pdf import partition_pdf
	import json, base64, io, os
	from PIL import Image, ImageEnhance, ImageDraw
	from imutils.perspective import four_point_transform
	from dotenv import load_dotenv
	import pytesseract
	from transformers import AutoProcessor, AutoModelForImageTextToText
	from langchain_community.document_loaders.image_captions import ImageCaptionLoader

	# Flask application object; the route handlers below register against it.
	app = Flask(__name__)

	# Windows-style absolute paths to the Tesseract executable and the Poppler
	# binaries. NOTE(review): presumably only valid on the original development
	# machine -- confirm before deploying elsewhere.
	pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
	poppler_path = r"C:\poppler-23.11.0\Library\bin"

	# Not referenced anywhere in the visible part of this file.
	count = 0
	PDF_GET = r"E:\Pratham\2025\Harsh Sir\Scratch Vision\images\scratch_crab.pdf"

	# Output directory layout: every generated artefact lives under OUTPUT_FOLDER.
	OUTPUT_FOLDER = "OUTPUTS"
	DETECTED_IMAGE_FOLDER_PATH = os.path.join(OUTPUT_FOLDER, "DETECTED_IMAGE")
	IMAGE_FOLDER_PATH = os.path.join(OUTPUT_FOLDER, "SCANNED_IMAGE")
	JSON_FOLDER_PATH = os.path.join(OUTPUT_FOLDER, "EXTRACTED_JSON")

	# Create the folders at import time; exist_ok makes repeated start-ups harmless.
	for path in (
	    OUTPUT_FOLDER,
	    IMAGE_FOLDER_PATH,
	    DETECTED_IMAGE_FOLDER_PATH,
	    JSON_FOLDER_PATH,
	):
	    os.makedirs(path, exist_ok=True)

	# Model initialization: the processor and the model are loaded once, at import
	# time, from the same checkpoint, and the model is placed on the CPU.
	_SMOLVLM_CHECKPOINT = "HuggingFaceTB/SmolVLM-256M-Instruct"
	smolvlm256m_processor = AutoProcessor.from_pretrained(_SMOLVLM_CHECKPOINT)
	smolvlm256m_model = AutoModelForImageTextToText.from_pretrained(_SMOLVLM_CHECKPOINT)
	smolvlm256m_model = smolvlm256m_model.to("cpu")

	# SmolVLM image captioning
	def get_smolvlm_caption(image: Image.Image, prompt: str = "") -> str:
	    """Run the SmolVLM model on *image* with *prompt* and return the decoded text.

	    Args:
	        image: PIL image passed to the processor.
	        prompt: Instruction text. When it contains no ``<image>`` placeholder,
	            one is prepended to the stripped prompt.

	    Returns:
	        The first generated sequence decoded with special tokens skipped.

	    Raises:
	        ValueError: If the prompt contains more than one ``<image>`` placeholder.
	    """
	    image_token = "<image>"

	    # Prepend the placeholder when the caller did not supply one.
	    if image_token not in prompt:
	        prompt = f"<image> {prompt.strip()}"

	    token_total = prompt.count(image_token)
	    if token_total != 1:
	        raise ValueError(f"Prompt must contain exactly 1 <image> token. Found {token_total}")

	    model_inputs = smolvlm256m_processor(images=[image], text=[prompt], return_tensors="pt")
	    model_inputs = model_inputs.to("cpu")
	    generated_ids = smolvlm256m_model.generate(**model_inputs, max_new_tokens=100)
	    first_sequence = generated_ids[0]
	    return smolvlm256m_processor.decode(first_sequence, skip_special_tokens=True)

	def _load_existing_sprites(json_path):
	    """Return the sprite mapping stored at *json_path*, or ``{}`` if there is none."""
	    if not os.path.exists(json_path):
	        return {}
	    with open(json_path, "r", encoding="utf-8") as existing_file:
	        stored = json.load(existing_file)
	    # Anything other than a JSON object cannot hold "Sprite N" entries.
	    return stored if isinstance(stored, dict) else {}


	def _next_sprite_number(sprites):
	    """Return one more than the highest ``"Sprite N"`` key in *sprites* (1 if none)."""
	    highest = 0
	    for key in sprites:
	        prefix, _, suffix = key.partition(" ")
	        # Keys that do not follow the "Sprite <int>" pattern are ignored
	        # instead of aborting the whole extraction with a ValueError.
	        if prefix == "Sprite" and suffix.isdigit():
	            highest = max(highest, int(suffix))
	    return highest + 1


	# --- FUNCTION: Extract images from saved PDF ---
	def extract_images_from_pdf(pdf_path, output_json_path):
	    """Extract images from a PDF and generate structured sprite JSON.

	    The PDF is partitioned with ``partition_pdf`` (``hi_res`` strategy). Every
	    element whose metadata carries an ``image_base64`` payload is saved as a
	    PNG, captioned twice with ``get_smolvlm_caption`` (description and name)
	    and recorded under a ``"Sprite N"`` key.

	    Side effects, all below folders named after the PDF's base name:
	      * ``<JSON_FOLDER_PATH>/<pdf>/extracted.json`` -- raw element dump.
	      * ``<DETECTED_IMAGE_FOLDER_PATH>/<pdf>/Sprite_<i>.png`` -- one file per
	        image element, ``i`` being the element's 1-based position.
	      * ``<JSON_FOLDER_PATH>/<pdf>/extracted_sprites.json`` -- sprite mapping.

	    When ``extracted_sprites.json`` already exists, its entries are kept and
	    new sprites are numbered after the highest existing one.

	    Args:
	        pdf_path: Path of the PDF to process.
	        output_json_path: Ignored. The output location is always derived from
	            ``pdf_path``; the parameter is kept so existing callers keep working.

	    Returns:
	        A ``(final_json_path, manipulated_json)`` tuple: the path of the sprite
	        JSON file and the mapping written to it.
	    """
	    pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]  # e.g., "scratch_crab"
	    # Directory of the PDF with "/" turned into "\"; stored verbatim in the JSON.
	    pdf_dir_path = os.path.dirname(pdf_path).replace("/", "\\")

	    # Create subfolders
	    extracted_image_subdir = os.path.join(DETECTED_IMAGE_FOLDER_PATH, pdf_filename)
	    json_subdir = os.path.join(JSON_FOLDER_PATH, pdf_filename)
	    os.makedirs(extracted_image_subdir, exist_ok=True)
	    os.makedirs(json_subdir, exist_ok=True)

	    # Output paths. Both are built with os.path.join; deriving the second one
	    # via str.replace(".json", ...) would also rewrite a ".json" occurring in
	    # the directory part of the path.
	    output_json_path = os.path.join(json_subdir, "extracted.json")
	    final_json_path = os.path.join(json_subdir, "extracted_sprites.json")

	    elements = partition_pdf(
	        filename=pdf_path,
	        strategy="hi_res",
	        extract_image_block_types=["Image"],
	        extract_image_block_to_payload=True,  # Set to True to get base64 in output
	    )

	    # Convert once and reuse the result; there is no need to read the dump back.
	    file_elements = [element.to_dict() for element in elements]
	    with open(output_json_path, "w", encoding="utf-8") as f:
	        json.dump(file_elements, f, indent=4)

	    # Start from the sprites of earlier runs so that rewriting the file does
	    # not discard them while the numbering continues from their highest index.
	    manipulated_json = _load_existing_sprites(final_json_path)
	    sprite_count = _next_sprite_number(manipulated_json)

	    for i, element in enumerate(file_elements):
	        image_b64 = element["metadata"].get("image_base64")
	        if not image_b64:
	            continue

	        image_data = base64.b64decode(image_b64)
	        image = Image.open(io.BytesIO(image_data)).convert("RGB")
	        # No image.show() here: this runs inside a web request, and opening a
	        # desktop viewer per image is a debugging leftover.
	        image_path = os.path.join(extracted_image_subdir, f"Sprite_{i+1}.png")
	        image.save(image_path)

	        description = get_smolvlm_caption(image, prompt="Give a brief Description")
	        name = get_smolvlm_caption(image, prompt="give a short name/title of this Image.")

	        manipulated_json[f"Sprite {sprite_count}"] = {
	            "name": name,
	            "base64": image_b64,
	            "file-path": pdf_dir_path,
	            "description": description,
	        }
	        sprite_count += 1

	    # Save manipulated JSON
	    with open(final_json_path, "w", encoding="utf-8") as sprite_file:
	        json.dump(manipulated_json, sprite_file, indent=4)

	    print(f"✅ Manipulated sprite JSON saved: {final_json_path}")
	    return final_json_path, manipulated_json

	# API endpoint
	@app.route('/process_static_pdf', methods=['POST'])
	def process_static_pdf():
	    """Process a PDF into sprite JSON and return the result as a JSON response.

	    The PDF defaults to ``PDF_GET``. A JSON request body of the form
	    ``{"pdf_path": "..."}`` overrides it; a missing, non-JSON or non-object
	    body is ignored, so a bare POST still processes the default file.

	    Returns:
	        On success, a JSON object with ``message``, ``output_json`` (path of
	        the sprite JSON file) and ``sprites`` (the sprite mapping).
	        A 400 response with an ``error`` field when ``pdf_path`` is not a
	        string or does not point to an existing file.
	    """
	    # Option 1: Use hardcoded static PDF
	    pdf_path = PDF_GET

	    # Optional: Allow override via JSON request body.
	    # get_json(silent=True) yields None instead of aborting the request when
	    # the body is absent or is not valid JSON.
	    payload = request.get_json(silent=True)
	    if isinstance(payload, dict) and "pdf_path" in payload:
	        # NOTE(review): this is a client-supplied filesystem path and is used
	        # without any restriction -- consider limiting it to a known folder.
	        pdf_path = payload["pdf_path"]

	    if not isinstance(pdf_path, str):
	        return jsonify({"error": "pdf_path must be a string"}), 400

	    if not os.path.isfile(pdf_path):
	        return jsonify({"error": f"File not found: {pdf_path}"}), 400

	    # The second argument is ignored by extract_images_from_pdf, which derives
	    # its output location from the PDF name.
	    json_path = None
	    output_path, result = extract_images_from_pdf(pdf_path, json_path)

	    return jsonify({
	        "message": "✅ PDF processed successfully",
	        "output_json": output_path,
	        "sprites": result,
	    })

	if __name__ == '__main__':
	    # The server listens on all interfaces, and Flask's debug mode enables the
	    # Werkzeug interactive debugger, which lets anyone who can reach it run
	    # arbitrary code. Debug mode is therefore off unless FLASK_DEBUG asks for it.
	    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in ("1", "true", "yes")
	    app.run(host='0.0.0.0', port=7860, debug=debug_enabled)