# prthm11's picture
# Update app_main.py
# cf438e2 verified
# raw
# history blame
# 7.13 kB
from flask import Flask, render_template, Response, flash, redirect, url_for, request, jsonify
import cv2
import numpy as np
from unstructured.partition.pdf import partition_pdf
import json, base64, io, os
from PIL import Image, ImageEnhance, ImageDraw
from imutils.perspective import four_point_transform
from dotenv import load_dotenv
import pytesseract
from transformers import AutoProcessor, AutoModelForImageTextToText
from langchain_community.document_loaders.image_captions import ImageCaptionLoader
from werkzeug.utils import secure_filename
import tempfile, logging
# Flask application object; the route decorators further down register on it.
app = Flask(__name__)

# Logging: every record is written both to app.log and to the console.
_log_handlers = [logging.FileHandler("app.log"), logging.StreamHandler()]
logging.basicConfig(
    handlers=_log_handlers,
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.DEBUG,  # Use INFO or ERROR in production
)
logger = logging.getLogger(__name__)

# Hard-coded Windows install locations of the external Tesseract / Poppler tools.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
poppler_path = r"C:\poppler-23.11.0\Library\bin"

# NOTE(review): `count` and `PDF_GET` are not referenced anywhere in the
# visible code; kept because other modules may import them.
count = 0
PDF_GET = r"E:\Pratham\2025\Harsh Sir\Scratch Vision\images\scratch_crab.pdf"

# Output directory layout, all rooted at OUTPUT_FOLDER.
OUTPUT_FOLDER = "OUTPUTS"
DETECTED_IMAGE_FOLDER_PATH = os.path.join(OUTPUT_FOLDER, "DETECTED_IMAGE")
IMAGE_FOLDER_PATH = os.path.join(OUTPUT_FOLDER, "SCANNED_IMAGE")
JSON_FOLDER_PATH = os.path.join(OUTPUT_FOLDER, "EXTRACTED_JSON")

# Create every output directory up front so later writes cannot fail on a
# missing folder.
for path in (OUTPUT_FOLDER, IMAGE_FOLDER_PATH, DETECTED_IMAGE_FOLDER_PATH, JSON_FOLDER_PATH):
    os.makedirs(path, exist_ok=True)
# Model initialization: the SmolVLM processor and model are loaded once at
# import time from the same checkpoint, and the model is placed on the CPU.
_SMOLVLM_CHECKPOINT = "HuggingFaceTB/SmolVLM-256M-Instruct"
smolvlm256m_processor = AutoProcessor.from_pretrained(_SMOLVLM_CHECKPOINT)
smolvlm256m_model = AutoModelForImageTextToText.from_pretrained(_SMOLVLM_CHECKPOINT)
smolvlm256m_model = smolvlm256m_model.to("cpu")
# SmolVLM image captioning
def get_smolvlm_caption(image: Image.Image, prompt: str = "") -> str:
    """Generate text for ``image`` with the module-level SmolVLM model.

    Args:
        image: The PIL image to describe.
        prompt: Instruction for the model. If it contains no ``<image>``
            placeholder, one is prepended (and the prompt is stripped of
            surrounding whitespace).

    Returns:
        The text generated by the model, without the echoed prompt and without
        special tokens.

    Raises:
        ValueError: If the prompt contains more than one ``<image>`` token.
    """
    # Ensure exactly one <image> token
    if "<image>" not in prompt:
        prompt = f"<image> {prompt.strip()}"
    num_image_tokens = prompt.count("<image>")
    if num_image_tokens != 1:
        raise ValueError(f"Prompt must contain exactly 1 <image> token. Found {num_image_tokens}")

    inputs = smolvlm256m_processor(images=[image], text=[prompt], return_tensors="pt").to("cpu")
    output_ids = smolvlm256m_model.generate(**inputs, max_new_tokens=100)

    # For a decoder-only model, generate() returns the prompt tokens followed
    # by the newly generated ones. Decoding the whole sequence would echo the
    # prompt text into every caption, so only the new tokens are decoded.
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = output_ids[0][prompt_length:]
    return smolvlm256m_processor.decode(generated_ids, skip_special_tokens=True).strip()
# --- FUNCTION: Extract images from saved PDF ---
def _load_existing_sprites(json_path):
    """Return the sprite mapping stored at ``json_path``, or ``{}`` if there is none."""
    if not os.path.exists(json_path):
        return {}
    with open(json_path, "r", encoding="utf-8") as existing_file:
        existing = json.load(existing_file)
    # Anything other than a mapping cannot hold "Sprite N" entries.
    return existing if isinstance(existing, dict) else {}


def _next_sprite_number(sprites):
    """Return one more than the highest ``"Sprite N"`` key in ``sprites`` (1 if none)."""
    highest = 0
    for key in sprites:
        try:
            highest = max(highest, int(key.replace("Sprite ", "")))
        except ValueError:
            # Ignore keys that do not follow the "Sprite N" pattern.
            continue
    return highest + 1


def extract_images_from_pdf(pdf_path, output_json_path):
    """Extract images from a PDF and generate a structured, captioned sprite JSON.

    The PDF is partitioned with ``partition_pdf`` (``hi_res`` strategy) and the
    raw elements are dumped to ``<JSON_FOLDER_PATH>/<pdf name>/extracted.json``.
    Every element carrying an ``image_base64`` payload is saved as a PNG under
    ``<DETECTED_IMAGE_FOLDER_PATH>/<pdf name>/``, captioned with
    ``get_smolvlm_caption`` and added to
    ``<JSON_FOLDER_PATH>/<pdf name>/extracted_sprites.json``.

    Sprites already stored in that file are kept; new sprites are appended with
    the next free ``"Sprite N"`` number.

    Args:
        pdf_path: Path of the PDF file to process.
        output_json_path: Accepted for backward compatibility but ignored; all
            output locations are derived from the PDF file name.

    Returns:
        A ``(final_json_path, sprites)`` tuple: the path of the sprite JSON file
        and the full sprite mapping that was written to it.
    """
    pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]  # e.g., "scratch_crab"
    # Use the platform's native separators; blindly replacing "/" with "\\"
    # produces an invalid path on non-Windows hosts.
    pdf_dir = os.path.dirname(pdf_path)
    pdf_dir_path = os.path.normpath(pdf_dir) if pdf_dir else pdf_dir

    # Create per-PDF subfolders
    extracted_image_subdir = os.path.join(DETECTED_IMAGE_FOLDER_PATH, pdf_filename)
    json_subdir = os.path.join(JSON_FOLDER_PATH, pdf_filename)
    os.makedirs(extracted_image_subdir, exist_ok=True)
    os.makedirs(json_subdir, exist_ok=True)

    # Output paths (built from components so a ".json" inside a folder name
    # can never be rewritten by a string replace)
    output_json_path = os.path.join(json_subdir, "extracted.json")
    final_json_path = os.path.join(json_subdir, "extracted_sprites.json")

    elements = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",
        extract_image_block_types=["Image"],
        extract_image_block_to_payload=True,  # Set to True to get base64 in output
    )
    file_elements = [element.to_dict() for element in elements]
    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(file_elements, f, indent=4)

    # Start from the sprites saved by earlier runs so they are not lost when
    # the file is rewritten below, and continue their numbering.
    manipulated_json = _load_existing_sprites(final_json_path)
    sprite_count = _next_sprite_number(manipulated_json)

    for i, element in enumerate(file_elements):
        image_base64 = element.get("metadata", {}).get("image_base64")
        if not image_base64:
            continue

        image_data = base64.b64decode(image_base64)
        image = Image.open(io.BytesIO(image_data)).convert("RGB")
        # No image.show() here: this runs inside a web request, where opening
        # a desktop viewer is unwanted.
        image_path = os.path.join(extracted_image_subdir, f"Sprite_{i+1}.png")
        image.save(image_path)

        description = get_smolvlm_caption(image, prompt="Give a brief Description")
        name = get_smolvlm_caption(image, prompt="give a short name/title of this Image.")
        manipulated_json[f"Sprite {sprite_count}"] = {
            "name": name,
            "base64": image_base64,
            "file-path": pdf_dir_path,
            "description": description,
        }
        sprite_count += 1

    # Save manipulated JSON
    with open(final_json_path, "w", encoding="utf-8") as sprite_file:
        json.dump(manipulated_json, sprite_file, indent=4)
    # Logged rather than printed: a console that cannot encode the emoji would
    # make print() raise after all the work has been done.
    logging.getLogger(__name__).info("✅ Manipulated sprite JSON saved: %s", final_json_path)
    return final_json_path, manipulated_json
@app.route('/')
def index():
    """Serve the landing page rendered from the ``app_index.html`` template."""
    landing_page = render_template('app_index.html')
    return landing_page
# API endpoint
@app.route('/process_pdf', methods=['POST'])
def process_pdf():
    """Handle a PDF upload and return the sprites extracted from it.

    Expects a multipart form upload with the PDF under the key ``pdf_file``.
    The file is stored in a temporary directory, processed with
    ``extract_images_from_pdf`` and the temporary directory is removed again.

    Returns:
        A JSON response with ``message``, ``output_json`` and ``sprites`` on
        success; a JSON ``error`` with status 400 for a missing or unusable
        upload, or status 500 if processing fails.
    """
    # Function-scope import: shutil is not among the module-level imports.
    import shutil

    try:
        logger.info("Received request to process PDF.")
        if 'pdf_file' not in request.files:
            logger.warning("No PDF file found in request.")
            return jsonify({"error": "Missing PDF file in form-data with key 'pdf_file'"}), 400

        pdf_file = request.files['pdf_file']
        if pdf_file.filename == '':
            return jsonify({"error": "Empty filename"}), 400

        # secure_filename() can strip a name down to nothing (e.g. "../");
        # saving to the bare directory path would then fail with a 500.
        filename = secure_filename(pdf_file.filename)
        if not filename:
            logger.warning("Uploaded filename is empty after sanitization.")
            return jsonify({"error": "Invalid filename"}), 400

        # Save the uploaded PDF temporarily
        temp_dir = tempfile.mkdtemp()
        try:
            saved_pdf_path = os.path.join(temp_dir, filename)
            pdf_file.save(saved_pdf_path)
            logger.info("Saved uploaded PDF to: %s", saved_pdf_path)

            # Extract & process (the second argument is ignored by the callee)
            output_path, result = extract_images_from_pdf(saved_pdf_path, None)
        finally:
            # Remove the upload so temp directories do not pile up per request.
            # ignore_errors keeps a cleanup hiccup (e.g. a file still locked on
            # Windows) from turning a successful run into an error response.
            shutil.rmtree(temp_dir, ignore_errors=True)

        logger.info("PDF processed successfully: %s", output_path)
        return jsonify({
            "message": "✅ PDF processed successfully",
            "output_json": output_path,
            "sprites": result
        })
    except Exception as e:
        logger.exception("❌ Failed to process PDF")
        return jsonify({"error": f"❌ Failed to process PDF: {str(e)}"}), 500
if __name__ == '__main__':
    # The server listens on all interfaces, and the Werkzeug debugger allows
    # arbitrary code execution from the browser. Debug mode is therefore off
    # unless explicitly requested with FLASK_DEBUG=1 (or true/yes/on).
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in ("1", "true", "yes", "on")
    app.run(host='0.0.0.0', port=7860, debug=debug_enabled)