#==================================================================================# # Find Contours from Image and Convert into PDF # #==================================================================================# import cv2, os import numpy as np from imutils.perspective import four_point_transform from PIL import Image from unstructured.partition.pdf import partition_pdf import json, base64, io from flask import Flask, render_template, flash, redirect, url_for from dotenv import load_dotenv import pytesseract load_dotenv() app = Flask(__name__) app.secret_key = os.getenv("SECRET_KEY") pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe" poppler_path=r"C:\poppler-23.11.0\Library\bin" count = 0 OUTPUT_FOLDER = "OUTPUTS" # os.makedirs(OUTPUT_FOLDER, exist_ok=True) IMAGE_FOLDER_PATH = os.path.join(OUTPUT_FOLDER, "SCANNED_IMAGE") # os.makedirs(IMAGE_FOLDER_PATH, exist_ok=True) PDF_FOLDER_PATH = os.path.join(OUTPUT_FOLDER, "SCANNED_PDF") # os.makedirs(PDF_FOLDER_PATH, exist_ok=True) JSON_FOLDER_PATH = os.path.join(OUTPUT_FOLDER, "EXTRACTED_JSON") for path in [OUTPUT_FOLDER, IMAGE_FOLDER_PATH, PDF_FOLDER_PATH, JSON_FOLDER_PATH]: os.makedirs(path, exist_ok=True) # --- FUNCTION: Detect document contour --- def detect_document_contour(image): gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) blur = cv2.GaussianBlur(gray, (5, 5), 0) _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) contours, _ = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) contours = sorted(contours, key=cv2.contourArea, reverse=True) for contour in contours: area = cv2.contourArea(contour) if area > 1000: peri = cv2.arcLength(contour, True) approx = cv2.approxPolyDP(contour, 0.02 * peri, True) if len(approx) == 4: return approx return None # --- FUNCTION: Extract images from saved PDF --- def extract_images_from_pdf(pdf_path, output_json_path): elements = partition_pdf( filename=pdf_path, strategy="hi_res", extract_image_block_types=["Image"], # or ["Image", "Table"] extract_image_block_to_payload=True, # Set to True to get base64 in output ) with open(output_json_path, "w") as f: json.dump([element.to_dict() for element in elements], f, indent=4) # Display extracted images with open(output_json_path, 'r') as file: file_elements = json.load(file) for i, element in enumerate(file_elements): if "image_base64" in element["metadata"]: image_data = base64.b64decode(element["metadata"]["image_base64"]) image = Image.open(io.BytesIO(image_data)) image.show(title=f"Extracted Image {i+1}") # --- Route: Home Page --- @app.route("/") def index(): return render_template("index.html") # --- Route: Scan Document --- @app.route("/scan") def scan_document(): global count cap = cv2.VideoCapture(0 + cv2.CAP_DSHOW) cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1920) cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 1080) scale = 0.5 contour = None while True: ret, frame = cap.read() if not ret: flash("Camera Error!", "error") break frame = cv2.rotate(frame, cv2.ROTATE_180) display = frame.copy() contour = detect_document_contour(display) if contour is not None: cv2.drawContours(display, [contour], -1, (0, 255, 0), 3) resized = cv2.resize(display, (int(scale * display.shape[1]), int(scale * display.shape[0]))) cv2.imshow("📷 Scan Document - Press 's' to Save, ESC to Exit", resized) key = cv2.waitKey(1) & 0xFF if key == 27: # ESC break elif key == ord('s') and contour is not None: warped = four_point_transform(frame, contour.reshape(4, 2)) image_path = os.path.join(IMAGE_FOLDER_PATH, f"scanned_colored_{count}.jpg") pdf_path = os.path.join(PDF_FOLDER_PATH, f"scanned_colored_{count}.pdf") json_path = os.path.join(JSON_FOLDER_PATH, f"scanned_{count}.json") cv2.imwrite(image_path, warped) img = Image.open(image_path).convert("RGB") img.save(pdf_path) extract_images_from_pdf(pdf_path, json_path) flash("✅ Document scanned and saved!", "success") count += 1 break cap.release() cv2.destroyAllWindows() return redirect(url_for("index")) # --- Run --- if __name__ == "__main__": app.run(host="0.0.0.0", port=7860, debug=False) # while True: # ret, frame = cap.read() # if not ret: # break # frame = cv2.rotate(frame, cv2.ROTATE_180) # display = frame.copy() # contour = detect_document_contour(display) # if contour is not None: # cv2.drawContours(display, [contour], -1, (0, 255, 0), 3) # cv2.imshow("Document Scanner", cv2.resize(display, (int(scale * display.shape[1]), int(scale * display.shape[0])))) # key = cv2.waitKey(1) & 0xFF # if key == 27: # ESC to exit # break # elif key == ord('s') and contour is not None: # warped = four_point_transform(frame, contour.reshape(4, 2)) # image_path = os.path.join(IMAGE_FOLDER_PATH, f"scanned_colored_{count}.jpg") # pdf_path = os.path.join(PDF_FOLDER_PATH,f"scanned_colored_{count}.pdf") # # Save the Image # cv2.imwrite(image_path, warped) # print(f"[INFO] Saved: {image_path}") # # Convert to PDF # img = Image.open(image_path) # img_rgb = img.convert("RGB") # img_rgb.save(pdf_path) # print(f"[INFO] Converted to PDF: {pdf_path}") # # Extract and show embedded images from PDF # print(f"[INFO] Extracting embedded images from PDF...") # # extract_images_from_pdf(pdf_path, JSON_FOLDER_PATH) # count += 1 # cap.release() # cv2.destroyAllWindows() ''' Simple version Not a Flask APP ''' # import cv2, os, json, base64, io # import numpy as np # from imutils.perspective import four_point_transform # from PIL import Image # from unstructured.partition.pdf import partition_pdf # import pytesseract # # --- PATH CONFIGURATION --- # pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe" # POPPLER_PATH = r"C:\poppler-23.11.0\Library\bin" # OUTPUT_FOLDER = "OUTPUTS" # IMAGE_FOLDER_PATH = os.path.join(OUTPUT_FOLDER, "SCANNED_IMAGE") # PDF_FOLDER_PATH = os.path.join(OUTPUT_FOLDER, "SCANNED_PDF") # JSON_OUTPUT_FOLDER = os.path.join(OUTPUT_FOLDER, "EXTRACTED_JSON") # for path in [OUTPUT_FOLDER, IMAGE_FOLDER_PATH, PDF_FOLDER_PATH, JSON_OUTPUT_FOLDER]: # os.makedirs(path, exist_ok=True) # # --- FUNCTION: Detect document contour --- # def detect_document_contour(image): # gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) # blur = cv2.GaussianBlur(gray, (5, 5), 0) # _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) # contours, _ = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) # contours = sorted(contours, key=cv2.contourArea, reverse=True) # for contour in contours: # area = cv2.contourArea(contour) # if area > 1000: # peri = cv2.arcLength(contour, True) # approx = cv2.approxPolyDP(contour, 0.02 * peri, True) # if len(approx) == 4: # return approx # return None # # --- FUNCTION: Extract images from saved PDF --- # def extract_images_from_pdf(pdf_path, output_json_path): # elements = partition_pdf( # filename=pdf_path, # poppler_path=POPPLER_PATH, # strategy="hi_res", # extract_image_block_types=["Image"], # extract_image_block_to_payload=True, # ) # with open(output_json_path, "w") as f: # json.dump([element.to_dict() for element in elements], f, indent=4) # # Display extracted images # with open(output_json_path, 'r') as file: # file_elements = json.load(file) # for i, element in enumerate(file_elements): # if "image_base64" in element["metadata"]: # image_data = base64.b64decode(element["metadata"]["image_base64"]) # image = Image.open(io.BytesIO(image_data)) # image.show(title=f"Extracted Image {i+1}") # # --- WEBCAM SCANNER START --- # # cap = cv2.VideoCapture(0 + cv2.CAP_DSHOW) # cap = cv2.VideoCapture("http://100.71.6.36:8080/video") # cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1920) # cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 1080) # scale = 0.5 # count = 0 # while True: # ret, frame = cap.read() # if not ret: # break # frame = cv2.rotate(frame, cv2.ROTATE_180) # display = frame.copy() # contour = detect_document_contour(display) # if contour is not None: # cv2.drawContours(display, [contour], -1, (0, 255, 0), 3) # cv2.imshow("Document Scanner", cv2.resize(display, (int(scale * display.shape[1]), int(scale * display.shape[0])))) # key = cv2.waitKey(1) & 0xFF # if key == 27: # ESC to exit # break # elif key == ord('s') and contour is not None: # warped = four_point_transform(frame, contour.reshape(4, 2)) # image_path = os.path.join(IMAGE_FOLDER_PATH, f"scanned_colored_{count}.jpg") # pdf_path = os.path.join(PDF_FOLDER_PATH, f"scanned_colored_{count}.pdf") # json_path = os.path.join(JSON_OUTPUT_FOLDER, f"embedded_images_{count}.json") # # Save Image # cv2.imwrite(image_path, warped) # print(f"[INFO] Saved image: {image_path}") # # Convert to PDF # img = Image.open(image_path) # img_rgb = img.convert("RGB") # img_rgb.save(pdf_path) # print(f"[INFO] Converted to PDF: {pdf_path}") # # Extract and show embedded images from PDF # print(f"[INFO] Extracting embedded images from PDF...") # extract_images_from_pdf(pdf_path, json_path) # count += 1 # cap.release() # cv2.destroyAllWindows() ''' #==================================================================================# # Extract Images from PDF # #==================================================================================# from unstructured.partition.pdf import partition_pdf import pytesseract pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe" elements = partition_pdf( filename=r"E:\Pratham\2025\Harsh Sir\Scratch Vision\images\page1.pdf", poppler_path=r"C:\poppler-23.11.0\Library\bin", strategy="hi_res", extract_image_block_types=["Image"], # or ["Image", "Table"] extract_image_block_to_payload=True, # Set to True to get base64 in output ) import json, base64, io, os from PIL import Image # Save JSON output os.makedirs("output", exist_ok=True) with open("output/embedded-images-tables.json", "w") as f: json.dump([element.to_dict() for element in elements], f, indent=4) def get_image_block_types(input_json_file_path: str): with open(input_json_file_path, 'r') as file: file_elements = json.load(file) for element in file_elements: if "image_base64" in element["metadata"]: image_data = base64.b64decode(element["metadata"]["image_base64"]) image = Image.open(io.BytesIO(image_data)) image.show() # Example usage: get_image_block_types("output/embedded-images-tables.json")''' # from unstructured_client import UnstructuredClient # from unstructured_client.models import operations, shared # from unstructured.staging.base import elements_from_dicts, elements_to_json # import os # import base64 # from PIL import Image # import io # if __name__ == "__main__": # client = UnstructuredClient( # api_key_auth=os.getenv("UNSTRUCTURED_API_KEY") # ) # # Path to your PDF file # local_input_filepath = "your-pdf-file.pdf" # local_output_filepath = "output.json" # with open(local_input_filepath, "rb") as f: # files = shared.Files( # content=f.read(), # file_name=local_input_filepath # ) # request = operations.PartitionRequest( # shared.PartitionParameters( # files=files, # split_pdf_page=True, # split_pdf_allow_failed=True, # split_pdf_concurrency_level=15, # # Extract Base64-encoded images and tables # extract_image_block_types=["Image", "Table"] # ) # ) # try: # result = client.general.partition(request=request) # for element in result.elements: # if "image_base64" in element["metadata"]: # # Decode and display the image # image_data = base64.b64decode(element["metadata"]["image_base64"]) # image = Image.open(io.BytesIO(image_data)) # image.show() # This will open the image # # Save results as JSON # dict_elements = elements_from_dicts(element_dicts=result.elements) # elements_to_json( # elements=dict_elements, # indent=2, # filename=local_output_filepath # ) # except Exception as e: # print(e) # -------------------------------------------------------------------------------------- # # # STEP 1 # # import libraries # import fitz # PyMuPDF # import io # from PIL import Image # # STEP 2 # # file path you want to extract images from # file = r"E:\Pratham\2025\Harsh Sir\Scratch Vision\images/page1_orig.pdf" # # open the file # pdf_file = fitz.open(file) # # STEP 3 # # iterate over PDF pages # for page_index in range(len(pdf_file)): # # get the page itself # page = pdf_file.load_page(page_index) # load the page # image_list = page.get_images(full=True) # get images on the page # # printing number of images found in this page # if image_list: # print(f"[+] Found a total of {len(image_list)} images on page {page_index}") # else: # print("[!] No images found on page", page_index) # for image_index, img in enumerate(image_list, start=1): # # get the XREF of the image # xref = img[0] # # extract the image bytes # base_image = pdf_file.extract_image(xref) # image_bytes = base_image["image"] # # get the image extension # image_ext = base_image["ext"] # # save the image # image_name = f"image{page_index+1}_{image_index}.{image_ext}" # with open(image_name, "wb") as image_file: # image_file.write(image_bytes) # print(f"[+] Image saved as {image_name}") # -------------------------------------------------------------------------------------- # # from pdf2image import convert_from_path # import numpy as np # import cv2 # def extract_grid_cells_from_pdf(pdf_path, prefix="sub"): # # Convert PDF's first page to image # pages = convert_from_path( # pdf_path, # dpi=300, # poppler_path=r"C:\poppler-23.11.0\Library\bin" # ) # pil = pages[0] # img = np.array(pil)[:, :, ::-1] # RGB→BGR # gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # _, thresh = cv2.threshold(gray, 240, 255, cv2.THRESH_BINARY_INV) # kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5,5)) # dil = cv2.dilate(thresh, kernel, iterations=2) # cnts, _ = cv2.findContours(dil, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) # cells = [cv2.boundingRect(c) for c in cnts if cv2.contourArea(c) > 1000] # cells = sorted(cells, key=lambda r: (r[1], r[0])) # for i, (x, y, w, h) in enumerate(cells): # crop = img[y:y+h, x:x+w] # cv2.imwrite(f"{prefix}_{i:02d}.png", crop) # print("Saved", f"{prefix}_{i:02d}.png") # if __name__ == "__main__": # extract_grid_cells_from_pdf( # r"E:\Pratham\2025\Harsh Sir\Scratch Vision\images\page1_orig.pdf" # ) # import cv2 # import layoutparser as lp # from pdf2image import convert_from_path # from reportlab.pdfgen import canvas # from reportlab.lib.pagesizes import letter # import numpy as np # import tempfile # import os # # 1️⃣ Setup LayoutParser model # model = lp.Detectron2LayoutModel( # "lp://PrimaLayout/PrimaLayout/mask_rcnn_R_50_FPN_3x/config", # label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"} # ) # # 2️⃣ Utility to crop and save a layout region # def crop_and_save(img, block, out_dir, idx): # x1, y1, x2, y2 = map(int, block.block.x_1_y_2_x_2_y_2) # cropped = img[y1:y2, x1:x2] # path = os.path.join(out_dir, f"crop_{idx}.png") # cv2.imwrite(path, cropped) # return path # # 3️⃣ Convert cropped images into multi-page PDF # def imgs_to_pdf(img_paths, output_pdf): # c = canvas.Canvas(output_pdf, pagesize=letter) # w, h = letter # for img in img_paths: # c.drawImage(img, 0, 0, width=w, height=h) # c.showPage() # c.save() # # 4️⃣ If user input is a PDF or image folder # def process_document(pdf_path, output_pdf): # imgs = convert_from_path(pdf_path) # cropped_paths = [] # with tempfile.TemporaryDirectory() as tmp: # for page_idx, pil_im in enumerate(imgs): # img = cv2.cvtColor(np.array(pil_im), cv2.COLOR_RGB2BGR) # layout = model.detect(img) # for idx, block in enumerate(layout): # path = crop_and_save(img, block, tmp, f"{page_idx}_{idx}") # cropped_paths.append(path) # imgs_to_pdf(cropped_paths, output_pdf) # # 5️⃣ Real-time camera/video feed # def process_video(output_pdf, src=0, frame_limit=100): # cap = cv2.VideoCapture(src) # idx = 0 # cropped_paths = [] # with tempfile.TemporaryDirectory() as tmp: # while idx < frame_limit: # ret, img = cap.read() # if not ret: # break # layout = model.detect(img) # for i, block in enumerate(layout): # path = crop_and_save(img, block, tmp, f"{idx}_{i}") # cropped_paths.append(path) # idx += 1 # cap.release() # imgs_to_pdf(cropped_paths, output_pdf) # if __name__ == "__main__": # import argparse # ap = argparse.ArgumentParser() # ap.add_argument("--input", required=True, # help="path to PDF or 'cam' for camera") # ap.add_argument("--output", required=True, help="output PDF path") # ap.add_argument("--frames", type=int, default=50, # help="frames to scan if using camera") # args = ap.parse_args() # if args.input.lower().endswith(".pdf"): # process_document(args.input, args.output) # elif args.input.lower() == "cam": # process_video(args.output, src=0, frame_limit=args.frames) # else: # print("Unsupported input. Use PDF path or 'cam'.") # import cv2 # from PIL import Image # import numpy as np # def get_contours(frame): # gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) # # Threshold to binary # _, thresh = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY_INV) # # Find contours # contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) # return contours # def extract_regions(frame, contours): # rois = [] # for cnt in contours: # x, y, w, h = cv2.boundingRect(cnt) # if w*h < 1000: # skip small noise # continue # roi = frame[y:y+h, x:x+w] # rois.append(roi) # return rois # def save_rois_as_pdf(rois, output_path): # pil_imgs = [] # for roi in rois: # rgb = cv2.cvtColor(roi, cv2.COLOR_BGR2RGB) # pil = Image.fromarray(rgb) # pil_imgs.append(pil) # if pil_imgs: # pil_imgs[0].save(output_path, save_all=True, append_images=pil_imgs[1:]) # print(f"Saved {len(pil_imgs)} regions to {output_path}") # def main(): # cap = cv2.VideoCapture(0) # all_rois = [] # print("Press 'c' to capture and extract; 'q' to quit.") # while True: # ret, frame = cap.read() # if not ret: # break # cv2.imshow("Live Feed", frame) # key = cv2.waitKey(1) & 0xFF # if key == ord('c'): # contours = get_contours(frame) # rois = extract_regions(frame, contours) # all_rois.extend(rois) # print(f"Captured {len(rois)} regions.") # elif key == ord('q'): # break # cap.release() # cv2.destroyAllWindows() # if all_rois: # save_rois_as_pdf(all_rois, "output_contours.pdf") # else: # print("No regions captured.") # if __name__ == "__main__": # main() # import cv2 # from PIL import Image # import numpy as np # def get_edge_contours(frame, low=50, high=150): # gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) # blurred = cv2.GaussianBlur(gray, (5, 5), 1.0) # edges = cv2.Canny(blurred, low, high) # contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) # return contours, edges # def extract_edge_rois(frame, contours, min_area=1000): # rois = [] # for cnt in contours: # x, y, w, h = cv2.boundingRect(cnt) # if w * h < min_area: # continue # roi = frame[y:y+h, x:x+w] # rois.append(roi) # return rois # def save_rois_as_pdf(rois, output_path): # pil_imgs = [] # for roi in rois: # rgb = cv2.cvtColor(roi, cv2.COLOR_BGR2RGB) # pil_imgs.append(Image.fromarray(rgb)) # if pil_imgs: # pil_imgs[0].save(output_path, save_all=True, append_images=pil_imgs[1:]) # print(f"✅ Saved {len(pil_imgs)} edge-region(s) to {output_path}") # else: # print("⚠️ No edge-based regions detected—PDF not created.") # def main(): # cap = cv2.VideoCapture(0) # all_rois = [] # print("Press ‘c’ to capture current edge regions, ‘q’ to quit.") # while True: # ret, frame = cap.read() # if not ret: # break # contours, edges = get_edge_contours(frame) # cv2.imshow("Live Feed", frame) # cv2.imshow("Edges", edges) # key = cv2.waitKey(1) & 0xFF # if key == ord('c'): # rois = extract_edge_rois(frame, contours) # all_rois.extend(rois) # print(f"🔄 Captured {len(rois)} edge-region(s). Total: {len(all_rois)}") # elif key == ord('q'): # break # cap.release() # cv2.destroyAllWindows() # if all_rois: # save_rois_as_pdf(all_rois, "edge_contours.pdf") # else: # print("❌ No regions captured.") # if __name__ == "__main__": # main()