Spaces:

mike23415
/

Vlm-test

Sleeping

App Files Files Community

Vlm-test / app.py

mike23415

Update app.py

035a6f9 verified 2 months ago

raw

history blame contribute delete

5.48 kB

	from flask import Flask, request, jsonify
	from flask_cors import CORS
	from PIL import Image
	import io
	import os

	# Option 1: Using TrOCR (Transformer-based OCR)
	from transformers import TrOCRProcessor, VisionEncoderDecoderModel
	import torch

	# Option 2: Using EasyOCR (commented out - uncomment if you prefer this)
	# import easyocr

	# Option 3: Using Tesseract (commented out - uncomment if you prefer this)
	# import pytesseract

	import fitz # PyMuPDF

	# Flask application with cross-origin requests enabled for every route.
	app = Flask(__name__)
	CORS(app)

	# Run inference on the GPU when one is visible to torch, otherwise on CPU.
	device = "cpu"
	if torch.cuda.is_available():
	    device = "cuda"

	# TrOCR checkpoint for printed text; the processor and the model are loaded
	# from the same checkpoint id.
	_TROCR_CHECKPOINT = "microsoft/trocr-base-printed"
	processor = TrOCRProcessor.from_pretrained(_TROCR_CHECKPOINT)
	model = VisionEncoderDecoderModel.from_pretrained(_TROCR_CHECKPOINT).to(device)
	# Inference only: put the model in evaluation mode.
	model.eval()

	# Alternative: Initialize EasyOCR reader (uncomment if using EasyOCR)
	# reader = easyocr.Reader(['en'])

	def convert_pdf_to_image(file_stream):
	    """Render the first page of a PDF as an RGB PIL image.

	    Args:
	        file_stream: Binary file-like object holding the PDF; it is read
	            to the end with ``read()``.

	    Returns:
	        A PIL image built from the first page rendered at 300 DPI.

	    Raises:
	        Any exception PyMuPDF raises when the data cannot be opened or
	        page 0 cannot be loaded (for example a document with no pages).
	    """
	    doc = fitz.open(stream=file_stream.read(), filetype="pdf")
	    try:
	        page = doc.load_page(0)
	        # A higher DPI gives the OCR model more pixels per glyph.
	        pix = page.get_pixmap(dpi=300)
	        # NOTE(review): assumes the pixmap is 3-channel RGB without alpha,
	        # which is what get_pixmap is expected to produce by default.
	        return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
	    finally:
	        # Release the document even when loading or rendering fails.
	        doc.close()

	def preprocess_image(image, min_side=1000):
	    """Prepare an image for OCR: force RGB mode and upscale small images.

	    Args:
	        image: PIL image to normalize.
	        min_side: Minimum size in pixels that both the width and the
	            height must reach; smaller images are enlarged, keeping the
	            aspect ratio, until both sides meet it. Defaults to 1000.

	    Returns:
	        The same image object when no change is needed, otherwise a new
	        RGB and/or resized image.
	    """
	    # The OCR pipeline works on RGB input, so convert any other mode.
	    if image.mode != 'RGB':
	        image = image.convert('RGB')

	    width, height = image.size
	    if width < min_side or height < min_side:
	        # Scale by the larger ratio so that BOTH sides reach min_side.
	        scale_factor = max(min_side / width, min_side / height)
	        # Float rounding can make side * scale_factor land just below
	        # min_side, and int() would then truncate to min_side - 1; clamp
	        # so the stated minimum is always met.
	        new_width = max(min_side, int(width * scale_factor))
	        new_height = max(min_side, int(height * scale_factor))
	        image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

	    return image

	def extract_text_trocr(image):
	    """Run TrOCR over horizontal bands of the image and join the results.

	    The image is cut into full-width bands of 400 pixels (the last band
	    may be shorter). Each band is decoded separately and the non-empty
	    decoded strings are joined with newlines.

	    Returns:
	        The recognized text, or "" when nothing was recognized or any
	        step raised (the error is printed, not propagated).
	    """
	    band_height = 400
	    try:
	        img_width, img_height = image.size
	        recognized = []

	        top = 0
	        while top < img_height:
	            bottom = min(top + band_height, img_height)
	            band = image.crop((0, top, img_width, bottom))

	            # Turn the band into model input on the inference device.
	            encoded = processor(band, return_tensors="pt")
	            pixel_values = encoded.pixel_values.to(device)

	            # No gradients are needed for generation.
	            with torch.no_grad():
	                token_ids = model.generate(pixel_values, max_length=512)

	            decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
	            band_text = decoded[0].strip()
	            if band_text:
	                recognized.append(band_text)

	            top += band_height

	        return "\n".join(recognized)
	    except Exception as e:
	        print(f"TrOCR error: {e}")
	        return ""

	def extract_text_easyocr(image):
	    """Extract text with EasyOCR, keeping detections above 0.5 confidence.

	    Requires the module-level ``easyocr`` import and ``reader``
	    initialization to be uncommented; while they are not, the lookup of
	    ``reader`` fails and this function returns "".

	    Args:
	        image: PIL image to recognize.

	    Returns:
	        The kept text fragments joined with newlines, or "" when any step
	        raised (the error is printed, not propagated).
	    """
	    try:
	        # Imported here because the module does not import numpy itself.
	        import numpy as np

	        # readtext accepts a file path, bytes or a NumPy array rather than
	        # an arbitrary PIL image, so hand it the pixel array.
	        # NOTE(review): EasyOCR may treat 3-channel arrays as BGR; confirm
	        # whether the channels should be reversed for RGB input.
	        results = reader.readtext(np.asarray(image))
	        # Each result is (bbox, text, confidence); drop weak detections.
	        kept = [text for _bbox, text, confidence in results if confidence > 0.5]
	        return "\n".join(kept)
	    except Exception as e:
	        print(f"EasyOCR error: {e}")
	        return ""

	def extract_text_tesseract(image):
	    """Extract text with Tesseract from a grayscale copy of the image.

	    Requires the module-level ``pytesseract`` import to be uncommented.

	    Returns:
	        The recognized text with surrounding whitespace removed, or ""
	        when any step raised (the error is printed, not propagated).
	    """
	    try:
	        # Single-channel ('L') input for the recognizer.
	        grayscale = image.convert('L')
	        recognized = pytesseract.image_to_string(grayscale, config='--psm 6')
	        return recognized.strip()
	    except Exception as err:
	        print(f"Tesseract error: {err}")
	        return ""

	@app.route("/ocr", methods=["POST"])
	def ocr():
	    """Handle POST /ocr: run OCR on the uploaded file and return JSON.

	    Expects a multipart upload under the field name "file". Names ending
	    in ".pdf" are rendered through convert_pdf_to_image; anything else is
	    opened as an image. Responds with:
	      * 400 and an "error" key when no file was sent or selected,
	      * 200 with "text" and "message" keys after processing,
	      * 500 and an "error" key when processing raised.
	    """
	    if "file" not in request.files:
	        return jsonify({"error": "No file uploaded"}), 400

	    upload = request.files["file"]
	    if not upload.filename:
	        return jsonify({"error": "No file selected"}), 400

	    lowered_name = upload.filename.lower()

	    try:
	        # Build a PIL image from the upload.
	        if lowered_name.endswith(".pdf"):
	            page_image = convert_pdf_to_image(upload)
	        else:
	            raw_bytes = upload.read()
	            page_image = Image.open(io.BytesIO(raw_bytes)).convert("RGB")

	        page_image = preprocess_image(page_image)

	        # TrOCR is the primary recognizer.
	        recognized = extract_text_trocr(page_image)

	        if not recognized:
	            print("TrOCR failed, trying alternative methods...")
	            # Enable one of these fallbacks if the library is installed:
	            # recognized = extract_text_easyocr(page_image)
	            # recognized = extract_text_tesseract(page_image)

	        if recognized:
	            payload = {
	                "text": recognized,
	                "message": "Text extracted successfully"
	            }
	        else:
	            payload = {
	                "text": "",
	                "message": "No text could be extracted from the image. The image might be too blurry, have low contrast, or contain handwritten text."
	            }
	        return jsonify(payload)

	    except Exception as e:
	        print(f"OCR processing error: {e}")
	        return jsonify({"error": f"Failed to process file: {str(e)}"}), 500

	@app.route("/", methods=["GET"])
	def index():
	    """Handle GET /: return a plain-text banner naming the service."""
	    banner = "Smart OCR Flask API (TrOCR-based)"
	    return banner

	if __name__ == "__main__":
	    # The server listens on all interfaces, so debug mode must not be on by
	    # default: it enables Werkzeug's interactive debugger, which lets anyone
	    # who can reach the port execute code. Opt in with FLASK_DEBUG=1.
	    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in ("1", "true", "yes")
	    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)