from flask import Flask, request, jsonify
from flask_cors import CORS
from PIL import Image
import io

# Option 1: Using TrOCR (Transformer-based OCR)
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import torch

# Option 2: Using EasyOCR (commented out - uncomment if you prefer this)
# import easyocr

# Option 3: Using Tesseract (commented out - uncomment if you prefer this)
# import pytesseract

import fitz  # PyMuPDF
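# Suggested install for the imports above (package names assumed to be the standard
# PyPI distributions; pin versions to match your environment):
#   pip install flask flask-cors pillow transformers torch PyMuPDF
#   pip install easyocr pytesseract  # only needed for the optional backends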

# Initialize Flask
app = Flask(__name__)
CORS(app)

# Load TrOCR model and processor (primary OCR backend)
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed").to(device)
model.eval()

# Alternative: Initialize EasyOCR reader (uncomment if using EasyOCR)
# reader = easyocr.Reader(['en'])
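# Note: from_pretrained() downloads and caches the model weights on first run.
# For handwritten documents, swapping in "microsoft/trocr-base-handwritten" is a
# possible alternative to the printed-text checkpoint above.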

def convert_pdf_to_image(file_stream):
    """Convert PDF to image with higher DPI for better OCR"""
    doc = fitz.open(stream=file_stream.read(), filetype="pdf")
    page = doc.load_page(0)
    # Render at 300 DPI for better text recognition
    pix = page.get_pixmap(dpi=300)
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    doc.close()
    return img

def preprocess_image(image):
    """Preprocess image for better OCR results"""
    # Ensure the image is in RGB mode
    if image.mode != 'RGB':
        image = image.convert('RGB')
    
    # Resize if image is too small
    width, height = image.size
    if width < 1000 or height < 1000:
        scale_factor = max(1000/width, 1000/height)
        new_width = int(width * scale_factor)
        new_height = int(height * scale_factor)
        image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
    
    return image

def extract_text_trocr(image):
    """Extract text using TrOCR"""
    try:
        # Process the image in horizontal strips (TrOCR works better on smaller sections)
        width, height = image.size
        chunk_height = 400  # Process in chunks
        extracted_texts = []
        
        for y in range(0, height, chunk_height):
            chunk = image.crop((0, y, width, min(y + chunk_height, height)))
            
            # Process with TrOCR
            pixel_values = processor(chunk, return_tensors="pt").pixel_values.to(device)
            
            with torch.no_grad():
                generated_ids = model.generate(pixel_values, max_length=512)
            
            generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
            if generated_text.strip():
                extracted_texts.append(generated_text.strip())
        
        return "\n".join(extracted_texts)
    except Exception as e:
        print(f"TrOCR error: {e}")
        return ""

def extract_text_easyocr(image):
    """Extract text using EasyOCR (uncomment the import and initialization above)"""
    try:
        import numpy as np  # local import: only needed for the optional EasyOCR path
        # EasyOCR accepts file paths, bytes, or numpy arrays, not PIL Images
        results = reader.readtext(np.array(image))
        extracted_text = []
        for (bbox, text, confidence) in results:
            if confidence > 0.5:  # Filter low confidence detections
                extracted_text.append(text)
        return "\n".join(extracted_text)
    except Exception as e:
        print(f"EasyOCR error: {e}")
        return ""

def extract_text_tesseract(image):
    """Extract text using Tesseract (uncomment the import above)"""
    try:
        # Convert to grayscale for better OCR
        gray_image = image.convert('L')
        text = pytesseract.image_to_string(gray_image, config='--psm 6')
        return text.strip()
    except Exception as e:
        print(f"Tesseract error: {e}")
        return ""

@app.route("/ocr", methods=["POST"])
def ocr():
    if "file" not in request.files:
        return jsonify({"error": "No file uploaded"}), 400

    file = request.files["file"]
    if not file.filename:
        return jsonify({"error": "No file selected"}), 400

    filename = file.filename.lower()

    try:
        # Convert input to PIL image
        if filename.endswith(".pdf"):
            image = convert_pdf_to_image(file)
        else:
            image = Image.open(io.BytesIO(file.read())).convert("RGB")

        # Preprocess image
        image = preprocess_image(image)

        # Extract text using TrOCR (primary method)
        extracted_text = extract_text_trocr(image)

        # If TrOCR fails or returns empty, try alternative methods
        if not extracted_text:
            print("TrOCR failed, trying alternative methods...")
            # Uncomment one of these if you have the libraries installed:
            # extracted_text = extract_text_easyocr(image)
            # extracted_text = extract_text_tesseract(image)
            
        if not extracted_text:
            return jsonify({
                "text": "",
                "message": "No text could be extracted from the image. The image might be too blurry, have low contrast, or contain handwritten text."
            })

        return jsonify({
            "text": extracted_text,
            "message": "Text extracted successfully"
        })

    except Exception as e:
        print(f"OCR processing error: {e}")
        return jsonify({"error": f"Failed to process file: {str(e)}"}), 500

@app.route("/", methods=["GET"])
def index():
    return "Smart OCR Flask API (TrOCR-based)"

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860, debug=True)
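
# Example request (a sketch, assuming the server is running locally and a file
# named scan.pdf exists in the current directory):
#   curl -F "file=@scan.pdf" http://localhost:7860/ocr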