Spaces:
Sleeping
IZERE HIRWA Roger
committed on
Commit · b7d66ac · 1 Parent(s): 1cc833a
olm
Browse files
- app.py +139 -5
- requirements.txt +4 -1
- static/index.html +8 -3
- static/script.js +7 -3
app.py
CHANGED
@@ -17,6 +17,11 @@ from datetime import datetime, timedelta
 import jwt
 import sqlite3
 import tempfile
+import base64
+from io import BytesIO
+from transformers import AutoModelForImageTextToText, AutoProcessor, AutoTokenizer, TextIteratorStreamer
+from threading import Thread
+import time
 
 app = Flask(__name__)
 app.config['SECRET_KEY'] = 'your-secret-key-change-this-in-production'
@@ -109,6 +114,29 @@ except Exception as e:
 clip_model = None
 preprocess = None
 
+# Initialize Nanonets OCR model
+ocr_model = None
+ocr_processor = None
+ocr_tokenizer = None
+
+try:
+    model_path = "nanonets/Nanonets-OCR-s"
+    print("Loading Nanonets OCR model...")
+    ocr_model = AutoModelForImageTextToText.from_pretrained(
+        model_path,
+        torch_dtype="auto",
+        device_map="auto",
+        trust_remote_code=True
+    )
+    ocr_model.eval()
+
+    ocr_processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+    ocr_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    print("✅ Nanonets OCR model loaded successfully!")
+except Exception as e:
+    print(f"❌ Failed to load Nanonets OCR model: {e}")
+    print("🔄 Falling back to pytesseract for OCR")
+
 # Helper functions
 def save_index():
     try:
@@ -151,7 +179,95 @@ def image_from_pdf(pdf_bytes):
         print(f"❌ PDF conversion error: {e}")
         return None
 
-def extract_text(image):
+def process_tags(content: str) -> str:
+    """Process special tags from Nanonets OCR output"""
+    content = content.replace("<img>", "&lt;img&gt;")
+    content = content.replace("</img>", "&lt;/img&gt;")
+    content = content.replace("<watermark>", "&lt;watermark&gt;")
+    content = content.replace("</watermark>", "&lt;/watermark&gt;")
+    content = content.replace("<page_number>", "&lt;page_number&gt;")
+    content = content.replace("</page_number>", "&lt;/page_number&gt;")
+    content = content.replace("<signature>", "&lt;signature&gt;")
+    content = content.replace("</signature>", "&lt;/signature&gt;")
+    return content
+
+def encode_image(image: Image) -> str:
+    """Encode image to base64 for Nanonets OCR"""
+    buffered = BytesIO()
+    image.save(buffered, format="JPEG")
+    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+    return img_str
+
+def nanonets_ocr_extract(image):
+    """Extract text using Nanonets OCR model"""
+    try:
+        if ocr_model is None or ocr_processor is None or ocr_tokenizer is None:
+            # Fallback to pytesseract
+            return extract_text_pytesseract(image)
+
+        if image.mode != 'RGB':
+            image = image.convert('RGB')
+
+        # Resize image for optimal processing
+        image = image.resize((2048, 2048))
+
+        # Prepare prompt for OCR extraction
+        user_prompt = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""
+
+        # Format messages for the model
+        formatted_messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": user_prompt},
+            ]},
+        ]
+
+        # Apply chat template
+        text = ocr_processor.apply_chat_template(
+            formatted_messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+
+        # Process inputs
+        inputs = ocr_processor(
+            text=[text],
+            images=[image],
+            padding=True,
+            return_tensors="pt"
+        )
+
+        # Move inputs to model device
+        inputs = {k: v.to(ocr_model.device) if hasattr(v, 'to') else v for k, v in inputs.items()}
+
+        # Generate text
+        with torch.no_grad():
+            generated_ids = ocr_model.generate(
+                **inputs,
+                max_new_tokens=4096,
+                do_sample=False,
+                pad_token_id=ocr_tokenizer.eos_token_id,
+            )
+
+        # Decode the generated text
+        generated_text = ocr_tokenizer.decode(
+            generated_ids[0][inputs['input_ids'].shape[1]:],
+            skip_special_tokens=True
+        )
+
+        # Process special tags
+        processed_text = process_tags(generated_text)
+
+        return processed_text.strip() if processed_text.strip() else "❌ No text detected"
+
+    except Exception as e:
+        print(f"❌ Nanonets OCR error: {e}")
+        # Fallback to pytesseract
+        return extract_text_pytesseract(image)
+
+def extract_text_pytesseract(image):
+    """Fallback OCR using pytesseract"""
     try:
         if image.mode != 'RGB':
             image = image.convert('RGB')
@@ -161,6 +277,13 @@ def extract_text(image):
     except Exception as e:
         return f"❌ OCR error: {str(e)}"
 
+def extract_text(image):
+    """Main OCR function - tries Nanonets first, falls back to pytesseract"""
+    if ocr_model is not None:
+        return nanonets_ocr_extract(image)
+    else:
+        return extract_text_pytesseract(image)
+
 def get_clip_embedding(image):
     try:
         if clip_model is None:
@@ -297,10 +420,10 @@ def classify_document():
         sim = float(1 - D[0][i])
         matches.append({"category": labels[I[0][i]], "similarity": round(sim, 3)})
 
-    # Save classified document to SQLite
+    # Save classified document to SQLite with enhanced OCR
     if similarity >= confidence_threshold:
         saved_filename = save_uploaded_file(file_content, file.filename)
-        ocr_text = extract_text(image)
+        ocr_text = extract_text(image)  # Now uses Nanonets OCR
 
         document_id = str(uuid.uuid4())
         conn = sqlite3.connect(DATABASE_PATH)
@@ -320,7 +443,8 @@ def classify_document():
             "confidence": "high",
             "matches": matches,
            "document_saved": True,
-            "document_id": document_id
+            "document_id": document_id,
+            "ocr_preview": ocr_text[:200] + "..." if len(ocr_text) > 200 else ocr_text
         })
     else:
         return jsonify({
@@ -452,8 +576,18 @@ def ocr_document():
         if image is None:
             return jsonify({"error": "Failed to process image"}), 400
 
+        # Use enhanced Nanonets OCR
         text = extract_text(image)
-        return jsonify({"text": text, "status": "success"})
+
+        # Determine OCR method used
+        ocr_method = "Nanonets OCR-s" if ocr_model is not None else "Pytesseract"
+
+        return jsonify({
+            "text": text,
+            "status": "success",
+            "ocr_method": ocr_method,
+            "enhanced_features": ocr_model is not None
+        })
     except Exception as e:
         return jsonify({"error": str(e)}), 500
 
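For a sense of the end-to-end change, here is a minimal client sketch against the updated /api/ocr endpoint. The endpoint path, multipart field name, and response keys come from this commit; the base URL, the Bearer-token header implied by authenticatedFetch, and the sample filename are assumptions.

import requests

BASE_URL = "http://localhost:7860"  # hypothetical; wherever the Space is served
TOKEN = "YOUR_JWT_HERE"             # hypothetical; authenticatedFetch suggests a Bearer token

with open("sample.pdf", "rb") as f:  # hypothetical input file
    resp = requests.post(
        f"{BASE_URL}/api/ocr",
        headers={"Authorization": f"Bearer {TOKEN}"},
        files={"file": ("sample.pdf", f, "application/pdf")},
    )
resp.raise_for_status()
data = resp.json()

# New fields added by this commit alongside "text" and "status":
print(data["ocr_method"])          # "Nanonets OCR-s" or "Pytesseract"
print(data["enhanced_features"])   # True when the Nanonets model loaded
print(data["text"][:200])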
requirements.txt
CHANGED
@@ -8,4 +8,7 @@ torchvision
 Pillow
 PyJWT
 git+https://github.com/openai/CLIP.git
-poppler-utils
+poppler-utils
+transformers
+accelerate
+spaces
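Note on the new pins: transformers supplies the AutoModelForImageTextToText, AutoProcessor, and AutoTokenizer classes used in app.py, and accelerate is what honors device_map="auto" in from_pretrained. A quick sanity-check sketch; it only verifies that the imports resolve:

import transformers
import accelerate

# Classes app.py relies on; an ImportError here means the pins above are missing.
from transformers import AutoModelForImageTextToText, AutoProcessor, AutoTokenizer

print(transformers.__version__, accelerate.__version__)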
static/index.html
CHANGED
@@ -255,10 +255,14 @@
             <div id="ocr" class="tab-content">
                 <div class="card">
                     <div class="card-header bg-warning text-dark">
-                        <h4><i class="fas fa-eye me-2"></i>OCR Text Extraction</h4>
+                        <h4><i class="fas fa-eye me-2"></i>Advanced OCR Text Extraction</h4>
                     </div>
                     <div class="card-body">
-                        <p class="text-muted">Extract text from documents using Optical Character Recognition.</p>
+                        <div class="alert alert-info" role="alert">
+                            <i class="fas fa-info-circle me-2"></i>
+                            <strong>Enhanced OCR Features:</strong> Our advanced Nanonets OCR-s model supports table extraction (HTML), LaTeX equations, watermark detection, signature recognition, and checkbox handling.
+                        </div>
+                        <p class="text-muted">Extract text from documents using advanced Optical Character Recognition with AI-powered document understanding.</p>
 
                         <form id="ocrForm" class="row g-3">
                             <div class="col-12">
@@ -266,12 +270,13 @@
                                 <div class="file-upload border rounded p-4 text-center" id="ocrUpload">
                                     <i class="fas fa-file-alt fa-3x text-warning mb-3"></i>
                                     <p class="mb-0">Click to select or drag & drop files here</p>
+                                    <small class="text-muted">Supports: PDF, JPEG, PNG, TIFF</small>
                                     <input type="file" id="ocrFile" accept="image/*,.pdf" class="d-none">
                                 </div>
                             </div>
                             <div class="col-12">
                                 <button type="submit" class="btn btn-warning">
-                                    <i class="fas fa-
+                                    <i class="fas fa-robot me-2"></i>Extract Text with AI OCR
                                 </button>
                             </div>
                         </form>
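The features the new alert advertises (watermarks, page numbers, signatures) arrive as inline tags in the model output. A minimal sketch of that format and of how app.py's process_tags helper can make the tags display literally in the browser; the HTML-escaping behavior is an assumption, and only the tag names come from the committed prompt:

# Hypothetical sample of Nanonets-OCR-s tagged output (format per the committed prompt).
sample = "Total due: $42.00 <page_number>3/7</page_number> <signature>J. Doe</signature>"

def process_tags(content: str) -> str:
    # Assumed behavior: escape each structural tag so innerHTML shows it verbatim.
    for tag in ("img", "watermark", "page_number", "signature"):
        content = content.replace(f"<{tag}>", f"&lt;{tag}&gt;")
        content = content.replace(f"</{tag}>", f"&lt;/{tag}&gt;")
    return content

print(process_tags(sample))
# Total due: $42.00 &lt;page_number&gt;3/7&lt;/page_number&gt; &lt;signature&gt;J. Doe&lt;/signature&gt;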
static/script.js
CHANGED
@@ -625,7 +625,7 @@ document.getElementById('ocrForm').addEventListener('submit', async (e) => {
     const formData = new FormData();
     formData.append('file', fileInput.files[0]);
 
-    showResult(resultDiv, '<div class="loading"></div> Extracting text...', 'info');
+    showResult(resultDiv, '<div class="loading"></div> Extracting text with advanced OCR...', 'info');
 
     try {
         const response = await authenticatedFetch('/api/ocr', {
@@ -636,9 +636,13 @@ document.getElementById('ocrForm').addEventListener('submit', async (e) => {
         const result = await response.json();
 
         if (response.ok) {
-            showResult(resultDiv, result.text, 'success');
+            const ocrInfo = result.enhanced_features ?
+                `🤖 Processed with ${result.ocr_method} (Enhanced Features: Tables, LaTeX, Watermarks)\n\n` :
+                `📝 Processed with ${result.ocr_method}\n\n`;
+
+            showResult(resultDiv, ocrInfo + result.text, 'success');
         } else {
-            showResult(resultDiv, result.detail, 'error');
+            showResult(resultDiv, result.error || result.detail, 'error');
         }
     } catch (error) {
         showResult(resultDiv, 'OCR failed: ' + error.message, 'error');