marianeft committed on
Commit
15dba6b
·
verified ·
1 Parent(s): 0c48050

Training Model Complete

Files changed (5)
  1. app.py +227 -219
  2. config.py +13 -77
  3. data_handler_ocr.py +165 -151
  4. model_ocr.py +285 -286
  5. utils_ocr.py +60 -161
app.py CHANGED
@@ -1,219 +1,227 @@
1
- # -*- coding: utf-8 -*-
2
- # app.py
3
-
4
- import os
5
- # CRITICAL FIX: Disable Streamlit's file watcher to prevent conflicts with PyTorch
6
- # This MUST be the first thing, before any other imports or Streamlit calls
7
- os.environ["STREAMLIT_SERVER_ENABLE_FILE_WATCHER"] = "false"
8
-
9
- import streamlit as st
10
- import pandas as pd
11
- import numpy as np
12
- from PIL import Image
13
- import torch
14
- import torch.nn.functional as F # Added F for log_softmax in inference
15
- import torchvision.transforms as transforms
16
- import traceback # For detailed error logging
17
-
18
- # Import all necessary configuration values from config.py
19
- from config import (
20
- IMG_HEIGHT, NUM_CLASSES, BLANK_TOKEN, VOCABULARY, BLANK_TOKEN_SYMBOL,
21
- TRAIN_CSV_PATH, TEST_CSV_PATH, TRAIN_IMAGES_DIR, TEST_IMAGES_DIR,
22
- MODEL_SAVE_PATH, BATCH_SIZE, NUM_EPOCHS
23
- )
24
-
25
- # Import classes and functions from data_handler_ocr.py and model_ocr.py
26
- from data_handler_ocr import CharIndexer, OCRDataset, ocr_collate_fn, load_ocr_dataframes, create_ocr_dataloaders
27
- from model_ocr import CRNN, train_ocr_model, save_ocr_model, load_ocr_model, ctc_greedy_decode
28
- from utils_ocr import preprocess_user_image_for_ocr, binarize_image, resize_image_for_ocr, normalize_image_for_model # Ensure these are imported if needed
29
-
30
- # --- Global Variables ---
31
- # These will hold the model and char_indexer instance after training or loading
32
- trained_ocr_model = None
33
- char_indexer = None
34
- training_history = None
35
- # Determine the device (GPU if available, else CPU)
36
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
37
-
38
- # --- Streamlit App Setup ---
39
- st.set_page_config(layout="wide", page_title="Handwritten Name OCR App") # Changed to wide layout for better display
40
-
41
- st.title("📝 Handwritten Name Recognition (OCR) App") # Updated title for consistency
42
- st.markdown("""
43
- This application uses a Convolutional Recurrent Neural Network (CRNN) to perform
44
- Optical Character Recognition (OCR) on handwritten names. You can upload an image
45
- of a handwritten name for prediction or train a new model using the provided dataset.
46
-
47
- **Note:** Training a robust OCR model can be time-consuming.
48
- """)
49
-
50
- # --- Initialize CharIndexer ---
51
- # CRITICAL FIX: Initialize CharIndexer with VOCABULARY and BLANK_TOKEN_SYMBOL
52
- # This resolves the ValueError: "Blank token symbol '95' not found..."
53
- char_indexer = CharIndexer(vocabulary_string=VOCABULARY, blank_token_symbol=BLANK_TOKEN_SYMBOL)
54
-
55
- # --- Model Loading / Initialization ---
56
- @st.cache_resource # Cache the model to prevent reloading on every rerun
57
- def get_and_load_ocr_model_cached(num_classes, model_path):
58
- """
59
- Initializes the OCR model and attempts to load a pre-trained model.
60
- If no pre-trained model exists, a new model instance is returned.
61
- """
62
- model_instance = CRNN(num_classes=num_classes, cnn_output_channels=512, rnn_hidden_size=256, rnn_num_layers=2)
63
-
64
- if os.path.exists(model_path):
65
- st.sidebar.info("Loading pre-trained OCR model...")
66
- try:
67
- # Load model to CPU first, then move to device
68
- model_instance.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
69
- st.sidebar.success("OCR model loaded successfully!")
70
- except Exception as e:
71
- st.sidebar.error(f"Error loading model: {e}. A new model will be initialized.")
72
- # If loading fails, re-initialize an untrained model
73
- model_instance = CRNN(num_classes=num_classes, cnn_output_channels=512, rnn_hidden_size=256, rnn_num_layers=2)
74
- else:
75
- st.sidebar.warning("No pre-trained OCR model found. Please train a model using the sidebar option.")
76
-
77
- return model_instance
78
-
79
- # Get the model instance
80
- ocr_model = get_and_load_ocr_model_cached(char_indexer.num_classes, MODEL_SAVE_PATH)
81
- # Determine the device (GPU if available, else CPU)
82
- ocr_model.to(device)
83
- ocr_model.eval() # Set model to evaluation mode for inference by default
84
-
85
- # --- Sidebar for Model Training ---
86
- st.sidebar.header("Model Training (Optional)")
87
- st.sidebar.markdown("If you want to train a new model or no model is found:")
88
-
89
- # Initialize Streamlit widgets outside the button block
90
- training_progress_bar = st.sidebar.empty() # Placeholder for progress bar in sidebar
91
- status_text = st.sidebar.empty() # Placeholder for status messages in sidebar
92
-
93
- if st.sidebar.button("📊 Train New OCR Model"): # Keep button in sidebar as per user's last provided code
94
- # Clear previous messages/widgets if button is clicked again
95
- training_progress_bar.progress(0) # Reset progress bar
96
- training_progress_bar.empty()
97
- status_text.empty() # Clear status text
98
-
99
- # Check for existence of CSVs and image directories
100
- if not os.path.exists(TRAIN_CSV_PATH) or not os.path.isdir(TRAIN_IMAGES_DIR):
101
- status_text.error(f"Training CSV '{TRAIN_CSV_PATH}' or Images directory '{TRAIN_IMAGES_DIR}' not found!")
102
- elif not os.path.exists(TEST_CSV_PATH) or not os.path.isdir(TEST_IMAGES_DIR):
103
- status_text.warning(f"Test CSV '{TEST_CSV_PATH}' or Images directory '{TEST_IMAGES_DIR}' not found. "
104
- "Evaluation might be affected or skipped. Please ensure all data paths are correct.")
105
- else:
106
- status_text.info(f"Training a new CRNN model for {NUM_EPOCHS} epochs. This will take significant time...")
107
-
108
- # Define the progress bar instance here for the callback
109
- training_progress_bar_instance = training_progress_bar.progress(0.0, text="Training in progress. Please wait.")
110
-
111
- def update_progress_callback_sidebar(value, text):
112
- """Callback function to update Streamlit progress bar in sidebar."""
113
- training_progress_bar_instance.progress(int(value * 100))
114
- status_text.text(text) # Update status text in sidebar
115
-
116
- try:
117
- train_df, test_df = load_ocr_dataframes(TRAIN_CSV_PATH, TEST_CSV_PATH)
118
- status_text.success("Training and Test DataFrames loaded successfully.")
119
-
120
- char_indexer = CharIndexer(vocabulary_string=VOCABULARY, blank_token_symbol=BLANK_TOKEN_SYMBOL)
121
- status_text.success(f"CharIndexer initialized with {char_indexer.num_classes} classes.")
122
-
123
- # Pass the limits to create_ocr_dataloaders
124
- train_loader, test_loader = create_ocr_dataloaders(
125
- train_df, test_df, char_indexer, BATCH_SIZE
126
- )
127
- status_text.success("DataLoaders created successfully.")
128
-
129
- ocr_model_for_training = CRNN(num_classes=NUM_CLASSES) # Create a new instance for training
130
- ocr_model_for_training.to(device)
131
- status_text.info(f"CRNN model initialized and moved to {device}.")
132
-
133
- status_text.write("Training in progress... This may take a while.")
134
- trained_ocr_model, training_history = train_ocr_model(
135
- model=ocr_model_for_training, # Pass the new instance
136
- train_loader=train_loader,
137
- test_loader=test_loader,
138
- char_indexer=char_indexer, # Pass char_indexer for CER calculation
139
- epochs=NUM_EPOCHS,
140
- device=device,
141
- progress_callback=update_progress_callback_sidebar # Pass the sidebar callback
142
- )
143
- status_text.success("OCR model training finished!")
144
- update_progress_callback_sidebar(1.0, "Training complete!")
145
-
146
- os.makedirs(os.path.dirname(MODEL_SAVE_PATH), exist_ok=True)
147
- save_ocr_model(trained_ocr_model, MODEL_SAVE_PATH)
148
- status_text.success(f"Trained model saved to `{MODEL_SAVE_PATH}`")
149
-
150
- # Display training history chart in the main section, not sidebar
151
- if training_history:
152
- st.subheader("Training History Plots")
153
- history_df = pd.DataFrame({
154
- 'Epoch': range(1, len(training_history['train_loss']) + 1),
155
- 'Train Loss': training_history['train_loss'],
156
- 'Test Loss': training_history['test_loss'],
157
- 'Test CER (%)': [cer * 100 for cer in training_history['test_cer']],
158
- 'Test Exact Match Accuracy (%)': [acc * 100 for acc in training_history['test_exact_match_accuracy']]
159
- })
160
-
161
- st.markdown("**Loss over Epochs**")
162
- st.line_chart(history_df.set_index('Epoch')[['Train Loss', 'Test Loss']])
163
- st.caption("Lower loss indicates better model performance.")
164
-
165
- st.markdown("**Character Error Rate (CER) over Epochs**")
166
- st.line_chart(history_df.set_index('Epoch')[['Test CER (%)']])
167
- st.caption("Lower CER indicates fewer character errors (0% is perfect).")
168
-
169
- st.markdown("**Exact Match Accuracy over Epochs**")
170
- st.line_chart(history_df.set_index('Epoch')[['Test Exact Match Accuracy (%)']])
171
- st.caption("Higher exact match accuracy indicates more perfectly recognized names.")
172
-
173
- st.markdown("**Performance Metrics over Epochs (CER vs. Exact Match Accuracy)**")
174
- st.line_chart(history_df.set_index('Epoch')[['Test CER (%)', 'Test Exact Match Accuracy (%)']])
175
- st.caption("CER should decrease, Accuracy should increase.")
176
-
177
- except Exception as e:
178
- status_text.error(f"An error occurred during training: {e}")
179
- status_text.exception(e) # Display full traceback in Streamlit
180
- update_progress_callback_sidebar(0.0, "Training failed!")
181
-
182
- # --- Main Content: Name Prediction ---
183
- st.header("Predict Your Handwritten Name")
184
- st.markdown("Upload a clear image of a single handwritten name or word.")
185
-
186
- uploaded_file = st.file_uploader("🖼️ Choose an image...", type=["png", "jpg", "jpeg"])
187
-
188
- if uploaded_file is not None:
189
- try:
190
- # Open the uploaded image
191
- image_pil = Image.open(uploaded_file).convert('L') # Ensure grayscale
192
- # Use use_container_width for deprecation warning fix
193
- st.image(image_pil, caption="Uploaded Image", use_container_width=True)
194
- st.write("---")
195
- st.write("Processing and Recognizing...")
196
-
197
- # Preprocess the image for the model using utils_ocr function
198
- processed_image_tensor = preprocess_user_image_for_ocr(image_pil, IMG_HEIGHT).to(device)
199
-
200
- trained_ocr_model.eval() # Ensure model is in evaluation mode
201
- with torch.no_grad(): # Disable gradient calculation for inference
202
- output = trained_ocr_model(processed_image_tensor) # (sequence_length, batch_size, num_classes)
203
- predicted_texts = ctc_greedy_decode(output, char_indexer)
204
- predicted_text = predicted_texts[0] # Get the first (and only) prediction
205
-
206
- st.success(f"Recognized Text: **{predicted_text}**")
207
-
208
- except Exception as e:
209
- st.error(f"Error processing image or recognizing text: {e}")
210
- st.info("💡 **Tips for best results:**\n"
211
- "- Ensure the handwritten text is clear and on a clean background.\n"
212
- "- Only include one name/word per image.\n"
213
- "- The model is trained on specific characters. Unusual symbols might not be recognized.")
214
- st.exception(e) # Display full traceback in Streamlit
215
-
216
- st.markdown("""
217
- ---
218
- *Built using Streamlit, PyTorch, OpenCV, and EditDistance ©2025 by MFT*
219
- """)
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # app.py
3
+
4
+ import os
5
+ # Disable Streamlit file watcher to prevent conflicts with PyTorch
6
+ os.environ["STREAMLIT_SERVER_ENABLE_FILE_WATCHER"] = "false"
7
+
8
+ import streamlit as st
9
+ import pandas as pd
10
+ import numpy as np
11
+ from PIL import Image
12
+ import torch
13
+ import torch.nn.functional as F
14
+ import torchvision.transforms as transforms
15
+ import traceback
16
+
17
+ # Import all necessary configuration values from config.py
18
+ from config import (
19
+ IMG_HEIGHT, NUM_CLASSES, BLANK_TOKEN, VOCABULARY, BLANK_TOKEN_SYMBOL,
20
+ TRAIN_CSV_PATH, TEST_CSV_PATH, TRAIN_IMAGES_DIR, TEST_IMAGES_DIR,
21
+ MODEL_SAVE_PATH, BATCH_SIZE, NUM_EPOCHS
22
+ )
23
+
24
+ # Import classes and functions from data_handler_ocr.py and model_ocr.py
25
+ from data_handler_ocr import CharIndexer, OCRDataset, ocr_collate_fn, load_ocr_dataframes, create_ocr_dataloaders
26
+ from model_ocr import CRNN, train_ocr_model, save_ocr_model, load_ocr_model, ctc_greedy_decode
27
+ from utils_ocr import preprocess_user_image_for_ocr, binarize_image, resize_image_for_ocr, normalize_image_for_model
28
+
29
+
30
+ # --- Global Variables ---
31
+ ocr_model = None
32
+ char_indexer = None
33
+ training_history = None
34
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
35
+
36
+ # --- Streamlit App Setup ---
37
+ st.set_page_config(layout="wide", page_title="Handwritten Name OCR App",)
38
+
39
+
40
+ st.title("📝 Handwritten Name Recognition (OCR) App")
41
+ st.markdown("""
42
+ This application uses a Convolutional Recurrent Neural Network (CRNN) to perform
43
+ Optical Character Recognition (OCR) on handwritten names. You can upload an image
44
+ of a handwritten name for prediction or train a new model using the provided dataset.
45
+
46
+ **Note:** Training a robust OCR model can be time-consuming.
47
+ """)
48
+
49
+ # --- Initialize CharIndexer ---
50
+ # This initializes char_indexer once when the script starts
51
+ char_indexer = CharIndexer(vocabulary_string=VOCABULARY, blank_token_symbol=BLANK_TOKEN_SYMBOL)
52
+
53
+ # --- Model Loading / Initialization ---
54
+ @st.cache_resource # Cache the model to prevent reloading on every rerun
55
+ def get_and_load_ocr_model_cached(num_classes, model_path):
56
+ """
57
+ Initializes the OCR model and attempts to load a pre-trained model.
58
+ If no pre-trained model exists, a new model instance is returned.
59
+ """
60
+ model_instance = CRNN(num_classes=num_classes, cnn_output_channels=512, rnn_hidden_size=256, rnn_num_layers=2)
61
+
62
+ if os.path.exists(model_path):
63
+ st.sidebar.info("Loading pre-trained OCR model...")
64
+ try:
65
+ # Load model to CPU first, then move to device
66
+ model_instance.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
67
+ st.sidebar.success("OCR model loaded successfully!")
68
+ except Exception as e:
69
+ st.sidebar.error(f"Error loading model: {e}. A new model will be initialized.")
70
+ # If loading fails, re-initialize an untrained model
71
+ model_instance = CRNN(num_classes=num_classes, cnn_output_channels=512, rnn_hidden_size=256, rnn_num_layers=2)
72
+ else:
73
+ st.sidebar.warning("No pre-trained OCR model found. Please train a model using the sidebar option.")
74
+
75
+ return model_instance
76
+
77
+ # Get the model instance and assign it to the global 'ocr_model'
78
+ ocr_model = get_and_load_ocr_model_cached(char_indexer.num_classes, MODEL_SAVE_PATH)
79
+ # Ensure the model is on the correct device for inference
80
+ ocr_model.to(device)
81
+ ocr_model.eval() # Set model to evaluation mode for inference by default
82
+
83
+
84
+ # --- Sidebar for Model Training ---
85
+ st.sidebar.header("Train OCR Model")
86
+ st.sidebar.write("Click the button below to start training the OCR model.")
87
+
88
+ # Progress bar and label for training in the sidebar
89
+ progress_bar_sidebar = st.sidebar.progress(0)
90
+ progress_label_sidebar = st.sidebar.empty()
91
+
92
+ def update_progress_callback_sidebar(value, text):
93
+ progress_bar_sidebar.progress(int(value * 100))
94
+ progress_label_sidebar.text(text)
95
+
96
+ if st.sidebar.button("📊 Start Training"):
97
+ progress_bar_sidebar.progress(0)
98
+ progress_label_sidebar.empty()
99
+ st.empty()
100
+
101
+ if not os.path.exists(TRAIN_CSV_PATH) or not os.path.isdir(TRAIN_IMAGES_DIR):
102
+ st.sidebar.error(f"Training CSV '{TRAIN_CSV_PATH}' or Images directory '{TRAIN_IMAGES_DIR}' not found!")
103
+ elif not os.path.exists(TEST_CSV_PATH) or not os.path.isdir(TEST_IMAGES_DIR):
104
+ st.sidebar.warning(f"Test CSV '{TEST_CSV_PATH}' or Images directory '{TEST_IMAGES_DIR}' not found. "
105
+ "Evaluation might be affected or skipped. Please ensure all data paths are correct.")
106
+ else:
107
+ st.sidebar.info(f"Training a new CRNN model for {NUM_EPOCHS} epochs. This will take significant time...")
108
+
109
+ try:
110
+ train_df, test_df = load_ocr_dataframes(TRAIN_CSV_PATH, TEST_CSV_PATH)
111
+ st.sidebar.success("Training and Test DataFrames loaded successfully.")
112
+
113
+ st.sidebar.success(f"CharIndexer initialized with {char_indexer.num_classes} classes.")
114
+
115
+ train_loader, test_loader = create_ocr_dataloaders(train_df, test_df, char_indexer, BATCH_SIZE)
116
+ st.sidebar.success("DataLoaders created successfully.")
117
+
118
+ ocr_model.train()
119
+
120
+ st.sidebar.write("Training in progress... This may take a while.")
121
+ ocr_model, training_history = train_ocr_model(
122
+ model=ocr_model,
123
+ train_loader=train_loader,
124
+ test_loader=test_loader,
125
+ char_indexer=char_indexer,
126
+ epochs=NUM_EPOCHS,
127
+ device=device,
128
+ progress_callback=update_progress_callback_sidebar
129
+ )
130
+ st.sidebar.success("OCR model training finished!")
131
+ update_progress_callback_sidebar(1.0, "Training complete!")
132
+
133
+ os.makedirs(os.path.dirname(MODEL_SAVE_PATH), exist_ok=True)
134
+ save_ocr_model(ocr_model, MODEL_SAVE_PATH)
135
+ st.sidebar.success(f"Trained model saved to `{MODEL_SAVE_PATH}`")
136
+
137
+ except Exception as e:
138
+ st.sidebar.error(f"An error occurred during training: {e}")
139
+ st.exception(e)
140
+ update_progress_callback_sidebar(0.0, "Training failed!")
141
+
142
+ # --- Sidebar for Model Loading ---
143
+ st.sidebar.header("Load Pre-trained Model")
144
+ st.sidebar.write("If you have a saved model, you can load it here instead of training.")
145
+
146
+ if st.sidebar.button("💾 Load Model"):
147
+ if os.path.exists(MODEL_SAVE_PATH):
148
+ try:
149
+ loaded_model = CRNN(num_classes=char_indexer.num_classes)
150
+ load_ocr_model(loaded_model, MODEL_SAVE_PATH)
151
+ loaded_model.to(device)
152
+
153
+ st.sidebar.success(f"Model loaded successfully from `{MODEL_SAVE_PATH}`")
154
+ except Exception as e:
155
+ st.sidebar.error(f"Error loading model: {e}")
156
+ st.exception(e)
157
+ else:
158
+ st.sidebar.warning(f"No model found at `{MODEL_SAVE_PATH}`. Please train a model first or check the path.")
159
+
160
+ # --- Main Content: Prediction Section and Training History ---
161
+
162
+ # Display training history chart
163
+ if training_history:
164
+ st.subheader("Training History Plots")
165
+ history_df = pd.DataFrame({
166
+ 'Epoch': range(1, len(training_history['train_loss']) + 1),
167
+ 'Train Loss': training_history['train_loss'],
168
+ 'Test Loss': training_history['test_loss'],
169
+ 'Test CER (%)': [cer * 100 for cer in training_history['test_cer']],
170
+ 'Test Exact Match Accuracy (%)': [acc * 100 for acc in training_history['test_exact_match_accuracy']]
171
+ })
172
+
173
+ st.markdown("**Loss over Epochs**")
174
+ st.line_chart(history_df.set_index('Epoch')[['Train Loss', 'Test Loss']])
175
+ st.caption("Lower loss indicates better model performance.")
176
+
177
+ st.markdown("**Character Error Rate (CER) over Epochs**")
178
+ st.line_chart(history_df.set_index('Epoch')[['Test CER (%)']])
179
+ st.caption("Lower CER indicates fewer character errors (0% is perfect).")
180
+
181
+ st.markdown("**Exact Match Accuracy over Epochs**")
182
+ st.line_chart(history_df.set_index('Epoch')[['Test Exact Match Accuracy (%)']])
183
+ st.caption("Higher exact match accuracy indicates more perfectly recognized names.")
184
+
185
+ st.markdown("**Performance Metrics over Epochs (CER vs. Exact Match Accuracy)**")
186
+ st.line_chart(history_df.set_index('Epoch')[['Test CER (%)', 'Test Exact Match Accuracy (%)']])
187
+ st.caption("CER should decrease, Accuracy should increase.")
188
+ st.write("---") # Separator after charts
189
+
190
+
191
+ # Predict on a New Image
192
+
193
+ if ocr_model is None:
194
+ st.warning("Please train or load a model before attempting prediction.")
195
+ else:
196
+ uploaded_file = st.file_uploader("🖼️ Choose an image...", type=["png", "jpg", "jpeg", "jfif"])
197
+
198
+ if uploaded_file is not None:
199
+ try:
200
+ image_pil = Image.open(uploaded_file).convert('L')
201
+ st.image(image_pil, caption="Uploaded Image", use_container_width=True)
202
+ st.write("---")
203
+ st.write("Processing and Recognizing...")
204
+
205
+ processed_image_tensor = preprocess_user_image_for_ocr(image_pil, IMG_HEIGHT).to(device)
206
+
207
+ ocr_model.eval()
208
+ with torch.no_grad():
209
+ output = ocr_model(processed_image_tensor)
210
+
211
+ predicted_texts = ctc_greedy_decode(output, char_indexer)
212
+ predicted_text = predicted_texts[0]
213
+
214
+ st.success(f"Recognized Text: **{predicted_text}**")
215
+
216
+ except Exception as e:
217
+ st.error(f"Error processing image or recognizing text: {e}")
218
+ st.info("💡 **Tips for best results:**\n"
219
+ "- Ensure the handwritten text is clear and on a clean background.\n"
220
+ "- Only include one name/word per image.\n"
221
+ "- The model is trained on specific characters. Unusual symbols might not be recognized.")
222
+ st.exception(e)
223
+
224
+ st.markdown("""
225
+ ---
226
+ *Built using Streamlit, PyTorch, OpenCV, and EditDistance ©2025 by MFT*
227
+ """)
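
A quick way to sanity-check the inference path in app.py outside Streamlit: build a dummy (1, 1, IMG_HEIGHT, W) tensor in place of a preprocessed upload, run the CRNN, and greedy-decode the (T, N, num_classes) output. This is only a shape-level sketch that reuses the repo's own modules; with untrained weights the decoded string is meaningless and only the shapes matter.

import torch

from config import IMG_HEIGHT, VOCABULARY, BLANK_TOKEN_SYMBOL
from data_handler_ocr import CharIndexer
from model_ocr import CRNN, ctc_greedy_decode

char_indexer = CharIndexer(vocabulary_string=VOCABULARY, blank_token_symbol=BLANK_TOKEN_SYMBOL)
model = CRNN(num_classes=char_indexer.num_classes)
model.eval()

# Stand-in for preprocess_user_image_for_ocr(...): batch of 1, 1 channel, fixed height, arbitrary width.
dummy_input = torch.rand(1, 1, IMG_HEIGHT, 128)

with torch.no_grad():
    logits = model(dummy_input)          # (T, N=1, num_classes), per CRNN.forward
print(logits.shape)

predicted_texts = ctc_greedy_decode(logits, char_indexer)
print(predicted_texts[0])                # arbitrary text until the model is trained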
config.py CHANGED
@@ -1,4 +1,3 @@
1
- <<<<<<< HEAD
2
  # config.py
3
 
4
  import os
@@ -8,8 +7,8 @@ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
8
  DATA_DIR = os.path.join(BASE_DIR, 'data')
9
  MODELS_DIR = os.path.join(BASE_DIR, 'models')
10
 
11
- TRAIN_IMAGES_DIR = os.path.join(DATA_DIR, 'images', 'train')
12
- TEST_IMAGES_DIR = os.path.join(DATA_DIR, 'images', 'test')
13
 
14
  TRAIN_CSV_PATH = os.path.join(DATA_DIR, 'train.csv')
15
  TEST_CSV_PATH = os.path.join(DATA_DIR, 'test.csv')
@@ -17,26 +16,13 @@ TEST_CSV_PATH = os.path.join(DATA_DIR, 'test.csv')
17
  MODEL_SAVE_PATH = os.path.join(MODELS_DIR, 'handwritten_name_ocr_model.pth')
18
 
19
  # --- Character Set and OCR Configuration ---
20
- # This character set MUST cover all characters present in your dataset.
21
- # Add any special characters if needed.
22
- # The order here is crucial as it defines the indices for your characters.
23
  CHARS = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"
24
-
25
- # Define the character for the blank token. It MUST NOT be in CHARS.
26
- BLANK_TOKEN_SYMBOL = 'Þ'
27
-
28
- # Construct the full vocabulary string. It's conventional to put the blank token last.
29
- # This VOCABULARY string is what you pass to CharIndexer.
30
  VOCABULARY = CHARS + BLANK_TOKEN_SYMBOL
31
-
32
- # NUM_CLASSES is the total number of unique symbols in the vocabulary, including the blank.
33
  NUM_CLASSES = len(VOCABULARY)
34
-
35
- # BLANK_TOKEN is the actual index of the blank symbol within the VOCABULARY.
36
- # Since we appended it last, its index will be len(CHARS).
37
  BLANK_TOKEN = VOCABULARY.find(BLANK_TOKEN_SYMBOL)
38
 
39
- # --- Sanity Checks (Highly Recommended) ---
40
  if BLANK_TOKEN == -1:
41
  raise ValueError(f"Error: BLANK_TOKEN_SYMBOL '{BLANK_TOKEN_SYMBOL}' not found in VOCABULARY. Check config.py definitions.")
42
  if BLANK_TOKEN >= NUM_CLASSES:
@@ -48,65 +34,15 @@ print(f"Blank Symbol: '{BLANK_TOKEN_SYMBOL}' at index {BLANK_TOKEN}")
48
 
49
 
50
  # --- Image Preprocessing Parameters ---
51
- IMG_HEIGHT = 32
 
52
 
53
  # --- Training Parameters ---
54
- BATCH_SIZE = 64
 
 
 
 
 
 
55
  LEARNING_RATE = 0.001
56
- =======
57
- # config.py
58
-
59
- import os
60
-
61
- # --- Paths ---
62
- BASE_DIR = os.path.dirname(os.path.abspath(__file__))
63
- DATA_DIR = os.path.join(BASE_DIR, 'data')
64
- MODELS_DIR = os.path.join(BASE_DIR, 'models')
65
-
66
- TRAIN_IMAGES_DIR = os.path.join(DATA_DIR, 'images', 'train')
67
- TEST_IMAGES_DIR = os.path.join(DATA_DIR, 'images', 'test')
68
-
69
- TRAIN_CSV_PATH = os.path.join(DATA_DIR, 'train.csv')
70
- TEST_CSV_PATH = os.path.join(DATA_DIR, 'test.csv')
71
-
72
- MODEL_SAVE_PATH = os.path.join(MODELS_DIR, 'handwritten_name_ocr_model.pth')
73
-
74
- # --- Character Set and OCR Configuration ---
75
- # This character set MUST cover all characters present in your dataset.
76
- # Add any special characters if needed.
77
- # The order here is crucial as it defines the indices for your characters.
78
- CHARS = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"
79
-
80
- # Define the character for the blank token. It MUST NOT be in CHARS.
81
- BLANK_TOKEN_SYMBOL = 'Þ'
82
-
83
- # Construct the full vocabulary string. It's conventional to put the blank token last.
84
- # This VOCABULARY string is what you pass to CharIndexer.
85
- VOCABULARY = CHARS + BLANK_TOKEN_SYMBOL
86
-
87
- # NUM_CLASSES is the total number of unique symbols in the vocabulary, including the blank.
88
- NUM_CLASSES = len(VOCABULARY)
89
-
90
- # BLANK_TOKEN is the actual index of the blank symbol within the VOCABULARY.
91
- # Since we appended it last, its index will be len(CHARS).
92
- BLANK_TOKEN = VOCABULARY.find(BLANK_TOKEN_SYMBOL)
93
-
94
- # --- Sanity Checks (Highly Recommended) ---
95
- if BLANK_TOKEN == -1:
96
- raise ValueError(f"Error: BLANK_TOKEN_SYMBOL '{BLANK_TOKEN_SYMBOL}' not found in VOCABULARY. Check config.py definitions.")
97
- if BLANK_TOKEN >= NUM_CLASSES:
98
- raise ValueError(f"Error: BLANK_TOKEN index ({BLANK_TOKEN}) must be less than NUM_CLASSES ({NUM_CLASSES}).")
99
-
100
- print(f"Config Loaded: NUM_CLASSES={NUM_CLASSES}, BLANK_TOKEN_INDEX={BLANK_TOKEN}")
101
- print(f"Vocabulary Length: {len(VOCABULARY)}")
102
- print(f"Blank Symbol: '{BLANK_TOKEN_SYMBOL}' at index {BLANK_TOKEN}")
103
-
104
-
105
- # --- Image Preprocessing Parameters ---
106
- IMG_HEIGHT = 32
107
-
108
- # --- Training Parameters ---
109
- BATCH_SIZE = 64
110
- LEARNING_RATE = 0.001
111
- >>>>>>> ee59e5b21399d8b323cff452a961ea2fd6c65308
112
- NUM_EPOCHS = 3
 
 
1
  # config.py
2
 
3
  import os
 
7
  DATA_DIR = os.path.join(BASE_DIR, 'data')
8
  MODELS_DIR = os.path.join(BASE_DIR, 'models')
9
 
10
+ TRAIN_IMAGES_DIR = os.path.join(DATA_DIR, 'images')
11
+ TEST_IMAGES_DIR = os.path.join(DATA_DIR, 'images')
12
 
13
  TRAIN_CSV_PATH = os.path.join(DATA_DIR, 'train.csv')
14
  TEST_CSV_PATH = os.path.join(DATA_DIR, 'test.csv')
 
16
  MODEL_SAVE_PATH = os.path.join(MODELS_DIR, 'handwritten_name_ocr_model.pth')
17
 
18
  # --- Character Set and OCR Configuration ---
 
 
 
19
  CHARS = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"
20
+ BLANK_TOKEN_SYMBOL = 'Þ'
 
 
 
 
 
21
  VOCABULARY = CHARS + BLANK_TOKEN_SYMBOL
 
 
22
  NUM_CLASSES = len(VOCABULARY)
 
 
 
23
  BLANK_TOKEN = VOCABULARY.find(BLANK_TOKEN_SYMBOL)
24
 
25
+ # --- Sanity Checks ---
26
  if BLANK_TOKEN == -1:
27
  raise ValueError(f"Error: BLANK_TOKEN_SYMBOL '{BLANK_TOKEN_SYMBOL}' not found in VOCABULARY. Check config.py definitions.")
28
  if BLANK_TOKEN >= NUM_CLASSES:
 
34
 
35
 
36
  # --- Image Preprocessing Parameters ---
37
+ IMG_HEIGHT = 32 # Target height for all input images to the model
38
+ MAX_IMG_WIDTH = 1024 # Adjust this value based on your typical image widths and available RAM
39
 
40
  # --- Training Parameters ---
41
+ BATCH_SIZE = 10
42
+
43
+ # NEW: Dataset Limits
44
+ TRAIN_SAMPLES_LIMIT = 1000
45
+ TEST_SAMPLES_LIMIT = 1000
46
+
47
+ NUM_EPOCHS = 5
48
  LEARNING_RATE = 0.001
 
 
data_handler_ocr.py CHANGED
@@ -1,151 +1,165 @@
1
- #data_handler_ocr.py
2
-
3
- import pandas as pd
4
- import torch
5
- from torch.utils.data import Dataset, DataLoader
6
- from torchvision import transforms
7
- import os
8
- from PIL import Image
9
- import numpy as np
10
- import torch.nn.functional as F
11
-
12
- # Import utility functions and config
13
- from config import VOCABULARY, BLANK_TOKEN, BLANK_TOKEN_SYMBOL, IMG_HEIGHT, TRAIN_IMAGES_DIR, TEST_IMAGES_DIR
14
- from utils_ocr import load_image_as_grayscale, binarize_image, resize_image_for_ocr, normalize_image_for_model
15
-
16
- class CharIndexer:
17
- """Manages character-to-index and index-to-character mappings."""
18
- def __init__(self, vocabulary_string: str, blank_token_symbol: str):
19
- self.chars = sorted(list(set(vocabulary_string)))
20
- self.char_to_idx = {char: i for i, char in enumerate(self.chars)}
21
- self.idx_to_char = {i: char for i, char in enumerate(self.chars)}
22
-
23
- if blank_token_symbol not in self.char_to_idx:
24
- raise ValueError(f"Blank token symbol '{blank_token_symbol}' not found in provided vocabulary string: '{vocabulary_string}'")
25
-
26
- self.blank_token_idx = self.char_to_idx[blank_token_symbol]
27
- self.num_classes = len(self.chars)
28
-
29
- if self.blank_token_idx >= self.num_classes:
30
- raise ValueError(f"Blank token index ({self.blank_token_idx}) is out of range for num_classes ({self.num_classes}). This indicates a configuration mismatch.")
31
-
32
- print(f"CharIndexer initialized: num_classes={self.num_classes}, blank_token_idx={self.blank_token_idx}")
33
- print(f"Mapped blank symbol: '{self.idx_to_char[self.blank_token_idx]}'")
34
-
35
- def encode(self, text: str) -> list[int]:
36
- """Converts a text string to a list of integer indices."""
37
- encoded_list = []
38
- for char in text:
39
- if char in self.char_to_idx:
40
- encoded_list.append(self.char_to_idx[char])
41
- else:
42
- print(f"Warning: Character '{char}' not found in CharIndexer vocabulary. Mapping to blank token.")
43
- encoded_list.append(self.blank_token_idx)
44
- return encoded_list
45
-
46
- def decode(self, indices: list[int]) -> str:
47
- """Converts a list of integer indices back to a text string."""
48
- decoded_text = []
49
- for i, idx in enumerate(indices):
50
- if idx == self.blank_token_idx:
51
- continue
52
- if i > 0 and indices[i-1] == idx:
53
- continue
54
- if idx in self.idx_to_char:
55
- decoded_text.append(self.idx_to_char[idx])
56
- else:
57
- print(f"Warning: Index {idx} not found in CharIndexer's idx_to_char mapping during decoding.")
58
-
59
- return "".join(decoded_text)
60
-
61
- class OCRDataset(Dataset):
62
- """
63
- Custom PyTorch Dataset for the Handwritten Name Recognition task.
64
- Loads images and their corresponding text labels.
65
- """
66
- def __init__(self, dataframe: pd.DataFrame, char_indexer: CharIndexer, image_dir: str, transform=None):
67
- self.data = dataframe
68
- self.char_indexer = char_indexer
69
- self.image_dir = image_dir
70
-
71
- if transform is None:
72
- self.transform = transforms.Compose([
73
-
74
- transforms.Lambda(lambda img: binarize_image(img)),
75
- transforms.Lambda(lambda img: resize_image_for_ocr(img, IMG_HEIGHT)),
76
- transforms.ToTensor(),
77
- transforms.Lambda(normalize_image_for_model)
78
- ])
79
- else:
80
- self.transform = transform
81
-
82
-
83
- def __len__(self) -> int:
84
- return len(self.data)
85
-
86
- def __getitem__(self, idx):
87
- raw_filename_entry = self.data.loc[idx, 'FILENAME']
88
- ground_truth_text = self.data.loc[idx, 'IDENTITY']
89
-
90
- filename_only = raw_filename_entry.split(',')[0].strip()
91
-
92
- img_path = os.path.join(self.image_dir, filename_only)
93
- ground_truth_text = str(ground_truth_text)
94
-
95
- try:
96
- image = load_image_as_grayscale(img_path)
97
- except FileNotFoundError:
98
- print(f"Error: Image file not found at {img_path}. Please check your dataset and config.py paths.")
99
- raise
100
-
101
- if self.transform:
102
- image = self.transform(image)
103
-
104
- image_width = image.shape[2]
105
-
106
- text_encoded = torch.tensor(self.char_indexer.encode(ground_truth_text), dtype=torch.long)
107
- text_length = len(text_encoded)
108
-
109
- return image, text_encoded, image_width, text_length
110
-
111
- def ocr_collate_fn(batch: list) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
112
- """
113
- Custom collate function for the DataLoader to handle variable-width images
114
- and variable-length text sequences for CTC loss.
115
- """
116
- images, texts, image_widths, text_lengths = zip(*batch)
117
-
118
- max_batch_width = max(image_widths)
119
- padded_images = [F.pad(img, (0, max_batch_width - img.shape[2]), 'constant', 0) for img in images]
120
- images_batch = torch.stack(padded_images, 0)
121
-
122
- texts_batch = torch.cat(texts, 0)
123
- text_lengths_tensor = torch.tensor(text_lengths, dtype=torch.long)
124
- image_widths_tensor = torch.tensor(image_widths, dtype=torch.long)
125
-
126
- return images_batch, texts_batch, image_widths_tensor, text_lengths_tensor
127
-
128
-
129
- def load_ocr_dataframes(train_csv_path: str, test_csv_path: str) -> tuple[pd.DataFrame, pd.DataFrame]:
130
- """
131
- Loads training and testing dataframes.
132
- Assumes CSVs have 'FILENAME' and 'IDENTITY' columns and are comma-delimited with no header.
133
- """
134
- train_df = pd.read_csv(train_csv_path, delimiter=',', names=['FILENAME', 'IDENTITY'], header=None, encoding='utf-8')
135
- test_df = pd.read_csv(test_csv_path, delimiter=',', names=['FILENAME', 'IDENTITY'], header=None, encoding='utf-8')
136
- return train_df, test_df
137
-
138
- def create_ocr_dataloaders(train_df: pd.DataFrame, test_df: pd.DataFrame,
139
- char_indexer: CharIndexer, batch_size: int) -> tuple[DataLoader, DataLoader]:
140
- """
141
- Creates PyTorch DataLoader objects for OCR training and testing datasets,
142
- using specific image directories for train/test.
143
- """
144
- train_dataset = OCRDataset(dataframe=train_df, char_indexer=char_indexer, image_dir=TRAIN_IMAGES_DIR)
145
- test_dataset = OCRDataset(dataframe=test_df, char_indexer=char_indexer, image_dir=TEST_IMAGES_DIR)
146
-
147
- train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
148
- num_workers=0, collate_fn=ocr_collate_fn)
149
- test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
150
- num_workers=0, collate_fn=ocr_collate_fn)
151
- return train_loader, test_loader
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #data_handler_ocr.py
2
+
3
+ import pandas as pd
4
+ import torch
5
+ from torch.utils.data import Dataset, DataLoader
6
+ from torchvision import transforms
7
+ import os
8
+ from PIL import Image
9
+ import numpy as np
10
+ import torch.nn.functional as F
11
+
12
+ # Import utility functions and config
13
+ from config import (
14
+ VOCABULARY, BLANK_TOKEN, BLANK_TOKEN_SYMBOL, IMG_HEIGHT,
15
+ TRAIN_IMAGES_DIR, TEST_IMAGES_DIR,
16
+ TRAIN_SAMPLES_LIMIT, TEST_SAMPLES_LIMIT
17
+ )
18
+ from utils_ocr import load_image_as_grayscale, binarize_image, resize_image_for_ocr, normalize_image_for_model
19
+
20
+ class CharIndexer:
21
+ """Manages character-to-index and index-to-character mappings."""
22
+ def __init__(self, vocabulary_string: str, blank_token_symbol: str):
23
+ self.chars = sorted(list(set(vocabulary_string)))
24
+ self.char_to_idx = {char: i for i, char in enumerate(self.chars)}
25
+ self.idx_to_char = {i: char for i, char in enumerate(self.chars)}
26
+
27
+ if blank_token_symbol not in self.char_to_idx:
28
+ raise ValueError(f"Blank token symbol '{blank_token_symbol}' not found in provided vocabulary string: '{vocabulary_string}'")
29
+
30
+ self.blank_token_idx = self.char_to_idx[blank_token_symbol]
31
+ self.num_classes = len(self.chars)
32
+
33
+ if self.blank_token_idx >= self.num_classes:
34
+ raise ValueError(f"Blank token index ({self.blank_token_idx}) is out of range for num_classes ({self.num_classes}). This indicates a configuration mismatch.")
35
+
36
+ print(f"CharIndexer initialized: num_classes={self.num_classes}, blank_token_idx={self.blank_token_idx}")
37
+ print(f"Mapped blank symbol: '{self.idx_to_char[self.blank_token_idx]}'")
38
+
39
+ def encode(self, text: str) -> list[int]:
40
+ """Converts a text string to a list of integer indices."""
41
+ encoded_list = []
42
+ for char in text:
43
+ if char in self.char_to_idx:
44
+ encoded_list.append(self.char_to_idx[char])
45
+ else:
46
+ print(f"Warning: Character '{char}' not found in CharIndexer vocabulary. Mapping to blank token.")
47
+ encoded_list.append(self.blank_token_idx)
48
+ return encoded_list
49
+
50
+ def decode(self, indices: list[int]) -> str:
51
+ """Converts a list of integer indices back to a text string."""
52
+ decoded_text = []
53
+ for i, idx in enumerate(indices):
54
+ if idx == self.blank_token_idx:
55
+ continue # Skip blank tokens
56
+
57
+ if i > 0 and indices[i-1] == idx:
58
+ continue
59
+
60
+ if idx in self.idx_to_char:
61
+ decoded_text.append(self.idx_to_char[idx])
62
+ else:
63
+ print(f"Warning: Index {idx} not found in CharIndexer's idx_to_char mapping during decoding.")
64
+
65
+ return "".join(decoded_text)
66
+
67
+ class OCRDataset(Dataset):
68
+ """
69
+ Custom PyTorch Dataset for the Handwritten Name Recognition task.
70
+ Loads images and their corresponding text labels.
71
+ """
72
+ def __init__(self, dataframe: pd.DataFrame, char_indexer: CharIndexer, image_dir: str, transform=None):
73
+ self.data = dataframe
74
+ self.char_indexer = char_indexer
75
+ self.image_dir = image_dir
76
+
77
+ if transform is None:
78
+ self.transform = transforms.Compose([
79
+ transforms.Lambda(lambda img: binarize_image(img)),
80
+ transforms.Lambda(lambda img: resize_image_for_ocr(img, IMG_HEIGHT)), # Resize image to fixed height
81
+ transforms.ToTensor(), # Convert PIL Image to PyTorch Tensor (H, W) -> (1, H, W), scales to [0,1]
82
+ transforms.Lambda(normalize_image_for_model) # Normalize pixel values to [-1, 1]
83
+ ])
84
+ else:
85
+ self.transform = transform
86
+
87
+
88
+ def __len__(self) -> int:
89
+ return len(self.data)
90
+
91
+ def __getitem__(self, idx):
92
+ raw_filename_entry = self.data.loc[idx, 'FILENAME']
93
+ ground_truth_text = self.data.loc[idx, 'IDENTITY']
94
+
95
+ filename = raw_filename_entry.split(',')[0].strip()
96
+ img_path = os.path.join(self.image_dir, filename)
97
+ ground_truth_text = str(ground_truth_text)
98
+
99
+ try:
100
+ image = load_image_as_grayscale(img_path) # Returns PIL Image 'L'
101
+ except FileNotFoundError:
102
+ print(f"Error: Image file not found at {img_path}. Check your dataset and config.py paths.")
103
+ raise
104
+
105
+ if self.transform:
106
+ image = self.transform(image)
107
+
108
+ image_width = image.shape[2] # Assuming image is (C, H, W) after transform
109
+
110
+ text_encoded = torch.tensor(self.char_indexer.encode(ground_truth_text), dtype=torch.long)
111
+ text_length = len(text_encoded)
112
+
113
+ return image, text_encoded, image_width, text_length
114
+
115
+ def ocr_collate_fn(batch: list) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
116
+ """
117
+ Custom collate function for the DataLoader to handle variable-width images
118
+ and variable-length text sequences for CTC loss.
119
+ """
120
+ images, texts, image_widths, text_lengths = zip(*batch)
121
+
122
+ max_batch_width = max(image_widths)
123
+ padded_images = [F.pad(img, (0, max_batch_width - img.shape[2]), 'constant', 0) for img in images]
124
+ images_batch = torch.stack(padded_images, 0)
125
+
126
+ texts_batch = torch.cat(texts, 0)
127
+ text_lengths_tensor = torch.tensor(list(text_lengths), dtype=torch.long)
128
+ image_widths_tensor = torch.tensor(image_widths, dtype=torch.long)
129
+
130
+ return images_batch, texts_batch, image_widths_tensor, text_lengths_tensor
131
+
132
+
133
+ def load_ocr_dataframes(train_csv_path: str, test_csv_path: str) -> tuple[pd.DataFrame, pd.DataFrame]:
134
+ """
135
+ Loads training and testing dataframes.
136
+ Assumes CSVs have 'FILENAME' and 'IDENTITY' columns.
137
+ Applies dataset limits from config.py.
138
+ """
139
+ train_df = pd.read_csv(train_csv_path, encoding='ISO-8859-1')
140
+ test_df = pd.read_csv(test_csv_path, encoding='ISO-8859-1')
141
+
142
+ # Apply limits if they are set (not 0)
143
+ if TRAIN_SAMPLES_LIMIT > 0:
144
+ train_df = train_df.head(TRAIN_SAMPLES_LIMIT)
145
+ print(f"Limited training data to {TRAIN_SAMPLES_LIMIT} samples.")
146
+ if TEST_SAMPLES_LIMIT > 0:
147
+ test_df = test_df.head(TEST_SAMPLES_LIMIT)
148
+ print(f"Limited test data to {TEST_SAMPLES_LIMIT} samples.")
149
+
150
+ return train_df, test_df
151
+
152
+ def create_ocr_dataloaders(train_df: pd.DataFrame, test_df: pd.DataFrame,
153
+ char_indexer: CharIndexer, batch_size: int) -> tuple[DataLoader, DataLoader]:
154
+ """
155
+ Creates PyTorch DataLoader objects for OCR training and testing datasets,
156
+ using specific image directories for train/test.
157
+ """
158
+ train_dataset = OCRDataset(train_df, char_indexer, TRAIN_IMAGES_DIR)
159
+ test_dataset = OCRDataset(test_df, char_indexer, TEST_IMAGES_DIR)
160
+
161
+ train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
162
+ num_workers=0, collate_fn=ocr_collate_fn)
163
+ test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
164
+ num_workers=0, collate_fn=ocr_collate_fn)
165
+ return train_loader, test_loader
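
To see what ocr_collate_fn hands to the training loop, two fake samples of different widths are enough: images are right-padded to the widest one in the batch, while the encoded labels are concatenated into a single flat tensor (the form nn.CTCLoss accepts together with target_lengths). A minimal sketch, assuming the module above is importable:

import torch
from data_handler_ocr import ocr_collate_fn

# Two fake (image, encoded_text, image_width, text_length) tuples, as returned by OCRDataset.__getitem__.
sample_a = (torch.zeros(1, 32, 100), torch.tensor([5, 6, 7]), 100, 3)
sample_b = (torch.zeros(1, 32, 140), torch.tensor([8, 9]), 140, 2)

images, targets, widths, lengths = ocr_collate_fn([sample_a, sample_b])
print(images.shape)      # torch.Size([2, 1, 32, 140]) -- both padded to the widest image
print(targets)           # tensor([5, 6, 7, 8, 9])      -- labels concatenated, not padded
print(widths, lengths)   # tensor([100, 140]) tensor([3, 2])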
model_ocr.py CHANGED
@@ -1,286 +1,285 @@
1
- # model_ocr.py
2
-
3
- import torch
4
- import torch.nn as nn
5
- import torch.nn.functional as F
6
- import torch.optim as optim
7
- from torch.utils.data import DataLoader # Keep DataLoader for type hinting
8
- from tqdm import tqdm
9
- from sklearn.metrics import accuracy_score
10
- import editdistance
11
-
12
- # Import config and char_indexer
13
- from config import IMG_HEIGHT, NUM_CLASSES, BLANK_TOKEN
14
- from data_handler_ocr import CharIndexer
15
- from utils_ocr import binarize_image, resize_image_for_ocr, normalize_image_for_model
16
-
17
-
18
- class CNN_Backbone(nn.Module):
19
- """
20
- CNN feature extractor for OCR. Designed to produce features suitable for RNN.
21
- Output feature map should have height 1 after the final pooling/reduction.
22
- """
23
- def __init__(self, input_channels=1, output_channels=512):
24
- super(CNN_Backbone, self).__init__()
25
- self.cnn = nn.Sequential(
26
- # First block
27
- nn.Conv2d(input_channels, 64, kernel_size=3, stride=1, padding=1),
28
- nn.ReLU(True),
29
- nn.MaxPool2d(kernel_size=2, stride=2), # H: 32 -> 16, W: W_in -> W_in/2
30
-
31
- # Second block
32
- nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
33
- nn.ReLU(True),
34
- nn.MaxPool2d(kernel_size=2, stride=2), # H: 16 -> 8, W: W_in/2 -> W_in/4
35
-
36
- # Third block (with two conv layers)
37
- nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
38
- nn.ReLU(True),
39
- nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
40
- nn.ReLU(True),
41
- # This MaxPool2d effectively brings height from 8 to 4, with a small width adjustment due to padding
42
- nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 1), padding=(0, 1)), # H: 8 -> 4, W: (W/4) -> (W/4 + 1) (approx)
43
-
44
- # Fourth block
45
- nn.Conv2d(256, output_channels, kernel_size=3, stride=1, padding=1),
46
- nn.ReLU(True),
47
- # This AdaptiveAvgPool2d makes sure the height dimension becomes 1
48
- # while preserving the width. This is crucial for RNN input.
49
- nn.AdaptiveAvgPool2d((1, None)) # Output height 1, preserve width
50
- )
51
-
52
- def forward(self, x: torch.Tensor) -> torch.Tensor:
53
- # x: (N, C, H, W) e.g., (B, 1, 32, W_img)
54
-
55
- # Pass through the CNN layers
56
- conv_features = self.cnn(x) # Output: (N, cnn_out_channels, 1, W_prime)
57
-
58
- # Squeeze the height dimension (which is 1)
59
- # This transforms (N, C_out, 1, W_prime) to (N, C_out, W_prime)
60
- conv_features = conv_features.squeeze(2)
61
-
62
- # Permute for RNN input: (sequence_length, batch_size, input_size)
63
- # This transforms (N, C_out, W_prime) to (W_prime, N, C_out)
64
- conv_features = conv_features.permute(2, 0, 1)
65
-
66
- # Return the CNN features, ready for the RNN layer in CRNN
67
- return conv_features
68
-
69
- class BidirectionalLSTM(nn.Module):
70
- """Bidirectional LSTM layer for sequence modeling."""
71
- def __init__(self, input_size: int, hidden_size: int, num_layers: int, dropout: float = 0.5):
72
- super(BidirectionalLSTM, self).__init__()
73
- self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
74
- bidirectional=True, dropout=dropout, batch_first=False)
75
- # batch_first=False expects input as (sequence_length, batch_size, input_size)
76
-
77
- def forward(self, x: torch.Tensor) -> torch.Tensor:
78
- output, _ = self.lstm(x) # [0] returns the output, [1] returns (h_n, c_n)
79
- return output
80
-
81
- class CRNN(nn.Module):
82
- """
83
- Convolutional Recurrent Neural Network for OCR.
84
- Combines CNN for feature extraction, LSTMs for sequence modeling,
85
- and a final linear layer for character prediction.
86
- """
87
- def __init__(self, num_classes: int, cnn_output_channels: int = 512,
88
- rnn_hidden_size: int = 256, rnn_num_layers: int = 2):
89
- super(CRNN, self).__init__()
90
- self.cnn = CNN_Backbone(output_channels=cnn_output_channels)
91
- # Input to LSTM is the number of channels from the CNN output
92
- self.rnn = BidirectionalLSTM(cnn_output_channels, rnn_hidden_size, rnn_num_layers)
93
- # Output of bidirectional LSTM is hidden_size * 2
94
- self.fc = nn.Linear(rnn_hidden_size * 2, num_classes)
95
-
96
- def forward(self, x: torch.Tensor) -> torch.Tensor:
97
- # x: (N, C, H, W) e.g., (B, 1, 32, W_img)
98
-
99
- # 1. Pass through the CNN to extract features
100
- conv_features = self.cnn(x) # Output: (W_prime, N, C_out) after permute in CNN_Backbone
101
-
102
- # 2. Pass CNN features through the RNN (LSTM)
103
- rnn_features = self.rnn(conv_features) # Output: (W_prime, N, rnn_hidden_size * 2)
104
-
105
- # 3. Pass RNN features through the final fully connected layer
106
- # Apply the linear layer to each time step independently
107
- # output will be (W_prime, N, num_classes)
108
- output = self.fc(rnn_features)
109
-
110
- return output
111
-
112
-
113
- # --- Decoding Function ---
114
- def ctc_greedy_decode(output: torch.Tensor, char_indexer: CharIndexer) -> list[str]:
115
- """
116
- Performs greedy decoding on the CTC output.
117
- output: (sequence_length, batch_size, num_classes) - raw logits
118
- """
119
- # Apply log_softmax to get probabilities for argmax
120
- log_probs = F.log_softmax(output, dim=2)
121
-
122
- # Permute to (batch_size, sequence_length, num_classes) for argmax along class dim
123
- # This gives us the index of the most probable character at each time step for each sample in the batch.
124
- predicted_indices = torch.argmax(log_probs.permute(1, 0, 2), dim=2).cpu().numpy()
125
-
126
- decoded_texts = []
127
- for seq in predicted_indices:
128
- # Use char_indexer's decode method, which handles blank removal and duplicate collapse
129
- decoded_texts.append(char_indexer.decode(seq.tolist())) # Convert numpy array to list
130
- return decoded_texts
131
-
132
- # --- Evaluation Function ---
133
- def evaluate_model(model: nn.Module, dataloader: DataLoader, char_indexer: CharIndexer, device: str):
134
- model.eval() # Set model to evaluation mode
135
- # CTCLoss needs the blank token index, which is available from char_indexer
136
- criterion = nn.CTCLoss(blank=char_indexer.blank_token_idx, zero_infinity=True)
137
- total_loss = 0
138
- all_predictions = []
139
- all_ground_truths = []
140
-
141
- with torch.no_grad(): # Disable gradient calculation for evaluation
142
- for inputs, targets_padded, _, target_lengths in tqdm(dataloader, desc="Evaluating"):
143
- inputs = inputs.to(device)
144
- targets_padded = targets_padded.to(device)
145
- target_lengths = target_lengths.to(device)
146
-
147
- output = model(inputs) # (seq_len, batch_size, num_classes)
148
-
149
- # Calculate input_lengths for CTCLoss. This is the sequence length produced by the CNN/RNN.
150
- # It's the `output.shape[0]` (sequence_length) for each item in the batch.
151
- outputs_seq_len_for_ctc = torch.full(
152
- size=(output.shape[1],), # batch_size
153
- fill_value=output.shape[0], # actual sequence length (T) from model output
154
- dtype=torch.long,
155
- device=device
156
- )
157
-
158
- # CTC Loss calculation requires log_softmax on the output logits
159
- log_probs_for_loss = F.log_softmax(output, dim=2) # (T, N, C)
160
-
161
- loss = criterion(log_probs_for_loss, targets_padded, outputs_seq_len_for_ctc, target_lengths)
162
- total_loss += loss.item() * inputs.size(0) # Multiply by batch size for correct average
163
-
164
- # Decode predictions for metrics
165
- decoded_preds = ctc_greedy_decode(output, char_indexer)
166
-
167
- # Reconstruct ground truths from encoded tensors
168
- ground_truths = []
169
- # Loop through each sample in the batch
170
- for i in range(targets_padded.size(0)):
171
- # Extract the actual target sequence for the i-th sample using its length
172
- # Convert to list before passing to char_indexer.decode
173
- ground_truths.append(char_indexer.decode(targets_padded[i, :target_lengths[i]].tolist()))
174
-
175
- all_predictions.extend(decoded_preds)
176
- all_ground_truths.extend(ground_truths)
177
-
178
- avg_loss = total_loss / len(dataloader.dataset)
179
-
180
- # Calculate Character Error Rate (CER)
181
- cer_sum = 0
182
- total_chars = 0
183
- for pred, gt in zip(all_predictions, all_ground_truths):
184
- cer_sum += editdistance.eval(pred, gt)
185
- total_chars += len(gt)
186
- char_error_rate = cer_sum / total_chars if total_chars > 0 else 0.0
187
-
188
- # Calculate Exact Match Accuracy (Word-level Accuracy)
189
- exact_match_accuracy = accuracy_score(all_ground_truths, all_predictions)
190
-
191
- return avg_loss, char_error_rate, exact_match_accuracy
192
-
193
- # --- Training Function ---
194
- def train_ocr_model(model: nn.Module, train_loader: DataLoader,
195
- test_loader: DataLoader, char_indexer: CharIndexer,
196
- epochs: int, device: str, progress_callback=None) -> tuple[nn.Module, dict]:
197
- """
198
- Trains the OCR model using CTC loss.
199
- """
200
- # CTCLoss needs the blank token index
201
- criterion = nn.CTCLoss(blank=char_indexer.blank_token_idx, zero_infinity=True)
202
- optimizer = optim.Adam(model.parameters(), lr=0.001) # Using a fixed LR for now
203
- # Using ReduceLROnPlateau to adjust LR based on test loss (monitor 'min' loss)
204
- scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.8, patience=5) # Removed verbose=True
205
-
206
- model.to(device) # Ensure model is on the correct device
207
- model.train() # Set model to training mode
208
-
209
- training_history = {
210
- 'train_loss': [],
211
- 'test_loss': [],
212
- 'test_cer': [],
213
- 'test_exact_match_accuracy': []
214
- }
215
-
216
- for epoch in range(epochs):
217
- running_loss = 0.0
218
- pbar_train = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} (Train)")
219
- for images, texts_encoded, _, text_lengths in pbar_train:
220
- images = images.to(device)
221
- # Ensure target tensors are on the correct device for CTCLoss calculation
222
- texts_encoded = texts_encoded.to(device)
223
- text_lengths = text_lengths.to(device)
224
-
225
- optimizer.zero_grad() # Clear gradients from previous step
226
- outputs = model(images) # (sequence_length_from_cnn, batch_size, num_classes)
227
-
228
- # `outputs.shape[0]` is the actual sequence length (T) produced by the model.
229
- # CTC loss expects `input_lengths` to be a tensor of shape (batch_size,) with these values.
230
- outputs_seq_len_for_ctc = torch.full(
231
- size=(outputs.shape[1],), # batch_size
232
- fill_value=outputs.shape[0], # actual sequence length (T) from model output
233
- dtype=torch.long,
234
- device=device
235
- )
236
-
237
- # CTC Loss calculation requires log_softmax on the output logits
238
- log_probs_for_loss = F.log_softmax(outputs, dim=2) # (T, N, C)
239
-
240
- # Use outputs_seq_len_for_ctc for the input_lengths argument
241
- loss = criterion(log_probs_for_loss, texts_encoded, outputs_seq_len_for_ctc, text_lengths)
242
- loss.backward() # Backpropagate
243
- optimizer.step() # Update model weights
244
-
245
- running_loss += loss.item() * images.size(0) # Multiply by batch size for correct average
246
- pbar_train.set_postfix(loss=loss.item())
247
-
248
- epoch_train_loss = running_loss / len(train_loader.dataset)
249
- training_history['train_loss'].append(epoch_train_loss)
250
-
251
- # Evaluate on test set using the dedicated function
252
- # Ensure model is in eval mode before calling evaluate_model
253
- model.eval()
254
- test_loss, test_cer, test_exact_match_accuracy = evaluate_model(model, test_loader, char_indexer, device)
255
- training_history['test_loss'].append(test_loss)
256
- training_history['test_cer'].append(test_cer)
257
- training_history['test_exact_match_accuracy'].append(test_exact_match_accuracy)
258
-
259
- # Adjust learning rate based on test loss (this is where scheduler.step() is called)
260
- scheduler.step(test_loss)
261
-
262
- print(f"Epoch {epoch+1}/{epochs}: Train Loss={epoch_train_loss:.4f}, "
263
- f"Test Loss={test_loss:.4f}, Test CER={test_cer:.4f}, Test Exact Match Acc={test_exact_match_accuracy:.4f}")
264
-
265
- if progress_callback:
266
- # Update progress bar with current epoch and key metrics
267
- progress_val = (epoch + 1) / epochs
268
- progress_callback(progress_val, text=f"Epoch {epoch+1}/{epochs} done. Test CER: {test_cer:.4f}, Test Exact Match Acc: {test_exact_match_accuracy:.4f}")
269
-
270
- model.train() # Set model back to training mode after evaluation
271
-
272
- return model, training_history
273
-
274
- def save_ocr_model(model: nn.Module, path: str):
275
- """Saves the state dictionary of the trained OCR model."""
276
- torch.save(model.state_dict(), path)
277
- print(f"OCR model saved to {path}")
278
-
279
- def load_ocr_model(model: nn.Module, path: str):
280
- """
281
- Loads a trained OCR model's state dictionary.
282
- Includes map_location to handle loading models trained on GPU to CPU, and vice versa.
283
- """
284
- model.load_state_dict(torch.load(path, map_location=torch.device('cpu')))
285
- model.eval() # Set to evaluation mode
286
- print(f"OCR model loaded from {path}")
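
A point that is easy to misread in both train_ocr_model and evaluate_model: the input_lengths passed to nn.CTCLoss is not the image width but the time dimension T that the CNN actually produced (output.shape[0]), repeated once per batch item with torch.full. A self-contained illustration of that call, with made-up shapes and labels (blank index 95 assumed from config.py):

import torch
import torch.nn as nn
import torch.nn.functional as F

T, N, C, blank = 20, 2, 96, 95                           # frames, batch, classes, blank index
logits = torch.randn(T, N, C)                            # what the CRNN forward pass returns
log_probs = F.log_softmax(logits, dim=2)

targets = torch.tensor([5, 6, 7, 8, 9])                  # two labels concatenated: lengths 3 and 2
target_lengths = torch.tensor([3, 2])
input_lengths = torch.full((N,), T, dtype=torch.long)    # every sample contributes all T frames

loss = nn.CTCLoss(blank=blank, zero_infinity=True)(log_probs, targets, input_lengths, target_lengths)
print(loss.item())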
 
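evaluate_model reports a corpus-level character error rate: the summed edit distance over all prediction/ground-truth pairs divided by the total number of ground-truth characters, not an average of per-word rates. A small worked example with the editdistance package (names invented for illustration):

import editdistance

predictions   = ["JOHN", "MARY"]
ground_truths = ["JOHN", "MARIE"]

errors = sum(editdistance.eval(p, g) for p, g in zip(predictions, ground_truths))
total_chars = sum(len(g) for g in ground_truths)
print(errors, total_chars, errors / total_chars)   # 2 9 0.2222...  (exact match accuracy would be 0.5)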
1
+ # model_ocr.py
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ import torch.optim as optim
6
+ from torch.utils.data import DataLoader
7
+ from tqdm import tqdm
8
+ from sklearn.metrics import accuracy_score
9
+ import editdistance
10
+
11
+ # Import config and char_indexer
12
+ from config import IMG_HEIGHT, NUM_CLASSES, BLANK_TOKEN
13
+ from data_handler_ocr import CharIndexer
14
+ from utils_ocr import binarize_image, resize_image_for_ocr, normalize_image_for_model
15
+
16
+
17
+ class CNN_Backbone(nn.Module):
18
+ """
19
+ CNN feature extractor for OCR. Designed to produce features suitable for RNN.
20
+ Output feature map should have height 1 after the final pooling/reduction.
21
+ """
22
+ def __init__(self, input_channels=1, output_channels=512):
23
+ super(CNN_Backbone, self).__init__()
24
+ self.cnn = nn.Sequential(
25
+ # First block
26
+ nn.Conv2d(input_channels, 64, kernel_size=3, stride=1, padding=1),
27
+ nn.ReLU(True),
28
+ nn.MaxPool2d(kernel_size=2, stride=2), # H: 32 -> 16, W: W_in -> W_in/2
29
+
30
+ # Second block
31
+ nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
32
+ nn.ReLU(True),
33
+ nn.MaxPool2d(kernel_size=2, stride=2), # H: 16 -> 8, W: W_in/2 -> W_in/4
34
+
35
+ # Third block (with two conv layers)
36
+ nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
37
+ nn.ReLU(True),
38
+ nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
39
+ nn.ReLU(True),
40
+ # This MaxPool2d effectively brings height from 8 to 4, with a small width adjustment due to padding
41
+ nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 1), padding=(0, 1)), # H: 8 -> 4, W: (W/4) -> (W/4 + 1) (approx)
42
+
43
+ # Fourth block
44
+ nn.Conv2d(256, output_channels, kernel_size=3, stride=1, padding=1),
45
+ nn.ReLU(True),
46
+ # This AdaptiveAvgPool2d makes sure the height dimension becomes 1
47
+ # while preserving the width. This is crucial for RNN input.
48
+ nn.AdaptiveAvgPool2d((1, None)) # Output height 1, preserve width
49
+ )
50
+
51
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
52
+ # x: (N, C, H, W) e.g., (B, 1, 32, W_img)
53
+
54
+ # Pass through the CNN layers
55
+ conv_features = self.cnn(x) # Output: (N, cnn_out_channels, 1, W_prime)
56
+
57
+ # Squeeze the height dimension (which is 1)
58
+ # This transforms (N, C_out, 1, W_prime) to (N, C_out, W_prime)
59
+ conv_features = conv_features.squeeze(2)
60
+
61
+ # Permute for RNN input: (sequence_length, batch_size, input_size)
62
+ # This transforms (N, C_out, W_prime) to (W_prime, N, C_out)
63
+ conv_features = conv_features.permute(2, 0, 1)
64
+
65
+ # Return the CNN features, ready for the RNN layer in CRNN
66
+ return conv_features
67
+
68
+ class BidirectionalLSTM(nn.Module):
69
+ """Bidirectional LSTM layer for sequence modeling."""
70
+ def __init__(self, input_size: int, hidden_size: int, num_layers: int, dropout: float = 0.5):
71
+ super(BidirectionalLSTM, self).__init__()
72
+ self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
73
+ bidirectional=True, dropout=dropout, batch_first=False)
74
+ # batch_first=False expects input as (sequence_length, batch_size, input_size)
75
+
76
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
77
+ output, _ = self.lstm(x) # nn.LSTM returns (output, (h_n, c_n)); only the output sequence is kept
78
+ return output
79
+
80
+ class CRNN(nn.Module):
81
+ """
82
+ Convolutional Recurrent Neural Network for OCR.
83
+ Combines CNN for feature extraction, LSTMs for sequence modeling,
84
+ and a final linear layer for character prediction.
85
+ """
86
+ def __init__(self, num_classes: int, cnn_output_channels: int = 512,
87
+ rnn_hidden_size: int = 256, rnn_num_layers: int = 2): # Corrected parameter name
88
+ super(CRNN, self).__init__()
89
+ self.cnn = CNN_Backbone(output_channels=cnn_output_channels)
90
+ # Input to LSTM is the number of channels from the CNN output
91
+ self.rnn = BidirectionalLSTM(cnn_output_channels, rnn_hidden_size, rnn_num_layers) # Corrected usage
92
+ # Output of bidirectional LSTM is hidden_size * 2
93
+ self.fc = nn.Linear(rnn_hidden_size * 2, num_classes) # Final linear layer for classes
94
+
95
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
96
+ # x: (N, C, H, W) e.g., (B, 1, 32, W_img)
97
+
98
+ # 1. Pass through the CNN to extract features
99
+ conv_features = self.cnn(x) # Output: (W_prime, N, C_out) after permute in CNN_Backbone
100
+
101
+ # 2. Pass CNN features through the RNN (LSTM)
102
+ rnn_features = self.rnn(conv_features) # Output: (W_prime, N, rnn_hidden_size * 2)
103
+
104
+ # 3. Pass RNN features through the final fully connected layer
105
+ # Apply the linear layer to each time step independently
106
+ # output will be (W_prime, N, num_classes)
107
+ output = self.fc(rnn_features)
108
+
109
+ return output
110
+
111
+
112
+ # --- Decoding Function ---
113
+ def ctc_greedy_decode(output: torch.Tensor, char_indexer: CharIndexer) -> list[str]:
114
+ """
115
+ Performs greedy decoding on the CTC output.
116
+ output: (sequence_length, batch_size, num_classes) - raw logits
117
+ """
118
+ # Apply log_softmax to get log-probabilities; argmax over them equals argmax over the raw logits
119
+ log_probs = F.log_softmax(output, dim=2)
120
+
121
+ # Permute to (batch_size, sequence_length, num_classes) for argmax along class dim
122
+ predicted_indices = torch.argmax(log_probs.permute(1, 0, 2), dim=2).cpu().numpy()
123
+
124
+ decoded_texts = []
125
+ for seq in predicted_indices:
126
+ # Use char_indexer's decode method, which handles blank removal and duplicate collapse
127
+ decoded_texts.append(char_indexer.decode(seq.tolist()))
128
+ return decoded_texts
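# Toy decode example (a sketch, assuming the usual CTC rule of collapsing repeats before
# dropping blanks, which is what CharIndexer.decode is described as doing above): argmax
# indices [blank, 3, 3, blank, 5, 5, blank] collapse to [blank, 3, blank, 5], and removing
# the blanks leaves the two characters mapped to indices 3 and 5. A doubled letter in a
# label is only recovered when the model emits a blank between the repeats.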
129
+
130
+ # --- Evaluation Function ---
131
+ def evaluate_model(model: nn.Module, dataloader: DataLoader, char_indexer: CharIndexer, device: str):
132
+ model.eval()
133
+ criterion = nn.CTCLoss(blank=char_indexer.blank_token_idx, zero_infinity=True)
134
+ total_loss = 0
135
+ all_predictions = []
136
+ all_ground_truths = []
137
+
138
+ with torch.no_grad():
139
+ for inputs, targets_padded, _, target_lengths in tqdm(dataloader, desc="Evaluating"):
140
+ inputs = inputs.to(device)
141
+ targets_padded = targets_padded.to(device)
142
+ target_lengths_tensor = target_lengths.to(device)
143
+
144
+ output = model(inputs)
145
+
146
+ outputs_seq_len_for_ctc = torch.full(
147
+ size=(output.shape[1],),
148
+ fill_value=output.shape[0],
149
+ dtype=torch.long,
150
+ device=device
151
+ )
152
+
153
+ # CTC Loss calculation requires log_softmax on the output logits
154
+ log_probs_for_loss = F.log_softmax(output, dim=2)
155
+
156
+ # CTCLoss expects targets_padded as a 1D tensor and target_lengths_tensor as corresponding lengths
157
+ loss = criterion(log_probs_for_loss, targets_padded, outputs_seq_len_for_ctc, target_lengths_tensor)
158
+ total_loss += loss.item() * inputs.size(0)
159
+
160
+ decoded_preds = ctc_greedy_decode(output, char_indexer)
161
+ all_predictions.extend(decoded_preds)
162
+
163
+ ground_truths_batch = []
164
+ current_idx_in_concatenated_targets = 0
165
+
166
+ target_lengths_list = target_lengths.cpu().tolist()
167
+
168
+ for i in range(inputs.size(0)):
169
+ length = target_lengths_list[i]
170
+
171
+ current_target_segment = targets_padded[current_idx_in_concatenated_targets : current_idx_in_concatenated_targets + length].tolist()
172
+ ground_truths_batch.append(char_indexer.decode(current_target_segment))
173
+ current_idx_in_concatenated_targets += length
174
+
175
+ all_ground_truths.extend(ground_truths_batch)
176
+
177
+ avg_loss = total_loss / len(dataloader.dataset)
178
+
179
+ # Calculate Character Error Rate (CER)
180
+ cer_sum = 0
181
+ total_chars = 0
182
+ for pred, gt in zip(all_predictions, all_ground_truths):
183
+ cer_sum += editdistance.eval(pred, gt)
184
+ total_chars += len(gt)
185
+ char_error_rate = cer_sum / total_chars if total_chars > 0 else 0.0
186
+
187
+ # Calculate Exact Match Accuracy (Word-level Accuracy)
188
+ exact_match_accuracy = accuracy_score(all_ground_truths, all_predictions)
189
+
190
+ return avg_loss, char_error_rate, exact_match_accuracy
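# Worked CER example (a sketch): for prediction "JON" against ground truth "JOHN",
# editdistance.eval("JON", "JOHN") is 1 and len("JOHN") is 4, so the pair contributes
# 1/4 = 0.25; the reported CER is the summed edit distance over all pairs divided by the
# total number of ground-truth characters. Exact-match accuracy counts the same pair as 0
# because the two strings are not identical.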
191
+
192
+ # --- Training Function ---
193
+ def train_ocr_model(model: nn.Module, train_loader: DataLoader,
194
+ test_loader: DataLoader, char_indexer: CharIndexer,
195
+ epochs: int, device: str, progress_callback=None) -> tuple[nn.Module, dict]:
196
+ """
197
+ Trains the OCR model using CTC loss.
198
+ """
199
+ # CTCLoss needs the blank token index
200
+ criterion = nn.CTCLoss(blank=char_indexer.blank_token_idx, zero_infinity=True)
201
+ optimizer = optim.Adam(model.parameters(), lr=0.001) # Using a fixed LR for now
202
+ # Using ReduceLROnPlateau to adjust LR based on test loss (monitor 'min' loss)
203
+ scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.8, patience=5) # Removed verbose=True
204
+
205
+ model.to(device) # Ensure model is on the correct device
206
+ model.train() # Set model to training mode
207
+
208
+ training_history = {
209
+ 'train_loss': [],
210
+ 'test_loss': [],
211
+ 'test_cer': [],
212
+ 'test_exact_match_accuracy': []
213
+ }
214
+
215
+ for epoch in range(epochs):
216
+ running_loss = 0.0
217
+ pbar_train = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} (Train)")
218
+ for images, texts_encoded, _, text_lengths in pbar_train:
219
+ images = images.to(device)
220
+ # Ensure target tensors are on the correct device for CTCLoss calculation
221
+ texts_encoded = texts_encoded.to(device)
222
+ text_lengths = text_lengths.to(device)
223
+
224
+ optimizer.zero_grad() # Clear gradients from previous step
225
+ outputs = model(images) # (sequence_length_from_cnn, batch_size, num_classes)
226
+
227
+ # `outputs.shape[0]` is the actual sequence length (T) produced by the model.
228
+ # CTC loss expects `input_lengths` to be a tensor of shape (batch_size,) with these values.
229
+ outputs_seq_len_for_ctc = torch.full(
230
+ size=(outputs.shape[1],), # batch_size
231
+ fill_value=outputs.shape[0], # actual sequence length (T) from model output
232
+ dtype=torch.long,
233
+ device=device
234
+ )
235
+
236
+ # CTC Loss calculation requires log_softmax on the output logits
237
+ log_probs_for_loss = F.log_softmax(outputs, dim=2) # (T, N, C)
238
+
239
+ # Use outputs_seq_len_for_ctc for the input_lengths argument
240
+ loss = criterion(log_probs_for_loss, texts_encoded, outputs_seq_len_for_ctc, text_lengths)
241
+ loss.backward() # Backpropagate
242
+ optimizer.step() # Update model weights
243
+
244
+ running_loss += loss.item() * images.size(0) # Multiply by batch size for correct average
245
+ pbar_train.set_postfix(loss=loss.item())
246
+
247
+ epoch_train_loss = running_loss / len(train_loader.dataset)
248
+ training_history['train_loss'].append(epoch_train_loss)
249
+
250
+ # Evaluate on test set using the dedicated function
251
+ # Ensure model is in eval mode before calling evaluate_model
252
+ model.eval()
253
+ test_loss, test_cer, test_exact_match_accuracy = evaluate_model(model, test_loader, char_indexer, device)
254
+ training_history['test_loss'].append(test_loss)
255
+ training_history['test_cer'].append(test_cer)
256
+ training_history['test_exact_match_accuracy'].append(test_exact_match_accuracy)
257
+
258
+ # Adjust learning rate based on test loss
259
+ scheduler.step(test_loss)
260
+
261
+ print(f"Epoch {epoch+1}/{epochs}: Train Loss={epoch_train_loss:.4f}, "
262
+ f"Test Loss={test_loss:.4f}, Test CER={test_cer:.4f}, Test Exact Match Acc={test_exact_match_accuracy:.4f}")
263
+
264
+ if progress_callback:
265
+ # Update progress bar with current epoch and key metrics
266
+ progress_val = (epoch + 1) / epochs
267
+ progress_callback(progress_val, text=f"Epoch {epoch+1}/{epochs} done. Test CER: {test_cer:.4f}, Test Exact Match Acc: {test_exact_match_accuracy:.4f}")
268
+
269
+ model.train() # Set model back to training mode after evaluation
270
+
271
+ return model, training_history
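# Minimal training-call sketch (assumptions: train_loader / test_loader come from
# create_ocr_dataloaders in data_handler_ocr.py and char_indexer is built as in app.py;
# NUM_CLASSES, NUM_EPOCHS and MODEL_SAVE_PATH are the config.py values):
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   model = CRNN(num_classes=NUM_CLASSES)
#   model, history = train_ocr_model(model, train_loader, test_loader, char_indexer,
#                                    epochs=NUM_EPOCHS, device=device)
#   save_ocr_model(model, MODEL_SAVE_PATH)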
272
+
273
+ def save_ocr_model(model: nn.Module, path: str):
274
+ """Saves the state dictionary of the trained OCR model."""
275
+ torch.save(model.state_dict(), path)
276
+ print(f"OCR model saved to {path}")
277
+
278
+ def load_ocr_model(model: nn.Module, path: str):
279
+ """
280
+ Loads a trained OCR model's state dictionary.
281
+ Includes map_location to handle loading models trained on GPU to CPU, and vice versa.
282
+ """
283
+ model.load_state_dict(torch.load(path, map_location=torch.device('cpu'))) # Always load to CPU first
284
+ model.eval() # Set to evaluation mode
285
+ print(f"OCR model loaded from {path}")
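A minimal inference sketch for this file (a sketch only: image_tensor is assumed to be a
preprocessed batch of shape (1, 1, IMG_HEIGHT, W), and char_indexer an already-built
CharIndexer, as constructed in app.py):

    import torch
    from config import NUM_CLASSES, MODEL_SAVE_PATH
    from model_ocr import CRNN, load_ocr_model, ctc_greedy_decode

    model = CRNN(num_classes=NUM_CLASSES, cnn_output_channels=512,
                 rnn_hidden_size=256, rnn_num_layers=2)
    load_ocr_model(model, MODEL_SAVE_PATH)      # loads weights onto CPU and calls model.eval()

    with torch.no_grad():
        logits = model(image_tensor)            # (T, 1, NUM_CLASSES)
    predicted_text = ctc_greedy_decode(logits, char_indexer)[0]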
 
utils_ocr.py CHANGED
@@ -1,184 +1,83 @@
1
- <<<<<<< HEAD
2
  #utils_ocr.py
3
 
4
  import cv2
5
- from matplotlib.pylab import f
6
  import numpy as np
7
  from PIL import Image
8
  import torch
9
- from torchvision import transforms
 
10
 
11
- # --- Image Preprocessing for OCR ---
 
 
 
12
 
13
  def load_image_as_grayscale(image_path: str) -> Image.Image:
14
  """Loads an image from path and converts it to grayscale PIL Image."""
15
- # Use PIL for robust image loading and conversion to grayscale 'L' mode
16
- img = Image.open(image_path).convert('L')
17
- return img
18
-
19
- def binarize_image(image_pil: Image.Image) -> Image.Image:
20
- """Binarizes a grayscale PIL Image (black and white)."""
21
- # Convert PIL to OpenCV format (numpy array)
22
- img_np = np.array(image_pil)
23
- # Apply Otsu's thresholding for adaptive binarization
24
- _, img_bin = cv2.threshold(img_np, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
25
- # Invert colors: Handwritten text usually dark on light. OCR models often
26
- # prefer light text on dark background. Check your training data's style.
27
- # This example assumes dark text on light background and inverts to white text on black.
28
- img_bin = 255 - img_bin
29
- return Image.fromarray(img_bin)
30
 
31
- def resize_image_for_ocr(image_pil: Image.Image, target_height: int) -> Image.Image:
32
  """
33
- Resizes a PIL Image to a target height while maintaining aspect ratio.
34
- Pads width if necessary to avoid distortion.
35
  """
36
- original_width, original_height = image_pil.size
37
- # Calculate new width based on target height and original aspect ratio
38
- new_width = int(original_width * (target_height / original_height))
39
- resized_img = image_pil.resize((new_width, target_height), Image.LANCZOS)
40
- return resized_img
 
 
 
41
 
42
- def normalize_image_for_model(image_pil: Image.Image) -> torch.Tensor:
43
  """
44
- Converts a PIL Image to a PyTorch Tensor and normalizes pixel values.
 
45
  """
46
- # Convert to tensor (scales to 0-1 automatically)
47
- tensor_transform = transforms.ToTensor()
48
- img_tensor = tensor_transform(image_pil)
49
- # For grayscale images, mean and std are single values.
50
- # Adjust normalization values if your training data uses different ones.
51
- img_tensor = transforms.Normalize((0.5,), (0.5,))(img_tensor) # Normalize to [-1, 1]
52
- return img_tensor
 
 
 
 
 
 
 
53
 
54
- def preprocess_user_image_for_ocr(uploaded_image_pil: Image.Image, target_height: int) -> torch.Tensor:
55
  """
56
- Combines all preprocessing steps for a single user-uploaded image
57
- to prepare it for the OCR model.
 
58
  """
59
- # Ensure it's grayscale
60
- img_gray = uploaded_image_pil.convert('L')
61
-
62
- # Binarize
63
- img_bin = binarize_image(img_gray)
64
-
65
- # Resize (maintain aspect ratio)
66
- img_resized = resize_image_for_ocr(img_bin, target_height)
67
-
68
- # Normalize and convert to tensor
69
- img_tensor = normalize_image_for_model(img_resized)
70
-
71
- # Add batch dimension: (C, H, W) -> (1, C, H, W)
72
- img_tensor = img_tensor.unsqueeze(0)
73
-
74
  return img_tensor
75
 
76
- def pad_image_tensor(image_tensor: torch.Tensor, max_width: int) -> torch.Tensor:
77
  """
78
- Pads a single image tensor to a max_width with zeros.
79
- Input tensor shape: (C, H, W)
80
- Output tensor shape: (C, H, max_width)
81
  """
82
- C, H, W = image_tensor.shape
83
- if W > max_width:
84
- # If image is wider than max_width, you might want to crop or resize it.
85
- # For this example, we'll just return a warning or clip.
86
- # A more robust solution might split text lines or use a different resizing strategy.
87
- print(f"Warning: Image width {W} exceeds max_width {max_width}. Cropping.")
88
- return image_tensor[:, :, :max_width] # Simple cropping
89
- padding = max_width - W
90
- # Pad on the right (P_left, P_right, P_top, P_bottom)
91
- padded_tensor = f.pad(image_tensor, (0, padding), 'constant', 0)
92
- =======
93
- #utils_ocr.py
94
-
95
- import cv2
96
- from matplotlib.pylab import f
97
- import numpy as np
98
- from PIL import Image
99
- import torch
100
- from torchvision import transforms
101
-
102
- # --- Image Preprocessing for OCR ---
103
-
104
- def load_image_as_grayscale(image_path: str) -> Image.Image:
105
- """Loads an image from path and converts it to grayscale PIL Image."""
106
- # Use PIL for robust image loading and conversion to grayscale 'L' mode
107
- img = Image.open(image_path).convert('L')
108
- return img
109
-
110
- def binarize_image(image_pil: Image.Image) -> Image.Image:
111
- """Binarizes a grayscale PIL Image (black and white)."""
112
- # Convert PIL to OpenCV format (numpy array)
113
- img_np = np.array(image_pil)
114
- # Apply Otsu's thresholding for adaptive binarization
115
- _, img_bin = cv2.threshold(img_np, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
116
- # Invert colors: Handwritten text usually dark on light. OCR models often
117
- # prefer light text on dark background. Check your training data's style.
118
- # This example assumes dark text on light background and inverts to white text on black.
119
- img_bin = 255 - img_bin
120
- return Image.fromarray(img_bin)
121
-
122
- def resize_image_for_ocr(image_pil: Image.Image, target_height: int) -> Image.Image:
123
- """
124
- Resizes a PIL Image to a target height while maintaining aspect ratio.
125
- Pads width if necessary to avoid distortion.
126
- """
127
- original_width, original_height = image_pil.size
128
- # Calculate new width based on target height and original aspect ratio
129
- new_width = int(original_width * (target_height / original_height))
130
- resized_img = image_pil.resize((new_width, target_height), Image.LANCZOS)
131
- return resized_img
132
-
133
- def normalize_image_for_model(image_pil: Image.Image) -> torch.Tensor:
134
- """
135
- Converts a PIL Image to a PyTorch Tensor and normalizes pixel values.
136
- """
137
- # Convert to tensor (scales to 0-1 automatically)
138
- tensor_transform = transforms.ToTensor()
139
- img_tensor = tensor_transform(image_pil)
140
- # For grayscale images, mean and std are single values.
141
- # Adjust normalization values if your training data uses different ones.
142
- img_tensor = transforms.Normalize((0.5,), (0.5,))(img_tensor) # Normalize to [-1, 1]
143
- return img_tensor
144
-
145
- def preprocess_user_image_for_ocr(uploaded_image_pil: Image.Image, target_height: int) -> torch.Tensor:
146
- """
147
- Combines all preprocessing steps for a single user-uploaded image
148
- to prepare it for the OCR model.
149
- """
150
- # Ensure it's grayscale
151
- img_gray = uploaded_image_pil.convert('L')
152
-
153
- # Binarize
154
- img_bin = binarize_image(img_gray)
155
-
156
- # Resize (maintain aspect ratio)
157
- img_resized = resize_image_for_ocr(img_bin, target_height)
158
-
159
- # Normalize and convert to tensor
160
- img_tensor = normalize_image_for_model(img_resized)
161
-
162
- # Add batch dimension: (C, H, W) -> (1, C, H, W)
163
- img_tensor = img_tensor.unsqueeze(0)
164
-
165
- return img_tensor
166
-
167
- def pad_image_tensor(image_tensor: torch.Tensor, max_width: int) -> torch.Tensor:
168
- """
169
- Pads a single image tensor to a max_width with zeros.
170
- Input tensor shape: (C, H, W)
171
- Output tensor shape: (C, H, max_width)
172
- """
173
- C, H, W = image_tensor.shape
174
- if W > max_width:
175
- # If image is wider than max_width, you might want to crop or resize it.
176
- # For this example, we'll just return a warning or clip.
177
- # A more robust solution might split text lines or use a different resizing strategy.
178
- print(f"Warning: Image width {W} exceeds max_width {max_width}. Cropping.")
179
- return image_tensor[:, :, :max_width] # Simple cropping
180
- padding = max_width - W
181
- # Pad on the right (P_left, P_right, P_top, P_bottom)
182
- padded_tensor = f.pad(image_tensor, (0, padding), 'constant', 0)
183
- >>>>>>> ee59e5b21399d8b323cff452a961ea2fd6c65308
184
- return padded_tensor
 
 
1
  #utils_ocr.py
2
 
3
  import cv2
 
4
  import numpy as np
5
  from PIL import Image
6
  import torch
7
+ import torchvision.transforms as transforms
8
+ import os
9
 
10
+ # Import config for IMG_HEIGHT and MAX_IMG_WIDTH
11
+ from config import IMG_HEIGHT, MAX_IMG_WIDTH
12
+
13
+ # --- Image Preprocessing Functions ---
14
 
15
  def load_image_as_grayscale(image_path: str) -> Image.Image:
16
  """Loads an image from path and converts it to grayscale PIL Image."""
17
+ if not os.path.exists(image_path):
18
+ raise FileNotFoundError(f"Image not found at: {image_path}")
19
+ return Image.open(image_path).convert('L') # 'L' for grayscale
20
 
21
+ def binarize_image(img: Image.Image) -> Image.Image:
22
  """
23
+ Binarizes a grayscale PIL Image using Otsu's method.
24
+ Returns a PIL Image.
25
  """
26
+ # Convert PIL Image to OpenCV format (numpy array)
27
+ img_np = np.array(img)
28
+
29
+ # Apply Otsu's binarization
30
+ _, binary_img = cv2.threshold(img_np, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
31
+
32
+ # Convert back to PIL Image
33
+ return Image.fromarray(binary_img)
34
 
35
+ def resize_image_for_ocr(img: Image.Image, img_height: int) -> Image.Image:
36
  """
37
+ Resizes a PIL Image to a fixed height while maintaining aspect ratio.
38
+ Also ensures the width does not exceed MAX_IMG_WIDTH.
39
  """
40
+ width, height = img.size
41
+
42
+ # Calculate new width based on target height, maintaining aspect ratio
43
+ new_width = int(width * (img_height / height))
44
+
45
+ if new_width > MAX_IMG_WIDTH:
46
+ new_width = MAX_IMG_WIDTH
47
+ resized_img = img.resize((new_width, img_height), Image.Resampling.LANCZOS)
48
+ if resized_img.width > MAX_IMG_WIDTH:
49
+ # Keep only the leftmost MAX_IMG_WIDTH pixels (defensive; the clamp above already bounds the width)
50
+ resized_img = resized_img.crop((0, 0, MAX_IMG_WIDTH, img_height))
51
+ return resized_img
52
+
53
+ return img.resize((new_width, img_height), Image.Resampling.LANCZOS) # Use LANCZOS for high-quality downsampling
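# Worked example (a sketch, assuming MAX_IMG_WIDTH is at least 128): a 200x50 crop with
# img_height = 32 gives new_width = int(200 * (32 / 50)) = 128, so the result is 128x32.
# The width is clamped only when the computed value exceeds MAX_IMG_WIDTH; the crop branch
# above is defensive and is not expected to trigger once the width has been clamped.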
54
 
55
+ def normalize_image_for_model(img_tensor: torch.Tensor) -> torch.Tensor:
56
  """
57
+ Normalizes a torch.Tensor image (grayscale) for input into the model.
58
+ Puts pixel values in range [-1, 1].
59
+ Assumes image is already a torch.Tensor with values in [0, 1] (e.g., after ToTensor).
60
  """
61
+ # Formula: (pixel_value - mean) / std_dev
62
+ # For [0, 1] to [-1, 1], mean = 0.5, std_dev = 0.5
63
+ img_tensor = (img_tensor - 0.5) / 0.5
64
  return img_tensor
65
 
66
+ def preprocess_user_image_for_ocr(image_pil: Image.Image, target_height: int) -> torch.Tensor:
67
  """
68
+ Applies all necessary preprocessing steps to a user-uploaded PIL Image
69
+ to prepare it for the OCR model.
 
70
  """
71
+ # Define a transformation pipeline similar to the dataset, but including ToTensor
72
+ transform_pipeline = transforms.Compose([
73
+ transforms.Lambda(lambda img: binarize_image(img)), # PIL Image -> PIL Image
74
+ # Use the updated resize function that also handles MAX_IMG_WIDTH
75
+ transforms.Lambda(lambda img: resize_image_for_ocr(img, target_height)), # PIL Image -> PIL Image
76
+ transforms.ToTensor(), # PIL Image -> Tensor [0, 1]
77
+ transforms.Lambda(normalize_image_for_model) # Tensor [0, 1] -> Tensor [-1, 1]
78
+ ])
79
+
80
+ processed_image = transform_pipeline(image_pil)
81
+
82
+ # Add a batch dimension (C, H, W) -> (1, C, H, W) for single image inference
83
+ return processed_image.unsqueeze(0)
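A minimal end-to-end usage sketch tying utils_ocr.py to the model (a sketch only: uploaded_pil
is assumed to be a user-supplied PIL.Image, and model / char_indexer the loaded CRNN and
CharIndexer, as in app.py):

    import torch
    from config import IMG_HEIGHT
    from utils_ocr import preprocess_user_image_for_ocr
    from model_ocr import ctc_greedy_decode

    # binarize_image expects a single-channel image, so convert to grayscale first
    image_tensor = preprocess_user_image_for_ocr(uploaded_pil.convert('L'), IMG_HEIGHT)

    model.eval()
    with torch.no_grad():
        logits = model(image_tensor)            # (T, 1, num_classes)
    predicted_text = ctc_greedy_decode(logits, char_indexer)[0]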