Spaces:
Build error
Initial update of files
- LICENSE +201 -0
- app.py +426 -0
- config.py +112 -0
- data_handler_ocr.py +270 -0
- model_ocr.py +584 -0
- requirements.txt +32 -0
- utils_ocr.py +184 -0
LICENSE
ADDED
@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
app.py
ADDED
@@ -0,0 +1,426 @@
# app.py

import streamlit as st
import pandas as pd
import numpy as np
from PIL import Image
import torch
import torch.nn.functional as F  # F for log_softmax in inference
import torchvision.transforms as transforms
import os
import traceback  # For detailed error logging

# Import custom modules
from config import CHARS, BLANK_TOKEN, IMG_HEIGHT, TRAIN_CSV_PATH, TEST_CSV_PATH, \
    TRAIN_IMAGES_DIR, TEST_IMAGES_DIR, MODEL_SAVE_PATH, NUM_CLASSES, NUM_EPOCHS, BATCH_SIZE
from data_handler_ocr import CharIndexer, OCRDataset
from model_ocr import CRNN, train_ocr_model, save_ocr_model, load_ocr_model, ctc_greedy_decode
from utils_ocr import preprocess_user_image_for_ocr

# --- Streamlit App Setup ---
st.set_page_config(page_title="Handwritten Name Recognizer", layout="centered")

st.title("📝 Handwritten Name Recognition (OCR)")
st.markdown("""
This application uses a Convolutional Recurrent Neural Network (CRNN) to perform
Optical Character Recognition (OCR) on handwritten names. You can upload an image
of a handwritten name for prediction or train a new model using the provided dataset.

**Note:** Training a robust OCR model can be time-consuming.
""")

# --- Initialize CharIndexer ---
# The CHARS variable should contain all possible characters the model can recognize.
# Make sure it is comprehensive for your dataset.
char_indexer = CharIndexer(CHARS, BLANK_TOKEN)
# For robustness, always use char_indexer.num_classes.
# If NUM_CLASSES from config is used to initialize CRNN, ensure it matches char_indexer.num_classes.

# --- Model Loading / Initialization ---
@st.cache_resource  # Cache the model to prevent reloading on every rerun
def get_and_load_ocr_model_cached(num_classes, model_path):
    """
    Initializes the OCR model and attempts to load a pre-trained checkpoint.
    If no pre-trained model exists, a new model instance is returned.
    """
    model_instance = CRNN(num_classes=num_classes, cnn_output_channels=512, rnn_hidden_size=256, rnn_num_layers=2)

    if os.path.exists(model_path):
        st.sidebar.info("Loading pre-trained OCR model...")
        try:
            # Load model to CPU first, then move to device
            model_instance.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
            st.sidebar.success("OCR model loaded successfully!")
        except Exception as e:
            st.sidebar.error(f"Error loading model: {e}. A new model will be initialized.")
            # If loading fails, re-initialize an untrained model
            model_instance = CRNN(num_classes=num_classes, cnn_output_channels=512, rnn_hidden_size=256, rnn_num_layers=2)
    else:
        st.sidebar.warning("No pre-trained OCR model found. Please train a model using the sidebar option.")

    return model_instance

# Get the model instance
ocr_model = get_and_load_ocr_model_cached(char_indexer.num_classes, MODEL_SAVE_PATH)
# Determine the device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ocr_model.to(device)
ocr_model.eval()  # Set model to evaluation mode for inference by default

# --- Sidebar for Model Training ---
st.sidebar.header("Model Training (Optional)")
st.sidebar.markdown("If you want to train a new model or no model is found:")

# Initialize Streamlit widgets outside the button block
training_progress_bar = st.sidebar.empty()  # Placeholder for progress bar
status_text = st.sidebar.empty()  # Placeholder for status messages

if st.sidebar.button("📊 Train New OCR Model"):
    # Clear previous messages/widgets if the button is clicked again
    training_progress_bar.empty()
    status_text.empty()

    # Check for existence of CSVs and image directories
    if not os.path.exists(TRAIN_CSV_PATH) or not os.path.exists(TEST_CSV_PATH) or \
       not os.path.isdir(TRAIN_IMAGES_DIR) or not os.path.isdir(TEST_IMAGES_DIR):
        status_text.error(f"""Dataset files or image directories not found.
        Please ensure '{TRAIN_CSV_PATH}', '{TEST_CSV_PATH}', and directories '{TRAIN_IMAGES_DIR}'
        and '{TEST_IMAGES_DIR}' exist. Refer to your project structure.""")
    else:
        status_text.write(f"Training a new CRNN model for {NUM_EPOCHS} epochs. This will take significant time...")

        training_progress_bar_instance = training_progress_bar.progress(0.0, text="Training in progress. Please wait.")

        try:
            train_df = pd.read_csv(TRAIN_CSV_PATH, delimiter=';', names=['FILENAME', 'IDENTITY'], header=None)
            test_df = pd.read_csv(TEST_CSV_PATH, delimiter=';', names=['FILENAME', 'IDENTITY'], header=None)

            # Define standard image transforms for consistency
            train_transform = transforms.Compose([
                transforms.Resize((IMG_HEIGHT, 100)),  # Resize to fixed height; width fixed at 100 (adjust as needed for variable width)
                transforms.ToTensor(),  # Converts PIL Image (H, W) to tensor (C, H, W), normalized to [0, 1]
            ])
            test_transform = transforms.Compose([
                transforms.Resize((IMG_HEIGHT, 100)),  # Same transformation as train
                transforms.ToTensor(),
            ])

            # Create dataset instances
            train_dataset = OCRDataset(dataframe=train_df, char_indexer=char_indexer, image_dir=TRAIN_IMAGES_DIR, transform=train_transform)
            test_dataset = OCRDataset(dataframe=test_df, char_indexer=char_indexer, image_dir=TEST_IMAGES_DIR, transform=test_transform)

            # Create DataLoader instances
            train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)  # num_workers=0 for Windows
            test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

            # Train the model, passing the progress callback
            trained_ocr_model, training_history = train_ocr_model(
                ocr_model,  # Pass the initialized model instance
                train_loader,
                test_loader,
                char_indexer,  # Pass char_indexer for CER calculation
                epochs=NUM_EPOCHS,
                device=device,
                progress_callback=training_progress_bar_instance.progress  # Pass the instance's progress method
            )

            # Ensure the directory for saving the model exists
            os.makedirs(os.path.dirname(MODEL_SAVE_PATH), exist_ok=True)
            save_ocr_model(trained_ocr_model, MODEL_SAVE_PATH)
            status_text.success(f"Model training complete and saved to `{MODEL_SAVE_PATH}`!")

            # Display training history charts
            st.sidebar.subheader("Training History Plots")

            history_df = pd.DataFrame({
                'Epoch': range(1, len(training_history['train_loss']) + 1),
                'Train Loss': training_history['train_loss'],
                'Test Loss': training_history['test_loss'],
                'Test CER (%)': [cer * 100 for cer in training_history['test_cer']],  # Convert CER to percentage for display
                'Test Exact Match Accuracy (%)': [acc * 100 for acc in training_history['test_exact_match_accuracy']]  # Convert to percentage
            })

            # Plot 1: Training and Test Loss
            st.sidebar.markdown("**Loss over Epochs**")
            st.sidebar.line_chart(
                history_df.set_index('Epoch')[['Train Loss', 'Test Loss']]
            )
            st.sidebar.caption("Lower loss indicates better model performance.")

            # Plot 2: Character Error Rate (CER)
            st.sidebar.markdown("**Character Error Rate (CER) over Epochs**")
            st.sidebar.line_chart(
                history_df.set_index('Epoch')[['Test CER (%)']]
            )
            st.sidebar.caption("Lower CER indicates fewer character errors (0% is perfect).")

            # Plot 3: Exact Match Accuracy
            st.sidebar.markdown("**Exact Match Accuracy over Epochs**")
            st.sidebar.line_chart(
                history_df.set_index('Epoch')[['Test Exact Match Accuracy (%)']]
            )
            st.sidebar.caption("Higher exact match accuracy indicates more perfectly recognized names.")

            # Update the global model instance to the newly trained one for immediate inference
            ocr_model = trained_ocr_model
            ocr_model.eval()

        except Exception as e:
            status_text.error(f"An error occurred during training: {e}")
            st.sidebar.text(traceback.format_exc())  # Show full traceback for debugging

# --- Main Content: Name Prediction ---
st.header("Predict Your Handwritten Name")
st.markdown("Upload a clear image of a single handwritten name or word.")

uploaded_file = st.file_uploader("🖼️ Choose an image...", type=["png", "jpg", "jpeg"])

if uploaded_file is not None:
    try:
        # Open the uploaded image
        image_pil = Image.open(uploaded_file).convert('L')  # Ensure grayscale
        st.image(image_pil, caption="Uploaded Image", use_column_width=True)
        st.write("---")
        st.write("Processing and Recognizing...")

        # Preprocess the image for the model using the utils_ocr helper
        processed_image_tensor = preprocess_user_image_for_ocr(image_pil, IMG_HEIGHT).to(device)

        # Make prediction
        ocr_model.eval()  # Ensure model is in evaluation mode
        with torch.no_grad():  # Disable gradient calculation for inference
            output = ocr_model(processed_image_tensor)  # (sequence_length, batch_size, num_classes)

        # ctc_greedy_decode expects (sequence_length, batch_size, num_classes).
        # It returns a list of strings, so take the first element for single-image inference.
        predicted_texts = ctc_greedy_decode(output, char_indexer)
        predicted_text = predicted_texts[0]  # Get the first (and only) prediction

        st.success(f"Recognized Text: **{predicted_text}**")

    except Exception as e:
        st.error(f"Error processing image or recognizing text: {e}")
        st.info("💡 **Tips for best results:**\n"
                "- Ensure the handwritten text is clear and on a clean background.\n"
                "- Only include one name/word per image.\n"
                "- The model is trained on specific characters. Unusual symbols might not be recognized.")
        st.text(traceback.format_exc())

st.markdown("""
---
*Built using Streamlit, PyTorch, OpenCV, and EditDistance ©2025 by MFT*
""")
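app.py leans on ctc_greedy_decode from model_ocr.py, whose body is not visible in this diff. As a rough, illustrative sketch only (not the repository's actual function), greedy CTC decoding of a (sequence_length, batch_size, num_classes) tensor is an argmax per time step followed by the CTC collapse rule; the char_indexer below is assumed to expose blank_token_idx and idx_to_char the way CharIndexer in data_handler_ocr.py does.

import torch

# Illustrative greedy CTC decoder; the real ctc_greedy_decode in model_ocr.py may differ.
def ctc_greedy_decode_sketch(output: torch.Tensor, char_indexer) -> list[str]:
    best = output.argmax(dim=2).transpose(0, 1)  # (T, N) -> (N, T): best class per time step
    texts = []
    for seq in best.tolist():
        chars, prev = [], None
        for idx in seq:
            # CTC collapse rule: drop blanks and repeated consecutive indices
            if idx != char_indexer.blank_token_idx and idx != prev:
                chars.append(char_indexer.idx_to_char[idx])
            prev = idx
        texts.append("".join(chars))
    return texts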
config.py
ADDED
@@ -0,0 +1,112 @@
# config.py

import os

# --- Paths ---
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(BASE_DIR, 'data')
MODELS_DIR = os.path.join(BASE_DIR, 'models')

TRAIN_IMAGES_DIR = os.path.join(DATA_DIR, 'images', 'train')
TEST_IMAGES_DIR = os.path.join(DATA_DIR, 'images', 'test')

TRAIN_CSV_PATH = os.path.join(DATA_DIR, 'train.csv')
TEST_CSV_PATH = os.path.join(DATA_DIR, 'test.csv')

MODEL_SAVE_PATH = os.path.join(MODELS_DIR, 'handwritten_name_ocr_model.pth')

# --- Character Set and OCR Configuration ---
# This character set MUST cover all characters present in your dataset.
# Add any special characters if needed.
# The order here is crucial, as it defines the indices of the characters.
CHARS = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"

# Define the character for the blank token. It MUST NOT be in CHARS.
BLANK_TOKEN_SYMBOL = 'Þ'

# Construct the full vocabulary string. It is conventional to put the blank token last.
# This VOCABULARY string is what you pass to CharIndexer.
VOCABULARY = CHARS + BLANK_TOKEN_SYMBOL

# NUM_CLASSES is the total number of unique symbols in the vocabulary, including the blank.
NUM_CLASSES = len(VOCABULARY)

# BLANK_TOKEN is the actual index of the blank symbol within VOCABULARY.
# Since we appended it last, its index equals len(CHARS).
BLANK_TOKEN = VOCABULARY.find(BLANK_TOKEN_SYMBOL)

# --- Sanity Checks (Highly Recommended) ---
if BLANK_TOKEN == -1:
    raise ValueError(f"Error: BLANK_TOKEN_SYMBOL '{BLANK_TOKEN_SYMBOL}' not found in VOCABULARY. Check config.py definitions.")
if BLANK_TOKEN >= NUM_CLASSES:
    raise ValueError(f"Error: BLANK_TOKEN index ({BLANK_TOKEN}) must be less than NUM_CLASSES ({NUM_CLASSES}).")

print(f"Config Loaded: NUM_CLASSES={NUM_CLASSES}, BLANK_TOKEN_INDEX={BLANK_TOKEN}")
print(f"Vocabulary Length: {len(VOCABULARY)}")
print(f"Blank Symbol: '{BLANK_TOKEN_SYMBOL}' at index {BLANK_TOKEN}")


# --- Image Preprocessing Parameters ---
IMG_HEIGHT = 32

# --- Training Parameters ---
BATCH_SIZE = 64
LEARNING_RATE = 0.001
NUM_EPOCHS = 3
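Assuming the CHARS string above (the 95 printable ASCII characters from space through '~'), the derived constants work out to NUM_CLASSES = 96 with the blank at index 95, which also matches the len(chars) + 1 that CharIndexer computes in data_handler_ocr.py. A quick stand-alone check, not part of the repository:

CHARS = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"
BLANK_TOKEN_SYMBOL = 'Þ'
VOCABULARY = CHARS + BLANK_TOKEN_SYMBOL

assert len(CHARS) == 95                           # printable ASCII, space through '~'
assert len(VOCABULARY) == 96                      # NUM_CLASSES, blank included
assert VOCABULARY.find(BLANK_TOKEN_SYMBOL) == 95  # BLANK_TOKEN index, last position
assert BLANK_TOKEN_SYMBOL not in CHARS            # blank must not collide with real characters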
data_handler_ocr.py
ADDED
@@ -0,0 +1,270 @@
1 |
+
<<<<<<< HEAD
|
2 |
+
#data_handler_ocr.py
|
3 |
+
|
4 |
+
import pandas as pd
|
5 |
+
import torch
|
6 |
+
from torch.utils.data import Dataset, DataLoader
|
7 |
+
from torchvision import transforms
|
8 |
+
import os
|
9 |
+
from PIL import Image
|
10 |
+
import numpy as np
|
11 |
+
import torch.nn.functional as F
|
12 |
+
|
13 |
+
# Import utility functions and config
|
14 |
+
from config import CHARS, BLANK_TOKEN, IMG_HEIGHT, TRAIN_IMAGES_DIR, TEST_IMAGES_DIR
|
15 |
+
from utils_ocr import load_image_as_grayscale, binarize_image, resize_image_for_ocr, normalize_image_for_model
|
16 |
+
|
17 |
+
class CharIndexer:
|
18 |
+
"""Manages character-to-index and index-to-character mappings."""
|
19 |
+
def __init__(self, chars: str, blank_token: str):
|
20 |
+
self.char_to_idx = {char: i for i, char in enumerate(chars)}
|
21 |
+
self.idx_to_char = {i: char for i, char in enumerate(chars)}
|
22 |
+
self.blank_token_idx = len(chars) # Index for the blank token
|
23 |
+
self.idx_to_char[self.blank_token_idx] = blank_token # Add blank token to idx_to_char
|
24 |
+
self.num_classes = len(chars) + 1 # Total classes including blank
|
25 |
+
|
26 |
+
def encode(self, text: str) -> list[int]:
|
27 |
+
"""Converts a text string to a list of integer indices."""
|
28 |
+
return [self.char_to_idx[char] for char in text]
|
29 |
+
|
30 |
+
def decode(self, indices: list[int]) -> str:
|
31 |
+
"""Converts a list of integer indices back to a text string."""
|
32 |
+
# CTC decoding often produces repeated characters and blank tokens.
|
33 |
+
# This simple decoder removes blanks and duplicates.
|
34 |
+
decoded_text = []
|
35 |
+
for i, idx in enumerate(indices):
|
36 |
+
if idx == self.blank_token_idx:
|
37 |
+
continue
|
38 |
+
# Remove consecutive duplicates
|
39 |
+
if i > 0 and indices[i-1] == idx:
|
40 |
+
continue
|
41 |
+
decoded_text.append(self.idx_to_char[idx])
|
42 |
+
return "".join(decoded_text)
|
43 |
+
|
44 |
+
class OCRDataset(Dataset):
|
45 |
+
"""
|
46 |
+
Custom PyTorch Dataset for the Handwritten Name Recognition task.
|
47 |
+
Loads images and their corresponding text labels.
|
48 |
+
"""
|
49 |
+
def __init__(self, dataframe: pd.DataFrame, char_indexer: CharIndexer, image_dir: str, transform=None):
|
50 |
+
"""
|
51 |
+
Initializes the OCR Dataset.
|
52 |
+
Args:
|
53 |
+
dataframe (pd.DataFrame): A DataFrame containing 'image_path' and 'label' columns.
|
54 |
+
char_indexer (CharIndexer): An instance of CharIndexer for character encoding.
|
55 |
+
transform (callable, optional): Optional transform to be applied on an image.
|
56 |
+
"""
|
57 |
+
self.data = dataframe
|
58 |
+
self.char_indexer = char_indexer
|
59 |
+
self.image_dir = image_dir
|
60 |
+
self.transform = transform
|
61 |
+
|
62 |
+
|
63 |
+
def __len__(self) -> int:
|
64 |
+
return len(self.data)
|
65 |
+
|
66 |
+
def __getitem__(self, idx):
|
67 |
+
raw_filename_entry = self.data.iloc[idx]['FILENAME']
|
68 |
+
ground_truth_text = self.data.iloc[idx]['IDENTITY']
|
69 |
+
|
70 |
+
filename = raw_filename_entry.split(',')[0].strip() # .strip() removes any whitespace
|
71 |
+
# Construct the full image path
|
72 |
+
img_path = os.path.join(self.image_dir, filename)
|
73 |
+
# Ensure ground_truth_text is a string
|
74 |
+
ground_truth_text = str(ground_truth_text)
|
75 |
+
|
76 |
+
# Load and transform image
|
77 |
+
try:
|
78 |
+
image = Image.open(img_path).convert('L') # Convert to grayscale
|
79 |
+
except FileNotFoundError:
|
80 |
+
print(f"Error: Image file not found at {img_path}. Skipping this item.")
|
81 |
+
raise # Re-raise to let the main traceback be seen.
|
82 |
+
|
83 |
+
if self.transform:
|
84 |
+
image = self.transform(image)
|
85 |
+
|
86 |
+
image_width = image.size(2) # Assuming image is a tensor (C, H, W) after transform
|
87 |
+
|
88 |
+
text_encoded = torch.tensor(self.char_indexer.encode(ground_truth_text), dtype=torch.long)
|
89 |
+
text_length = len(text_encoded)
|
90 |
+
|
91 |
+
return image, text_encoded, image_width, text_length
|
92 |
+
|
93 |
+
def ocr_collate_fn(batch: list) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
|
94 |
+
"""
|
95 |
+
Custom collate function for the DataLoader to handle variable-width images
|
96 |
+
and variable-length text sequences for CTC loss.
|
97 |
+
"""
|
98 |
+
images, texts, image_widths, text_lengths = zip(*batch)
|
99 |
+
|
100 |
+
# Pad images to the maximum width in the current batch
|
101 |
+
max_batch_width = max(image_widths)
|
102 |
+
padded_images = [F.pad(img, (0, max_batch_width - img.shape[2]), 'constant', 0) for img in images]
|
103 |
+
images_batch = torch.stack(padded_images, 0) # Stack to (N, C, H, max_W)
|
104 |
+
|
105 |
+
# Concatenate all text sequences and get their lengths
|
106 |
+
texts_batch = torch.cat(texts, 0)
|
107 |
+
text_lengths_tensor = torch.tensor(text_lengths, dtype=torch.long)
|
108 |
+
image_widths_tensor = torch.tensor(image_widths, dtype=torch.long) # Actual widths
|
109 |
+
|
110 |
+
return images_batch, texts_batch, image_widths_tensor, text_lengths_tensor
|
111 |
+
|
112 |
+
|
113 |
+
def load_ocr_dataframes(train_csv_path: str, test_csv_path: str) -> tuple[pd.DataFrame, pd.DataFrame]:
|
114 |
+
"""
|
115 |
+
Loads training and testing dataframes.
|
116 |
+
Assumes CSVs have 'filename' and 'name' columns.
|
117 |
+
"""
|
118 |
+
train_df = pd.read_csv(train_csv_path)
|
119 |
+
test_df = pd.read_csv(test_csv_path)
|
120 |
+
return train_df, test_df
|
121 |
+
|
122 |
+
def create_ocr_dataloaders(train_df: pd.DataFrame, test_df: pd.DataFrame,
|
123 |
+
char_indexer: CharIndexer, batch_size: int) -> tuple[DataLoader, DataLoader]:
|
124 |
+
"""
|
125 |
+
Creates PyTorch DataLoader objects for OCR training and testing datasets,
|
126 |
+
using specific image directories for train/test.
|
127 |
+
"""
|
128 |
+
train_dataset = OCRDataset(train_df, TRAIN_IMAGES_DIR, char_indexer)
|
129 |
+
test_dataset = OCRDataset(test_df, TEST_IMAGES_DIR, char_indexer)
|
130 |
+
|
131 |
+
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
|
132 |
+
num_workers=0, collate_fn=ocr_collate_fn)
|
133 |
+
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
|
134 |
+
num_workers=0, collate_fn=ocr_collate_fn)
|
135 |
+
=======
|
136 |
+
#data_handler_ocr.py
|
137 |
+
|
138 |
+
import pandas as pd
|
139 |
+
import torch
|
140 |
+
from torch.utils.data import Dataset, DataLoader
|
141 |
+
from torchvision import transforms
|
142 |
+
import os
|
143 |
+
from PIL import Image
|
144 |
+
import numpy as np
|
145 |
+
import torch.nn.functional as F
|
146 |
+
|
147 |
+
# Import utility functions and config
|
148 |
+
from config import CHARS, BLANK_TOKEN, IMG_HEIGHT, TRAIN_IMAGES_DIR, TEST_IMAGES_DIR
|
149 |
+
from utils_ocr import load_image_as_grayscale, binarize_image, resize_image_for_ocr, normalize_image_for_model
|
150 |
+
|
151 |
+
class CharIndexer:
|
152 |
+
"""Manages character-to-index and index-to-character mappings."""
|
153 |
+
def __init__(self, chars: str, blank_token: str):
|
154 |
+
self.char_to_idx = {char: i for i, char in enumerate(chars)}
|
155 |
+
self.idx_to_char = {i: char for i, char in enumerate(chars)}
|
156 |
+
self.blank_token_idx = len(chars) # Index for the blank token
|
157 |
+
self.idx_to_char[self.blank_token_idx] = blank_token # Add blank token to idx_to_char
|
158 |
+
self.num_classes = len(chars) + 1 # Total classes including blank
|
159 |
+
|
160 |
+
def encode(self, text: str) -> list[int]:
|
161 |
+
"""Converts a text string to a list of integer indices."""
|
162 |
+
return [self.char_to_idx[char] for char in text]
|
163 |
+
|
164 |
+
def decode(self, indices: list[int]) -> str:
|
165 |
+
"""Converts a list of integer indices back to a text string."""
|
166 |
+
# CTC decoding often produces repeated characters and blank tokens.
|
167 |
+
# This simple decoder removes blanks and duplicates.
|
168 |
+
decoded_text = []
|
169 |
+
for i, idx in enumerate(indices):
|
170 |
+
if idx == self.blank_token_idx:
|
171 |
+
continue
|
172 |
+
# Remove consecutive duplicates
|
173 |
+
if i > 0 and indices[i-1] == idx:
|
174 |
+
continue
|
175 |
+
decoded_text.append(self.idx_to_char[idx])
|
176 |
+
return "".join(decoded_text)
|
177 |
+
|
178 |
+
class OCRDataset(Dataset):
|
179 |
+
"""
|
180 |
+
Custom PyTorch Dataset for the Handwritten Name Recognition task.
|
181 |
+
Loads images and their corresponding text labels.
|
182 |
+
"""
|
183 |
+
def __init__(self, dataframe: pd.DataFrame, char_indexer: CharIndexer, image_dir: str, transform=None):
|
184 |
+
"""
|
185 |
+
Initializes the OCR Dataset.
|
186 |
+
Args:
|
187 |
+
dataframe (pd.DataFrame): A DataFrame containing 'image_path' and 'label' columns.
|
188 |
+
char_indexer (CharIndexer): An instance of CharIndexer for character encoding.
|
189 |
+
transform (callable, optional): Optional transform to be applied on an image.
|
190 |
+
"""
|
191 |
+
self.data = dataframe
|
192 |
+
self.char_indexer = char_indexer
|
193 |
+
self.image_dir = image_dir
|
194 |
+
self.transform = transform
|
195 |
+
|
196 |
+
|
197 |
+
def __len__(self) -> int:
|
198 |
+
return len(self.data)
|
199 |
+
|
200 |
+
def __getitem__(self, idx):
|
201 |
+
raw_filename_entry = self.data.iloc[idx]['FILENAME']
|
202 |
+
ground_truth_text = self.data.iloc[idx]['IDENTITY']
|
203 |
+
|
204 |
+
filename = raw_filename_entry.split(',')[0].strip() # .strip() removes any whitespace
|
205 |
+
# Construct the full image path
|
206 |
+
img_path = os.path.join(self.image_dir, filename)
|
207 |
+
# Ensure ground_truth_text is a string
|
208 |
+
ground_truth_text = str(ground_truth_text)
|
209 |
+
|
210 |
+
# Load and transform image
|
211 |
+
try:
|
212 |
+
image = Image.open(img_path).convert('L') # Convert to grayscale
|
213 |
+
except FileNotFoundError:
|
214 |
+
print(f"Error: Image file not found at {img_path}. Skipping this item.")
|
215 |
+
raise # Re-raise to let the main traceback be seen.
|
216 |
+
|
217 |
+
if self.transform:
|
218 |
+
image = self.transform(image)
|
219 |
+
|
220 |
+
image_width = image.size(2) # Assuming image is a tensor (C, H, W) after transform
|
221 |
+
|
222 |
+
text_encoded = torch.tensor(self.char_indexer.encode(ground_truth_text), dtype=torch.long)
|
223 |
+
text_length = len(text_encoded)
|
224 |
+
|
225 |
+
return image, text_encoded, image_width, text_length
|
226 |
+
|
227 |
+
def ocr_collate_fn(batch: list) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
|
228 |
+
"""
|
229 |
+
Custom collate function for the DataLoader to handle variable-width images
|
230 |
+
and variable-length text sequences for CTC loss.
|
231 |
+
"""
|
232 |
+
images, texts, image_widths, text_lengths = zip(*batch)
|
233 |
+
|
234 |
+
# Pad images to the maximum width in the current batch
|
235 |
+
max_batch_width = max(image_widths)
|
236 |
+
padded_images = [F.pad(img, (0, max_batch_width - img.shape[2]), 'constant', 0) for img in images]
|
237 |
+
images_batch = torch.stack(padded_images, 0) # Stack to (N, C, H, max_W)
|
238 |
+
|
239 |
+
# Concatenate all text sequences and get their lengths
|
240 |
+
texts_batch = torch.cat(texts, 0)
|
241 |
+
text_lengths_tensor = torch.tensor(text_lengths, dtype=torch.long)
|
242 |
+
image_widths_tensor = torch.tensor(image_widths, dtype=torch.long) # Actual widths
|
243 |
+
|
244 |
+
return images_batch, texts_batch, image_widths_tensor, text_lengths_tensor
|
245 |
+
|
246 |
+
|
247 |
+
def load_ocr_dataframes(train_csv_path: str, test_csv_path: str) -> tuple[pd.DataFrame, pd.DataFrame]:
|
248 |
+
"""
|
249 |
+
Loads training and testing dataframes.
|
250 |
+
Assumes CSVs have 'filename' and 'name' columns.
|
251 |
+
"""
|
252 |
+
train_df = pd.read_csv(train_csv_path)
|
253 |
+
test_df = pd.read_csv(test_csv_path)
|
254 |
+
return train_df, test_df

def create_ocr_dataloaders(train_df: pd.DataFrame, test_df: pd.DataFrame,
                           char_indexer: CharIndexer, batch_size: int) -> tuple[DataLoader, DataLoader]:
    """
    Creates PyTorch DataLoader objects for the OCR training and testing datasets,
    using the dedicated train/test image directories.
    """
    train_dataset = OCRDataset(train_df, TRAIN_IMAGES_DIR, char_indexer)
    test_dataset = OCRDataset(test_df, TEST_IMAGES_DIR, char_indexer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              num_workers=0, collate_fn=ocr_collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
                             num_workers=0, collate_fn=ocr_collate_fn)

    return train_loader, test_loader
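
# --- Usage sketch (illustrative only) ---
# A minimal example of wiring the pieces above together. The CSV paths and the
# CharIndexer constructor call below are assumptions for illustration; the real
# app builds these from config.py and app.py.
if __name__ == "__main__":
    train_df, test_df = load_ocr_dataframes("data/train.csv", "data/test.csv")  # placeholder paths
    char_indexer = CharIndexer("ABCDEFGHIJKLMNOPQRSTUVWXYZ-' ")                 # hypothetical constructor arguments
    train_loader, test_loader = create_ocr_dataloaders(train_df, test_df, char_indexer, batch_size=32)

    images, texts, image_widths, text_lengths = next(iter(train_loader))
    print(images.shape)       # (N, 1, H, max_W): images padded to the widest sample by ocr_collate_fn
    print(texts.shape)        # 1-D tensor of concatenated label indices for the batch
    print(text_lengths[:5])   # per-sample label lengths consumed by CTC loss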
model_ocr.py
ADDED
@@ -0,0 +1,584 @@
# model_ocr.py

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader  # Kept for type hinting
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import editdistance

# Import config values and the character indexer.
# These imports must stay in sync with config.py.
from config import IMG_HEIGHT, NUM_CLASSES, BLANK_TOKEN
from data_handler_ocr import CharIndexer
# binarize_image, resize_image_for_ocr and normalize_image_for_model are imported
# for completeness; preprocessing is normally handled by the DataLoader transforms.
from utils_ocr import binarize_image, resize_image_for_ocr, normalize_image_for_model

class CNN_Backbone(nn.Module):
    """
    CNN feature extractor for OCR, designed to produce features suitable for an RNN.
    The output feature map has height 1 after the final pooling/reduction.
    """
    def __init__(self, input_channels=1, output_channels=512):
        super(CNN_Backbone, self).__init__()
        self.cnn = nn.Sequential(
            # First block
            nn.Conv2d(input_channels, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=2, stride=2),  # H: 32 -> 16, W: W_in -> W_in/2

            # Second block
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=2, stride=2),  # H: 16 -> 8, W: W_in/2 -> W_in/4

            # Third block (two conv layers)
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(True),
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(True),
            # Halve the height while keeping the width (stride 1 plus padding adds ~1 column)
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 1), padding=(0, 1)),  # H: 8 -> 4, W: W/4 -> W/4 + 1 (approx.)

            # Fourth block
            nn.Conv2d(256, output_channels, kernel_size=3, stride=1, padding=1),
            nn.ReLU(True),
            # AdaptiveAvgPool2d collapses the height dimension to 1 while preserving the
            # width, which the RNN then uses as the sequence axis.
            nn.AdaptiveAvgPool2d((1, None))
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (N, C, H, W), e.g. (B, 1, 32, W_img)

        # Pass through the CNN layers
        conv_features = self.cnn(x)  # (N, output_channels, 1, W_prime)

        # Squeeze the height dimension (which is 1): (N, C_out, 1, W_prime) -> (N, C_out, W_prime)
        conv_features = conv_features.squeeze(2)

        # Permute to the RNN layout (sequence_length, batch_size, input_size):
        # (N, C_out, W_prime) -> (W_prime, N, C_out)
        conv_features = conv_features.permute(2, 0, 1)

        return conv_features
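
# Shape sanity-check sketch (illustrative, not used by the app): for a batch of
# four 32-pixel-high grayscale crops, the backbone emits one 512-dim feature
# vector per horizontal position, in the (T, N, C) layout the LSTM below expects.
def _demo_cnn_backbone_shapes():
    backbone = CNN_Backbone(input_channels=1, output_channels=512)
    dummy = torch.randn(4, 1, 32, 128)   # (N, C, H, W)
    feats = backbone(dummy)              # (W_prime, N, 512) after the squeeze and permute
    print(feats.shape)                   # torch.Size([33, 4, 512]) for this input width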

class BidirectionalLSTM(nn.Module):
    """Bidirectional LSTM layer for sequence modeling."""
    def __init__(self, input_size: int, hidden_size: int, num_layers: int, dropout: float = 0.5):
        super(BidirectionalLSTM, self).__init__()
        # batch_first=False expects input shaped (sequence_length, batch_size, input_size)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            bidirectional=True, dropout=dropout, batch_first=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        output, _ = self.lstm(x)  # discard the (h_n, c_n) hidden-state tuple
        return output

class CRNN(nn.Module):
    """
    Convolutional Recurrent Neural Network for OCR.
    Combines a CNN for feature extraction, LSTMs for sequence modeling,
    and a final linear layer for character prediction.
    """
    def __init__(self, num_classes: int, cnn_output_channels: int = 512,
                 rnn_hidden_size: int = 256, rnn_num_layers: int = 2):
        super(CRNN, self).__init__()
        self.cnn = CNN_Backbone(output_channels=cnn_output_channels)
        # The LSTM input size is the number of channels produced by the CNN
        self.rnn = BidirectionalLSTM(cnn_output_channels, rnn_hidden_size, rnn_num_layers)
        # A bidirectional LSTM outputs hidden_size * 2 features per time step
        self.fc = nn.Linear(rnn_hidden_size * 2, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (N, C, H, W), e.g. (B, 1, 32, W_img)

        # 1. Extract features with the CNN: (W_prime, N, C_out) after the permute in CNN_Backbone
        conv_features = self.cnn(x)

        # 2. Model the sequence with the RNN: (W_prime, N, rnn_hidden_size * 2)
        rnn_features = self.rnn(conv_features)

        # 3. Project each time step to class logits: (W_prime, N, num_classes)
        output = self.fc(rnn_features)

        return output
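
# Forward-pass sketch (illustrative): a CRNN built for NUM_CLASSES characters maps a
# batch of line images to per-time-step logits in the (T, N, C) layout expected by
# nn.CTCLoss and by ctc_greedy_decode below.
def _demo_crnn_forward():
    model = CRNN(num_classes=NUM_CLASSES)
    dummy = torch.randn(2, 1, 32, 160)   # (N, C, H, W): two 32-pixel-high line crops
    logits = model(dummy)                # (T, N, NUM_CLASSES)
    print(logits.shape)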

# --- Decoding Function ---
def ctc_greedy_decode(output: torch.Tensor, char_indexer: CharIndexer) -> list[str]:
    """
    Performs greedy decoding on the CTC output.
    output: (sequence_length, batch_size, num_classes) - raw logits
    """
    # Apply log_softmax so the argmax is taken over (log-)probabilities
    log_probs = F.log_softmax(output, dim=2)

    # Permute to (batch_size, sequence_length, num_classes) and take the argmax over
    # the class dimension: the most probable character index at each time step.
    predicted_indices = torch.argmax(log_probs.permute(1, 0, 2), dim=2).cpu().numpy()

    decoded_texts = []
    for seq in predicted_indices:
        # char_indexer.decode handles blank removal and duplicate collapsing
        decoded_texts.append(char_indexer.decode(seq.tolist()))
    return decoded_texts
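
# Worked decoding example (illustrative): greedy CTC decoding collapses repeated
# indices, then strips blanks, which is the behaviour char_indexer.decode is
# expected to implement. With blank=0 and the toy mapping 1 -> 'h', 2 -> 'i':
def _demo_ctc_collapse():
    frame_argmax = [0, 1, 1, 0, 2, 2, 0]   # per-time-step argmax indices
    mapping = {1: 'h', 2: 'i'}
    decoded, previous = [], None
    for idx in frame_argmax:
        if idx != previous and idx != 0:    # drop repeats first, then drop blanks
            decoded.append(mapping[idx])
        previous = idx
    print(''.join(decoded))                 # -> "hi"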

# --- Evaluation Function ---
def evaluate_model(model: nn.Module, dataloader: DataLoader, char_indexer: CharIndexer, device: str):
    model.eval()  # Set model to evaluation mode
    # CTCLoss needs the blank token index, which the char_indexer provides
    criterion = nn.CTCLoss(blank=char_indexer.blank_token_idx, zero_infinity=True)
    total_loss = 0
    all_predictions = []
    all_ground_truths = []

    with torch.no_grad():  # Disable gradient calculation for evaluation
        for inputs, targets_concat, _, target_lengths in tqdm(dataloader, desc="Evaluating"):
            inputs = inputs.to(device)
            targets_concat = targets_concat.to(device)
            target_lengths = target_lengths.to(device)

            output = model(inputs)  # (seq_len, batch_size, num_classes)

            # input_lengths for CTCLoss: the sequence length (T) produced by the model,
            # i.e. output.shape[0], repeated once per item in the batch.
            outputs_seq_len_for_ctc = torch.full(
                size=(output.shape[1],),      # batch_size
                fill_value=output.shape[0],   # sequence length (T) from the model output
                dtype=torch.long,
                device=device
            )

            # CTC loss expects log_softmax over the output logits
            log_probs_for_loss = F.log_softmax(output, dim=2)  # (T, N, C)

            loss = criterion(log_probs_for_loss, targets_concat, outputs_seq_len_for_ctc, target_lengths)
            total_loss += loss.item() * inputs.size(0)  # Weight by batch size for a correct average

            # Decode predictions for metrics
            decoded_preds = ctc_greedy_decode(output, char_indexer)

            # Reconstruct ground truths from the concatenated 1-D target tensor produced by
            # ocr_collate_fn, walking through it with the per-sample target lengths.
            ground_truths = []
            offset = 0
            for length in target_lengths.tolist():
                ground_truths.append(char_indexer.decode(targets_concat[offset:offset + length].tolist()))
                offset += length

            all_predictions.extend(decoded_preds)
            all_ground_truths.extend(ground_truths)

    avg_loss = total_loss / len(dataloader.dataset)

    # Character Error Rate (CER): total edit distance over total ground-truth characters
    cer_sum = 0
    total_chars = 0
    for pred, gt in zip(all_predictions, all_ground_truths):
        cer_sum += editdistance.eval(pred, gt)
        total_chars += len(gt)
    char_error_rate = cer_sum / total_chars if total_chars > 0 else 0.0

    # Exact Match Accuracy (word-level accuracy)
    exact_match_accuracy = accuracy_score(all_ground_truths, all_predictions)

    return avg_loss, char_error_rate, exact_match_accuracy
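
# Metric sketch (illustrative): CER is the summed edit distance divided by the total
# number of ground-truth characters, while exact-match accuracy only counts perfect
# transcriptions. "JONH" vs "JOHN" costs 2 edits, so the pair below gives CER 0.25
# and exact-match accuracy 0.5.
def _demo_metrics():
    preds, gts = ["JONH", "MARY"], ["JOHN", "MARY"]
    cer = sum(editdistance.eval(p, g) for p, g in zip(preds, gts)) / sum(len(g) for g in gts)
    acc = accuracy_score(gts, preds)
    print(cer, acc)   # 0.25 0.5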

# --- Training Function ---
def train_ocr_model(model: nn.Module, train_loader: DataLoader,
                    test_loader: DataLoader, char_indexer: CharIndexer,
                    epochs: int, device: str, progress_callback=None) -> tuple[nn.Module, dict]:
    """
    Trains the OCR model using CTC loss.
    """
    # CTCLoss needs the blank token index
    criterion = nn.CTCLoss(blank=char_indexer.blank_token_idx, zero_infinity=True)
    optimizer = optim.Adam(model.parameters(), lr=0.001)  # Fixed learning rate for now
    # ReduceLROnPlateau lowers the LR when the test loss stops improving
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.8, patience=5)

    model.to(device)   # Ensure the model is on the correct device
    model.train()      # Set model to training mode

    training_history = {
        'train_loss': [],
        'test_loss': [],
        'test_cer': [],
        'test_exact_match_accuracy': []
    }

    for epoch in range(epochs):
        running_loss = 0.0
        pbar_train = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} (Train)")
        for images, texts_encoded, _, text_lengths in pbar_train:
            images = images.to(device)
            # Keep target tensors on the same device for the CTCLoss calculation
            texts_encoded = texts_encoded.to(device)
            text_lengths = text_lengths.to(device)

            optimizer.zero_grad()      # Clear gradients from the previous step
            outputs = model(images)    # (sequence_length_from_cnn, batch_size, num_classes)

            # outputs.shape[0] is the sequence length (T) produced by the model;
            # CTC loss expects input_lengths as a (batch_size,) tensor filled with it.
            outputs_seq_len_for_ctc = torch.full(
                size=(outputs.shape[1],),      # batch_size
                fill_value=outputs.shape[0],   # sequence length (T) from the model output
                dtype=torch.long,
                device=device
            )

            # CTC loss expects log_softmax over the output logits
            log_probs_for_loss = F.log_softmax(outputs, dim=2)  # (T, N, C)

            # Use outputs_seq_len_for_ctc as the input_lengths argument
            loss = criterion(log_probs_for_loss, texts_encoded, outputs_seq_len_for_ctc, text_lengths)
            loss.backward()    # Backpropagate
            optimizer.step()   # Update model weights

            running_loss += loss.item() * images.size(0)  # Weight by batch size for a correct average
            pbar_train.set_postfix(loss=loss.item())

        epoch_train_loss = running_loss / len(train_loader.dataset)
        training_history['train_loss'].append(epoch_train_loss)

        # Evaluate on the test set using the dedicated function
        model.eval()
        test_loss, test_cer, test_exact_match_accuracy = evaluate_model(model, test_loader, char_indexer, device)
        training_history['test_loss'].append(test_loss)
        training_history['test_cer'].append(test_cer)
        training_history['test_exact_match_accuracy'].append(test_exact_match_accuracy)

        # Adjust the learning rate based on the test loss
        scheduler.step(test_loss)

        print(f"Epoch {epoch+1}/{epochs}: Train Loss={epoch_train_loss:.4f}, "
              f"Test Loss={test_loss:.4f}, Test CER={test_cer:.4f}, Test Exact Match Acc={test_exact_match_accuracy:.4f}")

        if progress_callback:
            # Report the current epoch and key metrics to the UI progress bar
            progress_val = (epoch + 1) / epochs
            progress_callback(progress_val, text=f"Epoch {epoch+1}/{epochs} done. Test CER: {test_cer:.4f}, Test Exact Match Acc: {test_exact_match_accuracy:.4f}")

        model.train()  # Back to training mode after evaluation

    return model, training_history

def save_ocr_model(model: nn.Module, path: str):
    """Saves the state dictionary of the trained OCR model."""
    torch.save(model.state_dict(), path)
    print(f"OCR model saved to {path}")

def load_ocr_model(model: nn.Module, path: str):
    """
    Loads a trained OCR model's state dictionary.
    map_location lets models trained on GPU be loaded on CPU, and vice versa.
    """
    model.load_state_dict(torch.load(path, map_location=torch.device('cpu')))  # Always load to CPU first
    model.eval()  # Set to evaluation mode
    print(f"OCR model loaded from {path}")
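
# Putting it together (illustrative): a minimal training/checkpoint driver. The epoch
# count and checkpoint path are placeholders, and the loaders plus char_indexer are
# assumed to come from data_handler_ocr.create_ocr_dataloaders.
def _demo_train_and_checkpoint(train_loader: DataLoader, test_loader: DataLoader, char_indexer: CharIndexer):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = CRNN(num_classes=NUM_CLASSES)
    model, history = train_ocr_model(model, train_loader, test_loader, char_indexer,
                                     epochs=10, device=device)
    save_ocr_model(model, "crnn_ocr.pth")
    load_ocr_model(model, "crnn_ocr.pth")   # reload to verify the checkpoint round-trips
    return model, history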
requirements.txt
ADDED
@@ -0,0 +1,32 @@
#requirements.txt
# This file lists all the Python libraries required to run the Handwritten Name OCR application.
# Install using: pip install -r requirements.txt

streamlit>=1.33.0
pandas>=2.2.2
numpy>=1.26.4
Pillow>=10.3.0
opencv-python-headless>=4.9.0.80
torch>=2.2.2
torchvision>=0.17.2      # PyTorch companion library for vision tasks (datasets, transforms)
matplotlib>=3.8.4        # For plotting training history
tqdm>=4.66.2             # For displaying progress bars during training
editdistance>=0.8.1      # For calculating character error rate (CER)
scikit-learn>=1.4.2      # For exact-match accuracy (sklearn.metrics.accuracy_score)
utils_ocr.py
ADDED
@@ -0,0 +1,184 @@
#utils_ocr.py

import cv2
import numpy as np
from PIL import Image
import torch
import torch.nn.functional as F
from torchvision import transforms

# --- Image Preprocessing for OCR ---

def load_image_as_grayscale(image_path: str) -> Image.Image:
    """Loads an image from a path and converts it to a grayscale PIL Image."""
    # PIL handles loading robustly; 'L' mode is 8-bit grayscale
    img = Image.open(image_path).convert('L')
    return img

def binarize_image(image_pil: Image.Image) -> Image.Image:
    """Binarizes a grayscale PIL Image (black and white)."""
    # Convert PIL to OpenCV format (numpy array)
    img_np = np.array(image_pil)
    # Apply Otsu's thresholding for adaptive binarization
    _, img_bin = cv2.threshold(img_np, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    # Invert colors: handwritten text is usually dark on light, while OCR models often
    # prefer light text on a dark background. Check your training data's style.
    # This assumes dark text on a light background and inverts to white text on black.
    img_bin = 255 - img_bin
    return Image.fromarray(img_bin)

def resize_image_for_ocr(image_pil: Image.Image, target_height: int) -> Image.Image:
    """
    Resizes a PIL Image to a target height while maintaining the aspect ratio.
    (Width padding, when needed, is handled separately by pad_image_tensor.)
    """
    original_width, original_height = image_pil.size
    # New width follows from the target height and the original aspect ratio
    new_width = int(original_width * (target_height / original_height))
    resized_img = image_pil.resize((new_width, target_height), Image.LANCZOS)
    return resized_img

def normalize_image_for_model(image_pil: Image.Image) -> torch.Tensor:
    """
    Converts a PIL Image to a PyTorch tensor and normalizes pixel values.
    """
    # ToTensor scales pixel values to [0, 1]
    tensor_transform = transforms.ToTensor()
    img_tensor = tensor_transform(image_pil)
    # For grayscale images, mean and std are single values; adjust them if your
    # training data uses different normalization.
    img_tensor = transforms.Normalize((0.5,), (0.5,))(img_tensor)  # Normalize to [-1, 1]
    return img_tensor

def preprocess_user_image_for_ocr(uploaded_image_pil: Image.Image, target_height: int) -> torch.Tensor:
    """
    Combines all preprocessing steps for a single user-uploaded image
    to prepare it for the OCR model.
    """
    # Ensure it's grayscale
    img_gray = uploaded_image_pil.convert('L')

    # Binarize
    img_bin = binarize_image(img_gray)

    # Resize (maintain aspect ratio)
    img_resized = resize_image_for_ocr(img_bin, target_height)

    # Normalize and convert to tensor
    img_tensor = normalize_image_for_model(img_resized)

    # Add batch dimension: (C, H, W) -> (1, C, H, W)
    img_tensor = img_tensor.unsqueeze(0)

    return img_tensor

def pad_image_tensor(image_tensor: torch.Tensor, max_width: int) -> torch.Tensor:
    """
    Pads a single image tensor to max_width with zeros.
    Input tensor shape: (C, H, W)
    Output tensor shape: (C, H, max_width)
    """
    C, H, W = image_tensor.shape
    if W > max_width:
        # If the image is wider than max_width, crop it. A more robust solution might
        # split text lines or use a different resizing strategy.
        print(f"Warning: Image width {W} exceeds max_width {max_width}. Cropping.")
        return image_tensor[:, :, :max_width]  # Simple cropping
    padding = max_width - W
    # Pad on the right; F.pad takes (pad_left, pad_right) for the last dimension
    padded_tensor = F.pad(image_tensor, (0, padding), 'constant', 0)
    return padded_tensor
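
# Usage sketch (illustrative): preprocessing a single uploaded image for inference.
# The file path is a placeholder and target_height=32 mirrors the IMG_HEIGHT value
# assumed by the model's CNN comments; the real app reads it from config.py.
def _demo_preprocess(path: str = "example_name.png"):
    img = Image.open(path)
    tensor = preprocess_user_image_for_ocr(img, target_height=32)
    print(tensor.shape)   # (1, 1, 32, W): a batch of one grayscale line image
    return tensor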