Spaces:
Sleeping
Sleeping
| # download_ocr_models.py (Corrected v3) | |
| from paddleocr import PaddleOCR | |
| import os | |
| import shutil # For copying files/folders later if you want to automate it | |
| # --- CONFIGURATION --- | |
| # 1. CHOOSE THE LANGUAGE YOU WANT TO DOWNLOAD MODELS FOR: | |
| LANGUAGE_TO_DOWNLOAD = 'en' # <<< ***** CHANGE THIS TO YOUR TARGET LANGUAGE ***** | |
| # --- END CONFIGURATION --- | |
| print(f"Attempting to download/locate models for language: '{LANGUAGE_TO_DOWNLOAD}'...") | |
| try: | |
| # Initialize PaddleOCR. This action will trigger the download of models | |
| # for the specified language if they are not already in the local cache. | |
| ocr_temp_engine = PaddleOCR(use_angle_cls=True, lang=LANGUAGE_TO_DOWNLOAD, show_log=True) | |
| print(f"\nModels for '{LANGUAGE_TO_DOWNLOAD}' should now be in the PaddleOCR cache.") | |
| # --- Accessing the model paths from the initialized engine --- | |
| # The args object is an argparse.Namespace, access attributes directly. | |
| args = ocr_temp_engine.args # This is an argparse.Namespace object | |
| # Use hasattr to check if attributes exist before accessing them | |
| det_model_dir_cache = args.det_model_dir if hasattr(args, 'det_model_dir') else None | |
| rec_model_dir_cache = args.rec_model_dir if hasattr(args, 'rec_model_dir') else None | |
| cls_model_dir_cache = args.cls_model_dir if hasattr(args, 'use_angle_cls') and args.use_angle_cls and hasattr(args, 'cls_model_dir') else None | |
| rec_char_dict_path_from_args = args.rec_char_dict_path if hasattr(args, 'rec_char_dict_path') else None | |
| print("\n--- CACHE PATHS FOR THE DOWNLOADED MODELS (from PaddleOCR config) ---") | |
| if det_model_dir_cache: | |
| print(f"Detection ({LANGUAGE_TO_DOWNLOAD}) model cache path: {det_model_dir_cache}") | |
| else: | |
| print(f"Detection ({LANGUAGE_TO_DOWNLOAD}) model cache path: Not found in args (Attribute 'det_model_dir' missing).") | |
| if rec_model_dir_cache: | |
| print(f"Recognition ({LANGUAGE_TO_DOWNLOAD}) model cache path: {rec_model_dir_cache}") | |
| else: | |
| print(f"Recognition ({LANGUAGE_TO_DOWNLOAD}) model cache path: Not found in args (Attribute 'rec_model_dir' missing).") | |
| if cls_model_dir_cache: | |
| print(f"Classification model cache path: {cls_model_dir_cache}") | |
| elif hasattr(args, 'use_angle_cls') and args.use_angle_cls: | |
| print("Classification model enabled but path not found in args (Attribute 'cls_model_dir' missing or invalid).") | |
| else: | |
| print("Classification model not used or path not found in args.") | |
| # --- Instructions for copying --- | |
| print("\n--- ACTION REQUIRED ---") | |
| print("1. Create a folder named 'paddleocr_models' in your project's root directory (if it doesn't exist).") | |
| project_root = os.getcwd() | |
| project_model_dir_target = os.path.join(project_root, 'paddleocr_models') | |
| if not os.path.exists(project_model_dir_target): | |
| try: | |
| os.makedirs(project_model_dir_target) | |
| print(f" Created directory: {project_model_dir_target}") | |
| except OSError as e: | |
| print(f" ERROR creating directory {project_model_dir_target}: {e}") | |
| print(" Please create it manually.") | |
| else: | |
| print(f" Your project's 'paddleocr_models' folder is at: {project_model_dir_target}") | |
| print(f"\n2. Manually copy the following folders from the cache paths printed above (or from PaddleOCR's initial debug log) into '{project_model_dir_target}':") | |
| # Detection model | |
| if det_model_dir_cache and os.path.exists(det_model_dir_cache): | |
| det_target_name = os.path.basename(os.path.normpath(det_model_dir_cache)) | |
| print(f" - Detection Model Folder to Copy: '{det_target_name}'") | |
| print(f" (Full path of source: {det_model_dir_cache})") | |
| print(f" (Target location: {os.path.join(project_model_dir_target, det_target_name)})") | |
| else: | |
| print(f" - Detection model directory NOT FOUND or path is invalid based on script access: {det_model_dir_cache}") | |
| print(f" IMPORTANT: Please check the initial PaddleOCR debug logs (the long block of text when PaddleOCR starts).") | |
| print(f" Look for the line starting with 'det_model_dir=' and use THAT PATH to find the folder to copy manually.") | |
| # Recognition model | |
| if rec_model_dir_cache and os.path.exists(rec_model_dir_cache): | |
| rec_target_name = os.path.basename(os.path.normpath(rec_model_dir_cache)) | |
| print(f" - Recognition Model Folder to Copy: '{rec_target_name}'") | |
| print(f" (Full path of source: {rec_model_dir_cache})") | |
| print(f" (Target location: {os.path.join(project_model_dir_target, rec_target_name)})") | |
| if rec_char_dict_path_from_args and os.path.exists(rec_char_dict_path_from_args): | |
| print(f" (Dictionary file used by PaddleOCR: '{os.path.basename(rec_char_dict_path_from_args)}' found at {rec_char_dict_path_from_args})") | |
| print(f" (Ensure a similar .txt dictionary file, like '{os.path.basename(rec_char_dict_path_from_args)}', is inside the '{rec_target_name}' folder you copy)") | |
| else: | |
| found_dicts = [f for f in os.listdir(rec_model_dir_cache) if f.endswith('.txt')] | |
| if found_dicts: | |
| print(f" (Ensure dictionary file like '{found_dicts[0]}' is inside the '{rec_target_name}' folder you copy)") | |
| else: | |
| print(f" WARNING: Dictionary file (e.g., '{LANGUAGE_TO_DOWNLOAD}_dict.txt') NOT FOUND in {rec_model_dir_cache}") | |
| else: | |
| print(f" - Recognition model directory NOT FOUND or path is invalid based on script access: {rec_model_dir_cache}") | |
| print(f" IMPORTANT: Please check the initial PaddleOCR debug logs.") | |
| print(f" Look for the line starting with 'rec_model_dir=' and use THAT PATH to find the folder to copy manually.") | |
| # Classification model (optional) | |
| if cls_model_dir_cache and os.path.exists(cls_model_dir_cache): | |
| cls_target_name = os.path.basename(os.path.normpath(cls_model_dir_cache)) | |
| print(f" - Classification Model Folder to Copy (Optional): '{cls_target_name}'") | |
| print(f" (Full path of source: {cls_model_dir_cache})") | |
| print(f" (Target location: {os.path.join(project_model_dir_target, cls_target_name)})") | |
| elif hasattr(args, 'use_angle_cls') and args.use_angle_cls: | |
| print(f" - Classification model directory NOT FOUND or path is invalid based on script access: {cls_model_dir_cache}") | |
| print(f" IMPORTANT: Please check the initial PaddleOCR debug logs.") | |
| print(f" Look for the line starting with 'cls_model_dir=' and use THAT PATH to find the folder to copy manually if needed.") | |
| print("\n3. After copying, your 'paddleocr_models' directory in your project should contain these model subfolders.") | |
| print("4. Verify paths in your main `app.py` match these folder names.") | |
| print(" For example, if your log showed 'en_PP-OCRv3_det_infer' for detection, app.py should use that name.") | |
| except AttributeError as ae: | |
| print(f"An AttributeError occurred during script execution (not PaddleOCR init): {ae}") | |
| print("This might indicate an unexpected structure in the PaddleOCR object or its arguments when accessed by the script.") | |
| print("Please carefully review the FULL initial debug output from PaddleOCR when it initializes.") | |
| print("The lines starting with 'det_model_dir=', 'rec_model_dir=', 'cls_model_dir=' are key.") | |
| print("You can use those paths directly to find and copy the model folders manually.") | |
| except Exception as e: | |
| print(f"An unexpected error occurred: {e}") | |
| print("Please ensure PaddleOCR and PaddlePaddle are installed correctly.") | |