Commit 98352eb
Parent(s): 9fd1204

try to crack Finetrainers
vms/patches/__init__.py
ADDED
@@ -0,0 +1,6 @@
+"""
+Patches module for VideoModelStudio
+
+This module contains monkey patches and modifications for third-party libraries
+to extend their functionality for our specific use cases.
+"""
vms/patches/finetrainers_lora_loading.py
ADDED
@@ -0,0 +1,142 @@
+"""
+Monkey patch for Finetrainers to support loading existing LoRA weights as training initialization.
+
+This patch extends the SFTTrainer to accept a --pretrained_lora_path argument that allows
+starting training from existing LoRA weights instead of random initialization.
+"""
+
+import logging
+import json
+from typing import Optional, Dict, Any
+from pathlib import Path
+
+import safetensors.torch
+from peft import set_peft_model_state_dict
+
+logger = logging.getLogger(__name__)
+
+# Global flag to track if patch has been applied
+_PATCH_APPLIED = False
+
+def _load_pretrained_lora_weights(self, lora_path: str) -> None:
+    """Load existing LoRA weights as training initialization
+
+    Args:
+        lora_path: Path to directory containing pytorch_lora_weights.safetensors
+    """
+    lora_path = Path(lora_path)
+
+    # Find the safetensors file
+    safetensors_file = lora_path / "pytorch_lora_weights.safetensors"
+    if not safetensors_file.exists():
+        raise FileNotFoundError(f"LoRA weights file not found: {safetensors_file}")
+
+    logger.info(f"Loading pretrained LoRA weights from: {safetensors_file}")
+
+    try:
+        # Load the LoRA weights
+        lora_state_dict = safetensors.torch.load_file(str(safetensors_file))
+
+        # Extract metadata if available
+        metadata = {}
+        try:
+            with open(safetensors_file, 'rb') as f:
+                # Try to read metadata from safetensors header
+                header_size = int.from_bytes(f.read(8), 'little')
+                header_data = f.read(header_size)
+                header = json.loads(header_data.decode('utf-8'))
+                metadata = header.get('__metadata__', {})
+        except Exception as e:
+            logger.debug(f"Could not read metadata from safetensors: {e}")
+
+        # Log metadata info if available
+        if metadata:
+            logger.info(f"LoRA metadata: rank={metadata.get('rank', 'unknown')}, "
+                        f"alpha={metadata.get('lora_alpha', 'unknown')}")
+
+        # Apply the LoRA weights to the model
+        set_peft_model_state_dict(self.transformer, lora_state_dict)
+
+        logger.info(f"Successfully loaded LoRA weights from {safetensors_file}")
+
+        # Log the loaded keys for debugging
+        logger.debug(f"Loaded LoRA keys: {list(lora_state_dict.keys())}")
+
+    except Exception as e:
+        logger.error(f"Failed to load LoRA weights from {safetensors_file}: {e}")
+        raise RuntimeError(f"Failed to load LoRA weights: {e}")
+
+
+def patched_prepare_trainable_parameters(self) -> None:
+    """Patched version of _prepare_trainable_parameters that supports pretrained LoRA loading"""
+
+    # Call the original method first
+    original_prepare_trainable_parameters(self)
+
+    # Check if pretrained LoRA path is provided
+    if hasattr(self.args, 'pretrained_lora_path') and self.args.pretrained_lora_path:
+        logger.info(f"Pretrained LoRA path specified: {self.args.pretrained_lora_path}")
+
+        # Only load if we're doing LoRA training
+        if hasattr(self.args, 'training_type') and str(self.args.training_type) == 'TrainingType.LORA':
+            self._load_pretrained_lora_weights(self.args.pretrained_lora_path)
+        else:
+            logger.warning("pretrained_lora_path specified but training_type is not LORA")
+
+
+def apply_lora_loading_patch() -> None:
+    """Apply the monkey patch to enable LoRA weight loading in Finetrainers"""
+    global _PATCH_APPLIED
+
+    if _PATCH_APPLIED:
+        logger.debug("Finetrainers LoRA loading patch already applied")
+        return
+
+    try:
+        from finetrainers.trainer.sft_trainer.trainer import SFTTrainer
+
+        # Store reference to original method
+        global original_prepare_trainable_parameters
+        original_prepare_trainable_parameters = SFTTrainer._prepare_trainable_parameters
+
+        # Apply patches
+        SFTTrainer._prepare_trainable_parameters = patched_prepare_trainable_parameters
+        SFTTrainer._load_pretrained_lora_weights = _load_pretrained_lora_weights
+
+        _PATCH_APPLIED = True
+        logger.info("Successfully applied Finetrainers LoRA loading patch")
+
+    except ImportError as e:
+        logger.error(f"Failed to import Finetrainers classes for patching: {e}")
+        raise
+    except Exception as e:
+        logger.error(f"Failed to apply Finetrainers LoRA loading patch: {e}")
+        raise
+
+
+def remove_lora_loading_patch() -> None:
+    """Remove the monkey patch (for testing purposes)"""
+    global _PATCH_APPLIED
+
+    if not _PATCH_APPLIED:
+        return
+
+    try:
+        from finetrainers.trainer.sft_trainer.trainer import SFTTrainer
+
+        # Restore original method
+        SFTTrainer._prepare_trainable_parameters = original_prepare_trainable_parameters
+
+        # Remove added method
+        if hasattr(SFTTrainer, '_load_pretrained_lora_weights'):
+            delattr(SFTTrainer, '_load_pretrained_lora_weights')
+
+        _PATCH_APPLIED = False
+        logger.info("Removed Finetrainers LoRA loading patch")
+
+    except Exception as e:
+        logger.error(f"Failed to remove Finetrainers LoRA loading patch: {e}")
+
+
+# Store reference to original method (will be set when patch is applied)
+original_prepare_trainable_parameters = None
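
Taken together, the module works in two steps: apply_lora_loading_patch() swaps SFTTrainer._prepare_trainable_parameters for the patched version, and the patched version reads args.pretrained_lora_path after the original setup has run. A minimal usage sketch follows; only the import and the patch call come from this commit, while the commented-out trainer wiring is an assumption about how Finetrainers is typically driven and may differ.

from vms.patches.finetrainers_lora_loading import apply_lora_loading_patch

# Apply the monkey patch before the trainer prepares its trainable parameters.
apply_lora_loading_patch()

# Hypothetical wiring (exact construction depends on Finetrainers):
# args.pretrained_lora_path = "/path/to/output/lora_weights/500"  # dir holding pytorch_lora_weights.safetensors
# trainer = SFTTrainer(args)   # _prepare_trainable_parameters now also loads the LoRA weights
# trainer.run()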
vms/ui/project/services/training.py
CHANGED
@@ -54,6 +54,7 @@ from vms.utils import (
     prepare_finetrainers_dataset,
     copy_files_to_training_dir
 )
+from vms.patches.finetrainers_lora_loading import apply_lora_loading_patch
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -71,6 +72,17 @@ class TrainingService:
         self.file_handler = None
         self.setup_logging()
         self.ensure_valid_ui_state_file()
+
+        # Apply Finetrainers patches for LoRA weight loading
+        try:
+            apply_lora_loading_patch()
+        except Exception as e:
+            logger.warning(f"Failed to apply Finetrainers LoRA loading patch: {e}")
+
+        # Start background cleanup task
+        self._cleanup_stop_event = threading.Event()
+        self._cleanup_thread = threading.Thread(target=self._background_cleanup_task, daemon=True)
+        self._cleanup_thread.start()
 
         logger.info("Training service initialized")
 
@@ -573,6 +585,7 @@ class TrainingService:
         precomputation_items: int = DEFAULT_PRECOMPUTATION_ITEMS,
         lr_warmup_steps: int = DEFAULT_NB_LR_WARMUP_STEPS,
         progress: Optional[gr.Progress] = None,
+        pretrained_lora_path: Optional[str] = None,
     ) -> Tuple[str, str]:
        """Start training with finetrainers"""
 
@@ -822,6 +835,29 @@ class TrainingService:
             logger.error(error_msg)
             self.append_log(error_msg)
             return error_msg, "No valid checkpoints available"
+
+        # Add pretrained LoRA path if provided (for starting fresh training with existing weights)
+        if pretrained_lora_path:
+            # Validate the LoRA path exists and contains required files
+            lora_path = Path(pretrained_lora_path)
+            lora_weights_file = lora_path / "pytorch_lora_weights.safetensors"
+
+            if not lora_path.exists():
+                error_msg = f"Pretrained LoRA path does not exist: {pretrained_lora_path}"
+                logger.error(error_msg)
+                self.append_log(error_msg)
+                return error_msg, "LoRA path not found"
+
+            if not lora_weights_file.exists():
+                error_msg = f"LoRA weights file not found: {lora_weights_file}"
+                logger.error(error_msg)
+                self.append_log(error_msg)
+                return error_msg, "LoRA weights file missing"
+
+            # Set the pretrained LoRA path for the patched Finetrainers
+            config.pretrained_lora_path = str(lora_path)
+            self.append_log(f"Starting training with pretrained LoRA weights from: {lora_path}")
+            logger.info(f"Using pretrained LoRA weights: {lora_path}")
 
         # Common settings for both models
         config.mixed_precision = DEFAULT_MIXED_PRECISION
@@ -1158,6 +1194,94 @@ class TrainingService:
                 logger.error(f"Failed to remove corrupted checkpoint {checkpoint_dir}: {e}")
                 self.append_log(f"Failed to remove corrupted checkpoint {checkpoint_dir.name}: {e}")
 
+    def cleanup_old_lora_weights(self, max_to_keep: int = 2) -> None:
+        """Remove old LoRA weight directories, keeping only the most recent ones
+
+        Args:
+            max_to_keep: Maximum number of LoRA weight directories to keep (default: 2)
+        """
+        lora_weights_path = self.app.output_path / "lora_weights"
+
+        if not lora_weights_path.exists():
+            logger.debug("LoRA weights directory does not exist, nothing to clean up")
+            return
+
+        # Find all LoRA weight directories (should be named with step numbers)
+        lora_dirs = []
+        for item in lora_weights_path.iterdir():
+            if item.is_dir() and item.name.isdigit():
+                lora_dirs.append(item)
+
+        if len(lora_dirs) <= max_to_keep:
+            logger.debug(f"Found {len(lora_dirs)} LoRA weight directories, no cleanup needed (keeping {max_to_keep})")
+            return
+
+        # Sort by step number (directory name) in descending order (newest first)
+        lora_dirs_sorted = sorted(lora_dirs, key=lambda x: int(x.name), reverse=True)
+
+        # Keep the most recent max_to_keep directories, remove the rest
+        dirs_to_keep = lora_dirs_sorted[:max_to_keep]
+        dirs_to_remove = lora_dirs_sorted[max_to_keep:]
+
+        logger.info(f"Cleaning up old LoRA weights: keeping {len(dirs_to_keep)}, removing {len(dirs_to_remove)}")
+        self.append_log(f"Cleaning up old LoRA weights: keeping latest {max_to_keep} directories")
+
+        for lora_dir in dirs_to_remove:
+            try:
+                step_num = int(lora_dir.name)
+                logger.info(f"Removing old LoRA weights at step {step_num}: {lora_dir}")
+                shutil.rmtree(lora_dir)
+                self.append_log(f"Removed old LoRA weights: step {step_num}")
+            except Exception as e:
+                logger.error(f"Failed to remove old LoRA weights {lora_dir}: {e}")
+                self.append_log(f"Failed to remove old LoRA weights {lora_dir.name}: {e}")
+
+        # Log what we kept
+        kept_steps = [int(d.name) for d in dirs_to_keep]
+        kept_steps.sort(reverse=True)
+        logger.info(f"Kept LoRA weights for steps: {kept_steps}")
+        self.append_log(f"Kept LoRA weights for steps: {kept_steps}")
+
+    def _background_cleanup_task(self) -> None:
+        """Background task that runs every 10 minutes to clean up old LoRA weights"""
+        cleanup_interval = 600  # 10 minutes in seconds
+
+        logger.info("Started background LoRA cleanup task (runs every 10 minutes)")
+
+        while not self._cleanup_stop_event.is_set():
+            try:
+                # Wait for 10 minutes or until stop event is set
+                if self._cleanup_stop_event.wait(timeout=cleanup_interval):
+                    break  # Stop event was set
+
+                # Only run cleanup if we have an output path
+                if hasattr(self.app, 'output_path') and self.app.output_path:
+                    lora_weights_path = self.app.output_path / "lora_weights"
+
+                    # Only cleanup if the directory exists and has content
+                    if lora_weights_path.exists():
+                        lora_dirs = [d for d in lora_weights_path.iterdir() if d.is_dir() and d.name.isdigit()]
+
+                        if len(lora_dirs) > 2:
+                            logger.info(f"Background cleanup: Found {len(lora_dirs)} LoRA weight directories, cleaning up old ones")
+                            self.cleanup_old_lora_weights(max_to_keep=2)
+                        else:
+                            logger.debug(f"Background cleanup: Found {len(lora_dirs)} LoRA weight directories, no cleanup needed")
+
+            except Exception as e:
+                logger.error(f"Background LoRA cleanup task error: {e}")
+                # Continue running despite errors
+
+        logger.info("Background LoRA cleanup task stopped")
+
+    def stop_background_cleanup(self) -> None:
+        """Stop the background cleanup task"""
+        if hasattr(self, '_cleanup_stop_event'):
+            self._cleanup_stop_event.set()
+        if hasattr(self, '_cleanup_thread') and self._cleanup_thread.is_alive():
+            self._cleanup_thread.join(timeout=5)
+        logger.info("Background cleanup task stopped")
+
     def recover_interrupted_training(self) -> Dict[str, Any]:
         """Attempt to recover interrupted training
 
@@ -1493,6 +1617,13 @@ class TrainingService:
         gr.Info(success_msg)
         self.save_status(state='completed', message=success_msg)
 
+        # Clean up old LoRA weights to save disk space
+        try:
+            self.cleanup_old_lora_weights(max_to_keep=2)
+        except Exception as e:
+            logger.warning(f"Failed to cleanup old LoRA weights: {e}")
+            self.append_log(f"Warning: Failed to cleanup old LoRA weights: {e}")
+
         # Upload final model if repository was specified
         session = self.load_session()
         if session and session['params'].get('repo_id'):
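
cleanup_old_lora_weights and _background_cleanup_task both assume that each subdirectory of output/lora_weights/ is named after the training step that produced it. A standalone sketch of the same pruning rule, useful for checking what a given layout would lose; the directory names below are examples, not taken from this commit.

from pathlib import Path

def dirs_to_remove(lora_weights_path: Path, max_to_keep: int = 2) -> list:
    """Return the step directories that the cleanup above would delete."""
    step_dirs = [d for d in lora_weights_path.iterdir() if d.is_dir() and d.name.isdigit()]
    step_dirs.sort(key=lambda d: int(d.name), reverse=True)  # newest step first, numerically
    return step_dirs[max_to_keep:]

# Example layout:
#   output/lora_weights/200/pytorch_lora_weights.safetensors
#   output/lora_weights/400/pytorch_lora_weights.safetensors
#   output/lora_weights/600/pytorch_lora_weights.safetensors
# With max_to_keep=2, only the "200" directory would be removed.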
vms/ui/project/tabs/manage_tab.py
CHANGED
@@ -102,6 +102,18 @@ class ManageTab(BaseTab):
                         "Push my model"
                     )
 
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown("## 🧹 Maintenance")
+                gr.Markdown("Clean up old files to free disk space.")
+
+                with gr.Row():
+                    self.components["cleanup_lora_btn"] = gr.Button(
+                        "🧹 Keep last 2 LoRA weights and clean up older ones",
+                        variant="secondary",
+                        size="lg"
+                    )
+
         with gr.Row():
             with gr.Column():
                 gr.Markdown("## ♻️ Delete your data")
@@ -225,6 +237,12 @@ class ManageTab(BaseTab):
             outputs=[self.components["download_output_btn"]]
         )
 
+        # LoRA cleanup button
+        self.components["cleanup_lora_btn"].click(
+            fn=self.cleanup_old_lora_weights,
+            outputs=[]
+        )
+
         # Dataset deletion with modal
         self.components["delete_dataset_btn"].click(
             fn=lambda: Modal(visible=True),
@@ -346,6 +364,16 @@ class ManageTab(BaseTab):
         else:
             return f"Failed to upload model to {repo_id}"
 
+    def cleanup_old_lora_weights(self):
+        """Clean up old LoRA weight directories, keeping only the latest 2"""
+        try:
+            self.app.training.cleanup_old_lora_weights(max_to_keep=2)
+            gr.Info("✅ Successfully cleaned up old LoRA weights")
+        except Exception as e:
+            error_msg = f"❌ Failed to cleanup LoRA weights: {str(e)}"
+            gr.Error(error_msg)
+            logger.error(f"LoRA cleanup failed: {e}")
+
     def delete_dataset(self):
         """Delete dataset files (images, videos, captions)"""
         status_messages = {}
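
The cleanup handler reports back through gr.Info and gr.Error rather than through component outputs, which is why the click wiring above passes outputs=[]. A self-contained sketch of that pattern; the component and function names here are illustrative, not the ones used in ManageTab.

import gradio as gr

def run_cleanup():
    try:
        # ... do the maintenance work here ...
        gr.Info("Cleanup finished")              # shown as a toast in the UI
    except Exception as e:
        raise gr.Error(f"Cleanup failed: {e}")   # surfaced as an error toast

with gr.Blocks() as demo:
    cleanup_btn = gr.Button("Run cleanup")
    cleanup_btn.click(fn=run_cleanup, outputs=[])

# demo.launch()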
vms/ui/project/tabs/train_tab.py
CHANGED
@@ -341,12 +341,20 @@ For image-to-video tasks, 'index' (usually with index 0) is most common as it co
             ## ⚙️ Train your model on your dataset
             - **🚀 Start new training**: Begins training from scratch (clears previous checkpoints)
             - **🛸 Start from latest checkpoint**: Continues training from the most recent checkpoint
+            - **🔄 Start over using latest LoRA weights**: Start fresh training but use existing LoRA weights as initialization
             """)
 
         with gr.Row():
             # Check for existing checkpoints to determine button text
             checkpoints = list(self.app.output_path.glob("finetrainers_step_*"))
             has_checkpoints = len(checkpoints) > 0
+
+            # Check for existing LoRA weights
+            lora_weights_path = self.app.output_path / "lora_weights"
+            has_lora_weights = False
+            if lora_weights_path.exists():
+                lora_dirs = [d for d in lora_weights_path.iterdir() if d.is_dir()]
+                has_lora_weights = len(lora_dirs) > 0
 
             self.components["start_btn"] = gr.Button(
                 "🚀 Start new training",
@@ -361,6 +369,13 @@ For image-to-video tasks, 'index' (usually with index 0) is most common as it co
                 interactive=has_checkpoints and not ASK_USER_TO_DUPLICATE_SPACE
             )
 
+            # Add new button for starting from LoRA weights
+            self.components["start_from_lora_btn"] = gr.Button(
+                "🔄 Start over using latest LoRA weights",
+                variant="secondary",
+                interactive=has_lora_weights and not ASK_USER_TO_DUPLICATE_SPACE
+            )
+
         with gr.Row():
             # Just use stop and pause buttons for now to ensure compatibility
             self.components["stop_btn"] = gr.Button(
@@ -497,6 +512,52 @@ For image-to-video tasks, 'index' (usually with index 0) is most common as it co
             resume_from_checkpoint="latest"
         )
 
+    def handle_start_from_lora_training(
+        self, model_type, model_version, training_type,
+        lora_rank, lora_alpha, train_steps, batch_size, learning_rate,
+        save_iterations, repo_id, progress=gr.Progress()
+    ):
+        """Handle starting training from existing LoRA weights"""
+        # Find the latest LoRA weights
+        lora_weights_path = self.app.output_path / "lora_weights"
+
+        if not lora_weights_path.exists():
+            return "No LoRA weights found", "Please train a model first or start a new training session"
+
+        # Find the latest LoRA checkpoint directory
+        lora_dirs = sorted([d for d in lora_weights_path.iterdir() if d.is_dir()],
+                           key=lambda x: int(x.name), reverse=True)
+
+        if not lora_dirs:
+            return "No LoRA weight directories found", "Please train a model first or start a new training session"
+
+        latest_lora_dir = lora_dirs[0]
+
+        # Verify the LoRA weights file exists
+        lora_weights_file = latest_lora_dir / "pytorch_lora_weights.safetensors"
+        if not lora_weights_file.exists():
+            return f"LoRA weights file not found in {latest_lora_dir}", "Please check your LoRA weights directory"
+
+        # Clear checkpoints to start fresh (but keep LoRA weights)
+        for checkpoint in self.app.output_path.glob("finetrainers_step_*"):
+            if checkpoint.is_dir():
+                shutil.rmtree(checkpoint)
+
+        # Delete session.json to start fresh
+        session_file = self.app.output_path / "session.json"
+        if session_file.exists():
+            session_file.unlink()
+
+        self.app.training.append_log(f"Starting training from LoRA weights: {latest_lora_dir}")
+
+        # Start training with the LoRA weights
+        return self.handle_training_start(
+            model_type, model_version, training_type,
+            lora_rank, lora_alpha, train_steps, batch_size, learning_rate,
+            save_iterations, repo_id, progress,
+            pretrained_lora_path=str(latest_lora_dir)
+        )
+
     def connect_events(self) -> None:
         """Connect event handlers to UI components"""
         # Model type change event - Update model version dropdown choices and default parameters
@@ -701,6 +762,26 @@ For image-to-video tasks, 'index' (usually with index 0) is most common as it co
                 self.components["log_box"]
             ]
         )
+
+        self.components["start_from_lora_btn"].click(
+            fn=self.handle_start_from_lora_training,
+            inputs=[
+                self.components["model_type"],
+                self.components["model_version"],
+                self.components["training_type"],
+                self.components["lora_rank"],
+                self.components["lora_alpha"],
+                self.components["train_steps"],
+                self.components["batch_size"],
+                self.components["learning_rate"],
+                self.components["save_iterations"],
+                self.app.tabs["manage_tab"].components["repo_id"]
+            ],
+            outputs=[
+                self.components["status_box"],
+                self.components["log_box"]
+            ]
+        )
 
 
         # Use simplified event handlers for pause/resume and stop
@@ -780,6 +861,7 @@ For image-to-video tasks, 'index' (usually with index 0) is most common as it co
         save_iterations, repo_id,
         progress=gr.Progress(),
         resume_from_checkpoint=None,
+        pretrained_lora_path=None,
    ):
        """Handle training start with proper log parser reset and checkpoint detection"""
 
@@ -840,7 +922,8 @@ For image-to-video tasks, 'index' (usually with index 0) is most common as it co
                 num_gpus=num_gpus,
                 precomputation_items=precomputation_items,
                 lr_warmup_steps=lr_warmup_steps,
-                progress=progress
+                progress=progress,
+                pretrained_lora_path=pretrained_lora_path
             )
         except Exception as e:
             logger.exception("Error starting training")
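
One detail worth noting in handle_start_from_lora_training: the step directories are sorted with key=lambda x: int(x.name) rather than lexicographically, which matters once step counts cross a digit boundary. A quick illustration with example values:

names = ["200", "900", "1000"]

print(max(names))           # "900"  -- lexicographic comparison picks the wrong directory
print(max(names, key=int))  # "1000" -- numeric comparison matches the handler's sort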
|