Commit ece1c33 by jbilcke-hf
1 Parent(s): 98352eb

fail to crack/hack finetrainers, so reverting..

.claude/settings.local.json CHANGED
@@ -1,3 +1,8 @@
 {
-  "enableAllProjectMcpServers": false
+  "enableAllProjectMcpServers": false,
+  "permissions": {
+    "allow": [
+      "Bash(rg:*)"
+    ]
+  }
 }
vms/patches/finetrainers_lora_loading.py DELETED
@@ -1,142 +0,0 @@
-"""
-Monkey patch for Finetrainers to support loading existing LoRA weights as training initialization.
-
-This patch extends the SFTTrainer to accept a --pretrained_lora_path argument that allows
-starting training from existing LoRA weights instead of random initialization.
-"""
-
-import logging
-import json
-from typing import Optional, Dict, Any
-from pathlib import Path
-
-import safetensors.torch
-from peft import set_peft_model_state_dict
-
-logger = logging.getLogger(__name__)
-
-# Global flag to track if patch has been applied
-_PATCH_APPLIED = False
-
-def _load_pretrained_lora_weights(self, lora_path: str) -> None:
-    """Load existing LoRA weights as training initialization
-
-    Args:
-        lora_path: Path to directory containing pytorch_lora_weights.safetensors
-    """
-    lora_path = Path(lora_path)
-
-    # Find the safetensors file
-    safetensors_file = lora_path / "pytorch_lora_weights.safetensors"
-    if not safetensors_file.exists():
-        raise FileNotFoundError(f"LoRA weights file not found: {safetensors_file}")
-
-    logger.info(f"Loading pretrained LoRA weights from: {safetensors_file}")
-
-    try:
-        # Load the LoRA weights
-        lora_state_dict = safetensors.torch.load_file(str(safetensors_file))
-
-        # Extract metadata if available
-        metadata = {}
-        try:
-            with open(safetensors_file, 'rb') as f:
-                # Try to read metadata from safetensors header
-                header_size = int.from_bytes(f.read(8), 'little')
-                header_data = f.read(header_size)
-                header = json.loads(header_data.decode('utf-8'))
-                metadata = header.get('__metadata__', {})
-        except Exception as e:
-            logger.debug(f"Could not read metadata from safetensors: {e}")
-
-        # Log metadata info if available
-        if metadata:
-            logger.info(f"LoRA metadata: rank={metadata.get('rank', 'unknown')}, "
-                        f"alpha={metadata.get('lora_alpha', 'unknown')}")
-
-        # Apply the LoRA weights to the model
-        set_peft_model_state_dict(self.transformer, lora_state_dict)
-
-        logger.info(f"Successfully loaded LoRA weights from {safetensors_file}")
-
-        # Log the loaded keys for debugging
-        logger.debug(f"Loaded LoRA keys: {list(lora_state_dict.keys())}")
-
-    except Exception as e:
-        logger.error(f"Failed to load LoRA weights from {safetensors_file}: {e}")
-        raise RuntimeError(f"Failed to load LoRA weights: {e}")
-
-
-def patched_prepare_trainable_parameters(self) -> None:
-    """Patched version of _prepare_trainable_parameters that supports pretrained LoRA loading"""
-
-    # Call the original method first
-    original_prepare_trainable_parameters(self)
-
-    # Check if pretrained LoRA path is provided
-    if hasattr(self.args, 'pretrained_lora_path') and self.args.pretrained_lora_path:
-        logger.info(f"Pretrained LoRA path specified: {self.args.pretrained_lora_path}")
-
-        # Only load if we're doing LoRA training
-        if hasattr(self.args, 'training_type') and str(self.args.training_type) == 'TrainingType.LORA':
-            self._load_pretrained_lora_weights(self.args.pretrained_lora_path)
-        else:
-            logger.warning("pretrained_lora_path specified but training_type is not LORA")
-
-
-def apply_lora_loading_patch() -> None:
-    """Apply the monkey patch to enable LoRA weight loading in Finetrainers"""
-    global _PATCH_APPLIED
-
-    if _PATCH_APPLIED:
-        logger.debug("Finetrainers LoRA loading patch already applied")
-        return
-
-    try:
-        from finetrainers.trainer.sft_trainer.trainer import SFTTrainer
-
-        # Store reference to original method
-        global original_prepare_trainable_parameters
-        original_prepare_trainable_parameters = SFTTrainer._prepare_trainable_parameters
-
-        # Apply patches
-        SFTTrainer._prepare_trainable_parameters = patched_prepare_trainable_parameters
-        SFTTrainer._load_pretrained_lora_weights = _load_pretrained_lora_weights
-
-        _PATCH_APPLIED = True
-        logger.info("Successfully applied Finetrainers LoRA loading patch")
-
-    except ImportError as e:
-        logger.error(f"Failed to import Finetrainers classes for patching: {e}")
-        raise
-    except Exception as e:
-        logger.error(f"Failed to apply Finetrainers LoRA loading patch: {e}")
-        raise
-
-
-def remove_lora_loading_patch() -> None:
-    """Remove the monkey patch (for testing purposes)"""
-    global _PATCH_APPLIED
-
-    if not _PATCH_APPLIED:
-        return
-
-    try:
-        from finetrainers.trainer.sft_trainer.trainer import SFTTrainer
-
-        # Restore original method
-        SFTTrainer._prepare_trainable_parameters = original_prepare_trainable_parameters
-
-        # Remove added method
-        if hasattr(SFTTrainer, '_load_pretrained_lora_weights'):
-            delattr(SFTTrainer, '_load_pretrained_lora_weights')
-
-        _PATCH_APPLIED = False
-        logger.info("Removed Finetrainers LoRA loading patch")
-
-    except Exception as e:
-        logger.error(f"Failed to remove Finetrainers LoRA loading patch: {e}")
-
-
-# Store reference to original method (will be set when patch is applied)
-original_prepare_trainable_parameters = None
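The deleted module above follows a standard monkey-patching pattern: keep a module-level reference to the original method, install a wrapper that calls the original first and then does the extra work, and provide an explicit remove function that restores the original. A minimal, self-contained sketch of that pattern, with illustrative class and method names rather than the real Finetrainers API:

    class Trainer:
        def prepare(self) -> None:
            print("original prepare")

    # Module-level reference so the wrapper can chain to the original
    _original_prepare = Trainer.prepare

    def _patched_prepare(self) -> None:
        _original_prepare(self)  # run the original behaviour first
        print("extra step: e.g. load pretrained LoRA weights here")

    def apply_patch() -> None:
        Trainer.prepare = _patched_prepare

    def remove_patch() -> None:
        Trainer.prepare = _original_prepare

    apply_patch()
    Trainer().prepare()   # original prepare + extra step
    remove_patch()
    Trainer().prepare()   # original prepare only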
vms/ui/project/services/training.py CHANGED
@@ -54,7 +54,6 @@ from vms.utils import (
     prepare_finetrainers_dataset,
     copy_files_to_training_dir
 )
-from vms.patches.finetrainers_lora_loading import apply_lora_loading_patch
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -73,11 +72,6 @@ class TrainingService:
         self.setup_logging()
         self.ensure_valid_ui_state_file()
 
-        # Apply Finetrainers patches for LoRA weight loading
-        try:
-            apply_lora_loading_patch()
-        except Exception as e:
-            logger.warning(f"Failed to apply Finetrainers LoRA loading patch: {e}")
 
         # Start background cleanup task
         self._cleanup_stop_event = threading.Event()
@@ -585,7 +579,6 @@ class TrainingService:
         precomputation_items: int = DEFAULT_PRECOMPUTATION_ITEMS,
         lr_warmup_steps: int = DEFAULT_NB_LR_WARMUP_STEPS,
         progress: Optional[gr.Progress] = None,
-        pretrained_lora_path: Optional[str] = None,
     ) -> Tuple[str, str]:
         """Start training with finetrainers"""
 
@@ -836,29 +829,6 @@ class TrainingService:
             self.append_log(error_msg)
             return error_msg, "No valid checkpoints available"
 
-        # Add pretrained LoRA path if provided (for starting fresh training with existing weights)
-        if pretrained_lora_path:
-            # Validate the LoRA path exists and contains required files
-            lora_path = Path(pretrained_lora_path)
-            lora_weights_file = lora_path / "pytorch_lora_weights.safetensors"
-
-            if not lora_path.exists():
-                error_msg = f"Pretrained LoRA path does not exist: {pretrained_lora_path}"
-                logger.error(error_msg)
-                self.append_log(error_msg)
-                return error_msg, "LoRA path not found"
-
-            if not lora_weights_file.exists():
-                error_msg = f"LoRA weights file not found: {lora_weights_file}"
-                logger.error(error_msg)
-                self.append_log(error_msg)
-                return error_msg, "LoRA weights file missing"
-
-            # Set the pretrained LoRA path for the patched Finetrainers
-            config.pretrained_lora_path = str(lora_path)
-            self.append_log(f"Starting training with pretrained LoRA weights from: {lora_path}")
-            logger.info(f"Using pretrained LoRA weights: {lora_path}")
-
         # Common settings for both models
         config.mixed_precision = DEFAULT_MIXED_PRECISION
         config.seed = DEFAULT_SEED
@@ -1823,4 +1793,47 @@ class TrainingService:
             return temp_zip_path
         except Exception as e:
             print(f"Failed to create output zip: {str(e)}")
-            raise gr.Error(f"Failed to create output zip: {str(e)}")
+            raise gr.Error(f"Failed to create output zip: {str(e)}")
+
+    def create_checkpoint_zip(self) -> Optional[str]:
+        """Create a ZIP file containing the latest finetrainers checkpoint
+
+        Returns:
+            Path to created ZIP file or None if no checkpoint found
+        """
+        # Find all checkpoint directories
+        checkpoints = list(self.app.output_path.glob("finetrainers_step_*"))
+        if not checkpoints:
+            logger.info("No checkpoint directories found")
+            raise gr.Error("No checkpoint directories found")
+
+        # Get the latest checkpoint by step number
+        latest_checkpoint = max(checkpoints, key=lambda x: int(x.name.split("_")[-1]))
+        step_num = int(latest_checkpoint.name.split("_")[-1])
+
+        # Create temporary zip file
+        with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as temp_zip:
+            temp_zip_path = str(temp_zip.name)
+        print(f"Creating zip file for checkpoint {latest_checkpoint.name}..")
+        try:
+            make_archive(latest_checkpoint, temp_zip_path)
+            print(f"Checkpoint zip file created for step {step_num}!")
+            return temp_zip_path
+        except Exception as e:
+            print(f"Failed to create checkpoint zip: {str(e)}")
+            raise gr.Error(f"Failed to create checkpoint zip: {str(e)}")
+
+    def get_checkpoint_button_text(self) -> str:
+        """Get the dynamic text for the download checkpoint button based on available checkpoints"""
+        try:
+            checkpoints = list(self.app.output_path.glob("finetrainers_step_*"))
+            if not checkpoints:
+                return "πŸ“₯ Download checkpoints (not available)"
+
+            # Get the latest checkpoint by step number
+            latest_checkpoint = max(checkpoints, key=lambda x: int(x.name.split("_")[-1]))
+            step_num = int(latest_checkpoint.name.split("_")[-1])
+            return f"πŸ“₯ Download checkpoints (step {step_num})"
+        except Exception as e:
+            logger.warning(f"Error getting checkpoint info for button text: {e}")
+            return "πŸ“₯ Download checkpoints (not available)"
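Both new helpers above select the most recent checkpoint by parsing the numeric suffix of the finetrainers_step_* directory name rather than taking the lexicographic maximum, which would rank step 950 above step 1800. A small self-contained sketch of that selection logic, using hypothetical directory names:

    from pathlib import Path

    # Hypothetical checkpoint directories, named the way finetrainers names them
    checkpoints = [
        Path("finetrainers_step_200"),
        Path("finetrainers_step_1800"),
        Path("finetrainers_step_950"),
    ]

    # Parse the trailing step number so 1800 beats 950 (string comparison would not)
    latest = max(checkpoints, key=lambda p: int(p.name.split("_")[-1]))
    print(latest.name)  # -> finetrainers_step_1800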
vms/ui/project/tabs/manage_tab.py CHANGED
@@ -39,6 +39,14 @@ class ManageTab(BaseTab):
             logger.warning(f"Error getting model info for button text: {e}")
             return "🧠 Download weights (.safetensors)"
 
+    def get_checkpoint_button_text(self) -> str:
+        """Get the dynamic text for the download checkpoint button"""
+        try:
+            return self.app.training.get_checkpoint_button_text()
+        except Exception as e:
+            logger.warning(f"Error getting checkpoint button text: {e}")
+            return "πŸ“₯ Download checkpoints (not available)"
+
     def update_download_button_text(self) -> gr.update:
         """Update the download button text"""
         return gr.update(value=self.get_download_button_text())
@@ -76,6 +84,12 @@ class ManageTab(BaseTab):
                     size="lg"
                 )
 
+                self.components["download_checkpoint_btn"] = gr.DownloadButton(
+                    self.get_checkpoint_button_text(),
+                    variant="secondary",
+                    size="lg"
+                )
+
                 self.components["download_output_btn"] = gr.DownloadButton(
                     "πŸ“ Download output directory (.zip)",
                     variant="secondary",
@@ -232,6 +246,11 @@ class ManageTab(BaseTab):
             outputs=[self.components["download_model_btn"]]
         )
 
+        self.components["download_checkpoint_btn"].click(
+            fn=self.app.training.create_checkpoint_zip,
+            outputs=[self.components["download_checkpoint_btn"]]
+        )
+
         self.components["download_output_btn"].click(
             fn=self.app.training.create_output_directory_zip,
             outputs=[self.components["download_output_btn"]]
vms/ui/project/tabs/train_tab.py CHANGED
@@ -369,11 +369,12 @@ For image-to-video tasks, 'index' (usually with index 0) is most common as it co
                     interactive=has_checkpoints and not ASK_USER_TO_DUPLICATE_SPACE
                 )
 
-                # Add new button for starting from LoRA weights
+                # Starting from LoRA weights is DISABLED for now
                 self.components["start_from_lora_btn"] = gr.Button(
                     "πŸ”„ Start over using latest LoRA weights",
                     variant="secondary",
-                    interactive=has_lora_weights and not ASK_USER_TO_DUPLICATE_SPACE
+                    interactive=has_lora_weights and not ASK_USER_TO_DUPLICATE_SPACE,
+                    visible=False,
                 )
 
                 with gr.Row():