Upload folder using huggingface_hub
Files changed:
- dataset_config.json  +4 -3
- hardware_config.json  +27 -20
- transformers_config.json  +16 -4
- update_space.py  +14 -4
dataset_config.json
CHANGED

@@ -26,11 +26,12 @@
     }
   },
   "data_loading": {
-    "batch_size":
+    "batch_size": 24,
     "shuffle": false,
     "drop_last": false,
-    "num_workers":
-    "pin_memory":
+    "num_workers": 8,
+    "pin_memory": true,
+    "prefetch_factor": 4
   },
   "validation": {
     "log_samples": 3,
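For context, a minimal sketch of how the updated data_loading block could feed a PyTorch DataLoader; the placeholder dataset and the assumption that "data_loading" sits one level below the JSON root are illustrative, not part of this commit:

import json
import torch
from torch.utils.data import DataLoader, TensorDataset

# Placeholder dataset; the real training dataset is built elsewhere in the project.
train_dataset = TensorDataset(torch.zeros(256, 8))

with open("dataset_config.json") as f:
    cfg = json.load(f)
data_cfg = cfg["data_loading"]  # assumed nesting; adjust if the key sits deeper

loader = DataLoader(
    train_dataset,
    batch_size=data_cfg["batch_size"],            # 24
    shuffle=data_cfg["shuffle"],                  # false: keep sample order
    drop_last=data_cfg["drop_last"],              # false
    num_workers=data_cfg["num_workers"],          # 8 worker processes
    pin_memory=data_cfg["pin_memory"],            # true: faster host-to-GPU copies
    prefetch_factor=data_cfg["prefetch_factor"],  # 4 batches prefetched per worker
)

Note that prefetch_factor only takes effect when num_workers > 0, which holds here.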
hardware_config.json
CHANGED

@@ -1,42 +1,49 @@
 {
-  "hardware_name": "
+  "hardware_name": "4xL4",
   "specs": {
-    "gpu_count":
-    "gpu_type": "
+    "gpu_count": 4,
+    "gpu_type": "L4",
     "vram_per_gpu": 24,
-    "total_vram":
-    "vcpu_count":
-    "ram":
+    "total_vram": 96,
+    "vcpu_count": 48,
+    "ram": 186
   },
   "training_optimizations": {
-    "per_device_batch_size":
-    "gradient_accumulation_steps":
-    "effective_batch_size":
+    "per_device_batch_size": 32,
+    "gradient_accumulation_steps": 2,
+    "effective_batch_size": 256,
     "memory_optimizations": {
       "use_gradient_checkpointing": true,
       "pin_memory": true,
-      "num_workers":
+      "num_workers": 8,
+      "use_flash_attention": true
     },
     "distributed_settings": {
       "device_map": "auto",
-      "ddp_find_unused_parameters": false
+      "ddp_find_unused_parameters": false,
+      "use_fsdp": true,
+      "fsdp_config": {
+        "sharding_strategy": "FULL_SHARD",
+        "mixed_precision": "BF16",
+        "activation_checkpointing": true
+      }
     }
   },
   "memory_breakdown": {
     "model_size": "~3.5GB (pre-quantized 4-bit)",
     "optimizer_states": "~1GB",
-    "batch_memory_per_gpu": "~
-    "peak_memory_estimate": "
-    "safe_headroom": "
+    "batch_memory_per_gpu": "~3GB",
+    "peak_memory_estimate": "~18GB",
+    "safe_headroom": "~6GB"
   },
-  "compute_environment": "
-  "distributed_type": "
+  "compute_environment": "L4_CLOUD",
+  "distributed_type": "FSDP",
   "mixed_precision": "bf16",
-  "num_gpus":
+  "num_gpus": 4,
   "training_parameters": {
-    "per_device_train_batch_size":
-    "gradient_accumulation_steps":
-    "dataloader_num_workers":
+    "per_device_train_batch_size": 32,
+    "gradient_accumulation_steps": 2,
+    "dataloader_num_workers": 8,
     "dataloader_pin_memory": true,
     "gradient_checkpointing": true,
     "max_grad_norm": 1.0
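As a rough illustration, the training_parameters and FSDP settings above could translate into Hugging Face TrainingArguments along these lines; output_dir and the exact FSDP wiring are assumptions rather than the project's actual launch code:

from transformers import TrainingArguments

# Values mirror hardware_config.json; "./output" is an assumed directory.
args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=32,   # training_parameters.per_device_train_batch_size
    gradient_accumulation_steps=2,    # training_parameters.gradient_accumulation_steps
    dataloader_num_workers=8,
    dataloader_pin_memory=True,
    gradient_checkpointing=True,
    max_grad_norm=1.0,
    bf16=True,                        # mixed_precision: "bf16"
    fsdp="full_shard",                # sharding_strategy: FULL_SHARD
)

With 4 GPUs, a per-device batch size of 32, and 2 accumulation steps, the effective batch size works out to 32 x 4 x 2 = 256, matching effective_batch_size in the config.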
transformers_config.json
CHANGED

@@ -13,9 +13,9 @@
   },

   "training": {
-    "per_device_train_batch_size":
-    "gradient_accumulation_steps":
-    "learning_rate":
+    "per_device_train_batch_size": 24,
+    "gradient_accumulation_steps": 2,
+    "learning_rate": 3e-5,
     "num_train_epochs": 3,
     "max_steps": -1,
     "logging_steps": 10,

@@ -26,7 +26,7 @@
     "gradient_checkpointing": true,
     "optim": "adamw_torch",
     "lr_scheduler_type": "cosine",
-    "warmup_ratio": 0.
+    "warmup_ratio": 0.05,
     "weight_decay": 0.01,
     "max_grad_norm": 1.0,
     "neftune_noise_alpha": 5

@@ -56,6 +56,18 @@
     ]
   },

+  "distributed_training": {
+    "fsdp_config": {
+      "enabled": true,
+      "sharding_strategy": "FULL_SHARD",
+      "mixed_precision": "BF16",
+      "activation_checkpointing": true,
+      "offload_params": false
+    },
+    "ddp_find_unused_parameters": false,
+    "dataloader_num_workers": 8
+  },
+
   "logging": {
     "logging_steps": 50,
     "log_level": "info"
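For reference, a minimal sketch of turning the training block above into TrainingArguments; output_dir is assumed, and neftune_noise_alpha needs a transformers release recent enough to expose it:

import json
from transformers import TrainingArguments

with open("transformers_config.json") as f:
    train_cfg = json.load(f)["training"]  # assumes a top-level "training" block

args = TrainingArguments(
    output_dir="./output",                                                  # assumed, not in the config
    per_device_train_batch_size=train_cfg["per_device_train_batch_size"],  # 24
    gradient_accumulation_steps=train_cfg["gradient_accumulation_steps"],  # 2
    learning_rate=train_cfg["learning_rate"],                              # 3e-5
    num_train_epochs=train_cfg["num_train_epochs"],                        # 3
    warmup_ratio=0.05,
    lr_scheduler_type="cosine",
    optim="adamw_torch",
    weight_decay=0.01,
    max_grad_norm=1.0,
    neftune_noise_alpha=5,
    gradient_checkpointing=True,
    logging_steps=train_cfg["logging_steps"],                              # 10
)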
update_space.py
CHANGED

@@ -31,7 +31,12 @@ def load_env_variables():
        from dotenv import load_dotenv
        env_path = Path(__file__).parent / ".env"
        if env_path.exists():
-
+            # Load and explicitly set environment variables
+            with open(env_path) as f:
+                for line in f:
+                    if line.strip() and not line.startswith('#'):
+                        key, value = line.strip().split('=', 1)
+                        os.environ[key] = value.strip()
            logger.info(f"Loaded environment variables from {env_path}")
        else:
            logger.warning(f"No .env file found at {env_path}")

@@ -53,10 +58,15 @@ def load_env_variables():
        "HF_SPACE_NAME": os.environ.get("HF_SPACE_NAME", "phi4training")
    }

+    # Ensure the space name is set correctly
+    if "HF_SPACE_NAME" not in os.environ:
+        os.environ["HF_SPACE_NAME"] = "phi4training"
+
    missing_vars = [k for k, v in required_vars.items() if not v]
    if missing_vars:
        raise ValueError(f"Missing required environment variables: {', '.join(missing_vars)}")

+    logger.info(f"Using environment variables: USERNAME={required_vars['HF_USERNAME']}, SPACE_NAME={required_vars['HF_SPACE_NAME']}")
    return required_vars

 def verify_configs():

@@ -138,7 +148,7 @@ def create_space(username, space_name):
        # Create new space
        try:
            api.create_repo(
-                repo_id=
+                repo_id=space_id,
                private=False,
                repo_type="space",
                space_sdk="gradio"

@@ -181,8 +191,8 @@ def main():
    update_requirements()
    logger.info("Requirements updated successfully")

-    # Get space name
-    space_name = args.space_name
+    # Get space name from args or env, prioritize args
+    space_name = args.space_name if args.space_name else env_vars["HF_SPACE_NAME"]
    logger.info(f"Using space name: {space_name}")

    # Login to Hugging Face
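In line with the commit title, a minimal sketch of how the Space could then be created and the project folder uploaded with huggingface_hub; the token, username, and folder path are placeholders, not values from this commit:

from huggingface_hub import HfApi

api = HfApi(token="hf_...")              # placeholder token, normally read from .env
space_id = "your-username/phi4training"  # placeholder; built from HF_USERNAME/HF_SPACE_NAME

# Mirrors the create_repo call in the diff; exist_ok avoids failing if the Space already exists.
api.create_repo(
    repo_id=space_id,
    private=False,
    repo_type="space",
    space_sdk="gradio",
    exist_ok=True,
)

# Upload the local project folder to the Space.
api.upload_folder(
    folder_path=".",                     # assumed local folder
    repo_id=space_id,
    repo_type="space",
)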