```yaml
model:
  type: "custom"
  name: "smollm2_transformer"
  tokenizer_name: "gpt2"
  vocab_size: 50257
  hidden_size: 256
  num_attention_heads: 4
  num_key_value_heads: 2
  num_hidden_layers: 6
  intermediate_size: 512
  hidden_act: "gelu"
  max_position_embeddings: 256
  initializer_range: 0.02
  rms_norm_eps: 1.0e-5
  use_cache: true
  pad_token_id: null
```
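The field names in the `model` block line up with a Llama-style decoder config (SmolLM2 is Llama-architecture), with `num_key_value_heads: 2` against `num_attention_heads: 4` giving grouped-query attention. A minimal sketch of wiring these fields into `transformers.LlamaConfig`, assuming the YAML above is saved as `config.yaml` (a hypothetical path):

```python
import yaml
from transformers import LlamaConfig

# "config.yaml" is a hypothetical path for the YAML shown above.
with open("config.yaml") as f:
    cfg = yaml.safe_load(f)["model"]

# num_key_value_heads (2) < num_attention_heads (4): each KV head is
# shared by two query heads (grouped-query attention).
model_config = LlamaConfig(
    vocab_size=cfg["vocab_size"],
    hidden_size=cfg["hidden_size"],
    num_attention_heads=cfg["num_attention_heads"],
    num_key_value_heads=cfg["num_key_value_heads"],
    num_hidden_layers=cfg["num_hidden_layers"],
    intermediate_size=cfg["intermediate_size"],
    hidden_act=cfg["hidden_act"],
    max_position_embeddings=cfg["max_position_embeddings"],
    initializer_range=cfg["initializer_range"],
    rms_norm_eps=cfg["rms_norm_eps"],
    use_cache=cfg["use_cache"],
    pad_token_id=cfg["pad_token_id"],
)
```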
```yaml
optimizer:
  type: "adamW"
  weight_decay: 0.01
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_eps: 1.0e-8
  torch_adam_is_fused: true
  clip_grad: 1.0
  accumulate_grad_in_fp32: true
```
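These keys map onto `torch.optim.AdamW`; `clip_grad` and `accumulate_grad_in_fp32` are training-loop concerns rather than optimizer arguments, so they are omitted below. A sketch with a stand-in model (the real model would come from the config above; `config.yaml` is again a hypothetical path):

```python
import torch
import yaml

with open("config.yaml") as f:          # hypothetical path
    cfg = yaml.safe_load(f)["optimizer"]

device = "cuda" if torch.cuda.is_available() else "cpu"
model = torch.nn.Linear(256, 256, device=device)  # stand-in for the real model

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-3,  # learning_rate from the scheduler/training blocks
    betas=(cfg["adam_beta1"], cfg["adam_beta2"]),
    eps=cfg["adam_eps"],
    weight_decay=cfg["weight_decay"],
    # the fused kernel needs CUDA tensors, so fall back on CPU
    fused=cfg["torch_adam_is_fused"] and torch.cuda.is_available(),
)
```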
```yaml
scheduler:
  type: "one_cycle"
  learning_rate: 0.001
  warmup_steps: 50
  max_lr: 0.001
  pct_start: 0.02
  anneal_strategy: "cos"
  cycle_momentum: false
  div_factor: 25.0
  final_div_factor: 1000.0
```
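With `type: "one_cycle"` these keys map almost one-to-one onto `torch.optim.lr_scheduler.OneCycleLR`. Note that `OneCycleLR` derives its warmup length from `pct_start * total_steps` rather than from `warmup_steps`. A sketch under those assumptions:

```python
import torch
import yaml
from torch.optim.lr_scheduler import OneCycleLR

with open("config.yaml") as f:          # hypothetical path
    full = yaml.safe_load(f)
cfg = full["scheduler"]

model = torch.nn.Linear(256, 256)       # stand-in for the real model
optimizer = torch.optim.AdamW(model.parameters(), lr=cfg["learning_rate"])

scheduler = OneCycleLR(
    optimizer,
    max_lr=cfg["max_lr"],
    total_steps=full["training"]["max_steps"],       # 5050
    pct_start=cfg["pct_start"],                      # warmup fraction of the cycle
    anneal_strategy=cfg["anneal_strategy"],          # cosine annealing
    cycle_momentum=cfg["cycle_momentum"],
    div_factor=cfg["div_factor"],                    # initial_lr = max_lr / 25
    final_div_factor=cfg["final_div_factor"],        # min_lr = initial_lr / 1000
)
```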
```yaml
training:
  output_dir: "./results"
  batch_size: 4
  micro_batch_size: 2
  gradient_accumulation_steps: 2
  sequence_length: 256
  learning_rate: 0.001
  max_steps: 5050                    # Total steps (5000 + 50)
  first_phase_steps: 5000            # Initial training phase
  second_phase_steps: 50             # Fine-tuning phase
  sample_frequency: 100              # Sample every 100 steps in first phase
  second_phase_sample_frequency: 5   # Sample more frequently in second phase
  logging_dir: "./logs"
  logging_steps: 1
  save_steps: 100
  checkpoint_dir: "checkpoints"
  sample_prompt: "Explain what machine learning is:"
  max_generate_length: 50
```
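The effective batch size is `micro_batch_size * gradient_accumulation_steps = 2 * 2 = 4`, matching `batch_size`. The phase keys imply a sampling cadence along the lines of the sketch below (the exact phase-boundary handling is an assumption):

```python
FIRST_PHASE_STEPS = 5000              # training.first_phase_steps
SAMPLE_FREQUENCY = 100                # training.sample_frequency
SECOND_PHASE_SAMPLE_FREQUENCY = 5     # training.second_phase_sample_frequency

def should_sample(step: int) -> bool:
    """Sample every 100 steps for the first 5000 steps, then every 5 steps."""
    if step <= FIRST_PHASE_STEPS:
        return step % SAMPLE_FREQUENCY == 0
    return step % SECOND_PHASE_SAMPLE_FREQUENCY == 0
```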
```yaml
hardware:
  precision: "16-mixed"
  accelerator: "gpu"
  devices: 1
  strategy: "auto"
  gradient_clip: 1.0
  cuda_memory_fraction: 0.9
  allow_tf32: true
  benchmark: true
  deterministic: false
```
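`precision`, `accelerator`, `devices`, and `strategy` read like PyTorch Lightning `Trainer` arguments, while the last four keys correspond to PyTorch backend flags. A sketch under that assumption:

```python
import torch
import yaml
import lightning.pytorch as pl

with open("config.yaml") as f:          # hypothetical path
    hw = yaml.safe_load(f)["hardware"]

# Backend flags corresponding to the last four keys.
torch.backends.cuda.matmul.allow_tf32 = hw["allow_tf32"]
torch.backends.cudnn.allow_tf32 = hw["allow_tf32"]
torch.backends.cudnn.benchmark = hw["benchmark"]
torch.use_deterministic_algorithms(hw["deterministic"])
if torch.cuda.is_available():
    torch.cuda.set_per_process_memory_fraction(hw["cuda_memory_fraction"])

trainer = pl.Trainer(
    precision=hw["precision"],          # fp16 autocast with fp32 master weights
    accelerator=hw["accelerator"],
    devices=hw["devices"],
    strategy=hw["strategy"],
    gradient_clip_val=hw["gradient_clip"],
)
```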
```yaml
data:
  datasets:
    - name: "wikitext"
      path: "wikitext"
      subset: "wikitext-103-raw-v1"
      split_ratio: 0.01   # Use only 1% of the dataset
      weight: 1.0
  loading:
    num_workers: 2
    batch_size: 16
    pin_memory: true
    prefetch_factor: 2
    persistent_workers: true
```
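A sketch of the data block with Hugging Face `datasets` and a PyTorch `DataLoader`: `split_ratio: 0.01` maps naturally onto a 1% slice of the train split, and `weight: 1.0` would only matter when mixing multiple datasets. Tokenization and collation are omitted here:

```python
import yaml
from datasets import load_dataset
from torch.utils.data import DataLoader

with open("config.yaml") as f:          # hypothetical path
    data_cfg = yaml.safe_load(f)["data"]

ds = data_cfg["datasets"][0]
pct = int(ds["split_ratio"] * 100)      # 0.01 -> a 1% slice
dataset = load_dataset(ds["path"], ds["subset"], split=f"train[:{pct}%]")

loading = data_cfg["loading"]
loader = DataLoader(
    dataset,
    batch_size=loading["batch_size"],
    num_workers=loading["num_workers"],
    pin_memory=loading["pin_memory"],            # faster host-to-GPU copies
    prefetch_factor=loading["prefetch_factor"],  # batches pre-fetched per worker
    persistent_workers=loading["persistent_workers"],
)
```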