Initial commit.

Files changed:
- .gitattributes +1 -0
- README.md +85 -3
- SmolVLA_Eng.mp4 +3 -0
- config.json +81 -0
- model.safetensors +3 -0
- train_config.json +193 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+SmolVLA_Eng.mp4 filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,3 +1,85 @@
---
license: apache-2.0
datasets:
- masato-ka/so100_grasp_lego
tags:
- robotics
- act-policy
- lerobot
pipeline_tag: robotics
---

# Model Card for smolvla_block_instruction

SmolVLA trained for block handling with text instructions.

<video src="SmolVLA_Eng.mp4" controls="true" width="640"></video>

## How to Get Started with the Model

See the [LeRobot library](https://github.com/huggingface/lerobot).

We strongly recommend reproducing the environment shown in the video: the built-in camera of a MacBook Air M2 was used, and inference was also run on a MacBook Air M2 (16 GB).
You can run this model with the command below; the instruction is set via the `control.single_task` property.

```bash
python lerobot/scripts/control_robot.py \
  --robot.type=so100 \
  --control.type=record \
  --control.fps=30 \
  --control.single_task="Transfer the blue block onto the yellow plate." \
  --control.repo_id=<YOUR EVAL DATASET> \
  --control.warmup_time_s=5 \
  --control.episode_time_s=60 \
  --control.reset_time_s=10 \
  --control.num_episodes=1 \
  --control.push_to_hub=false \
  --control.policy.path=masato-ka/smolvla_block_instruct \
  --control.display_data=true \
  --control.policy.device=mps
```
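If you prefer to drive the policy from Python rather than through `control_robot.py`, the snippet below is a minimal sketch. It assumes LeRobot's `SmolVLAPolicy` class with the usual `from_pretrained` / `reset` / `select_action` interface and builds a dummy batch from the feature shapes declared in `config.json`; the exact module path and batch keys (in particular the `task` string and the 384-dim `observation.instruction` feature) may differ in your LeRobot version, so treat it as illustrative rather than as the author's procedure.

```python
# Illustrative sketch only; verify the class path and batch keys against your LeRobot version.
import torch
from lerobot.common.policies.smolvla.modeling_smolvla import SmolVLAPolicy  # assumed module path

device = "mps"  # "cuda" or "cpu" also work
policy = SmolVLAPolicy.from_pretrained("masato-ka/smolvla_block_instruct")
policy.to(device)
policy.eval()
policy.reset()  # clear the internal action queue before a new episode

# Dummy observation shaped like the features in config.json (replace with real robot data).
batch = {
    "observation.state": torch.zeros(1, 6, device=device),                   # SO-100 joint state
    "observation.images.front": torch.zeros(1, 3, 480, 640, device=device),  # front camera, CHW in [0, 1]
    "observation.instruction": torch.zeros(1, 384, device=device),           # instruction-embedding feature from config.json
    "task": ["Transfer the blue block onto the yellow plate."],              # language instruction
}

with torch.no_grad():
    action = policy.select_action(batch)  # -> tensor of shape (1, 6)
print(action.shape)
```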

This model was trained with the instructions below.

```text
- Transfer the blue block onto the yellow plate.
- Position the blue block atop the yellow plate.
- Set the blue block down on the yellow plate.
- Place blue block on yellow plate.
- Blue block goes on the yellow plate!
- Put the blue one on the yellow thing.
- Yellow plate for the blue block!
- Completely remove the blue block from the yellow plate.
- The blue block must be taken away from the yellow plate.
- Dislodge the blue block from the yellow plate entirely.
- Get that blue block off the yellow plate!
- Take the blue thing away from the yellow one.
- Blue's gotta go from the yellow plate!
- Remove blue block from yellow plate.
```
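How the 384-dimensional `observation.instruction` feature listed in `config.json` was computed from these phrasings is not documented here. Purely as illustration, a sentence-embedding model with 384-dim output (for example `sentence-transformers/all-MiniLM-L6-v2`, an assumed choice, not one stated by the author) produces vectors of the matching shape:

```python
# Hypothetical illustration: the actual embedding model used for the dataset is not stated in this repo.
from sentence_transformers import SentenceTransformer

encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # 384-dim embeddings (assumed)
instructions = [
    "Transfer the blue block onto the yellow plate.",
    "Remove blue block from yellow plate.",
]
embeddings = encoder.encode(instructions)  # numpy array of shape (2, 384)
print(embeddings.shape)
```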

## Training Details

Trained with [LeRobot@b536f47](https://github.com/huggingface/lerobot/tree/b536f47e3ff8c3b340fc5efa52f0ece0a7212a57).

The model was trained using [LeRobot's training script](https://github.com/huggingface/lerobot/blob/b536f47e3ff8c3b340fc5efa52f0ece0a7212a57/lerobot/scripts/train.py) on the [masato-ka/so100_nlact_block_instruct_v3](https://huggingface.co/datasets/masato-ka/so100_nlact_block_instruct_v3) dataset, with this command:

```bash
python lerobot/scripts/train.py \
  --dataset.repo_id=masato-ka/so100_nlact_block_instruct_v3 \
  --policy.path=lerobot/smolvla_base \
  --batch_size=8 \
  --output_dir=outputs/train/smolvla \
  --job_name=smolvla_exp03 \
  --policy.device=cuda \
  --steps=40000 \
  --save_freq=20000 \
  --wandb.enable=true \
  --wandb.project=smolvla_test
```

Training took about 3 hours on an Nvidia A100.
SmolVLA_Eng.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:565b6f516395efb05137e227c8d1133ebc7ab2be3af0dabbfd450f304050e530
size 11398607
config.json
ADDED
@@ -0,0 +1,81 @@
{
    "type": "smolvla",
    "n_obs_steps": 1,
    "normalization_mapping": {
        "VISUAL": "IDENTITY",
        "STATE": "MEAN_STD",
        "ACTION": "MEAN_STD"
    },
    "input_features": {
        "observation.state": {
            "type": "STATE",
            "shape": [
                6
            ]
        },
        "observation.images.front": {
            "type": "VISUAL",
            "shape": [
                3,
                480,
                640
            ]
        },
        "observation.instruction": {
            "type": "STATE",
            "shape": [
                384
            ]
        }
    },
    "output_features": {
        "action": {
            "type": "ACTION",
            "shape": [
                6
            ]
        }
    },
    "device": "cuda",
    "use_amp": false,
    "chunk_size": 50,
    "n_action_steps": 50,
    "max_state_dim": 32,
    "max_action_dim": 32,
    "resize_imgs_with_padding": [
        512,
        512
    ],
    "empty_cameras": 0,
    "adapt_to_pi_aloha": false,
    "use_delta_joint_actions_aloha": false,
    "tokenizer_max_length": 48,
    "num_steps": 10,
    "use_cache": true,
    "freeze_vision_encoder": true,
    "train_expert_only": true,
    "train_state_proj": true,
    "optimizer_lr": 0.0001,
    "optimizer_betas": [
        0.9,
        0.95
    ],
    "optimizer_eps": 1e-08,
    "optimizer_weight_decay": 1e-10,
    "optimizer_grad_clip_norm": 10.0,
    "scheduler_warmup_steps": 1000,
    "scheduler_decay_steps": 30000,
    "scheduler_decay_lr": 2.5e-06,
    "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
    "load_vlm_weights": true,
    "add_image_special_tokens": false,
    "attention_mode": "cross_attn",
    "prefix_length": 0,
    "pad_language_to": "max_length",
    "num_expert_layers": 0,
    "num_vlm_layers": 16,
    "self_attn_every_n_layers": 2,
    "expert_width_multiplier": 0.75,
    "min_period": 0.004,
    "max_period": 4.0
}
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:51be8ded65145ee5070b3660d6a40b3730aa6d00639e01d54ccdad6d4246d2f5
size 906716608
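The `.mp4` and `.safetensors` entries above are Git LFS pointer files, so they only record the SHA-256 and size of the actual payloads. A quick way to confirm a downloaded file matches its pointer is to recompute the hash; the sketch below uses `huggingface_hub` and `hashlib`, with the repo id assumed to be `masato-ka/smolvla_block_instruct` as referenced in the README.

```python
# Check that the downloaded weights match the SHA-256 recorded in the LFS pointer above.
import hashlib
from huggingface_hub import hf_hub_download

path = hf_hub_download("masato-ka/smolvla_block_instruct", "model.safetensors")  # repo id assumed from the README

sha256 = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha256.update(chunk)

expected = "51be8ded65145ee5070b3660d6a40b3730aa6d00639e01d54ccdad6d4246d2f5"
print(sha256.hexdigest() == expected)
```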
train_config.json
ADDED
@@ -0,0 +1,193 @@
{
    "dataset": {
        "repo_id": "masato-ka/so100_nlact_block_instruct_v3",
        "root": null,
        "episodes": null,
        "image_transforms": {
            "enable": false,
            "max_num_transforms": 3,
            "random_order": false,
            "tfs": {
                "brightness": {
                    "weight": 1.0,
                    "type": "ColorJitter",
                    "kwargs": {
                        "brightness": [
                            0.8,
                            1.2
                        ]
                    }
                },
                "contrast": {
                    "weight": 1.0,
                    "type": "ColorJitter",
                    "kwargs": {
                        "contrast": [
                            0.8,
                            1.2
                        ]
                    }
                },
                "saturation": {
                    "weight": 1.0,
                    "type": "ColorJitter",
                    "kwargs": {
                        "saturation": [
                            0.5,
                            1.5
                        ]
                    }
                },
                "hue": {
                    "weight": 1.0,
                    "type": "ColorJitter",
                    "kwargs": {
                        "hue": [
                            -0.05,
                            0.05
                        ]
                    }
                },
                "sharpness": {
                    "weight": 1.0,
                    "type": "SharpnessJitter",
                    "kwargs": {
                        "sharpness": [
                            0.5,
                            1.5
                        ]
                    }
                }
            }
        },
        "revision": null,
        "use_imagenet_stats": true,
        "video_backend": "torchcodec"
    },
    "env": null,
    "policy": {
        "type": "smolvla",
        "n_obs_steps": 1,
        "normalization_mapping": {
            "VISUAL": "IDENTITY",
            "STATE": "MEAN_STD",
            "ACTION": "MEAN_STD"
        },
        "input_features": {
            "observation.state": {
                "type": "STATE",
                "shape": [
                    6
                ]
            },
            "observation.images.front": {
                "type": "VISUAL",
                "shape": [
                    3,
                    480,
                    640
                ]
            },
            "observation.instruction": {
                "type": "STATE",
                "shape": [
                    384
                ]
            }
        },
        "output_features": {
            "action": {
                "type": "ACTION",
                "shape": [
                    6
                ]
            }
        },
        "device": "cuda",
        "use_amp": false,
        "chunk_size": 50,
        "n_action_steps": 1,
        "max_state_dim": 32,
        "max_action_dim": 32,
        "resize_imgs_with_padding": [
            512,
            512
        ],
        "empty_cameras": 0,
        "adapt_to_pi_aloha": false,
        "use_delta_joint_actions_aloha": false,
        "tokenizer_max_length": 48,
        "num_steps": 10,
        "use_cache": true,
        "freeze_vision_encoder": true,
        "train_expert_only": true,
        "train_state_proj": true,
        "optimizer_lr": 0.0001,
        "optimizer_betas": [
            0.9,
            0.95
        ],
        "optimizer_eps": 1e-08,
        "optimizer_weight_decay": 1e-10,
        "optimizer_grad_clip_norm": 10.0,
        "scheduler_warmup_steps": 1000,
        "scheduler_decay_steps": 30000,
        "scheduler_decay_lr": 2.5e-06,
        "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
        "load_vlm_weights": true,
        "add_image_special_tokens": false,
        "attention_mode": "cross_attn",
        "prefix_length": 0,
        "pad_language_to": "max_length",
        "num_expert_layers": 0,
        "num_vlm_layers": 16,
        "self_attn_every_n_layers": 2,
        "expert_width_multiplier": 0.75,
        "min_period": 0.004,
        "max_period": 4.0
    },
    "output_dir": "outputs/train/smolvla",
    "job_name": "smolvla_exp03",
    "resume": false,
    "seed": 1000,
    "num_workers": 4,
    "batch_size": 8,
    "steps": 100000,
    "eval_freq": 20000,
    "log_freq": 200,
    "save_checkpoint": true,
    "save_freq": 20000,
    "use_policy_training_preset": true,
    "optimizer": {
        "type": "adamw",
        "lr": 0.0001,
        "weight_decay": 1e-10,
        "grad_clip_norm": 10.0,
        "betas": [
            0.9,
            0.95
        ],
        "eps": 1e-08
    },
    "scheduler": {
        "type": "cosine_decay_with_warmup",
        "num_warmup_steps": 1000,
        "num_decay_steps": 30000,
        "peak_lr": 0.0001,
        "decay_lr": 2.5e-06
    },
    "eval": {
        "n_episodes": 50,
        "batch_size": 50,
        "use_async_envs": false
    },
    "wandb": {
        "enable": true,
        "disable_artifact": false,
        "project": "smolvla_test",
        "entity": null,
        "notes": null,
        "run_id": null,
        "mode": null
    }
}