#include <torch/extension.h>
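
// NOTE: the multi_tensor_* entry points below share the same leading
// arguments. `chunk_size` is the number of elements each kernel chunk
// covers, `noop_flag` is a one-element int tensor on the GPU that, once
// nonzero (e.g. after an overflow check trips), presumably makes the
// kernels skip further work, and `tensor_lists` groups the operands, one
// inner list per operand role. See the kernel implementations for the
// authoritative semantics.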

void multi_tensor_scale_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  float scale);

void multi_tensor_sgd_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  float wd,
  float momentum,
  float dampening,
  float lr,
  bool nesterov,
  bool first_run,
  bool wd_after_momentum,
  float scale);
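
// NOTE: `first_run` and `wd_after_momentum` are undocumented here; judging
// by their names, `first_run` presumably initializes the momentum buffers
// and `wd_after_momentum` selects whether weight decay is applied after,
// rather than before, the momentum update.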

void multi_tensor_axpby_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  float a,
  float b,
  int arg_to_check);
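
// NOTE: the four L2-norm variants below return a (total_norm,
// per_tensor_norms) tuple; when `per_tensor_python` is false or absent,
// the per-tensor result is presumably left empty. This is inferred from
// the signatures, not from in-source documentation.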

std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  at::optional<bool> per_tensor_python);

std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_mp_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  at::optional<bool> per_tensor_python);

std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_scale_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  float scale,
  at::optional<bool> per_tensor_python);

std::tuple<at::Tensor, at::Tensor> multi_tensor_unscale_l2norm_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  at::Tensor inv_scale,
  at::optional<bool> per_tensor_python);

void multi_tensor_lamb_stage1_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  at::Tensor per_tensor_decay,
  const int step,
  const float beta1,
  const float beta2,
  const float epsilon,
  at::Tensor global_grad_norm,
  const float max_global_grad_norm);

void multi_tensor_lamb_stage2_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  at::Tensor per_tensor_param_norm,
  at::Tensor per_tensor_update_norm,
  const float lr,
  const float weight_decay,
  at::optional<bool> use_nvlamb_python);
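
// NOTE: in the Adam-family functions below, `mode` is an int rather than a
// bool; in comparable fused-Adam implementations it selects between
// L2-regularization-style weight decay and decoupled (AdamW-style) decay.
// That mapping is an assumption, as it is not documented here.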

void multi_tensor_adam_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  const float lr,
  const float beta1,
  const float beta2,
  const float epsilon,
  const int step,
  const int mode,
  const int bias_correction,
  const float weight_decay);
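
// NOTE: the "capturable" variants take `lr` and `step` as GPU tensors
// instead of host scalars so the whole update can be recorded into a CUDA
// graph and replayed while the scheduler changes them (per the binding
// docstrings below); `inv_scale` presumably folds gradient unscaling into
// the same kernel.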

void multi_tensor_adam_capturable_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  at::Tensor lr,
  const float beta1,
  const float beta2,
  const float epsilon,
  at::Tensor step,
  const int mode,
  const int bias_correction,
  const float weight_decay,
  at::Tensor inv_scale);

void multi_tensor_adam_capturable_master_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  at::Tensor lr,
  const float beta1,
  const float beta2,
  const float epsilon,
  at::Tensor step,
  const int mode,
  const int bias_correction,
  const float weight_decay,
  at::Tensor inv_scale);

void multi_tensor_adagrad_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  const float lr,
  const float epsilon,
  const int mode,
  const float weight_decay);

void multi_tensor_novograd_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  at::Tensor grad_norms,
  const float lr,
  const float beta1,
  const float beta2,
  const float epsilon,
  const int step,
  const int bias_correction,
  const float weight_decay,
  const int grad_averaging,
  const int mode,
  const int norm_type);

void multi_tensor_lamb_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  const float lr,
  const float beta1,
  const float beta2,
  const float epsilon,
  const int step,
  const int bias_correction,
  const float weight_decay,
  const int grad_averaging,
  const int mode,
  at::Tensor global_grad_norm,
  const float max_grad_norm,
  at::optional<bool> use_nvlamb_python);
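
// NOTE: the mixed-precision variant below moves `lr`, `step`, and
// `max_grad_norm` to GPU tensors (again for CUDA-graph friendliness) and
// additionally takes `found_inf` and `inv_scale`, presumably the overflow
// flag and inverse loss scale produced by a gradient scaler.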

void multi_tensor_lamb_mp_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  at::Tensor lr,
  const float beta1,
  const float beta2,
  const float epsilon,
  at::Tensor step,
  const int bias_correction,
  const float weight_decay,
  const int grad_averaging,
  const int mode,
  at::Tensor global_grad_norm,
  at::Tensor max_grad_norm,
  at::optional<bool> use_nvlamb_python,
  at::Tensor found_inf,
  at::Tensor inv_scale);
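
// NOTE: a sketch of the intended semantics, inferred from the parameter
// names rather than a spec: `update_scale_hysteresis_cuda` grows
// `current_scale` by `growth_factor` after `growth_interval` consecutive
// overflow-free steps, and on overflow backs it off by `backoff_factor`
// only once `hysteresis` consecutive overflowing steps have been observed.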

at::Tensor update_scale_hysteresis_cuda(
  at::Tensor current_scale,
  at::Tensor growth_tracker,
  at::Tensor hysteresis_tracker,
  at::Tensor found_inf,
  const double growth_factor,
  const double backoff_factor,
  const int64_t growth_interval,
  const int hysteresis);
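
// Hedged usage sketch from the Python side. It assumes the extension is
// built under the name "amp_C" (as in NVIDIA Apex); the tensor names,
// loss scale, and chunk size are illustrative only.
//
//   import torch
//   import amp_C
//
//   overflow_buf = torch.zeros(1, dtype=torch.int, device="cuda")
//   # Scale fp16 grads into fp32 master grads by 1/loss_scale; the kernel
//   # presumably sets overflow_buf to nonzero if an inf/nan is encountered.
//   amp_C.multi_tensor_scale(65536,
//                            overflow_buf,
//                            [fp16_grads, fp32_master_grads],
//                            1.0 / loss_scale)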

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("multi_tensor_scale", &multi_tensor_scale_cuda,
        "Fused overflow check + scale for a list of contiguous tensors");
  m.def("multi_tensor_sgd", &multi_tensor_sgd_cuda,
        "Fused SGD optimizer for a list of contiguous tensors");
  m.def("multi_tensor_axpby", &multi_tensor_axpby_cuda,
        "out = a*x + b*y for a list of contiguous tensors");
  m.def("multi_tensor_l2norm", &multi_tensor_l2norm_cuda,
        "Computes L2 norm for a list of contiguous tensors");
  m.def("multi_tensor_l2norm_mp", &multi_tensor_l2norm_mp_cuda,
        "Computes L2 norm for a list of contiguous tensors");
  m.def("multi_tensor_l2norm_scale", &multi_tensor_l2norm_scale_cuda,
        "Computes L2 norm for a list of contiguous tensors and does scaling");
  m.def("multi_tensor_unscale_l2norm", &multi_tensor_unscale_l2norm_cuda,
        "Computes L2 norm for a list of contiguous tensors after unscaling (unscaling is only performed for the L2 norm computation; the tensors themselves are not updated)");
  m.def("multi_tensor_lamb_stage1_cuda", &multi_tensor_lamb_stage1_cuda,
        "Computes update part of LAMB optimizer");
  m.def("multi_tensor_lamb_stage2_cuda", &multi_tensor_lamb_stage2_cuda,
        "Completes application of gradient to parameters for LAMB optimizer");
  m.def("multi_tensor_adam", &multi_tensor_adam_cuda,
        "Compute and apply gradient update to parameters for Adam optimizer");
  m.def("multi_tensor_adam_capturable", &multi_tensor_adam_capturable_cuda,
        "Compute and apply gradient update to parameters for Adam optimizer with CUDA graph support and LR scheduling");
  m.def("multi_tensor_adam_capturable_master", &multi_tensor_adam_capturable_master_cuda,
        "Compute and apply gradient update to parameters for Adam optimizer with CUDA graph support, LR scheduling and FP32 master weights");
  m.def("multi_tensor_adagrad", &multi_tensor_adagrad_cuda,
        "Compute and apply gradient update to parameters for Adagrad optimizer");
  m.def("multi_tensor_novograd", &multi_tensor_novograd_cuda,
        "Compute and apply gradient update to parameters for NovoGrad optimizer");
  m.def("multi_tensor_lamb", &multi_tensor_lamb_cuda,
        "Computes and applies update for LAMB optimizer");
  m.def("multi_tensor_lamb_mp", &multi_tensor_lamb_mp_cuda,
        "Computes and applies update for LAMB optimizer");
  m.def("update_scale_hysteresis", &update_scale_hysteresis_cuda,
        "Updates scale while accounting for hysteresis");
}