import torch
import gmm_transformer as gmm_model

def load_model(
    n_components=6,
    hidden_d=24 * 4,
    out_d=24,
    n_heads=4,
    mlp_ratio=8,
    n_blocks=6,
    encoder_path=r'_encoder_25_4537398.pth',
    path_para=r'_embedding_25_4537398.pth',
    path_token=r'_emb_empty_token_25_4537398.pth',
    random_sample_num=None
):
    """Build the encoder, restore its weights, and load two companion checkpoints.

    The encoder is a ``gmm_model.ViT_encodernopara`` constructed with the input
    shape ``(1, random_sample_num, 25)`` and placed on CUDA when available,
    otherwise on the CPU. The number of trainable parameters is printed before
    any checkpoint file is read.

    Args:
        n_components: Accepted for interface compatibility; not used in this
            function.
        hidden_d: Forwarded to ``ViT_encodernopara`` (second positional arg).
        out_d: Forwarded to ``ViT_encodernopara`` (third positional arg).
        n_heads: Forwarded to ``ViT_encodernopara`` (fourth positional arg).
        mlp_ratio: Forwarded to ``ViT_encodernopara`` (fifth positional arg).
        n_blocks: Forwarded to ``ViT_encodernopara`` (sixth positional arg).
        encoder_path: Checkpoint file passed to ``encoder.load_state_dict``.
        path_para: Checkpoint file loaded with ``torch.load`` and returned
            as-is.
        path_token: Checkpoint file loaded with ``torch.load`` and returned
            as-is.
        random_sample_num: Middle entry of the shape tuple handed to the
            encoder. The default ``None`` is forwarded unchanged.
            NOTE(review): whether the encoder accepts ``None`` here is not
            visible from this function -- confirm against ``gmm_transformer``.

    Returns:
        A 3-tuple ``(encoder, state_dict_para, state_dict_token)``: the encoder
        with its weights restored, followed by the objects read from
        ``path_para`` and ``path_token``.
    """
    input_shape = (1, random_sample_num, 25)

    # Prefer the GPU when one is visible to PyTorch; fall back to the CPU.
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device_name)

    # Construct the encoder first, then move it onto the chosen device.
    encoder = gmm_model.ViT_encodernopara(
        input_shape, hidden_d, out_d, n_heads, mlp_ratio, n_blocks
    )
    encoder = encoder.to(device)

    # Report the size of the model, counting trainable parameters only.
    trainable_count = 0
    for weight in encoder.parameters():
        if weight.requires_grad:
            trainable_count += weight.numel()
    print('Number of parameters of encoder:', trainable_count)

    # NOTE(review): torch.load may unpickle arbitrary objects depending on the
    # installed PyTorch version's ``weights_only`` default -- only point these
    # paths at trusted checkpoint files.
    encoder_weights = torch.load(encoder_path, map_location=device)
    encoder.load_state_dict(encoder_weights)

    # The two companion checkpoints are returned exactly as loaded.
    state_dict_para = torch.load(path_para, map_location=device)
    state_dict_token = torch.load(path_token, map_location=device)

    return encoder, state_dict_para, state_dict_token