# Convert a SAM model checkpoint to a ggml compatible file
#
import sys
import struct

import numpy as np
import torch

if len(sys.argv) < 3:
    print("Usage: convert-pth-to-ggml.py file-model dir-output [ftype]\n")
    print("  ftype == 0 -> float32")
    print("  ftype == 1 -> float16")
    sys.exit(1)
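
# Example invocation (the checkpoint filename is illustrative - use whichever
# SAM .pth checkpoint you downloaded):
#
#   python convert-pth-to-ggml.py sam_vit_b_01ec64.pth . 1
#
# which writes ./ggml-model-f16.bin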
# output in the same directory as the model
fname_model = sys.argv[1]
dir_out = sys.argv[2]
fname_out = dir_out + "/ggml-model.bin"
# possible data types
# ftype == 0 -> float32
# ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]
ftype = 1

if len(sys.argv) > 3:
    ftype = int(sys.argv[3])
    if ftype < 0 or ftype > 1:
        print("Invalid ftype: " + str(ftype))
        sys.exit(1)

    fname_out = fname_out.replace(".bin", "-" + ftype_str[ftype] + ".bin")

# Default params are set to sam_vit_b checkpoint
n_enc_state = 768
n_enc_layers = 12
n_enc_heads = 12
n_enc_out_chans = 256
n_pt_embd = 4
model = torch.load(fname_model, map_location="cpu")
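
# note: the load above assumes the checkpoint is a plain state dict mapping
# tensor names to tensors, which is how the official SAM .pth files are
# distributed; the loops below iterate that dict directly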

# infer the model variant from the width of the first encoder block
for k, v in model.items():
    if k == "image_encoder.blocks.0.norm1.weight":
        n_enc_state = v.shape[0]
        break

if n_enc_state == 1024: # sam_vit_l
    n_enc_layers = 24
    n_enc_heads  = 16
elif n_enc_state == 1280: # sam_vit_h
    n_enc_layers = 32
    n_enc_heads  = 16
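
# known SAM ViT variants, keyed on the encoder width detected above:
#   sam_vit_b: n_enc_state  768, 12 layers, 12 heads (the defaults)
#   sam_vit_l: n_enc_state 1024, 24 layers, 16 heads
#   sam_vit_h: n_enc_state 1280, 32 layers, 16 heads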

hparams = {
    "n_enc_state":     n_enc_state,
    "n_enc_layers":    n_enc_layers,
    "n_enc_heads":     n_enc_heads,
    "n_enc_out_chans": n_enc_out_chans,
    "n_pt_embd":       n_pt_embd,
}
print(hparams)

# dump the full tensor list once for reference
for k, v in model.items():
    print(k, v.shape)

fout = open(fname_out, "wb")
fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
fout.write(struct.pack("i", hparams["n_enc_state"]))
fout.write(struct.pack("i", hparams["n_enc_layers"]))
fout.write(struct.pack("i", hparams["n_enc_heads"]))
fout.write(struct.pack("i", hparams["n_enc_out_chans"]))
fout.write(struct.pack("i", hparams["n_pt_embd"]))
fout.write(struct.pack("i", ftype))
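
# header layout written above, in order (struct.pack("i") = native int32,
# little-endian on typical x86/ARM machines):
#   magic 0x67676d6c ("ggml"), n_enc_state, n_enc_layers, n_enc_heads,
#   n_enc_out_chans, n_pt_embd, ftype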

for k, v in model.items():
    name = k
    shape = v.shape

    # tensors under prompt_encoder.mask* are not converted
    if name.startswith("prompt_encoder.mask"):
        continue

    print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)

    data = v.numpy()
    n_dims = len(data.shape)

    dshape = data.shape

    # default type is fp16
    ftype_cur = 1
    if ftype == 0 or n_dims == 1 or \
            name == "image_encoder.pos_embed" or \
            name.startswith("prompt_encoder") or \
            name.startswith("mask_decoder.iou_token") or \
            name.startswith("mask_decoder.mask_tokens"):
        print("  Converting to float32")
        data = data.astype(np.float32)
        ftype_cur = 0
    else:
        print("  Converting to float16")
        data = data.astype(np.float16)
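
    # note: 1-D tensors (biases, norms) and the embedding/token tensors listed
    # above are always kept in f32; only the remaining weights become f16
    # when ftype == 1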

    # reshape the 1D bias into a 4D tensor so we can use ggml_repeat
    # keep it in F32 since the data is small
    if name == "image_encoder.patch_embed.proj.bias":
        data = data.reshape(1, data.shape[0], 1, 1)
        n_dims = len(data.shape)
        dshape = data.shape
        print("  New shape: ", dshape)
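
    # per-tensor record layout (written below): int32 n_dims, int32 name
    # length, int32 ftype, the n_dims sizes in reverse (ggml) order, the
    # utf-8 name bytes, then the raw tensor data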

    # header
    name_bytes = name.encode('utf-8')
    fout.write(struct.pack("iii", n_dims, len(name_bytes), ftype_cur))
    for i in range(n_dims):
        fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
    fout.write(name_bytes)

    # data
    data.tofile(fout)

fout.close()
print("Done. Output file: " + fname_out)
print("")