Arrcttacsrks committed on
Commit
4839697
·
verified ·
1 Parent(s): 0ad6c07

Upload llama.cpp/ggml/src/ggml-alloc.c with huggingface_hub

Browse files
Files changed (1) hide show
  1. llama.cpp/ggml/src/ggml-alloc.c +1040 -0
llama.cpp/ggml/src/ggml-alloc.c ADDED
@@ -0,0 +1,1040 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "ggml-alloc.h"
2
+ #include "ggml-backend-impl.h"
3
+ #include "ggml.h"
4
+ #include "ggml-impl.h"
5
+ #include <assert.h>
6
+ #include <limits.h>
7
+ #include <stdarg.h>
8
+ #include <stdio.h>
9
+ #include <stdlib.h>
10
+ #include <string.h>
11
+
12
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
13
+ #define MAX_FREE_BLOCKS 256
14
+
15
+ //#define GGML_ALLOCATOR_DEBUG
16
+
17
+ //#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
18
+ #define AT_PRINTF(...)
19
+
20
+
21
+ static bool ggml_is_view(const struct ggml_tensor * t) {
22
+ return t->view_src != NULL;
23
+ }
24
+
25
+ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
26
+ if (a->type != b->type) {
27
+ return false;
28
+ }
29
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
30
+ if (a->ne[i] != b->ne[i]) {
31
+ return false;
32
+ }
33
+ if (a->nb[i] != b->nb[i]) {
34
+ return false;
35
+ }
36
+ }
37
+ return true;
38
+ }
39
+
40
+ static bool ggml_op_can_inplace(enum ggml_op op) {
41
+ switch (op) {
42
+ case GGML_OP_SCALE:
43
+ case GGML_OP_DIAG_MASK_ZERO:
44
+ case GGML_OP_DIAG_MASK_INF:
45
+ case GGML_OP_ADD:
46
+ case GGML_OP_ADD1:
47
+ case GGML_OP_SUB:
48
+ case GGML_OP_MUL:
49
+ case GGML_OP_DIV:
50
+ case GGML_OP_SQR:
51
+ case GGML_OP_SQRT:
52
+ case GGML_OP_LOG:
53
+ case GGML_OP_UNARY:
54
+ case GGML_OP_ROPE:
55
+ case GGML_OP_RMS_NORM:
56
+ case GGML_OP_SOFT_MAX:
57
+ return true;
58
+
59
+ default:
60
+ return false;
61
+ }
62
+ }
63
+
64
// Rounds `offset` up to the nearest value for which the address
// `buffer + offset` is a multiple of `alignment`, and returns that offset.
// `alignment` must be a non-zero power of two (checked by assert).
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
    assert(alignment && !(alignment & (alignment - 1))); // power of 2
    const uintptr_t addr      = (uintptr_t)buffer + offset;
    const size_t    remainder = (size_t)(addr % alignment);
    // bytes needed to reach the next aligned address (0 when already aligned)
    const size_t    padding   = remainder == 0 ? 0 : alignment - remainder;
    return offset + padding;
}
69
+
70
+ // tallocr
71
+
72
+ struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
73
+ void * base = ggml_backend_buffer_get_base(buffer);
74
+ size_t align = ggml_backend_buffer_get_alignment(buffer);
75
+
76
+ assert(align && !(align & (align - 1))); // power of 2
77
+
78
+ struct ggml_tallocr talloc = (struct ggml_tallocr) {
79
+ /*.buffer = */ buffer,
80
+ /*.base = */ base,
81
+ /*.alignment = */ align,
82
+ /*.offset = */ aligned_offset(base, 0, align),
83
+ };
84
+ return talloc;
85
+ }
86
+
87
+ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
88
+ size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
89
+ size = GGML_PAD(size, talloc->alignment);
90
+
91
+ if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
92
+ GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
93
+ __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
94
+ GGML_ABORT("not enough space in the buffer");
95
+ }
96
+
97
+ void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
98
+ talloc->offset += size;
99
+
100
+ assert(((uintptr_t)addr % talloc->alignment) == 0);
101
+
102
+ ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
103
+ }
104
+
105
+ // dynamic tensor allocator
106
+
107
// One contiguous free region tracked by the dynamic tensor allocator.
struct free_block {
    size_t offset; // start of the region, as an offset from the buffer start
    size_t size;   // length of the region in bytes
};
111
+
112
+ struct ggml_dyn_tallocr {
113
+ size_t alignment;
114
+ int n_free_blocks;
115
+ struct free_block free_blocks[MAX_FREE_BLOCKS];
116
+ size_t max_size;
117
+
118
+ #ifdef GGML_ALLOCATOR_DEBUG
119
+ struct {
120
+ const struct ggml_tensor * tensor;
121
+ size_t offset;
122
+ } allocated_tensors[1024];
123
+ #endif
124
+ };
125
+
126
+ #ifdef GGML_ALLOCATOR_DEBUG
127
+ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
128
+ for (int i = 0; i < 1024; i++) {
129
+ if (alloc->allocated_tensors[i].tensor == NULL) {
130
+ alloc->allocated_tensors[i].tensor = tensor;
131
+ alloc->allocated_tensors[i].offset = offset;
132
+ return;
133
+ }
134
+ }
135
+ GGML_ABORT("out of allocated_tensors");
136
+ }
137
+ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
138
+ for (int i = 0; i < 1024; i++) {
139
+ if (alloc->allocated_tensors[i].offset == offset) {
140
+ alloc->allocated_tensors[i].tensor = NULL;
141
+ return;
142
+ }
143
+ }
144
+ GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
145
+ }
146
+ #endif
147
+
148
+ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
149
+ size = aligned_offset(NULL, size, alloc->alignment);
150
+
151
+ AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
152
+
153
+ size_t max_avail = 0;
154
+
155
+ // find the best fitting free block besides the last block
156
+ int best_fit_block = -1;
157
+ size_t best_fit_size = SIZE_MAX;
158
+ for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
159
+ struct free_block * block = &alloc->free_blocks[i];
160
+ max_avail = MAX(max_avail, block->size);
161
+ if (block->size >= size && block->size <= best_fit_size) {
162
+ best_fit_block = i;
163
+ best_fit_size = block->size;
164
+ }
165
+ }
166
+
167
+ if (best_fit_block == -1) {
168
+ // the last block is our last resort
169
+ struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
170
+ max_avail = MAX(max_avail, block->size);
171
+ if (block->size >= size) {
172
+ best_fit_block = alloc->n_free_blocks - 1;
173
+ } else {
174
+ // this should never happen
175
+ GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
176
+ __func__, size, max_avail);
177
+ GGML_ABORT("not enough space in the buffer");
178
+ }
179
+ }
180
+
181
+ struct free_block * block = &alloc->free_blocks[best_fit_block];
182
+ size_t offset = block->offset;
183
+ block->offset = offset + size;
184
+ block->size -= size;
185
+ if (block->size == 0) {
186
+ // remove block if empty
187
+ alloc->n_free_blocks--;
188
+ for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
189
+ alloc->free_blocks[j] = alloc->free_blocks[j+1];
190
+ }
191
+ }
192
+
193
+ AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
194
+
195
+ #ifdef GGML_ALLOCATOR_DEBUG
196
+ add_allocated_tensor(alloc, offset, tensor);
197
+ size_t cur_max = offset + size;
198
+ if (cur_max > alloc->max_size) {
199
+ // sort allocated_tensors by offset
200
+ for (int i = 0; i < 1024; i++) {
201
+ for (int j = i + 1; j < 1024; j++) {
202
+ if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
203
+ const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
204
+ size_t tmp_offset = alloc->allocated_tensors[i].offset;
205
+ alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
206
+ alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
207
+ alloc->allocated_tensors[j].tensor = tmp_tensor;
208
+ alloc->allocated_tensors[j].offset = tmp_offset;
209
+ }
210
+ }
211
+ }
212
+ GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
213
+ for (int i = 0; i < 1024; i++) {
214
+ if (alloc->allocated_tensors[i].tensor) {
215
+ GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
216
+ alloc->allocated_tensors[i].offset,
217
+ alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
218
+ ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
219
+ }
220
+ }
221
+ GGML_LOG_DEBUG("\n");
222
+ }
223
+ #endif
224
+
225
+ alloc->max_size = MAX(alloc->max_size, offset + size);
226
+
227
+ return offset;
228
+
229
+ GGML_UNUSED(tensor);
230
+ }
231
+
232
+ // this is a very naive implementation, but for our case the number of free blocks should be very small
233
+ static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
234
+ size = aligned_offset(NULL, size, alloc->alignment);
235
+
236
+ AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);
237
+
238
+ #ifdef GGML_ALLOCATOR_DEBUG
239
+ remove_allocated_tensor(alloc, offset, tensor);
240
+ #endif
241
+
242
+ // see if we can merge with an existing block
243
+ for (int i = 0; i < alloc->n_free_blocks; i++) {
244
+ struct free_block * block = &alloc->free_blocks[i];
245
+ // check if ptr is at the end of the block
246
+ if (block->offset + block->size == offset) {
247
+ block->size += size;
248
+ // check if we can merge with the next block
249
+ if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
250
+ block->size += alloc->free_blocks[i+1].size;
251
+ alloc->n_free_blocks--;
252
+ for (int j = i+1; j < alloc->n_free_blocks; j++) {
253
+ alloc->free_blocks[j] = alloc->free_blocks[j+1];
254
+ }
255
+ }
256
+ return;
257
+ }
258
+ // check if ptr is at the beginning of the block
259
+ if (offset + size == block->offset) {
260
+ block->offset = offset;
261
+ block->size += size;
262
+ // check if we can merge with the previous block
263
+ if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
264
+ alloc->free_blocks[i-1].size += block->size;
265
+ alloc->n_free_blocks--;
266
+ for (int j = i; j < alloc->n_free_blocks; j++) {
267
+ alloc->free_blocks[j] = alloc->free_blocks[j+1];
268
+ }
269
+ }
270
+ return;
271
+ }
272
+ }
273
+ // otherwise, add a new block
274
+ GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
275
+ // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
276
+ int insert_pos = 0;
277
+ while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
278
+ insert_pos++;
279
+ }
280
+ // shift all blocks from insert_pos onward to make room for the new block
281
+ for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
282
+ alloc->free_blocks[i] = alloc->free_blocks[i-1];
283
+ }
284
+ // insert the new block
285
+ alloc->free_blocks[insert_pos].offset = offset;
286
+ alloc->free_blocks[insert_pos].size = size;
287
+ alloc->n_free_blocks++;
288
+
289
+ GGML_UNUSED(tensor);
290
+ }
291
+
292
+ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
293
+ alloc->n_free_blocks = 1;
294
+ alloc->free_blocks[0].offset = 0;
295
+ alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
296
+ alloc->max_size = 0;
297
+
298
+ #ifdef GGML_ALLOCATOR_DEBUG
299
+ for (int i = 0; i < 1024; i++) {
300
+ alloc->allocated_tensors[i].tensor = NULL;
301
+ }
302
+ #endif
303
+ }
304
+
305
+ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
306
+ struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
307
+
308
+ *alloc = (struct ggml_dyn_tallocr) {
309
+ /*.alignment = */ alignment,
310
+ /*.n_free_blocks = */ 0,
311
+ /*.free_blocks = */ {{0}},
312
+ /*.max_size = */ 0,
313
+ #ifdef GGML_ALLOCATOR_DEBUG
314
+ /*.allocated_tensors = */ {{0}},
315
+ #endif
316
+ };
317
+
318
+ ggml_dyn_tallocr_reset(alloc);
319
+
320
+ return alloc;
321
+ }
322
+
323
// Releases an allocator created by ggml_dyn_tallocr_new.
static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
    void * mem = alloc;
    free(mem);
}
326
+
327
+ static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
328
+ return alloc->max_size;
329
+ }
330
+
331
+
332
+ /////////////////////////////////////
333
+
334
+ // graph allocator
335
+
336
// Per-tensor bookkeeping used while planning a graph allocation; stored in
// ggml_gallocr.hash_values at the tensor's slot in the hash set.
struct hash_node {
    int n_children; // graph nodes that still have to consume this tensor as a source
    int n_views;    // views of this tensor that are still in use
    int buffer_id;  // index of the buffer the tensor was placed in
    size_t offset;  // offset within the buffer
    bool allocated; // true while the tensor holds memory assigned by this allocator
};
343
+
344
// Placement recorded for one tensor by ggml_gallocr_reserve_n.
// Tensors that are views or already have data are recorded as
// buffer_id = -1, offset = SIZE_MAX, size_max = 0.
struct tensor_alloc {
    int buffer_id;   // index into the allocator's buffers, or -1
    size_t offset;   // offset within that buffer, or SIZE_MAX
    size_t size_max; // 0 = pre-allocated, unused, or view
};
349
+
350
+ struct leaf_alloc {
351
+ struct tensor_alloc leaf;
352
+ };
353
+
354
+ struct node_alloc {
355
+ struct tensor_alloc dst;
356
+ struct tensor_alloc src[GGML_MAX_SRC];
357
+ };
358
+
359
+ struct ggml_gallocr {
360
+ ggml_backend_buffer_type_t * bufts; // [n_buffers]
361
+ ggml_backend_buffer_t * buffers; // [n_buffers]
362
+ struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
363
+ int n_buffers;
364
+
365
+ struct ggml_hash_set hash_set;
366
+ struct hash_node * hash_values; // [hash_set.size]
367
+
368
+ struct node_alloc * node_allocs; // [n_nodes]
369
+ int n_nodes;
370
+
371
+ struct leaf_alloc * leaf_allocs; // [n_leafs]
372
+ int n_leafs;
373
+ };
374
+
375
+ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
376
+ ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
377
+ GGML_ASSERT(galloc != NULL);
378
+
379
+ galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
380
+ GGML_ASSERT(galloc->bufts != NULL);
381
+
382
+ galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
383
+ GGML_ASSERT(galloc->buffers != NULL);
384
+
385
+ galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
386
+ GGML_ASSERT(galloc->buf_tallocs != NULL);
387
+
388
+ for (int i = 0; i < n_bufs; i++) {
389
+ galloc->bufts[i] = bufts[i];
390
+ galloc->buffers[i] = NULL;
391
+
392
+ // check if the same buffer type is used multiple times and reuse the same allocator
393
+ for (int j = 0; j < i; j++) {
394
+ if (bufts[i] == bufts[j]) {
395
+ galloc->buf_tallocs[i] = galloc->buf_tallocs[j];
396
+ break;
397
+ }
398
+ }
399
+
400
+ if (galloc->buf_tallocs[i] == NULL) {
401
+ size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
402
+ galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
403
+ }
404
+ }
405
+ galloc->n_buffers = n_bufs;
406
+
407
+ return galloc;
408
+ }
409
+
410
+ ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) {
411
+ return ggml_gallocr_new_n(&buft, 1);
412
+ }
413
+
414
+ void ggml_gallocr_free(ggml_gallocr_t galloc) {
415
+ if (galloc == NULL) {
416
+ return;
417
+ }
418
+
419
+ for (int i = 0; i < galloc->n_buffers; i++) {
420
+ if (galloc->buffers != NULL) {
421
+ // skip if already freed
422
+ bool freed = false;
423
+ for (int j = 0; j < i; j++) {
424
+ if (galloc->buffers[j] == galloc->buffers[i]) {
425
+ freed = true;
426
+ break;
427
+ }
428
+ }
429
+ if (!freed) {
430
+ ggml_backend_buffer_free(galloc->buffers[i]);
431
+ }
432
+ }
433
+ if (galloc->buf_tallocs != NULL) {
434
+ // skip if already freed
435
+ bool freed = false;
436
+ for (int j = 0; j < i; j++) {
437
+ if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
438
+ freed = true;
439
+ break;
440
+ }
441
+ }
442
+ if (!freed) {
443
+ ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
444
+ }
445
+ }
446
+ }
447
+
448
+ ggml_hash_set_free(&galloc->hash_set);
449
+ free(galloc->hash_values);
450
+ free(galloc->bufts);
451
+ free(galloc->buffers);
452
+ free(galloc->buf_tallocs);
453
+ free(galloc->node_allocs);
454
+ free(galloc->leaf_allocs);
455
+ free(galloc);
456
+ }
457
+
458
+ typedef struct ggml_gallocr * ggml_gallocr_t;
459
+
460
+ static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
461
+ size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
462
+ return &galloc->hash_values[i];
463
+ }
464
+
465
+ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
466
+ return ggml_gallocr_hash_get(galloc, t)->allocated;
467
+ }
468
+
469
+ static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
470
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
471
+ hn->buffer_id = buffer_id;
472
+ hn->offset = offset;
473
+ hn->allocated = true;
474
+ }
475
+
476
+ static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
477
+ return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
478
+ }
479
+
480
+ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
481
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
482
+
483
+ if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
484
+ hn->allocated = true;
485
+ assert(hn->offset == 0);
486
+
487
+ // try to reuse a parent's buffer (inplace)
488
+ if (ggml_op_can_inplace(node->op)) {
489
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
490
+ struct ggml_tensor * parent = node->src[i];
491
+ if (parent == NULL) {
492
+ continue;
493
+ }
494
+
495
+ // if the node's data is external, then we cannot re-use it
496
+ if (!ggml_gallocr_is_own(galloc, parent)) {
497
+ AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
498
+ continue;
499
+ }
500
+
501
+ // outputs cannot be reused
502
+ if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) {
503
+ AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name);
504
+ continue;
505
+ }
506
+
507
+ if (!ggml_are_same_layout(node, parent)) {
508
+ AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name);
509
+ continue;
510
+ }
511
+
512
+ struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
513
+ if (p_hn->n_children == 1 && p_hn->n_views == 0) {
514
+ if (ggml_is_view(parent)) {
515
+ struct ggml_tensor * view_src = parent->view_src;
516
+ struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
517
+ if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
518
+ AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
519
+ assert(view_src_hn->offset == p_hn->offset);
520
+ hn->buffer_id = p_hn->buffer_id;
521
+ hn->offset = p_hn->offset;
522
+ p_hn->allocated = false; // avoid freeing the parent
523
+ view_src_hn->allocated = false;
524
+ return;
525
+ }
526
+ } else {
527
+ AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
528
+ hn->buffer_id = p_hn->buffer_id;
529
+ hn->offset = p_hn->offset;
530
+ p_hn->allocated = false; // avoid freeing the parent
531
+ return;
532
+ }
533
+ }
534
+ }
535
+ }
536
+ // allocate tensor from the buffer
537
+ struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
538
+ ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
539
+ size_t size = ggml_backend_buft_get_alloc_size(buft, node);
540
+ size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
541
+ hn->buffer_id = buffer_id;
542
+ hn->offset = offset;
543
+ return;
544
+ }
545
+ }
546
+
547
+ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
548
+ // graph outputs are never freed
549
+ if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
550
+ AT_PRINTF("not freeing output %s\n", node->name);
551
+ return;
552
+ }
553
+
554
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
555
+ size_t offset = hn->offset;
556
+ int buffer_id = hn->buffer_id;
557
+ struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
558
+ ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
559
+ size_t size = ggml_backend_buft_get_alloc_size(buft, node);
560
+ ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
561
+ hn->allocated = false;
562
+ }
563
+
564
// Returns node_buffer_ids[i], or 0 when no buffer id array was supplied.
static int get_node_buffer_id(const int * node_buffer_ids, int i) {
    if (node_buffer_ids == NULL) {
        return 0;
    }
    return node_buffer_ids[i];
}
567
+
568
+ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
569
+ // clear hash tables
570
+ ggml_hash_set_reset(&galloc->hash_set);
571
+ memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
572
+
573
+ // allocate leafs
574
+ // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
575
+ for (int i = 0; i < graph->n_leafs; i++) {
576
+ struct ggml_tensor * leaf = graph->leafs[i];
577
+ ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
578
+ }
579
+
580
+ // count number of children and views
581
+ // allocate other graph inputs and leafs first to avoid overwriting them
582
+ for (int i = 0; i < graph->n_nodes; i++) {
583
+ struct ggml_tensor * node = graph->nodes[i];
584
+
585
+ // TODO: better way to add external dependencies
586
+ // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
587
+ // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
588
+ // itself is never used and should not be considered a dependency
589
+ if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
590
+ struct ggml_tensor * view_src = node->view_src;
591
+ ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
592
+ }
593
+
594
+ if (node->flags & GGML_TENSOR_FLAG_INPUT) {
595
+ ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
596
+ }
597
+
598
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
599
+ struct ggml_tensor * src = node->src[j];
600
+ if (src == NULL) {
601
+ continue;
602
+ }
603
+
604
+ ggml_gallocr_hash_get(galloc, src)->n_children += 1;
605
+
606
+ // allocate explicit inputs
607
+ if (src->flags & GGML_TENSOR_FLAG_INPUT) {
608
+ ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
609
+ }
610
+ }
611
+ }
612
+
613
+ // allocate tensors
614
+ for (int i = 0; i < graph->n_nodes; i++) {
615
+ struct ggml_tensor * node = graph->nodes[i];
616
+ int buffer_id = get_node_buffer_id(node_buffer_ids, i);
617
+
618
+ // allocate parents (only leafs need to be allocated at this point)
619
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
620
+ struct ggml_tensor * parent = node->src[j];
621
+ if (parent == NULL) {
622
+ continue;
623
+ }
624
+ ggml_gallocr_allocate_node(galloc, parent, buffer_id);
625
+ }
626
+
627
+ // allocate node
628
+ ggml_gallocr_allocate_node(galloc, node, buffer_id);
629
+
630
+ AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name);
631
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
632
+ struct ggml_tensor * parent = node->src[j];
633
+ if (parent == NULL) {
634
+ continue;
635
+ }
636
+ AT_PRINTF("%s", parent->name);
637
+ if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
638
+ AT_PRINTF(", ");
639
+ }
640
+ }
641
+ AT_PRINTF("\n");
642
+
643
+ // update parents
644
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
645
+ struct ggml_tensor * parent = node->src[j];
646
+ if (parent == NULL) {
647
+ continue;
648
+ }
649
+ struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
650
+ p_hn->n_children -= 1;
651
+
652
+ AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n",
653
+ parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);
654
+
655
+ if (p_hn->n_children == 0 && p_hn->n_views == 0) {
656
+ if (ggml_is_view(parent)) {
657
+ struct ggml_tensor * view_src = parent->view_src;
658
+ struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
659
+ view_src_hn->n_views -= 1;
660
+ AT_PRINTF("view_src %s: %d children, %d views\n",
661
+ view_src->name, view_src_hn->n_children, view_src_hn->n_views);
662
+ if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
663
+ ggml_gallocr_free_node(galloc, view_src);
664
+ }
665
+ }
666
+ else if (p_hn->allocated) {
667
+ ggml_gallocr_free_node(galloc, parent);
668
+ }
669
+ }
670
+ AT_PRINTF("\n");
671
+ }
672
+ }
673
+ }
674
+
675
+ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
676
+ size_t min_hash_size = graph->n_nodes + graph->n_leafs;
677
+ // add 25% margin to avoid hash collisions
678
+ min_hash_size += min_hash_size / 4;
679
+
680
+ // initialize hash table
681
+ if (galloc->hash_set.size < min_hash_size) {
682
+ ggml_hash_set_free(&galloc->hash_set);
683
+ galloc->hash_set = ggml_hash_set_new(min_hash_size);
684
+ GGML_ASSERT(galloc->hash_set.keys != NULL);
685
+
686
+ free(galloc->hash_values);
687
+ galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
688
+ GGML_ASSERT(galloc->hash_values != NULL);
689
+ }
690
+
691
+ // reset allocators
692
+ for (int i = 0; i < galloc->n_buffers; i++) {
693
+ ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]);
694
+ }
695
+
696
+ // allocate in hash table
697
+ ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);
698
+
699
+ // set the node_allocs from the hash table
700
+ if (galloc->n_nodes < graph->n_nodes) {
701
+ free(galloc->node_allocs);
702
+ galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
703
+ GGML_ASSERT(galloc->node_allocs != NULL);
704
+ }
705
+ galloc->n_nodes = graph->n_nodes;
706
+ for (int i = 0; i < graph->n_nodes; i++) {
707
+ struct ggml_tensor * node = graph->nodes[i];
708
+ struct node_alloc * node_alloc = &galloc->node_allocs[i];
709
+ if (node->view_src || node->data) {
710
+ node_alloc->dst.buffer_id = -1;
711
+ node_alloc->dst.offset = SIZE_MAX;
712
+ node_alloc->dst.size_max = 0;
713
+ } else {
714
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
715
+ node_alloc->dst.buffer_id = hn->buffer_id;
716
+ node_alloc->dst.offset = hn->offset;
717
+ node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
718
+ }
719
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
720
+ struct ggml_tensor * src = node->src[j];
721
+ if (!src || src->view_src || src->data) {
722
+ node_alloc->src[j].buffer_id = -1;
723
+ node_alloc->src[j].offset = SIZE_MAX;
724
+ node_alloc->src[j].size_max = 0;
725
+ } else {
726
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
727
+ node_alloc->src[j].buffer_id = hn->buffer_id;
728
+ node_alloc->src[j].offset = hn->offset;
729
+ node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
730
+ }
731
+ }
732
+ }
733
+ if (galloc->n_leafs < graph->n_leafs) {
734
+ free(galloc->leaf_allocs);
735
+ galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
736
+ GGML_ASSERT(galloc->leaf_allocs != NULL);
737
+ }
738
+ galloc->n_leafs = graph->n_leafs;
739
+ for (int i = 0; i < graph->n_leafs; i++) {
740
+ struct ggml_tensor * leaf = graph->leafs[i];
741
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
742
+ if (leaf->view_src || leaf->data) {
743
+ galloc->leaf_allocs[i].leaf.buffer_id = -1;
744
+ galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
745
+ galloc->leaf_allocs[i].leaf.size_max = 0;
746
+ } else {
747
+ galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
748
+ galloc->leaf_allocs[i].leaf.offset = hn->offset;
749
+ galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
750
+ }
751
+ }
752
+
753
+ // reallocate buffers if needed
754
+ for (int i = 0; i < galloc->n_buffers; i++) {
755
+ // if the buffer type is used multiple times, we reuse the same buffer
756
+ for (int j = 0; j < i; j++) {
757
+ if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
758
+ galloc->buffers[i] = galloc->buffers[j];
759
+ break;
760
+ }
761
+ }
762
+
763
+ size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
764
+ size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
765
+
766
+ // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
767
+ if (new_size > cur_size || galloc->buffers[i] == NULL) {
768
+ #ifndef NDEBUG
769
+ GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
770
+ #endif
771
+
772
+ ggml_backend_buffer_free(galloc->buffers[i]);
773
+ galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
774
+ if (galloc->buffers[i] == NULL) {
775
+ GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
776
+ return false;
777
+ }
778
+ ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
779
+ }
780
+ }
781
+
782
+ return true;
783
+ }
784
+
785
+ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
786
+ return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
787
+ }
788
+
789
+ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
790
+ int buffer_id = tensor_alloc->buffer_id;
791
+ assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
792
+
793
+ if (tensor->view_src != NULL) {
794
+ if (tensor->buffer == NULL) {
795
+ assert(tensor_alloc->offset == SIZE_MAX);
796
+ if (tensor->view_src->buffer == NULL) {
797
+ // this tensor was allocated without ggml-backend
798
+ return;
799
+ }
800
+ ggml_backend_view_init(tensor);
801
+ }
802
+ } else {
803
+ if (tensor->data == NULL) {
804
+ assert(tensor_alloc->offset != SIZE_MAX);
805
+ assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
806
+ void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
807
+ void * addr = (char *)base + tensor_alloc->offset;
808
+ ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
809
+ } else {
810
+ if (tensor->buffer == NULL) {
811
+ // this tensor was allocated without ggml-backend
812
+ return;
813
+ }
814
+ }
815
+ }
816
+ }
817
+
818
+ static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
819
+ size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
820
+ return talloc->size_max >= node_size;
821
+ }
822
+
823
+ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
824
+ if (galloc->n_nodes != graph->n_nodes) {
825
+ #ifndef NDEBUG
826
+ GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
827
+ #endif
828
+ return true;
829
+ }
830
+
831
+ if (galloc->n_leafs != graph->n_leafs) {
832
+ #ifndef NDEBUG
833
+ GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
834
+ #endif
835
+ return true;
836
+ }
837
+
838
+ for (int i = 0; i < graph->n_nodes; i++) {
839
+ struct ggml_tensor * node = graph->nodes[i];
840
+ struct node_alloc * node_alloc = &galloc->node_allocs[i];
841
+
842
+ if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
843
+ #ifndef NDEBUG
844
+ GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
845
+ #endif
846
+ return true;
847
+ }
848
+
849
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
850
+ struct ggml_tensor * src = node->src[j];
851
+ if (src == NULL) {
852
+ continue;
853
+ }
854
+ if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
855
+ #ifndef NDEBUG
856
+ GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
857
+ #endif
858
+ return true;
859
+ }
860
+ }
861
+ }
862
+
863
+ return false;
864
+ }
865
+
866
+ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
867
+ if (ggml_gallocr_needs_realloc(galloc, graph)) {
868
+ if (galloc->n_buffers == 1) {
869
+ #ifndef NDEBUG
870
+ GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
871
+ #endif
872
+ if (!ggml_gallocr_reserve(galloc, graph)) {
873
+ return false;
874
+ }
875
+ } else {
876
+ #ifndef NDEBUG
877
+ GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
878
+ #endif
879
+ return false;
880
+ }
881
+ }
882
+
883
+ // reset buffers
884
+ for (int i = 0; i < galloc->n_buffers; i++) {
885
+ if (galloc->buffers[i] != NULL) {
886
+ ggml_backend_buffer_reset(galloc->buffers[i]);
887
+ }
888
+ }
889
+
890
+ // allocate the graph tensors from the previous assignments
891
+ // leafs
892
+ for (int i = 0; i < graph->n_leafs; i++) {
893
+ struct ggml_tensor * leaf = graph->leafs[i];
894
+ struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
895
+ ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
896
+ }
897
+ // nodes
898
+ for (int i = 0; i < graph->n_nodes; i++) {
899
+ struct ggml_tensor * node = graph->nodes[i];
900
+ struct node_alloc * node_alloc = &galloc->node_allocs[i];
901
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
902
+ struct ggml_tensor * src = node->src[j];
903
+ if (src == NULL) {
904
+ continue;
905
+ }
906
+ ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
907
+ }
908
+ ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
909
+ }
910
+
911
+ return true;
912
+ }
913
+
914
+ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
915
+ GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
916
+
917
+ if (galloc->buffers[buffer_id] == NULL) {
918
+ return 0;
919
+ }
920
+
921
+ for (int i = 0; i < buffer_id; i++) {
922
+ if (galloc->buffers[i] == galloc->buffers[buffer_id]) {
923
+ // this buffer is the same as a previous one due to the same buffer type being used multiple times
924
+ // only return the buffer size the first time it appears to avoid double counting
925
+ return 0;
926
+ }
927
+ }
928
+
929
+ return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
930
+ }
931
+
932
+ // utils
933
+
934
+ static bool alloc_tensor_range(struct ggml_context * ctx,
935
+ struct ggml_tensor * first, struct ggml_tensor * last,
936
+ ggml_backend_buffer_type_t buft, size_t size,
937
+ ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
938
+ ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
939
+ if (buffer == NULL) {
940
+ #ifndef NDEBUG
941
+ GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
942
+ #endif
943
+ for (size_t i = 0; i < *n_buffers; i++) {
944
+ ggml_backend_buffer_free((*buffers)[i]);
945
+ }
946
+ free(*buffers);
947
+ return false;
948
+ }
949
+
950
+ struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
951
+
952
+ for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
953
+ if (t->data == NULL) {
954
+ if (t->view_src == NULL) {
955
+ ggml_tallocr_alloc(&tallocr, t);
956
+ } else if (t->buffer == NULL) {
957
+ ggml_backend_view_init(t);
958
+ }
959
+ } else {
960
+ if (t->view_src != NULL && t->buffer == NULL) {
961
+ // view of a pre-allocated tensor
962
+ ggml_backend_view_init(t);
963
+ }
964
+ }
965
+ }
966
+
967
+ *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
968
+ (*buffers)[(*n_buffers)++] = buffer;
969
+
970
+ return true;
971
+ }
972
+
973
+ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
974
+ GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
975
+
976
+ size_t alignment = ggml_backend_buft_get_alignment(buft);
977
+ size_t max_size = ggml_backend_buft_get_max_size(buft);
978
+
979
+ ggml_backend_buffer_t * buffers = NULL;
980
+ size_t n_buffers = 0;
981
+
982
+ size_t cur_buf_size = 0;
983
+ struct ggml_tensor * first = ggml_get_first_tensor(ctx);
984
+ for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
985
+ size_t this_size = 0;
986
+ if (t->data == NULL && t->view_src == NULL) {
987
+ this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
988
+ }
989
+
990
+ if (this_size > max_size) {
991
+ GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
992
+ __func__, t->name,
993
+ ggml_backend_buft_name(buft),
994
+ this_size, max_size);
995
+ for (size_t i = 0; i < n_buffers; i++) {
996
+ ggml_backend_buffer_free(buffers[i]);
997
+ }
998
+ free(buffers);
999
+ return NULL;
1000
+ }
1001
+
1002
+ if ((cur_buf_size + this_size) > max_size) {
1003
+ // allocate tensors in the current buffer
1004
+ if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
1005
+ return NULL;
1006
+ }
1007
+ first = t;
1008
+ cur_buf_size = this_size;
1009
+ } else {
1010
+ cur_buf_size += this_size;
1011
+ }
1012
+ }
1013
+
1014
+ // allocate remaining tensors
1015
+ if (cur_buf_size > 0) {
1016
+ if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
1017
+ return NULL;
1018
+ }
1019
+ }
1020
+
1021
+ if (n_buffers == 0) {
1022
+ #ifndef NDEBUG
1023
+ GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
1024
+ #endif
1025
+ return NULL;
1026
+ }
1027
+
1028
+ ggml_backend_buffer_t buffer;
1029
+ if (n_buffers == 1) {
1030
+ buffer = buffers[0];
1031
+ } else {
1032
+ buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
1033
+ }
1034
+ free(buffers);
1035
+ return buffer;
1036
+ }
1037
+
1038
+ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
1039
+ return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
1040
+ }