#include "ggml-backend.h"
#include "ggml-alloc.h"

#include <assert.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define UNUSED GGML_UNUSED

#define MAX(a, b) ((a) > (b) ? (a) : (b))

// backend buffer

ggml_backend_buffer_t ggml_backend_buffer_init(
        struct ggml_backend           * backend,
        struct ggml_backend_buffer_i    iface,
        ggml_backend_buffer_context_t   context,
        size_t                          size) {
    ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));

    GGML_ASSERT(iface.get_base != NULL);

    (*buffer) = (struct ggml_backend_buffer) {
        /* .iface   = */ iface,
        /* .backend = */ backend,
        /* .context = */ context,
        /* .size    = */ size,
    };

    return buffer;
}

void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
    if (buffer->iface.free_buffer != NULL) {
        buffer->iface.free_buffer(buffer);
    }
    free(buffer);
}

size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
    return ggml_backend_get_alignment(buffer->backend);
}

void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
    return buffer->iface.get_base(buffer);
}

size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
    return buffer->size;
}

size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    // backends may require more space than ggml_nbytes for a tensor (e.g. for padding)
    if (buffer->iface.get_alloc_size) {
        return buffer->iface.get_alloc_size(buffer, tensor);
    }
    return ggml_nbytes(tensor);
}

void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    // optional hook: some backends need to set up per-tensor state after allocation
    if (buffer->iface.init_tensor) {
        buffer->iface.init_tensor(buffer, tensor);
    }
}

void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    // optional hook, counterpart of init_tensor
    if (buffer->iface.free_tensor) {
        buffer->iface.free_tensor(buffer, tensor);
    }
}
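
// Illustrative sketch (compiled out): how an allocator could use the buffer API above to
// place a tensor inside a backend buffer by hand. The offset bookkeeping and the direct
// assignment of tensor->data / tensor->buffer are assumptions about how a caller would do
// this; in practice an allocator such as ggml-alloc takes care of it.
#if 0
static void example_place_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, size_t * offset) {
    size_t align = ggml_backend_buffer_get_alignment(buffer);
    size_t size  = ggml_backend_buffer_get_alloc_size(buffer, tensor); // may be > ggml_nbytes(tensor)

    // round the running offset up to the buffer alignment (align is assumed to be a power of two)
    *offset = (*offset + align - 1) & ~(align - 1);

    GGML_ASSERT(*offset + size <= ggml_backend_buffer_get_size(buffer));

    tensor->data   = (char *)ggml_backend_buffer_get_base(buffer) + *offset;
    tensor->buffer = buffer;
    ggml_backend_buffer_init_tensor(buffer, tensor); // let the backend initialize per-tensor state

    *offset += size;
}
#endif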

// backend

ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor) {
    return tensor->buffer->backend;
}

const char * ggml_backend_name(ggml_backend_t backend) {
    return backend->iface.get_name(backend);
}

void ggml_backend_free(ggml_backend_t backend) {
    backend->iface.free(backend);
}

ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
    return backend->iface.alloc_buffer(backend, size);
}

size_t ggml_backend_get_alignment(ggml_backend_t backend) {
    return backend->iface.get_alignment(backend);
}

void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    ggml_backend_t backend = ggml_get_backend(tensor);

    backend->iface.set_tensor_async(backend, tensor, data, offset, size);
}

void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    ggml_backend_t backend = ggml_get_backend(tensor);

    backend->iface.get_tensor_async(backend, tensor, data, offset, size);
}

void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    ggml_backend_t backend = ggml_get_backend(tensor);

    // the synchronous variants issue the async call and then wait for it to complete
    backend->iface.set_tensor_async(backend, tensor, data, offset, size);
    backend->iface.synchronize(backend);
}

void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    ggml_backend_t backend = ggml_get_backend(tensor);

    backend->iface.get_tensor_async(backend, tensor, data, offset, size);
    backend->iface.synchronize(backend);
}

void ggml_backend_synchronize(ggml_backend_t backend) {
    backend->iface.synchronize(backend);
}
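
// Illustrative sketch (compiled out): uploading host data into a tensor that lives in a
// backend buffer and reading it back. Assumes `t` is an F32 tensor whose data has already
// been placed in a buffer of some backend, so ggml_get_backend(t) is valid.
#if 0
static void example_tensor_io(struct ggml_tensor * t) {
    const int64_t n = ggml_nelements(t);

    float * host = malloc(n * sizeof(float));
    for (int64_t i = 0; i < n; i++) {
        host[i] = (float)i;
    }

    // blocking copies: these synchronize the backend before returning
    ggml_backend_tensor_set(t, host, 0, ggml_nbytes(t));
    ggml_backend_tensor_get(t, host, 0, ggml_nbytes(t));

    free(host);
}
#endif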

ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    return backend->iface.graph_plan_create(backend, cgraph);
}

void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    backend->iface.graph_plan_free(backend, plan);
}

void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    backend->iface.graph_plan_compute(backend, plan);
}

void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    backend->iface.graph_compute(backend, cgraph);
}

bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    return backend->iface.supports_op(backend, op);
}
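
// Illustrative sketch (compiled out): evaluating a graph either directly or through a plan.
// A plan lets the backend do the per-graph preparation once and reuse it across evaluations.
// `gf` is assumed to be a graph already built with the usual ggml.h API; building it is not
// shown here.
#if 0
static void example_graph_compute(ggml_backend_t backend, struct ggml_cgraph * gf) {
    // one-shot evaluation
    ggml_backend_graph_compute(backend, gf);

    // repeated evaluation of the same graph through a plan
    ggml_backend_graph_plan_t plan = ggml_backend_graph_plan_create(backend, gf);
    for (int i = 0; i < 10; i++) {
        ggml_backend_graph_plan_compute(backend, plan);
    }
    ggml_backend_graph_plan_free(backend, plan);
}
#endif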

// backend copy

static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
    if (a->type != b->type) {
        return false;
    }
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
        if (a->ne[i] != b->ne[i]) {
            return false;
        }
        if (a->nb[i] != b->nb[i]) {
            return false;
        }
    }
    return true;
}

void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
    GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");

    if (src == dst) {
        return;
    }

    ggml_backend_t backend_src = ggml_get_backend(src);
    ggml_backend_t backend_dst = ggml_get_backend(dst);

    // prefer a direct copy if either backend implements one; the copy functions take the
    // backend as their first argument, matching the CPU implementations below
    if (backend_dst->iface.cpy_tensor_from != NULL) {
        backend_dst->iface.cpy_tensor_from(backend_dst, src, dst);
    } else if (backend_src->iface.cpy_tensor_to != NULL) {
        backend_src->iface.cpy_tensor_to(backend_src, src, dst);
    } else {
        // fallback: stage the data through host memory with get/set
#ifndef NDEBUG
        fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to "
                "are implemented for backends %s and %s, falling back to get/set\n",
                ggml_backend_name(backend_src), ggml_backend_name(backend_dst));
#endif
        size_t nbytes = ggml_nbytes(src);
        void * data = malloc(nbytes);
        ggml_backend_tensor_get(src, data, 0, nbytes);
        ggml_backend_tensor_set(dst, data, 0, nbytes);
        free(data);
    }
}
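
// Illustrative sketch (compiled out): copying a tensor between two backends. Both tensors
// are assumed to have identical layouts and to already live in buffers of their respective
// backends; `other_tensor` stands in for a tensor of some hypothetical second backend.
#if 0
static void example_tensor_copy(struct ggml_tensor * cpu_tensor, struct ggml_tensor * other_tensor) {
    // uses a direct copy if either backend implements one, otherwise stages through host memory
    ggml_backend_tensor_copy(cpu_tensor, other_tensor);
    ggml_backend_tensor_copy(other_tensor, cpu_tensor);
}
#endif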

// backend CPU

struct ggml_backend_cpu_context {
    int    n_threads;
    void * work_data; // scratch buffer shared by graph evaluations, grown on demand
    size_t work_size;
};

static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
    return "CPU";

    UNUSED(backend);
}

static void ggml_backend_cpu_free(ggml_backend_t backend) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
    free(cpu_ctx->work_data);
    free(cpu_ctx);
    free(backend);
}

static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
    return (void *)buffer->context;
}

static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    free(buffer->context);
}

static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
    /* .free_buffer    = */ ggml_backend_cpu_buffer_free_buffer,
    /* .get_base       = */ ggml_backend_cpu_buffer_get_base,
    /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
    /* .init_tensor    = */ NULL, // no initialization required
    /* .free_tensor    = */ NULL, // no cleanup required
};

// buffers created from an external pointer do not own the memory, so free_buffer is not set
static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
    /* .free_buffer    = */ NULL,
    /* .get_base       = */ ggml_backend_cpu_buffer_get_base,
    /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
    /* .init_tensor    = */ NULL,
    /* .free_tensor    = */ NULL,
};

static const size_t TENSOR_ALIGNMENT = 64;

static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) {
    // malloc may return an address that is not aligned to TENSOR_ALIGNMENT;
    // over-allocate so that an aligned region of the requested size always fits
    size += TENSOR_ALIGNMENT;
    void * data = malloc(size);

    return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size);
}

static size_t ggml_backend_cpu_get_alignment(ggml_backend_t backend) {
    return TENSOR_ALIGNMENT;
    UNUSED(backend);
}

static void ggml_backend_cpu_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");

    memcpy((char *)tensor->data + offset, data, size);

    UNUSED(backend);
}

static void ggml_backend_cpu_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");

    memcpy(data, (const char *)tensor->data + offset, size);

    UNUSED(backend);
}

static void ggml_backend_cpu_synchronize(ggml_backend_t backend) {
    // the CPU backend performs all operations synchronously, so there is nothing to wait for
    UNUSED(backend);
}

static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
    ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));

    UNUSED(backend);
}

static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
    // src is a CPU tensor, so its data can be handed directly to the destination backend's async set
    ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));

    UNUSED(backend);
}

struct ggml_backend_plan_cpu {
    struct ggml_cplan cplan;
    struct ggml_cgraph cgraph;
};

static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

    struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));

    cpu_plan->cplan  = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
    cpu_plan->cgraph = *cgraph;

    if (cpu_plan->cplan.work_size > 0) {
        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
    }

    return cpu_plan;
}

static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

    free(cpu_plan->cplan.work_data);
    free(cpu_plan);

    UNUSED(backend);
}

static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;

    ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);

    UNUSED(backend);
}

static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);

    // grow the shared work buffer if this graph needs more space than previous ones
    if (cpu_ctx->work_size < cplan.work_size) {
        cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
        cpu_ctx->work_size = cplan.work_size;
    }

    cplan.work_data = cpu_ctx->work_data;

    ggml_graph_compute(cgraph, &cplan);
}

static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    // the CPU backend implements all ggml operations
    return true;
    UNUSED(backend);
    UNUSED(op);
}

static struct ggml_backend_i cpu_backend_i = {
    /* .get_name           = */ ggml_backend_cpu_name,
    /* .free               = */ ggml_backend_cpu_free,
    /* .alloc_buffer       = */ ggml_backend_cpu_alloc_buffer,
    /* .get_alignment      = */ ggml_backend_cpu_get_alignment,
    /* .set_tensor_async   = */ ggml_backend_cpu_set_tensor_async,
    /* .get_tensor_async   = */ ggml_backend_cpu_get_tensor_async,
    /* .synchronize        = */ ggml_backend_cpu_synchronize,
    /* .cpy_tensor_from    = */ ggml_backend_cpu_cpy_tensor_from,
    /* .cpy_tensor_to      = */ ggml_backend_cpu_cpy_tensor_to,
    /* .graph_plan_create  = */ ggml_backend_cpu_graph_plan_create,
    /* .graph_plan_free    = */ ggml_backend_cpu_graph_plan_free,
    /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
    /* .graph_compute      = */ ggml_backend_cpu_graph_compute,
    /* .supports_op        = */ ggml_backend_cpu_supports_op,
};

ggml_backend_t ggml_backend_cpu_init(void) {
    struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));

    ctx->n_threads = GGML_DEFAULT_N_THREADS;
    ctx->work_data = NULL;
    ctx->work_size = 0;

    ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));

    *cpu_backend = (struct ggml_backend) {
        /* .iface   = */ cpu_backend_i,
        /* .context = */ ctx,
    };
    return cpu_backend;
}

bool ggml_backend_is_cpu(ggml_backend_t backend) {
    // the CPU backend is identified by its get_name function pointer
    return backend->iface.get_name == ggml_backend_cpu_name;
}

void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
    ctx->n_threads = n_threads;
}

ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size) {
    // wraps existing host memory in a buffer; the memory remains owned by the caller
    return ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size);
}
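
// Illustrative sketch (compiled out): typical lifecycle of the CPU backend using only the
// API in this file. Tensor allocation inside the buffer and graph construction (done with
// ggml.h / ggml-alloc.h in a real program) are intentionally omitted.
#if 0
static void example_cpu_backend(void) {
    ggml_backend_t backend = ggml_backend_cpu_init();

    if (ggml_backend_is_cpu(backend)) {
        ggml_backend_cpu_set_n_threads(backend, 4);
    }

    // allocate a 16 MiB buffer; tensors would be placed in it by an allocator such as ggml-alloc
    ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, 16*1024*1024);

    fprintf(stderr, "%s: buffer of %zu bytes, alignment %zu\n",
            ggml_backend_name(backend),
            ggml_backend_buffer_get_size(buffer),
            ggml_backend_buffer_get_alignment(buffer));

    // ... build a graph gf and evaluate it with ggml_backend_graph_compute(backend, gf) ...

    ggml_backend_buffer_free(buffer);
    ggml_backend_free(backend);
}
#endif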