llama : allow skipping CPU graph with llama_set_skip_cpu
This allows us to exercise the device graph in order to allocate VRAM
without wasting cycles on the CPU graph.

Signed-off-by: Jared Van Bortel <[email protected]>
cebtenzzre committed May 30, 2024
1 parent 89c99ab commit ed12631
Showing 4 changed files with 17 additions and 0 deletions.
9 changes: 9 additions & 0 deletions ggml-backend.c
@@ -1041,6 +1041,7 @@ struct ggml_backend_sched {
    bool is_alloc;

    int n_backends;
    bool skip_cpu;

    ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
    ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
@@ -1638,6 +1639,10 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
        int split_backend_id = split->backend_id;
        ggml_backend_t split_backend = sched->backends[split_backend_id];

        if (sched->skip_cpu && ggml_backend_is_cpu(split_backend)) {
            continue;
        }

        // copy the input tensors to the split backend
        for (int j = 0; j < split->n_inputs; j++) {
            ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
@@ -1782,6 +1787,10 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
    free(sched);
}

void ggml_backend_sched_set_skip_cpu(ggml_backend_sched_t sched, bool value) {
    sched->skip_cpu = value;
}

void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
    // reset state for the next run
    if (!sched->is_reset) {
2 changes: 2 additions & 0 deletions ggml-backend.h
@@ -182,6 +182,8 @@ extern "C" {
    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
    GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);

    GGML_API void ggml_backend_sched_set_skip_cpu(ggml_backend_sched_t sched, bool value);

    // Initialize backend buffers from a measure graph
    GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);

4 changes: 4 additions & 0 deletions llama.cpp
@@ -18026,6 +18026,10 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
            1.0e6 * ctx->n_sample / ctx->t_sample_us);
}

void llama_set_skip_cpu(struct llama_context * ctx, bool value) {
    ggml_backend_sched_set_skip_cpu(ctx->sched, value);
}

// For internal test use
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
    struct llama_context * ctx
2 changes: 2 additions & 0 deletions llama.h
@@ -1104,6 +1104,8 @@ extern "C" {

    LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);

    LLAMA_API void llama_set_skip_cpu(struct llama_context * ctx, bool value);

#ifdef __cplusplus
}
#endif
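
The intended usage pattern, per the commit message, is to run one throwaway decode with the CPU graph skipped so that only the device splits execute and their VRAM gets allocated, then re-enable normal execution. A minimal sketch of that pattern follows; it assumes the llama.cpp C API as of this commit, and the warm_up helper, the single-BOS-token batch, and the KV-cache clear afterwards are illustrative choices, not something this change prescribes.

// Hypothetical warm-up routine (not part of this commit): run one throwaway
// decode with the CPU graph skipped so device buffers get allocated, then
// restore normal execution.
#include "llama.h"

static void warm_up(struct llama_model * model, struct llama_context * ctx) {
    llama_token bos = llama_token_bos(model);

    llama_set_skip_cpu(ctx, true);                            // execute only the device splits
    llama_decode(ctx, llama_batch_get_one(&bos, 1, 0, 0));    // allocates the graph's device buffers
    llama_set_skip_cpu(ctx, false);                           // back to full CPU+device execution

    llama_kv_cache_clear(ctx);                                // discard the warm-up token
}

On the ggml side this works because ggml_backend_sched_compute_splits() simply skips any split assigned to the CPU backend while the flag is set, as shown in the ggml-backend.c hunk above.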