From 443665aec4721ecf57df8162e7e093a0cd674a76 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel
Date: Mon, 12 Aug 2024 11:46:29 -0400
Subject: [PATCH] llama : do not request buffer type if we don't need it anyway

Since we use ngl=0 with the Kompute backend to load models on CPU on Linux
and Windows, we need to make sure not to call
ggml_backend_kompute_buffer_type, which initializes the Vulkan driver.
Initializing the Vulkan driver in this case could cause a failure for no
good reason (e.g. if it is not available).

Also, when we do not create any Kompute buffers, the instance currently does
not have an opportunity to be freed until exit-time destructors run, at
which point the necessary libraries may have already been unloaded from
memory. This causes an observable segfault at exit when loading the model on
CPU via the Python bindings.
---
 src/llama.cpp | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 337748fd3475a..bf60dfb45bdfd 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6035,28 +6035,32 @@ static bool llm_load_tensors(
     } else
 #endif
     {
-        ggml_backend_buffer_type_t split_buft;
-        if (split_mode == LLAMA_SPLIT_MODE_ROW) {
-            split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
-        } else {
-            // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
-            split_buft = llama_default_buffer_type_offload(model, main_gpu);
-        }
+        ggml_backend_buffer_type_t split_buft = nullptr;
+        if (i_gpu_start < n_layer) {
+            if (split_mode == LLAMA_SPLIT_MODE_ROW) {
+                split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
+            } else {
+                // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
+                split_buft = llama_default_buffer_type_offload(model, main_gpu);
+            }
 #ifdef GGML_USE_KOMPUTE
-        // we can fall back to CPU buffer type in some cases
-        if (!strcmp(ggml_backend_buft_name(split_buft), "CPU")) {
-            model.using_gpu = false;
-        }
+            // we can fall back to CPU buffer type in some cases
+            if (!strcmp(ggml_backend_buft_name(split_buft), "CPU")) {
+                model.using_gpu = false;
+            }
 #endif
-        // assign the repeating layers
-        for (int i = i_gpu_start; i < n_layer; ++i) {
-            model.buft_layer[i] = {
-                split_buft,
-                llama_default_buffer_type_offload(model, main_gpu)
-            };
+            // assign the repeating layers
+            for (int i = i_gpu_start; i < n_layer; ++i) {
+                model.buft_layer[i] = {
+                    split_buft,
+                    llama_default_buffer_type_offload(model, main_gpu)
+                };
+            }
         }
+
         // assign the output layer
         if (n_gpu_layers > n_layer) {
+            assert(split_buft);
             model.buft_output = {
                 split_buft,
                 llama_default_buffer_type_offload(model, main_gpu)