diff --git a/.gitignore b/.gitignore index d5b2817f..4d587306 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,4 @@ SEAL *.dot *.csv *.out +GPU-MPC/experiments/sigma/sigma_offline_online diff --git a/GPU-MPC/experiments/sigma/llama2.h b/GPU-MPC/experiments/sigma/llama2.h index 8b15abe2..9b725d55 100644 --- a/GPU-MPC/experiments/sigma/llama2.h +++ b/GPU-MPC/experiments/sigma/llama2.h @@ -1,4 +1,4 @@ -// Author: Neha Jawalkar +// Author: Neha Jawalkar,Tanmay Rajore // Copyright: // // Copyright (c) 2024 Microsoft Research @@ -63,13 +63,14 @@ class GPULlamaTransformerBlock : public SytorchModule RMSNorm *ln0; RMSNorm *ln1; u64 n_heads, n_embd, intermediate_size; + bool rotatory; public: - GPULlamaTransformerBlock(u64 n_heads, u64 n_embd, u64 intermediate_size): n_heads(n_heads), n_embd(n_embd), intermediate_size(intermediate_size) + GPULlamaTransformerBlock(u64 n_heads, u64 n_embd, u64 intermediate_size,bool rotatory=true): n_heads(n_heads), n_embd(n_embd), intermediate_size(intermediate_size),rotatory(rotatory) { auto dim_W = n_embd / n_heads; - attn = new _MHADummy(n_heads, n_embd, (int)dim_W, "self", "qkvsep", true, true); + attn = new _MHADummy(n_heads, n_embd, (int)dim_W, "self", "qkvsep", true, rotatory); ffn = new LlamaFFN(n_embd, intermediate_size); ln0 = new RMSNorm(n_embd, false); ln1 = new RMSNorm(n_embd, false); @@ -93,14 +94,15 @@ class GPULlama : public SytorchModule std::vector *> blocks; // RMSNorm *ln_f; u64 n_layer, n_heads, n_embd, intermediate_size; + bool rotatory; public: - GPULlama(u64 n_layer, u64 n_heads, u64 n_embd, u64 intermediate_size): n_layer(n_layer), n_heads(n_heads), n_embd(n_embd), intermediate_size(intermediate_size) + GPULlama(u64 n_layer, u64 n_heads, u64 n_embd, u64 intermediate_size,bool rotatory=true): n_layer(n_layer), n_heads(n_heads), n_embd(n_embd), intermediate_size(intermediate_size),rotatory(rotatory) { for(u64 i = 0; i < n_layer; ++i) { - blocks.push_back(new GPULlamaTransformerBlock(n_heads, n_embd, intermediate_size)); + blocks.push_back(new GPULlamaTransformerBlock(n_heads, n_embd, intermediate_size,rotatory)); } // ln_f = new RMSNorm(n_embd); } diff --git a/GPU-MPC/experiments/sigma/sigma_offline_online.cu b/GPU-MPC/experiments/sigma/sigma_offline_online.cu new file mode 100644 index 00000000..b30eb3ae --- /dev/null +++ b/GPU-MPC/experiments/sigma/sigma_offline_online.cu @@ -0,0 +1,221 @@ +// Author: Neha Jawalkar,Tanmay Rajore +// Copyright: +// +// Copyright (c) 2024 Microsoft Research +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include +#include "gpt2.h" +#include "bert.h" +#include "llama2.h" +#include "backend/sigma.h" + +inline std::string toGB(u64 bytes) { + return std::to_string(bytes) + " B (" + std::to_string((float) bytes / (1024.0f * 1024.0f * 1024.0f)) + " GB)"; +} + +int main(int __argc, char **__argv) +{ + sytorch_init(); + + u64 n_embd = 0; + u64 n_head = 0; + u64 n_layer = 0; + std::string attnMask = "none"; + std::string qkvFormat = "qkvconcat"; + int bw = 0; + u64 scale = 12; + u64 n_seq = atoi(__argv[2]); + int role = atoi(__argv[3]); + int party = atoi(__argv[4]); + + std::string model(__argv[1]); + printf("Model=%s\n", model.data()); + std::string keyDir(__argv[5]); + auto keyFile = keyDir + model + "_inference_key"; + u64 keyBufSz = 0; + SytorchModule *net; + + if (model == "gpt2") + { + n_layer = 12; + n_head = 12; + n_embd = 768; + attnMask = "self"; + bw = 50; + keyBufSz = 20 * OneGB; + net = new GPUGPT2(n_layer, n_head, n_embd, attnMask, qkvFormat); + } + else if (model == "bert-tiny") + { + n_layer = 2; + n_head = 2; + n_embd = 128; + bw = 37; + keyBufSz = OneGB; + net = new GPUBERT(n_layer, n_head, n_embd, attnMask, qkvFormat); + } + else if (model == "bert-base") + { + n_layer = 12; + n_head = 12; + n_embd = 768; + bw = 50; + keyBufSz = 70 * OneGB; + net = new GPUBERT(n_layer, n_head, n_embd, attnMask, qkvFormat); + } + else if (model == "bert-large") + { + n_layer = 24; + n_head = 16; + n_embd = 1024; + bw = 50; + keyBufSz = 50 * OneGB; + net = new GPUBERT(n_layer, n_head, n_embd, attnMask, qkvFormat); + } + else if (model == "gpt-neo") + { + n_layer = 24; + n_head = 16; + n_embd = 2048; + attnMask = "self"; + qkvFormat = "kvqsep"; + bw = 51; + keyBufSz = 80 * OneGB; + net = new GPUGPT2(n_layer, n_head, n_embd, attnMask, qkvFormat, false); + } + else if (model == "gpt-neo-large") + { + n_layer = 32; + n_head = 20; + n_embd = 2560; + attnMask = "self"; + qkvFormat = "concat"; + bw = 51; // 52; + keyBufSz = 200 * OneGB; + net = new GPUGPT2(n_layer, n_head, n_embd, attnMask, qkvFormat, false); + } + else if (model == "llama7b") + { + n_layer = 32; + n_head = 32; + n_embd = 4096; + attnMask = "self"; + qkvFormat = "qkvsep"; + bw = 48; + u64 intermediate_size = 11008; + keyBufSz = 500 * OneGB; + net = new GPULlama(n_layer, n_head, n_embd, intermediate_size); + } + else if (model == "llama13b") + { + n_layer = 40; + n_head = 40; + n_embd = 5120; + attnMask = "self"; + qkvFormat = "qkvsep"; + bw = 48; + u64 intermediate_size = 13824; + keyBufSz = 450 * OneGB; + net = new GPULlama(n_layer, n_head, n_embd, intermediate_size); + } + else if (model == "airavata") + { + n_layer = 32; + n_head = 32; + n_embd = 4096; + attnMask = "self"; + qkvFormat = "qkvsep"; + bw = 48; + u64 intermediate_size = 11008; + keyBufSz = 500 * OneGB; + net = new GPULlama(n_layer, n_head, n_embd, intermediate_size,false); + } + else + { + printf("Invalid model\n"); + return 1; + } + + Tensor input({n_seq, n_embd}); + net->init(scale, input); + srand(time(NULL)); + + if (role == 0) + { + auto sigma = new SIGMAKeygen(party, bw, scale, keyFile, keyBufSz); + net->setBackend(sigma); + net->optimize(); + input.d_data = (u64 *)moveToGPU((u8 *)input.data, input.size() * sizeof(u64), (Stats *)NULL); + auto &activation = net->forward(input); + sigma->output(activation); + sigma->close(); + } + else + { + std::string ip(__argv[6]); + auto sigma = new SIGMA(party, ip, keyFile, bw, scale, n_seq, n_embd, atoi(__argv[7])); + net->setBackend(sigma); + net->optimize(); + sigma->peer->sync(); + auto start = std::chrono::high_resolution_clock::now(); + input.d_data = (u64 *)moveToGPU((u8 *)input.data, input.size() * sizeof(u64), (Stats *)NULL); + auto &activation = net->forward(input); + sigma->output(activation); + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed = std::chrono::duration_cast(end - start); + sigma->close(); + + std::stringstream ss; + + ss << "Time in ms" << std::endl; + ss << "Total time=" + std::to_string(elapsed.count()); + ss << std::endl; + ss << "Comm time=" + std::to_string(sigma->s.comm_time); + ss << std::endl; + ss << "Transfer time=" + std::to_string(sigma->s.transfer_time); + ss << std::endl; + ss << "MHA time=" + std::to_string(sigma->s.mha_time); + ss << std::endl; + ss << "Matmul time=" + std::to_string(sigma->s.matmul_time); + ss << std::endl; + ss << "Truncate time=" + std::to_string(sigma->s.truncate_time); + ss << std::endl; + ss << "Gelu time=" + std::to_string(sigma->s.gelu_time); + ss << std::endl; + ss << "Softmax time=" + std::to_string(sigma->s.softmax_time); + ss << std::endl; + ss << "Layernorm time=" + std::to_string(sigma->s.layernorm_time); + ss << std::endl; + ss << std::endl; + ss << "Total Comm=" + toGB(sigma->peer->bytesSent() + sigma->peer->bytesReceived()); + ss << std::endl; + ss << "Gelu Comm=" + toGB(sigma->s.gelu_comm_bytes); + ss << std::endl; + ss << "Softmax Comm=" + toGB(sigma->s.softmax_comm_bytes); + ss << std::endl; + ss << "Layernorm Comm=" + toGB(sigma->s.layernorm_comm_bytes); + ss << std::endl; + + auto inferenceDir = "output/P" + std::to_string(party) + "/"; + std::ofstream statsFile(inferenceDir + model + ".txt"); + statsFile << ss.rdbuf(); + statsFile.close(); + } + return 0; +} \ No newline at end of file