diff --git a/lightllm/common/all_kernel_configs/mla_decode_attentnion/{out_dtype=torch.bfloat16,q_head_dim=512,q_head_num=128,q_rope_dim=64}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/mla_decode_attentnion/{out_dtype=torch.bfloat16,q_head_dim=512,q_head_num=128,q_rope_dim=64}_NVIDIA_H800.json new file mode 100644 index 000000000..7272d3f06 --- /dev/null +++ b/lightllm/common/all_kernel_configs/mla_decode_attentnion/{out_dtype=torch.bfloat16,q_head_dim=512,q_head_num=128,q_rope_dim=64}_NVIDIA_H800.json @@ -0,0 +1 @@ +{"256": {"1": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 4, "stage2_num_stages": 1}, "8": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 4, "stage2_num_stages": 1}, "16": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 4, "stage2_num_stages": 3}, "32": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 2, "stage2_num_stages": 1}, "64": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 1, "stage2_num_stages": 1}, "128": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 8, "stage1_num_stages": 3, "stage2_num_warps": 1, "stage2_num_stages": 3}, "256": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 1, "stage2_num_stages": 1}}, "512": {"1": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 3, "stage2_num_warps": 4, "stage2_num_stages": 1}, "8": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 4, "stage2_num_stages": 1}, "16": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 1, "stage2_num_stages": 3}, "32": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 2, "stage2_num_stages": 1}, "64": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 8, "stage1_num_stages": 3, "stage2_num_warps": 4, "stage2_num_stages": 3}, "128": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 4, "stage2_num_stages": 3}, "256": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 2, "stage2_num_stages": 3}}, "1024": {"1": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 4, "stage2_num_stages": 1}, "8": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 4, "stage2_num_stages": 1}, "16": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 1, "stage2_num_stages": 3}, "32": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 8, "stage1_num_stages": 3, "stage2_num_warps": 1, "stage2_num_stages": 1}, "64": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 1, "stage2_num_stages": 1}, "128": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 1, "stage2_num_stages": 1}, "256": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 2, "stage2_num_stages": 3}}, "2048": {"1": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 4, "stage2_num_stages": 1}, "8": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 4, "stage2_num_stages": 1}, "16": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 8, "stage1_num_stages": 3, "stage2_num_warps": 1, "stage2_num_stages": 3}, "32": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 2, "stage2_num_stages": 1}, "64": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 1, "stage2_num_stages": 3}, "128": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 1, "stage2_num_stages": 1}, "256": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 4, "stage2_num_warps": 2, "stage2_num_stages": 3}}, "4096": {"1": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 4, "stage2_num_stages": 1}, "8": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 8, "stage1_num_stages": 3, "stage2_num_warps": 4, "stage2_num_stages": 1}, "16": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 2, "stage2_num_stages": 1}, "32": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 2, "stage2_num_stages": 3}, "64": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 2, "stage2_num_stages": 3}, "128": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 4, "stage2_num_warps": 2, "stage2_num_stages": 1}}, "8192": {"1": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 4, "stage2_num_stages": 1}, "8": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 4, "stage2_num_stages": 1}, "16": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 1, "stage2_num_stages": 3}, "32": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 1, "stage2_num_stages": 1}, "64": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 4, "stage2_num_warps": 1, "stage2_num_stages": 3}}} \ No newline at end of file diff --git a/lightllm/common/all_kernel_configs/mla_decode_attentnion/{out_dtype=torch.bfloat16,q_head_dim=512,q_head_num=16,q_rope_dim=64}_NVIDIA_H800.json b/lightllm/common/all_kernel_configs/mla_decode_attentnion/{out_dtype=torch.bfloat16,q_head_dim=512,q_head_num=16,q_rope_dim=64}_NVIDIA_H800.json index 4abf69e8b..26143b609 100644 --- a/lightllm/common/all_kernel_configs/mla_decode_attentnion/{out_dtype=torch.bfloat16,q_head_dim=512,q_head_num=16,q_rope_dim=64}_NVIDIA_H800.json +++ b/lightllm/common/all_kernel_configs/mla_decode_attentnion/{out_dtype=torch.bfloat16,q_head_dim=512,q_head_num=16,q_rope_dim=64}_NVIDIA_H800.json @@ -1 +1 @@ -{"256": {"1": {"BLOCK_SEQ": 32, "BLOCK_N": 32, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 4, "stage1_num_stages": 3, "stage2_num_warps": 8, "stage2_num_stages": 1}, "8": {"BLOCK_SEQ": 32, "BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 3, "stage2_num_warps": 8, "stage2_num_stages": 1}, "16": {"BLOCK_SEQ": 64, "BLOCK_N": 64, "BLOCK_Q_HEAD": 64, "stage1_num_warps": 8, "stage1_num_stages": 5, "stage2_num_warps": 4, "stage2_num_stages": 1}, "32": {"BLOCK_SEQ": 64, "BLOCK_N": 64, "BLOCK_Q_HEAD": 64, "stage1_num_warps": 8, "stage1_num_stages": 5, "stage2_num_warps": 4, "stage2_num_stages": 1}, "64": {"BLOCK_SEQ": 128, "BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 4, "stage2_num_warps": 4, "stage2_num_stages": 1}, "128": {"BLOCK_SEQ": 256, "BLOCK_N": 64, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 1, "stage2_num_stages": 1}, "256": {"BLOCK_SEQ": 256, "BLOCK_N": 32, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 4, "stage1_num_stages": 4, "stage2_num_warps": 2, "stage2_num_stages": 1}}, "512": {"1": {"BLOCK_SEQ": 32, "BLOCK_N": 32, "BLOCK_Q_HEAD": 64, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 16, "stage2_num_stages": 1}, "8": {"BLOCK_SEQ": 64, "BLOCK_N": 64, "BLOCK_Q_HEAD": 64, "stage1_num_warps": 8, "stage1_num_stages": 5, "stage2_num_warps": 4, "stage2_num_stages": 1}, "16": {"BLOCK_SEQ": 64, "BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 4, "stage2_num_warps": 4, "stage2_num_stages": 1}, "32": {"BLOCK_SEQ": 128, "BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 4, "stage2_num_warps": 8, "stage2_num_stages": 1}, "64": {"BLOCK_SEQ": 256, "BLOCK_N": 64, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 5, "stage2_num_warps": 4, "stage2_num_stages": 1}, "128": {"BLOCK_SEQ": 256, "BLOCK_N": 32, "BLOCK_Q_HEAD": 64, "stage1_num_warps": 4, "stage1_num_stages": 4, "stage2_num_warps": 4, "stage2_num_stages": 1}, "256": {"BLOCK_SEQ": 256, "BLOCK_N": 16, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 2, "stage1_num_stages": 3, "stage2_num_warps": 2, "stage2_num_stages": 1}}, "1024": {"1": {"BLOCK_SEQ": 64, "BLOCK_N": 64, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 16, "stage2_num_stages": 1}, "8": {"BLOCK_SEQ": 64, "BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 4, "stage2_num_warps": 16, "stage2_num_stages": 1}, "16": {"BLOCK_SEQ": 128, "BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 4, "stage2_num_warps": 8, "stage2_num_stages": 1}, "32": {"BLOCK_SEQ": 256, "BLOCK_N": 64, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 5, "stage2_num_warps": 4, "stage2_num_stages": 1}, "64": {"BLOCK_SEQ": 256, "BLOCK_N": 32, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 4, "stage1_num_stages": 4, "stage2_num_warps": 4, "stage2_num_stages": 1}, "128": {"BLOCK_SEQ": 256, "BLOCK_N": 16, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 2, "stage1_num_stages": 3, "stage2_num_warps": 4, "stage2_num_stages": 1}, "256": {"BLOCK_SEQ": 256, "BLOCK_N": 16, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 2, "stage1_num_stages": 3, "stage2_num_warps": 2, "stage2_num_stages": 1}}, "2048": {"1": {"BLOCK_SEQ": 64, "BLOCK_N": 64, "BLOCK_Q_HEAD": 64, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 16, "stage2_num_stages": 1}, "8": {"BLOCK_SEQ": 128, "BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 4, "stage2_num_warps": 16, "stage2_num_stages": 1}, "16": {"BLOCK_SEQ": 256, "BLOCK_N": 64, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 5, "stage2_num_warps": 4, "stage2_num_stages": 5}, "32": {"BLOCK_SEQ": 256, "BLOCK_N": 32, "BLOCK_Q_HEAD": 64, "stage1_num_warps": 4, "stage1_num_stages": 4, "stage2_num_warps": 4, "stage2_num_stages": 1}, "64": {"BLOCK_SEQ": 256, "BLOCK_N": 16, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 2, "stage1_num_stages": 3, "stage2_num_warps": 8, "stage2_num_stages": 1}, "128": {"BLOCK_SEQ": 256, "BLOCK_N": 16, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 2, "stage1_num_stages": 3, "stage2_num_warps": 4, "stage2_num_stages": 1}, "256": {"BLOCK_SEQ": 256, "BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 5, "stage2_num_warps": 1, "stage2_num_stages": 1}}, "4096": {"1": {"BLOCK_SEQ": 128, "BLOCK_N": 64, "BLOCK_Q_HEAD": 64, "stage1_num_warps": 8, "stage1_num_stages": 5, "stage2_num_warps": 8, "stage2_num_stages": 1}, "8": {"BLOCK_SEQ": 256, "BLOCK_N": 64, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 5, "stage2_num_warps": 4, "stage2_num_stages": 4}, "16": {"BLOCK_SEQ": 256, "BLOCK_N": 32, "BLOCK_Q_HEAD": 64, "stage1_num_warps": 4, "stage1_num_stages": 4, "stage2_num_warps": 8, "stage2_num_stages": 1}, "32": {"BLOCK_SEQ": 256, "BLOCK_N": 16, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 2, "stage1_num_stages": 3, "stage2_num_warps": 8, "stage2_num_stages": 1}, "64": {"BLOCK_SEQ": 256, "BLOCK_N": 16, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 2, "stage1_num_stages": 3, "stage2_num_warps": 8, "stage2_num_stages": 1}, "128": {"BLOCK_SEQ": 256, "BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 4, "stage2_num_warps": 4, "stage2_num_stages": 1}}, "8192": {"1": {"BLOCK_SEQ": 128, "BLOCK_N": 64, "BLOCK_Q_HEAD": 64, "stage1_num_warps": 8, "stage1_num_stages": 5, "stage2_num_warps": 16, "stage2_num_stages": 1}, "8": {"BLOCK_SEQ": 256, "BLOCK_N": 32, "BLOCK_Q_HEAD": 64, "stage1_num_warps": 4, "stage1_num_stages": 4, "stage2_num_warps": 16, "stage2_num_stages": 1}, "16": {"BLOCK_SEQ": 256, "BLOCK_N": 16, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 2, "stage1_num_stages": 3, "stage2_num_warps": 8, "stage2_num_stages": 1}, "32": {"BLOCK_SEQ": 256, "BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 5, "stage2_num_warps": 16, "stage2_num_stages": 1}, "64": {"BLOCK_SEQ": 256, "BLOCK_N": 32, "BLOCK_Q_HEAD": 32, "stage1_num_warps": 4, "stage1_num_stages": 4, "stage2_num_warps": 4, "stage2_num_stages": 1}}} \ No newline at end of file +{"256": {"1": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 4, "stage2_num_stages": 1}, "8": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 3, "stage2_num_warps": 4, "stage2_num_stages": 1}, "16": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 3, "stage2_num_warps": 4, "stage2_num_stages": 1}, "32": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 4, "stage2_num_stages": 1}, "64": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 4, "stage2_num_stages": 1}, "128": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 5, "stage2_num_warps": 4, "stage2_num_stages": 3}, "256": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 5, "stage2_num_warps": 1, "stage2_num_stages": 1}}, "512": {"1": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 4, "stage2_num_stages": 1}, "8": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 3, "stage2_num_warps": 4, "stage2_num_stages": 1}, "16": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 4, "stage2_num_stages": 1}, "32": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 4, "stage2_num_stages": 1}, "64": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 4, "stage2_num_stages": 1}, "128": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 5, "stage2_num_warps": 1, "stage2_num_stages": 1}, "256": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 5, "stage2_num_warps": 2, "stage2_num_stages": 1}}, "1024": {"1": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 3, "stage2_num_warps": 4, "stage2_num_stages": 1}, "8": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 4, "stage2_num_stages": 1}, "16": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 4, "stage2_num_stages": 1}, "32": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 4, "stage2_num_stages": 1}, "64": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 4, "stage2_num_stages": 1}, "128": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 5, "stage2_num_warps": 1, "stage2_num_stages": 1}, "256": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 5, "stage2_num_warps": 1, "stage2_num_stages": 1}}, "2048": {"1": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 1, "stage2_num_warps": 4, "stage2_num_stages": 1}, "8": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 4, "stage2_num_stages": 1}, "16": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 4, "stage2_num_stages": 1}, "32": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 4, "stage2_num_stages": 1}, "64": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 4, "stage2_num_stages": 1}, "128": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 5, "stage2_num_warps": 4, "stage2_num_stages": 3}, "256": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 4, "stage2_num_warps": 1, "stage2_num_stages": 1}}, "4096": {"1": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 3, "stage2_num_warps": 4, "stage2_num_stages": 1}, "8": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 5, "stage2_num_warps": 4, "stage2_num_stages": 1}, "16": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 4, "stage2_num_stages": 1}, "32": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 4, "stage2_num_stages": 1}, "64": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 5, "stage2_num_warps": 1, "stage2_num_stages": 3}, "128": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 4, "stage2_num_warps": 2, "stage2_num_stages": 1}}, "8192": {"1": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 2, "stage2_num_warps": 4, "stage2_num_stages": 1}, "8": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 4, "stage2_num_warps": 4, "stage2_num_stages": 1}, "16": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 5, "stage2_num_warps": 4, "stage2_num_stages": 1}, "32": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 8, "stage1_num_stages": 5, "stage2_num_warps": 1, "stage2_num_stages": 3}, "64": {"BLOCK_N": 32, "BLOCK_Q_HEAD": 16, "stage1_num_warps": 4, "stage1_num_stages": 4, "stage2_num_warps": 1, "stage2_num_stages": 3}}} \ No newline at end of file diff --git a/lightllm/common/basemodel/cuda_graph.py b/lightllm/common/basemodel/cuda_graph.py index 85475c3ee..82e4a9c9a 100644 --- a/lightllm/common/basemodel/cuda_graph.py +++ b/lightllm/common/basemodel/cuda_graph.py @@ -1,5 +1,6 @@ import os import torch +import copy from lightllm.utils.log_utils import init_logger from lightllm.distributed import custom_comm_ops @@ -27,10 +28,18 @@ def capture_decode(self, decode_func, input_ids, infer_state): infer_state.max_len_in_batch = self.graph_max_len_in_batch infer_state.total_token_num = self.graph_max_len_in_batch * batch_size # warmup + # 因为有些推理过程的代码,会通过判断infer_state中是否存在某些属性来在一层上 + # 做一些初始化的操作,后续层可以复用这些计算的结果,如 + # lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding.py + # 中做的一些操作,所以在 warmup 的时候,需要调用infer_state的copy函数做一个 + # 浅拷贝,不然后续传入到cuda graph捕获过程中后,infer_state因为提前拥有了这些属性, + # 导致不会重新初始化,这样捕获过程中会不能捕获这些临时添加到 infer_state 管理对象 + # 中的 tensor。 for _ in range(1): torch.cuda.synchronize() - decode_func(input_ids, infer_state) + decode_func(input_ids, copy.copy(infer_state)) # infer_state must copy() torch.cuda.synchronize() + with custom_comm_ops.lightllm_capture_graph(): with torch.cuda.graph(graph_obj, pool=self.mempool): predict_logics = decode_func(input_ids, infer_state) diff --git a/lightllm/common/vllm_kernel/_ops.py b/lightllm/common/vllm_kernel/_ops.py index d8c8bd4b6..00fdcb748 100644 --- a/lightllm/common/vllm_kernel/_ops.py +++ b/lightllm/common/vllm_kernel/_ops.py @@ -37,11 +37,6 @@ supports_moe_ops = True -try: - from torch.library import register_fake -except ImportError: - from torch.library import impl_abstract as register_fake - # for vllm_quant.py torch.ops._C.cutlass_scaled_mm = torch.ops.vllm_total.cutlass_scaled_mm @@ -327,21 +322,6 @@ def gptq_gemm( return torch.ops.vllm_total.gptq_gemm(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, use_exllama, bit) -if hasattr(torch.ops.vllm_total, "gptq_gemm"): - - @register_fake("vllm_total::gptq_gemm") - def _gptq_gemm_fake( - a: torch.Tensor, - b_q_weight: torch.Tensor, - b_gptq_qzeros: torch.Tensor, - b_gptq_scales: torch.Tensor, - b_g_idx: torch.Tensor, - use_exllama: bool, - bit: int, - ) -> torch.Tensor: - return torch.empty((a.size(0), b_q_weight.size(1)), dtype=a.dtype, device=a.device) - - def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor, bit: int) -> None: torch.ops.vllm_total.gptq_shuffle(q_weight, q_perm, bit) @@ -376,179 +356,6 @@ def gptq_marlin_24_gemm( ) -if hasattr(torch.ops.vllm_total, "gptq_marlin_24_gemm"): - - @register_fake("vllm_total::gptq_marlin_24_gemm") - def _gptq_marlin_24_gemm_fake( - a: torch.Tensor, - b_q_weight: torch.Tensor, - b_meta: torch.Tensor, - b_scales: torch.Tensor, - workspace: torch.Tensor, - b_q_type, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - ) -> torch.Tensor: - return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype) - - @register_fake("vllm_total::gptq_marlin_gemm") - def _gptq_marlin_gemm_fake( - a: torch.Tensor, - b_q_weight: torch.Tensor, - b_scales: torch.Tensor, - b_zeros: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - has_zp: bool = False, - use_fp32_reduce: bool = False, - ) -> torch.Tensor: - return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype) - - @register_fake("vllm_total::ggml_dequantize") - def _ggml_dequantize_fake(W: torch.Tensor, quant_type: int, m: torch.SymInt, n: torch.SymInt) -> torch.Tensor: - return torch.empty((m, n), dtype=torch.float16, device=W.device) - - @register_fake("vllm_total::ggml_mul_mat_vec_a8") - def _ggml_mul_mat_vec_a8_fake( - W: torch.Tensor, - X: torch.Tensor, - quant_type: int, - row: torch.SymInt, - ) -> torch.Tensor: - return torch.empty((1, row), dtype=torch.float16, device=W.device) - - @register_fake("vllm_total::ggml_mul_mat_a8") - def _ggml_mul_mat_a8_fake( - W: torch.Tensor, - X: torch.Tensor, - quant_type: int, - row: torch.SymInt, - ) -> torch.Tensor: - batch = X.size(0) - return torch.empty((batch, row), dtype=torch.float16, device=W.device) - - @register_fake("vllm_total::marlin_qqq_gemm") - def _marlin_qqq_gemm_fake( - a: torch.Tensor, - b_q_weight: torch.Tensor, - s_tok: torch.Tensor, - s_ch: torch.Tensor, - s_group: torch.Tensor, - workspace: torch.Tensor, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - ) -> torch.Tensor: - return torch.empty((size_m, size_n), dtype=torch.float16, device=a.device) - - @register_fake("vllm_total::marlin_gemm") - def _marlin_gemm_fake( - a: torch.Tensor, - b_q_weight: torch.Tensor, - b_scales: torch.Tensor, - workspace: torch.Tensor, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - ) -> torch.Tensor: - return torch.empty((size_m, size_n), dtype=torch.float16, device=a.device) - - @register_fake("vllm_total::awq_dequantize") - def _awq_dequantize_fake( - qweight: torch.Tensor, - scales: torch.Tensor, - zeros: torch.Tensor, - split_k_iters: torch.SymInt, - thx: int, - thy: int, - ) -> torch.Tensor: - in_c = qweight.size(0) - qout_c = qweight.size(1) - out_c = qout_c * 8 - return torch.empty((in_c, out_c), dtype=scales.dtype, device=scales.device) - - @register_fake("vllm_total::awq_gemm") - def _awq_gemm_fake( - input: torch.Tensor, - qweight: torch.Tensor, - qzeros: torch.Tensor, - scales: torch.Tensor, - split_k_iters: torch.SymInt, - ) -> torch.Tensor: - num_in_feats = input.size(0) - return torch.empty( - (split_k_iters, num_in_feats, qweight.size(1) * 8), dtype=input.dtype, device=input.device - ).sum(0) - - @register_fake("vllm_total::aqlm_gemm") - def _aqlm_gemm_fake( - input: torch.Tensor, - codes: torch.Tensor, - codebooks: torch.Tensor, - scales: torch.Tensor, - codebook_partition_sizes: List[int], - bias: Optional[torch.Tensor], - ) -> torch.Tensor: - out_features = codes.size(0) * codebooks.size(2) - flat_input = input.reshape((-1, input.size(-1))) - flat_output = torch.empty((flat_input.size(0), out_features), dtype=input.dtype, device=input.device) - - output_sizes = list(input.shape) - output_sizes.pop() - output_sizes.append(-1) - return flat_output.reshape(tuple(output_sizes)) - - @register_fake("vllm_total::aqlm_dequant") - def _aqlm_dequant_fake( - codes: torch.Tensor, codebooks: torch.Tensor, codebook_partition_sizes: List[int] - ) -> torch.Tensor: - in_features = codes.size(1) * 8 - out_features = codes.size(0) - return torch.empty((out_features, in_features), dtype=codebooks.dtype, device=codebooks.device) - - @register_fake("vllm_total::fp8_marlin_gemm") - def _fp8_marlin_gemm_fake( - a: torch.Tensor, - b_q_weight: torch.Tensor, - b_scales: torch.Tensor, - workspace: torch.Tensor, - num_bits: int, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - ) -> torch.Tensor: - return torch.empty((size_m, size_n), dtype=a.dtype, device=a.device) - - @register_fake("vllm_total::machete_gemm") - def machete_gemm_fake( - a: torch.Tensor, - # Should be the tensor returned by machete_prepack_B - b_q: torch.Tensor, - b_type, - b_scales: Optional[torch.Tensor] = None, - b_zeros: Optional[torch.Tensor] = None, - b_group_size: Optional[int] = None, - c: Optional[torch.Tensor] = None, - alpha: Optional[float] = None, - beta: Optional[float] = None, - schedule: Optional[str] = None, - ) -> torch.Tensor: - m = a.size(0) - n = b_q.size(1) - return torch.empty((m, n), device=a.device, dtype=a.dtype) - - @register_fake("vllm_total::machete_prepack_B") - def machete_prepack_B_fake(b_q_weight: torch.Tensor, b_type) -> torch.Tensor: - return torch.empty_like(b_q_weight, memory_format=torch.contiguous_format) - - # cutlass def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool: return torch.ops.vllm_total.cutlass_scaled_mm_supports_fp8(cuda_device_capability) @@ -732,13 +539,6 @@ def machete_prepack_B(b_q_weight: torch.Tensor, b_type) -> torch.Tensor: return torch.ops.vllm_total.machete_prepack_B(b_q_weight, b_type.id) -if hasattr(torch.ops.vllm_total, "permute_cols"): - - @register_fake("vllm_total::permute_cols") - def _permute_cols_fake(a: torch.Tensor, perm: torch.Tensor) -> torch.Tensor: - return torch.empty_like(a) - - def permute_cols(a: torch.Tensor, perm: torch.Tensor) -> torch.Tensor: return torch.ops.vllm_total.permute_cols(a, perm) @@ -961,34 +761,6 @@ def topk_softmax( torch.ops.vllm_moe.topk_softmax(topk_weights, topk_ids, token_expert_indicies, gating_output) -if supports_moe_ops and hasattr(torch.ops.vllm_moe, "marlin_gemm_moe"): - - @register_fake("vllm_moe::marlin_gemm_moe") - def marlin_gemm_moe_fake( - a: torch.Tensor, - b_q_weights: torch.Tensor, - sorted_ids: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - b_scales: torch.Tensor, - b_zero_points: torch.Tensor, - g_idx: torch.Tensor, - perm: torch.Tensor, - workspace: torch.Tensor, - b_q_type, - size_m: torch.SymInt, - size_n: torch.SymInt, - size_k: torch.SymInt, - is_k_full: bool, - num_experts: int, - topk: int, - moe_block_size: int, - replicate_input: bool, - apply_weights: bool, - ) -> torch.Tensor: - return torch.empty((size_m, topk, size_n), dtype=a.dtype, device=a.device) - - def reshape_and_cache( key: torch.Tensor, value: torch.Tensor, diff --git a/lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py b/lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py index f8eaa0865..87777463b 100644 --- a/lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py +++ b/lightllm/models/deepseek2/layer_infer/transformer_layer_infer.py @@ -10,6 +10,8 @@ context_attention_fwd, context_attention_fwd_no_prompt_cache, ) +from lightllm.models.deepseek2.triton_kernel.context_flashattention_nopad_with_v import context_attention_fwd_with_v +from lightllm.models.deepseek2.triton_kernel.sample_kv import sample_kv from lightllm.models.deepseek2.triton_kernel.gqa_flash_decoding import gqa_token_decode_attention_flash_decoding from lightllm.models.llama.layer_infer.transformer_layer_infer import LlamaTransformerLayerInfer @@ -54,6 +56,7 @@ def __init__(self, layer_num, tp_rank, world_size, network_config, mode=[]): if mscale_all_dim: mscale = get_deepseek_mscale(scaling_factor, mscale_all_dim) self.softmax_scale = self.softmax_scale * mscale * mscale + self.enable_cc_method = os.getenv("ENABLE_CC_METHOD", "False").upper() in ["ON", "TRUE", "1"] super().__init__(layer_num, tp_rank, world_size, network_config, mode) self.enable_dp = os.getenv("ENABLE_DP", "0").upper() in ["ON", "TRUE", "1"] if self.enable_dp: @@ -65,7 +68,14 @@ def __init__(self, layer_num, tp_rank, world_size, network_config, mode=[]): return def _bind_attention(self): - self._context_attention_kernel = partial(Deepseek2TransformerLayerInfer._context_attention_kernel_origin, self) + if self.enable_cc_method: + self._context_attention_kernel = partial( + Deepseek2TransformerLayerInfer._context_attention_kernel_with_CC, self + ) + else: + self._context_attention_kernel = partial( + Deepseek2TransformerLayerInfer._context_attention_kernel_origin, self + ) self._token_attention_kernel = partial( Deepseek2TransformerLayerInfer._token_gqa_decode_attention_flashdecoding, self ) @@ -123,6 +133,65 @@ def _get_o( o_tensor = layer_weight.o_weight_.mm(input.reshape(-1, self.tp_q_head_num_ * self.qk_nope_head_dim)) return o_tensor + def _context_attention_kernel_with_CC( + self, + q: torch.Tensor, + kv, + infer_state: Deepseek2InferStateInfo, + layer_weight: Deepseek2TransformerLayerWeight, + out=None, + ) -> torch.Tensor: + if infer_state.use_dynamic_prompt_cache: + kv = infer_state.mem_manager.kv_buffer[self.layer_num_] + compressed_kv = self.alloc_tensor( + [infer_state.total_token_num, 1, layer_weight.kv_lora_rank], dtype=kv.dtype + ) + k_rope = self.alloc_tensor([infer_state.total_token_num, 1, self.qk_rope_head_dim], dtype=kv.dtype) + sample_kv( + kv, + compressed_kv, + k_rope, + infer_state.b_req_idx, + infer_state.b_seq_len, + infer_state.req_manager.req_to_token_indexs, + ) + else: + compressed_kv, k_rope = torch.split( # (b*s, 1, kv_lora + qk_r) + kv, [layer_weight.kv_lora_rank, layer_weight.qk_rope_head_dim], dim=-1 + ) + + # CC + k_nope = self.alloc_tensor( + [compressed_kv.shape[0], q.shape[1], self.qk_nope_head_dim], + dtype=compressed_kv.dtype, + ) + v = self.alloc_tensor( + k_nope.shape, + dtype=compressed_kv.dtype, + ) + compressed_kv = compressed_kv.view(-1, layer_weight.kv_lora_rank) + wk = layer_weight.k_b_proj_.weight.view(-1, layer_weight.kv_lora_rank) + wv = layer_weight.v_b_proj_.weight.transpose(1, 2).view(-1, layer_weight.kv_lora_rank) + torch.mm(compressed_kv, wk.transpose(0, 1), out=k_nope.reshape(compressed_kv.shape[0], -1)) + torch.mm(compressed_kv, wv.transpose(0, 1), out=v.reshape(compressed_kv.shape[0], -1)) + + q_nope, q_rope = q[:, :, : -self.qk_rope_head_dim], q[:, :, -self.qk_rope_head_dim :] + o_tensor = self.alloc_tensor(q_nope.shape, dtype=q_nope.dtype) if out is None else out + context_attention_fwd_with_v( + q_nope, + q_rope, + k_nope, + k_rope, + v, + o_tensor.view(-1, self.tp_q_head_num_, q_nope.shape[-1]), + infer_state.b_start_loc, + infer_state.b_seq_len, + infer_state.b_ready_cache_len, + infer_state.max_len_in_batch, + self.softmax_scale, + ) + return o_tensor + def _context_attention_kernel_origin( self, q: torch.Tensor, diff --git a/lightllm/models/deepseek2/triton_kernel/context_flashattention_nopad_with_v.py b/lightllm/models/deepseek2/triton_kernel/context_flashattention_nopad_with_v.py index 49cd34556..04e6facbb 100644 --- a/lightllm/models/deepseek2/triton_kernel/context_flashattention_nopad_with_v.py +++ b/lightllm/models/deepseek2/triton_kernel/context_flashattention_nopad_with_v.py @@ -20,28 +20,18 @@ def _fwd_kernel_with_v( B_Start_Loc, B_Seqlen, # B_LOC 内部记录每个batch 输入的真实位置, B_SEQ_len 记录当前输入的真实长度 Out, - Req_to_tokens, - B_req_idx, stride_q_bs, stride_q_h, - stride_q_d, stride_q_rope_bs, stride_q_rope_h, - stride_q_rope_d, stride_k_bs, stride_k_h, - stride_k_d, stride_k_rope_bs, stride_k_rope_h, - stride_k_rope_d, stride_vbs, stride_vh, - stride_vd, stride_obs, stride_oh, - stride_od, - stride_req_to_tokens_b, - stride_req_to_tokens_s, b_prompt_cache_len, BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, @@ -57,7 +47,6 @@ def _fwd_kernel_with_v( cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) prompt_cache_len = tl.load(b_prompt_cache_len + cur_batch) cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) - prompt_cache_len - cur_batch_req_idx = tl.load(B_req_idx + cur_batch) block_start_loc = BLOCK_M * start_m @@ -66,20 +55,23 @@ def _fwd_kernel_with_v( offs_d = tl.arange(0, BLOCK_DMODEL) offs_rope_d = tl.arange(0, BLOCK_ROPE_DMODEL) offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - off_q = ( - (cur_batch_in_all_start_index + offs_m[:, None]) * stride_q_bs - + cur_head * stride_q_h - + offs_d[None, :] * stride_q_d - ) + off_q = (cur_batch_in_all_start_index + offs_m[:, None]) * stride_q_bs + cur_head * stride_q_h + offs_d[None, :] off_q_rope = ( (cur_batch_in_all_start_index + offs_m[:, None]) * stride_q_rope_bs + cur_head * stride_q_rope_h - + offs_rope_d[None, :] * stride_q_rope_d + + offs_rope_d[None, :] ) + off_k = offs_n[None, :] * stride_k_bs + cur_k_head * stride_k_h + offs_d[:, None] + off_k_rope = offs_n[None, :] * stride_k_rope_bs + offs_rope_d[:, None] + off_v = offs_n[:, None] * stride_vbs + cur_k_head * stride_vh + offs_d[None, :] q = tl.load(Q_nope + off_q, mask=offs_m[:, None] < cur_batch_seq_len, other=0.0) q_rope = tl.load(Q_rope + off_q_rope, mask=offs_m[:, None] < cur_batch_seq_len, other=0.0) + k_ptrs = K_nope + off_k + k_rope_ptrs = K_rope + off_k_rope + v_ptrs = V + off_v + # initialize pointer to m and l m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") l_i = tl.zeros([BLOCK_M], dtype=tl.float32) @@ -91,20 +83,20 @@ def _fwd_kernel_with_v( for start_n in range(0, block_mask * block_end_loc, BLOCK_N): start_n = tl.multiple_of(start_n, BLOCK_N) # -- compute qk ---- - k_loc = tl.load( - Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx + stride_req_to_tokens_s * (start_n + offs_n), - mask=(start_n + offs_n) < block_end_loc, - other=0, + k = tl.load( + k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_k_bs, + mask=(start_n + offs_n[None, :]) < block_end_loc, + other=0.0, + ) + k_rope = tl.load( + k_rope_ptrs + (cur_batch_in_all_start_index + start_n) * stride_k_rope_bs, + mask=(start_n + offs_n[None, :]) < block_end_loc, + other=0.0, ) - off_k = k_loc[None, :] * stride_k_bs + cur_k_head * stride_k_h + offs_d[:, None] * stride_k_d - off_k_rope = k_loc[None, :] * stride_k_rope_bs + offs_rope_d[:, None] * stride_k_rope_d - k = tl.load(K_nope + off_k, mask=(start_n + offs_n[None, :]) < block_end_loc, other=0.0) - k_rope = tl.load(K_rope + off_k_rope, mask=(start_n + offs_n[None, :]) < block_end_loc, other=0.0) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) qk += tl.dot(q, k) qk += tl.dot(q_rope, k_rope) - qk *= sm_scale qk = tl.where(offs_m[:, None] + prompt_cache_len >= start_n + offs_n[None, :], qk, float("-100000000.0")) @@ -126,19 +118,18 @@ def _fwd_kernel_with_v( acc_scale = tl.where(offs_m + prompt_cache_len >= start_n, acc_scale, 1.0) acc = acc * acc_scale[:, None] # update acc - off_v = k_loc[:, None] * stride_vbs + cur_k_head * stride_vh + offs_d[None, :] * stride_vd - v = tl.load(V + off_v, mask=(start_n + offs_n[:, None]) < block_end_loc, other=0.0) + v = tl.load( + v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs, + mask=(start_n + offs_n[:, None]) < block_end_loc, + other=0.0, + ) p = p.to(v.dtype) acc += tl.dot(p, v) # update m_i and l_i l_i = l_i_new m_i = m_i_new # initialize pointers to output - off_o = ( - (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs - + cur_head * stride_oh - + offs_d[None, :] * stride_od - ) + off_o = (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs + cur_head * stride_oh + offs_d[None, :] out_ptrs = Out + off_o tl.store(out_ptrs, acc, mask=offs_m[:, None] < cur_batch_seq_len) return @@ -152,12 +143,10 @@ def context_attention_fwd_with_v( k_rope, v, o, - b_req_idx, b_start_loc, b_seq_len, b_prompt_cache_len, max_input_len, - req_to_token_indexs, softmax_scale, ): @@ -194,28 +183,18 @@ def context_attention_fwd_with_v( b_start_loc, b_seq_len, o, - req_to_token_indexs, - b_req_idx, q_nope.stride(0), q_nope.stride(1), - q_nope.stride(2), q_rope.stride(0), q_rope.stride(1), - q_rope.stride(2), k_nope.stride(0), k_nope.stride(1), - k_nope.stride(2), k_rope.stride(0), k_rope.stride(1), - k_rope.stride(2), v.stride(0), v.stride(1), - v.stride(2), o.stride(0), o.stride(1), - o.stride(2), - req_to_token_indexs.stride(0), - req_to_token_indexs.stride(1), b_prompt_cache_len=b_prompt_cache_len, BLOCK_M=BLOCK, BLOCK_DMODEL=q_nope_dim, @@ -225,203 +204,3 @@ def context_attention_fwd_with_v( num_stages=1, ) return - - -@triton.jit -def _fwd_kernel_no_prompt_cache_with_v( - Q_nope, - Q_rope, - K_nope, - K_rope, - V, - sm_scale, - B_Start_Loc, - B_Seqlen, # B_LOC 内部记录每个batch 输入的真实位置, B_SEQ_len 记录当前输入的真实长度 - Out, - stride_q_bs, - stride_q_h, - stride_q_d, - stride_q_rope_bs, - stride_q_rope_h, - stride_q_rope_d, - stride_k_bs, - stride_k_h, - stride_k_d, - stride_k_rope_bs, - stride_k_rope_h, - stride_k_rope_d, - stride_vbs, - stride_vh, - stride_vd, - stride_obs, - stride_oh, - stride_od, - BLOCK_M: tl.constexpr, - BLOCK_DMODEL: tl.constexpr, - BLOCK_ROPE_DMODEL: tl.constexpr, - BLOCK_N: tl.constexpr, -): - cur_batch = tl.program_id(0) - cur_head = tl.program_id(1) - start_m = tl.program_id(2) - - cur_k_head = cur_head - - cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) - cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) - - block_start_loc = BLOCK_M * start_m - - # initialize offsets - offs_n = tl.arange(0, BLOCK_N) - offs_d = tl.arange(0, BLOCK_DMODEL) - offs_rope_d = tl.arange(0, BLOCK_ROPE_DMODEL) - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - off_q = ( - (cur_batch_in_all_start_index + offs_m[:, None]) * stride_q_bs - + cur_head * stride_q_h - + offs_d[None, :] * stride_q_d - ) - off_rope_q = ( - (cur_batch_in_all_start_index + offs_m[:, None]) * stride_q_rope_bs - + cur_head * stride_q_rope_h - + offs_rope_d[None, :] * stride_q_rope_d - ) - off_k = offs_n[None, :] * stride_k_bs + cur_k_head * stride_k_h + offs_d[:, None] * stride_k_d - off_rope_k = offs_n[None, :] * stride_k_rope_bs + offs_rope_d[:, None] * stride_k_rope_d - off_v = offs_n[:, None] * stride_vbs + cur_k_head * stride_vh + offs_d[None, :] * stride_vd - - q = tl.load(Q_nope + off_q, mask=offs_m[:, None] < cur_batch_seq_len, other=0.0) - q_rope = tl.load(Q_rope + off_rope_q, mask=offs_m[:, None] < cur_batch_seq_len, other=0.0) - - k_ptrs = K_nope + off_k - k_rope_ptrs = K_rope + off_rope_k - v_ptrs = V + off_v - - # initialize pointer to m and l - m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") - l_i = tl.zeros([BLOCK_M], dtype=tl.float32) - acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) - - block_mask = tl.where(block_start_loc < cur_batch_seq_len, 1, 0) - - for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N): - start_n = tl.multiple_of(start_n, BLOCK_N) - # -- compute qk ---- - k = tl.load( - k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_k_bs, - mask=(start_n + offs_n[None, :]) < cur_batch_seq_len, - other=0.0, - ) - k_rope = tl.load( - k_rope_ptrs + (cur_batch_in_all_start_index + start_n) * stride_k_rope_bs, - mask=(start_n + offs_n[None, :]) < cur_batch_seq_len, - other=0.0, - ) - - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k) - qk += tl.dot(q_rope, k_rope) - qk *= sm_scale - qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, float("-inf")) - - # -- compute m_ij, p, l_ij - m_ij = tl.max(qk, 1) - p = tl.exp(qk - m_ij[:, None]) - l_ij = tl.sum(p, 1) - # -- update m_i and l_i - m_i_new = tl.maximum(m_i, m_ij) - alpha = tl.exp(m_i - m_i_new) - beta = tl.exp(m_ij - m_i_new) - l_i_new = alpha * l_i + beta * l_ij - # -- update output accumulator -- - # scale p - p_scale = beta / l_i_new - p = p * p_scale[:, None] - # scale acc - acc_scale = l_i / l_i_new * alpha - acc = acc * acc_scale[:, None] - # update acc - v = tl.load( - v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs, - mask=(start_n + offs_n[:, None]) < cur_batch_seq_len, - other=0.0, - ) - p = p.to(v.dtype) - acc += tl.dot(p, v) - # update m_i and l_i - l_i = l_i_new - m_i = m_i_new - # initialize pointers to output - off_o = ( - (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs - + cur_head * stride_oh - + offs_d[None, :] * stride_od - ) - out_ptrs = Out + off_o - tl.store(out_ptrs, acc, mask=offs_m[:, None] < cur_batch_seq_len) - return - - -@torch.no_grad() -def context_attention_fwd_no_prompt_cache_with_v( - q_nope, q_rope, k_nope, k_rope, v, o, b_start_loc, b_seq_len, max_input_len, softmax_scale -): - q_nope_dim = q_nope.shape[-1] - q_rope_dim = q_rope.shape[-1] - assert q_nope_dim == k_nope.shape[-1] - assert q_rope_dim == k_rope.shape[-1] - assert q_nope_dim in {16, 32, 64, 128, 256, 512} - assert q_rope_dim in {16, 32, 64, 128, 256} - assert q_nope_dim == v.shape[-1] - - if q_nope_dim >= 512: - BLOCK = 64 if not TESLA else 32 - else: - BLOCK = 128 if not TESLA else 64 - - if q_nope.dtype == torch.float32: - BLOCK = BLOCK // 4 - - sm_scale = softmax_scale - batch, head = b_seq_len.shape[0], q_nope.shape[1] - - grid = (batch, head, triton.cdiv(max_input_len, BLOCK)) # batch, head, - - num_warps = 4 if q_nope_dim <= 64 else 8 - _fwd_kernel_no_prompt_cache_with_v[grid]( - q_nope, - q_rope, - k_nope, - k_rope, - v, - sm_scale, - b_start_loc, - b_seq_len, - o, - q_nope.stride(0), - q_nope.stride(1), - q_nope.stride(2), - q_rope.stride(0), - q_rope.stride(1), - q_rope.stride(2), - k_nope.stride(0), - k_nope.stride(1), - k_nope.stride(2), - k_rope.stride(0), - k_rope.stride(1), - k_rope.stride(2), - v.stride(0), - v.stride(1), - v.stride(2), - o.stride(0), - o.stride(1), - o.stride(2), - BLOCK_M=BLOCK, - BLOCK_DMODEL=q_nope_dim, - BLOCK_ROPE_DMODEL=q_rope_dim, - BLOCK_N=BLOCK, - num_warps=num_warps, - num_stages=1, - ) - return diff --git a/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding.py b/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding.py index 8b688ddd3..256dfce5a 100644 --- a/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding.py +++ b/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding.py @@ -1,6 +1,8 @@ import os import torch import torch.multiprocessing as mp +import triton +import triton.language as tl from typing import List from lightllm.utils.log_utils import init_logger from .gqa_flash_decoding_config import MlaDecodeAttentionKernelConfig @@ -44,27 +46,19 @@ def gqa_token_decode_attention_flash_decoding( out_dtype=torch.bfloat16, ) + BLOCK_N = run_config["BLOCK_N"] + from .gqa_flash_decoding_stage1 import flash_decode_stage1 from .gqa_flash_decoding_stage2 import flash_decode_stage2 o_tensor = alloc_tensor_func(q_nope.shape, q_nope.dtype, q_nope.device) if out is None else out - mid_o_block_seq = torch.empty([1], dtype=torch.int64, device="cuda") - mid_o_batch_start_index = alloc_tensor_func( - [ - batch_size, - ], - dtype=torch.int64, - device="cuda", - ) - + fake_decode_att_block_seq = torch.empty([0], dtype=torch.int64, device="cuda") mid_o = torch.empty([q_head_num, 0, kv_lora_rank], dtype=torch.float32, device="cuda") mid_o_logexpsum = torch.empty([q_head_num, 0], dtype=torch.float32, device="cuda") vsm_count = flash_decode_stage1( - infer_state.total_token_num_tensor, - mid_o_block_seq, - mid_o_batch_start_index, + fake_decode_att_block_seq, q_nope.view(calcu_shape1), q_rope.view(calcu_shape2), kv_nope, @@ -79,13 +73,40 @@ def gqa_token_decode_attention_flash_decoding( **run_config ) + if not hasattr(infer_state, "decode_att_block_seq"): + assert batch_size <= 2048 + decode_att_block_seq = torch.empty( + [ + 1, + ], + dtype=torch.int64, + device="cuda", + ) + mid_o_batch_start_index = torch.empty( + [ + batch_size, + ], + dtype=torch.int64, + device="cuda", + ) + _fwd_kernel_calcu_index_and_block_seq[(1,)]( + infer_state.b_seq_len, + decode_att_block_seq, + mid_o_batch_start_index, + vsm_count, + batch_size, + BLOCK_N=BLOCK_N, + num_warps=4, + ) + + infer_state.decode_att_block_seq = decode_att_block_seq + infer_state.mid_o_batch_start_index = mid_o_batch_start_index + mid_o = torch.empty([q_head_num, vsm_count * 4 + batch_size, kv_lora_rank], dtype=torch.float32, device="cuda") mid_o_logexpsum = torch.empty([q_head_num, vsm_count * 4 + batch_size], dtype=torch.float32, device="cuda") flash_decode_stage1( - infer_state.total_token_num_tensor, - mid_o_block_seq, - mid_o_batch_start_index, + infer_state.decode_att_block_seq, q_nope.view(calcu_shape1), q_rope.view(calcu_shape2), kv_nope, @@ -101,8 +122,8 @@ def gqa_token_decode_attention_flash_decoding( ) flash_decode_stage2( - mid_o_block_seq, - mid_o_batch_start_index, + infer_state.decode_att_block_seq, + infer_state.mid_o_batch_start_index, mid_o, mid_o_logexpsum, infer_state.b_seq_len, @@ -110,3 +131,26 @@ def gqa_token_decode_attention_flash_decoding( **run_config ) return o_tensor + + +@triton.jit +def _fwd_kernel_calcu_index_and_block_seq( + b_seq_len_ptr, + mid_o_decode_att_block_seq_ptr, + mid_o_batch_start_index_ptr, + num_sm, + batch_size, + BLOCK_N: tl.constexpr, +): + b_seq_len = tl.load(b_seq_len_ptr + tl.arange(0, 2048), mask=tl.arange(0, 2048) < batch_size, other=0) + total_token_num = tl.sum(b_seq_len) + + block_seq = tl.cast(total_token_num / (num_sm * 4), dtype=tl.int32) + 1 + block_seq = tl.cdiv(block_seq, BLOCK_N) * BLOCK_N + + block_seq_len = tl.cdiv(b_seq_len, block_seq) + cumsum_seq_len = tl.cumsum(block_seq_len) + batch_start_index = cumsum_seq_len - block_seq_len + tl.store(mid_o_batch_start_index_ptr + tl.arange(0, 2048), batch_start_index, mask=tl.arange(0, 2048) < batch_size) + tl.store(mid_o_decode_att_block_seq_ptr, block_seq) + return diff --git a/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage1.py b/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage1.py index 9028c6e85..9191de6d1 100644 --- a/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage1.py +++ b/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage1.py @@ -35,9 +35,7 @@ def _fwd_kernel_flash_decode_stage1_padding( stride_mid_od, stride_mid_o_eh, stride_mid_o_es, - total_token_ptr, block_size_ptr, - batch_start_index_ptr, num_sm, head_group_num, head_num, @@ -51,15 +49,9 @@ def _fwd_kernel_flash_decode_stage1_padding( ): # cur_kv_head = 0 sm_id = tl.program_id(0).to(tl.int64) - grid_id = sm_id out_batch_start_index = tl.cast(0, tl.int64) - total_token_num = tl.load(total_token_ptr, eviction_policy="evict_last") + block_seq = tl.load(block_size_ptr, eviction_policy="evict_last") - block_seq = tl.cast(total_token_num / num_sm / 4, dtype=tl.int32) + 1 - block_seq = tl.cdiv(block_seq, BLOCK_N) * BLOCK_N - - if grid_id == 0: - tl.store(block_size_ptr, block_seq) cur_q_head_offs = tl.arange(0, Q_HEAD_NUM) offs_d = tl.arange(0, BLOCK_DMODEL) offs_rope_d = tl.arange(0, BLOCK_ROPE_DMODEL) @@ -163,9 +155,6 @@ def _fwd_kernel_flash_decode_stage1_padding( ) sm_id += num_sm - if grid_id == 0: - tl.store(batch_start_index_ptr + cur_batch, out_batch_start_index) - out_batch_start_index += cur_block_num // head_group_num sm_id -= cur_block_num return @@ -173,9 +162,7 @@ def _fwd_kernel_flash_decode_stage1_padding( @torch.no_grad() def flash_decode_stage1( - total_token_num_tensor: torch.Tensor, - out_block_seq: torch.Tensor, - batch_start_index: torch.Tensor, + in_block_seq: torch.Tensor, q_nope, q_rope, kv_nope, @@ -227,9 +214,7 @@ def flash_decode_stage1( *kv_rope.stride(), *mid_out.stride(), *mid_out_logsumexp.stride(), - total_token_num_tensor, - out_block_seq, - batch_start_index, + in_block_seq, num_sm=1, head_group_num=head_group_num, head_num=q_head_num, @@ -271,9 +256,7 @@ def flash_decode_stage1( *kv_rope.stride(), *mid_out.stride(), *mid_out_logsumexp.stride(), - total_token_num_tensor, - out_block_seq, - batch_start_index, + in_block_seq, num_sm=num_sm, head_group_num=head_group_num, head_num=q_head_num, diff --git a/lightllm/models/deepseek2/triton_kernel/sample_kv.py b/lightllm/models/deepseek2/triton_kernel/sample_kv.py new file mode 100644 index 000000000..912f8603d --- /dev/null +++ b/lightllm/models/deepseek2/triton_kernel/sample_kv.py @@ -0,0 +1,104 @@ +import torch + +import triton +import triton.language as tl + +TESLA = "Tesla" in torch.cuda.get_device_name(0) +CUDA_CAPABILITY = torch.cuda.get_device_capability() + + +@triton.jit +def _sample_kv_kernel( + KV_input, + KV_nope, + KV_rope, + B_start_loc, + B_Seqlen, + Req_to_tokens, + B_req_idx, + stride_input_dim, + stride_nope_dim, + stride_rope_dim, + stride_req_to_tokens_b, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_ROPE_DMODEL: tl.constexpr, +): + cur_batch = tl.program_id(0) + start_m = tl.program_id(1) + + cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) + cur_batch_req_idx = tl.load(B_req_idx + cur_batch) + cur_batch_start_loc = tl.load(B_start_loc + cur_batch) + + offs_nope_d = tl.arange(0, BLOCK_DMODEL) + offs_rope_d = tl.arange(0, BLOCK_ROPE_DMODEL) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + + block_end_loc = tl.minimum((start_m + 1) * BLOCK_M, cur_batch_seq_len) + + kv_loc = tl.load( + Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx + offs_m, + mask=offs_m < block_end_loc, + other=0, + ) + off_kv_nope = kv_loc[:, None] * stride_input_dim + offs_nope_d[None, :] + off_kv_rope = kv_loc[:, None] * stride_input_dim + (offs_rope_d + BLOCK_DMODEL)[None, :] + kv_nope = tl.load(KV_input + off_kv_nope, mask=offs_m[:, None] < block_end_loc, other=0.0) + kv_rope = tl.load(KV_input + off_kv_rope, mask=offs_m[:, None] < block_end_loc, other=0.0) + off_nope = (offs_m + cur_batch_start_loc)[:, None] * stride_nope_dim + offs_nope_d[None, :] + off_rope = (offs_m + cur_batch_start_loc)[:, None] * stride_rope_dim + offs_rope_d[None, :] + nope_ptrs = KV_nope + off_nope + rope_ptrs = KV_rope + off_rope + tl.store(nope_ptrs, kv_nope, mask=offs_m[:, None] < block_end_loc) + tl.store(rope_ptrs, kv_rope, mask=offs_m[:, None] < block_end_loc) + return + + +@torch.no_grad() +def sample_kv( + kv_input, + kv_nope, + kv_rope, + b_req_idx, + b_seq_len, + req_to_token_indexs, +): + BLOCK = 128 if not TESLA else 64 + + nope_dim = kv_nope.shape[-1] + rope_dim = kv_rope.shape[-1] + if nope_dim >= 512: + BLOCK = 64 if not TESLA else 32 + else: + BLOCK = 128 if not TESLA else 64 + + batch = b_seq_len.shape[0] + + max_input_len = b_seq_len.max() + grid = ( + batch, + triton.cdiv(max_input_len, BLOCK), + ) + num_warps = 4 if nope_dim <= 64 else 8 + + b_start_loc = torch.cat([torch.zeros([1], device=b_seq_len.device, dtype=b_seq_len.dtype), b_seq_len[1:].cumsum(0)]) + _sample_kv_kernel[grid]( + kv_input, + kv_nope, + kv_rope, + b_start_loc, + b_seq_len, + req_to_token_indexs, + b_req_idx, + kv_input.stride(0), + kv_nope.stride(0), + kv_rope.stride(0), + req_to_token_indexs.stride(0), + BLOCK_M=BLOCK, + BLOCK_DMODEL=nope_dim, + BLOCK_ROPE_DMODEL=rope_dim, + num_warps=num_warps, + num_stages=1, + ) + return diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py index 7bde5724f..598d38929 100644 --- a/lightllm/server/httpserver/manager.py +++ b/lightllm/server/httpserver/manager.py @@ -358,27 +358,29 @@ async def _wait_to_token_package( prompt_cache_len = metadata.pop("prompt_cache_len", 0) prompt_cache_ratio = prompt_cache_len / prompt_tokens format_start_time = datetime.datetime.fromtimestamp(start_time).strftime("%Y-%m-%d %H:%M:%S") - logger.info( - f"X-Request-Id:{x_request_id} " - f"X-Session-Id:{x_session_id} start_time:{format_start_time} " - f"lightllm_req_id:{group_request_id} first_token_cost:{first_token_cost_ms}ms " - f"total_cost_time:{total_cost_time_ms}ms,out_token_counter:{out_token_counter} " - f"mean_per_token_cost_time: {mean_per_token_cost_time_ms}ms " - f"prompt_token_num:{prompt_tokens} " - f"prompt_cache_len:{prompt_cache_len} " - f"prompt_cache_ratio:{prompt_cache_ratio} " - ) - self.metric_client.histogram_observe( - "lightllm_request_inference_duration", total_cost_time_ms / 1000.0 - ) - self.metric_client.histogram_observe( - "lightllm_request_mean_time_per_token_duration", mean_per_token_cost_time_ms / 1000.0 - ) - self.metric_client.histogram_observe( - "lightllm_request_first_token_duration", first_token_cost_ms / 1000.0 - ) - self.metric_client.histogram_observe("lightllm_request_generated_tokens", out_token_counter) - self.metric_client.counter_inc("lightllm_request_success") + if request is not None: + logger.info( + f"X-Request-Id:{x_request_id} " + f"X-Session-Id:{x_session_id} start_time:{format_start_time} " + f"lightllm_req_id:{group_request_id} first_token_cost:{first_token_cost_ms}ms " + f"total_cost_time:{total_cost_time_ms}ms,out_token_counter:{out_token_counter} " + f"mean_per_token_cost_time: {mean_per_token_cost_time_ms}ms " + f"prompt_token_num:{prompt_tokens} " + f"prompt_cache_len:{prompt_cache_len} " + f"prompt_cache_ratio:{prompt_cache_ratio} " + f"sampling_params: {{{sampling_params.to_string()}}}" + ) + self.metric_client.histogram_observe( + "lightllm_request_inference_duration", total_cost_time_ms / 1000.0 + ) + self.metric_client.histogram_observe( + "lightllm_request_mean_time_per_token_duration", mean_per_token_cost_time_ms / 1000.0 + ) + self.metric_client.histogram_observe( + "lightllm_request_first_token_duration", first_token_cost_ms / 1000.0 + ) + self.metric_client.histogram_observe("lightllm_request_generated_tokens", out_token_counter) + self.metric_client.counter_inc("lightllm_request_success") return req_status.out_token_info_list.clear() diff --git a/lightllm/server/router/model_infer/mode_backend/continues_batch/post_process.py b/lightllm/server/router/model_infer/mode_backend/continues_batch/post_process.py index a602fcdd3..79f634727 100644 --- a/lightllm/server/router/model_infer/mode_backend/continues_batch/post_process.py +++ b/lightllm/server/router/model_infer/mode_backend/continues_batch/post_process.py @@ -1,27 +1,44 @@ -import re import torch -from typing import List, Tuple -from lightllm.server.router.model_infer.infer_batch import InferBatch +from typing import List from lightllm.common.basemodel.triton_kernel.apply_penalty import apply_penalty +from dataclasses import dataclass + + +@dataclass +class OverlapStream: + overlap_stream: torch.cuda.Stream = None + + def get_overlap_stream(self): + if self.overlap_stream is None: + self.overlap_stream = torch.cuda.Stream() + return self.overlap_stream + + +g_single_overlap_stream = OverlapStream() def sample(logits, reqs, eos_id: List[int] = [2]): + + with torch.cuda.stream(g_single_overlap_stream.get_overlap_stream()): + ( + presence_penalties, + frequency_penalties, + repetition_penalties, + exponential_decay_length_penalties, + temperatures, + top_ps, + top_ks, + p_token_ids, + p_token_counts, + p_cumsum_seq_len, + p_max_len_in_batch, + length_penalty_idx, + mask_eos_reqs, + ) = _get_post_sample_tensors(reqs) + + torch.cuda.current_stream().wait_stream(g_single_overlap_stream.get_overlap_stream()) + logits = logits.contiguous() - ( - presence_penalties, - frequency_penalties, - repetition_penalties, - exponential_decay_length_penalties, - temperatures, - top_ps, - top_ks, - p_token_ids, - p_token_counts, - p_cumsum_seq_len, - p_max_len_in_batch, - length_penalty_idx, - mask_eos_reqs, - ) = _get_post_sample_tensors(reqs) apply_penalty( logits, diff --git a/lightllm/server/router/model_infer/model_rpc.py b/lightllm/server/router/model_infer/model_rpc.py index d5f1a6106..064cad0ae 100644 --- a/lightllm/server/router/model_infer/model_rpc.py +++ b/lightllm/server/router/model_infer/model_rpc.py @@ -93,15 +93,23 @@ def exposed_add_batch(self, batch_id, reqs): # @calculate_time(show=False, min_cost_ms=300) def exposed_prefill_batch(self, batch_id): - if self.world_size != 1: - batch_id = obtain(batch_id) - return self.backend.prefill_batch(batch_id) + try: + if self.world_size != 1: + batch_id = obtain(batch_id) + return self.backend.prefill_batch(batch_id) + except Exception as e: + err_msg = str(e) + logger.exception(f"Batch prefill encountered an unexpected ERROR: {err_msg}") # @calculate_time(show=True, min_cost_ms=200) def exposed_decode_batch(self, batch_id): - if self.world_size != 1: - batch_id = obtain(batch_id) - return self.backend.decode_batch(batch_id) + try: + if self.world_size != 1: + batch_id = obtain(batch_id) + return self.backend.decode_batch(batch_id) + except Exception as e: + err_msg = str(e) + logger.exception(f"Batch decode encountered an unexpected ERROR: {err_msg}") # @calculate_time(show=True, min_cost_ms=0.1) def exposed_filter_batch(self, batch_id, req_id_list, finished_req_id_list): diff --git a/lightllm/server/router/req_queue/continues_batch/impl.py b/lightllm/server/router/req_queue/continues_batch/impl.py index 07cc83379..3c6b2d38d 100644 --- a/lightllm/server/router/req_queue/continues_batch/impl.py +++ b/lightllm/server/router/req_queue/continues_batch/impl.py @@ -104,12 +104,15 @@ def generate_new_batch(self, current_batch: Batch): def _calcu_batch_token_load_batch_not_none(self, current_batch: Batch): is_busy = self.is_busy() self._init_cache_list(current_batch, is_busy) - self.cache_len_list.sort(key=lambda x: -x[1]) - left_out_len_array = np.array([e[1] for e in self.cache_len_list]) - has_run_len_array = np.array([e[0] for e in self.cache_len_list]) - cum_run_len_array = np.cumsum(has_run_len_array) - size_array = np.arange(1, len(self.cache_len_list) + 1, 1) - need_max_token_num = (left_out_len_array * size_array + cum_run_len_array).max() + if len(self.cache_len_list) != 0: + self.cache_len_list.sort(key=lambda x: -x[1]) + left_out_len_array = np.array([e[1] for e in self.cache_len_list]) + has_run_len_array = np.array([e[0] for e in self.cache_len_list]) + cum_run_len_array = np.cumsum(has_run_len_array) + size_array = np.arange(1, len(self.cache_len_list) + 1, 1) + need_max_token_num = (left_out_len_array * size_array + cum_run_len_array).max() + else: + need_max_token_num = 0 return ( need_max_token_num, (need_max_token_num + self.router.shared_token_load.get_frozened_token_count(self.dp_index)) diff --git a/lightllm/server/sampling_params.py b/lightllm/server/sampling_params.py index af8e6215f..1b8b16272 100644 --- a/lightllm/server/sampling_params.py +++ b/lightllm/server/sampling_params.py @@ -260,3 +260,9 @@ def to_origin_dict(self): ret["group_request_id"] = self.group_request_id ret["suggested_dp_index"] = self.suggested_dp_index return ret + + def to_string(self): + output_str = "" + for name, value in vars(self).items(): + output_str += f"{name}: {value} " + return output_str diff --git a/lightllm/utils/health_check.py b/lightllm/utils/health_check.py index fecf59843..cb5a755b5 100644 --- a/lightllm/utils/health_check.py +++ b/lightllm/utils/health_check.py @@ -1,3 +1,4 @@ +import os import asyncio import numpy as np from dataclasses import dataclass @@ -19,6 +20,7 @@ class HealthObj: _is_health: bool = True _is_health_checking: bool = False + timeout: int = int(os.getenv("HEALTH_TIMEOUT", 100)) def begin_check(self): self._is_health_checking = True @@ -65,7 +67,7 @@ async def check_timeout(results_generator): pass try: - await asyncio.wait_for(check_timeout(results_generator), timeout=88) + await asyncio.wait_for(check_timeout(results_generator), timeout=health_obj.timeout) health_obj.set_health() except asyncio.TimeoutError: health_obj.set_unhealth()