diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 6f37dcaa3..1cfe08b25 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -45,7 +45,7 @@ jobs:
           os: [ubuntu-20.04]
           python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
           torch-version: ['2.1.2', '2.2.2', '2.3.1', '2.4.0', '2.5.1']
-          cuda-version: ['11.8.0', '12.3.2']
+          cuda-version: ['11.8.0', '12.4.1']
           # We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not.
           # Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI.
           # Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs)
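For context on the ABI note above: the build matrix produces two wheel flavors per configuration because a wheel must match the C++11 ABI setting of the PyTorch it is imported into. A minimal, hedged way to check which flavor the locally installed PyTorch needs (not part of this patch, and using only the public torch.compiled_with_cxx11_abi() helper) is:

    # Sketch (not part of this patch): report which C++11 ABI setting the
    # installed PyTorch was built with, which determines whether the
    # -D_GLIBCXX_USE_CXX11_ABI=1 or =0 flash-attn wheel is compatible.
    import torch

    if torch.compiled_with_cxx11_abi():
        print("PyTorch was built with the C++11 ABI; use the cxx11abi=TRUE wheel")
    else:
        print("PyTorch was built without the C++11 ABI; use the cxx11abi=FALSE wheel")
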
diff --git a/flash_attn/__init__.py b/flash_attn/__init__.py
index 218e299df..7a784dfee 100644
--- a/flash_attn/__init__.py
+++ b/flash_attn/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "2.7.0"
+__version__ = "2.7.0.post1"
 
 from flash_attn.flash_attn_interface import (
     flash_attn_func,
diff --git a/setup.py b/setup.py
index af03c5b89..c3de7df06 100644
--- a/setup.py
+++ b/setup.py
@@ -436,9 +436,9 @@ def get_wheel_url():
         # We're using the CUDA version used to build torch, not the one currently installed
         # _, cuda_version_raw = get_cuda_bare_metal_version(CUDA_HOME)
         torch_cuda_version = parse(torch.version.cuda)
-        # For CUDA 11, we only compile for CUDA 11.8, and for CUDA 12 we only compile for CUDA 12.3
+        # For CUDA 11, we only compile for CUDA 11.8, and for CUDA 12 we only compile for CUDA 12.4
         # to save CI time. Minor versions should be compatible.
-        torch_cuda_version = parse("11.8") if torch_cuda_version.major == 11 else parse("12.3")
+        torch_cuda_version = parse("11.8") if torch_cuda_version.major == 11 else parse("12.4")
         # cuda_version = f"{cuda_version_raw.major}{cuda_version_raw.minor}"
         cuda_version = f"{torch_cuda_version.major}"
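The mapping this hunk updates can be illustrated with a short standalone sketch. It mirrors the variable names from setup.py and assumes a CUDA-enabled torch install; the actual wheel URL assembly around it is omitted here.

    # Sketch of the CUDA-version pinning updated above (assumes torch was built
    # with CUDA; torch.version.cuda is None on CPU-only builds).
    # Only one CUDA minor version is compiled per major release to save CI time:
    # 11.x -> 11.8 and 12.x -> 12.4; minor versions are treated as compatible.
    import torch
    from packaging.version import parse

    torch_cuda_version = parse(torch.version.cuda)  # e.g. "12.4"
    torch_cuda_version = parse("11.8") if torch_cuda_version.major == 11 else parse("12.4")

    # The wheel tag only encodes the CUDA major version, e.g. "cu11" or "cu12".
    cuda_version = f"{torch_cuda_version.major}"
    print(f"cu{cuda_version}")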