Commit

Merge branch 'main' into ck_tile
rocking5566 authored Jul 22, 2024

Verified: this commit was created on GitHub.com and signed with GitHub’s verified signature.
2 parents 23a2b1c + 5f1ae4a commit a4417c7
Showing 116 changed files with 7,978 additions and 381 deletions.
39 changes: 8 additions & 31 deletions .github/workflows/publish.yml
@@ -43,9 +43,9 @@ jobs:
# Using ubuntu-20.04 instead of 22.04 for more compatibility (glibc). Ideally we'd use the
# manylinux docker image, but I haven't figured out how to install CUDA on manylinux.
os: [ubuntu-20.04]
python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12']
torch-version: ['1.12.1', '1.13.1', '2.0.1', '2.1.2', '2.2.2', '2.3.0', '2.4.0.dev20240407']
cuda-version: ['11.8.0', '12.2.2']
python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
torch-version: ['2.0.1', '2.1.2', '2.2.2', '2.3.1', '2.4.0.dev20240514']
cuda-version: ['11.8.0', '12.3.2']
# We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not.
# Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI.
# Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs)
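Side note on the ABI comment above: whether an installed PyTorch build uses the C++11 ABI can be checked directly, which tells you which wheel variant to pick. A minimal sketch:

```python
# Check the C++11 ABI setting of the installed PyTorch build.
# False (the current PyPI wheels) -> use the cxx11abiFALSE wheel;
# True (e.g. nvcr/NGC images) -> use the cxx11abiTRUE wheel.
import torch
print(torch.compiled_with_cxx11_abi())
```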
@@ -54,35 +54,13 @@ jobs:
exclude:
# see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
# Pytorch < 2.2 does not support Python 3.12
- torch-version: '1.12.1'
python-version: '3.12'
- torch-version: '1.13.1'
python-version: '3.12'
- torch-version: '2.0.1'
python-version: '3.12'
- torch-version: '2.1.2'
python-version: '3.12'
# Pytorch <= 1.12 does not support Python 3.11
- torch-version: '1.12.1'
python-version: '3.11'
# Pytorch >= 2.0 only supports Python >= 3.8
- torch-version: '2.0.1'
python-version: '3.7'
- torch-version: '2.1.2'
python-version: '3.7'
- torch-version: '2.2.2'
python-version: '3.7'
- torch-version: '2.3.0'
python-version: '3.7'
- torch-version: '2.4.0.dev20240407'
python-version: '3.7'
# Pytorch <= 2.0 only supports CUDA <= 11.8
- torch-version: '1.12.1'
cuda-version: '12.2.2'
- torch-version: '1.13.1'
cuda-version: '12.2.2'
- torch-version: '2.0.1'
cuda-version: '12.2.2'
cuda-version: '12.3.2'

steps:
- name: Checkout
@@ -97,7 +75,6 @@ jobs:
run: |
echo "MATRIX_CUDA_VERSION=$(echo ${{ matrix.cuda-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
echo "MATRIX_TORCH_VERSION=$(echo ${{ matrix.torch-version }} | awk -F \. {'print $1 "." $2'})" >> $GITHUB_ENV
echo "MATRIX_PYTHON_VERSION=$(echo ${{ matrix.python-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
- name: Free up disk space
if: ${{ runner.os == 'Linux' }}
@@ -141,8 +118,8 @@ jobs:
# see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
# This code is ugly, maybe there's a better way to do this.
export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
minv = {'1.12': 113, '1.13': 116, '2.0': 117, '2.1': 118, '2.2': 118, '2.3': 118, '2.4': 118}[env['MATRIX_TORCH_VERSION']]; \
maxv = {'1.12': 116, '1.13': 117, '2.0': 118, '2.1': 121, '2.2': 121, '2.3': 121, '2.4': 121}[env['MATRIX_TORCH_VERSION']]; \
minv = {'2.0': 117, '2.1': 118, '2.2': 118, '2.3': 118, '2.4': 118}[env['MATRIX_TORCH_VERSION']]; \
maxv = {'2.0': 118, '2.1': 121, '2.2': 121, '2.3': 121, '2.4': 121}[env['MATRIX_TORCH_VERSION']]; \
print(max(min(int(env['MATRIX_CUDA_VERSION']), maxv), minv))" \
)
if [[ ${{ matrix.torch-version }} == *"dev"* ]]; then
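The inline Python above clamps the requested CUDA version into the range each torch release supports; the same logic as a standalone sketch (tables copied from the new workflow lines):

```python
# Pick the CUDA toolkit version to build against: clamp the requested CUDA
# version into the range supported by the given torch release.
MINV = {'2.0': 117, '2.1': 118, '2.2': 118, '2.3': 118, '2.4': 118}
MAXV = {'2.0': 118, '2.1': 121, '2.2': 121, '2.3': 121, '2.4': 121}

def torch_cuda_version(torch_version: str, cuda_version: int) -> int:
    return max(min(cuda_version, MAXV[torch_version]), MINV[torch_version])

print(torch_cuda_version('2.3', 123))  # -> 121 (CUDA 12.3 requested, 12.1 is the newest torch 2.3 supports)
print(torch_cuda_version('2.0', 118))  # -> 118
```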
@@ -168,8 +145,8 @@ jobs:
export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
# Limit MAX_JOBS otherwise the github runner goes OOM
# CUDA 11.8 can compile with 2 jobs, but CUDA 12.2 goes OOM
MAX_JOBS=$([ "$MATRIX_CUDA_VERSION" == "122" ] && echo 1 || echo 2) FLASH_ATTENTION_FORCE_BUILD="TRUE" FLASH_ATTENTION_FORCE_CXX11_ABI=${{ matrix.cxx11_abi}} python setup.py bdist_wheel --dist-dir=dist
# CUDA 11.8 can compile with 2 jobs, but CUDA 12.3 goes OOM
MAX_JOBS=$([ "$MATRIX_CUDA_VERSION" == "123" ] && echo 1 || echo 2) FLASH_ATTENTION_FORCE_BUILD="TRUE" FLASH_ATTENTION_FORCE_CXX11_ABI=${{ matrix.cxx11_abi}} python setup.py bdist_wheel --dist-dir=dist
tmpname=cu${MATRIX_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi${{ matrix.cxx11_abi }}
wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2")
ls dist/*whl |xargs -I {} mv {} dist/${wheel_name}
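The `sed` call above splices a `+cu...torch...cxx11abi...` local-version tag into the wheel filename by replacing its second `-`. A hypothetical illustration (version numbers are made up):

```python
# Hypothetical illustration of the wheel renaming done by the sed command above.
name = "flash_attn-2.6.0-cp310-cp310-linux_x86_64.whl"   # illustrative input
tmpname = "cu123torch2.3cxx11abiFALSE"
parts = name.split("-")
renamed = parts[0] + "-" + parts[1] + "+" + tmpname + "-" + "-".join(parts[2:])
print(renamed)  # flash_attn-2.6.0+cu123torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
```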
41 changes: 41 additions & 0 deletions README.md
@@ -26,6 +26,42 @@ contains a partial list of places where FlashAttention is being used.
FlashAttention and FlashAttention-2 are free to use and modify (see LICENSE).
Please cite and credit FlashAttention if you use it.


## FlashAttention-3 beta release
FlashAttention-3 is optimized for Hopper GPUs (e.g. H100).

Blogpost: https://tridao.me/blog/2024/flash3/

Paper: https://tridao.me/publications/flash3/flash3.pdf

![FlashAttention-3 speedup on H100 80GB SXM5 with FP16](assets/flash3_fp16_fwd.png)

This is a beta release for testing and benchmarking before we integrate it with
the rest of the repo.

Currently released:
- FP16 forward and backward

Coming soon, over the next few days to a week:
- BF16
- Variable length (FP16, BF16)
- FP8 forward

Requirements: H100 / H800 GPU, CUDA >= 12.3.

To install:
```sh
cd hopper
python setup.py install
```
To run the test:
```sh
export PYTHONPATH=$PWD
pytest -q -s test_flash_attn.py
```
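Once built, a minimal forward call might look like the sketch below. The module name `flash_attn_interface` and the exact return values are assumptions about the beta layout; check the Python files under `hopper/` for the interface your version exposes.

```python
# Minimal FA3 beta usage sketch; interface details are assumptions to verify.
import torch
from flash_attn_interface import flash_attn_func  # assumed module name in hopper/

batch, seqlen, nheads, headdim = 2, 4096, 16, 128
q, k, v = (torch.randn(batch, seqlen, nheads, headdim, device="cuda", dtype=torch.float16)
           for _ in range(3))
result = flash_attn_func(q, k, v, causal=True)  # output tensor (possibly plus softmax LSE)
```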



## Installation and features

Requirements:
@@ -314,6 +350,11 @@ Implement deterministic backward pass. Thanks to engineers from [Meituan](https://www.meituan.com)
Support paged KV cache (i.e., [PagedAttention](https://arxiv.org/abs/2309.06180)).
Thanks to @beginlner for this contribution.
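A rough sketch of using the paged KV cache through `flash_attn_with_kvcache`; the shapes, the `block_table` layout, and the 256-token page size below are assumptions to verify against the docstring.

```python
# Rough sketch of decoding with a paged KV cache; argument names/shapes are assumptions.
import torch
from flash_attn import flash_attn_with_kvcache

batch, nheads, headdim, page_size, num_pages = 2, 8, 128, 256, 16
q = torch.randn(batch, 1, nheads, headdim, device="cuda", dtype=torch.float16)
k_cache = torch.randn(num_pages, page_size, nheads, headdim, device="cuda", dtype=torch.float16)
v_cache = torch.randn_like(k_cache)
# block_table[b, i]: index of the page holding tokens [i*page_size, (i+1)*page_size) of sequence b
block_table = torch.arange(num_pages, device="cuda", dtype=torch.int32).reshape(batch, -1)
cache_seqlens = torch.tensor([1000, 1500], device="cuda", dtype=torch.int32)
out = flash_attn_with_kvcache(q, k_cache, v_cache, cache_seqlens=cache_seqlens,
                              block_table=block_table, causal=True)
```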

### 2.6: Softcapping.

Support attention with softcapping, as used in Gemma-2 and Grok models.
Thanks to @Narsil for this contribution.
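A hedged sketch of passing softcapping through the 2.6 interface; the `softcap` keyword is per this release, and the cap value below is illustrative and model-specific.

```python
# Softcapping sketch: attention logits are soft-capped (tanh-style) before softmax.
import torch
from flash_attn import flash_attn_func

q, k, v = (torch.randn(2, 2048, 8, 128, device="cuda", dtype=torch.bfloat16)
           for _ in range(3))
out = flash_attn_func(q, k, v, causal=True, softcap=50.0)  # illustrative cap value
```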

## Performance

We present expected speedup (combined forward + backward pass) and memory savings from using FlashAttention against PyTorch standard attention, depending on sequence length, on different GPUs (speedup depends on memory bandwidth - we see more speedup on slower GPU memory).
Binary file added assets/flash3_fp16_fwd.png
2 changes: 1 addition & 1 deletion csrc/cutlass
