From f91943e4a65458386afb38d85fd528c96d7e9770 Mon Sep 17 00:00:00 2001
From: chengzeyi <ichengzeyi@gmail.com>
Date: Fri, 20 Dec 2024 17:30:16 +0800
Subject: [PATCH] update hunyuanvideo performance on single L20

---
 docs/performance/hunyuanvideo.md  |  3 ++-
 examples/run_hunyuan_video_usp.sh | 43 +++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100755 examples/run_hunyuan_video_usp.sh
diff --git a/docs/performance/hunyuanvideo.md b/docs/performance/hunyuanvideo.md
index 0832bba..e214923 100644
--- a/docs/performance/hunyuanvideo.md
+++ b/docs/performance/hunyuanvideo.md
@@ -10,6 +10,7 @@ xDiT is [HunyuanVideo](https://github.com/Tencent/HunyuanVideo?tab=readme-ov-fil
 |----------|--------|---------|---------|---------|
 | H100 | 1,904.08 | 925.04 | 514.08 | 337.58 |
 | H20 | 6,639.17 | 3,400.55 | 1,762.86 | 940.97 |
+| L20 | 6,043.88 | | | |
 
 </center>
 
@@ -22,4 +23,4 @@ xDiT is [HunyuanVideo](https://github.com/Tencent/HunyuanVideo?tab=readme-ov-fil
 | H100 | 1,735.01 | 934.09 | 645.45 | 367.02 |
 | H20 | 6,621.46 | 3,400.55 | 2,310.48 | 1,214.67 |
 
-</center>
\ No newline at end of file
+</center>
diff --git a/examples/run_hunyuan_video_usp.sh b/examples/run_hunyuan_video_usp.sh
new file mode 100755
index 0000000..c1f8813
--- /dev/null
+++ b/examples/run_hunyuan_video_usp.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+set -x
+
+export PYTHONPATH=$PWD:$PYTHONPATH
+
+# HunyuanVideo configuration
+SCRIPT="hunyuan_video_usp_example.py"
+MODEL_ID="/cfs/dit/HunyuanVideo"
+# MODEL_ID="tencent/HunyuanVideo"
+INFERENCE_STEP=50
+
+mkdir -p ./results
+
+# HunyuanVideo specific task args
+TASK_ARGS="--height 720 --width 1280 --num_frames 129"
+
+# HunyuanVideo parallel configuration
+N_GPUS=8
+PARALLEL_ARGS="--ulysses_degree 4 --ring_degree 2"
+# CFG_ARGS="--use_cfg_parallel"
+
+# Uncomment and modify these as needed
+# PIPEFUSION_ARGS="--num_pipeline_patch 8"
+# OUTPUT_ARGS="--output_type latent"
+# PARALLLEL_VAE="--use_parallel_vae"
+ENABLE_TILING="--enable_tiling"
+ENABLE_MODEL_CPU_OFFLOAD="--enable_model_cpu_offload"
+# COMPILE_FLAG="--use_torch_compile"
+
+torchrun --nproc_per_node=$N_GPUS ./examples/$SCRIPT \
+--model $MODEL_ID \
+$PARALLEL_ARGS \
+$TASK_ARGS \
+$PIPEFUSION_ARGS \
+$OUTPUT_ARGS \
+--num_inference_steps $INFERENCE_STEP \
+--warmup_steps 0 \
+--prompt "A cat walks on the grass, realistic" \
+$CFG_ARGS \
+$PARALLLEL_VAE \
+$ENABLE_TILING \
+$ENABLE_MODEL_CPU_OFFLOAD \
+$COMPILE_FLAG