#!/bin/bash
# Parameter sweep ranges (the commented-out lines below are alternative sweeps)
# num_micros=(8 16 32)
# sp_splits=(1 4)
# seq_lens=(8096 16384)
# model_configs=('3.5b' '7b')
export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_6,mlx5_7,mlx5_8
export CUDA_DEVICE_MAX_CONNECTIONS=1
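# CUDA_DEVICE_MAX_CONNECTIONS=1 is required by Megatron-LM so that tensor/
# sequence-parallel communication kernels are launched in issue order and can
# be overlapped with computation.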
DATETIME=$(date +'date_%y-%m-%d_time_%H-%M-%S')
PP_SP_STRS=('uniform_comp')
num_micros=(16)
sp_splits=(4)
export DATA_PATH="/workspace/workspace/Megatron-LM/"
# sp_splits=(1)
# export MASTER_ADDR=localhost
# export MASTER_PORT=32768
# export WORLD_SIZE=1
seq_lens=(65536)
model_configs=('30b')
VPP_SIZES=(2 4)
TP_SIZE=8
PP_SIZE=8
export PROFILE="true"
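# With PROFILE=true the training command below is wrapped in `nsys profile`;
# capture is delimited by a cudaProfilerApi range, which the training script is
# assumed to emit (e.g. via torch.cuda.profiler.start()/stop()).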
# Distributed
MICRO_BATCH=1
# WORLD_SIZE=1
TRAIN_ITER=10
GPUS_PER_NODE=8
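# WORLD_SIZE and MASTER_ADDR are expected from the launcher environment (the
# commented defaults above cover single-node runs). A minimal sanity check --
# an assumption, since the launcher may already guarantee both:
: "${WORLD_SIZE:?WORLD_SIZE must be set (number of nodes)}"
: "${MASTER_ADDR:?MASTER_ADDR must be set (rendezvous host)}"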
NGPUS=$((WORLD_SIZE * GPUS_PER_NODE))
# MASTER_ADDR=g4006
MASTER_PORT=12306
export TP_SIZE PP_SIZE MICRO_BATCH WORLD_SIZE GPUS_PER_NODE MASTER_ADDR MASTER_PORT PP_SP_STR
# Model size configurations (layers / hidden size / attention heads)
declare -A config_1_3b=(
  [num_layers]=24
  [hidden]=2048
  [num_attn_heads]=16
)
declare -A config_2_7b=(
  [num_layers]=32
  [hidden]=2560
  [num_attn_heads]=32
)
declare -A config_7b=(
  [num_layers]=32
  [hidden]=4096
  [num_attn_heads]=32
)
declare -A config_13b=(
  [num_layers]=40
  [hidden]=5120
  [num_attn_heads]=40
)
declare -A config_30b=(
  [num_layers]=64
  [hidden]=6144
  [num_attn_heads]=64
)
# Map each model size name to the name of its config array
declare -A model_config_map=(
  ["1.3b"]="config_1_3b"
  ["2.7b"]="config_2_7b"
  ["7b"]="config_7b"
  ["13b"]="config_13b"
  ["30b"]="config_30b"
)
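# Example of how the map is consumed in the sweep below: the size name selects
# an array name, and a nameref aliases it:
#   declare -n cfg="${model_config_map[30b]}"
#   echo "${cfg[num_layers]}"   # -> 64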
# Debug switch: set to 1 to stream run output to the screen in addition to the log
DEBUG=1
mkdir -p ./exp_logs/exps
EXP_OUTPUT_FILE="./exp_logs/exps/${DATETIME}_exp.log"
echo "micro_bsz, num_of_batch, seq_length, vpp_size, tp_size, pp_size, sp_splits, model_size, split_strategy, dp_size, throughput, memory_array, one_step_time, Ngpus, tflops" > "$EXP_OUTPUT_FILE"
for VPP_SIZE in "${VPP_SIZES[@]}"; do
  for num_micro in "${num_micros[@]}"; do
    for SEQ_LENGTH in "${seq_lens[@]}"; do
      for model_config in "${model_configs[@]}"; do
        for PP_SP in "${sp_splits[@]}"; do
          if [ "$PP_SP" -eq 1 ]; then
            PP_SP_STRS_RES=('average')
          else
            PP_SP_STRS_RES=("${PP_SP_STRS[@]}")
          fi
          for PP_SP_STR in "${PP_SP_STRS_RES[@]}"; do
            echo "$PP_SP_STR"
            DATETIME=$(date +'date_%y-%m-%d_time_%H-%M-%S')
            DP_SIZE=$((NGPUS / TP_SIZE / PP_SIZE))
            # global batch = micro-batch size * number of micro-batches * DP size
            GLOBAL_BATCH=$((MICRO_BATCH * num_micro * DP_SIZE))
            # Resolve the model size name to its config array via a nameref
            config_name="${model_config_map[$model_config]}"
            declare -n config=$config_name
            NUM_LAYERS="${config[num_layers]}"
            HIDDEN="${config[hidden]}"
            NUM_ATTN_HEADS="${config[num_attn_heads]}"
            # Layers per virtual pipeline stage; assumes NUM_LAYERS is divisible
            # by PP_SIZE * VPP_SIZE (e.g. 64 / 8 / 2 = 4)
            NUM_LAYERS_PER_VSTAGE=$((NUM_LAYERS / PP_SIZE / VPP_SIZE))
            echo "$NUM_LAYERS"
            echo "$NUM_LAYERS_PER_VSTAGE"
            # Export the variables so run.sh can read them
            export NUM_LAYERS HIDDEN NUM_ATTN_HEADS PP_SP SEQ_LENGTH GLOBAL_BATCH NUM_LAYERS_PER_VSTAGE VPP_SIZE TRAIN_ITER
            echo "****************MODEL_CONFIG*****************"
            echo "number of layers: $NUM_LAYERS"
            echo "hidden size: $HIDDEN"
            echo "number of attention heads: $NUM_ATTN_HEADS"
            echo "Running experiment with TP_SIZE=${TP_SIZE}, PP_SIZE=${PP_SIZE}, MICRO_BATCH=${MICRO_BATCH}, WORLD_SIZE=${WORLD_SIZE}, num_micro=${num_micro}, PP_SP=${PP_SP}, SEQ_LENGTH=${SEQ_LENGTH}, model_config=${model_config}"
            if [ "$PP_SP" -eq 1 ]; then
              PP_SP_STR_RES="none"
            else
              PP_SP_STR_RES=$PP_SP_STR
            fi
            logname="./exp_logs/TP${TP_SIZE}_PP${PP_SIZE}_VPP${VPP_SIZE}_MICRO${MICRO_BATCH}_WORLD${WORLD_SIZE}_num_micro${num_micro}_sp${PP_SP}_str${PP_SP_STR_RES}_seqlen${SEQ_LENGTH}_model${model_config}.log"
            echo "**********Log file***********"
            echo "File path: $logname"
            # Run run.sh; depending on DEBUG, send output to the log and/or the screen
            if [ "$DEBUG" -eq 1 ]; then
              set -o pipefail
              cmd="bash ./run.sh 2>&1 | tee $logname"
            else
              cmd="bash ./run.sh > $logname 2>&1"
            fi
            if [ "$PROFILE" = "true" ]; then
              cmd="nsys profile -s none -t nvtx,cuda --force-overwrite true --capture-range=cudaProfilerApi --capture-range-end=stop -o ${WORLD_SIZE}gpus_${model_config}model_${SEQ_LENGTH}seqlen_${PP_SP}_splits_${VPP_SIZE}vpp.nsys-rep $cmd"
            fi
            echo "$cmd"
            # eval (rather than bare $cmd) so the pipe and redirections inside
            # $cmd are actually interpreted by the shell
            eval "$cmd"
            return_code=$?
            if [ $return_code -ne 0 ]; then
              echo "Error: Process terminated abnormally."
              echo "$MICRO_BATCH,$num_micro,$SEQ_LENGTH,$VPP_SIZE,$TP_SIZE,$PP_SIZE,$PP_SP,$model_config,$PP_SP_STR_RES,$DP_SIZE,OOM,OOM,OOM,$NGPUS,OOM" >> "$EXP_OUTPUT_FILE"
            else
              echo "*********Output*********"
              output=$(tail -n 4 "$logname")
              echo "$output"
              # Result parsing is currently disabled: only failed runs are recorded
              # in $EXP_OUTPUT_FILE; metrics for successful runs live in the per-run logs.
              # declare -A data
              # while IFS=: read -r key value; do
              #   data["$key"]=$(echo $value | xargs) # xargs strips surrounding whitespace
              # done <<< "$output"
              #
              # echo "$MICRO_BATCH,$num_micro,$SEQ_LENGTH,$VPP_SIZE,$TP_SIZE,$PP_SIZE,$PP_SP,$model_config,$PP_SP_STR_RES,$DP_SIZE,${data[toks]},${data[mem_arr]},${data[time]},$NGPUS,${data[tflops]}" >> "$EXP_OUTPUT_FILE"
            fi
          done
        done
      done
    done
  done
done
echo "All experiments completed."
echo "Final Log file path: $EXP_OUTPUT_FILE"