-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathauto_monitor_eval.py
executable file
·163 lines (140 loc) · 6.92 KB
/
auto_monitor_eval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import time
import subprocess
import os
import argparse
from pprint import pprint
def get_folder_size(Folderpath):
size=0
for path, dirs, files in os.walk(Folderpath):
for f in files:
fp = os.path.join(path, f)
size += os.path.getsize(fp)
return size
def dump_model_configs(config_list, config_file='hf_llama_7b.py',):
with open(config_file, 'w') as file:
print(f'writing {len(config_list)} models to hf_llama_7b.py:')
pprint(config_list)
file.write('from opencompass.models import HuggingFaceCausalLM\n\n')
file.write('models = ' + repr(config_list).replace(", ", ",\n").replace('}','}\n') + '\n')
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-s', '--serial', action='store_true', help='detect new ckpt after the current evaluation is finished')
args = parser.parse_args()
# run with: nohup python -u auto_monitor_eval.py > auto_monitor_eval.master_on_dgx-021.log 2>&1 &
# run with: nohup python -u auto_monitor_eval.py -s > auto_monitor_eval.master_on_dgx-021.log 2>&1 &
MODEL_TEMPLATE = {
'type': 'HuggingFaceCausalLM',
'abbr': 'exp2.6_phase2/20.00B',
'path': '/workspace/aifs4su/code/checkpoints/hkg_7b_nl_tp2_pp1_mb1_gb1024_gas4/pt2.6_phase2/hf_ckpt/20.00B',
'tokenizer_path': '/workspace/aifs4su/code/checkpoints/hkg_7b_nl_tp2_pp1_mb1_gb1024_gas4/pt2.6/hf_ckpt/hkg_hk50B_hf',
'tokenizer_kwargs': {
'padding_side': 'left',
'truncation_side': 'left',
'use_fast': False,
'trust_remote_code': True,
},
'max_out_len': 100,
'max_seq_len': 4096,
'batch_size': 16,
'model_kwargs': {
'device_map': 'auto',
'trust_remote_code': True,
},
'batch_padding': False,
'run_cfg': {'num_gpus': 1, 'num_procs': 1},
}
MAXIMUM_RUN_HOUR = 300
# megatron_ckpt_path = 'checkpoints/hkg_7b_nl_tp2_pp1_mb1_gb1024_gas4/pt2.6/checkpoint'
hf_ckpt_path = '/workspace/aifs4su/code/checkpoints/hkg_7b_nl_tp2_pp1_mb1_gb1024_gas4/pt2.6_phase2/hf_ckpt'
model_config_file = "/workspace/opencompass_new/hf_llama_7b.py"
MINIMUM_TOKEN_TO_TEST = 0.0
MAXIMUM_TOKEN_TO_TEST = float('inf')
SKIP_TOKEN_TO_TEST = []
# Record the start time
start_time = time.time()
# initial_files = []
MAX_INIT_TO_ADD = 0.0
initial_files = os.listdir(hf_ckpt_path)
initial_files = [
file for file in initial_files
if file.endswith('B') and float(file.replace('B','')) < MAX_INIT_TO_ADD
]
initial_files = sorted(initial_files, key=lambda x: float(x.replace('B','')), reverse=False)
TESTED_MODELS = []
RESULT_FOLDER = '20240404_exp2.6phase2_auto'
print('initial_files:', initial_files)
while True:
# watch new generated file in the folder
current_files = os.listdir(hf_ckpt_path)
current_files = [
file for file in current_files
if file.endswith('B') and float(file.replace('B',''))>=MINIMUM_TOKEN_TO_TEST and file not in SKIP_TOKEN_TO_TEST
]
# exclude folder that smaller than 10GB
current_files = [
file for file in current_files
if get_folder_size(os.path.join(hf_ckpt_path, file)) > 12*(1024**3)
]
current_files = sorted(current_files, key=lambda x: float(x.replace('B','')), reverse=False)
print('current_files:', current_files)
# Find new files
new_files = [file for file in current_files if file not in initial_files]
if len(new_files) > 0:
print("New checkpoint(s) detected:")
for file in new_files:
print(os.path.join(hf_ckpt_path, file))
model_configs = [] # new detected checkpoints
for file in new_files:
ckpt_to_eval = os.path.join(hf_ckpt_path,file)
# print(f'evaluating {ckpt_to_eval}')
trained_token = float(file.replace('B',''))
if trained_token < MINIMUM_TOKEN_TO_TEST or trained_token > MAXIMUM_TOKEN_TO_TEST:
continue
else:
new_model_conf = MODEL_TEMPLATE.copy()
base_abbr = os.path.basename(new_model_conf['abbr'])
new_model_conf['abbr'] = new_model_conf['abbr'].replace(base_abbr, file)
new_model_conf['path'] = ckpt_to_eval
model_configs.append(new_model_conf)
# upadte the minimum
MINIMUM_TOKEN_TO_TEST = trained_token if trained_token >= MINIMUM_TOKEN_TO_TEST else MINIMUM_TOKEN_TO_TEST
TESTED_MODELS = TESTED_MODELS + model_configs
print(f'##### Submitted Evaluation on checkpoint(s): #####')
print("\n".join(new_files))
# Define your bash command
# remember to change the config in hkgai/launcher/scripts/pretrain/pt2/batch_convert_ckpt.pt2.6.sh
# get the current date time
# bash_command = f"nohup python run.py eval_llama_7b_test.py > {eval_log_file} 2>&1 &"
# bash_command = f"nohup python test_print.py > {eval_log_file} 2>&1 &"
# write the new checkpoints to configs
dump_model_configs(model_configs, 'hf_llama_7b.py')
if args.serial:
bash_command = f"python -u run.py eval_llama_7b_test.py -l -r {RESULT_FOLDER}"
print(f'run command:', bash_command)
subprocess.run(bash_command.split())
else:
current_date_time = time.strftime("%Y%m%d-%H%M%S")
eval_log_file = f'auto_eval_{current_date_time}.log'
bash_command = f"bash auto_submit.sh {eval_log_file}"
print(f'run command:', bash_command)
subprocess.run(bash_command.split())
# Update the initial file list
print('update tested file list')
initial_files = current_files
# Record the end time
current_time = time.time()
print(f'The program has run {current_time-start_time} seconds')
if (current_time - start_time)/3600>=MAXIMUM_RUN_HOUR:
print(f'exceed maximum run time {MAXIMUM_RUN_HOUR} hour')
break
else:
print('no new file, hang')
time.sleep(600)
if len(TESTED_MODELS) > 1:
print("#"*10 + "\nRe-scan the previous failed evalution\n" + "#"*10)
dump_model_configs(TESTED_MODELS, 'hf_llama_7b.py')
bash_command = f"python -u run.py eval_llama_7b_test.py -l -r {RESULT_FOLDER}"
print(f'run command:', bash_command)
subprocess.run(bash_command.split())
# mannually run
# nohup python run.py eval_llama_7b_test.py > eval_659.95B.log 2>&1 &