-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathstart_containers.py
executable file
·115 lines (102 loc) · 4.2 KB
/
start_containers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python3
#
# Copyright (c) 2018 Dell Inc., or its subsidiaries. All Rights Reserved.
#
# Written by Claudio Fahey <[email protected]>
#
"""
This script starts the TensorFlow Docker container on multiple hosts.
"""
import configargparse
import subprocess
from multiprocessing import Pool
import functools
def _run_remote(ssh_target, remote_cmd, check):
    """Echo and run one remote command through ``ssh -p 22``.

    Args:
        ssh_target: Destination handed to ssh, either ``host`` or
            ``user@host``.
        remote_cmd: List of words forming the command line to run remotely.
        check: Forwarded to ``subprocess.run``; when true, a non-zero exit
            status raises ``subprocess.CalledProcessError``.
    """
    cmd = ['ssh', '-p', '22', ssh_target] + remote_cmd
    print(' '.join(cmd))
    subprocess.run(cmd, check=check)


def start_container(args, host):
    """Stop, optionally pull, and optionally start the container on ``host``.

    The container named ``args.container_name`` is always stopped first.
    When ``args.start`` is true the image ``args.docker_image`` is pulled
    (only if ``args.pull`` is true) and a new detached container is started.

    Args:
        args: Parsed options. Reads ``user``, ``container_name``, ``start``,
            ``pull``, ``docker_image``, ``gpus``, ``scripts_dir``,
            ``benchmarks_dir``, ``imagenet_data_dir`` and
            ``imagenet_scratch_dir``.
        host: Name of the host to connect to over SSH.

    Raises:
        subprocess.CalledProcessError: If the ``docker pull`` or
            ``docker run`` command exits with a non-zero status.
    """
    # Build the SSH destination once so that every command uses the same
    # "user@host" form. Joining user and host without the '@' separator
    # would produce a bogus host name such as "roothost1".
    ssh_target = '%s%s' % (args.user + '@' if args.user else '', host)

    # Stopping is best-effort: the container may simply not be running, so a
    # failure here must not abort the restart.
    _run_remote(ssh_target, ['docker', 'stop', args.container_name], check=False)

    if not args.start:
        return

    if args.pull:
        _run_remote(ssh_target, ['docker', 'pull', args.docker_image], check=True)

    # Only request GPUs when a non-empty value was configured.
    gpu_options = ['--gpus', args.gpus] if args.gpus else []
    run_cmd = [
        'docker',
        'run',
        '--rm',
        '--detach',
        '--privileged',
    ] + gpu_options + [
        '-v', '%s:/scripts' % args.scripts_dir,
        '-v', '%s:/tensorflow-benchmarks' % args.benchmarks_dir,
        '-v', '%s:/imagenet-data:ro' % args.imagenet_data_dir,
        '-v', '%s:/imagenet-scratch' % args.imagenet_scratch_dir,
        '-v', '/mnt:/mnt',
        '--network=host',
        '--shm-size=1g',
        '--ulimit', 'memlock=-1',
        '--ulimit', 'stack=67108864',
        '--name', args.container_name,
        args.docker_image,
        # ssh hands the remote words to the remote shell as a single command
        # line, so the embedded double quotes are what keep the script
        # together as one argument to `bash -c` on the remote side.
        'bash', '-c', '"/usr/sbin/sshd ; sleep infinity"',
    ]
    _run_remote(ssh_target, run_cmd, check=True)
def start_containers(args):
    """Run ``start_container`` for every host in ``args.host`` in parallel.

    The work is distributed over a pool of 16 worker processes; the call
    returns once all hosts have been processed.

    Args:
        args: Parsed options; ``args.host`` is the iterable of host names and
            the whole object is passed as the first argument of
            ``start_container``.
    """
    # Bind the shared options so each worker only needs the host name.
    start_on_host = functools.partial(start_container, args)
    with Pool(16) as worker_pool:
        worker_pool.map(start_on_host, args.host)
def main():
    """Parse the options and start the containers on all requested hosts.

    Options are read from the command line and from a YAML config file
    (``start_containers.yaml`` by default, or the file given with
    ``--config``/``-c``). The parsed result is handed to
    ``start_containers``.
    """
    parser = configargparse.ArgParser(
        default_config_files=['start_containers.yaml'],
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        description='Start Docker containers for TensorFlow benchmarking',
    )

    # (flags, keyword arguments) for each option, registered in this order.
    # --nopull and --nostart are negative switches: giving them stores False
    # into `pull` / `start`, which otherwise default to True.
    option_specs = [
        (('--config', '-c'),
         dict(required=False, is_config_file=True, help='config file path')),
        (('--host', '-H'),
         dict(action='append', required=True,
              help='List of hosts on which to invoke processes.')),
        (('--scripts_dir',),
         dict(action='store',
              default='/mnt/isilon/data/ai-benchmark-util',
              help='Fully qualified path to the scripts directory.')),
        (('--benchmarks_dir',),
         dict(action='store',
              default='/mnt/isilon/data/tensorflow-benchmarks',
              help='Fully qualified path to the TensorFlow Benchmarks directory.')),
        (('--gpus',),
         dict(action='store', default='all')),
        (('--imagenet_data_dir',),
         dict(action='store',
              default='/mnt/isilon/data/imagenet-data',
              help='Fully qualified path to the directory containing the original ImageNet data.')),
        (('--imagenet_scratch_dir',),
         dict(action='store',
              default='/mnt/isilon/data/imagenet-scratch',
              help='Fully qualified path to the ImageNet scratch directory.')),
        (('--container_name',),
         dict(action='store', default='tf',
              help='Name to assign to the containers.')),
        (('--docker_image',),
         dict(action='store',
              default='claudiofahey/tensorflow:19.03-py3-custom',
              help='Docker image tag.')),
        (('--user',),
         dict(action='store', default='', help='SSH user')),
        (('--nopull',),
         dict(dest='pull', action='store_false', default=True,
              help='Pull image')),
        (('--nostart',),
         dict(dest='start', action='store_false', default=True,
              help='Start containers')),
    ]
    for flags, settings in option_specs:
        parser.add_argument(*flags, **settings)

    start_containers(parser.parse_args())
# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()