#!/usr/bin/env bash
#
# Copyright 2011 Twitter, Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# This script is used to deploy a PyCascading job remotely to a server
# where Hadoop is installed. The variables below are the defaults.
#
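#
# For illustration, a typical invocation might look like this (the host name,
# key, and file names below are hypothetical examples, not part of this repo):
#
#   ./remote_deploy.sh -m -s hadoop-gateway.example.com \
#       -o '-i ~/.ssh/deploy_key' -f config.properties \
#       myjob.py util.py
#
# This deploys the master archives, copies config.properties to the job
# directory on the server (without bundling it into the submission), and
# submits myjob.py together with util.py.
#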
# This is the default server where the PyCascading script will be submitted
# to Hadoop. We assume we have SSH access to this server.
server=localhost
# This is the folder on the remote server where a temporary directory is
# going to be created for the submission. $HOME is only expanded on the
# remote server.
server_deploys_dir='$HOME/pycascading/deploys'
# The folder on the remote server where the PyCascading master jar will be
# placed. This must be given as an absolute path name so that the master
# files can be found from any directory.
server_build_dir='$HOME/pycascading/master'
# Additional SSH options (see "man ssh"; private key, etc.)
ssh_options=""
# Additional Hadoop options to be put into the generated run.sh runner script
hadoop_options=""
# Options over, the script begins here

usage()
{
    cat <<EOF
Usage: $(basename "$0") [options] <main_script> [additional_files]

The main_script gets executed by PyCascading. All additional_files are also
copied to the remote server and submitted together with the job to Hadoop.

Options:
   -h                Show this message.
   -m                Also deploy the PyCascading master archives before
                     submitting the job. The master archives must be on the
                     Hadoop server before a job can be run.
   -f <file>         Copy file to the server together with main_script, but
                     do not bundle it up for submission. This option may be
                     repeated for multiple files. File names cannot start
                     with a dot.
   -s <server>       The name of the remote server where Hadoop is installed
                     and the PyCascading scripts should be deployed to.
   -o <ssh_options>  Additional options for SSH (private key, etc.).
                     ssh_options is a single string enclosed in double or
                     single quotes, even if it contains several parameters.
   -O <hadoop_opts>  Additional Hadoop options to be put in the runner
                     script (run.sh).
   -r                Run the job immediately after submission with SSH. The
                     recommended way to run a script is with screen or nohup,
                     so that the job doesn't get interrupted if the terminal
                     connection goes down. Note that in this case no
                     additional command line parameters can be passed to
                     the job.
EOF
}

# Returns the absolute path for the parameter. We cannot use either realpath
# or readlink, as these may not be installed on MacOS.
# Thanks to Simon Radford.
realpath()
{
    if echo "$1" | grep '^/' >/dev/null; then
        # Path is absolute
        echo "$1"
    else
        # Path is relative to the working directory
        echo "$(pwd)/$1"
    fi
}
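
# For example (assuming the current working directory is /home/user):
#   realpath scripts/job.py  ->  /home/user/scripts/job.py
#   realpath /tmp/job.py     ->  /tmp/job.py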

# Remove the leading slashes from a path. This is needed when we package the
# Python sources, since tar strips leading slashes too, so on extraction
# there are no leading slashes.
remove_leading_slash()
{
    echo "$1" | sed 's/^\/*//'
}
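
# For example:
#   remove_leading_slash /home/user/job.py  ->  home/user/job.py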

# Copy the master jar over first? The -m option.
master_first=no
# Run the job after submission with SSH?
run_immediately='dont_run'

declare -a files_to_copy
while getopts ":hmf:s:o:O:r" OPTION; do
    case $OPTION in
        h)  usage
            exit 1
            ;;
        m)  master_first=yes
            ;;
        f)  files_to_copy+=("$OPTARG")
            ;;
        s)  server="$OPTARG"
            ;;
        o)  ssh_options="$OPTARG"
            ;;
        O)  hadoop_options="$OPTARG"
            ;;
        r)  run_immediately='do_run'
            ;;
    esac
done
shift $((OPTIND-1))
main_file="$1"
if [ -z "$main_file" ] && [ "$master_first" == no ]; then
    usage
    exit 3
fi

home_dir=$(realpath "$(dirname "$0")")
# This mktemp invocation works both on Linux and MacOS
tmp_dir=$(mktemp -d -t PyCascading-tmp-XXXXXX)

if [ "$master_first" == yes ]; then
    build_dir="$home_dir/build"
    if [ -e "$build_dir/pycascading.jar" ] && [ -e "$build_dir/pycascading.tgz" ]; then
        ln -s "$build_dir/pycascading.jar" "$build_dir/pycascading.tgz" \
            "$home_dir/python/pycascading/bootstrap.py" "$tmp_dir"
    else
        echo 'Build the PyCascading master package first in the "java" folder with ant.'
        exit 2
    fi
fi
if [ "$main_file" != "" ]; then
tar -c -z -f "$tmp_dir/sources.tgz" "$@"
if [ ${#files_to_copy} -gt 0 ]; then
tar -c -z -f "$tmp_dir/others.tgz" "${files_to_copy[@]}"
fi
fi
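
# At this point $tmp_dir holds, at most: sources.tgz (main_script plus any
# additional_files), others.tgz (the -f files), and the symlinked master
# files if -m was given. The setup.sh and run.sh helpers are generated next.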

#
# Create a setup file that will be run on the deploy server after everything
# is copied over.
#
cat >"$tmp_dir/setup.sh" <<EOF
#!/usr/bin/env bash
#
# This script is run on the deploy server to set up the PyCascading job folder
#
if [ -e pycascading.jar ]; then
    # If we packaged the master jar, update it
    mkdir -p "$server_build_dir"
    mv pycascading.jar pycascading.tgz bootstrap.py "$server_build_dir"
    rm -rf "$server_build_dir/python"
    tar -x -z -f "$server_build_dir/pycascading.tgz" -C "$server_build_dir"
fi
if [ -e sources.tgz ]; then
    mkdir -p "$server_deploys_dir"
    deploy_dir=\$(mktemp -d "$server_deploys_dir/XXXXXX")
    mkdir "\$deploy_dir/job"
    mv run.sh "\$deploy_dir"
    tar -x -z -f sources.tgz -C "\$deploy_dir/job"
    mv sources.tgz "\$deploy_dir"
    if [ -e others.tgz ]; then
        tar -x -z -f others.tgz -C "\$deploy_dir/job"
    fi
    if [ ! -e "$server_build_dir/pycascading.jar" ]; then
        echo 'WARNING!!!'
        echo 'The PyCascading master jar has not yet been deployed, do a "remote_deploy.sh -m" first.'
        echo
    fi
    echo "Run the job on $server with:"
    echo "  \$deploy_dir/run.sh [parameters]"
fi
if [ "\$1" == 'do_run' ]; then
    # Drop the do_run/dont_run flag so it is not passed on to the job
    shift
    "\$deploy_dir/run.sh" "\$@"
fi
EOF
chmod +x "$tmp_dir/setup.sh"
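
# For reference, a successful setup.sh run on the server is expected to leave
# behind roughly this layout (a sketch, assuming the default paths above):
#   $HOME/pycascading/master/           pycascading.jar, pycascading.tgz,
#                                       bootstrap.py, python/ (unpacked sources)
#   $HOME/pycascading/deploys/XXXXXX/   run.sh, sources.tgz, job/ (the unpacked
#                                       job files, including main_script)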

#
# Create a small script on the remote server that runs the job
#
main_file=$(remove_leading_slash "$main_file")
cat >"$tmp_dir/run.sh" <<EOF
#!/usr/bin/env bash
# Run the PyCascading job
cd "\$(dirname "\$0")/job"
hadoop $hadoop_options jar "$server_build_dir/pycascading.jar" \\
    "$server_build_dir/bootstrap.py" hadoop "$server_build_dir" \\
    -a "$server_build_dir/pycascading.tgz" -a ../sources.tgz \\
    "$main_file" "\$@"
EOF
chmod +x "$tmp_dir/run.sh"
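
# With the default $server_build_dir above and an empty hadoop_options, the
# generated run.sh boils down to something like the following (myjob.py is a
# hypothetical main_script, shown for illustration only):
#   hadoop jar "$HOME/pycascading/master/pycascading.jar" \
#       "$HOME/pycascading/master/bootstrap.py" hadoop "$HOME/pycascading/master" \
#       -a "$HOME/pycascading/master/pycascading.tgz" -a ../sources.tgz \
#       myjob.py [job parameters]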

# Upload the package to the server and run the setup script. The SSH options
# have to come before the host name, otherwise ssh would treat them as part
# of the remote command.
cd "$tmp_dir"
tar -c -z -h -f - . | ssh $ssh_options "$server" \
    "dir=\$(mktemp -d -t PyCascading-tmp-XXXXXX); cd \"\$dir\"; tar -x -z -f -; " \
    "./setup.sh $run_immediately \"\$@\"; rm -r \"\$dir\""
rm -r "$tmp_dir"