-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathschulz.sh
executable file
·134 lines (116 loc) · 4.45 KB
/
schulz.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/bin/bash
#* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
#* *
#* This file is part of the program and library *
#* SCIP --- Solving Constraint Integer Programs *
#* *
#* Copyright (C) 2002-2016 Konrad-Zuse-Zentrum *
#* fuer Informationstechnik Berlin *
#* *
#* SCIP is distributed under the terms of the ZIB Academic License. *
#* *
#* You should have received a copy of the ZIB Academic License *
#* along with SCIP; see the file COPYING. If not email to [email protected]. *
#* *
#* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
# Bash-shell version of schulz script from GAMS performance tools (http://www.gamsworld.org/performance/schulz.htm)
#
# Supervises processes and sends given signals via kill when elapsed time exceeds given thresholds
#
# Usage: ./schulz.sh <process> <timelimits> <signals> <sleep>
# <process> is a regular expression which determines the commands that should be supervised (default: ^gms)
# <timelimits> is a colon separated list of timelimits (default: 120:180:240)
# <signals> is a colon separated list of signals (default: 2:1:9)
# <sleep> is the number of seconds to wait before checking the process list again (default: 60)
#
# Whenever a process whose command matches the first argument exceeds one of the given timelimits,
# the corresponding signal is sent to that process.
#
# That is, for the default values, the following happens for each process whose command matches ^gms:
# If it is running for more than 240 seconds, signal 9 (SIGKILL) is sent.
# If it is running for more than 180 seconds, but less-or-equal 240 seconds, signal 1 (SIGHUP) is sent.
# If it is running for more than 120 seconds, but less-or-equal 180 seconds, signal 2 (SIGINT) is sent.
# If it is running for less-or-equal 120 seconds, nothing happens.
#
# The list of timelimits and signals need to have the same length.
# Command-line arguments, each falling back to its documented default.
watch=${1:-^gms}           # regular expression selecting the commands to supervise
ress=${2:-120:180:240}     # colon-separated list of time limits in seconds
sigs=${3:-2:1:9}           # colon-separated list of signals, one per time limit
sleepsec=${4:-60}          # seconds to wait between two scans of the process list

# Split ress and sigs into the 1-based arrays resl and sigl (getsignal walks
# them from index ${#resl[@]} down to 1).
# IFS is set to ':' only for the duration of each read. Changing it globally
# would make every later unquoted expansion split on ':' as well, which turns
# the time printed by date into "12 34 56".
resl=()
sigl=()

IFS=':' read -r -a fields <<< "$ress"
j=1
for r in "${fields[@]}"
do
   resl[$j]=$r
   ((j++))
done

IFS=':' read -r -a fields <<< "$sigs"
j=1
for s in "${fields[@]}"
do
   sigl[$j]=$s
   ((j++))
done
unset fields

# check if number of timelimits equals number of signals
if test "${#resl[@]}" -ne "${#sigl[@]}"
then
   echo "List of times and list of signals need to have equal length." >&2
   exit 1
fi

# Report the configuration; the date is quoted so it is printed verbatim.
printf '%s %s %s\n' "--- " "$(date)" ": Start watching $watch sleep = $sleepsec seconds"
for (( j=1; j <= ${#resl[@]}; j++ ))
do
   echo " Threshold ${resl[$j]}s -> Signal ${sigl[$j]}"
done
# getsignal <elapsed>
#
# Given an elapsed time in the format [[dd-]hh:]mm:ss (as printed by
# "ps -o etime"),
# - calculates the corresponding number of seconds and stores it in $secs
# - checks whether one of the timelimits is exceeded and stores the signal of
#   the largest exceeded limit in $signal; if none is exceeded, $signal is 0
#
# Globals read:    resl, sigl (1-based arrays of timelimits and signals)
# Globals written: secs, signal
#
# The fields are taken apart with a regular expression instead of fixed
# character offsets, because the day count is not limited to two digits
# ("1-02:03:04" and "123-02:03:04" are both valid). An argument that does not
# have the expected format counts as 0 seconds.
function getsignal() {
   local re='^(((([0-9]+)-)?([0-9]+):)?([0-9]+):)?([0-9]+)$'
   local j

   secs=0
   if [[ $1 =~ $re ]]
   then
      # capture groups: 4 = days, 5 = hours, 6 = minutes, 7 = seconds;
      # the 10# prefix keeps zero-padded fields such as "08" from being
      # read as (invalid) octal numbers
      (( secs = 10#${BASH_REMATCH[7]}
              + 60 * 10#${BASH_REMATCH[6]:-0}
              + 3600 * 10#${BASH_REMATCH[5]:-0}
              + 86400 * 10#${BASH_REMATCH[4]:-0} ))
   fi

   # walk the limits from the last to the first entry, so that the first
   # hit is the last listed limit that is exceeded
   signal=0
   for (( j=${#resl[@]}; j ; j-- ))
   do
      if [ "$secs" -gt "${resl[$j]}" ]
      then
         signal=${sigl[$j]}
         break
      fi
   done
}
# Run forever: after every pause, scan the process list and send the signal
# chosen by getsignal to each matching process whose elapsed time exceeds a
# time limit. The loop only ends when sleep fails (e.g. it is interrupted) or
# the script itself is killed.
while true
do
   sleep "$sleepsec" || exit

   # Get the processes to watch, one "pid,elapsedtime,command" entry per line.
   # The regular expression is matched against the whole "comm pid etime"
   # line. pid and etime are taken from the END of the line, so a command
   # name that contains blanks cannot shift the wrong word into the pid.
   ap=$(ps -Ao comm,pid,etime | grep -E -e "$watch" | awk '
      NF >= 3 {
         cmd = $1
         for (k = 2; k <= NF - 2; k++)
            cmd = cmd " " $k
         printf("%s,%s,%s\n", $(NF-1), $NF, tolower(cmd))
      }')

   printf '%s %s %s\n' "--- " "$(date)" ":"

   # process list of processes to watch; IFS is changed for read only
   while IFS=',' read -r pid processtime command
   do
      # skip the ps header line (if the expression matched it) and the empty
      # line produced when no process matched at all
      [[ $pid =~ ^[0-9]+$ ]] || continue

      # check if we need to send a signal (sets $secs and $signal)
      getsignal "$processtime"

      if test "$signal" -ne 0
      then
         echo "Signal $signal to $command with pid $pid running for $secs seconds"
         kill "-$signal" "$pid" > /dev/null
      else
         echo "$pid:$command:$secs"
      fi
   done <<< "$ap"
done