-
Notifications
You must be signed in to change notification settings - Fork 32
/
paramrun
executable file
·339 lines (303 loc) · 8.87 KB
/
paramrun
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
#!/bin/bash
#######################################
# Run a command repeatedly until it succeeds, sleeping between attempts.
# The delay doubles after every failed attempt (2s, 4s, 8s, ...).
# Arguments: the command to run, followed by its arguments
# Returns:   0 once the command has succeeded. There is no retry limit, so
#            the function does not return while the command keeps failing.
#######################################
function auto-retry()
{
  # Local so the counter cannot clobber a variable of the caller.
  local backoff=1
  until "$@"; do
    # The delay must be updated in the current shell: doing the arithmetic
    # inside a ( ... ) subshell throws the new value away and pins the delay
    # at 2 seconds forever.
    backoff=$((backoff * 2))
    sleep "$backoff"
  done
  return 0
}
#######################################
# Write one timestamped log line: "<level> (<MM/DD/YY HH:MM:SS>): <message>".
# Globals:   LAUNCHER_LOGFILE (read) - empty/unset or "stderr" logs to stderr,
#            any other value is a file path the line is appended to
# Arguments: $1 - level label (may be empty), $2 - message text
#######################################
function lwarn {
  local stamp
  stamp=$(date +'%D %T')
  # The level and message are passed as printf data, never as part of the
  # date format, so a literal '%' in a message is logged unchanged.
  if [[ -z "$LAUNCHER_LOGFILE" || "$LAUNCHER_LOGFILE" == "stderr" ]]; then
    printf '%s (%s): %s\n' "$1" "$stamp" "$2" >&2
  else
    # Quoted so a log path containing whitespace is still a single target.
    printf '%s (%s): %s\n' "$1" "$stamp" "$2" >> "$LAUNCHER_LOGFILE"
  fi
}
#######################################
# Print the "exited without completing" notice on stdout and terminate the
# script with a failure status.
# Globals:   LAUNCHER_LOGFILE (read) - when it names a file, the notice points
#            the user at it
# Returns:   does not return; exits with status 1
#######################################
function lexit {
  if [[ -z "$LAUNCHER_LOGFILE" || "$LAUNCHER_LOGFILE" == "stderr" ]]; then
    echo "Launcher exited without completing."
  else
    echo "Launcher exited without completing. Please check LAUNCHER_LOGFILE ($LAUNCHER_LOGFILE)."
  fi
  # A bare `exit` would reuse the status of the echo above (0) and make an
  # aborted launch look successful to whatever invoked the launcher.
  exit 1
}
# Detect a Windows host: windowsP becomes "true" when /proc/version mentions
# "Microsoft" or "WSL", and stays "false" otherwise (including when the file
# cannot be read).
windowsP=false
if grep -qE "(Microsoft|WSL)" /proc/version >/dev/null 2>&1; then
  windowsP=true
fi
# LAUNCHER_DIR is mandatory: the plugin directory default, tskserver, pass_env
# and init_launcher are all located relative to it further down.
[[ -n "$LAUNCHER_DIR" ]] || {
  lwarn ERROR "Launcher base directory LAUNCHER_DIR not set. Exiting."
  lexit
}
# Resolve the working directory. An already-set LAUNCHER_WORKDIR is kept;
# otherwise the deprecated WORKDIR is adopted, and failing that the current
# directory is used.
if [[ -z "$LAUNCHER_WORKDIR" ]]; then
  if [[ -z "$WORKDIR" ]]; then
    lwarn WARNING "LAUNCHER_WORKDIR variable not set. Using current directory."
    export LAUNCHER_WORKDIR=$(pwd)
  else
    #Check for older WORKDIR
    lwarn NOTICE "WORKDIR variable has been deprecated. Use LAUNCHER_WORKDIR."
    export LAUNCHER_WORKDIR=$WORKDIR
  fi
fi
#Step 0: Determine if resource manager integration is enabled
# A plugin is a file named <name>.rmi inside LAUNCHER_PLUGIN_DIR; LAUNCHER_RMI
# selects one by name. Without LAUNCHER_RMI no plugin is loaded.
if [[ -z "$LAUNCHER_PLUGIN_DIR" ]]; then
  lwarn WARNING "LAUNCHER_PLUGIN_DIR not set. Defaulting to $LAUNCHER_DIR/plugins."
  LAUNCHER_PLUGIN_DIR=$LAUNCHER_DIR/plugins
fi

if [[ -z "$LAUNCHER_RMI" ]]; then
  echo "Launcher: Starting without resource manager integration."
elif [[ -f "$LAUNCHER_PLUGIN_DIR/$LAUNCHER_RMI.rmi" ]]; then
  # The plugin runs inside this shell. LAUNCHER_RMI_HOSTFILE (presumably set
  # by the plugin - confirm against the plugin files) then replaces any
  # LAUNCHER_HOSTFILE value.
  # Paths are quoted so a plugin directory containing whitespace still works.
  source "$LAUNCHER_PLUGIN_DIR/$LAUNCHER_RMI.rmi"
  export LAUNCHER_HOSTFILE=$LAUNCHER_RMI_HOSTFILE
else
  lwarn WARNING "Resource Manager Integration plugin file $LAUNCHER_RMI.rmi not found."
  lwarn "" " Available options:"
  # Expand the glob directly instead of parsing `ls` output, which breaks on
  # whitespace in paths. An unmatched glob stays literal, hence the -e guard.
  for rmi in "$LAUNCHER_PLUGIN_DIR"/*.rmi; do
    [[ -e "$rmi" ]] || continue
    lwarn "" " $(basename "$rmi" .rmi)"
  done
  lwarn "" " Defaulting to no integration."
fi
#Step 1: Ensure that LAUNCHER_HOSTFILE is set and valid
# With a hostfile configured it must be an existing regular file; without one
# the launcher switches to single-host (localhost) mode.
if [[ -n "$LAUNCHER_HOSTFILE" ]]; then
  if [[ ! -f "$LAUNCHER_HOSTFILE" ]]; then
    lwarn ERROR "Hostfile ($LAUNCHER_HOSTFILE) not found."
    lexit
  fi
else
  lwarn WARNING "LAUNCHER_HOSTFILE is not set. Defaulting to localhost."
  LAUNCHER_LOCALHOST=1
  export LAUNCHER_NHOSTS=1
fi
#Step 2: Ensure that necessary variables are set
# Processes per node: keep LAUNCHER_PPN if set, else take LAUNCHER_RMI_PPN,
# else fall back to 1.
if [[ -z "$LAUNCHER_PPN" && -n "$LAUNCHER_RMI_PPN" ]]; then
  export LAUNCHER_PPN=$LAUNCHER_RMI_PPN
elif [[ -z "$LAUNCHER_PPN" ]]; then
  lwarn WARNING "LAUNCHER_PPN is not set. Defaulting to 1."
  export LAUNCHER_PPN=1
fi

# Host count: keep LAUNCHER_NHOSTS if set, else take LAUNCHER_RMI_NHOSTS,
# else count the lines of the hostfile.
if [[ -z "$LAUNCHER_NHOSTS" && -n "$LAUNCHER_RMI_NHOSTS" ]]; then
  export LAUNCHER_NHOSTS=$LAUNCHER_RMI_NHOSTS
elif [[ -z "$LAUNCHER_NHOSTS" ]]; then
  lwarn WARNING "LAUNCHER_NHOSTS is not set. Calculating from hostfile."
  export LAUNCHER_NHOSTS=$(wc -l $LAUNCHER_HOSTFILE | awk '{print $1}')
fi

# Total process count is hosts x processes-per-node. expr prints nothing when
# the multiplication fails, which the emptiness check below turns into a
# fatal error.
export LAUNCHER_NPROCS=$(expr $LAUNCHER_NHOSTS \* $LAUNCHER_PPN)
if [[ ! -n "$LAUNCHER_NPROCS" ]]; then
  lwarn ERROR "LAUNCHER_NPROCS is not set."
  lexit
fi
#Backward compatibility with v2 and v1: Check for CONTROL_FILE
# A non-empty CONTROL_FILE overrides LAUNCHER_JOB_FILE.
[[ -z "$CONTROL_FILE" ]] || {
  lwarn NOTICE "CONTROL_FILE variable deprecated. Use LAUNCHER_JOB_FILE in the future."
  export LAUNCHER_JOB_FILE=$CONTROL_FILE
}
#Step 3: Ensure that LAUNCHER_JOB_FILE exists
# The job file must be named, must be a regular file, and must contain at
# least one newline-terminated line; LAUNCHER_NJOBS is its line count.
if [[ -z "$LAUNCHER_JOB_FILE" ]]; then
  lwarn ERROR "LAUNCHER_JOB_FILE not set."
  lexit
elif [[ ! -f "$LAUNCHER_JOB_FILE" ]]; then
  lwarn ERROR "LAUNCHER_JOB_FILE ($LAUNCHER_JOB_FILE) not found."
  lexit
else
  export LAUNCHER_NJOBS=$(wc -l $LAUNCHER_JOB_FILE | awk '{print $1}')
  if [[ "$LAUNCHER_NJOBS" -eq 0 ]]; then
    lwarn ERROR "LAUNCHER_JOB_FILE ($LAUNCHER_JOB_FILE) is empty. This could be caused if the file is DOS formatted."
    lwarn NOTICE "Consider using dos2unix on your file to convert the newline characters."
    lexit
  fi
fi
#Step 4: Setup Xeon Phi support
# LAUNCHER_NPHI empty/unset or exactly "0" disables Phi support; any other
# value enables it.
case "$LAUNCHER_NPHI" in
  "" | 0)
    LAUNCHER_USE_PHI=0
    ;;
  *)
    echo "Launcher: Setting up Intel Xeon Phi support."
    LAUNCHER_USE_PHI=1
    ;;
esac
#Step 5: Scheduling setup
# LAUNCHER_SCHED defaults to "dynamic". Dynamic scheduling needs a running
# tskserver, started here as `tskserver <job count> <host> <port>`; its
# address is exported as LAUNCHER_DYN_COUNT / LAUNCHER_DYN_COUNT_PORT.
if [[ -z "$LAUNCHER_SCHED" ]]; then
  export LAUNCHER_SCHED="dynamic"
fi

if [[ "$LAUNCHER_SCHED" == "dynamic" ]]; then
  # On a Windows host the task servers are addressed as localhost, elsewhere
  # by $HOSTNAME. Decided once so both servers always agree.
  if [[ "$windowsP" == "true" ]]; then
    dyn_host=localhost
  else
    dyn_host=$HOSTNAME
  fi

  #Start tskserver
  RUNNING="false"
  RETRY=0
  while [[ "$RUNNING" == "false" ]]; do
    "$LAUNCHER_DIR/tskserver" "$LAUNCHER_NJOBS" "$dyn_host" 9471 2>/dev/null &
    LAUNCHER_DYN_PID=$!
    disown "$LAUNCHER_DYN_PID"
    # Give the server a moment, then check that the process is still alive.
    sleep 1s
    if ! ps -p "$LAUNCHER_DYN_PID" >/dev/null 2>&1; then
      if [[ $RETRY -ne 10 ]]; then
        lwarn WARNING "Unable to start dynamic task service. Retrying..."
        RETRY=$((RETRY + 1))
        sleep 3s
      else
        lwarn ERROR "Unable to start dynamic task service. Shutting down."
        lexit
      fi
    else
      RUNNING="true"
    fi
  done
  export LAUNCHER_DYN_COUNT="$dyn_host"
  export LAUNCHER_DYN_COUNT_PORT=9471

  if [[ "$LAUNCHER_USE_PHI" -ne 0 ]]; then
    #Start another tskserver for the Intel Xeon Phi cards
    # Read the count through a redirect: `wc -l FILE` prints "count FILE",
    # which would hand tskserver the file name as an extra argument.
    phi_njobs=$(wc -l < "$PHI_WORKDIR/$PHI_CONTROL_FILE" | awk '{print $1}')
    "$LAUNCHER_DIR/tskserver" "$phi_njobs" "$dyn_host" 9472 2>/dev/null &
    # $! is the PID of the job just backgrounded ($1 would be the script's
    # first positional argument).
    LAUNCHER_PHI_DYN_PID=$!
    disown "$LAUNCHER_PHI_DYN_PID"
    export LAUNCHER_PHI_DYN_COUNT="$dyn_host"
    export LAUNCHER_PHI_DYN_COUNT_PORT=9472
  fi
fi
# Process binding (only when LAUNCHER_BIND is "1"): size each task's share of
# the node from the lstopo-no-graphics object counts.
if [[ "$LAUNCHER_BIND" == "1" ]]; then
  # One output line per object, so the line count is the object count.
  num_socks=$(lstopo-no-graphics --only socket | wc -l | awk '{print $1}')
  if [[ "$num_socks" -eq 0 ]]; then
    # Treat "no sockets reported" as a single socket.
    num_socks=1
  fi
  num_cores=$(lstopo-no-graphics --only core | wc -l | awk '{print $1}')
  num_threads=$(lstopo-no-graphics --only pu | wc -l | awk '{print $1}')

  if [[ "$LAUNCHER_PPN" -le "$num_cores" ]]; then
    # Enough cores for every task: bind to whole cores.
    export LAUNCHER_BIND_HT=0
    pu_per_task=$(($num_cores / $LAUNCHER_PPN))
  else
    # More tasks than cores: bind to hardware threads instead.
    export LAUNCHER_BIND_HT=1
    if [[ "$LAUNCHER_PPN" -gt "$num_threads" ]]; then
      lwarn WARNING "Requested Processes per Node ($LAUNCHER_PPN) exceeds number of available threads ($num_threads). Resetting..."
      export LAUNCHER_PPN=$num_threads
      export LAUNCHER_NPROCS=$(($LAUNCHER_NHOSTS * $LAUNCHER_PPN))
    fi
    pu_per_task=$(($num_threads / $LAUNCHER_PPN))
  fi
  export LAUNCHER_PUPT=$pu_per_task
fi
#------------------------------
# Let's finally launch the job
#------------------------------
# Print the resolved configuration on stdout before anything is started.
printf '%s\n' \
  "Launcher: Setup complete." \
  "" \
  "------------- SUMMARY ---------------" \
  " Number of hosts: $LAUNCHER_NHOSTS" \
  " Working directory: $LAUNCHER_WORKDIR" \
  " Processes per host: $LAUNCHER_PPN" \
  " Total processes: $LAUNCHER_NPROCS" \
  " Total jobs: $LAUNCHER_NJOBS" \
  " Scheduling method: $LAUNCHER_SCHED"

if [[ "$LAUNCHER_BIND" == "1" ]]; then
  # The binding line names the unit according to LAUNCHER_BIND_HT.
  if [[ "$LAUNCHER_BIND_HT" == "1" ]]; then
    bind_unit="threads (Hyperthreads in use)"
  else
    bind_unit="cores (Hyperthreads ignored)"
  fi
  printf '%s\n' \
    "" \
    "------ Process Binding Enabled ------" \
    " Sockets per host: $num_socks" \
    " Cores per host: $num_cores" \
    " Threads per host: $num_threads" \
    " Binding each task to $LAUNCHER_PUPT $bind_unit"
fi

if [[ "$LAUNCHER_USE_PHI" == "1" ]]; then
  printf '%s\n' \
    "" \
    "--- Intel Xeon Phi Support Enabled ---" \
    " Cards per host: $LAUNCHER_NPHI" \
    " Processes per card: $LAUNCHER_PHI_PPN"
fi

printf '%s\n' \
  "" \
  "-------------------------------------" \
  "Launcher: Starting parallel tasks..."
# Start init_launcher: directly on this machine in localhost mode, otherwise
# once per hostfile entry over ssh with an increasing LAUNCHER_HOST_ID.
# $res holds the overall status afterwards.
i=0
res=0
if [[ "x$LAUNCHER_LOCALHOST" == "x1" ]]; then
  env LAUNCHER_HOST_ID=0 "$LAUNCHER_DIR/init_launcher"
  res=$?
else
  echo "using $LAUNCHER_HOSTFILE to get hosts" >&2
  launch_pids=()
  # Word splitting is intentional: every whitespace-separated token in the
  # hostfile is one host.
  for host in $(cat "$LAUNCHER_HOSTFILE"); do
    echo "starting job on $host" >&2
    # ssh is wrapped in auto-retry, which re-runs it until it succeeds.
    # `np` is not assigned anywhere in this file; unless something else
    # provides it, fall back to LAUNCHER_NHOSTS rather than overriding the
    # remote LAUNCHER_NHOSTS with an empty value.
    ( auto-retry ssh "$host" "cd $LAUNCHER_WORKDIR; env $("$LAUNCHER_DIR/pass_env") LAUNCHER_NHOSTS=${np:-$LAUNCHER_NHOSTS} LAUNCHER_HOST_ID=$i $LAUNCHER_DIR/init_launcher" ) &
    launch_pids+=("$!")
    i=$((i + 1))
  done
  # Wait for each launch individually: a bare `wait` always returns 0, so it
  # cannot report a launch subshell that ended abnormally.
  for pid in "${launch_pids[@]}"; do
    wait "$pid" || res=$?
  done
fi
#Cleanup processes and files
# Stop the task servers recorded in LAUNCHER_DYN_PID / LAUNCHER_PHI_DYN_PID,
# remove the hostfile named by LAUNCHER_RMI_HOSTFILE, report the job status
# held in $res and exit with it.
if [[ -n "$LAUNCHER_DYN_PID" ]]; then
  kill "$LAUNCHER_DYN_PID"
fi
# Test the variable that actually holds the PID; LAUNCHER_PHI_DYN is never
# set, so checking it left the Phi task server running.
if [[ -n "$LAUNCHER_PHI_DYN_PID" ]]; then
  kill "$LAUNCHER_PHI_DYN_PID"
fi
if [[ -n "$LAUNCHER_RMI_HOSTFILE" ]]; then
  rm -f -- "$LAUNCHER_RMI_HOSTFILE"
fi

if [[ $res -ne 0 ]]; then
  echo "Launcher: Done. Job exited with code: $res"
else
  echo "Launcher: Done. Job exited without errors"
fi
exit "${res:-0}"