From f41deaf337cf3d7c45ead4ec074ccaba350139ce Mon Sep 17 00:00:00 2001 From: Oliver Schulz Date: Fri, 19 Apr 2024 17:52:48 +0200 Subject: [PATCH] Make SLURM worker startup more robust and provide more feedback (#200) * Fix SlurmManager launch Using append! for job_output_template appends it character-wise, causing failure. * Fix indentation in slurm.jl * Replace println by logging directives in SlurmManager * Make SlurmManager worker startup more robust --- src/slurm.jl | 98 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 62 insertions(+), 36 deletions(-) diff --git a/src/slurm.jl b/src/slurm.jl index 90fb873..bf5e5ba 100644 --- a/src/slurm.jl +++ b/src/slurm.jl @@ -51,63 +51,89 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array, mkdir(job_file_loc) end - # Check for given output file name - jobname = "julia-$(getpid())" - has_output_name = ("-o" in srunargs) | ("--output" in srunargs) - if has_output_name - loc = findfirst(x-> x == "-o" || x == "--output", srunargs) - job_output_name = srunargs[loc+1] - job_output_template = joinpath(job_file_loc, job_output_name) - srunargs[loc+1] = job_output_template - else - job_output_name = "$(jobname)-$(trunc(Int, Base.time() * 10))" - make_job_output_path(task_num) = joinpath(job_file_loc, "$(job_output_name)-$(task_num).out") - job_output_template = make_job_output_path("%4t") - append!(srunargs, "-o", job_output_template) - end + # Check for given output file name + jobname = "julia-$(getpid())" + has_output_name = ("-o" in srunargs) | ("--output" in srunargs) + if has_output_name + loc = findfirst(x-> x == "-o" || x == "--output", srunargs) + job_output_name = srunargs[loc+1] + job_output_template = joinpath(job_file_loc, job_output_name) + srunargs[loc+1] = job_output_template + else + job_output_name = "$(jobname)-$(trunc(Int, Base.time() * 10))" + make_job_output_path(task_num) = joinpath(job_file_loc, "$(job_output_name)-$(task_num).out") + job_output_template = 
make_job_output_path("%4t") + push!(srunargs, "-o", job_output_template) + end np = manager.np srun_cmd = `srun -J $jobname -n $np -D $exehome $(srunargs) $exename $exeflags $(worker_arg())` + + @info "Starting SLURM job $jobname: $srun_cmd" srun_proc = open(srun_cmd) + slurm_spec_regex = r"([\w]+):([\d]+)#(\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3})" + could_not_connect_regex = r"could not connect" + exiting_regex = r"exiting." retry_delays = manager.retry_delays + + t_start = time() + t_waited = round(Int, time() - t_start) for i = 0:np - 1 - println("connecting to worker $(i + 1) out of $np") - slurm_spec_match = nothing - if has_output_name - fn = job_output_template - else - fn = make_job_output_path(lpad(i, 4, "0")) - end - t0 = time() - for retry_delay in retry_delays + slurm_spec_match::Union{RegexMatch,Nothing} = nothing + worker_errors = String[] + if has_output_name + fn = job_output_template + else + fn = make_job_output_path(lpad(i, 4, "0")) + end + for retry_delay in push!(collect(retry_delays), 0) + t_waited = round(Int, time() - t_start) + # Wait for output log to be created and populated, then parse - if isfile(fn) && filesize(fn) > 0 - slurm_spec_match = open(fn) do f - # Due to error and warning messages, the specification - # may not appear on the file's first line - for line in eachline(f) - re_match = match(slurm_spec_regex, line) - if re_match !== nothing - return re_match # only returns from do-block + + if isfile(fn) + if filesize(fn) > 0 + open(fn) do f + # Due to error and warning messages, the specification + # may not appear on the file's first line + for line in eachline(f) + re_match = match(slurm_spec_regex, line) + if !isnothing(re_match) + slurm_spec_match = re_match + end + for expr in [could_not_connect_regex, exiting_regex] + if !isnothing(match(expr, line)) + slurm_spec_match = nothing + push!(worker_errors, line) + end + end end end end - if slurm_spec_match !== nothing - break # break if specification found + if !isempty(worker_errors) 
|| !isnothing(slurm_spec_match) + break # break if error or specification found + else + @info "Worker $i (after $t_waited s): Output file found, but no connection details yet" end + else + @info "Worker $i (after $t_waited s): No output file \"$fn\" yet" end + # Sleep for some time to limit ressource usage while waiting for the job to start sleep(retry_delay) end - if slurm_spec_match === nothing - throw(SlurmException("Timeout while trying to connect to worker")) + if !isempty(worker_errors) + throw(SlurmException("Worker $i failed after $t_waited s: $(join(worker_errors, " "))")) + elseif isnothing(slurm_spec_match) + throw(SlurmException("Timeout after $t_waited s while waiting for worker $i to get ready.")) end config = WorkerConfig() config.port = parse(Int, slurm_spec_match[2]) config.host = strip(slurm_spec_match[3]) + @info "Worker $i ready after $t_waited s on host $(config.host), port $(config.port)" # Keep a reference to the proc, so it's properly closed once # the last worker exits. config.userdata = srun_proc @@ -115,7 +141,7 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array, notify(c) end catch e - println("Error launching Slurm job:") + @error "Error launching Slurm job" rethrow(e) end end