From 58ecff5d8915e6785a95628271f5bfc1481ec261 Mon Sep 17 00:00:00 2001 From: Graham Smith Date: Wed, 1 Jul 2020 11:49:10 -0500 Subject: [PATCH 1/7] Slurm robust to no jobid and to node warnings --- src/slurm.jl | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/src/slurm.jl b/src/slurm.jl index 5ff242b..7b13d89 100644 --- a/src/slurm.jl +++ b/src/slurm.jl @@ -57,37 +57,38 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array, np = manager.np jobname = "julia-$(getpid())" - jobID = String(ENV["SLURM_JOB_ID"]) - srun_cmd = `srun -J $jobname -n $np -o "$(joinpath(job_file_loc, "job-$jobID-%4t.out"))" -D $exehome $(srunargs) $exename $exeflags $(worker_arg())` + job_output_name = "$(jobname)-$(trunc(Int, Base.time() * 10))" + make_job_output_path(task_num) = joinpath(job_file_loc, "$(job_output_name)-$(task_num).out") + job_output_template = make_job_output_path("%4t") + srun_cmd = `srun -J $jobname -n $np -o "$(job_output_template)" -D $exehome $(srunargs) $exename $exeflags $(worker_arg())` srun_proc = open(srun_cmd) + slurm_spec_regex = r"([\w]+):([\d]+)#(\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3})" for i = 0:np - 1 println("connecting to worker $(i + 1) out of $np") - local w=[] + local slurm_spec_match = nothing fn = "$(joinpath(exehome, job_file_loc))/job-$jobID-$(lpad(i, 4, "0")).out" t0 = time() while true - if time() > t0 + 60 + np - @warn "dropping worker: file not created in $(60 + np) seconds" - break - end - sleep(0.001) - if isfile(fn) && filesize(fn) > 0 - w = open(fn) do f - return split(split(readline(f), ":")[2], "#") + slurm_spec_match = open(fn) do f + for line in eachline(f) + re_match = match(slurm_spec_regex, line) + if re_match !== nothing + return re_match + end end + end + if slurm_spec_match !== nothing break end end - if length(w) > 0 - config = WorkerConfig() - config.port = parse(Int, w[1]) - config.host = strip(w[2]) - # Keep a reference to the proc, so it's properly closed once - # the last worker exits. - config.userdata = srun_proc - push!(instances_arr, config) - notify(c) - end + config = WorkerConfig() + config.port = parse(Int, slurm_spec_match[2]) + config.host = strip(slurm_spec_match[3]) + # Keep a reference to the proc, so it's properly closed once + # the last worker exits. + config.userdata = srun_proc + push!(instances_arr, config) + notify(c) end catch e println("Error launching Slurm job:") From 02ba2c1e35ccc28cc96743a8d573937bcfd05028 Mon Sep 17 00:00:00 2001 From: Graham Smith Date: Wed, 1 Jul 2020 12:14:02 -0500 Subject: [PATCH 2/7] merge mistake --- src/slurm.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/slurm.jl b/src/slurm.jl index 7b13d89..bdc511c 100644 --- a/src/slurm.jl +++ b/src/slurm.jl @@ -66,7 +66,7 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array, for i = 0:np - 1 println("connecting to worker $(i + 1) out of $np") local slurm_spec_match = nothing - fn = "$(joinpath(exehome, job_file_loc))/job-$jobID-$(lpad(i, 4, "0")).out" + fn = make_job_output_path(lpad(i, 4, "0")) t0 = time() while true slurm_spec_match = open(fn) do f From b075372ebd7b72971fb2ee3f71efd73db2737a05 Mon Sep 17 00:00:00 2001 From: Graham Smith Date: Wed, 1 Jul 2020 12:19:39 -0500 Subject: [PATCH 3/7] Fix delayed file creation regression --- src/slurm.jl | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/slurm.jl b/src/slurm.jl index bdc511c..a4005b6 100644 --- a/src/slurm.jl +++ b/src/slurm.jl @@ -69,16 +69,18 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array, fn = make_job_output_path(lpad(i, 4, "0")) t0 = time() while true - slurm_spec_match = open(fn) do f - for line in eachline(f) - re_match = match(slurm_spec_regex, line) - if re_match !== nothing - return re_match + if isfile(fn) && filesize(fn) > 0 + slurm_spec_match = open(fn) do f + for line in eachline(f) + re_match = match(slurm_spec_regex, line) + if re_match !== nothing + return re_match + end end end - end - if slurm_spec_match !== nothing - break + if slurm_spec_match !== nothing + break + end end end config = WorkerConfig() From 7558cc7d96e2f2da1e646892bd40d7e108329a69 Mon Sep 17 00:00:00 2001 From: Graham Smith Date: Wed, 1 Jul 2020 12:29:02 -0500 Subject: [PATCH 4/7] No longer remove old log files --- src/slurm.jl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/slurm.jl b/src/slurm.jl index a4005b6..6a36d5f 100644 --- a/src/slurm.jl +++ b/src/slurm.jl @@ -50,10 +50,9 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array, mkdir(job_file_loc) end - println("removing old files") + # println("removing old files") # cleanup old files - map(f->rm(joinpath(job_file_loc, f)), filter(t -> occursin(r"job(.*?).out", t), readdir(job_file_loc))) - println("removing old Setting up srun commands") + # map(f->rm(joinpath(job_file_loc, f)), filter(t -> occursin(r"job(.*?).out", t), readdir(job_file_loc))) np = manager.np jobname = "julia-$(getpid())" From e8fa6fbb509762dff84ee353ac17cbaf57dba98d Mon Sep 17 00:00:00 2001 From: Graham Smith Date: Fri, 25 Sep 2020 10:35:21 -0500 Subject: [PATCH 5/7] Clean up prints and comments for PR --- src/slurm.jl | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/slurm.jl b/src/slurm.jl index 6a36d5f..c2e62dd 100644 --- a/src/slurm.jl +++ b/src/slurm.jl @@ -17,11 +17,7 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array, stdkeys = keys(Distributed.default_addprocs_params()) - println(stdkeys) p = filter(x->(!(x[1] in stdkeys) && x[1] != :job_file_loc), params) - println(p) - - srunargs = [] for k in keys(p) @@ -50,10 +46,6 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array, mkdir(job_file_loc) end - # println("removing old files") - # cleanup old files - # map(f->rm(joinpath(job_file_loc, f)), filter(t -> occursin(r"job(.*?).out", t), readdir(job_file_loc))) - np = manager.np jobname = "julia-$(getpid())" job_output_name = "$(jobname)-$(trunc(Int, Base.time() * 10))" From ea0219b2151592923d556f7a4bf5bffe740bdd84 Mon Sep 17 00:00:00 2001 From: Graham Smith Date: Fri, 25 Sep 2020 10:42:33 -0500 Subject: [PATCH 6/7] Remove spurious local --- src/slurm.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/slurm.jl b/src/slurm.jl index c2e62dd..a6da015 100644 --- a/src/slurm.jl +++ b/src/slurm.jl @@ -56,7 +56,7 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array, slurm_spec_regex = r"([\w]+):([\d]+)#(\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3})" for i = 0:np - 1 println("connecting to worker $(i + 1) out of $np") - local slurm_spec_match = nothing + slurm_spec_match = nothing fn = make_job_output_path(lpad(i, 4, "0")) t0 = time() while true From 5d57c7e115722b69d0c8c085ddc9be19768576d3 Mon Sep 17 00:00:00 2001 From: Graham Smith Date: Tue, 29 Sep 2020 13:29:01 -0500 Subject: [PATCH 7/7] Added comments --- src/slurm.jl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/slurm.jl b/src/slurm.jl index a6da015..fb0f4ae 100644 --- a/src/slurm.jl +++ b/src/slurm.jl @@ -60,17 +60,20 @@ function launch(manager::SlurmManager, params::Dict, instances_arr::Array, fn = make_job_output_path(lpad(i, 4, "0")) t0 = time() while true + # Wait for output log to be created and populated, then parse if isfile(fn) && filesize(fn) > 0 slurm_spec_match = open(fn) do f + # Due to error and warning messages, the specification + # may not appear on the file's first line for line in eachline(f) re_match = match(slurm_spec_regex, line) if re_match !== nothing - return re_match + return re_match # only returns from do-block end end end if slurm_spec_match !== nothing - break + break # break if specification found end end end