Skip to content

Commit

Permalink
Merge pull request Ensembles#1481 from joakim-hove/blacklist-fixup
Browse files Browse the repository at this point in the history
Blacklist fixup
  • Loading branch information
joakim-hove authored Apr 5, 2017
2 parents 0313709 + 947a31e commit 2a7b911
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 27 deletions.
1 change: 1 addition & 0 deletions libjob_queue/include/ert/job_queue/job_node.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ typedef struct job_queue_node_struct job_queue_node_type;
const char * job_queue_node_get_name( const job_queue_node_type * node);
/* Returns the current value of the node's submit_attempt counter. */
int job_queue_node_get_submit_attempt( const job_queue_node_type * node);
/* Sets the node's submit_attempt counter to 0. */
void job_queue_node_reset_submit_attempt( job_queue_node_type * node);
/* Decrements the node's submit_attempt counter by one; no lower bound is enforced. */
void job_queue_node_dec_submit_attempt( job_queue_node_type * node);
const char * job_queue_node_get_failed_job( const job_queue_node_type * node);
const char * job_queue_node_get_error_reason( const job_queue_node_type * node);
const char * job_queue_node_get_stderr_capture( const job_queue_node_type * node);
Expand Down
4 changes: 4 additions & 0 deletions libjob_queue/src/job_node.c
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,10 @@ void job_queue_node_reset_submit_attempt( job_queue_node_type * node) {
node->submit_attempt = 0;
}

/*
  Lowers the submit attempt counter stored on `node` by one.
  The counter is not clamped here; the caller decides when a decrement
  is appropriate.
*/
void job_queue_node_dec_submit_attempt( job_queue_node_type * node) {
  node->submit_attempt -= 1;
}

/*
  Reports the submit attempt counter currently stored on `node`.
  The node is only read, never modified.
*/
int job_queue_node_get_submit_attempt( const job_queue_node_type * node) {
  const int attempts = node->submit_attempt;
  return attempts;
}
Expand Down
1 change: 1 addition & 0 deletions libjob_queue/src/job_queue.c
Original file line number Diff line number Diff line change
Expand Up @@ -783,6 +783,7 @@ static void * job_queue_run_DO_KILL_callback( void * arg ) {
/*
  Handles a job that should be killed because of a node failure (going by
  the function name). Three steps, in this order:

    1. Ask the queue driver to blacklist the node, passing the driver
       specific job data attached to `node`.
    2. Move the job to the JOB_QUEUE_DO_KILL status.
    3. Decrement the node's submit attempt counter.

  NOTE(review): step 3 presumably keeps a node failure from counting
  against the job's resubmit limit - confirm against the code that compares
  submit_attempt with the maximum number of submits.
*/
static void job_queue_handle_DO_KILL_NODE_FAILURE(job_queue_type * queue, job_queue_node_type * node) {
queue_driver_blacklist_node( queue->driver, job_queue_node_get_driver_data(node) );
job_queue_change_node_status(queue, node, JOB_QUEUE_DO_KILL);
job_queue_node_dec_submit_attempt(node);
}

static void job_queue_handle_DO_KILL( job_queue_type * queue , job_queue_node_type * node) {
Expand Down
39 changes: 12 additions & 27 deletions libjob_queue/src/lsf_driver.c
Original file line number Diff line number Diff line change
Expand Up @@ -273,11 +273,9 @@ char* lsf_job_write_bjobs_to_file(const char * bjobs_cmd, lsf_driver_type * driv
argv[1] = cmd;
util_spawn_blocking(driver->rsh_cmd, 2, (const char **) argv, tmp_file, NULL);
free(argv);
} else if (driver->submit_method == LSF_SUBMIT_LOCAL_SHELL) {
char ** argv = util_calloc(1, sizeof *argv);
argv[0] = "";
util_spawn_blocking(cmd, 1, (const char **) argv, tmp_file, NULL);
}
} else if (driver->submit_method == LSF_SUBMIT_LOCAL_SHELL)
util_spawn_blocking(cmd, 0 , NULL , tmp_file, NULL);

free(cmd);

return tmp_file;
Expand Down Expand Up @@ -851,32 +849,19 @@ void lsf_driver_free_job(void * __job) {
}

/*
  Excludes the hosts associated with the LSF job `lsf_job_id` from further
  use by `driver`: the bjobs output for the job is written to a file, the
  hostnames are parsed from that file, joined with ", " and handed to
  lsf_driver_add_exclude_hosts(). Progress messages go to stderr.

  NOTE(review): as shown here the body performs the whole sequence twice -
  once in the outer scope and once in the nested block - so the hosts are
  excluded twice and bjobs output is written twice. This looks like the
  old and new sides of a change interleaved; confirm which lines are
  meant to survive before relying on this text.
*/
static void lsf_driver_node_failure(lsf_driver_type * driver, long lsf_job_id) {
fprintf(stderr, "%s attempting to blacklist nodes for job id %ld.\n", __func__, lsf_job_id);
/* Outer pass: query the job with the driver's bjobs command. */
char * fname = lsf_job_write_bjobs_to_file(driver->bjobs_cmd, driver, lsf_job_id);
stringlist_type * hosts = lsf_job_alloc_parse_hostnames(fname);
char* hostnames = stringlist_alloc_joined_string(hosts, ", ");

{
/*
  The three declarations below shadow the outer fname/hosts/hostnames.
  NOTE(review): this call passes driver->bsub_cmd to a function whose
  first parameter is named bjobs_cmd - presumably the wrong command;
  verify.
*/
char * fname = lsf_job_write_bjobs_to_file(driver->bsub_cmd, driver, lsf_job_id);
stringlist_type * hosts = lsf_job_alloc_parse_hostnames(fname);
char* hostnames = stringlist_alloc_joined_string(hosts, ", ");
fprintf(stderr, "%s blacklisting nodes %s.\n", __func__, hostnames);

fprintf(stderr,
"Realization %ld seems to have failed as a result of LSF node failure.\n",
lsf_job_id
);
fprintf(stderr,
"This job will be re-submitted to a different node according to the "
"number of re-submit specified in MAX_SUBMIT in the ert config file.\n"
);

lsf_driver_add_exclude_hosts(driver, hostnames);

/* Releases only the inner (shadowing) variables. */
util_free(hostnames);
stringlist_free(hosts);
util_free(fname);
}
/* From here on the names refer to the outer variables again. */
fprintf(stderr, "The job:%ld never started - the nodes: %s will be excluded, the job will be resubmitted to LSF.\n", lsf_job_id , hostnames);
lsf_driver_add_exclude_hosts(driver, hostnames);

util_free(hostnames);
stringlist_free(hosts);
util_free(fname);
}


void lsf_driver_blacklist_node(void * __driver, void * __job) {
lsf_driver_type * driver = lsf_driver_safe_cast(__driver);
lsf_job_type * job = lsf_job_safe_cast(__job);
Expand Down

0 comments on commit 2a7b911

Please sign in to comment.