From 1851eef50b3e8e240ffdf36de918d2dca9ea5361 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Mon, 23 Nov 2015 16:27:06 +0000 Subject: [PATCH 1/8] Added extra sanity checks for run. --- lib/main.c | 12 ++++++++---- lib/msprime.c | 2 +- lib/msprime.h | 1 + 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/lib/main.c b/lib/main.c index f1b73a43a..e7e734270 100644 --- a/lib/main.c +++ b/lib/main.c @@ -406,10 +406,14 @@ run_simulate(char *conf_file) if (ret != 0) { goto out; } - result = msp_run(msp, DBL_MAX, ULONG_MAX); - if (result < 0) { - ret = result; - goto out; + result = 1; + while (result == 1) { + result = msp_run(msp, DBL_MAX, 1); + if (result < 0) { + ret = result; + goto out; + } + msp_verify(msp); } ret = msp_print_state(msp); if (ret != 0) { diff --git a/lib/msprime.c b/lib/msprime.c index 1478b6f02..eaf258f21 100644 --- a/lib/msprime.c +++ b/lib/msprime.c @@ -703,7 +703,7 @@ msp_print_segment_chain(msp_t *self, segment_t *head) } -static void +void msp_verify(msp_t *self) { int64_t s, ss, total_links; diff --git a/lib/msprime.h b/lib/msprime.h index d620ccd15..d100821f3 100644 --- a/lib/msprime.h +++ b/lib/msprime.h @@ -287,6 +287,7 @@ int msp_set_coalescence_record_block_size(msp_t *self, size_t block_size); int msp_run(msp_t *self, double max_time, unsigned long max_events); int msp_print_state(msp_t *self); int msp_free(msp_t *self); +void msp_verify(msp_t *self); int msp_get_population_models(msp_t *self, population_model_t *models); int msp_get_ancestors(msp_t *self, segment_t **ancestors); From e583576fced268e520ee58fe1718b8e4b440c005 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Mon, 23 Nov 2015 17:27:45 +0000 Subject: [PATCH 2/8] Fix for subtle links bug. --- lib/main.c | 1 + lib/msprime.c | 17 ++++++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/lib/main.c b/lib/main.c index e7e734270..5d78f9237 100644 --- a/lib/main.c +++ b/lib/main.c @@ -414,6 +414,7 @@ run_simulate(char *conf_file) goto out; } msp_verify(msp); + /* ret = msp_print_state(msp); */ } ret = msp_print_state(msp); if (ret != 0) { diff --git a/lib/msprime.c b/lib/msprime.c index eaf258f21..d29b476e8 100644 --- a/lib/msprime.c +++ b/lib/msprime.c @@ -706,17 +706,19 @@ msp_print_segment_chain(msp_t *self, segment_t *head) void msp_verify(msp_t *self) { - int64_t s, ss, total_links; + int64_t s, ss, total_links, left, right, alt_total_links; size_t total_segments = 0; size_t total_avl_nodes = 0; avl_node_t *node; segment_t *u; total_links = 0; + alt_total_links = 0; node = (&self->ancestral_population)->head; while (node != NULL) { u = (segment_t *) node->item; assert(u->prev == NULL); + left = u->left; while (u != NULL) { total_segments++; assert(u->left < u->right); @@ -729,7 +731,7 @@ msp_verify(msp_t *self) } */ if (u->prev != NULL) { - s = u->right - u->prev->right - 1; + s = u->right - u->prev->right; } else { s = u->right - u->left - 1; } @@ -737,11 +739,14 @@ msp_verify(msp_t *self) total_links += ss; assert(s == ss); ss = s; /* just to keep compiler happy - see below also */ + right = u->right; u = u->next; } + alt_total_links += right - left - 1; node = node->next; } assert(total_links == fenwick_get_total(&self->links)); + assert(total_links == alt_total_links); assert(total_segments == object_heap_get_num_allocated( &self->segment_heap)); total_avl_nodes = avl_count(&self->ancestral_population) @@ -801,7 +806,8 @@ msp_print_state(msp_t *self) u = msp_get_segment(self, j); v = fenwick_get_value(&self->links, j); if (v != 0) { - printf("\t%ld\tl=%d r=%d v=%d prev=%p next=%p\n", (long) v, u->left, + printf("\t%ld\ti=%d l=%d r=%d v=%d prev=%p next=%p\n", (long) v, + (int) u->index, u->left, u->right, (int) u->value, (void *) u->prev, (void *) u->next); } } @@ -1105,14 +1111,15 @@ msp_coancestry_event(msp_t *self) if (ret != 0) { goto out; } + fenwick_set_value(&self->links, alpha->index, + alpha->right - alpha->left - 1); } else { defrag_required |= z->right == alpha->left && z->value == alpha->value; z->next = alpha; - l = z->right; + fenwick_set_value(&self->links, alpha->index, alpha->right - z->right); } alpha->prev = z; z = alpha; - fenwick_set_value(&self->links, alpha->index, alpha->right - l - 1); } } if (defrag_required) { From 75d2b36a5cd929ac45ad91c490c1b32367a09f48 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 24 Nov 2015 16:42:42 +0000 Subject: [PATCH 3/8] Initial pass at new alg without defrag. --- lib/main.c | 2 +- lib/msprime.c | 128 ++++++++++++++++++++++++++++++++++++++++---------- lib/msprime.h | 1 + 3 files changed, 105 insertions(+), 26 deletions(-) diff --git a/lib/main.c b/lib/main.c index 5d78f9237..c35b5708e 100644 --- a/lib/main.c +++ b/lib/main.c @@ -414,7 +414,7 @@ run_simulate(char *conf_file) goto out; } msp_verify(msp); - /* ret = msp_print_state(msp); */ + ret = msp_print_state(msp); } ret = msp_print_state(msp); if (ret != 0) { diff --git a/lib/msprime.c b/lib/msprime.c index d29b476e8..5c5534bda 100644 --- a/lib/msprime.c +++ b/lib/msprime.c @@ -449,6 +449,7 @@ msp_alloc(msp_t *self, uint32_t sample_size) /* set up the AVL trees */ avl_init_tree(&self->ancestral_population, cmp_individual, NULL); avl_init_tree(&self->breakpoints, cmp_node_mapping, NULL); + avl_init_tree(&self->overlap_counts, cmp_node_mapping, NULL); /* Add the base population model */ ret = msp_add_constant_population_model(self, -1.0, 1.0); out: @@ -750,7 +751,8 @@ msp_verify(msp_t *self) assert(total_segments == object_heap_get_num_allocated( &self->segment_heap)); total_avl_nodes = avl_count(&self->ancestral_population) - + avl_count(&self->breakpoints); + + avl_count(&self->breakpoints) + + avl_count(&self->overlap_counts); assert(total_avl_nodes == object_heap_get_num_allocated( &self->avl_node_heap)); if (total_avl_nodes == total_segments) { @@ -816,6 +818,11 @@ msp_print_state(msp_t *self) nm = (node_mapping_t *) node->item; printf("\t%d -> %d\n", nm->left, nm->value); } + printf("Overlap count = %d\n", avl_count(&self->overlap_counts)); + for (node = self->overlap_counts.head; node != NULL; node = node->next) { + nm = (node_mapping_t *) node->item; + printf("\t%d -> %d\n", nm->left, nm->value); + } printf("Coalescence records = %ld\n", (long) self->num_coalescence_records); for (j = 0; j < self->num_coalescence_records; j++) { cr = &self->coalescence_records[j]; @@ -840,11 +847,10 @@ msp_print_state(msp_t *self) } /* - * Inserts a new breakpoint at the specified locus left, mapping to the - * specified tree node v. + * Inserts a new breakpoint at the specified locus left. */ static int WARN_UNUSED -msp_insert_breakpoint(msp_t *self, uint32_t left, uint32_t v) +msp_insert_breakpoint(msp_t *self, uint32_t left) { int ret = 0; avl_node_t *node = msp_alloc_avl_node(self); @@ -855,7 +861,7 @@ msp_insert_breakpoint(msp_t *self, uint32_t left, uint32_t v) goto out; } m->left = left; - m->value = v; + m->value = 0; avl_init_node(node, m); node = avl_insert_node(&self->breakpoints, node); assert(node != NULL); @@ -863,19 +869,44 @@ msp_insert_breakpoint(msp_t *self, uint32_t left, uint32_t v) return ret; } + +/* + * Inserts a new overlap_count at the specified locus left, mapping to the + * specified number of overlapping segments b. + */ +static int WARN_UNUSED +msp_insert_overlap_count(msp_t *self, uint32_t left, uint32_t v) +{ + int ret = 0; + avl_node_t *node = msp_alloc_avl_node(self); + node_mapping_t *m = msp_alloc_node_mapping(self); + + if (node == NULL || m == NULL) { + ret = MSP_ERR_NO_MEMORY; + goto out; + } + m->left = left; + m->value = v; + avl_init_node(node, m); + node = avl_insert_node(&self->overlap_counts, node); + assert(node != NULL); +out: + return ret; +} + /* - * Inserts a new breakpoint at the specified locus, and copies its - * node mapping from the containing breakpoint. + * Inserts a new overlap_count at the specified locus, and copies its + * node mapping from the containing overlap_count. */ static int WARN_UNUSED -msp_copy_breakpoint(msp_t *self, uint32_t k) +msp_copy_overlap_count(msp_t *self, uint32_t k) { int ret; node_mapping_t search, *nm; avl_node_t *node; search.left = k; - avl_search_closest(&self->breakpoints, &search, &node); + avl_search_closest(&self->overlap_counts, &search, &node); assert(node != NULL); nm = (node_mapping_t *) node->item; if (nm->left > k) { @@ -883,7 +914,7 @@ msp_copy_breakpoint(msp_t *self, uint32_t k) assert(node != NULL); nm = (node_mapping_t *) node->item; } - ret = msp_insert_breakpoint(self, k, nm->value); + ret = msp_insert_overlap_count(self, k, nm->value); return ret; } @@ -970,7 +1001,7 @@ msp_recombination_event(msp_t *self) fenwick_increment(&self->links, y->index, k - z->right); search.left = (uint32_t) k; if (avl_search(&self->breakpoints, &search) == NULL) { - ret = msp_copy_breakpoint(self, (uint32_t) k); + ret = msp_insert_breakpoint(self, (uint32_t) k); if (ret != 0) { goto out; } @@ -996,7 +1027,6 @@ msp_coancestry_event(msp_t *self) int ret = 0; int coalescence = 0; int defrag_required = 0; - int segment_coalesced; uint32_t j, n, l, r, r_max, v; avl_node_t *node; node_mapping_t *nm, search; @@ -1057,21 +1087,39 @@ msp_coancestry_event(msp_t *self) assert(self->next_node != 0); } v = self->next_node - 1; - l = x->left; + r_max = GSL_MIN(x->right, y->right); + /* Insert overlap counts for bounds, if necessary */ search.left = l; - node = avl_search(&self->breakpoints, &search); - assert(node != NULL); - nm = (node_mapping_t *) node->item; - nm->value -= 1; - segment_coalesced = nm->value == 1; - node = node->next; + node = avl_search(&self->overlap_counts, &search); + if (node == NULL) { + ret = msp_copy_overlap_count(self, l); + if (ret < 0) { + goto out; + } + } + search.left = r_max; + node = avl_search(&self->overlap_counts, &search); + if (node == NULL) { + ret = msp_copy_overlap_count(self, r_max); + if (ret < 0) { + goto out; + } + } + /* Now get overlap count at the left */ + search.left = l; + node = avl_search(&self->overlap_counts, &search); assert(node != NULL); nm = (node_mapping_t *) node->item; - r = nm->left; - if (!segment_coalesced) { - r_max = GSL_MIN(x->right, y->right); - while (nm->value > 2 && r < r_max) { + if (nm->value == 2) { + nm->value = 0; + node = node->next; + assert(node != NULL); + nm = (node_mapping_t *) node->item; + r = nm->left; + } else { + r = l; + while (nm->value != 2 && r < r_max) { nm->value--; node = node->next; assert(node != NULL); @@ -1084,10 +1132,39 @@ msp_coancestry_event(msp_t *self) goto out; } } + + /* l = x->left; */ + /* search.left = l; */ + /* node = avl_search(&self->breakpoints, &search); */ + /* assert(node != NULL); */ + /* nm = (node_mapping_t *) node->item; */ + /* nm->value -= 1; */ + /* segment_coalesced = nm->value == 1; */ + /* node = node->next; */ + /* assert(node != NULL); */ + /* nm = (node_mapping_t *) node->item; */ + /* r = nm->left; */ + /* if (!segment_coalesced) { */ + /* r_max = GSL_MIN(x->right, y->right); */ + /* while (nm->value > 2 && r < r_max) { */ + /* nm->value--; */ + /* node = node->next; */ + /* assert(node != NULL); */ + /* nm = (node_mapping_t *) node->item; */ + /* r = nm->left; */ + /* } */ + /* alpha = msp_alloc_segment(self, l, r, v, NULL, NULL); */ + /* if (alpha == NULL) { */ + /* ret = MSP_ERR_NO_MEMORY; */ + /* goto out; */ + /* } */ + /* } */ + ret = msp_record_coalescence(self, l, r, x->value, y->value, v); if (ret != 0) { goto out; } + /* Trim the ends of x and y, and prepare for next iteration. */ if (x->right == r) { beta = x; x = x->next; @@ -1190,11 +1267,12 @@ msp_initialise(msp_t *self) fenwick_set_value(&self->links, u->index, self->num_loci - 1); } self->next_node = self->sample_size + 1; - ret = msp_insert_breakpoint(self, 0, self->sample_size); + ret = msp_insert_overlap_count(self, 0, self->sample_size); if (ret != 0) { goto out; } - ret = msp_insert_breakpoint(self, self->num_loci, 0); + ret = msp_insert_overlap_count(self, self->num_loci, + self->sample_size + 1); if (ret != 0) { goto out; } diff --git a/lib/msprime.h b/lib/msprime.h index d100821f3..d206f29eb 100644 --- a/lib/msprime.h +++ b/lib/msprime.h @@ -129,6 +129,7 @@ typedef struct { gsl_rng *rng; avl_tree_t ancestral_population; avl_tree_t breakpoints; + avl_tree_t overlap_counts; fenwick_t links; /* memory management */ object_heap_t avl_node_heap; From 41bb125fcea56ae5c2e51a027fd269afa848693c Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 24 Nov 2015 16:59:40 +0000 Subject: [PATCH 4/8] Crude fix for new breakpoints format. --- msprime/trees.py | 7 +++++-- tests/test_highlevel.py | 7 ++++--- tests/test_lowlevel.py | 12 +++++------- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/msprime/trees.py b/msprime/trees.py index 9ebc09487..cfd75ef8c 100644 --- a/msprime/trees.py +++ b/msprime/trees.py @@ -752,11 +752,14 @@ def newick_trees(self, precision=3, breakpoints=None): else: trees_covered = 0 j = 0 + # TODO this is ugly. Update the alg so we don't need this + # bracketing. + bp = [0] + breakpoints + [self.get_num_loci()] for length, tree in iterator: trees_covered += length - while breakpoints[j] < trees_covered: + while bp[j] < trees_covered: j += 1 - yield breakpoints[j] - breakpoints[j - 1], tree + yield bp[j] - bp[j - 1], tree def dump(self, path, zlib_compression=False): """ diff --git a/tests/test_highlevel.py b/tests/test_highlevel.py index d63bf1359..57d1559c1 100644 --- a/tests/test_highlevel.py +++ b/tests/test_highlevel.py @@ -479,17 +479,18 @@ def verify_all_breakpoints(self, tree_sequence, breakpoints): the all_breakpoints option for newick generation. """ trees = list(tree_sequence.newick_trees(2, breakpoints)) - self.assertEqual(len(trees), len(breakpoints) - 1) + bp = [0] + breakpoints + [tree_sequence.get_num_loci()] + self.assertEqual(len(trees), len(bp) - 1) j = 0 s = 0 for length, _ in trees: self.assertGreater(length, 0) - self.assertEqual(s, breakpoints[j]) + self.assertEqual(s, bp[j]) s += length j += 1 self.assertEqual(s, tree_sequence.get_num_loci()) pts = tests.PythonTreeSequence( - tree_sequence.get_ll_tree_sequence(), breakpoints) + tree_sequence.get_ll_tree_sequence(), bp) diffs = list(pts.diffs(all_breaks=True)) self.assertEqual(len(diffs), len(trees)) for j in range(1, len(diffs)): diff --git a/tests/test_lowlevel.py b/tests/test_lowlevel.py index 1b9f13e5d..d2f4c01c6 100644 --- a/tests/test_lowlevel.py +++ b/tests/test_lowlevel.py @@ -272,7 +272,7 @@ def verify_running_simulation(self, sim): Verifies the state of the specified simulation that has run for at least one event. """ - self.assertGreater(sim.get_num_breakpoints(), 0) + self.assertGreaterEqual(sim.get_num_breakpoints(), 0) self.assertGreater(sim.get_time(), 0.0) self.assertGreater(sim.get_num_ancestors(), 1) self.assertGreater(sim.get_used_memory(), 0) @@ -292,11 +292,9 @@ def verify_running_simulation(self, sim): self.assertTrue(0 <= l < m) self.assertTrue(1 <= r <= m) self.assertGreaterEqual(node, 1) - breakpoints = sim.get_breakpoints() - self.assertEqual(len(breakpoints), sim.get_num_breakpoints()) + breakpoints = [0] + sim.get_breakpoints() + [m] + self.assertEqual(len(breakpoints), sim.get_num_breakpoints() + 2) self.assertEqual(breakpoints, sorted(breakpoints)) - self.assertEqual(breakpoints[0], 0) - self.assertEqual(breakpoints[-1], m) records = sim.get_coalescence_records() self.assertEqual(len(records), sim.get_num_coalescence_records()) for l, r, p, children, t in records: @@ -421,7 +419,7 @@ def verify_trees(self, sim, sorted_records): st = next(st_iter) self.verify_trees_equal(n, pi, tau, st) num_trees += 1 - self.assertLessEqual(num_trees, sim.get_num_breakpoints()) + self.assertLessEqual(num_trees, sim.get_num_breakpoints() + 2) self.assertRaises(StopIteration, next, st_iter) def verify_squashed_records(self, sorted_records): @@ -442,7 +440,7 @@ def verify_completed_simulation(self, sim): """ self.assertEqual(sim.get_ancestors(), []) self.assertEqual(sim.get_num_ancestors(), 0) - self.assertGreater(sim.get_num_breakpoints(), 0) + self.assertGreaterEqual(sim.get_num_breakpoints(), 0) self.assertGreater(sim.get_num_coalescence_records(), 0) self.assertGreater(sim.get_time(), 0.0) events = sim.get_num_coancestry_events() From 4176058e50d5b1cbf29aa8c7967feaafed8d10ee Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 24 Nov 2015 17:57:51 +0000 Subject: [PATCH 5/8] Rough first effort squashing. --- lib/main.c | 42 +++++++++++------------ lib/msprime.c | 92 +++++++++++++++++++++++++++++++++++---------------- 2 files changed, 85 insertions(+), 49 deletions(-) diff --git a/lib/main.c b/lib/main.c index c35b5708e..b043748ac 100644 --- a/lib/main.c +++ b/lib/main.c @@ -414,38 +414,38 @@ run_simulate(char *conf_file) goto out; } msp_verify(msp); - ret = msp_print_state(msp); + /* ret = msp_print_state(msp); */ } ret = msp_print_state(msp); if (ret != 0) { goto out; } - /* Create the tree_sequence from the state of the simulator. */ - ret = tree_sequence_create(tree_seq, msp); - if (ret != 0) { - goto out; - } - ret = tree_sequence_generate_mutations(tree_seq, - mutation_params.mutation_rate, mutation_params.random_seed); - if (ret != 0) { - goto out; - } - int j; - for (j = 0; j < 1; j++) { - ret = tree_sequence_dump(tree_seq, output_file, 0); + if (0) { + /* Create the tree_sequence from the state of the simulator. */ + ret = tree_sequence_create(tree_seq, msp); if (ret != 0) { goto out; } - tree_sequence_free(tree_seq); - memset(tree_seq, 0, sizeof(tree_sequence_t)); - ret = tree_sequence_load(tree_seq, output_file, 0); + ret = tree_sequence_generate_mutations(tree_seq, + mutation_params.mutation_rate, mutation_params.random_seed); if (ret != 0) { goto out; } - } - print_tree_sequence(tree_seq); - print_haplotypes(tree_seq); - if (0) { + int j; + for (j = 0; j < 1; j++) { + ret = tree_sequence_dump(tree_seq, output_file, 0); + if (ret != 0) { + goto out; + } + tree_sequence_free(tree_seq); + memset(tree_seq, 0, sizeof(tree_sequence_t)); + ret = tree_sequence_load(tree_seq, output_file, 0); + if (ret != 0) { + goto out; + } + } + print_tree_sequence(tree_seq); + print_haplotypes(tree_seq); tree_sequence_print_state(tree_seq); print_newick_trees(tree_seq); tree_sequence_print_state(tree_seq); diff --git a/lib/msprime.c b/lib/msprime.c index 5c5534bda..f0b4690d0 100644 --- a/lib/msprime.c +++ b/lib/msprime.c @@ -965,6 +965,64 @@ msp_record_coalescence(msp_t *self, uint32_t left, uint32_t right, return ret; } +static int +msp_compress_overlap_counts(msp_t *self) +{ + int ret = 0; + avl_node_t *node1, *node2; + node_mapping_t *nm1, *nm2; + + /* printf("Coalescence occured\n"); */ + + /* printf("Before: Overlap count = %d\n", avl_count(&self->overlap_counts)); */ + /* for (node1 = self->overlap_counts.head; node1 != NULL; node1 = node1->next) { */ + /* nm1 = (node_mapping_t *) node1->item; */ + /* printf("\t%d -> %d\n", nm1->left, nm1->value); */ + /* } */ + + node1 = self->overlap_counts.head; + node2 = node1->next; + while (node2 != NULL) { + nm1 = (node_mapping_t *) node1->item; + nm2 = (node_mapping_t *) node2->item; + if (nm1->value == nm2->value) { + /* printf("Squash %d\n", nm2->left); */ + // TODO free the node mapping. + avl_unlink_node(&self->overlap_counts, node2); + msp_free_avl_node(self, node2); + node2 = node1->next; + } else { + node1 = node2; + node2 = node2->next; + } + } + + /* printf("After: Overlap count = %d\n", avl_count(&self->overlap_counts)); */ + /* for (node1 = self->overlap_counts.head; node1 != NULL; node1 = node1->next) { */ + /* nm1 = (node_mapping_t *) node1->item; */ + /* printf("\t%d -> %d\n", nm1->left, nm1->value); */ + /* } */ + +/* for (node = self->overlap_counts.head; node->next != NULL; node = node->next) { */ +/* nm = (node_mapping_t *) node->item; */ +/* v = nm->value; */ +/* nm = (node_mapping_t *) node->next->item; */ +/* if (v == nm->value) { */ +/* printf("squash %d\n", nm->left); */ +/* } */ + +/* } */ + +/* printf("After: Overlap count = %d\n", avl_count(&self->overlap_counts)); */ +/* for (node = self->overlap_counts.head; node != NULL; */ +/* node = node->next) { */ +/* nm = (node_mapping_t *) node->item; */ +/* printf("\t%d -> %d\n", nm->left, nm->value); */ +/* } */ + + return ret; +} + static int WARN_UNUSED msp_recombination_event(msp_t *self) { @@ -1132,34 +1190,6 @@ msp_coancestry_event(msp_t *self) goto out; } } - - /* l = x->left; */ - /* search.left = l; */ - /* node = avl_search(&self->breakpoints, &search); */ - /* assert(node != NULL); */ - /* nm = (node_mapping_t *) node->item; */ - /* nm->value -= 1; */ - /* segment_coalesced = nm->value == 1; */ - /* node = node->next; */ - /* assert(node != NULL); */ - /* nm = (node_mapping_t *) node->item; */ - /* r = nm->left; */ - /* if (!segment_coalesced) { */ - /* r_max = GSL_MIN(x->right, y->right); */ - /* while (nm->value > 2 && r < r_max) { */ - /* nm->value--; */ - /* node = node->next; */ - /* assert(node != NULL); */ - /* nm = (node_mapping_t *) node->item; */ - /* r = nm->left; */ - /* } */ - /* alpha = msp_alloc_segment(self, l, r, v, NULL, NULL); */ - /* if (alpha == NULL) { */ - /* ret = MSP_ERR_NO_MEMORY; */ - /* goto out; */ - /* } */ - /* } */ - ret = msp_record_coalescence(self, l, r, x->value, y->value, v); if (ret != 0) { goto out; @@ -1215,6 +1245,12 @@ msp_coancestry_event(msp_t *self) y = x; } } + if (coalescence) { + ret = msp_compress_overlap_counts(self); + if (ret != 0) { + goto out; + } + } out: return ret; } From e679c7d3692c369e65f26e10bb3dff588383f9a8 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 24 Nov 2015 20:06:39 +0000 Subject: [PATCH 6/8] Added bounds on compress. --- lib/msprime.c | 43 +++++++++++++++---------------------------- 1 file changed, 15 insertions(+), 28 deletions(-) diff --git a/lib/msprime.c b/lib/msprime.c index f0b4690d0..4a9090ef5 100644 --- a/lib/msprime.c +++ b/lib/msprime.c @@ -966,13 +966,13 @@ msp_record_coalescence(msp_t *self, uint32_t left, uint32_t right, } static int -msp_compress_overlap_counts(msp_t *self) +msp_compress_overlap_counts(msp_t *self, uint32_t l, uint32_t r) { int ret = 0; avl_node_t *node1, *node2; - node_mapping_t *nm1, *nm2; + node_mapping_t search, *nm1, *nm2; - /* printf("Coalescence occured\n"); */ + /* printf("Coalescence occured %d %d\n", l, r); */ /* printf("Before: Overlap count = %d\n", avl_count(&self->overlap_counts)); */ /* for (node1 = self->overlap_counts.head; node1 != NULL; node1 = node1->next) { */ @@ -980,9 +980,13 @@ msp_compress_overlap_counts(msp_t *self) /* printf("\t%d -> %d\n", nm1->left, nm1->value); */ /* } */ - node1 = self->overlap_counts.head; + search.left = l; + node1 = avl_search(&self->overlap_counts, &search); + if (node1->prev != NULL) { + node1 = node1->prev; + } node2 = node1->next; - while (node2 != NULL) { + do { nm1 = (node_mapping_t *) node1->item; nm2 = (node_mapping_t *) node2->item; if (nm1->value == nm2->value) { @@ -995,31 +999,13 @@ msp_compress_overlap_counts(msp_t *self) node1 = node2; node2 = node2->next; } - } + } while (node2 != NULL && nm2->left < r); /* printf("After: Overlap count = %d\n", avl_count(&self->overlap_counts)); */ /* for (node1 = self->overlap_counts.head; node1 != NULL; node1 = node1->next) { */ /* nm1 = (node_mapping_t *) node1->item; */ /* printf("\t%d -> %d\n", nm1->left, nm1->value); */ /* } */ - -/* for (node = self->overlap_counts.head; node->next != NULL; node = node->next) { */ -/* nm = (node_mapping_t *) node->item; */ -/* v = nm->value; */ -/* nm = (node_mapping_t *) node->next->item; */ -/* if (v == nm->value) { */ -/* printf("squash %d\n", nm->left); */ -/* } */ - -/* } */ - -/* printf("After: Overlap count = %d\n", avl_count(&self->overlap_counts)); */ -/* for (node = self->overlap_counts.head; node != NULL; */ -/* node = node->next) { */ -/* nm = (node_mapping_t *) node->item; */ -/* printf("\t%d -> %d\n", nm->left, nm->value); */ -/* } */ - return ret; } @@ -1085,7 +1071,7 @@ msp_coancestry_event(msp_t *self) int ret = 0; int coalescence = 0; int defrag_required = 0; - uint32_t j, n, l, r, r_max, v; + uint32_t j, n, l, r, l_min, r_max, v; avl_node_t *node; node_mapping_t *nm, search; segment_t *x, *y, *z, *alpha, *beta; @@ -1138,15 +1124,16 @@ msp_coancestry_event(msp_t *self) } x->left = y->left; } else { + l = x->left; + r_max = GSL_MIN(x->right, y->right); if (!coalescence) { coalescence = 1; + l_min = l; self->next_node++; /* Check for overflow */ assert(self->next_node != 0); } v = self->next_node - 1; - l = x->left; - r_max = GSL_MIN(x->right, y->right); /* Insert overlap counts for bounds, if necessary */ search.left = l; node = avl_search(&self->overlap_counts, &search); @@ -1246,7 +1233,7 @@ msp_coancestry_event(msp_t *self) } } if (coalescence) { - ret = msp_compress_overlap_counts(self); + ret = msp_compress_overlap_counts(self, l_min, r_max); if (ret != 0) { goto out; } From 327a44709439d9e4536f1c76923280fae0f097fb Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 24 Nov 2015 20:25:57 +0000 Subject: [PATCH 7/8] Bumped version number. --- msprime/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/msprime/__init__.py b/msprime/__init__.py index d8ac5832c..dc02e9cb8 100644 --- a/msprime/__init__.py +++ b/msprime/__init__.py @@ -28,4 +28,4 @@ from msprime.trees import * # NOQA -__version__ = '0.1.5' +__version__ = '0.1.6.dev1' From 5d766cc346f9e7be139db2e9d8ad65d5ad229917 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Wed, 25 Nov 2015 10:35:23 +0000 Subject: [PATCH 8/8] Tidied up compress implementation. --- lib/dev.cfg | 2 +- lib/msprime.c | 111 ++++++++++++++------------------------------------ lib/msprime.h | 5 +-- 3 files changed, 33 insertions(+), 85 deletions(-) diff --git a/lib/dev.cfg b/lib/dev.cfg index b1eed48b3..6746ed6a0 100644 --- a/lib/dev.cfg +++ b/lib/dev.cfg @@ -1,5 +1,5 @@ # Development config file -sample_size = 20; +sample_size = 9; num_loci = 57; random_seed = 878638576; # The scaled recombination rate, 4 Ne rho. Note that unlike ms diff --git a/lib/msprime.c b/lib/msprime.c index 4a9090ef5..410361fcd 100644 --- a/lib/msprime.c +++ b/lib/msprime.c @@ -269,31 +269,6 @@ msp_get_node_mapping_mem_increment(msp_t *self) * sizeof(node_mapping_t); } -static int WARN_UNUSED -msp_add_node_mapping_block(msp_t *self) -{ - int ret = -1; - void *p = realloc(self->node_mapping_blocks, - (self->num_node_mapping_blocks + 1) * sizeof(void *)); - - if (p == NULL) { - ret = MSP_ERR_NO_MEMORY; - goto out; - } - self->node_mapping_blocks = p; - p = malloc(self->node_mapping_block_size * sizeof(node_mapping_t)); - if (p == NULL) { - ret = MSP_ERR_NO_MEMORY; - goto out; - } - self->node_mapping_blocks[self->num_node_mapping_blocks] = p; - self->num_node_mapping_blocks++; - self->next_node_mapping = 0; - ret = 0; -out: - return ret; -} - size_t msp_get_num_avl_node_blocks(msp_t *self) { @@ -303,7 +278,7 @@ msp_get_num_avl_node_blocks(msp_t *self) size_t msp_get_num_node_mapping_blocks(msp_t *self) { - return self->num_node_mapping_blocks; + return self->node_mapping_heap.num_blocks; } size_t @@ -336,6 +311,7 @@ int msp_set_num_loci(msp_t *self, uint32_t num_loci) { int ret = 0; + if (num_loci < 1) { ret = MSP_ERR_BAD_PARAM_VALUE; goto out; @@ -350,6 +326,7 @@ msp_set_scaled_recombination_rate(msp_t *self, double scaled_recombination_rate) { int ret = 0; + if (scaled_recombination_rate < 0) { ret = MSP_ERR_BAD_PARAM_VALUE; goto out; @@ -363,6 +340,7 @@ int msp_set_max_memory(msp_t *self, size_t max_memory) { int ret = 0; + if (max_memory < 1) { ret = MSP_ERR_BAD_PARAM_VALUE; goto out; @@ -375,6 +353,7 @@ msp_set_max_memory(msp_t *self, size_t max_memory) int msp_set_node_mapping_block_size(msp_t *self, size_t block_size) { int ret = 0; + if (block_size < 1) { ret = MSP_ERR_BAD_PARAM_VALUE; goto out; @@ -387,6 +366,7 @@ int msp_set_node_mapping_block_size(msp_t *self, size_t block_size) int msp_set_segment_block_size(msp_t *self, size_t block_size) { int ret = 0; + if (block_size < 1) { ret = MSP_ERR_BAD_PARAM_VALUE; goto out; @@ -399,6 +379,7 @@ int msp_set_segment_block_size(msp_t *self, size_t block_size) int msp_set_avl_node_block_size(msp_t *self, size_t block_size) { int ret = 0; + if (block_size < 1) { ret = MSP_ERR_BAD_PARAM_VALUE; goto out; @@ -411,6 +392,7 @@ int msp_set_avl_node_block_size(msp_t *self, size_t block_size) int msp_set_coalescence_record_block_size(msp_t *self, size_t block_size) { int ret = 0; + if (block_size < 1) { ret = MSP_ERR_BAD_PARAM_VALUE; goto out; @@ -474,20 +456,11 @@ msp_alloc_memory_blocks(msp_t *self) if (ret != 0) { goto out; } - /* set up the node mapping allocator */ - self->node_mapping_blocks = malloc(sizeof(void *)); - if (self->node_mapping_blocks == NULL) { - ret = MSP_ERR_NO_MEMORY; - goto out; - } - self->num_node_mapping_blocks = 1; - self->node_mapping_blocks[0] = malloc(self->node_mapping_block_size - * sizeof(node_mapping_t)); - if (self->node_mapping_blocks[0] == NULL) { - ret = MSP_ERR_NO_MEMORY; + ret = object_heap_init(&self->node_mapping_heap, sizeof(node_mapping_t), + self->node_mapping_block_size, NULL); + if (ret != 0) { goto out; } - self->next_node_mapping = 0; /* allocate the segments and Fenwick tree */ ret = object_heap_init(&self->segment_heap, sizeof(segment_t), self->segment_block_size, segment_init); @@ -532,7 +505,6 @@ int msp_free(msp_t *self) { int ret = -1; - size_t j; if (self->population_models != NULL) { free(self->population_models); @@ -543,15 +515,8 @@ msp_free(msp_t *self) /* free the object heaps */ object_heap_free(&self->avl_node_heap); object_heap_free(&self->segment_heap); + object_heap_free(&self->node_mapping_heap); fenwick_free(&self->links); - if (self->node_mapping_blocks != NULL) { - for (j = 0; j < self->num_node_mapping_blocks; j++) { - if (self->node_mapping_blocks[j] != NULL) { - free(self->node_mapping_blocks[j]); - } - } - free(self->node_mapping_blocks); - } if (self->coalescence_records != NULL) { free(self->coalescence_records); } @@ -588,30 +553,33 @@ static inline node_mapping_t * msp_alloc_node_mapping(msp_t *self) { node_mapping_t *ret = NULL; - char *p; - if (self->next_node_mapping == self->node_mapping_block_size) { + if (object_heap_empty(&self->node_mapping_heap)) { self->used_memory += msp_get_node_mapping_mem_increment(self); if (self->used_memory > self->max_memory) { goto out; } - if (msp_add_node_mapping_block(self) != 0) { + if (object_heap_expand(&self->node_mapping_heap) != 0) { goto out; } } - p = self->node_mapping_blocks[self->num_node_mapping_blocks - 1]; - /* cast to void * here to avoid alignment warnings from clang. */ - ret = (void *)(p + - self->next_node_mapping * sizeof(node_mapping_t)); - self->next_node_mapping++; + ret = (node_mapping_t *) object_heap_alloc_object(&self->node_mapping_heap); + if (ret == NULL) { + goto out; + } out: return ret; } +static void +msp_free_node_mapping(msp_t *self, node_mapping_t *nm) +{ + object_heap_free_object(&self->node_mapping_heap, nm); +} static segment_t * WARN_UNUSED -msp_alloc_segment(msp_t *self, uint32_t left, uint32_t right, uint32_t value, segment_t *prev, - segment_t *next) +msp_alloc_segment(msp_t *self, uint32_t left, uint32_t right, uint32_t value, + segment_t *prev, segment_t *next) { segment_t *seg = NULL; @@ -755,6 +723,8 @@ msp_verify(msp_t *self) + avl_count(&self->overlap_counts); assert(total_avl_nodes == object_heap_get_num_allocated( &self->avl_node_heap)); + assert(total_avl_nodes - avl_count(&self->ancestral_population) + == object_heap_get_num_allocated(&self->node_mapping_heap)); if (total_avl_nodes == total_segments) { /* do nothing - this is just to keep the compiler happy when * asserts are turned off. @@ -834,10 +804,8 @@ msp_print_state(msp_t *self) object_heap_print_state(&self->avl_node_heap); printf("segment_heap:"); object_heap_print_state(&self->segment_heap); - printf("node_mapping stack:\n"); - printf("\tblock size = %d\n", (int) self->node_mapping_block_size); - printf("\tnext = %d\n", (int) self->next_node_mapping); - printf("\tnum_blocks = %d\n", (int) self->num_node_mapping_blocks); + printf("node_mapping_heap:"); + object_heap_print_state(&self->node_mapping_heap); msp_verify(self); out: if (ancestors != NULL) { @@ -972,14 +940,6 @@ msp_compress_overlap_counts(msp_t *self, uint32_t l, uint32_t r) avl_node_t *node1, *node2; node_mapping_t search, *nm1, *nm2; - /* printf("Coalescence occured %d %d\n", l, r); */ - - /* printf("Before: Overlap count = %d\n", avl_count(&self->overlap_counts)); */ - /* for (node1 = self->overlap_counts.head; node1 != NULL; node1 = node1->next) { */ - /* nm1 = (node_mapping_t *) node1->item; */ - /* printf("\t%d -> %d\n", nm1->left, nm1->value); */ - /* } */ - search.left = l; node1 = avl_search(&self->overlap_counts, &search); if (node1->prev != NULL) { @@ -990,22 +950,15 @@ msp_compress_overlap_counts(msp_t *self, uint32_t l, uint32_t r) nm1 = (node_mapping_t *) node1->item; nm2 = (node_mapping_t *) node2->item; if (nm1->value == nm2->value) { - /* printf("Squash %d\n", nm2->left); */ - // TODO free the node mapping. avl_unlink_node(&self->overlap_counts, node2); msp_free_avl_node(self, node2); + msp_free_node_mapping(self, nm2); node2 = node1->next; } else { node1 = node2; node2 = node2->next; } - } while (node2 != NULL && nm2->left < r); - - /* printf("After: Overlap count = %d\n", avl_count(&self->overlap_counts)); */ - /* for (node1 = self->overlap_counts.head; node1 != NULL; node1 = node1->next) { */ - /* nm1 = (node_mapping_t *) node1->item; */ - /* printf("\t%d -> %d\n", nm1->left, nm1->value); */ - /* } */ + } while (node2 != NULL && nm2->left <= r); return ret; } @@ -1242,8 +1195,6 @@ msp_coancestry_event(msp_t *self) return ret; } - - /* Given the value of the current_population_model, return the next one, * or NULL if none exists. */ diff --git a/lib/msprime.h b/lib/msprime.h index d206f29eb..f6476c742 100644 --- a/lib/msprime.h +++ b/lib/msprime.h @@ -134,10 +134,7 @@ typedef struct { /* memory management */ object_heap_t avl_node_heap; object_heap_t segment_heap; - /* node mappings are never freed, so simpler requirements */ - void **node_mapping_blocks; - size_t num_node_mapping_blocks; - size_t next_node_mapping; + object_heap_t node_mapping_heap; /* coalescence records are stored in an array */ coalescence_record_t *coalescence_records; size_t num_coalescence_records;