Skip to content

Commit

Permalink
Support sample sets in divmat
Browse files Browse the repository at this point in the history
Add missing test for threading + span_normalise

First pass at sample_sets args

Update normalisation

Basic C version

Basic approach work to sample sets branch

Basic Python C API plumbing

Progress(ish)

Implement draft "ids" arg

Harden low-level sample sets

Add in quick version of GRM

Change back to sample sets

Update

numpify normalisation
  • Loading branch information
jeromekelleher committed Jan 15, 2024
1 parent 77faade commit 64a9528
Show file tree
Hide file tree
Showing 10 changed files with 714 additions and 258 deletions.
116 changes: 83 additions & 33 deletions c/tests/test_stats.c
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,8 @@ verify_divergence_matrix(tsk_treeseq_t *ts, tsk_flags_t options)
ts, n, sample_set_sizes, samples, n * n, index_tuples, 0, NULL, options, D1);
CU_ASSERT_EQUAL_FATAL(ret, 0);

ret = tsk_treeseq_divergence_matrix(ts, 0, NULL, 0, NULL, options, D2);
ret = tsk_treeseq_divergence_matrix(
ts, n, sample_set_sizes, samples, 0, NULL, options, D2);
CU_ASSERT_EQUAL_FATAL(ret, 0);

for (j = 0; j < n; j++) {
Expand Down Expand Up @@ -1058,18 +1059,41 @@ test_single_tree_divergence_matrix(void)
double result[16];
double D_branch[16] = { 0, 2, 6, 6, 2, 0, 6, 6, 6, 6, 0, 4, 6, 6, 4, 0 };
double D_site[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
tsk_size_t sample_set_sizes[] = { 2, 2 };

tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL,
NULL, NULL, NULL, 0);

ret = tsk_treeseq_divergence_matrix(&ts, 0, NULL, 0, NULL, TSK_STAT_BRANCH, result);
ret = tsk_treeseq_divergence_matrix(
&ts, 0, NULL, NULL, 0, NULL, TSK_STAT_BRANCH, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);
assert_arrays_almost_equal(16, result, D_branch);

ret = tsk_treeseq_divergence_matrix(&ts, 0, NULL, 0, NULL, TSK_STAT_SITE, result);
ret = tsk_treeseq_divergence_matrix(
&ts, 0, NULL, NULL, 0, NULL, TSK_STAT_SITE, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);
assert_arrays_almost_equal(16, result, D_site);

ret = tsk_treeseq_divergence_matrix(
&ts, 2, sample_set_sizes, NULL, 0, NULL, TSK_STAT_SITE, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);
assert_arrays_almost_equal(4, result, D_site);

ret = tsk_treeseq_divergence_matrix(
&ts, 2, sample_set_sizes, NULL, 0, NULL, TSK_STAT_BRANCH, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);

sample_set_sizes[0] = 3;
sample_set_sizes[1] = 1;
ret = tsk_treeseq_divergence_matrix(
&ts, 2, sample_set_sizes, NULL, 0, NULL, TSK_STAT_BRANCH, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_treeseq_divergence_matrix(
&ts, 2, sample_set_sizes, NULL, 0, NULL, TSK_STAT_SITE, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);

/* assert_arrays_almost_equal(4, result, D_site); */

verify_divergence_matrix(&ts, TSK_STAT_BRANCH);
verify_divergence_matrix(&ts, TSK_STAT_BRANCH | TSK_STAT_SPAN_NORMALISE);
verify_divergence_matrix(&ts, TSK_STAT_SITE);
Expand All @@ -1083,7 +1107,7 @@ test_single_tree_divergence_matrix_internal_samples(void)
{
tsk_treeseq_t ts;
int ret;
double result[16];
double *result = malloc(16 * sizeof(double));
double D[16] = { 0, 2, 4, 3, 2, 0, 4, 3, 4, 4, 0, 1, 3, 3, 1, 0 };

const char *nodes = "1 0 -1 -1\n" /* 2.00┊ 6 ┊ */
Expand All @@ -1109,14 +1133,35 @@ test_single_tree_divergence_matrix_internal_samples(void)
"3 3 T -1\n"
"4 4 T -1\n"
"5 5 T -1\n";
tsk_id_t samples[] = { 0, 1, 2, 5 };
tsk_size_t sizes[] = { 1, 1, 1, 1 };

tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, sites, mutations, NULL, NULL, 0);

ret = tsk_treeseq_divergence_matrix(&ts, 0, NULL, 0, NULL, TSK_STAT_BRANCH, result);
ret = tsk_treeseq_divergence_matrix(
&ts, 0, NULL, NULL, 0, NULL, TSK_STAT_BRANCH, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);
assert_arrays_almost_equal(16, result, D);
ret = tsk_treeseq_divergence_matrix(
&ts, 0, NULL, NULL, 0, NULL, TSK_STAT_SITE, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);
assert_arrays_almost_equal(16, result, D);

ret = tsk_treeseq_divergence_matrix(
&ts, 4, sizes, samples, 0, NULL, TSK_STAT_BRANCH, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);
assert_arrays_almost_equal(16, result, D);
ret = tsk_treeseq_divergence_matrix(
&ts, 4, sizes, samples, 0, NULL, TSK_STAT_SITE, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);
assert_arrays_almost_equal(16, result, D);

ret = tsk_treeseq_divergence_matrix(&ts, 0, NULL, 0, NULL, TSK_STAT_SITE, result);
ret = tsk_treeseq_divergence_matrix(
&ts, 4, NULL, samples, 0, NULL, TSK_STAT_BRANCH, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);
assert_arrays_almost_equal(16, result, D);
ret = tsk_treeseq_divergence_matrix(
&ts, 4, NULL, samples, 0, NULL, TSK_STAT_SITE, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);
assert_arrays_almost_equal(16, result, D);

Expand All @@ -1126,6 +1171,7 @@ test_single_tree_divergence_matrix_internal_samples(void)
verify_divergence_matrix(&ts, TSK_STAT_SITE | TSK_STAT_SPAN_NORMALISE);

tsk_treeseq_free(&ts);
free(result);
}

static void
Expand Down Expand Up @@ -1164,7 +1210,8 @@ test_single_tree_divergence_matrix_multi_root(void)

tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, sites, mutations, NULL, NULL, 0);

ret = tsk_treeseq_divergence_matrix(&ts, 0, NULL, 0, NULL, TSK_STAT_BRANCH, result);
ret = tsk_treeseq_divergence_matrix(
&ts, 0, NULL, NULL, 0, NULL, TSK_STAT_BRANCH, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);
assert_arrays_almost_equal(16, result, D_branch);

Expand Down Expand Up @@ -2434,61 +2481,64 @@ test_simplest_divergence_matrix(void)
tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);

ret = tsk_treeseq_divergence_matrix(
&ts, 2, sample_ids, 0, NULL, TSK_STAT_BRANCH, result);
&ts, 2, NULL, sample_ids, 0, NULL, TSK_STAT_BRANCH, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);
assert_arrays_almost_equal(4, D_branch, result);

ret = tsk_treeseq_divergence_matrix(
&ts, 2, sample_ids, 0, NULL, TSK_STAT_BRANCH | TSK_STAT_SPAN_NORMALISE, result);
ret = tsk_treeseq_divergence_matrix(&ts, 2, NULL, sample_ids, 0, NULL,
TSK_STAT_BRANCH | TSK_STAT_SPAN_NORMALISE, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);
assert_arrays_almost_equal(4, D_branch, result);

ret = tsk_treeseq_divergence_matrix(&ts, 2, sample_ids, 0, NULL, 0, result);
ret = tsk_treeseq_divergence_matrix(&ts, 2, NULL, sample_ids, 0, NULL, 0, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);
assert_arrays_almost_equal(4, D_site, result);

ret = tsk_treeseq_divergence_matrix(
&ts, 2, sample_ids, 0, NULL, TSK_STAT_SPAN_NORMALISE, result);
&ts, 2, NULL, sample_ids, 0, NULL, TSK_STAT_SPAN_NORMALISE, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);
assert_arrays_almost_equal(4, D_site, result);

ret = tsk_treeseq_divergence_matrix(
&ts, 2, sample_ids, 0, NULL, TSK_STAT_SITE, result);
&ts, 2, NULL, sample_ids, 0, NULL, TSK_STAT_SITE, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);
assert_arrays_almost_equal(4, D_site, result);

ret = tsk_treeseq_divergence_matrix(&ts, 0, NULL, 0, NULL, TSK_STAT_BRANCH, result);
ret = tsk_treeseq_divergence_matrix(
&ts, 0, NULL, NULL, 0, NULL, TSK_STAT_BRANCH, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);
assert_arrays_almost_equal(4, D_branch, result);

ret = tsk_treeseq_divergence_matrix(&ts, 0, NULL, 0, NULL, TSK_STAT_SITE, result);
ret = tsk_treeseq_divergence_matrix(
&ts, 0, NULL, NULL, 0, NULL, TSK_STAT_SITE, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);
assert_arrays_almost_equal(4, D_site, result);

ret = tsk_treeseq_divergence_matrix(&ts, 0, NULL, 0, NULL, TSK_STAT_NODE, result);
ret = tsk_treeseq_divergence_matrix(
&ts, 0, NULL, NULL, 0, NULL, TSK_STAT_NODE, result);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSUPPORTED_STAT_MODE);

ret = tsk_treeseq_divergence_matrix(
&ts, 0, NULL, 0, NULL, TSK_STAT_POLARISED, result);
&ts, 0, NULL, NULL, 0, NULL, TSK_STAT_POLARISED, result);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_STAT_POLARISED_UNSUPPORTED);

ret = tsk_treeseq_divergence_matrix(
&ts, 0, NULL, 0, NULL, TSK_STAT_SITE | TSK_STAT_BRANCH, result);
&ts, 0, NULL, NULL, 0, NULL, TSK_STAT_SITE | TSK_STAT_BRANCH, result);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MULTIPLE_STAT_MODES);

sample_ids[0] = -1;
ret = tsk_treeseq_divergence_matrix(&ts, 2, sample_ids, 0, NULL, 0, result);
ret = tsk_treeseq_divergence_matrix(&ts, 2, NULL, sample_ids, 0, NULL, 0, result);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);

sample_ids[0] = 3;
ret = tsk_treeseq_divergence_matrix(&ts, 2, sample_ids, 0, NULL, 0, result);
ret = tsk_treeseq_divergence_matrix(&ts, 2, NULL, sample_ids, 0, NULL, 0, result);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);

sample_ids[0] = 1;
ret = tsk_treeseq_divergence_matrix(&ts, 2, sample_ids, 0, NULL, 0, result);
ret = tsk_treeseq_divergence_matrix(&ts, 2, NULL, sample_ids, 0, NULL, 0, result);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_DUPLICATE_SAMPLE);
ret = tsk_treeseq_divergence_matrix(
&ts, 2, sample_ids, 0, NULL, TSK_STAT_BRANCH, result);
&ts, 2, NULL, sample_ids, 0, NULL, TSK_STAT_BRANCH, result);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_DUPLICATE_SAMPLE);

tsk_treeseq_free(&ts);
Expand All @@ -2515,39 +2565,39 @@ test_simplest_divergence_matrix_windows(void)

tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, sites, mutations, NULL, NULL, 0);

ret = tsk_treeseq_divergence_matrix(&ts, 2, sample_ids, 2, windows, 0, result);
ret = tsk_treeseq_divergence_matrix(&ts, 2, NULL, sample_ids, 2, windows, 0, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);
assert_arrays_almost_equal(8, D_site, result);
ret = tsk_treeseq_divergence_matrix(
&ts, 2, sample_ids, 2, windows, TSK_STAT_BRANCH, result);
&ts, 2, NULL, sample_ids, 2, windows, TSK_STAT_BRANCH, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);
assert_arrays_almost_equal(8, D_branch, result);

/* Windows for the second half */
ret = tsk_treeseq_divergence_matrix(
&ts, 2, sample_ids, 1, windows + 1, TSK_STAT_SITE, result);
&ts, 2, NULL, sample_ids, 1, windows + 1, TSK_STAT_SITE, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);
assert_arrays_almost_equal(4, D_site, result);
ret = tsk_treeseq_divergence_matrix(
&ts, 2, sample_ids, 1, windows + 1, TSK_STAT_BRANCH, result);
&ts, 2, NULL, sample_ids, 1, windows + 1, TSK_STAT_BRANCH, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);
assert_arrays_almost_equal(4, D_branch, result);

ret = tsk_treeseq_divergence_matrix(&ts, 2, sample_ids, 0, windows, 0, result);
ret = tsk_treeseq_divergence_matrix(&ts, 2, NULL, sample_ids, 0, windows, 0, result);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_NUM_WINDOWS);

windows[0] = -1;
ret = tsk_treeseq_divergence_matrix(&ts, 2, sample_ids, 2, windows, 0, result);
ret = tsk_treeseq_divergence_matrix(&ts, 2, NULL, sample_ids, 2, windows, 0, result);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS);

windows[0] = 0.45;
windows[2] = 1.5;
ret = tsk_treeseq_divergence_matrix(&ts, 2, sample_ids, 2, windows, 0, result);
ret = tsk_treeseq_divergence_matrix(&ts, 2, NULL, sample_ids, 2, windows, 0, result);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS);

windows[0] = 0.55;
windows[2] = 1.0;
ret = tsk_treeseq_divergence_matrix(&ts, 2, sample_ids, 2, windows, 0, result);
ret = tsk_treeseq_divergence_matrix(&ts, 2, NULL, sample_ids, 2, windows, 0, result);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_WINDOWS);

tsk_treeseq_free(&ts);
Expand All @@ -2558,7 +2608,7 @@ test_simplest_divergence_matrix_internal_sample(void)
{
const char *nodes = "1 0 0\n"
"1 0 0\n"
"0 1 0\n";
"1 1 0\n";
const char *edges = "0 1 2 0,1\n";
tsk_treeseq_t ts;
tsk_id_t sample_ids[] = { 0, 1, 2 };
Expand All @@ -2570,12 +2620,12 @@ test_simplest_divergence_matrix_internal_sample(void)
tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);

ret = tsk_treeseq_divergence_matrix(
&ts, 3, sample_ids, 0, NULL, TSK_STAT_BRANCH, result);
&ts, 3, NULL, sample_ids, 0, NULL, TSK_STAT_BRANCH, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);
assert_arrays_almost_equal(9, D_branch, result);

ret = tsk_treeseq_divergence_matrix(
&ts, 3, sample_ids, 0, NULL, TSK_STAT_SITE, result);
&ts, 3, NULL, sample_ids, 0, NULL, TSK_STAT_SITE, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);
assert_arrays_almost_equal(9, D_site, result);

Expand Down
5 changes: 3 additions & 2 deletions c/tests/test_trees.c
Original file line number Diff line number Diff line change
Expand Up @@ -8008,9 +8008,10 @@ test_time_uncalibrated(void)
TSK_STAT_BRANCH | TSK_STAT_ALLOW_TIME_UNCALIBRATED, sigma);
CU_ASSERT_EQUAL_FATAL(ret, 0);

ret = tsk_treeseq_divergence_matrix(&ts2, 0, NULL, 0, NULL, TSK_STAT_BRANCH, result);
ret = tsk_treeseq_divergence_matrix(
&ts2, 0, NULL, NULL, 0, NULL, TSK_STAT_BRANCH, result);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TIME_UNCALIBRATED);
ret = tsk_treeseq_divergence_matrix(&ts2, 0, NULL, 0, NULL,
ret = tsk_treeseq_divergence_matrix(&ts2, 0, NULL, NULL, 0, NULL,
TSK_STAT_BRANCH | TSK_STAT_ALLOW_TIME_UNCALIBRATED, result);
CU_ASSERT_EQUAL_FATAL(ret, 0);

Expand Down
10 changes: 0 additions & 10 deletions c/tests/testlib.c
Original file line number Diff line number Diff line change
Expand Up @@ -966,16 +966,6 @@ tskit_suite_init(void)
return CUE_SUCCESS;
}

void
assert_arrays_almost_equal(tsk_size_t len, double *a, double *b)
{
tsk_size_t j;

for (j = 0; j < len; j++) {
CU_ASSERT_DOUBLE_EQUAL(a[j], b[j], 1e-9);
}
}

static int
tskit_suite_cleanup(void)
{
Expand Down
11 changes: 10 additions & 1 deletion c/tests/testlib.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,16 @@ void parse_individuals(const char *text, tsk_individual_table_t *individual_tabl

void unsort_edges(tsk_edge_table_t *edges, size_t start);

void assert_arrays_almost_equal(tsk_size_t len, double *a, double *b);
/* Use a macro so we can get line numbers at roughly the right place */
#define assert_arrays_almost_equal(len, a, b) \
{ \
do { \
tsk_size_t _j; \
for (_j = 0; _j < len; _j++) { \
CU_ASSERT_DOUBLE_EQUAL(a[_j], b[_j], 1e-9); \
} \
} while (0); \
}

extern const char *single_tree_ex_nodes;
extern const char *single_tree_ex_edges;
Expand Down
Loading

0 comments on commit 64a9528

Please sign in to comment.