# evaluation.py

"""
Script for statistically evaluating various aspects of tsinfer performance.
"""
import argparse
import concurrent.futures
import json
import logging
import os.path
import random
import time
import warnings

import colorama
import daiquiri
import matplotlib as mp
import msprime
import numpy as np
import pandas as pd
import tqdm
import tskit

import tsinfer
import tsinfer.cli as cli

# We break the normal conventions for ordering imports here
# because we have to make this ugly hack to make matplotlib
# work from a shell session and keep flake8 happy.

# Force matplotlib to not use any Xwindows backend.
mp.use("Agg")
import matplotlib.pyplot as plt  # noqa: E402
from matplotlib import collections as mc  # noqa: E402
import seaborn as sns  # noqa: E402


# File extension used by save_figure() when writing plots. Set by the CLI.
# A ``global`` statement has no effect at module level, so the name is simply
# assigned here.
_output_format = None


def save_figure(basename):
    """
    Save the current matplotlib figure to ``basename`` with the module-level
    ``_output_format`` appended as the file extension, then clear the figure.
    """
    destination = ".".join((basename, _output_format))
    plt.savefig(destination)
    plt.clf()


def make_errors(v, p):
    """
    Return a copy of the genotype vector ``v`` in which each entry has been
    independently selected for an error with probability ``p``.

    A selected entry is redrawn from the stationary distribution of the site:
    with allele frequency f (the mean of ``v``), a 1 is written with
    probability f and a 0 with probability 1 - f. An 'error' can therefore
    leave the value unchanged. The input array is never modified.
    """
    noisy = np.copy(v)
    if not p > 0:
        return noisy

    num_samples = v.shape[0]
    allele_frequency = np.sum(v) / num_samples
    # Pick the entries that are hit by an error, each with probability p.
    hit = np.flatnonzero(np.random.random(num_samples) < p)
    # Redraw the selected entries from the stationary distribution.
    redrawn = np.random.random(hit.shape[0]) < allele_frequency
    noisy[hit] = redrawn.astype(int)
    return noisy


def make_errors_genotype_model(g, error_probs):
    """
    Resample the genotypes of a single variant using an empirically estimated
    genotype error model.

    Consecutive pairs of alleles in ``g`` are treated as diploid individuals.
    ``error_probs`` is indexed by the column names ``p00`` .. ``p22``; the
    first row of each column triple supplies the probabilities of observing
    genotype 0, 1 or 2 given the true genotype. Individuals are processed
    grouped by true genotype (0|0, then 1|0, then 0|1, then 1|1). When a
    homozygote is resampled as a heterozygote its phase is chosen at random.
    Returns a new flat array; ``g`` itself is not modified.
    """
    alleles = np.copy(g)

    # Pair up consecutive alleles so each tuple is one diploid individual.
    diploids = [
        (alleles[k], alleles[k + 1]) for k in range(0, alleles.shape[0], 2)
    ]

    # Bucket the individuals by their true genotype before resampling anything.
    hom_ref, het_10, het_01, hom_alt = [], [], [], []
    for k, pair in enumerate(diploids):
        if pair == (0, 0):
            hom_ref.append(k)
        elif pair == (1, 0):
            het_10.append(k)
        elif pair == (0, 1):
            het_01.append(k)
        elif pair == (1, 1):
            hom_alt.append(k)

    def draw(columns, outcomes):
        # Pick one of the three outcomes using the first row of the columns.
        probabilities = error_probs[columns].values[0]
        return outcomes[np.random.choice(3, p=probabilities)]

    def with_random_phase(outcome):
        # A heterozygote drawn for a true homozygote has no known phase.
        if outcome == (1, 0):
            return [(0, 1), (1, 0)][np.random.choice(2)]
        return outcome

    outcomes_10 = [(0, 0), (1, 0), (1, 1)]
    outcomes_01 = [(0, 0), (0, 1), (1, 1)]

    for k in hom_ref:
        diploids[k] = with_random_phase(draw(["p00", "p01", "p02"], outcomes_10))
    for k in het_10:
        diploids[k] = draw(["p10", "p11", "p12"], outcomes_10)
    for k in het_01:
        diploids[k] = draw(["p10", "p11", "p12"], outcomes_01)
    for k in hom_alt:
        diploids[k] = with_random_phase(draw(["p20", "p21", "p22"], outcomes_10))

    # Flatten the list of pairs back into a single allele vector.
    return np.array(sum(diploids, ()))


def generate_samples(ts, error_param=0):
    """
    Build a finalised tsinfer.SampleData instance from the simulated tree
    sequence ``ts``, optionally adding genotyping error.

    ``error_param`` is interpreted in one of two ways:

    - If it can be converted to a float, it is a per-sample error probability
      and errors are added with make_errors() (0 means no error).
    - Otherwise it is treated as the path of a CSV file holding an empirical
      error matrix with a ``freq`` column. For each variant the row whose
      ``freq`` is closest to the variant's allele frequency is passed to
      make_errors_genotype_model().

    NOTE(review): an earlier version of this docstring said that variants
    which become fixed are rejected; no such filtering happens in this
    function.
    """
    assert ts.num_sites != 0
    sd = tsinfer.SampleData(sequence_length=ts.sequence_length)

    # Only the conversion itself decides which error model is used. Keeping
    # the try body this small stops a ValueError raised while adding sites
    # from being mistaken for "error_param is a file name".
    try:
        error_rate = float(error_param)
    except ValueError:
        error_rate = None

    if error_rate is not None:
        for v in ts.variants():
            g = v.genotypes if error_rate == 0 else make_errors(v.genotypes, error_rate)
            sd.add_site(position=v.site.position, alleles=v.alleles, genotypes=g)
    else:
        # error_param is not a number => it is an error matrix file
        error_matrix = pd.read_csv(error_param)
        for v in ts.variants():
            # First record the allele frequency
            m = v.genotypes.shape[0]
            frequency = np.sum(v.genotypes) / m
            # Find closest row in error matrix file
            closest_row = (error_matrix["freq"] - frequency).abs().argsort()[:1]
            closest_freq = error_matrix.iloc[closest_row]
            g = make_errors_genotype_model(v.genotypes, closest_freq)
            sd.add_site(position=v.site.position, alleles=v.alleles, genotypes=g)
    sd.finalise()
    return sd


def run_infer(
    ts, engine=tsinfer.C_ENGINE, path_compression=True, exact_ancestors=False
):
    """
    Run the full inference pipeline on the specified tree sequence and return
    the inferred tree sequence.

    Sample data is taken directly from ``ts``. When ``exact_ancestors`` is
    true the ancestors are built from the simulation itself with
    tsinfer.build_simulated_ancestors(); otherwise they are estimated with
    tsinfer.generate_ancestors(). ``engine`` and ``path_compression`` are
    forwarded to both matching steps.
    """
    samples = tsinfer.SampleData.from_tree_sequence(ts)

    if not exact_ancestors:
        ancestors = tsinfer.generate_ancestors(samples, engine=engine)
    else:
        ancestors = tsinfer.AncestorData(
            samples.sites_position, samples.sequence_length
        )
        tsinfer.build_simulated_ancestors(samples, ancestors, ts)
        ancestors.finalise()

    matching_options = {"path_compression": path_compression, "engine": engine}
    ancestors_ts = tsinfer.match_ancestors(samples, ancestors, **matching_options)
    return tsinfer.match_samples(samples, ancestors_ts, **matching_options)


def edges_performance_worker(args):
    """
    Run one simulation and infer it twice, once with estimated ancestors and
    once with exact ancestors, returning a dict of summary statistics.

    ``args`` is a ``(simulation_args, tree_metrics, engine)`` tuple. The
    returned dict holds timings, site/tree/edge counts and children-per-edgeset
    statistics, followed by the simulation arguments themselves. When
    ``tree_metrics`` is true the values derived from tsinfer.compare() are
    added as well. An empty dict is returned (with a warning) if no sites
    remain after stripping singletons.
    """
    simulation_args, tree_metrics, engine = args

    def children_per_edgeset(tree_sequence):
        # Number of children attached to every edgeset of the tree sequence.
        return np.array([len(es.children) for es in tree_sequence.edgesets()])

    def kc_summary(source_ts, inferred_ts):
        # Span-weighted distance, span-weighted share of zero-distance trees
        # and plain mean of the distances reported by tsinfer.compare().
        breakpoints, kc_distance = tsinfer.compare(source_ts, inferred_ts)
        span = breakpoints[1:] - breakpoints[:-1]
        span /= breakpoints[-1]
        return (
            np.sum(kc_distance * span),
            np.sum((kc_distance == 0) * span),
            np.mean(kc_distance),
        )

    start = time.perf_counter()
    smc_ts = msprime.simulate(**simulation_args)
    sim_time = time.perf_counter() - start

    if tsinfer.strip_singletons(smc_ts).num_sites == 0:
        warnings.warn("Dropping simulation with no variants")
        return {}

    start = time.perf_counter()
    estimated_ts = run_infer(smc_ts, exact_ancestors=False, engine=engine)
    estimated_time = time.perf_counter() - start
    estimated_children = children_per_edgeset(estimated_ts)

    start = time.perf_counter()
    exact_ts = run_infer(smc_ts, exact_ancestors=True, engine=engine)
    exact_time = time.perf_counter() - start
    exact_children = children_per_edgeset(exact_ts)

    results = {
        "sim_time": sim_time,
        "estimated_anc_time": estimated_time,
        "exact_anc_time": exact_time,
        "num_sites": smc_ts.num_sites,
        "source_num_trees": smc_ts.num_trees,
        "estimated_anc_trees": estimated_ts.num_trees,
        "exact_anc_trees": exact_ts.num_trees,
        "source_edges": smc_ts.num_edges,
        "estimated_anc_edges": estimated_ts.num_edges,
        "exact_anc_edges": exact_ts.num_edges,
        "estimated_anc_max_children": np.max(estimated_children),
        "estimated_anc_mean_children": np.mean(estimated_children),
        "exact_anc_max_children": np.max(exact_children),
        "exact_anc_mean_children": np.mean(exact_children),
    }
    results.update(simulation_args)

    if not tree_metrics:
        return results

    start = time.perf_counter()
    exact_weighted, exact_perfect, exact_mean = kc_summary(smc_ts, exact_ts)
    est_weighted, est_perfect, est_mean = kc_summary(smc_ts, estimated_ts)
    tree_metrics_time = time.perf_counter() - start
    results.update(
        {
            "tree_metrics_time": tree_metrics_time,
            "exact_anc_kc_distance_weighted": exact_weighted,
            "exact_anc_perfect_trees": exact_perfect,
            "exact_anc_kc_mean": exact_mean,
            "estimated_anc_kc_distance_weighted": est_weighted,
            "estimated_anc_perfect_trees": est_perfect,
            "estimated_anc_kc_mean": est_mean,
        }
    )
    return results


def run_edges_performance(args):
    """
    Simulate replicates at ten evenly spaced sequence lengths up to
    ``args.length`` (in megabases), infer each with estimated and exact
    ancestors in a process pool, and plot the per-length means against the
    number of sites.

    Figures written (via save_figure) to ``args.destination_dir``: the ratio
    of inferred to source edges and the number of children per edgeset. When
    ``args.compute_tree_metrics`` is set, the weighted KC distance, mean KC
    distance and perfect-tree plots are written too. A KeyboardInterrupt stops
    the pool early and whatever results were collected are still plotted.
    """
    num_lengths = 10
    MB = 10**6

    work = []
    rng = random.Random()
    rng.seed(args.random_seed)
    for L in np.linspace(0, args.length, num_lengths + 1)[1:]:
        for _ in range(args.num_replicates):
            sim_args = {
                "sample_size": args.sample_size,
                "length": L * MB,
                "recombination_rate": args.recombination_rate,
                "mutation_rate": args.mutation_rate,
                "Ne": args.Ne,
                "model": "smc_prime",
                "random_seed": rng.randint(1, 2**30),
            }
            work.append((sim_args, args.compute_tree_metrics, args.engine))

    # Shuffle with the seeded generator (not the global ``random`` module) so
    # the processing order is reproducible for a given random seed.
    rng.shuffle(work)
    progress = tqdm.tqdm(total=len(work), disable=not args.progress)
    results = []
    try:
        with concurrent.futures.ProcessPoolExecutor(args.num_processes) as executor:
            for result in executor.map(edges_performance_worker, work):
                results.append(result)
                progress.update()

    except KeyboardInterrupt:
        # Deliberate best effort: plot whatever finished before the interrupt.
        pass
    progress.close()

    df = pd.DataFrame(results)
    df.length /= MB
    dfg = df.groupby(df.length).mean(numeric_only=True)
    print(dfg)

    name_format = os.path.join(
        args.destination_dir,
        "ancestors_n={}_L={}_mu={}_rho={}_{{}}".format(
            args.sample_size, args.length, args.mutation_rate, args.recombination_rate
        ),
    )
    # Every figure produced here shares the same title.
    title = "n = {}, mut_rate={}, rec_rate={}, reps={}".format(
        args.sample_size,
        args.mutation_rate,
        args.recombination_rate,
        args.num_replicates,
    )

    plt.plot(
        dfg.num_sites,
        dfg.estimated_anc_edges / dfg.source_edges,
        label="estimated ancestors",
    )
    plt.plot(
        dfg.num_sites, dfg.exact_anc_edges / dfg.source_edges, label="exact ancestors"
    )
    plt.title(title)
    plt.ylabel("inferred # edges / source # edges")
    plt.xlabel("Num sites")
    plt.legend()
    save_figure(name_format.format("edges"))

    plt.plot(
        dfg.num_sites,
        dfg.estimated_anc_mean_children,
        label="estimated ancestors mean",
        color="blue",
    )
    plt.plot(
        dfg.num_sites,
        dfg.estimated_anc_max_children,
        label="estimated ancestors max",
        color="blue",
        linestyle=":",
    )
    plt.plot(
        dfg.num_sites,
        dfg.exact_anc_mean_children,
        label="exact ancestors mean",
        color="red",
    )
    plt.plot(
        dfg.num_sites,
        dfg.exact_anc_max_children,
        label="exact ancestors max",
        color="red",
        linestyle=":",
    )
    plt.title(title)
    plt.ylabel("num_children")
    plt.xlabel("Num sites")
    plt.legend()
    save_figure(name_format.format("num_children"))
    plt.clf()

    if args.compute_tree_metrics:
        plt.plot(
            dfg.num_sites,
            dfg.estimated_anc_kc_distance_weighted,
            label="estimated ancestors",
        )
        plt.plot(
            dfg.num_sites, dfg.exact_anc_kc_distance_weighted, label="exact ancestors"
        )
        plt.title(title)
        plt.ylabel("Distance weighted KC metric")
        plt.xlabel("Num sites")
        plt.legend()
        save_figure(name_format.format("kc_distance_weighted"))
        plt.clf()

        plt.plot(dfg.num_sites, dfg.estimated_anc_kc_mean, label="estimated ancestors")
        plt.plot(dfg.num_sites, dfg.exact_anc_kc_mean, label="exact ancestors")
        plt.title(title)
        plt.ylabel("Mean KC metric")
        plt.xlabel("Num sites")
        plt.legend()
        save_figure(name_format.format("kc_mean"))
        plt.clf()

        plt.plot(
            dfg.num_sites, dfg.estimated_anc_perfect_trees, label="estimated ancestors"
        )
        plt.plot(dfg.num_sites, dfg.exact_anc_perfect_trees, label="exact ancestors")
        plt.title(title)
        # The plotted value is the span-weighted share of trees whose KC
        # distance is zero, not a KC distance.
        plt.ylabel("Span-weighted fraction of perfect trees")
        plt.xlabel("Num sites")
        plt.legend()
        save_figure(name_format.format("perfect_trees"))
        plt.clf()


def unrank(samples, n):
    """
    Map a set of sample indexes, drawn from ``n`` possible samples, to an
    integer: sample ``s`` contributes ``2**s``, so the result is the value of
    the length-``n`` bitstring with a 1 at every listed sample. Repeated
    indexes count once.
    """
    flags = np.zeros(n, dtype=int)
    for sample in samples:
        flags[sample] = 1
    weights = np.power(2, np.arange(n, dtype=int))
    return (weights * flags).sum()


def edge_plot(ts, filename):
    """
    Draw one horizontal line per child of every non-unary node in every tree
    of ``ts``, spanning the tree's interval at the height of the child's node
    id, and save the figure under ``filename``. A line's colour is picked from
    a "husl" palette using unrank() of the samples below the child.
    """
    num_samples = ts.num_samples
    palette = sns.color_palette("husl", 2**num_samples - 1)
    segments = []
    segment_colours = []
    for tree in ts.trees():
        left, right = tree.interval
        for parent in tree.nodes():
            offspring = tree.children(parent)
            # Unary nodes are skipped: they have the same samples under them
            # as their next non-unary descendant.
            if len(offspring) <= 1:
                continue
            for child in offspring:
                segments.append([(left, child), (right, child)])
                colour_index = unrank(tree.samples(child), num_samples)
                segment_colours.append(palette[colour_index])

    collection = mc.LineCollection(segments, linewidths=2, colors=segment_colours)
    _, axes = plt.subplots()
    axes.add_collection(collection)
    axes.autoscale()
    save_figure(filename)


def run_hotspot_analysis(args):
    """
    Simulate a tree sequence under a recombination map containing
    ``args.num_hotspots`` evenly spaced hotspots, infer it, and plot the
    histogram of inferred versus source breakpoints (once as a density and
    once as raw counts), followed by edge plots of both tree sequences.
    """
    MB = 10**6
    seq_length = args.length * MB

    rng = random.Random()
    if args.random_seed is not None:
        rng.seed(args.random_seed)

    # Each interior grid point starts a hotspot whose width is a fraction
    # (args.hotspot_width) of the total sequence length.
    grid = np.linspace(0, seq_length, args.num_hotspots + 2)
    hotspot_ends = grid[1:-1] + seq_length * args.hotspot_width
    map_positions = np.hstack([grid, hotspot_ends])
    map_positions.sort()
    map_rates = np.zeros_like(map_positions)
    map_rates[:-1] = args.recombination_rate
    # Set the odd elements of the array to be hotspots.
    map_rates[1::2] *= args.hotspot_intensity
    recomb_map = msprime.RecombinationMap(list(map_positions), list(map_rates))

    ts = msprime.simulate(
        **{
            "sample_size": args.sample_size,
            "recombination_map": recomb_map,
            "mutation_rate": args.mutation_rate,
            "Ne": args.Ne,
            "random_seed": rng.randint(1, 2**30),
        }
    )
    print("simulated ", ts.num_trees, "trees and", ts.num_sites, "sites")

    inferred_ts = run_infer(ts)

    num_bins = 100
    name_format = os.path.join(
        args.destination_dir,
        "hotspots_n={}_L={}_mu={}_rho={}_N={}_I={}_W={}_{{}}".format(
            args.sample_size,
            args.length,
            args.mutation_rate,
            args.recombination_rate,
            args.num_hotspots,
            args.hotspot_intensity,
            args.hotspot_width,
        ),
    )
    inferred_breakpoints = np.array(list(inferred_ts.breakpoints()))
    source_breakpoints = np.array(list(ts.breakpoints()))

    for density in [True, False]:
        # Mark the hotspot boundaries with dotted vertical lines.
        for boundary in map_positions[1:-1]:
            plt.axvline(x=boundary, color="k", ls=":")
        counts, edges = np.histogram(inferred_breakpoints, num_bins, density=density)
        plt.plot(edges[:-1], counts, label="inferred")
        counts, edges = np.histogram(source_breakpoints, num_bins, density=density)
        plt.plot(edges[:-1], counts, label="source")
        plt.ylabel("Number of breakpoints")
        plt.legend()
        save_figure(name_format.format(f"breakpoints_density={density}"))
        plt.clf()

    print("Generating edge plots")
    # TODO add option for colour mapping.
    edge_plot(ts, name_format.format("source_edges"))
    edge_plot(inferred_ts, name_format.format("dest_edges"))


def _ancestor_focal_distances(ancestor_data, sequence_length):
    """
    For every ancestor return the distance between its first and last focal
    site as a fraction of ``sequence_length`` (0 when it has no focal sites).
    """
    focal_sites = ancestor_data.ancestors_focal_sites[:]
    # Site positions rescaled to fractions of the sequence, with 1 appended.
    pos = np.hstack([ancestor_data.sites_position[:] / sequence_length] + [1])
    distances = np.zeros(ancestor_data.num_ancestors)
    for j in range(ancestor_data.num_ancestors):
        focal = focal_sites[j]
        if len(focal) > 0:
            distances[j] = pos[focal[-1]] - pos[focal[0]]
    return distances


def ancestor_properties_worker(args):
    """
    Run one simulation and summarise the ancestors tsinfer estimates from it.

    ``args`` is a ``(simulation_args, compute_exact)`` tuple. The returned
    dict holds the site and tree counts, the number of estimated ancestors,
    their mean length and their mean focal-site distance (both as fractions
    of the sequence length). When ``compute_exact`` is true the same three
    statistics are added for the exact (simulated) ancestors. The simulation
    arguments are merged into the dict last.
    """
    simulation_args, compute_exact = args
    ts = msprime.simulate(**simulation_args)

    sample_data = tsinfer.SampleData.from_tree_sequence(ts)
    estimated_anc = tsinfer.generate_ancestors(sample_data)
    # Show lengths as a fraction of the total.
    estimated_anc_length = estimated_anc.ancestors_length / ts.sequence_length
    estimated_anc_focal_distance = _ancestor_focal_distances(
        estimated_anc, ts.sequence_length
    )

    results = {
        "num_sites": ts.num_sites,
        "num_trees": ts.num_trees,
        "estimated_anc_num": estimated_anc.num_ancestors,
        "estimated_anc_mean_len": np.mean(estimated_anc_length),
        "estimated_anc_mean_focal_distance": np.mean(estimated_anc_focal_distance),
    }

    if compute_exact:
        exact_anc = tsinfer.AncestorData(
            sample_data.sites_position, sample_data.sequence_length
        )
        tsinfer.build_simulated_ancestors(sample_data, exact_anc, ts)
        exact_anc.finalise()
        # Show lengths as a fraction of the total, computed the same way as
        # for the estimated ancestors so the two means are comparable.
        exact_anc_length = exact_anc.ancestors_length / ts.sequence_length
        exact_anc_focal_distance = _ancestor_focal_distances(
            exact_anc, ts.sequence_length
        )
        results.update(
            {
                "exact_anc_num": exact_anc.num_ancestors,
                "exact_anc_mean_len": np.mean(exact_anc_length),
                "exact_anc_mean_focal_distance": np.mean(exact_anc_focal_distance),
            }
        )

    results.update(simulation_args)
    return results


def run_ancestor_properties(args):
    """
    Simulate replicates at ten evenly spaced sequence lengths up to
    ``args.length`` (in megabases), summarise the ancestors of each in a
    process pool, and plot the per-length means against the number of sites.

    Three figures are written (via save_figure) to ``args.destination_dir``:
    number of ancestors, mean ancestor length and mean focal-site distance.
    Exact-ancestor curves are included unless ``args.skip_exact`` is set. A
    KeyboardInterrupt stops the pool early and whatever results were collected
    are still plotted.
    """
    num_lengths = 10
    MB = 10**6

    work = []
    rng = random.Random()
    if args.random_seed is not None:
        rng.seed(args.random_seed)
    for L in np.linspace(0, args.length, num_lengths + 1)[1:]:
        for _ in range(args.num_replicates):
            sim_args = {
                "sample_size": args.sample_size,
                "length": L * MB,
                "recombination_rate": args.recombination_rate,
                "mutation_rate": args.mutation_rate,
                "Ne": args.Ne,
                "model": "smc_prime",
                "random_seed": rng.randint(1, 2**30),
            }
            work.append((sim_args, not args.skip_exact))

    # Shuffle with the seeded generator (not the global ``random`` module) so
    # the processing order is reproducible for a given random seed.
    rng.shuffle(work)
    progress = tqdm.tqdm(total=len(work), disable=not args.progress)
    results = []
    try:
        with concurrent.futures.ProcessPoolExecutor(args.num_processes) as executor:
            for result in executor.map(ancestor_properties_worker, work):
                results.append(result)
                progress.update()

    except KeyboardInterrupt:
        # Deliberate best effort: plot whatever finished before the interrupt.
        pass
    progress.close()

    df = pd.DataFrame(results)
    dfg = df.groupby(df.length).mean(numeric_only=True)
    print(dfg)

    name_format = os.path.join(
        args.destination_dir,
        "anc-prop_n={}_L={}_mu={}_rho={}_{{}}".format(
            args.sample_size, args.length, args.mutation_rate, args.recombination_rate
        ),
    )
    # Every figure produced here shares the same title.
    title = "n = {}, mut_rate={}, rec_rate={}, reps={}".format(
        args.sample_size,
        args.mutation_rate,
        args.recombination_rate,
        args.num_replicates,
    )

    plt.plot(dfg.num_sites, dfg.estimated_anc_num, label="estimated ancestors")
    if not args.skip_exact:
        plt.plot(dfg.num_sites, dfg.exact_anc_num, label="exact ancestors")
    plt.title(title)
    plt.xlabel("Num sites")
    plt.legend()
    save_figure(name_format.format("num"))
    plt.clf()

    plt.plot(dfg.num_sites, dfg.estimated_anc_mean_len, label="estimated ancestors")
    if not args.skip_exact:
        plt.plot(dfg.num_sites, dfg.exact_anc_mean_len, label="exact ancestors")
    plt.title(title)
    plt.xlabel("Num sites")
    plt.legend()
    save_figure(name_format.format("mean_len"))
    plt.clf()

    plt.plot(
        dfg.num_sites,
        dfg.estimated_anc_mean_focal_distance,
        label="estimated ancestors",
    )
    if not args.skip_exact:
        plt.plot(
            dfg.num_sites, dfg.exact_anc_mean_focal_distance, label="exact ancestors"
        )
    plt.title(title)
    plt.xlabel("Num sites")
    plt.legend()
    save_figure(name_format.format("mean_focal_distance"))
    plt.clf()


def imputation_accuracy_worker(args):
    """
    Simulate a tree sequence, hide a random subset of its genotypes, infer a
    tree sequence from the masked data and measure how many of the hidden
    genotypes the inferred tree sequence reproduces.

    ``args`` is a ``(simulation_args, missing_proportion)`` tuple. Each
    genotype is hidden independently with probability ``missing_proportion``;
    numpy's global generator is seeded from the simulation's ``random_seed``.
    Returns a dict with the tree, site and sample counts, the missing
    proportion and the accuracy (1 when nothing was hidden).
    """
    simulation_args, missing_proportion = args
    ts = msprime.simulate(**simulation_args)
    np.random.seed(simulation_args["random_seed"])
    genotypes = ts.genotype_matrix()
    hidden = np.random.rand(ts.num_sites, ts.num_samples) < missing_proportion
    genotypes[hidden] = tskit.MISSING_DATA
    with tsinfer.SampleData(ts.sequence_length) as sample_data:
        for variant in ts.variants():
            sample_data.add_site(
                variant.site.position,
                alleles=variant.alleles,
                genotypes=genotypes[variant.site.id],
            )

    ts_inferred = tsinfer.infer(sample_data)
    assert ts_inferred.num_sites == ts.num_sites

    num_hidden = np.sum(hidden)
    num_correct = 0
    for truth, guess in zip(ts.variants(), ts_inferred.variants()):
        hidden_here = hidden[truth.site.id]
        # Compare allele strings rather than allele indexes.
        true_alleles = np.array(truth.alleles)[truth.genotypes]
        guessed_alleles = np.array(guess.alleles)[guess.genotypes]
        num_correct += np.sum(
            true_alleles[hidden_here] == guessed_alleles[hidden_here]
        )
    accuracy = num_correct / num_hidden if num_hidden > 0 else 1

    return {
        "num_trees": ts.num_trees,
        "num_sites": ts.num_sites,
        "num_samples": ts.num_samples,
        "missing_proportion": missing_proportion,
        "accuracy": accuracy,
    }


def run_imputation_accuracy(args):
    """
    Measure imputation accuracy for ten missing-data proportions between 0.01
    and 0.1, running ``args.num_replicates`` simulations per proportion in a
    process pool, then print the per-proportion means and save a line plot of
    accuracy against the proportion of missing genotypes.
    """
    MB = 10**6

    rng = random.Random()
    if args.random_seed is not None:
        rng.seed(args.random_seed)

    work = []
    for proportion in np.linspace(0.01, 0.1, 10):
        for _ in range(args.num_replicates):
            replicate_args = dict(
                sample_size=args.sample_size,
                length=args.length * MB,
                recombination_rate=args.recombination_rate,
                mutation_rate=args.mutation_rate,
                Ne=args.Ne,
                random_seed=rng.randint(1, 2**30),
            )
            work.append((replicate_args, proportion))

    rng.shuffle(work)
    progress = tqdm.tqdm(total=len(work), disable=not args.progress)
    results = []
    try:
        with concurrent.futures.ProcessPoolExecutor(args.num_processes) as pool:
            for outcome in pool.map(imputation_accuracy_worker, work):
                results.append(outcome)
                progress.update()
    except KeyboardInterrupt:
        # Keep whatever finished before the interrupt.
        pass
    progress.close()

    df = pd.DataFrame(results)
    dfg = df.groupby(df.missing_proportion).mean()
    print(dfg)

    file_stem = "imputation-accuracy_n={}_L={}_mu={}_rho={}_{{}}".format(
        args.sample_size, args.length, args.mutation_rate, args.recombination_rate
    )
    name_format = os.path.join(args.destination_dir, file_stem)
    title = "n = {}, mut_rate={}, rec_rate={}, reps={}".format(
        args.sample_size,
        args.mutation_rate,
        args.recombination_rate,
        args.num_replicates,
    )

    sns.lineplot(x="missing_proportion", y="accuracy", data=df)
    plt.title(title)
    plt.ylabel("Fraction of missing genotypes imputed correctly")
    plt.xlabel("Fraction of genotypes missing")
    save_figure(name_format.format("num"))


def running_mean(x, N):
    """
    Return the means of all length-``N`` sliding windows over ``x``, computed
    from differences of the cumulative sum.
    """
    totals = np.cumsum(np.insert(x, 0, 0))
    window_sums = totals[N:] - totals[:-N]
    return window_sums / float(N)


def running_median(x, N):
    """
    Return the medians of all length-``N`` sliding windows over the array
    ``x``, ignoring entries that are not strictly positive.
    """
    # Row k of window_index lists the positions k .. k + N - 1.
    window_starts = np.arange(len(x) - N + 1)
    window_index = window_starts[:, None] + np.arange(N)
    return np.array(
        [np.median(window[window > 0]) for window in x[window_index]]
    )


class MidpointNormalize(mp.colors.Normalize):
    """
    Normalisation that maps ``vmin`` to 0, ``midpoint`` to 0.5 and ``vmax`` to
    1, interpolating linearly between those three anchors.
    """

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        super().__init__(vmin, vmax, clip)

    def __call__(self, value, clip=None):
        # Masked values and other edge cases are deliberately ignored here;
        # the ``clip`` argument is accepted but not used.
        anchors = [self.vmin, self.midpoint, self.vmax]
        targets = [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, anchors, targets))


class NormalizeBandWidths(mp.colors.Normalize):
    """
    Normalise a range into 0..1 where ranges of integers are banded into a
    single colour. The init parameter ``band_widths`` needs to be a numpy
    vector whose length is the maximum integer encountered.
    """

    def __init__(self, vmin=None, vmax=None, band_widths=None, clip=False):
        # Cumulative share of the total width reached at the end of each band.
        total_width = np.sum(band_widths)
        self.bands = np.cumsum(band_widths) / total_width
        self.x = np.arange(len(self.bands))
        super().__init__(vmin, vmax, clip)

    def __call__(self, value, clip=None):
        # Interpolate between band indexes; ``clip`` is accepted but not used.
        interpolated = np.interp(value, self.x, self.bands)
        return np.ma.masked_array(interpolated)


def sim_true_and_inferred_ancestors(args):
    """
    Simulate a tree sequence from the parameters in ``args`` and return a
    ``(sample_data, true_ancestors, inferred_ancestors)`` tuple.

    The sample data is produced by generate_samples() with ``args.error``.
    Inferred ancestors come from tsinfer.generate_ancestors() and the true
    ancestors are built from the simulated tree sequence.
    """
    MB = 10**6
    rng = random.Random(args.random_seed)
    np.random.seed(args.random_seed)
    ts = msprime.simulate(
        sample_size=args.sample_size,
        length=args.length * MB,
        recombination_rate=args.recombination_rate,
        mutation_rate=args.mutation_rate,
        Ne=args.Ne,
        model="smc_prime",
        random_seed=rng.randint(1, 2**30),
    )

    sample_data = generate_samples(ts, args.error)

    inferred_anc = tsinfer.generate_ancestors(sample_data, engine=args.engine)

    true_anc = tsinfer.AncestorData(
        sample_data.sites_position, sample_data.sequence_length
    )
    tsinfer.build_simulated_ancestors(sample_data, true_anc, ts)
    true_anc.finalise()

    return sample_data, true_anc, inferred_anc


def ancestor_data_by_pos(anc1, anc2):
    """
    Return indexes into ancestor data, keyed by focal site position, returning only
    those indexes where positions are the same for both ancestors. This is useful
    e.g. for plotting length v length scatterplots.

    Each value is a length-2 int64 array: the index of the ancestor in ``anc1``
    followed by the index of the ancestor in ``anc2`` that has a focal site at
    that position. If several ancestors in one dataset share a focal position,
    the one with the highest index is used.
    """
    anc_by_focal_pos = []
    for anc in (anc1, anc2):
        # Read the positions once per dataset rather than once per focal site.
        positions = anc.sites_position[:]
        anc_by_focal_pos.append(
            {
                positions[site_index]: i
                for i, sites in enumerate(anc.ancestors_focal_sites[:])
                for site_index in sites
            }
        )
    first, second = anc_by_focal_pos

    # NB with error we may not have exactly the same focal sites in exact & estimated
    shared_positions = first.keys() & second.keys()

    return {
        pos: np.array([first[pos], second[pos]], np.int64)
        for pos in shared_positions
    }


def run_ancestor_comparison(args):
    """
    Simulate data under ``args`` and plot how the lengths of the ancestors estimated
    by tsinfer compare with the lengths of the true ancestors from the simulation.

    The following figures are written with ``save_figure``, using file names built
    from ``args.destination_dir`` and the simulation parameters:

    - ``length-dist``: histogram of true and estimated ancestor lengths.
    - ``doubleton-length-dist``: histogram of estimated lengths for ancestors whose
      focal sites have sample frequency 2.
    - ``length-scatter_frequency`` / ``length-scatter_true_time``: true v estimated
      length, one point per focal position shared by both sets of ancestors.
    - ``time_true_ancestors`` / ``time_estimated``: per-site ancestor lengths ordered
      by time, overlaid with (running) mean and median lines.

    If ``args.store_data`` is set, the lengths are also dumped to a JSON file.
    ``args.running_average_span`` must be odd (checked with an assertion).
    """
    sample_data, exact_anc, estimated_anc = sim_true_and_inferred_ancestors(args)
    # Convert lengths to kb.
    estimated_anc_length = estimated_anc.ancestors_length / 1000
    exact_anc_length = exact_anc.ancestors_length / 1000
    max_length = sample_data.sequence_length / 1000
    # The error parameter is either a number or a path to an error matrix; in the
    # latter case make it safe to embed in a file name.
    try:
        err = float(args.error)
    except ValueError:
        err = args.error.replace("/", "_")
        if err.endswith(".csv"):
            err = err[: -len(".csv")]
    name_format = os.path.join(
        args.destination_dir,
        "anc-qual_n={}_Ne={}_L={}_mu={}_rho={}_err={}_{{}}".format(
            args.sample_size,
            args.Ne,
            args.length,
            args.mutation_rate,
            args.recombination_rate,
            err,
        ),
    )
    if args.store_data:
        # TODO Are we using this option for anything?
        filename = name_format.format("length.json")
        # Don't store the longest (root) ancestor
        data = {
            "exact_ancestors": exact_anc_length[1:].tolist(),
            "estimated_ancestors": estimated_anc_length[1:].tolist(),
        }
        with open(filename, "w") as f:
            json.dump(data, f)

    plt.hist(
        [exact_anc_length[1:], estimated_anc_length[1:]], label=["Exact", "Estimated"]
    )
    # Lengths are the histogrammed quantity, so they run along the x axis.
    plt.xlabel("Length (kb)")
    plt.legend()
    # save_figure() clears the current figure after saving it.
    save_figure(name_format.format("length-dist"))

    # NB ancestors_time is not exactly the same as frequency, because frequency
    # categories that are not represented in the data will be missed out. If we want a
    # true frequency, we therefore need to get it directly from the samples
    pos_to_ancestor = {}
    # Sample frequency of the focal site(s) of each estimated ancestor; 0 = not set.
    focal_freq = np.zeros(estimated_anc.num_ancestors, np.int64)
    ancestor_site_position = estimated_anc.sites_position
    for a, focal_sites in enumerate(estimated_anc.ancestors_focal_sites[:]):
        for focal_site in focal_sites:
            pos_to_ancestor[ancestor_site_position[focal_site]] = a
    for var in sample_data.variants(ancestor_site_position):
        pos = var.site.position
        freq = np.sum(var.genotypes)
        anc_index = pos_to_ancestor[pos]
        if focal_freq[anc_index]:
            # check all focal sites in an ancestor have the same freq
            assert freq == focal_freq[anc_index]
        focal_freq[anc_index] = freq

    print("mean estimated ancestor length", np.mean(estimated_anc_length))
    # Get the number of ancestors that have the maximum length
    max_len = np.max(estimated_anc_length)
    num_max_len = np.sum(estimated_anc_length == max_len)
    print("max_len = ", max_len)
    print(
        "fraction of ancestors with max length = ",
        num_max_len / estimated_anc.num_ancestors,
    )

    plt.hist(estimated_anc_length[focal_freq == 2], bins=50)
    plt.xlabel("doubleton ancestor length")
    save_figure(name_format.format("doubleton-length-dist"))

    anc_indexes = ancestor_data_by_pos(exact_anc, estimated_anc)
    # convert to a 2d numpy array for convenience: column 0 indexes the exact
    # ancestors, column 1 the estimated ones
    exact_v_estimated_indexes = np.array(list(anc_indexes.values()))

    for colorscale in ("Frequency", "True_time"):
        fig = plt.figure(figsize=(10, 10), dpi=100)
        if args.length_scale == "log":
            plt.yscale("log")
            plt.xscale("log")
        if colorscale != "Frequency":
            cs = exact_anc.ancestors_time[:][exact_v_estimated_indexes[:, 0]]
        else:
            cs = focal_freq[exact_v_estimated_indexes[:, 1]]
        plt.scatter(
            exact_anc_length[exact_v_estimated_indexes[:, 0]],
            estimated_anc_length[exact_v_estimated_indexes[:, 1]],
            c=cs,
            cmap="brg",
            s=2,
            norm=NormalizeBandWidths(band_widths=np.bincount(cs)),
        )
        plt.plot([1, max_length], [1, max_length], "-", color="grey", zorder=-1)
        plt.xlim(1, max_length)
        plt.ylim(1, max_length)
        cbar = plt.colorbar()
        cbar.set_label(colorscale, rotation=270)
        plt.xlabel("True ancestor length per variant (kb)")
        plt.ylabel("Inferred ancestor length per variant (kb)")
        save_figure(name_format.format(f"length-scatter_{colorscale.lower()}"))
        # Explicitly created figures stay alive until closed.
        plt.close(fig)

    # plot exact ancestors ordered by time, and estimated ancestors in frequency bands
    # one point per variable site, so these should be directly comparable
    # the exact ancestors have ancestors_time from 1..n_ancestors, ordered by real time
    # in the simulation, so that each time is unique for a set of site on one ancestor
    for ancestors_are_estimated, anc in enumerate([exact_anc, estimated_anc]):
        # Named anc_time so that the imported ``time`` module is not shadowed.
        anc_time = anc.ancestors_time[:] + (1 if ancestors_are_estimated else 0)
        df = pd.DataFrame(
            {
                "start": anc.ancestors_start[:],
                "end": anc.ancestors_end[:],
                "l": anc.ancestors_length / 1000,
                "time": anc_time,
                "nsites": [len(x) for x in anc.ancestors_focal_sites[:]],
            }
        )

        # One row per focal site: each ancestor is repeated once per focal site.
        df_all = pd.DataFrame(
            {
                "lengths_per_site": np.repeat(df.l.values, df.nsites.values),
                "time": np.repeat(df.time.values, df.nsites.values),
                "const": 1,
            }
        ).sort_values(by=["time"])
        sum_per_timeslice = df_all.groupby("time").sum().const.values
        df_all["x_pos"] = range(df_all.shape[0])
        df_all["mean_x_pos"] = np.repeat(
            df_all.groupby("time").mean().x_pos.values, sum_per_timeslice
        )
        df_all["width"] = np.repeat(sum_per_timeslice, sum_per_timeslice)

        # Only ancestors with at least one focal site contribute to the summaries.
        # (Series.nonzero() no longer exists in pandas >= 1.0, so use a boolean mask.)
        with_focal_sites = df[df["nsites"] != 0].groupby("time", sort=True)
        mean_by_anc_time = with_focal_sites.mean()
        median_by_anc_time = with_focal_sites.median()
        sum_by_anc_time = with_focal_sites.sum()

        # Left edges (plus final right edge) of each time band, in units of sites.
        line_x = np.insert(np.cumsum(sum_by_anc_time["nsites"]).values, 0, 0)

        if ancestors_are_estimated:
            # averaging over times is probably more-or-less OK
            lines_y = [mean_by_anc_time.l, median_by_anc_time.l]
            names = ["Mean", "Median"]
            linestyles = ["-", ":"]
            colours = ["orange", "darkorange"]
        else:
            # times are unique per ancestor, so we don't do well averaging
            # have to use a running mean
            assert args.running_average_span % 2 == 1, "Must have odd number of bins"
            pad_mean = np.pad(
                running_mean(mean_by_anc_time.l.values, args.running_average_span),
                (args.running_average_span - 1) // 2,
                mode="constant",
                constant_values=(np.nan,),
            )
            pad_median = np.pad(
                running_median(median_by_anc_time.l.values, args.running_average_span),
                (args.running_average_span - 1) // 2,
                mode="constant",
                constant_values=(np.nan,),
            )
            lines_y = [pad_mean, pad_median]
            names = [
                f"Running mean over {args.running_average_span} ancestors",
                f"Running median over {args.running_average_span} ancestors",
            ]
            linestyles = ["-", ":"]
            colours = ["limegreen", "forestgreen"]
            # save some stuff for when we plot inferred lines
            exact_mean_line_y = lines_y[0]
            exact_median_line_y = lines_y[1]
            exact_line_x = line_x

        fig = plt.figure(figsize=(10, 10), dpi=100)
        # Spread the points of each time band horizontally across 90% of its width.
        w = df_all.width.values * 9 / 20
        jitter = np.random.uniform(-w, w, len(df_all.mean_x_pos.values))
        x_jittered = df_all.mean_x_pos.values + jitter
        plt.scatter(
            x_jittered,
            df_all.lengths_per_site.values,
            marker=".",
            s=72.0 / fig.dpi,
            alpha=0.75,
            color="black",
        )
        if args.length_scale == "log":
            plt.yscale("log")
        ax = plt.gca()
        ax.set_xlim(xmin=0, xmax=max(line_x))
        if ancestors_are_estimated:
            plt.title("Ancestor lengths as estimated by tsinfer")
            # Overlay the lines computed for the exact ancestors on the first pass.
            ax.step(
                exact_line_x[:-1],
                exact_mean_line_y,
                label="True mean",
                where="post",
                color="limegreen",
            )
            ax.step(
                exact_line_x[:-1],
                exact_median_line_y,
                label="True median",
                where="post",
                color="forestgreen",
                linestyle=":",
            )
            plt.xlabel("Ancestors_freq (youngest to oldest)")
            ax.set_xlim(xmin=0)
            ax.tick_params(axis="x", which="major", length=0)
            ax.set_xticklabels("", minor=True)
            ax.set_xticks(line_x[:-1], minor=True)
            ax.set_xticks(line_x[:-1] + np.diff(line_x) / 2)
            # Only label a handful of the bands, to keep the axis readable.
            ax.set_xticklabels(
                np.where(
                    np.isin(
                        mean_by_anc_time.index,
                        np.array([1, 2, 3, 4, 5, 6, 10, 50, 1000, 5000]),
                    ),
                    mean_by_anc_time.index,
                    "",
                )
            )
        else:
            plt.title("True ancestor lengths, ordered by known simulation time")
            plt.xlabel("Ancestors_time index (youngest to oldest)")

        for y, label, linestyle, colour in zip(lines_y, names, linestyles, colours):
            ax.step(
                line_x[:-1],
                y,
                label=label,
                where="post",
                color=colour,
                linestyle=linestyle,
            )
        plt.ylabel("Length (kb)")
        plt.legend(loc="upper center")
        save_figure(
            name_format.format(
                "time_{}".format(
                    "estimated" if ancestors_are_estimated else "true_ancestors"
                )
            )
        )
        plt.close(fig)


def binomial_confidence(x, n, z=1.96):
    """
    Return the Wilson score interval for ``x`` successes out of ``n`` trials, using
    the normal quantile ``z``. See e.g.
    https://stackoverflow.com/questions/10029588/python-implementation-of-the-wilson-score-interval

    The result is a numpy array whose first element is the lower bound and whose
    second is the upper bound; array inputs for ``x`` and ``n`` are handled
    elementwise, giving one row of lower and one row of upper bounds.
    """  # noqa
    z_squared = z * z
    phat = x / n
    half_width = z * np.sqrt((phat * (1 - phat) + z_squared / (4 * n)) / n)
    shift = z_squared / (2 * n)
    scale = 1 + z_squared / n
    lower = (phat + shift - half_width) / scale
    upper = (phat + shift + half_width) / scale
    return np.array([lower, upper])


def run_ancestor_quality(args):
    """
    Calculate quality measures per focal site, as these are comparable from estimated
    to exact ancestors. This is a bit complicated because we don't always have the same
    inference sites in estimated & exact ancestors, so we need to only check the sites
    that are shared. We also need to limit the bounds over which we calculate quality
    so that we only look at the regions of overlap between true and inferred ancestors

    The function runs a simulation via ``sim_true_and_inferred_ancestors``, compares
    each pair of true/inferred ancestors that share a focal site position, prints
    summary counts, and then either writes the per-site table to a CSV file (if
    ``args.csv_only`` is set) or saves a series of figures with ``save_figure``.

    Besides the simulation parameters, the following attributes of ``args`` are read:
    ``destination_dir``, ``print_bad_ancestors``, ``csv_only``,
    ``running_average_span`` and ``diff_y_lim``.
    """
    sample_data, exact_anc, estim_anc = sim_true_and_inferred_ancestors(args)
    # The error parameter is either a number or a file path; in the latter case make
    # it safe to embed in an output file name.
    try:
        err = float(args.error)
    except ValueError:
        err = args.error.replace("/", "_")
        if err.endswith(".csv"):
            err = err[: -len(".csv")]
    # Template for output file names; the trailing "{}" is filled in per output.
    name_format = os.path.join(
        args.destination_dir,
        "anc-qual_n={}_Ne={}_L={}_mu={}_rho={}_err={}_{{}}".format(
            args.sample_size,
            args.Ne,
            args.length,
            args.mutation_rate,
            args.recombination_rate,
            err,
        ),
    )

    # Maps focal position -> [index into exact_anc, index into estim_anc]
    anc_indices = ancestor_data_by_pos(exact_anc, estim_anc)
    shared_positions = np.array(list(sorted(anc_indices.keys())))
    # append sequence_length to pos so that ancestors_end[:] indices are always valid
    exact_positions = np.append(
        exact_anc.sites_position[:], sample_data.sequence_length
    )
    estim_positions = np.append(
        estim_anc.sites_position[:], sample_data.sequence_length
    )
    # only include sites which are focal in both exact and estim in the genome-wise masks
    exact_sites_mask = np.isin(exact_anc.sites_position[:], shared_positions)
    estim_sites_mask = np.isin(estim_anc.sites_position[:], shared_positions)
    assert np.sum(exact_sites_mask) == np.sum(estim_sites_mask) == len(anc_indices)

    # store the data to plot for each focal_site, keyed by position
    freq = {var.site.position: np.sum(var.genotypes) for var in sample_data.variants()}
    # Sample frequency of every site in the estimated ancestors, in site order
    estim_freq = np.array([freq[p] for p in estim_anc.sites_position], dtype=np.int64)
    olap_n_sites = {}
    olap_n_should_be_1_higher_freq = {}
    olap_n_should_be_0_higher_freq = {}
    olap_n_should_be_1_low_eq_freq = {}
    olap_n_should_be_0_low_eq_freq = {}
    olap_lft = {}
    olap_rgt = {}
    true_len = {}
    est_len = {}
    true_time = {}
    # find the left and right edges of the overlap - iterate by true time in reverse
    for i, focal_pos in enumerate(
        sorted(
            shared_positions,
            key=lambda pos: -exact_anc.ancestors_time[:][anc_indices[pos][0]],
        )
    ):
        exact_index, estim_index = anc_indices[focal_pos]
        # left (start) is biggest of exact and estim
        # (the overlap start is taken from whichever ancestor starts later, and then
        # located in the other ancestor's site array with searchsorted)
        exact_start = exact_positions[exact_anc.ancestors_start[:][exact_index]]
        estim_start = estim_positions[estim_anc.ancestors_start[:][estim_index]]
        if exact_start > estim_start:
            olap_start_exact = exact_anc.ancestors_start[:][exact_index]
            olap_start = exact_positions[olap_start_exact]
            olap_start_estim = np.searchsorted(estim_anc.sites_position[:], olap_start)
        else:
            olap_start_estim = estim_anc.ancestors_start[:][estim_index]
            olap_start = estim_positions[olap_start_estim]
            olap_start_exact = np.searchsorted(exact_anc.sites_position[:], olap_start)

        # right (end) is smallest of exact and estim
        exact_end = exact_positions[exact_anc.ancestors_end[:][exact_index]]
        estim_end = estim_positions[estim_anc.ancestors_end[:][estim_index]]
        if exact_end < estim_end:
            olap_end_exact = exact_anc.ancestors_end[:][exact_index]
            olap_end = exact_positions[olap_end_exact]
            olap_end_estim = np.searchsorted(estim_anc.sites_position[:], olap_end)
        else:
            olap_end_estim = estim_anc.ancestors_end[:][estim_index]
            olap_end = estim_positions[olap_end_estim]
            olap_end_exact = np.searchsorted(exact_anc.sites_position[:], olap_end)

        # Start site indexes of each ancestor; only used by the diagnostic printout
        offset1 = exact_anc.ancestors_start[:][exact_index]
        offset2 = estim_anc.ancestors_start[:][estim_index]

        exact_full_hap = exact_anc.ancestors_full_haplotype[:, exact_index, 0]
        # slice the full haplotype to include only the overlapping region
        exact_olap = exact_full_hap[olap_start_exact:olap_end_exact]
        # make a 1/0 array with only the comparable sites
        exact_comp = exact_olap[exact_sites_mask[olap_start_exact:olap_end_exact]]

        # Same again for the estimated ancestor
        estim_full_hap = estim_anc.ancestors_full_haplotype[:, estim_index, 0]
        estim_olap = estim_full_hap[olap_start_estim:olap_end_estim]
        small_estim_mask = estim_sites_mask[olap_start_estim:olap_end_estim]
        estim_comp = estim_olap[small_estim_mask]

        assert len(exact_comp) == len(estim_comp)

        # save the statistics into variables indexed by position
        # NOTE(review): the bitwise expressions below assume the haplotype arrays
        # hold only 0/1 integer values over the compared sites - confirm.
        bad_sites = exact_comp != estim_comp
        should_be_1 = exact_comp & ~estim_comp
        should_be_0 = estim_comp & ~exact_comp

        assert np.sum(should_be_1 | should_be_0) == np.sum(bad_sites)

        olap_n_sites[focal_pos] = len(exact_comp)
        olap_lft[focal_pos] = olap_start
        olap_rgt[focal_pos] = olap_end
        true_len[focal_pos] = exact_anc.ancestors_length[:][exact_index]
        est_len[focal_pos] = estim_anc.ancestors_length[:][estim_index]
        true_time[focal_pos] = exact_anc.ancestors_time[:][exact_index]
        # Split the mismatches by whether the mismatching site has a higher sample
        # frequency than the focal site, or a lower-or-equal one.
        sites_freq = estim_freq[olap_start_estim:olap_end_estim]
        higher_freq = sites_freq[small_estim_mask] > freq[focal_pos]
        olap_n_should_be_1_higher_freq[focal_pos] = np.sum(should_be_1 & higher_freq)
        olap_n_should_be_0_higher_freq[focal_pos] = np.sum(should_be_0 & higher_freq)
        olap_n_should_be_1_low_eq_freq[focal_pos] = np.sum(should_be_1 & ~higher_freq)
        olap_n_should_be_0_low_eq_freq[focal_pos] = np.sum(should_be_0 & ~higher_freq)
        assert olap_rgt[focal_pos] - olap_lft[focal_pos] <= true_len[focal_pos]
        # The four categories must partition the set of mismatching sites
        assert olap_n_should_be_1_higher_freq[
            focal_pos
        ] + olap_n_should_be_0_higher_freq[focal_pos] + olap_n_should_be_1_low_eq_freq[
            focal_pos
        ] + olap_n_should_be_0_low_eq_freq[
            focal_pos
        ] == np.sum(
            bad_sites
        )
        # Optional diagnostic printout of ancestors with at least one mismatch.
        # args.print_bad_ancestors may be "all" or "inferred" (other truthy values
        # print only the colour-coded inferred haplotype, without a prefix).
        if args.print_bad_ancestors and np.any(bad_sites):
            if i == 0:
                print(
                    "Freq & haplotype of bad ancestors, ordered by true time,"
                    " oldest first (black = focal site, red or magenta = bad site "
                    " with > or < freq than focal)"
                )
            if args.print_bad_ancestors == "all":
                print(
                    "TRUE ANCESTOR for focal site "
                    "#{} (pos {}, time_index = {}/{})".format(
                        i,
                        focal_pos,
                        exact_anc.ancestors_time[:][exact_index],
                        max(exact_anc.ancestors_time[:]),
                    )
                )
                print(
                    "Haplotype (start @idx {}, pos {})".format(
                        offset1, exact_positions[offset1]
                    )
                )
                print(" " * (olap_start_estim - offset2), end="")
                hap = "".join(exact_full_hap.astype(str))
                focal_index = np.argmax(exact_positions[offset1:] == focal_pos)
                print(hap[:focal_index], end="")
                print(
                    colorama.Fore.WHITE
                    + colorama.Back.BLACK
                    + hap[focal_index]
                    + colorama.Style.RESET_ALL,
                    end="",
                )
                print(hap[focal_index + 1 :])
                print(
                    "Match with inferred ancestor starts @idx {}, pos {}".format(
                        olap_start_exact, exact_positions[olap_start_exact]
                    )
                )
                print(" " * (olap_start_estim - offset2), end="")
                print(" " * (olap_start_exact - offset1))
                # Non-comparable sites in the overlap are shown as "*"
                print(
                    "".join(
                        [
                            str(x)
                            for x in np.where(
                                exact_sites_mask[olap_start_exact:olap_end_exact],
                                exact_olap,
                                "*",
                            )
                        ]
                    )
                )
                print(f"INFERRED ANCESTOR for focal site #{i} (pos {focal_pos})")
                print(
                    "Haplotype (start @idx {}, pos {})".format(
                        offset2, estim_positions[offset2]
                    )
                )
                print(" " * (olap_start_exact - offset1), end="")
                hap = "".join(estim_full_hap.astype(str))
                focal_index = np.argmax(estim_positions[offset2:] == focal_pos)
                print(hap[:focal_index], end="")
                print(
                    colorama.Fore.WHITE
                    + colorama.Back.BLACK
                    + hap[focal_index]
                    + colorama.Style.RESET_ALL,
                    end="",
                )
                print(hap[focal_index + 1 :])
                # now indicate in the inferred ancestor which sites are bad, and if it is
                # a case of a 1 being mistakenly reconstructed as a 0, whether this is
                # because this is a less frequent site, or whether it is actually more
                # frequent but we are calling a consensus
                print(
                    "Match with inferred ancestor starts @idx {}, pos {}".format(
                        olap_start_estim, estim_positions[olap_start_estim]
                    )
                )
                print(" " * (olap_start_exact - offset1), end="")
                print(" " * (olap_start_estim - offset2), end="")
            elif args.print_bad_ancestors == "inferred":
                print(f"{int(freq[focal_pos]):<5}", end="")
            # k counts the comparable sites seen so far, i.e. it indexes exact_comp
            k = 0
            mask = estim_sites_mask[olap_start_estim:olap_end_estim]
            for j, (bit, curr_pos) in enumerate(
                zip(estim_olap, estim_positions[olap_start_estim:])
            ):
                if mask[j]:
                    if focal_pos == curr_pos:
                        # the focal site itself: white on black
                        print(
                            colorama.Fore.WHITE
                            + colorama.Back.BLACK
                            + str(bit)
                            + colorama.Style.RESET_ALL,
                            end="",
                        )
                    elif exact_comp[k] == bit:
                        # inferred state agrees with the true ancestor
                        print(str(bit), end="")
                    elif freq[focal_pos] < freq[curr_pos]:
                        # mismatch at a site more frequent than the focal site
                        print(
                            colorama.Back.RED + str(bit) + colorama.Style.RESET_ALL,
                            end="",
                        )
                    elif freq[focal_pos] > freq[curr_pos]:
                        # mismatch at a site less frequent than the focal site
                        print(
                            colorama.Back.MAGENTA + str(bit) + colorama.Style.RESET_ALL,
                            end="",
                        )
                    else:
                        # mismatch at a site with the same frequency as the focal site
                        print(
                            colorama.Back.YELLOW + str(bit) + colorama.Style.RESET_ALL,
                            end="",
                        )
                    k += 1
                else:
                    print("*", end="")
            print(colorama.Style.RESET_ALL)

    # create the data for use, ordered by real time (and make a new time index)
    data = pd.DataFrame.from_records(
        [
            (
                p,
                freq[p],
                olap_n_sites[p],
                true_len[p],
                est_len[p],
                olap_rgt[p] - olap_lft[p],
                olap_n_should_be_1_higher_freq[p],
                olap_n_should_be_1_low_eq_freq[p],
                olap_n_should_be_0_higher_freq[p],
                olap_n_should_be_0_low_eq_freq[p],
                t,
                true_time[p],
            )
            for t, p in enumerate(sorted(shared_positions, key=lambda x: true_time[x]))
        ],
        columns=(
            "position",
            "Frequency",
            "n_sites",
            "Real length",
            "Estim length",
            "Overlap",
            "err_hiF should be 1",
            "err_loF should be 1",
            "err_hiF should be 0",
            "err_loF should be 0",
            "Known time order",
            "orig_time",
        ),
    )

    # we want to know for each site whether it the frequency puts it within the same
    # bounds as the known time order, and if not, whether we have inferred it as
    # too old or too young. So we make an ordered list of "expected" freqs
    freq_bins = np.bincount(data.Frequency)
    # Sorted list of the observed frequencies, e.g. counts [0, 2, 1] -> [1, 1, 2]
    freq_repeated = np.repeat(np.arange(len(freq_bins)), freq_bins)
    # add another column on to the expected freq, as calculated from the actual time
    data["expected_Frequency"] = freq_repeated[data["Known time order"].values]
    data["n_mismatches"] = (
        data["err_hiF should be 1"]
        + data["err_loF should be 1"]
        + data["err_hiF should be 0"]
        + data["err_loF should be 0"]
    )
    data["Inaccuracy"] = data.n_mismatches / data.n_sites
    data["Inferred time inaccuracy"] = data.expected_Frequency - data.Frequency
    # Fraction of the mismatches that "should be 1"; the division yields a
    # non-finite value for rows with no mismatches.
    data["Inference error bias"] = (
        data["err_hiF should be 1"] + data["err_loF should be 1"]
    ) / data.n_mismatches
    data["err_hiF"] = data["err_hiF should be 1"] + data["err_hiF should be 0"]
    data["err_loF"] = data["err_loF should be 1"] + data["err_loF should be 0"]

    print(
        "{} ancestors, {} with at least one error".format(
            len(data), np.sum(data.n_mismatches != 0)
        )
    )
    print(
        data[
            [
                "err_hiF should be 1",
                "err_loF should be 1",
                "err_hiF should be 0",
                "err_loF should be 0",
            ]
        ].sum()
    )
    if args.csv_only:
        # Add some standard params to the CSV to make it easy to paste CSVs together
        data["sample_size"] = args.sample_size
        data["seq_length"] = args.length
        data["mu"] = args.mutation_rate
        data["rho"] = args.recombination_rate
        data["seq_error"] = args.error
        data.to_csv(name_format.format("error_data.csv"))
        return

    # Now do the plots
    Inaccuracy_label = "Sequence difference in overlapping region"
    name = "quality-by-missingness"
    x_axis_length_metric = "fraction"  # or "absolute"
    data["abs_missing_l"] = (data["Real length"] - data["Overlap"]) + 1
    data["rel_missing_l"] = 1 - (data["Overlap"] / data["Real length"])
    if x_axis_length_metric == "absolute":
        x_col = "abs_missing_l"
        ax_params = {
            "xlabel": "Absolute length of missing ancestor + 1",
            "xscale": "log",
            "xlim": (0.8, np.max(data[x_col])),
        }
    elif x_axis_length_metric == "fraction":
        x_col = "rel_missing_l"
        ax_params = {"xlabel": "Fraction of true ancestor missing from inferred"}
    else:
        raise AssertionError("Set x_axis_length_metric to 'absolute' or 'fraction'")
    ax = data.plot.scatter(
        x=x_col,
        y="Inaccuracy",
        c="Frequency",
        cmap="brg",
        s=2,
        # FIXME #669 pandas no longer (as of 1.3.0) supports passing "norm"
        # norm=NormalizeBandWidths(band_widths=freq_bins),
    )
    # Error bars: distance from each point to its Wilson interval bounds
    ax.errorbar(
        x=data[x_col],
        y=data.Inaccuracy,
        fmt="none",
        zorder=-2,
        ecolor="0.9",
        yerr=np.abs(
            binomial_confidence(data.n_mismatches.values, data.n_sites.values)
            - data.Inaccuracy.values
        ),
    )
    # Sort by the x column so that the rolling mean runs along the x axis
    data = data.sort_values(by=x_col)
    rolling_mean = data.Inaccuracy.rolling(
        center=True, window=args.running_average_span, min_periods=1
    ).mean()
    ax.plot(data[x_col], rolling_mean.values, "k-", lw=1, zorder=-1)
    ax.set(ylabel=Inaccuracy_label, ylim=(-0.01, 1), **ax_params)
    save_figure(name_format.format(name))

    name = "quality-by-freq-with-time"
    ax = data.plot.scatter(
        x="Frequency", y="Inaccuracy", c="Known time order", cmap="brg", s=2
    )
    ax.set_ylabel(Inaccuracy_label)
    save_figure(name_format.format(name))

    # Axis settings shared by the next two figures
    ax_params = {
        "xlim": (-1, np.max(data["Known time order"]) * 1.01),
        "ylabel": "Sequence difference in overlapping region",
    }
    # Empty proxy artists used only to build a marker-size legend (1, 10, 100)
    legend_elements = [
        mp.lines.Line2D(
            [],
            [],
            linewidth=0,
            label="1",
            marker="o",
            color="k",
            markeredgewidth=0.5,
            markerfacecolor="w",
            markersize=1**0.5,
        ),
        mp.lines.Line2D(
            [],
            [],
            linewidth=0,
            label="10",
            marker="o",
            color="k",
            markeredgewidth=0.5,
            markerfacecolor="w",
            markersize=10**0.5,
        ),
        mp.lines.Line2D(
            [],
            [],
            linewidth=0,
            label="100",
            marker="o",
            color="k",
            markeredgewidth=0.5,
            markerfacecolor="w",
            markersize=100**0.5,
        ),
    ]
    name = "quality-by-freq-with-bias"
    ax = data.plot.scatter(
        x="Known time order",
        y="Inaccuracy",
        c="Inference error bias",
        cmap="coolwarm",
        s=data.n_mismatches.values + 1,
    )
    # The string literal below is disabled labelling code, kept for reference; as a
    # bare string expression it has no effect at runtime.
    """
    # Add some tiny labels, to aid identification in a pdf plot
    labels = ["{:.1f}\n{:.0f}\n{:.0f}".format(
        r['position'], r['orig_time'], r['n_mismatches']) for i, r in data.iterrows()]
    for x,y,s in zip(data['Known time order'].values, data["Inaccuracy"].values, labels):
        ax.text(x,y,s,fontsize=1, ha="center", va="center")
    """
    ax.set(**ax_params)
    ax.legend(handles=legend_elements, title="# bad sites\nper ancestor")
    save_figure(name_format.format(name))

    name = "quality-by-freq-with-ordererr"
    ax = data.plot.scatter(
        x="Known time order",
        y="Inaccuracy",
        c="Inferred time inaccuracy",
        cmap="BrBG",
        s=data.n_mismatches.values,
        # FIXME #669 pandas no longer (as of 1.3.0) supports passing "norm"
        # norm=MidpointNormalize(midpoint=0),
    )
    ax.set(**ax_params)
    ax.legend(handles=legend_elements, title="# bad sites\nper ancestor")
    save_figure(name_format.format(name))

    name = "error-type-by-freq-mean-sem"
    # show the (weighted) average for different types of error
    g = data[["err_hiF", "err_loF", "n_sites"]].groupby(data.Frequency)
    f_data = pd.DataFrame.from_dict(
        {
            "Frequency": g.sum().index,
            "hi": g.sum().err_hiF.values / g.sum().n_sites.values,
            "lo": g.sum().err_loF.values / g.sum().n_sites.values,
        }
    )
    f_data = f_data.sort_values(by="Frequency")
    plt.plot(
        f_data.Frequency,
        f_data.hi,
        marker="o",
        linestyle="none",
        markersize=5,
        c="darkgoldenrod",
        label="Errors at higher freq than focal",
    )
    plt.plot(
        f_data.Frequency,
        f_data.lo,
        marker="o",
        linestyle="none",
        markersize=5,
        c="mediumseagreen",
        label="Errors at lower freq than focal",
    )
    rolling_mean = f_data.rolling(
        center=True, window=args.running_average_span, min_periods=1
    ).mean()
    plt.plot(f_data.Frequency, rolling_mean.hi, "-", c="darkgoldenrod")
    plt.plot(f_data.Frequency, rolling_mean.lo, "-", c="mediumseagreen")
    plt.ylim(None, args.diff_y_lim)

    plt.legend()
    plt.ylabel(Inaccuracy_label)
    plt.xlabel("Frequency")
    save_figure(name_format.format(name))

    name = "quality-by-freq-mean-sem"
    # the same as quality-by-freq but show a mean and sterr for quality
    # NOTE(review): the errorbar call is currently commented out, so no data are
    # drawn here; only the axis labels and limits below are set before saving.
    g = data.Inaccuracy.groupby(data.Frequency)
    with warnings.catch_warnings():
        # matplotlib warns for nans in error bars
        warnings.simplefilter("ignore")
        # FIXME #669 error in matplotlib here
        # plt.errorbar(
        #     g.sem().index,
        #     g.mean().values,
        #     yerr=g.sem().values,
        #     marker="o",
        #     ls="none",
        #     ecolor="0.6",
        # )
    plt.ylabel(Inaccuracy_label)
    plt.xlabel("Frequency")
    plt.ylim(None, args.diff_y_lim)
    save_figure(name_format.format(name))

    name = "quality-by-time"
    ax = data.plot.scatter(
        x="Known time order",
        y="Inaccuracy",
        c="Frequency",
        cmap="brg",
        s=2,
        # FIXME #669 pandas no longer (as of 1.3.0) supports passing "norm"
        # norm=NormalizeBandWidths(band_widths=freq_bins),
    )
    # Re-sort by time order so that the rolling mean runs along the x axis
    data = data.sort_values(by="Known time order")
    ax.errorbar(
        data["Known time order"],
        data["Inaccuracy"],
        yerr=np.abs(
            binomial_confidence(data.n_mismatches.values, data.n_sites.values)
            - data.Inaccuracy.values
        ),
        fmt="none",
        ecolor="0.9",
        zorder=-2,
    )
    rolling_mean = data.Inaccuracy.rolling(
        center=True, window=args.running_average_span, min_periods=1
    ).mean()
    ax.plot(data["Known time order"].values, rolling_mean.values, "k-")
    ax.set(ylabel=Inaccuracy_label, ylim=(-0.01, 1))
    save_figure(name_format.format(name))

    name = "quality-by-length"
    ax = data.plot.scatter(
        x="Overlap",
        y="Inaccuracy",
        c="Frequency",
        cmap="brg",
        s=2,
        # FIXME #669 pandas no longer (as of 1.3.0) supports passing "norm"
        # norm=NormalizeBandWidths(band_widths=freq_bins),
    )
    # NOTE(review): xlim=(1) is the plain integer 1, not a tuple; presumably it is
    # meant to set only the lower x limit - confirm.
    ax.set(ylabel=Inaccuracy_label, xscale="log", ylim=(-0.01, 1), xlim=(1))
    save_figure(name_format.format(name))


def get_node_degree_by_depth(ts):
    """
    Collects the number of children and the depth below the root for every
    internal node of every tree in the specified tree sequence.

    Trees without any edges at either end of the sequence (empty flanking
    regions) are skipped. Returns a tuple of two numpy arrays
    ``(degree, depth)`` with one entry per internal node per tree; the root
    of a tree is at depth 0.
    """
    last_index = ts.num_trees - 1
    degrees = []
    depths = []
    for tree in ts.trees():
        is_flank = tree.index in (0, last_index)
        if is_flank and tree.num_edges == 0:
            continue
        # Depth-first traversal from the root using an explicit stack.
        pending = [(tree.root, 0)]
        while pending:
            node, level = pending.pop()
            children = tree.children(node)
            num_children = len(children)
            if num_children > 0:
                # Only internal nodes contribute; leaves have no children.
                degrees.append(num_children)
                depths.append(level)
            pending.extend((child, level + 1) for child in children)
    return np.array(degrees), np.array(depths)


def run_node_degree(args):
    """
    Plots node degree against depth for tree sequences inferred from a
    single simulation.

    A tree sequence is simulated with msprime under the "smc_prime" model
    using the sample size, length (in MB), recombination rate, mutation rate
    and Ne given in ``args``. For each combination of path compression
    (on/off) and ancestor type (estimated/exact), ``run_infer`` is called and
    the (degree, depth) of every internal node is recorded. Summary
    statistics are printed, and two bar plots are written into
    ``args.destination_dir``: one faceted by ancestor type and coloured by
    path compression, and one comparing ancestor types for the
    path-compressed runs only.
    """
    MB = 10**6
    rng = random.Random()
    if args.random_seed is not None:
        rng.seed(args.random_seed)
    sim_args = {
        "sample_size": args.sample_size,
        "length": args.length * MB,
        "recombination_rate": args.recombination_rate,
        "mutation_rate": args.mutation_rate,
        "Ne": args.Ne,
        "model": "smc_prime",
        "random_seed": rng.randint(1, 2**30),
    }
    smc_ts = msprime.simulate(**sim_args)

    engine = args.engine
    # Collect the per-run frames and concatenate once at the end, rather than
    # repeatedly growing a frame that starts out as an empty DataFrame.
    frames = []
    ancestor_types = [("estimated", False), ("exact", True)]
    for path_compression in [True, False]:
        for type_label, exact_ancestors in ancestor_types:
            inferred_ts = run_infer(
                smc_ts,
                engine=engine,
                exact_ancestors=exact_ancestors,
                path_compression=path_compression,
            )
            degree, depth = get_node_degree_by_depth(inferred_ts)
            frames.append(
                pd.DataFrame(
                    {
                        "degree": degree,
                        "depth": depth,
                        "type": type_label,
                        "path_compression": path_compression,
                    }
                )
            )
    df = pd.concat(frames)

    name_format = os.path.join(
        args.destination_dir,
        "node-degree_n={}_L={}_mu={}_rho={}_{{}}".format(
            args.sample_size, args.length, args.mutation_rate, args.recombination_rate
        ),
    )
    print(df.describe())

    with warnings.catch_warnings():
        # Seaborn is throwing some warnings here. Presumably will be fixed at some point.
        warnings.filterwarnings("ignore", category=FutureWarning)
        sns.catplot(
            x="depth",
            y="degree",
            hue="path_compression",
            col="type",
            data=df,
            kind="bar",
        )
    save_figure(name_format.format("path-compression"))
    plt.clf()

    with warnings.catch_warnings():
        # Seaborn is throwing some warnings here. Presumably will be fixed at some point.
        warnings.filterwarnings("ignore", category=FutureWarning)
        # Only the path-compressed runs are compared across ancestor types.
        sns.barplot(x="depth", y="degree", hue="type", data=df[df.path_compression])
    save_figure(name_format.format("length"))
    plt.clf()


def multiple_recombinations(ts):
    """
    Returns True if any tree transition in the specified tree sequence
    removes more than four edges, which is taken as the sign of multiple
    recombinations; returns False otherwise.
    """
    return any(len(edges_out) > 4 for _, edges_out, _ in ts.edge_diffs())


def run_perfect_inference(args):
    """
    Runs the perfect inference check on ``args.num_replicates`` simulated
    tree sequences.

    Each replicate is simulated with msprime under the "smc_prime" model (or
    "hudson" when ``args.use_ts`` is set) and passed to
    ``tsinfer.run_perfect_inference``. The returned pair of tree sequences is
    then compared: with path compression the tree distances reported by
    ``tsinfer.compare`` must all be zero, otherwise the edge, site position,
    mutation and node flag tables must match exactly. Unless ``args.use_ts``
    is set, simulations flagged by ``multiple_recombinations`` are skipped.

    Raises AssertionError if any comparison fails.
    """
    model = "hudson" if args.use_ts else "smc_prime"
    rng = random.Random()
    rng.seed(args.random_seed)
    # Honour the --recombination-rate option registered by
    # add_standard_arguments; fall back to the historical fixed value if the
    # namespace does not carry it.
    recombination_rate = getattr(args, "recombination_rate", 1e-8)
    for _ in range(args.num_replicates):
        seed = rng.randint(1, 2**30)
        base_ts = msprime.simulate(
            args.sample_size,
            Ne=args.Ne,
            length=args.length * 10**6,
            recombination_rate=recombination_rate,
            random_seed=seed,
            model=model,
        )
        print(
            f"simulated ts with n={base_ts.num_samples} "
            f"and {base_ts.num_trees} trees; seed={seed}"
        )
        if not args.use_ts and multiple_recombinations(base_ts):
            print("Multiple recombinations; skipping")
            continue
        ts, inferred_ts = tsinfer.run_perfect_inference(
            base_ts,
            num_threads=args.num_threads,
            engine=args.engine,
            extended_checks=args.extended_checks,
            time_chunking=not args.no_time_chunking,
            use_ts=args.use_ts,
            path_compression=args.path_compression,
        )
        print(
            f"n={ts.num_samples} num_trees={ts.num_trees} num_sites={ts.num_sites}"
        )
        assert ts.num_samples == inferred_ts.num_samples
        assert ts.num_sites == inferred_ts.num_sites
        if args.path_compression:
            # With path compression the tables are not compared directly;
            # check the tree distances reported by tsinfer.compare instead.
            _, distances = tsinfer.compare(ts, inferred_ts)
            assert np.all(distances == 0)
        else:
            assert ts.tables.edges == inferred_ts.tables.edges
            assert np.all(ts.tables.sites.position == inferred_ts.tables.sites.position)
            assert ts.tables.mutations == inferred_ts.tables.mutations
            assert np.array_equal(ts.tables.nodes.flags, inferred_ts.tables.nodes.flags)
            # At least one node time is expected to differ between the two.
            assert np.any(ts.tables.nodes.time != inferred_ts.tables.nodes.time)


def setup_logging(args):
    """
    Configures logging from the parsed command line arguments.

    The level is "WARN" by default, "INFO" when ``args.verbosity`` is above
    zero and "DEBUG" when it is above one. If ``args.log_section`` is None
    the level is applied globally through daiquiri; otherwise daiquiri is set
    up at "WARN" and only the logger named by ``args.log_section`` is given
    the chosen level.
    """
    if args.verbosity > 1:
        level = "DEBUG"
    elif args.verbosity > 0:
        level = "INFO"
    else:
        level = "WARN"

    section = args.log_section
    if section is not None:
        # Keep everything else quiet and raise verbosity for one logger only.
        daiquiri.setup(level="WARN")
        logging.getLogger(section).setLevel(level)
        return
    daiquiri.setup(level=level)


def add_standard_arguments(
    parser, sample_size=10, length=1, mutation_rate=1e-8, num_replicates=10
):
    """
    Registers the simulation options shared by the evaluation subcommands
    on the specified argparse parser.

    The ``sample_size``, ``length``, ``mutation_rate`` and ``num_replicates``
    parameters supply the defaults of the corresponding options. Passing
    None for ``mutation_rate`` or ``num_replicates`` omits that option from
    the parser altogether.
    """
    # Each entry is (option strings, keyword arguments for add_argument);
    # the options are registered in the order listed.
    option_specs = [
        (("--destination-dir", "-d"), {"default": ""}),
        (("--sample-size", "-n"), {"type": int, "default": sample_size}),
        (("--Ne", "-N"), {"type": int, "default": 10**4}),
        (
            ("--length", "-l"),
            {"type": float, "default": length, "help": "Sequence length in MB"},
        ),
        (
            ("--recombination-rate", "-r"),
            {"type": float, "default": 1e-8, "help": "Recombination rate"},
        ),
        (("--random-seed", "-s"), {"type": int, "default": None}),
    ]
    if mutation_rate is not None:
        option_specs.append(
            (
                ("--mutation-rate", "-u"),
                {"type": float, "default": mutation_rate, "help": "Mutation rate"},
            )
        )
    if num_replicates is not None:
        option_specs.append(
            (("--num-replicates", "-R"), {"type": int, "default": num_replicates})
        )
    for option_strings, options in option_specs:
        parser.add_argument(*option_strings, **options)


def add_worker_arguments(parser):
    """
    Registers the options used by subcommands that run work in several
    processes: ``--num-processes``/``-p`` (an integer, None by default) and
    the ``--progress``/``-P`` flag.
    """
    progress_options = {"action": "store_true", "help": "Show a progress monitor."}
    parser.add_argument("--num-processes", "-p", default=None, type=int)
    parser.add_argument("--progress", "-P", **progress_options)


if __name__ == "__main__":
    # Command line entry point. Every subcommand registers the function that
    # implements it as the "runner" default; after parsing, logging is set
    # up, the module-level plot output format is recorded and the selected
    # runner is called with the parsed arguments.

    top_parser = argparse.ArgumentParser(
        description="Simple interface for running various tsinfer evaluations."
    )
    top_parser.add_argument(
        "-V",
        "--version",
        action="version",
        version=f"%(prog)s {tsinfer.__version__}",
    )
    top_parser.add_argument(
        "-o", "--output-format", default="png", help="The output format for plots"
    )
    top_parser.add_argument(
        "-e", "--engine", default=tsinfer.C_ENGINE, help="The implementation to use."
    )

    subparsers = top_parser.add_subparsers(dest="subcommand")
    subparsers.required = True

    #
    # Perfect inference
    #
    parser = subparsers.add_parser(
        "perfect-inference",
        aliases=["pi"],
        help="Runs the perfect inference process on simulated tree sequences.",
    )
    cli.add_logging_arguments(parser)
    parser.set_defaults(runner=run_perfect_inference)
    add_standard_arguments(parser, mutation_rate=None, num_replicates=1)
    parser.add_argument("--num-threads", "-t", type=int, default=0)
    parser.add_argument(
        "--extended-checks",
        "-X",
        action="store_true",
        help="Enable extra consistency checking (slow)",
    )
    parser.add_argument(
        "--use-ts",
        action="store_true",
        help="Use the original tree sequence as the ancestors tree sequence.",
    )
    parser.add_argument(
        "--no-time-chunking",
        action="store_true",
        help="Disable time-chunking to give each ancestor a distinct time.",
    )
    parser.add_argument(
        "--path-compression",
        "-c",
        action="store_true",
        help="Turn on path compression. Makes verification much slower.",
    )

    #
    # Edges performance
    #
    parser = subparsers.add_parser(
        "edges-performance",
        aliases=["ep"],
        help="Runs a plot showing performance in terms of the edge ratio.",
    )
    cli.add_logging_arguments(parser)
    parser.set_defaults(runner=run_edges_performance)
    add_standard_arguments(parser, length=0.1)
    add_worker_arguments(parser)
    parser.add_argument(
        "--compute-tree-metrics", "-T", action="store_true", help="Compute tree metrics"
    )

    #
    # Hotspot analysis
    #
    parser = subparsers.add_parser(
        "hotspot-analysis",
        aliases=["ha"],
        help="Runs plots analysing the effects of recombination hotspots.",
    )
    cli.add_logging_arguments(parser)
    parser.set_defaults(runner=run_hotspot_analysis)
    add_standard_arguments(parser)
    add_worker_arguments(parser)
    parser.add_argument(
        "--num-hotspots", "-H", type=int, default=1, help="Number of hotspots"
    )
    parser.add_argument(
        "--hotspot-intensity",
        "-I",
        type=float,
        default=10,
        help="Intensity of hotspots relative to background.",
    )
    parser.add_argument(
        "--hotspot-width",
        "-W",
        type=float,
        default=0.01,
        help="Width of hotspots as a fraction of total genome length.",
    )

    #
    # Ancestor properties
    #
    parser = subparsers.add_parser(
        "ancestor-properties",
        aliases=["ap"],
        help="Runs plots showing the properties of estimated ancestors.",
    )
    cli.add_logging_arguments(parser)
    parser.set_defaults(runner=run_ancestor_properties)
    add_standard_arguments(parser)
    add_worker_arguments(parser)
    parser.add_argument(
        "--skip-exact",
        "-S",
        action="store_true",
        help="Skip computing the exact ancestors",
    )

    #
    # Ancestor comparison
    #
    parser = subparsers.add_parser(
        "ancestor-comparison",
        aliases=["ac"],
        help=(
            "Runs plots comparing the real and simulated ancestors "
            "for a single instance."
        ),
    )
    cli.add_logging_arguments(parser)
    parser.set_defaults(runner=run_ancestor_comparison)
    add_standard_arguments(parser, sample_size=100, num_replicates=None)
    parser.add_argument(
        "--error",
        "-e",
        default="0",
        help="Error: either a probability or a csv filename to use for empirical error",
    )
    parser.add_argument(
        "--store-data", "-S", action="store_true", help="Store some raw data."
    )
    parser.add_argument(
        "--length-scale",
        "-X",
        choices=["linear", "log"],
        default="linear",
        help="Length scale for distances when plotting",
    )
    parser.add_argument(
        "--running-average-span",
        "-A",
        type=int,
        default=51,
        help=(
            "How many ancestors should we average over when calculating "
            "running means and medians (must be an odd number)"
        ),
    )

    #
    # Ancestor quality
    #
    parser = subparsers.add_parser(
        "ancestor-quality",
        aliases=["aq"],
        # The leading space on the second literal keeps the two halves of the
        # sentence from running together in the rendered help.
        help=(
            "Runs plots comparing the quality of simulated compared to real ancestors"
            " for a single instance."
        ),
    )
    cli.add_logging_arguments(parser)
    parser.set_defaults(runner=run_ancestor_quality)
    add_standard_arguments(parser, sample_size=100, num_replicates=None)
    parser.add_argument(
        "--error",
        "-e",
        default="0",
        help="Error: either a probability or a csv filename to use for empirical error",
    )
    parser.add_argument(
        "--print-bad-ancestors",
        "-b",
        nargs="?",
        const="inferred",
        choices=["inferred", "all"],
        help="Also print out all the bad ancestor matches",
    )
    parser.add_argument(
        "--csv-only",
        "-C",
        action="store_true",
        help="Do not create plots, but output a csv file of the data for later plotting",
    )
    parser.add_argument(
        "--length-scale",
        "-X",
        choices=["linear", "log"],
        default="linear",
        help="Length scale for distances when plotting",
    )
    parser.add_argument(
        "--diff-y-lim",
        help="The y-limit to use for sequence difference plots.",
        default=None,
        type=float,
    )
    parser.add_argument(
        "--running-average-span",
        "-A",
        type=int,
        default=51,
        help=(
            "How many ancestors should we average over when calculating "
            "running means and medians (must be an odd number)"
        ),
    )

    #
    # Node degree
    #
    parser = subparsers.add_parser(
        "node-degree", aliases=["nd"], help="Plots node degree vs depth in the tree."
    )
    cli.add_logging_arguments(parser)
    parser.set_defaults(runner=run_node_degree)
    add_standard_arguments(parser, num_replicates=None)

    #
    # Imputation accuracy
    #
    parser = subparsers.add_parser(
        "imputation-accuracy",
        aliases=["ia"],
        help="Runs plots analysing the quality of imputation.",
    )
    cli.add_logging_arguments(parser)
    parser.set_defaults(runner=run_imputation_accuracy)
    add_standard_arguments(parser)
    add_worker_arguments(parser)

    args = top_parser.parse_args()
    cli.setup_logging(args)
    # save_figure reads this module-level value to pick the file extension.
    _output_format = args.output_format
    args.runner(args)