leaks when loading tree sequence in SLiM #2868
I have reproduced the leak with valgrind. Investigating.
So valgrind here is giving:
Which indicates we're trying to write uninit'd memory. Which is odd. Could it be that you're dumping a free'd ts? I haven't dug into the SLiM code yet. Time to sleep!
Hmm, odd. There's no sign of a problem like that; all we have is the memory leak, there's never a crash or any other bad behavior. The model seems to work fine, and it can reload the tree sequence over and over without problems. If it were relying upon uninitialized memory, I wouldn't expect that to be the case. SLiM passes all its unit tests, and lots of people are using it without problems; it just leaks. Very strange bug, so far.
Could you post a version that does the same kind of thing but runs much faster? Valgrind makes a process run much slower.
Probably yes; give me a couple of minutes. I apologize that this is disturbing your sleep; me too. :-O
OK, I think this runs much faster and still exhibits the leak:
For a quick one using scripts from pyslim:
and the output is (lengthy and) attached.
So does this mean that it is definitively a regression, then? Or has this test not been run under valgrind before? (It would be nice if there were a way to check for leaks with CI; I've not looked into that.)
Hm, well I've just realized we haven't updated the tskit code in a bit (since C_1.0, and now we're at C_1.1.2). I've done a quick update to the code: MesserLab/SLiM#406 (but haven't yet checked if it works...).
Yes, worthwhile for sure. I have a vague feeling that we had a patch living in SLiM; did you do a diff with the old sources before you copied in the new? Anyway, I can do a test tomorrow to see if this helps the problem, but if @benjeffery doesn't remember fixing this bug, my guess is that it won't...
Well, running my test above with the tskit-updated SLiM results in fewer bytes "definitely lost" and a lot fewer entries in the valgrind log. Here's the result (attached).
If this doesn't fix it, perhaps the thing to do is to extract the tsk_X calls out into a simple C program and verify there?
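Something along these lines could be a starting point; it's only a sketch (untested here), `saved.trees` is a placeholder for whatever file the SLiM model writes out, and it assumes the tskit C headers and library are available:

```c
#include <stdio.h>
#include <stdlib.h>
#include <tskit/tables.h>

/* Repeatedly load and free a table collection, mimicking SLiM's
 * save-then-reload pattern, so it can be run under valgrind or leaks. */
int
main(void)
{
    tsk_table_collection_t tables;
    int ret;
    int j;

    for (j = 0; j < 10; j++) {
        /* Load (re)initializes the collection from the file. */
        ret = tsk_table_collection_load(&tables, "saved.trees", 0);
        if (ret != 0) {
            fprintf(stderr, "load error: %s\n", tsk_strerror(ret));
            exit(EXIT_FAILURE);
        }
        /* Free everything again, as SLiM does before each reload. */
        tsk_table_collection_free(&tables);
    }
    return 0;
}
```

If a loop like that comes back clean under valgrind or leaks, the problem presumably lives in how the surrounding code uses the table collection rather than in tskit itself.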
I don't know why this issue got closed, WTF. Oh, @petrelharp, you need to watch your wording: you marked that PR as fixing this issue! :-> Reopening. For the record: there was indeed a patch in SLiM's copy of tskit, but it got resolved by the update, so all is well.
Looks to me like this has resolved the vast majority of the problem. We still get a 400-byte leak with every reload; I'm not sure where it's coming from exactly; one tool indicates the line
Oh,
Thanks for the update @petrelharp!
I also did a quick test with my original SLiM script, using SLiM from the last commit from @petrelharp (MesserLab/SLiM@dc0e413). Thank you @petrelharp @bhaller
I don't remember fixing anything like this, but the tskit C API has pretty good leak checking as part of CI, so I'd be surprised if we have an issue there. I'll try again with the quicker code and the updated code.
The quicker code is still taking over 30min to run under valgrind, and the pyslim "restart_WF" script is giving me:
So I can't really dig into this without a working minimal script.
@benjeffery Hmm, "unrecognized named argument subpopMap" seems to indicate that you're running an old version of SLiM, I think? As for the script still taking more than 30min to run, yes – sorry, I'm an idiot. I assumed you were working in the debugger in the same way as me, observing leaks across one reload operation at a time, but of course you're not, you're running the script to completion under valgrind – and the script I supplied runs forever. It just keeps doing reloads over and over, ad infinitum, because that's how I was testing. (It's "faster" in the sense that the time to the first reload and the time between reloads is much shorter, which for some reason is what I thought you meant.) So, mea culpa, I apologize for wasting 30 minutes of your life. :-> Here is a script that does two reloads and then ends:
This should run to completion under valgrind in perhaps a minute or so; without valgrind it takes only ten seconds or so. Again, my apologies.
I've just pushed a fix for the remaining 400-byte-per-reload leak, which was due to not calling free() on a malloc'd buffer.
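For context, that kind of fix usually looks like the following; a generic illustration only, with hypothetical names, not the actual SLiM code:

```c
#include <stdlib.h>
#include <string.h>

/* Hypothetical example of the bug class: a temporary malloc'd buffer that
 * was used and then forgotten. The fix is simply the free() at the end. */
static void
copy_record(const char *record, size_t len)
{
    char *buffer = malloc(len + 1);
    if (buffer == NULL) {
        return;
    }
    memcpy(buffer, record, len);
    buffer[len] = '\0';
    /* ... use buffer ... */
    free(buffer);   /* previously missing: without this, every call leaks */
}
```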
Thanks, I had a fresh clone, but somehow master was out-of-date. No idea why!
Ok, I'm going to do a little bit of digging, but glad this is fixed now.
Hmm, having been through the diff I don't see anything. All the kastore memory ownership changes were already in 1.0 and are pretty heavily tested for leaks etc. I don't think it's worth digging any further here unless we get any other leak reports. Thanks for raising the issue.
Interesting. Thanks for looking into it. Must be something about SLiM's particular usage pattern, I guess. It would be nice to have leak-testing as part of SLiM's CI; right now the leak-testing I do is just ad hoc. Something to ponder.
The tskit ones are defined here:
Phew! Glad that worked to fix things.
Hi folks (ping @petrelharp). I need your help debugging a problem; I'm a bit mystified. Sorry for the long report.
It's a leak that occurs when SLiM is running a specific kind of model: one that saves out a tree sequence and then later reloads that same tree sequence, potentially reloading the same .trees file multiple times. With each reload, including the very first time, a bunch of memory is getting leaked. I can watch this happening live in the debugger, as I'll describe below. First of all, here's a SLiM model that makes it happen:
It's more crufty than it needs to be, but the details of it are unimportant. The only important bits are that it saves out a tree sequence (marked with `catn("WRITING...")`), and then later loads that tree sequence back in (marked with `catn("RESTARTING...")` in two places). The leak occurs across the call to `sim.readFromPopulationFile()`, 100% reproducibly. I think I have checked for a leak from this specific usage pattern in the past, and it did not leak; perhaps I'm misremembering, or my test case back then was somehow different, but I suspect a regression somewhere.

So, what does `sim.readFromPopulationFile()` do? After percolating through the innards of the Eidos interpreter's dispatcher, etc., we end up at some simple C++ code:

The `...` bits are just cruft, definitely uninvolved, but I can come up with links to the lines in GitHub if you want to see the complete code. So, in a nutshell, first we free the tree sequence, then we load the replacement back in.
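In tskit C API terms, the pattern being described is roughly the following; this is a sketch for orientation only, not SLiM's actual code, with `tables_` and `reload_tree_sequence()` as stand-ins for the member variable and call path discussed below:

```c
#include <stdio.h>
#include <tskit/tables.h>

/* Stand-in for SLiM's persistent table collection (the tables_ member). */
static tsk_table_collection_t tables_;

/* Sketch of the sequence described above: free the old contents, then load
 * the saved .trees file back into the very same collection. Assumes tables_
 * was previously loaded (as in SLiM's usage). */
static int
reload_tree_sequence(const char *path)
{
    int ret;

    tsk_table_collection_free(&tables_);   /* what FreeTreeSequence() boils down to */
    ret = tsk_table_collection_load(&tables_, path, 0);
    if (ret != 0) {
        fprintf(stderr, "load error: %s\n", tsk_strerror(ret));
    }
    return ret;
}
```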
`FreeTreeSequence()` is just this:

I've stepped through this in the debugger; `tsk_table_collection_free()` is definitely called. I walked through it and watched the tables getting freed one by one, and I looked at the table collection data structure afterwards and saw null pointers as appropriate; it definitely got freed. And note that this table collection, `tables_`, is the same one that `tsk_table_collection_load()` loads into; it's the permanent table collection kept by SLiM's `Species` class. So there shouldn't be any live data left in the table collection for us to overwrite (which would be the obvious way that a leak would occur in this scenario).

OK, so here's the kicker. Step back out to `_InitializePopulationFromTskitBinaryFile()`, and up to the point where `tsk_table_collection_load()` is about to be called. Now, on macOS there's a command-line tool called `leaks` that scans a running process for leaks and prints out a summary, and (it's wonderful) you can run it on a process that is stopped in the debugger. So, run `leaks` at that moment; it reports zero leaks. Step over `tsk_table_collection_load()`, and now there are a whole bunch of leaks reported. 100% reproducible.

At a finer level of detail, one can step inside `tsk_table_collection_load()` and watch the leaks happen one by one. Inside `read_table_cols()`, there is a call to `kastore_containss()`. I can measure leaks before that call, and it says there are X leaks (based on how much leaking has already occurred during that debugger session). Step over that call and measure leaks again, and now there are X+1 leaks. Each new call to `kastore_containss()` leaks one more malloced block (with a little variation in the exact timing, discussed below). These appear to be the full-size table columns getting leaked; the blocks are as big as 9 MB for the run I'm looking at now. And if you suspect that the `leaks` tool is lying to me (I did :->), note that a completely different macOS debugging tool, Instruments, also has a leak checker, and it reports the leaks as coming from `kastore_containss()` too.

So, `kastore_containss()` percolates down to `kastore_get()`. That executes without any leak being evidenced, but the leak appears in the output from `leaks` soon thereafter, at what seems to be a somewhat unpredictable time. I suspect that a pointer to the leaked block remains on the stack for a little while, in some local variable that has fallen out of scope but has not yet been overwritten, and then when that stack memory happens to get overwritten the leak registers. I suspect that the culprit is somehow related to these lines in `kastore_get()`:

These lines do execute; the entry in `item->array`, which pointed to the new block just read in, gets set to NULL. I'm not sure what that code is doing exactly. I see that `tsk_table_collection_loadf_inited()` forces the `KAS_GET_TAKES_OWNERSHIP` flag on. But I'm not 100% sure this is the culprit; it's just the most likely-looking code I can see in what I've stepped through.
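If I'm reading it right, the ownership rule that flag implies is something like the following sketch, in plain kastore terms; this is going from memory on the API, and it assumes the flag is the one supplied via `kastore_open()`'s flags argument, so treat it as approximate:

```c
#include <stdio.h>
#include <stdlib.h>
#include <kastore.h>

/* Sketch of the ownership rule under discussion: with KAS_GET_TAKES_OWNERSHIP
 * set, a buffer handed back by kastore_gets() belongs to the caller, who must
 * free() it; kastore drops its own reference (the item->array = NULL seen
 * above), so kastore_close() will no longer free it on the caller's behalf. */
static int
read_column(const char *path, const char *key)
{
    kastore_t store;
    void *array = NULL;
    size_t array_len;
    int type;
    int ret;

    ret = kastore_open(&store, path, "r", KAS_GET_TAKES_OWNERSHIP);
    if (ret != 0) {
        fprintf(stderr, "kastore_open failed: %d\n", ret);
        return ret;
    }
    ret = kastore_gets(&store, key, &array, &array_len, &type);
    if (ret == 0) {
        /* ... use array, array_len, type ... */
        free(array);   /* caller-owned now: omitting this free() is a leak */
    }
    kastore_close(&store);
    return ret;
}
```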
KAS_GET_TAKES_OWNERSHIP
does exactly, that flag is not in SLiM's control, but is forced on by tskit. I'm drawing a blank as to how to pursue this further. Maybe the way we're using the table collection is just wrong; if so, please enlighten me :->. I'm happy to zoom, screen-share, and show you all this live in the debugger. Perhaps if you have SLiM installed you can reproduce it yourself, using the model given above, with whatever leaks tool you have on your platform.In summary: HELP!