diff --git a/examples/long_deletions_test/SNV.txt b/examples/long_deletions_test/SNV.txt index 1c00ef3..547ed34 100644 --- a/examples/long_deletions_test/SNV.txt +++ b/examples/long_deletions_test/SNV.txt @@ -3,15 +3,15 @@ REF 15 G A * 0.1303 0.1423 * 0.9981 1.0000 REF 19 C A * 0.1049 0.1235 * 0.9977 1.0000 REF 19 C G * 0.8951 0.8765 * 1.0000 1.0000 REF 29 G C 0.0254 0.0187 0.0292 1.0000 1.0000 1.0000 -REF 51 A --------- - 0.8343 0.8414 - 1.0000 1.0000 -REF 51 A ------------ - 0.0047 0.0040 - 1.0000 1.0000 +REF 50 AAGTTCGGCC A - 0.8343 0.8414 - 1.0000 1.0000 +REF 50 AAGTTCGGCCGAG A - 0.0047 0.0040 - 1.0000 1.0000 REF 65 A T 0.8401 0.8455 0.8553 1.0000 1.0000 0.9999 REF 100 A G 0.2388 0.1016 0.1064 0.9999 1.0000 1.0000 REF 101 T A 0.7612 0.8984 0.8936 1.0000 1.0000 1.0000 REF 103 T A 0.9584 0.9674 0.9687 1.0000 1.0000 1.0000 REF 103 T C 0.0416 0.0326 0.0313 0.9996 1.0000 1.0000 -REF 108 T --------- - 0.0009 0.0010 - 0.9796 0.9823 -REF 111 A --------- - 0.9665 0.9677 - 1.0000 1.0000 +REF 107 ATGAAATCTC A - 0.0009 0.0010 - 0.9796 0.9823 +REF 110 AAATCTCGGC A - 0.9665 0.9677 - 1.0000 1.0000 REF 119 C A 0.0009 0.0010 0.8238 0.9796 0.9823 1.0000 REF 129 A T 0.1064 0.1762 0.1398 1.0000 0.9995 0.9994 REF 239 T C 0.1325 0.1367 0.1334 1.0000 1.0000 1.0000 diff --git a/examples/long_deletions_test/SNVs_0.010000.txt b/examples/long_deletions_test/SNVs_0.010000.txt index 4ceb8c4..f660571 100644 --- a/examples/long_deletions_test/SNVs_0.010000.txt +++ b/examples/long_deletions_test/SNVs_0.010000.txt @@ -2,15 +2,15 @@ REF 15 G A * 0.1303 0.1423 * 0.9981 1.0000 69 3 536 14 0.566433 REF 19 C A * 0.1049 0.1235 * 0.9977 1.0000 81 5 686 33 0.719778 REF 19 C G * 0.8951 0.8765 * 1.0000 1.0000 605 28 686 33 1 REF 29 G C 0.0254 0.0187 0.0292 1.0000 1.0000 1.0000 19 1 1030 86 1 -REF 51 A --------- - 0.8343 0.8414 - 1.0000 1.0000 728 143 929 192 0.915081 -REF 51 A ------------ - 0.0047 0.0040 - 1.0000 1.0000 3 1 914 191 1 +REF 50 AAGTTCGGCC A - 0.8343 0.8414 - 1.0000 1.0000 728 143 929 192 0.915081 +REF 50 AAGTTCGGCCGAG A - 0.0047 0.0040 - 1.0000 1.0000 3 1 914 191 1 REF 65 A T 0.8401 0.8455 0.8553 1.0000 1.0000 0.9999 1211 336 1451 407 1 REF 100 A G 0.2388 0.1016 0.1064 0.9999 1.0000 1.0000 163 120 1362 768 0.270704 REF 101 T A 0.7612 0.8984 0.8936 1.0000 1.0000 1.0000 810 476 968 596 0.846958 REF 103 T A 0.9584 0.9674 0.9687 1.0000 1.0000 1.0000 853 540 885 565 0.98614 REF 103 T C 0.0416 0.0326 0.0313 0.9996 1.0000 1.0000 32 25 885 565 0.611461 -REF 108 T --------- - 0.0009 0.0010 - 0.9796 0.9823 0 1 636 434 0.811215 -REF 111 A --------- - 0.9665 0.9677 - 1.0000 1.0000 603 407 635 440 0.919726 +REF 107 ATGAAATCTC A - 0.0009 0.0010 - 0.9796 0.9823 0 1 636 434 0.811215 +REF 110 AAATCTCGGC A - 0.9665 0.9677 - 1.0000 1.0000 603 407 635 440 0.919726 REF 119 C A 0.0009 0.0010 0.8238 0.9796 0.9823 1.0000 77 55 717 498 0.953245 REF 129 A T 0.1064 0.1762 0.1398 1.0000 0.9995 0.9994 210 183 1715 1402 0.789111 REF 239 T C 0.1325 0.1367 0.1334 1.0000 1.0000 1.0000 57 240 448 1815 0.965147 diff --git a/examples/long_deletions_test/test_long_deletions.py b/examples/long_deletions_test/test_long_deletions.py index e38c8c2..0eba7dd 100644 --- a/examples/long_deletions_test/test_long_deletions.py +++ b/examples/long_deletions_test/test_long_deletions.py @@ -41,7 +41,7 @@ def read_counts(chrm, start, end, bamfile): return read_count.forward, read_count.reverse -def count_matching_variants(chrm, start, length, variant, bamfile): +def count_matching_variants(chrm, start, length, variant, is_del, bamfile): variant_count = StrandCounts() with pysam.AlignmentFile(bamfile, 'rb') as alnfile: @@ -55,7 +55,7 @@ def count_matching_variants(chrm, start, length, variant, bamfile): # least one read - including positions outside the region of # interest, unless 'tuncate=True' for read in column.pileups: - if variant.startswith('-'): + if is_del: # Handle deletions if -read.indel == length: if read.alignment.is_reverse: @@ -82,31 +82,33 @@ def main(bamfile, snvsfile, outfile): df_snvs = pd.read_csv(snvsfile, sep="\t", header=0, compression=None) # convert to 0-based df_snvs["Pos"] -= 1 - df_snvs["Pos_end"] = df_snvs["Pos"] + df_snvs["Var"].str.len() - del_mask = df_snvs["Var"].str.startswith('-') + # Deletion length + aux_len = df_snvs["Ref"].str.len() - 1 + del_mask = aux_len > 0 df_snvs["Del_len"] = np.nan - df_snvs.loc[del_mask, "Del_len"] = df_snvs.loc[del_mask, "Var"].str.len() - # Pysam pileup: length of the deletion is recorded in the preceding - # position - df_snvs.loc[del_mask, "Pos"] -= 1 + df_snvs.loc[del_mask, "Del_len"] = aux_len[del_mask] + df_snvs["Is_del"] = del_mask df_snvs[["Variant_forward", "Variant_reverse"]] = df_snvs.apply( lambda x: count_matching_variants( - x["Chromosome"], x["Pos"], x["Del_len"], x["Var"], bamfile), - axis=1, result_type="expand") + x["Chromosome"], x["Pos"], x["Del_len"], x["Var"], x["Is_del"], + bamfile), axis=1, result_type="expand") - # Revert operations for the reporting position of deletions + # Temporarily change to start position for the deletions to count the + # number of reads that cover the long deletion in full length df_snvs.loc[del_mask, "Pos"] += 1 + df_snvs.loc[del_mask, "Pos_end"] = df_snvs["Pos"] + aux_len + df_snvs.loc[~del_mask, "Pos_end"] = df_snvs["Pos"] + 1 df_snvs[["Depth_forward", "Depth_reverse"]] = df_snvs.apply( lambda x: read_counts( x["Chromosome"], x["Pos"], x["Pos_end"], bamfile), axis=1, result_type="expand") # Revert operations for the reporting position - df_snvs["Pos"] += 1 + df_snvs.loc[~del_mask, "Pos"] += 1 # Clean-up for comparison - df_snvs = df_snvs.drop(columns=["Pos_end", "Del_len"]) + df_snvs = df_snvs.drop(columns=["Del_len", "Is_del", "Pos_end"]) # Load output produced by ShoRAH for comparison df_out = pd.read_csv(outfile, sep="\t", header=None, compression=None)