-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdereplicate_minimal_marker_lineages.pl
56 lines (45 loc) · 1.19 KB
/
dereplicate_minimal_marker_lineages.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/perl
$markers = $ARGV[0];
$markers =~ s/\.fasta/_minimal_markers.txt/;
$markers = "annotated_$markers";
chomp $markers;
open(IN, "$markers");
open(OUT, ">dereplicated_$markers");
$head = <IN>;
chomp $head;
($id, @head) = split(/\t/, $head);
$lineage = <IN>;
chomp $lineage;
($id, @lineage) = split(/\t/, $lineage);
while(<IN>)
{
chomp;
($marker, @data) = split(/\t/, $_);
$i = 0;
push @markers, $marker;
foreach $cell(@data){$transpose[$i].=$cell; $i++;}
}
$n = @markers;
print "$n marker rows\n";
print OUT "ID\tLineage";
foreach $marker(@markers){print OUT "\t$marker";}
print OUT "\n";
$i = 0;
foreach $seq(@transpose)
{
$rseq = $seq; $rseq =~ s/x/\\D/g;
$sample = $head[$i];
$lineage = $lineage[$i];
$replicate{$seq}{$lineage}++;
$x = 0; foreach $seq2(@transpose){$lineage2 = $lineage[$x]; $x++; $rseq2 = $seq2; $rseq2 =~ s/x/\\D/g; if ($seq ne $seq2 && ($seq =~ /$rseq2/ || $rseq =~ /$seq2/)){$replicate{$seq2}{$lineage2}++;}}
if($replicate{$seq}{$lineage} ==1)
{
print OUT "$sample\t$lineage";
@seq = split(//, $seq);
$n = @seq; print "now $n characters\n";
$seq2 = join("\t", @seq);
print OUT "\t$seq2\n";
}
else{print "Removed $sample $lineage $seq as a replicate\n";}
$i++;
}