-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathget_normalized.CpG.content.pl
36 lines (28 loc) · 1.09 KB
/
get_normalized.CpG.content.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/usr/bin/perl
use Bio::EnsEMBL::Registry;
my $registry = 'Bio::EnsEMBL::Registry';
$registry->load_registry_from_db(
-host => 'ensembldb.ensembl.org',
-user => 'anonymous',
-version => 56
);
my $slice_adaptor = $registry->get_adaptor( 'Human', 'Core', 'Slice' );
while($line = <STDIN>){
chomp($line);
@t=split("\t", $line);
# chr start0 end0 genename confidence strand transcriptname class
$chr=$t[0];
if($chr eq 'chr'){
print join("\t", @t, "normalized.CpG.content"), "\n";
next;
}
$start=($t[5] eq "-")?$t[1]:($t[1]-1500);
$end=($t[5] eq "-")?($t[2]+1500):$t[2];
$chr =~ s/^chr//g;
my $s = $slice_adaptor->fetch_by_region( 'chromosome', $chr, $start, $end, ($t[5] eq '+')?1:-1)->seq;
my @n1 = ($s=~/GC/g); my $n1=@n1;
my $n2 = length($s);
my @n3 = (($s=~/C/g) , ($s=~/G/g)); my $n3=@n3;
print join("\t", @t, $s, sprintf("%.3f", ($n1/$n2)/(($n3/$n2/2)*($n3/$n2/2)))), "\n";
# Normalized CpG fraction was computed as (observed CpG)/(expected CpG), where expected CpG was calculated as (GC content/2)2.
}