forked from chenmy33/IntronGetting
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathretrieve_seq.py
143 lines (122 loc) · 6.59 KB
/
retrieve_seq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from Bio import SeqIO
from Bio.Seq import Seq
import os,glob
def reverse(i,seq):
if i==1:
return seq
if i==-1:
temp=Seq(seq).reverse_complement()
return str(temp)
def retrieve_seqs(genome_dir,out_dir):
exon_dict={}#sequeces pool
chromosome_dict={}#chromosome pool:{taxa:{chromosome:gene_location}}
a=0
b=0
exon_file='exon_pairs.txt'
for line in open(exon_file):
if b==0:
b=1
taxon=line.split()
for i in range(len(taxon)):
chromosome_dict[taxon[i]]={}
else:
for i in range(len(line.split())):
if line.split()[i]!= 'None&None':
if chromosome_dict[taxon[i]].has_key(line.split()[i].split('|')[2]):
chromosome_dict[taxon[i]][line.split()[i].split('|')[2]].append(line.split()[i])
else:
chromosome_dict[taxon[i]][line.split()[i].split('|')[2]]= [line.split()[i]]
for filename in glob.glob(os.path.join(genome_dir,'*.fa')):
records=SeqIO.parse(filename,'fasta')
for record in records:
a=0
count=0
chromosome=str(record.description).split()[0]
taxa_name= os.path.basename(filename).split('.')[0]
if taxa_name not in chromosome_dict:
break
else:
if chromosome not in chromosome_dict[taxa_name].keys():
continue
if count == len(chromosome_dict[taxa_name].keys()):
del chromosome_dict[taxa_name]
break
else:
record_seq=str(record.seq)
for key in chromosome_dict:
if a!=0:
break
else:
if key== taxa_name:
for sub_key in chromosome_dict[key]:
if a!=0:
break
else:
if sub_key==chromosome:
count+=1
for genename in chromosome_dict[taxa_name][sub_key]:
if genename!= 'None&None':
exon1_start=int(genename.split('&')[0].split('|')[4])
exon2_start=int(genename.split('&')[1].split('|')[4])
if exon1_start<exon2_start:
seq_Start=int(genename.split('&')[0].split('|')[4])-1
seq_End=int(genename.split('&')[1].split('|')[5])
taxa_seq=record_seq[seq_Start:seq_End]
else:
seq_Start=int(genename.split('&')[1].split('|')[4])-1
seq_End=int(genename.split('&')[0].split('|')[5])
taxa_seq=record_seq[seq_Start:seq_End]
if exon_dict.has_key(taxa_name):
exon_dict[taxa_name].append((genename,taxa_seq))
else:
exon_dict.setdefault(taxa_name,[(genename,taxa_seq)])
else:
continue
a=1
taxon=[]
a=0
seq_dict={}
outp1=open('reference_missing_exon.txt','w')
for line in open(exon_file):
if a==0:
a=1
taxon=line.split()
else:
if line.split()[0]!= 'None&None':
reference_direction= int(line.split()[0].split('&')[1].split('|')[-1])
exon_info= line.split()[0]
genename=line.split()[0].split('&')[0].split('|')[0]+'_'+line.split()[0].split('&')[0].split('|')[1]+'_'+line.split()[0].split('&')[1].split('|')[1].split('_')[1]
for info in exon_dict[taxon[0]]:
## print info[0]
if info[0]== exon_info:
seq= info[1]
real_seq=reverse(reference_direction,seq)
seq_dict.setdefault(genename,[(taxon[0],real_seq)])
break
for i in range(1,len(line.split())):
if line.split()[i]!= 'None&None':
sself_direction= [int(line.split()[i].split('&')[1].split('|')[-2]),str(line.split()[i].split('&')[1].split('|')[-1])]
if (sself_direction[0]==reference_direction) and (sself_direction[1]=='Plus'):
self_direction=reference_direction
if (sself_direction[0]==reference_direction) and (sself_direction[1]=='Minus'):
self_direction=reference_direction*(-1)
if (sself_direction[0]==reference_direction*(-1)) and (sself_direction[1]=='Plus'):
self_direction=reference_direction*(-1)
if (sself_direction[0]==reference_direction*(-1)) and (sself_direction[1]=='Minus'):
self_direction=reference_direction
exon_info= line.split()[i]
for info in exon_dict[taxon[i]]:
if info[0]== exon_info:
seq2= info[1]
real_seq2=reverse(self_direction,seq2)
if seq_dict.has_key(genename):
seq_dict[genename].append((taxon[i],real_seq2))
break
else:
outp1.write(line)
for key in seq_dict:
filename=str(key)+'.fas'
outp=open(os.path.join(out_dir,filename),'w')
for seq in seq_dict[key]:
outp.write('>'+str(seq[0])+'\n'+str(seq[1])+'\n')
outp.close()