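"""Clean FASTA chromosome files: filter out sequences that are shorter than a
minimum length or contain too high a percentage of ambiguous "N" bases,
collapse duplicate sequences under a single concatenated ID, and write the
result to a "clear_"-prefixed FASTA file. Input files are processed in
parallel with a multiprocessing pool."""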
__author__ = 'cyberbeast'
import os
from Bio import SeqIO
from multiprocessing import Pool
import glob
def clean(fasta_file, min_length=0, rec_n=100):
    # TODO: check whether the file has already been cleaned.
    # ---------------------------------------------------------------------------------------------------
    # Hash table mapping each unique sequence to its record ID(s)
    sequences = {}
    # Use the Biopython FASTA parser to read the input file
    for seq_record in SeqIO.parse(fasta_file, "fasta"):
        # Take the current sequence
        sequence = str(seq_record.seq).upper()
        # Keep the sequence only if it meets the length and N-content thresholds
        if len(sequence) >= min_length and (float(sequence.count("N")) / float(len(sequence))) * 100 <= rec_n:
            # If the sequence is clean and not yet in the hash table, store it with its ID
            if sequence not in sequences:
                sequences[sequence] = seq_record.id
            # If it is already in the hash table, append this record's ID to the existing entry
            else:
                sequences[sequence] += "_" + seq_record.id

    # Write the clean sequences to a file in the directory where this script was run
    output_name = "clear_" + os.path.basename(fasta_file)
    output_file = open(output_name, "w+")
    # Walk the hash table and write each entry in FASTA format
    for sequence in sequences:
        output_file.write(">" + sequences[sequence] + "\n" + sequence + "\n")
    output_file.close()
    print("CLEAN!\nPlease check " + output_name)
if __name__ == "__main__":
    filelist = []
    # Collect the chromosome FASTA files with the glob module
    for name in glob.glob(str(os.getcwd()) + '/GenomeDataset/Chromosomes/*.fa'):
        filelist.append(name)
    # print(filelist)
    pool = Pool()

    for file_l in filelist:
        print(file_l)
        pool.apply_async(clean, (file_l,))
    pool.close()
    pool.join()
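
# Example usage (a sketch, not part of the pipeline above): clean a single
# FASTA file synchronously instead of through the pool. "example.fa" is a
# placeholder path and the thresholds are illustrative, not the script's
# defaults (min_length=0, rec_n=100).
#
#     clean("example.fa", min_length=200, rec_n=10)
#
# Here min_length drops sequences shorter than 200 bases, and rec_n=10 drops
# sequences in which more than 10% of the bases are "N".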