-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean_panda_before_hierarchical_analysis.py
104 lines (76 loc) · 3.41 KB
/
clean_panda_before_hierarchical_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import sys
import pandas
import os
import re
def remove_rows_with_no_taxonomy(temp_panda):
    '''
    Drop, in place, every row whose 'organ' cell has length zero.

    The pipeline notes describe this as the case where no
    species/organ/disease triplet is left because all labels were dropped;
    only the 'organ' column is actually inspected here.

    temp_panda: DataFrame with an 'organ' column whose cells support len().
    Returns None. The frame is modified in place, and both the full index and
    the labels selected for removal are printed to stdout.
    '''
    print(temp_panda.index)

    # One boolean per index label, in index order.
    organ_is_empty = []
    for row_label in temp_panda.index:
        organ_is_empty.append(len(temp_panda.at[row_label, 'organ']) == 0)

    empty_row_labels = temp_panda.index[organ_is_empty]
    print(empty_row_labels)

    temp_panda.drop(index=empty_row_labels, inplace=True)
def remove_rows_without_curated_inchikey(temp_panda):
    '''
    Drop, in place, every row that has no curated inchikey.

    A row counts as uncurated when its 'inchikey_curated' cell equals the
    string 'pre_curation_file'. The pipeline notes mention that ML may be
    applied to such rows at some later point.

    temp_panda: DataFrame with an 'inchikey_curated' column.
    Returns None; the frame is modified in place.
    '''
    def _is_uncurated(row_label):
        # Per-label scalar lookup, coerced to a plain bool.
        return bool(temp_panda.at[row_label, 'inchikey_curated'] == 'pre_curation_file')

    uncurated_mask = list(map(_is_uncurated, temp_panda.index))
    uncurated_labels = temp_panda.index[uncurated_mask]

    temp_panda.drop(index=uncurated_labels, inplace=True)
def remove_rows_where_curated_inchikey_is_at(temp_panda):
    '''
    Drop, in place, every row whose 'inchikey_curated' cell is the run of
    '@' characters used as a placeholder.

    Background (plb edit 2-6-2022): a run of '@' is the "fill na" value in a
    previous script, and for some reason the curated inchikey translation
    panda carried that placeholder in both the "inchikey" and
    "inchikey curated" columns.

    NOTE(review): the earlier notes appear to spell the placeholder with a
    different number of '@' than the literal compared below - confirm
    against the fillna value used upstream.

    temp_panda: DataFrame with an 'inchikey_curated' column.
    Returns None; the frame is modified in place.
    '''
    placeholder_hits = []
    for row_label in temp_panda.index:
        curated_value = temp_panda.at[row_label, 'inchikey_curated']
        if curated_value == '@@@@@@@':
            placeholder_hits.append(True)
        else:
            placeholder_hits.append(False)

    placeholder_labels = temp_panda.index[placeholder_hits]
    temp_panda.drop(index=placeholder_labels, inplace=True)
# Disabled helper, kept only as a bare string literal so it is never defined
# or executed. As written, it would drop rows whose 'direct_parent_5' cell
# equals 'pre_curation_file', mirroring the inchikey helpers in this file.
'''
def remove_rows_without_classyfire_assignment(temp_panda):
indices_to_drop=[True if temp_panda.at[i,'direct_parent_5']=='pre_curation_file' else False for i in temp_panda.index]
indices_to_drop=temp_panda.index[indices_to_drop]
temp_panda.drop(
labels=indices_to_drop,
axis='rows',
inplace=True
)
'''
if __name__ == "__main__":
    # Usage: python clean_panda_before_hierarchical_analysis.py <min_fold_change>
    #
    # Copies every pickled panda from step 4 of the pipeline into the step 5
    # output directory under a normalised file name. No cleaning function is
    # applied at the moment (see the notes inside the loop).
    min_fold_change = sys.argv[1]

    results_root = os.path.join('..', 'results', str(min_fold_change))
    pipeline_input_panda_directory = os.path.join(results_root, 'step_4_classes_transformed')
    pipeline_output_directory = os.path.join(results_root, 'step_5_panda_cleaned')

    # Create the output directory and its dummy.txt marker without going
    # through a shell: the fold-change value comes straight from the command
    # line, so it must not be interpolated into a shell command string.
    os.makedirs(pipeline_output_directory, exist_ok=True)
    dummy_path = os.path.join(pipeline_output_directory, 'dummy.txt')
    with open(dummy_path, 'a'):
        # Same effect as `touch`: create if missing, refresh the timestamps.
        os.utime(dummy_path, None)

    # Skip the marker file if present; a missing marker is not an error.
    file_list = [
        file_name
        for file_name in os.listdir(pipeline_input_panda_directory)
        if file_name != 'dummy.txt'
    ]

    for temp_file in file_list:
        # NOTE(review): read_pickle can execute code embedded in the file;
        # inputs are presumably produced by the previous pipeline step only.
        input_panda = pandas.read_pickle(
            os.path.join(pipeline_input_panda_directory, temp_file)
        )

        # plb edit 2-6-2022
        # The taxonomy filter looks irrelevant because the data comes from
        # carrot, so every compound has all of species/organ/special (aka
        # disease). Rows without curated inchikeys are kept to function as
        # unknowns, so the two inchikey filters are irrelevant as well.
        # Hence remove_rows_with_no_taxonomy(input_panda), which would delete
        # rows whose parallel organ/species/special property lists are empty,
        # is intentionally not called.
        # We do this for the moment to streamline the small scale analysis;
        # will probably want to convert the standards to taxonomy "unspecified".

        # The first run of digits in the input file name identifies the output.
        temporary_file_integer = re.findall(r'\d+', temp_file)[0]
        output_name = 'binvestigate_ready_for_analysis_' + str(temporary_file_integer) + '.bin'
        input_panda.to_pickle(
            os.path.join(pipeline_output_directory, output_name),
            protocol=0,
        )