-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhelperFuncs.jl
220 lines (159 loc) · 5.4 KB
/
helperFuncs.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
#=
helperFuncs
Copyright © 2014 Vera Abaimova <[email protected]>
Mark Wells <[email protected]>
Distributed under terms of the MIT license.
=#
## This file contains all of the helper functions used throughout
## the project from preprocessing to clustering
using FITSIO
#############################################################
## Get desired data (not directly related to a singular purpose)
## Get the KIDs that are found in the directory of FITS files
function dir_KIDs(dir_name::String)
    #=
    Return the unique Kepler ID numbers (KIDs) found among the file names
    of the given directory.

    Only names of the form kplr#########-... are considered, where
    ######### is the 9 digit KID. Every other entry is reported on
    stdout and skipped.

    Arguments:
        dir_name -- path of the directory holding the FITS files
    Returns:
        an array with one 9 character KID string per distinct KID, in the
        order in which readdir first lists them
    =#
    println("Reading directory: " * dir_name)
    ## Match the whole "kplr<9 digits>-" prefix instead of blindly slicing
    ## characters 5-13, so that unrelated files with long names are not
    ## mistaken for KIDs
    kid_pattern = r"^kplr(\d{9})-"
    kids = String[]
    ## KIDs already collected; makes the result independent of the order
    ## in which the directory entries are listed
    seen = Set{String}()
    for file in readdir(dir_name)
        m = match(kid_pattern, file)
        if m === nothing
            println("Unexpected file in source directory: ",file)
            continue
        end
        kid = string(m.captures[1])
        if !(kid in seen)
            push!(seen, kid)
            push!(kids, kid)
        end
    end
    println("Discovered ",length(kids)," unique Kepler ID numbers")
    return kids
end
## This function takes a directory name and a Kepler ID number
## and returns the first file instance for that KID
function firstInstOfKID(dirName::String, kid::String)
    #=
    Return the name of the first entry of `dirName` (in readdir order)
    whose name contains `kid`.
    =#
    entries = readdir(dirName)
    ## One flag per entry: does its name contain the Kepler ID?
    hasKid = [contains(entry, kid) for entry in entries]
    ## Position of the first flagged entry
    firstHit = findfirst(hasKid)
    return entries[firstHit]
end
## Get the features from the file indicated
function getFeatures(file::String)
    #=
    Read the CSV file `file` and split it into its first column and the
    remaining columns.

    Returns:
        the first column (the Kepler IDs, left as strings) and the
        remaining columns converted with float
    =#
    ## Every cell is read as a String
    cells = readcsv(file, String)
    ids = cells[:, 1]
    featureCols = cells[:, 2:end]
    return ids, float(featureCols)
end
## Get data from a file in a particular directory, specifically when combining it
function getDataFromFile(dir::String,file::String)
    #=
    Read the CSV file `file` located in the directory `dir`.

    Arguments:
        dir  -- directory that holds the file, with or without a
                trailing path separator
        file -- name of the CSV file inside `dir`
    Returns:
        the first column (the Kepler IDs) and the remaining columns,
        all cells read as ASCIIString
    =#
    ## joinpath inserts a separator only when `dir` does not already end
    ## in one, so the path is right whether or not the caller normalised
    ## `dir` beforehand (plain concatenation would yield "dirfile")
    path = joinpath(dir, file)
    println("reading: ",path)
    table = readcsv(path,ASCIIString)
    kids = table[:,1]
    features = table[:,2:end]
    return kids,features
end
##############################################################
## Process and Sort Data
## Impute data to handle missing values
function imputeData(data)
    #=
    Fill in the missing values of `data` and return the result.

    Entries equal to -9999 are treated as missing and replaced using the
    "median" strategy along axis 0 (the median of the column).
    NOTE(review): `preprocessing` is defined outside this function --
    presumably scikit-learn's preprocessing module; confirm.
    =#
    medianImputer = preprocessing.Imputer(missing_values=-9999, strategy="median", axis=0)
    ## Fit the imputer on the data and transform it in one step
    return medianImputer[:fit_transform](data)
end
## Sort the data by Kepler ID number
function sortData(kids,features)
    #=
    Order the Kepler IDs by their integer value and rearrange the rows
    of `features` the same way.

    Returns:
        the reordered IDs and the reordered features
    =#
    ## Permutation that sorts the ID strings numerically
    order = sortperm(kids, by=int)
    ## Apply the same permutation to the IDs and to the feature rows so
    ## that they stay aligned
    sortedKids = kids[order]
    sortedFeatures = features[order, :]
    return sortedKids, sortedFeatures
end
## Sort the Galex data by Kepler ID
function sortGalex(galexFile::String)
    #=
    Read the Galex CSV file and return its Kepler IDs and features,
    both ordered by Kepler ID.

    The IDs are taken from the last column and the features from the
    first column of the file.
    =#
    table = readcsv(galexFile)
    ## IDs: converted to integers, then left-padded with zeros to 9 characters
    galexKids = map((id) -> lpad(id, 9, "0"), int(table[:, end]))
    galexFeatures = float(table[:, 1])
    ## Permutation that sorts the padded ID strings by their integer value
    order = sortperm(galexKids, by=int)
    return galexKids[order], galexFeatures[order, :]
end
##############################################################
## Testing Directories
## Test to see if a directory exists
function test_fits_dir(dir::String)
    #=
    Check that the FITS directory `dir` can be listed.

    Prints a confirmation when readdir succeeds. Otherwise prints a hint
    pointing at SETTINGS.txt and throws the string
    "Could not find FITs directory!".
    =#
    ## Only the success or failure of readdir matters; its result is discarded
    listable = try
        readdir(dir)
        true
    catch
        false
    end
    if !listable
        println("Please check SETTINGS.txt file")
        ## this will halt execution of the entire program with a helpful message
        throw("Could not find FITs directory!")
    end
    println("Found FITs directory!")
end
## Make a directory if one by that name is not present
function make_if_needed(dir::String)
    #=
    Create the directory `dir` unless it already exists.

    Missing parent directories are created as well, so nested paths
    work. An existing directory is left untouched.

    Arguments:
        dir -- path of the directory that must exist afterwards
    Returns:
        nothing
    =#
    ## Ask the file system directly instead of using a failing readdir
    ## as the "missing" signal, which also swallowed unrelated errors
    if !isdir(dir)
        mkpath(dir)
    end
    return nothing
end
## Make sure that the directory name ends with a /
## so that any operation that appends to the file path
## is carried out successfully
function checkDirEnd(dirName::String)
    #=
    Return `dirName` with a trailing "/" so that a file name can be
    appended to it directly.

    Arguments:
        dirName -- directory name, with or without a trailing "/"
    Returns:
        `dirName` unchanged when it already ends in "/", otherwise
        `dirName` followed by "/"; an empty name yields "./"
    =#
    ## An empty name has no last character to inspect. Appending "/" would
    ## turn it into the file system root, so map it to the current
    ## directory instead.
    if isempty(dirName)
        return "./"
    end
    if !endswith(dirName, "/")
        dirName = dirName * "/"
    end
    return dirName
end
##############################################################
## Graphing
## Create a bar graph indicating the cluster membership
function createBarGraph(labels)
    #=
    Draw a bar graph of the cluster membership: one bar per distinct
    label, whose height is the number of entries of `labels` carrying
    that label.

    Opens a new figure and prints the label names and their counts.
    NOTE(review): `figure` and `bar` are defined outside this function --
    presumably the PyPlot/matplotlib wrappers; confirm.

    Arguments:
        labels -- the cluster label assigned to each item (numeric, since
                  the labels are used as x positions)
    Returns:
        whatever `bar` returns
    =#
    figure()
    ## Get the unique label names, in ascending order, for the x-axis
    labelNames = sort(unique(labels))
    println("length of labelNames: ", length(labelNames))
    println("labelNames: ", labelNames)
    ## Get the count for each label. A histogram with length(labels) bins
    ## returns one count per bin, which does not line up with the distinct
    ## labels, so count the occurrences of every label explicitly.
    counts = [count(x -> x == name, labels) for name in labelNames]
    println("size of counts: ", length(counts))
    println("counts: ", counts)
    ## Graph it, shifting each bar left by half its width
    ## (assumes bar() anchors bars at their left edge -- TODO confirm for
    ## the plotting library version in use)
    width = 0.9
    return bar(labelNames .- width/2, counts, width=width)
end