# randomforestdemo.py - forked from hungnguyengoc/rps_citi_mljun2018
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 13 21:36:24 2018
@author: Balasubramaniam
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import pdb
# Raw input file, and the CSV copy (with a header row) that main() loads
INPUT_PATH = "./inputs/breast-cancer-wisconsin.data"
OUTPUT_PATH = "./inputs/breast-cancer-wisconsin.csv"

# Column names in file order. main() treats every column except the first and
# the last as a feature, the last one as the target, and filters on index 6.
HEADERS = [
    "CodeNumber",
    "ClumpThickness",
    "UniformityCellSize",
    "UniformityCellShape",
    "MarginalAdhesion",
    "SingleEpithelialCellSize",
    "BareNuclei",
    "BlandChromatin",
    "NormalNucleoli",
    "Mitoses",
    "CancerType",
]
def read_data(path):
    """
    Load a delimited text file into a pandas dataframe
    :param path: file location, handed unchanged to pandas.read_csv
    :return: the dataframe built by pandas.read_csv with its default options
    """
    return pd.read_csv(path)
def get_headers(dataset):
    """
    Column labels of the dataset
    :param dataset: pandas dataframe
    :return: the array behind the dataframe's column index
    """
    column_index = dataset.columns
    return column_index.values
def add_headers(dataset, headers):
    """
    Rename the columns of the dataset
    :param dataset: pandas dataframe; its column labels are replaced in place
    :param headers: new column labels, one per existing column
    :return: the very same dataframe object that was passed in
    """
    # Assigning to .columns relabels the caller's dataframe, no copy is made
    dataset.columns = headers
    return dataset
def data_file_to_csv():
    """
    Convert the raw data file at INPUT_PATH into a CSV with a header row
    :return: None, the result is written to OUTPUT_PATH
    """
    # The raw file is expected to have no header row (the column names are
    # supplied here). header=None keeps its first record as data; with the
    # read_csv default that record would be consumed as column names and lost.
    dataset = pd.read_csv(INPUT_PATH, header=None, names=HEADERS)
    # Save the labelled dataset in csv format, without the pandas row index
    dataset.to_csv(OUTPUT_PATH, index=False)
    print("File saved ...!")
def split_dataset(dataset, train_percentage, feature_headers, target_header, random_state=None):
    """
    Split the dataset into train and test parts
    :param dataset: pandas dataframe holding both feature and target columns
    :param train_percentage: passed to train_test_split as train_size
    :param feature_headers: column labels selected as features
    :param target_header: column label selected as target
    :param random_state: optional seed forwarded to train_test_split so the
        split can be reproduced; None (default) keeps the unseeded behaviour
    :return: train_x, test_x, train_y, test_y
    """
    features = dataset[feature_headers]
    target = dataset[target_header]
    train_x, test_x, train_y, test_y = train_test_split(
        features,
        target,
        train_size=train_percentage,
        random_state=random_state,
    )
    return train_x, test_x, train_y, test_y
def handel_missing_values(dataset, missing_values_header, missing_label):
    """
    Drop the rows that carry the missing-value marker in one column
    :param dataset: pandas dataframe
    :param missing_values_header: label of the column to inspect
    :param missing_label: value that marks a missing entry in that column
    :return: the rows of dataset whose inspected column differs from missing_label
    """
    keep_row = dataset[missing_values_header] != missing_label
    return dataset[keep_row]
def random_forest_classifier(features, target):
    """
    Fit a RandomForestClassifier with default settings
    :param features: training feature data
    :param target: training target data
    :return: the fitted random forest classifier
    """
    # fit() returns the estimator itself, so the fitted model is returned directly
    return RandomForestClassifier().fit(features, target)
def dataset_statistics(dataset):
    """
    Print summary statistics of the dataset
    :param dataset: Pandas dataframe
    :return: None, the output of dataset.describe() is printed
    """
    summary = dataset.describe()
    print(summary)
def main():
    """
    Train a random forest on the CSV at OUTPUT_PATH and print the results
    :return: None, shapes, sample predictions, accuracies and the confusion
        matrix are reported through print
    """
    # Load the csv file into pandas dataframe
    dataset = pd.read_csv(OUTPUT_PATH)
    # Get basic statistics of the loaded dataset
    dataset_statistics(dataset)
    # Drop the rows whose HEADERS[6] column holds the '?' missing-value marker
    dataset = handel_missing_values(dataset, HEADERS[6], '?')
    # HEADERS[0] is left out of the features; the last header is the target
    train_x, test_x, train_y, test_y = split_dataset(dataset, 0.7, HEADERS[1:-1], HEADERS[-1])
    # Train and Test dataset size details
    print("Train_x Shape :: ", train_x.shape)
    print("Train_y Shape :: ", train_y.shape)
    print("Test_x Shape :: ", test_x.shape)
    print("Test_y Shape :: ", test_y.shape)
    # Create random forest classifier instance
    trained_model = random_forest_classifier(train_x, train_y)
    print("Trained model :: ", trained_model)
    predictions = trained_model.predict(test_x)
    # Preview at most five actual/predicted pairs. Slicing plus zip stops early
    # on a test split with fewer than five rows instead of raising IndexError,
    # and the target column is converted to a list only once.
    for actual, predicted in zip(list(test_y)[:5], predictions[:5]):
        print("Actual outcome :: {} and Predicted outcome :: {}".format(actual, predicted))
    print("Train Accuracy :: ", accuracy_score(train_y, trained_model.predict(train_x)))
    print("Test Accuracy :: ", accuracy_score(test_y, predictions))
    print(" Confusion matrix ", confusion_matrix(test_y, predictions))
# Run the demo only when this file is executed as a script, not when imported
if __name__ == "__main__":
    main()