-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdynamWeightBandwidth.py
154 lines (112 loc) · 4.87 KB
/
dynamWeightBandwidth.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#finding the best radius to use rather than hard-coding it!
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
from sklearn.datasets.samples_generator import make_blobs
# Apply matplotlib's 'ggplot' style sheet to the figures drawn by this script.
style.use('ggplot')
# Generate a toy data set: 15 samples with 2 features, grouped around 3 centers.
# X holds the samples; y (presumably the generated cluster labels - confirm
# against the make_blobs documentation) is not used in the visible code.
X, y = make_blobs(n_samples=15, centers=3, n_features=2)
##X = np.array([[1, 2],
## [1.5, 1.8],
## [5, 8],
## [8, 8],
## [1, 0.6],
## [9, 11],
## [8, 2],
## [10, 2],
## [9, 3]])
##plt.scatter(X[:, 0],X[:, 1], marker = "x", s=150, linewidths = 5, zorder = 10)
##plt.show()
'''
1. Start with every data point as its own cluster center.
2. Take the mean of the data within the radius around each cluster center, and set that mean as the new cluster center.
3. Repeat step 2 until convergence.
'''
class Mean_Shift:
    """Mean-shift clustering with a stepped, distance-based weighting.

    Every sample starts out as its own centroid. On each pass a centroid is
    moved to the weighted mean of all samples, where a sample's weight drops
    in ``radius_norm_step`` discrete steps of width ``radius`` as it gets
    farther away. Centroids that end up within one radius of each other are
    merged. Passes repeat until the centroids stop changing.

    Attributes set by ``fit``:
        centroids: dict mapping cluster index -> centroid (``np.ndarray``).
        classifications: dict mapping cluster index -> list of the samples
            whose nearest centroid is that cluster.
    """

    def __init__(self, radius=None, radius_norm_step=100):
        """Store the clustering parameters.

        Args:
            radius: Width of one weighting step. When ``None``, ``fit``
                derives it from the data.
            radius_norm_step: Number of weighting steps; also the divisor
                used when the radius is derived automatically.
        """
        self.radius = radius
        self.radius_norm_step = radius_norm_step

    def fit(self, data):
        """Find the centroids of ``data`` and assign every sample to one.

        Args:
            data: 2-D array-like of samples, one row per sample.

        Side effects:
            Sets ``self.centroids`` and ``self.classifications``. When no
            radius was supplied, also sets ``self.radius`` and prints it.

        Note:
            The loop stops only when every centroid is exactly equal to its
            predecessor (``np.array_equal``); there is no iteration cap.
        """
        data = np.asarray(data)

        if self.radius is None:
            # Centre of all of the data ...
            all_data_centroid = np.average(data, axis=0)
            # ... and its magnitude measured from the origin.
            all_data_norm = np.linalg.norm(all_data_centroid)
            # One step is that magnitude divided by the number of steps.
            # Data whose mean sits on the origin therefore yields a radius
            # of zero.
            self.radius = all_data_norm / self.radius_norm_step
            print(self.radius)

        # Every sample starts as its own centroid.
        centroids = {i: sample for i, sample in enumerate(data)}

        # Weights run from step-1 down to 0 (e.g. 99, 98, ..., 0): the more
        # steps away a sample is, the less it counts.
        weights = list(range(self.radius_norm_step))[::-1]

        while True:
            new_centroids = [
                tuple(self._shift_centroid(centroids[i], data, weights))
                for i in centroids
            ]

            # Exact duplicates collapse in the set; near-duplicates are
            # merged explicitly.
            uniques = self._merge_close(sorted(set(new_centroids)))

            prev_centroids = dict(centroids)
            centroids = {i: np.array(u) for i, u in enumerate(uniques)}

            # The number of centroids never grows, so every index in
            # `centroids` also exists in `prev_centroids`.
            if all(np.array_equal(centroids[i], prev_centroids[i])
                   for i in centroids):
                break

        self.centroids = centroids
        self.classifications = {i: [] for i in range(len(self.centroids))}

        for featureset in data:
            # Each sample belongs to the cluster of its nearest centroid.
            classification = self._nearest_centroid(featureset)
            self.classifications[classification].append(featureset)

    def predict(self, data):
        """Return the index of the centroid nearest to the sample ``data``."""
        return self._nearest_centroid(data)

    def _shift_centroid(self, centroid, data, weights):
        """Return the weighted mean of ``data`` as seen from ``centroid``."""
        last_step = self.radius_norm_step - 1
        sample_weights = []
        for featureset in data:
            distance = np.linalg.norm(featureset - centroid)
            if distance == 0:
                distance = 0.00000000001
            # How many steps away the sample is, clamped to the last step.
            weight_index = min(int(distance / self.radius), last_step)
            sample_weights.append(weights[weight_index] ** 2)

        if not any(sample_weights):
            # No sample carries any weight, so there is nothing to move
            # towards; leave the centroid where it is.
            return centroid

        # A weighted average gives the same mean as repeating every sample
        # `weight` times, without building that list.
        return np.average(data, axis=0, weights=sample_weights)

    def _merge_close(self, candidates):
        """Drop every candidate within one radius of an already kept one."""
        kept = []
        for candidate in candidates:
            point = np.array(candidate)
            # Keep the first member of each close group. Removing both
            # members of a close pair could wipe out a whole cluster.
            if all(np.linalg.norm(point - np.array(other)) > self.radius
                   for other in kept):
                kept.append(candidate)
        return kept

    def _nearest_centroid(self, featureset):
        """Return the index of the centroid closest to ``featureset``."""
        distances = [np.linalg.norm(featureset - self.centroids[centroid])
                     for centroid in self.centroids]
        return distances.index(min(distances))
# Cluster the toy data; no radius is passed, so fit() derives one itself.
clf = Mean_Shift()
clf.fit(X)

centroids = clf.centroids
print(centroids)

# Six base colours repeated ten times, indexed by cluster label.
colors = 10*['r','g','b','c','k','y']

# Draw every sample as an "x" in the colour of the cluster it was assigned to.
for label, members in clf.classifications.items():
    member_color = colors[label]
    for point in members:
        plt.scatter(point[0], point[1], marker="x", color=member_color,
                    s=150, linewidths=5, zorder=10)

# Overlay each centroid as a black star.
for center in centroids.values():
    plt.scatter(center[0], center[1], color='k', marker="*",
                s=150, linewidths=5)

plt.show()