"""
Define the Chow_liu Tree class
"""
#
from __future__ import print_function
import numpy as np
from Util import *
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import minimum_spanning_tree
from scipy.sparse.csgraph import depth_first_order
import sys
from collections import defaultdict
import os
import time
import random
'''
Class Chow-Liu Tree.
Members:
    nvariables: Number of variables.
    xycounts:
        Sufficient statistics: counts of value assignments to all pairs of variables.
        Four-dimensional array: the first two dimensions are variable indexes,
        the last two dimensions are value indexes 00, 01, 10, 11.
    xcounts:
        Sufficient statistics: counts of value assignments to each variable.
        First dimension is the variable, second dimension is the value index [0][1].
    xyprob:
        xycounts converted to probabilities by normalizing them.
    xprob:
        xcounts converted to probabilities by normalizing them.
    topo_order:
        Topological ordering over the variables.
    parents:
        Parent of each node. parents[i] gives the index of the parent of the
        variable indexed by i. If parents[i] == -9999 then i is the root node.
'''
class CLT:
    def __init__(self):
        self.nvariables = 0
        self.xycounts = np.ones((1, 1, 2, 2), dtype=int)
        self.xcounts = np.ones((1, 2), dtype=int)
        self.xyprob = np.zeros((1, 1, 2, 2))
        self.xprob = np.zeros((1, 2))
        self.topo_order = []
        self.parents = []
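        # All of the above are placeholders: they are resized and overwritten
        # when learn, update, or Bootstrap_learn is called on a dataset.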
    '''
    Learn the structure of the Chow-Liu Tree using the given dataset.
    '''
    def learn(self, dataset):
        self.nvariables = dataset.shape[1]
        self.xycounts = Util.compute_xycounts(dataset) + 1  # Laplace correction
        self.xcounts = Util.compute_xcounts(dataset) + 2  # Laplace correction
        self.xyprob = Util.normalize2d(self.xycounts)
        self.xprob = Util.normalize1d(self.xcounts)
        # Compute the mutual information score for all pairs of variables.
        # Weights are multiplied by -1.0 because we compute the minimum spanning tree.
        edgemat = Util.compute_MI_prob(self.xyprob, self.xprob) * (-1.0)
        edgemat[edgemat == 0.0] = 1e-20  # avoid the case where the tree is not connected
        # Compute the minimum spanning tree.
        Tree = minimum_spanning_tree(csr_matrix(edgemat))
        # Convert the spanning tree to a Bayesian network rooted at variable 0.
        self.topo_order, self.parents = depth_first_order(Tree, 0, directed=False)
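        # depth_first_order returns (node ordering, predecessor array); scipy
        # marks nodes with no predecessor (here, the root) with the sentinel
        # value -9999, which is why computeLL and getProb test for it below.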
    '''
    Update the Chow-Liu Tree using weighted samples.
    Note that we assume that the weight of each sample is > 0.
    Important function for performing updates when running EM.
    '''
    def update(self, dataset, weights):
        # Update the Chow-Liu Tree based on a weighted dataset.
        # We assume dataset.shape[0] equals weights.shape[0], i.e. each example has a weight.
        if not np.all(weights):
            print('Error: Weight of an example in the dataset is zero')
            sys.exit(-1)
        if weights.shape[0] == dataset.shape[0]:
            # Weighted Laplace correction; note that num-examples = dataset.shape[0].
            # If 1-Laplace smoothing is applied to num-examples, then the Laplace
            # smoothing applied to "sum-weights" equals "sum-weights / num-examples".
            smooth = np.sum(weights) / dataset.shape[0]
            self.xycounts = Util.compute_weighted_xycounts(dataset, weights) + smooth
            self.xcounts = Util.compute_weighted_xcounts(dataset, weights) + 2.0 * smooth
        else:
            print('Error: Each example must have a weight')
            sys.exit(-1)
        self.xyprob = Util.normalize2d(self.xycounts)
        self.xprob = Util.normalize1d(self.xcounts)
        # Score edges by mutual information over the normalized probabilities, as in learn.
        edgemat = Util.compute_MI_prob(self.xyprob, self.xprob) * (-1.0)
        edgemat[edgemat == 0.0] = 1e-20  # avoid the case where the tree is not connected, as in learn
        Tree = minimum_spanning_tree(csr_matrix(edgemat))
        self.topo_order, self.parents = depth_first_order(Tree, 0, directed=False)
    '''
    Compute the log-likelihood score of the dataset.
    '''
    def computeLL(self, dataset):
        ll = 0.0
        for i in range(dataset.shape[0]):
            for x in self.topo_order:
                assignx = dataset[i, x]
                if self.parents[x] == -9999:
                    # x is the root: use its marginal probability
                    ll += np.log(self.xprob[x][assignx])
                else:
                    # otherwise use the conditional probability p(x | parent(x))
                    y = self.parents[x]
                    assigny = dataset[i, y]
                    ll += np.log(self.xyprob[x, y, assignx, assigny] / self.xprob[y, assigny])
        return ll
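    # getProb evaluates the same tree factorization as computeLL for a single
    # sample, but in probability space rather than log space:
    #   P(x) = P(x_root) * prod_{i != root} P(x_i | x_parent(i))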
    def getProb(self, sample):
        prob = 1.0
        for x in self.topo_order:
            assignx = sample[x]
            if self.parents[x] == -9999:
                # x is the root: use its marginal probability
                prob *= self.xprob[x][assignx]
            else:
                # otherwise use the conditional probability p(x | parent(x))
                y = self.parents[x]
                assigny = sample[y]
                prob *= self.xyprob[x, y, assignx, assigny] / self.xprob[y, assigny]
        return prob
    '''
    Learn the Chow-Liu Tree from weighted samples, then randomly remove
    hyperparameterR candidate edges from the mutual-information matrix before
    computing the spanning tree (a randomized perturbation for bootstrapping).
    '''
    def Bootstrap_learn(self, dataset, weights, hyperparameterR):
        self.nvariables = dataset.shape[1]
        if not np.all(weights):
            print('Error: Weight of an example in the dataset is zero')
            sys.exit(-1)
        if weights.shape[0] == dataset.shape[0]:
            # Weighted Laplace correction, as in update.
            smooth = np.sum(weights) / dataset.shape[0]
            self.xycounts = Util.compute_weighted_xycounts(dataset, weights) + smooth
            self.xcounts = Util.compute_weighted_xcounts(dataset, weights) + 2.0 * smooth
        else:
            print('Error: Each example must have a weight')
            sys.exit(-1)
        self.xyprob = Util.normalize2d(self.xycounts)
        self.xprob = Util.normalize1d(self.xcounts)
        # Score edges by mutual information over the normalized probabilities, as in learn.
        edgemat = Util.compute_MI_prob(self.xyprob, self.xprob) * (-1.0)
        edgemat[edgemat == 0.0] = 1e-20  # avoid accidental disconnection, as in learn
        # Randomly drop hyperparameterR candidate edges. The edge matrix is
        # nvariables x nvariables, so indexes are drawn from [0, nvariables).
        for r in range(hyperparameterR):
            randomRow = random.randrange(0, self.nvariables)
            randomCol = random.randrange(0, self.nvariables)
            edgemat[randomRow, randomCol] = 0
            edgemat[randomCol, randomRow] = 0
        Tree = minimum_spanning_tree(csr_matrix(edgemat))
        self.topo_order, self.parents = depth_first_order(Tree, 0, directed=False)
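

# A minimal usage sketch (assuming the Util module imported above provides
# compute_xycounts, compute_xcounts, the weighted variants, normalize2d,
# normalize1d, and compute_MI_prob with the signatures used in this file).
# The data below is synthetic binary data, purely for illustration.
if __name__ == '__main__':
    np.random.seed(0)
    data = np.random.randint(0, 2, size=(500, 10))  # 500 samples, 10 binary variables

    clt = CLT()
    clt.learn(data)
    print('Average log-likelihood:', clt.computeLL(data) / data.shape[0])
    print('Probability of the first sample:', clt.getProb(data[0]))

    # One weighted update, e.g. a single EM-style step with uniform weights.
    weights = np.ones(data.shape[0])
    clt.update(data, weights)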