import numpy as np
import pickle


class BaseDecisionTree:
    def entropy(self, y):
        # Shannon entropy in bits: H(y) = -sum(p * log2(p)).
        _, cnt = np.unique(y, return_counts=True)
        probs = cnt / y.shape[0]
        return -np.sum(probs * np.log2(probs))

    def gini(self, y):
        # Gini impurity: 1 - sum(p^2).
        _, cnt = np.unique(y, return_counts=True)
        probs = cnt / y.shape[0]
        return 1 - np.sum(np.square(probs))

    def calcInfoGainRatio(self, y, y_sub):
        # C4.5 gain ratio: information gain divided by the split's
        # intrinsic value (log2, to stay consistent with entropy()).
        n_samples = y.shape[0]
        gain = self.calcInfoGain(y, y_sub)
        iv = -np.sum([ny.size / n_samples * np.log2(ny.size / n_samples) for ny in y_sub])
        return gain / iv

    def calcInfoGain(self, y, y_sub):
        # Information gain: parent entropy minus the size-weighted
        # entropy of the child subsets.
        n_samples = y.shape[0]
        base_entropy = self.entropy(y)
        new_entropy = np.sum([ny.size / n_samples * self.entropy(ny) for ny in y_sub])
        return base_entropy - new_entropy

    def calcGiniGain(self, y, y_sub):
        # Gini gain: parent impurity minus the size-weighted impurity
        # of the child subsets.
        n_samples = y.shape[0]
        base_gini = self.gini(y)
        new_gini = np.sum([ny.size / n_samples * self.gini(ny) for ny in y_sub])
        return base_gini - new_gini

    def storeTree(self, filename='DecisionTree.txt'):
        # Serialize the fitted tree with pickle.
        with open(filename, 'wb') as fw:
            pickle.dump(self.decisionTree, fw)

    def loadTree(self, filename='DecisionTree.txt'):
        # Restore a tree previously saved by storeTree().
        with open(filename, 'rb') as fr:
            self.decisionTree = pickle.load(fr)
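

# A minimal sketch of the impurity criteria on a toy label vector; the
# array values and the helper name _demo_criteria are illustrative
# assumptions, not part of the original API.
def _demo_criteria():
    y = np.array([[0], [0], [1], [1]])            # two classes, 50/50
    base = BaseDecisionTree()
    print(base.entropy(y))                        # 1.0 bit for a 50/50 split
    print(base.gini(y))                           # 0.5
    # splitting into two pure halves recovers the full entropy
    print(base.calcInfoGain(y, [y[:2], y[2:]]))   # 1.0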


class BinaryDecisionTree(BaseDecisionTree):
    """CART-style tree: numeric features, binary threshold splits.

    :param loss: {"info_gain", "info_gain_ratio", "gini"}, default="info_gain"
    """
    def __init__(self, loss='info_gain'):
        self.loss = loss
        self.decisionTree = {}

    def fit(self, x, y):
        # x: (n_samples, n_features) numeric array;
        # y: (n_samples, 1) column vector of non-negative integer labels.
        self.decisionTree = self.createTree(x, y)

    def createTree(self, x, y):
        # All samples share one label: return it as a leaf.
        if np.unique(y).size == 1:
            return y[0][0]
        index, threshold = self.split(x, y)
        # No split improves the criterion: return the majority label.
        if index == -1:
            return np.argmax(np.bincount(y.flatten()))
        decisionTree = {'split_index': index, 'threshold': threshold}
        pos = x[:, index] < threshold
        decisionTree['left'] = self.createTree(x[pos], y[pos])
        decisionTree['right'] = self.createTree(x[~pos], y[~pos])
        return decisionTree

    def split(self, x, y):
        n_feat = x.shape[1]
        opt_loss, split_index, threshold = 0., -1, None
        for i in range(n_feat):
            # Candidate thresholds: midpoints between consecutive unique values.
            unique = np.unique(x[:, i])
            split_val = (unique[:-1] + unique[1:]) / 2
            for val in split_val:
                pos = x[:, i] < val
                y_sub = [y[pos], y[~pos]]
                if self.loss == 'info_gain':
                    new_loss = self.calcInfoGain(y, y_sub)
                elif self.loss == 'info_gain_ratio':
                    new_loss = self.calcInfoGainRatio(y, y_sub)
                else:
                    new_loss = self.calcGiniGain(y, y_sub)
                if new_loss > opt_loss:
                    opt_loss, split_index, threshold = new_loss, i, val
        return split_index, threshold

    def classify(self, inputs, node):
        # A leaf stores a bare label rather than a dict.
        if not isinstance(node, dict):
            return node
        nxt = node['left'] if inputs[node['split_index']] < node['threshold'] else node['right']
        return self.classify(inputs, nxt)

    def predict(self, inputs):
        res = [self.classify(sample, self.decisionTree) for sample in inputs]
        return np.array(res)
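

# A minimal usage sketch for BinaryDecisionTree; the toy dataset and the
# helper name _demo_binary are illustrative assumptions.
def _demo_binary():
    x = np.array([[1.0], [2.0], [3.0], [4.0]])
    y = np.array([[0], [0], [1], [1]])
    clf = BinaryDecisionTree(loss='gini')
    clf.fit(x, y)                                  # learns the split x[0] < 2.5
    print(clf.predict(np.array([[1.5], [3.5]])))   # [0 1]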


class MultiwayDecisionTree(BaseDecisionTree):
    """ID3/C4.5-style tree: categorical features, one branch per value.

    :param loss: {"info_gain", "info_gain_ratio", "gini"}, default="info_gain"
    """
    def __init__(self, loss='info_gain'):
        self.loss = loss
        self.decisionTree = {}

    def fit(self, x, y):
        # x: (n_samples, n_features) array of categorical values;
        # y: (n_samples, 1) column vector of non-negative integer labels.
        used = np.full((x.shape[1],), False)
        self.decisionTree = self.createTree(x, y, used)

    def createTree(self, x, y, used):
        # All samples share one label: return it as a leaf.
        if np.unique(y).size == 1:
            return y[0][0]
        most_freq = np.argmax(np.bincount(y.flatten()))
        # Every feature already used on this path: return the majority label.
        if np.all(used):
            return most_freq
        index = self.split(x, y, used)
        if index == -1:
            return most_freq
        # Copy the mask so the feature is consumed only within this
        # subtree, not in sibling branches elsewhere in the tree.
        child_used = used.copy()
        child_used[index] = True
        # 'others' catches feature values unseen during training.
        decisionTree = {'split_index': index, 'others': most_freq}
        for val in np.unique(x[:, index]):
            pos = x[:, index] == val
            decisionTree[val] = self.createTree(x[pos], y[pos], child_used)
        return decisionTree

    def split(self, x, y, used):
        n_feat = x.shape[1]
        opt_loss, split_index = 0., -1
        for i in range(n_feat):
            if used[i]:
                continue
            # Partition the labels by each distinct value of feature i.
            unique = np.unique(x[:, i])
            y_sub = [y[x[:, i] == val] for val in unique]
            if self.loss == 'info_gain':
                new_loss = self.calcInfoGain(y, y_sub)
            elif self.loss == 'info_gain_ratio':
                new_loss = self.calcInfoGainRatio(y, y_sub)
            else:
                new_loss = self.calcGiniGain(y, y_sub)
            if new_loss > opt_loss:
                opt_loss, split_index = new_loss, i
        return split_index

    def classify(self, inputs, node):
        # A leaf stores a bare label rather than a dict.
        if not isinstance(node, dict):
            return node
        # Fall back to the majority label for unseen feature values.
        if inputs[node['split_index']] not in node:
            return node['others']
        return self.classify(inputs, node[inputs[node['split_index']]])

    def predict(self, inputs):
        res = [self.classify(sample, self.decisionTree) for sample in inputs]
        return np.array(res)
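

# A minimal usage sketch for MultiwayDecisionTree on categorical integer
# features; the toy dataset and _demo_multiway are illustrative assumptions.
def _demo_multiway():
    x = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    y = np.array([[0], [0], [1], [1]])
    clf = MultiwayDecisionTree(loss='info_gain')
    clf.fit(x, y)                                  # splits on feature 0
    print(clf.predict(np.array([[0, 1], [1, 0]])))  # [0 1]


if __name__ == '__main__':
    _demo_criteria()
    _demo_binary()
    _demo_multiway()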