决策树 - Decision Tree

《人工智能通识课还真讲算法.jpg》

决策树

实现用的是 ID3 算法

  1. 计算对于当前数据集的目标分类的信息熵,记数据集为 其中, 是第 个分类
  2. 计算当前数据集每一个属性(除了目标分类属性)的信息增益
    1. 先根据第 个属性,记为 ,将原数据集按这一属性的属性值分组,我们记属性值为 的分组为 ,随后计算这一分组中目标分类的信息熵,注意,这边的 是分组后的。
    2. 计算该属性的加权平均信息熵,其中, 分别表示当前数据集中按 的分组的数据条数,与整个数据集中的数据个数。
    3. 计算信息增益
  3. 选取信息增益最大的一个属性作为决策树新一层的分支依据,即根据这一属性进行分类(决策)
  4. 递归进行 步进行建树,直到当前数据集只剩 条数据或当前数据集的标签已经全部用完。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#include <bits/stdc++.h>
using namespace std;

class DecisionTree {
public:
vector<DecisionTree *> leaves;
string name;
DecisionTree(string n) : name(n) {}
};

DecisionTree *createTree(string name, vector<vector<string>> dataset,
vector<string> tags, size_t label_id,
size_t attribute_count) {
// Calculate Entropy for target
double E_tar = 0.0;
double p_i;
unordered_map<string, size_t> label_type_cnt;
DecisionTree *tr = new DecisionTree(name);
for (auto &record : dataset) {
label_type_cnt[record[label_id]]++;
}
cout << "--------------------------------------------------------------------------" << endl;
cout << "[" << name << "] Entropy:";
for (auto &[label, freq] : label_type_cnt) {
p_i = freq / (double)dataset.size();
cout << " -" << p_i << " * log2(" << p_i << ")";
E_tar += -p_i * log2(p_i);
}
cout << " = " << E_tar << endl;
vector<double> E_A(attribute_count, 0.0); // E(label|attribute)
vector<double> G_A(attribute_count, 0.0); // InfoGain(label|attribute)
double gain_max = -1;
int gain_max_attribute_id = -1;
for (size_t attribute_id = 0; attribute_id < attribute_count;
attribute_id++) {
unordered_map<string, map<string, int>> attribute_types_label_types_cnt;
for (auto &record : dataset) {
attribute_types_label_types_cnt[record[attribute_id]][record[label_id]]++;
}
// ltc: label type count
cout << "[" << tags[attribute_id] << "] -------------------- " << endl;
for (auto &[attribute_value, ltc] : attribute_types_label_types_cnt) {
double E = 0;
int attribute_value_cnt = 0;
for (auto &[label, freq] : ltc) {
attribute_value_cnt += freq;
}
cout << " " << attribute_value
<< " P: " << (attribute_value_cnt / (double)dataset.size())
<< " E: ";
for (auto &[label, freq] : ltc) {
p_i = freq / (double)attribute_value_cnt;
cout << " -" << p_i << " * log2(" << p_i << ")";
E += -p_i * log2(p_i);
}
cout << " = " << E << endl;
E_A[attribute_id] += (attribute_value_cnt / (double)dataset.size()) * E;
}
G_A[attribute_id] = E_tar - E_A[attribute_id];
if (G_A[attribute_id] > gain_max)
gain_max = G_A[attribute_id], gain_max_attribute_id = attribute_id;
cout << " " << "E: " << E_A[attribute_id] << " G: " << G_A[attribute_id]
<< endl;
cout << "[" << tags[attribute_id] << "] -------------------- " << endl;
}
cout << "max: " << tags[gain_max_attribute_id] << " " << gain_max << endl;

DecisionTree *leaf = new DecisionTree(tags[gain_max_attribute_id]);
unordered_map<string, vector<vector<string>>> attribute_datesets;
tr->leaves.push_back(leaf);

vector<string> newTags = tags;
newTags.erase(newTags.begin() + gain_max_attribute_id);
for (auto &record : dataset) {
vector<string> newRecord = record;
newRecord.erase(newRecord.begin() + gain_max_attribute_id);
attribute_datesets[record[gain_max_attribute_id]].push_back(newRecord);
}

for (auto &[attribute_value, newDataset] : attribute_datesets) {
unordered_set<string> attribute_labels;
DecisionTree *trr = new DecisionTree(attribute_value);
for (auto &record : newDataset) {
attribute_labels.emplace(record[label_id - 1]);
}
if (newDataset.size() > 1 && attribute_labels.size() > 1) {
leaf->leaves.push_back(
createTree(attribute_value, newDataset, newTags,
label_id - 1, // minus 1 for earsed attribute
attribute_count - 1));
} else {
for (auto &label : attribute_labels) {
trr->leaves.push_back(new DecisionTree(label));
}
leaf->leaves.push_back(trr);
}
}
return tr;
}

int main() {
cout << fixed << setprecision(3);
// 0 1 2 3 4
// age income credit debt loan (label)
vector<string> tags = {"age", "income", "credit", "debt", "loan"};
vector<vector<string>> dataset = {
{"Young", "Low", "Fair", "High", "No"},
{"Young", "Low", "Good", "High", "No"},
{"Middle", "Low", "Good", "High", "Yes"},
{"Old", "Medium", "Good", "High", "Yes"},
{"Old", "High", "Fair", "Low", "Yes"},
{"Old", "High", "Good", "Low", "Yes"},
{"Middle", "High", "Good", "Low", "Yes"},
{"Young", "Medium", "Fair", "High", "No"},
{"Young", "High", "Good", "Low", "Yes"},
{"Old", "Medium", "Fair", "Low", "Yes"},
{"Young", "Medium", "Good", "High", "Yes"},
{"Middle", "Medium", "Poor", "High", "No"},
};
const size_t label_id = 4;
const size_t attribute_count = 4;
DecisionTree *tr =
createTree("Loan", dataset, tags, label_id, attribute_count);
cout << endl;
queue<tuple<int, DecisionTree *>> q;
q.push({0, tr});
int last_layer = 0;
while (!q.empty()) {
auto [layer, tp] = q.front();
q.pop();
if (layer != last_layer) {
last_layer = layer;
cout << endl;
}
cout << tp->name << " ";
for (auto &t : tp->leaves) {
q.push({layer + 1, t});
}
delete tp;
}
cout << endl;
return 0;
}