第一次種樹:先向大佬學習:https://blog.csdn.net/yangliuy/article/details/7322015
話不多說,開始幹!
決策樹作爲最常用的機器學習方法,也是最容易理解的算法,顧名思義,就是對當前樣本做出決策。
舉個栗子:
晴天,空氣溼度正常–>可以外出活動
但是決策樹不是這麼簡單的照本宣科,它的一大功能:對未知屬性集合做出決策
這其實也暗示着:得到一棵決策樹不需要通過訓練所有屬性集合
開始構建決策樹:
一棵決策樹有且僅有一個根節點(outlook),以及至少一個葉節點(No/Yes)
構建決策樹的過程==挑選最優節點的過程
- 選擇最優節點的依據:我們希望決策樹的分支結點所包含的樣本儘可能屬於同一類別,即結點的"純度"(purity)儘可能的高。而信息熵(information entropy)是度量樣本集合純度最常用的一種指標,假定當前樣本集合S中第i類樣本所佔比例爲Pi,則S的信息熵定義爲 Ent(S) = -∑i Pi·log2(Pi)
若Ent(S)的值越小,則S的純度越高 - 假定離散屬性A有V個可能取值{A1,A2,A3,…,AV},若使用A來對樣本集S進行劃分,則會產生V個分支結點,其中第v個分支結點包含了S中所有在屬性A上取值爲Av的樣本,記爲Sv。再根據各分支樣本所佔比重賦予權重 |Sv| / |S|,可得"信息增益"(information gain): Gain(S,A) = Ent(S) - ∑v (|Sv|/|S|)·Ent(Sv)
一般而言,信息增益越大,說明使用屬性A劃分所獲得的“純度提升”越大。這篇博客就是以信息增益爲準則進行劃分屬性的 ID3決策樹
注意:由於這個決策樹只是爲了在項目要求的環境下運行,所以有些數據格式的轉化是必需的。還有博主爲了方便(畢竟ID3本身容易過擬合,不去測試也知道結果怎麼樣),沒有去分割測試集,直接拿訓練集來測試,所以測試集要自己分割,建議採用bagging法
數據集已經上傳,請自行下載
下面是完整的決策樹代碼:
#include <iostream>
#include <string>
#include <vector>
#include <map>
#include <algorithm>
#include <cmath>
#include <fstream>
#include<sstream>
using namespace std;
const int row = 12960;//total number of records in the data set
const int feather = 9;//8 attribute columns + 1 label column ("feather" presumably means "feature")
int mmData[row][feather];//raw data table filled by loaddata(); 12960 rows of 9 integer codes
// The Enclave cannot accept std::string data, so encode a decimal digit
// string as an int. Assumes every character is a digit; "" yields 0.
int toNum(string str)
{
	int value = 0;
	for (char ch : str)
		value = value * 10 + (ch - '0');
	return value;
}
void loaddata(string path)//讀取文本數據,並存儲在二維數組
{
ifstream Filein;
try { Filein.open(path); }
catch (exception e)
{
cout << "File open failed!";
}
string line;
int data_num = 0;
while (getline(Filein, line)) {
int before = 0;
int cnt = 0;
data_num++;
for (unsigned int i = 0; i < line.length(); i++) {
if (line[i] == ',' || line[i] == '\n') {
string sub = line.substr(before, i - before);
before = i + 1;
mmData[data_num - 1][cnt] = toNum(sub);
cnt++;
}
}
mmData[data_num - 1][cnt] = toNum(line.substr(before, line.length()));
}
cout << "data loading done.\nthe amount of data is: " << data_num << endl;
}
int tree_size = 0;//number of nodes released by FreeTree (counted during destruction)
vector<vector<int>>Data;//instance set: all samples copied out of mmData
vector<string>attribute_row;//attribute (column) names, label column included
vector<int>item(feather);//one full data row -- NOTE(review): appears unused in this file
// Integer codes for the five class labels (91..95).
int not_recom(91);
int recommend(92);
int very_recom(93);
int priority(94);
int spec_prior(95);
int blank(0);//sentinel meaning "no arrived value yet"
map<string, vector < int > > map_attribute_values;//for each attribute name, every distinct value it takes
struct Node {//one decision-tree node
	int attribute;//class label (91..95) when this node decides the outcome
	string bestAttribute;//name of the attribute this node splits on
	int arrived_value;//attribute value on the edge leading into this node
	bool LeafNode;//true when this node carries a final decision
	vector<Node *> childs;//one child per value of the split attribute
	Node() : attribute(0), bestAttribute(""), arrived_value(blank), LeafNode(false) {}
};
Node * root;//root of the tree built by BuildSgxTree
void setAttribute()
{
string att[9] = { "parents","has_nurs","form","children","housing","finance","socila","health","Distribution" };
for (int i = 0; i < feather; i++)
attribute_row.push_back(att[i]);
}
// Copy the fixed-size C array `maindata` into the global vector-of-vectors Data.
void GetSgxData(int maindata[12960][9])
{
	for (int r = 0; r < row; r++)
	{
		//build the row directly from the array's iterator range
		vector<int> one_row(maindata[r], maindata[r] + feather);
		Data.push_back(one_row);
	}
}
// Build the attribute dictionary: for every attribute column (label column
// excluded) collect the distinct values occurring in Data, and store them in
// map_attribute_values keyed by the attribute's name.
void ComputeMapFrom2DVector() {
	for (int col = 0; col < feather - 1; col++) {//walk column by column
		vector<int> distinct;
		for (unsigned int r = 0; r < Data.size(); r++) {
			int v = Data[r][col];
			if (find(distinct.begin(), distinct.end(), v) == distinct.end())
				distinct.push_back(v);//first occurrence of this value in the column
		}
		map_attribute_values[attribute_row[col]] = distinct;
		cout << distinct[0] << endl;//debug print kept from the original
	}
}
// Information entropy (log base 2) of the class-label column (codes 91..95)
// of remain_data, restricted to rows whose `attribute` column equals `value`
// (or over all rows when ifparent is true). Returns 0 for an empty or pure
// selection.
// BUG FIX: the original returned 0 whenever ANY of the five class counts was
// zero -- a condition copied from the 2-class case. With 5 classes almost
// every node misses some class, so impure nodes were reported as pure and the
// computed gains were meaningless. The correct treatment drops zero-count
// terms (lim p->0 of -p*log p = 0).
double ComputeEntropy(const vector <vector <int> > &remain_data, string attribute, int value, bool ifparent) {
	vector<int> count(5, 0);
	for (unsigned int j = 0; j < feather; j++) {
		if (attribute_row[j].compare(attribute)) continue;//not the requested column
		for (unsigned int i = 0; i < remain_data.size(); i++) {
			if (ifparent || remain_data[i][j] == value) {//ifparent = no filtering
				int label = remain_data[i][feather - 1];
				if (label >= 91 && label < 96)
					count[label - 91]++;
			}
		}
		break;//column found and counted; nothing further to scan
	}
	double sum = 0;
	for (int k = 0; k < 5; k++) sum += count[k];
	if (sum == 0) return 0;//empty selection
	double entropy = 0;
	for (int k = 0; k < 5; k++) {
		if (count[k] == 0) continue;//zero-probability term contributes nothing
		double p = count[k] / sum;
		entropy += -p * log(p) / log(2.0);//change of base: log2(p) = ln(p)/ln(2)
	}
	return entropy;
}
// Information gain of splitting remain_data on `attribute`: parent entropy
// minus the ratio-weighted entropies of the children (one per value).
// FIX: the original took the whole sample matrix BY VALUE (a deep copy of up
// to 12960 rows per call) and re-scanned attribute_row once per value; the
// column index is now found once and the matrix passed by const reference.
// Numerical behavior is unchanged.
double ComputeGain(const vector <vector <int> > &remain_data, string attribute) {
	//entropy before any split
	double parent_entropy = ComputeEntropy(remain_data, attribute, 0, true);
	double children_entropy = 0;
	//all possible values this attribute can take
	vector<int> values = map_attribute_values[attribute];
	//locate the attribute's column index once
	int col = -1;
	for (int k = 0; k < feather - 1; k++) {
		if (!attribute_row[k].compare(attribute)) { col = k; break; }
	}
	for (unsigned int m = 0; m < values.size(); m++) {
		//count the samples taking this value
		int count_value = 0;
		if (col >= 0) {
			for (unsigned int j = 0; j < remain_data.size(); j++) {
				if (remain_data[j][col] == values[m]) count_value++;
			}
		}
		double ratio = (double)count_value / (double)(remain_data.size());
		children_entropy += ratio * ComputeEntropy(remain_data, attribute, values[m], false);
	}
	return (parent_entropy - children_entropy);
}
// Map an attribute name to its column index in attribute_row; 0 if absent.
int FindAttriNumByName(string attri) {
	int idx = 0;
	while (idx < feather) {
		if (attribute_row[idx].compare(attri) == 0) return idx;
		++idx;
	}
	return 0;//not found: fall back to column 0
}
// Return the majority class LABEL (91..95) among the samples of remain_state.
// BUG FIX: the original returned the largest COUNT rather than the label
// itself, so callers stored a sample count where a class code was expected
// (leaf predictions then never matched the 91..95 range).
int MostCommonLabel(const vector <vector <int> > &remain_state) {
	int p[5] = { 0 };//tally per label
	for (unsigned i = 0; i < remain_state.size(); i++) {
		int label = remain_state[i][feather - 1];
		if (label >= 91 && label < 96)
			p[label - 91]++;
	}
	int best = 0;//index of the most frequent label so far
	for (int i = 1; i < 5; i++) {
		if (p[i] > p[best])
			best = i;
	}
	return 91 + best;
}
// True when every sample in remain_state carries class label `label`
// (vacuously true for an empty set, matching the original 0 == 0 result).
// FIX: the original compared a signed counter against size() (signed/unsigned
// mismatch), always scanned the whole set, and copied the matrix by value;
// this version exits on the first mismatch and borrows by const reference.
bool AllTheSameLabel(const vector <vector <int> > &remain_state, int label) {
	for (unsigned int i = 0; i < remain_state.size(); i++) {
		if (remain_state[i][feather - 1] != label)
			return false;//first counterexample suffices
	}
	return true;
}
// Build the ID3 decision tree by depth-first recursion.
// p                : node to fill (allocated here when NULL)
// remain_state     : samples still to be classified at this node
// remain_attribute : attributes not yet used on this path
// Returns the (possibly newly allocated) subtree root.
// BUG FIX: two of the three leaf-producing paths never set LeafNode = true
// (attributes exhausted, and empty branch), so predictTree -- which tests
// p->LeafNode -- ignored those leaves and fell back to random guessing.
Node * BulidDecisionTreeDFS(Node * p, vector <vector <int> > remain_state, vector <string> remain_attribute) {
	if (p == NULL)
		p = new Node();
	//leaf case 1: every remaining sample already has the same label
	for (int i = 91; i < 96; i++) {
		if (AllTheSameLabel(remain_state, i)) {
			p->attribute = i;
			p->LeafNode = true;
			return p;
		}
	}
	//leaf case 2: attributes exhausted but samples still mixed -> majority vote
	if (remain_attribute.size() == 0) {
		int label = MostCommonLabel(remain_state);
		p->attribute = label;
		p->LeafNode = true;//FIX: mark as leaf so predictTree can use it
		return p;
	}
	//pick the attribute with the highest information gain
	double max_gain = 0, temp_gain;
	vector <string>::iterator max_it = remain_attribute.begin();
	vector <string>::iterator it1;
	for (it1 = remain_attribute.begin(); it1 != remain_attribute.end(); it1++) {
		temp_gain = ComputeGain(remain_state, (*it1));
		if (temp_gain > max_gain) {
			max_gain = temp_gain;
			max_it = it1;
		}
	}
	//partition the samples on max_it's attribute; drop it from the attribute set
	vector <string> new_attribute;
	vector <vector <int> > new_state;
	for (vector <string>::iterator it2 = remain_attribute.begin(); it2 != remain_attribute.end(); it2++) {
		if ((*it2).compare(*max_it))
			new_attribute.push_back(*it2);
	}
	//remember the chosen split attribute on this node
	p->bestAttribute = *max_it;
	vector <int> values = map_attribute_values[*max_it];
	int attribue_num = FindAttriNumByName(*max_it);
	for (vector <int>::iterator it3 = values.begin(); it3 != values.end(); it3++) {
		//gather the samples taking this value of the split attribute
		for (unsigned int i = 0; i < remain_state.size(); i++) {
			if (remain_state[i][attribue_num] == *it3) {
				new_state.push_back(remain_state[i]);
			}
		}
		Node * new_node = new Node();
		new_node->arrived_value = *it3;
		if (new_state.size() == 0) {//no sample reaches this branch: majority-vote leaf
			new_node->attribute = MostCommonLabel(remain_state);
			new_node->LeafNode = true;//FIX: mark as leaf so predictTree can use it
		}
		else
			BulidDecisionTreeDFS(new_node, new_state, new_attribute);
		//on return: attach the child, then clear new_state for the next value
		p->childs.push_back(new_node);
		new_state.erase(new_state.begin(), new_state.end());
	}
	return p;
}
// Pretty-print the subtree rooted at p, indenting each node by its depth;
// a node's incoming edge value (when set) is printed one line above it.
void PrintTree(Node *p, int depth) {
	string indent(depth, '\t');
	cout << indent;
	if (p->arrived_value != 0) {
		//edge value first, then the node itself one tab deeper
		cout << p->arrived_value << "\n" << indent << "\t";
	}
	cout << p->bestAttribute << '\t' << p->attribute << "\n";
	for (unsigned int i = 0; i < p->childs.size(); i++)
		PrintTree(p->childs[i], depth + 1);
}
// Post-order deletion of the subtree rooted at p; every freed node bumps the
// global tree_size counter.
void FreeTree(Node *p) {
	if (p == NULL)
		return;
	for (unsigned int i = 0; i < p->childs.size(); i++)
		FreeTree(p->childs[i]);
	delete p;
	++tree_size;
}
// Recover a test-row column index from an encoded attribute value.
// Values appear to be coded so that num/10 - 1 is the column; single-digit
// codes (num/10 == 0) belong to column 3 -- TODO confirm against the encoding
// used in the data file.
int getIndex(int num)
{
	int col = num / 10 - 1;
	return (col == -1) ? 3 : col;
}
int num = 0;
int predictTree(Node *p,vector<int>test,int depth)
{
if (p->LeafNode&&depth != 0)
{
//cout << "p->attribute:" << p->attribute << endl;
num = p->attribute;
return num;
}
else {
Node *temp;
for (vector<Node*>::iterator it = p->childs.begin(); it != p->childs.end(); it++) {
temp = *it;
//cout <<"p->arrive_val:" <<temp->arrived_value << endl;
int indexAtt = getIndex(temp->arrived_value);
if (test[indexAtt] != temp->arrived_value)
continue;
predictTree(*it, test,depth+1);
}
return 0;
}
}
vector<int>retest;//predicted label for every row of Data
// Build the ID3 tree from the global Data and classify every row back through
// it, storing the predictions in retest. NOTE(review): the "test" set is the
// training set itself -- see the author's remark above the code; accuracy
// computed later is therefore training accuracy.
void BuildSgxTree()
{
setAttribute();
vector <string> remain_attribute;
string att[8] = { "parents","has_nurs","form","children","housing","finance","socila","health"};
for (int i = 0; i < feather - 1; i++)
remain_attribute.push_back(att[i]);
vector <vector <int> > remain_state;
for (unsigned int i = 0; i < Data.size(); i++) {
remain_state.push_back(Data[i]);
}
ComputeMapFrom2DVector();
root = BulidDecisionTreeDFS(root, remain_state, remain_attribute);
//PrintTree(root, 0);
vector<int>test;
for (int x = 0; x < 12960; x++)
{
for (int i = 0; i < 8; i++)
{
// NOTE(review): deliberately corrupts row 1's first attribute -- presumably
// to exercise the unknown-value fallback below; confirm this is intended.
if (x == 1)
Data[1][0] = 14;
test.push_back(Data[x][i]);
}
int result = 0;
result = predictTree(root, test, 0);
result = num;//prediction is delivered through the global `num`
//cout << "result:" << result << endl;
test.clear();
if (result == 0)
{
// no matching branch: guess a random label (rand() is unseeded, so this
// sequence is deterministic across runs)
result = rand() % 5 + 91;
cout << "reslut:" << result << endl;
}
retest.push_back(result);
}
FreeTree(root);
}
// Entry point: load the data set, build and evaluate the ID3 tree, then dump
// the attribute-value dictionary and the training-set accuracy.
// FIX: removed the unused locals `x` and `t` from the original.
int main()
{
	loaddata("boring.txt");//fill the global mmData table from the data file
	GetSgxData(mmData);//copy mmData into the vector-of-vectors Data
	BuildSgxTree();//build the tree and classify all rows into retest
	cout << "tree_size:" << tree_size << endl;
	//show every attribute together with the distinct values it takes
	map<string, vector<int>>::iterator it;
	for (it = map_attribute_values.begin(); it != map_attribute_values.end(); ++it)
	{
		cout << it->first << ":";
		for (unsigned int i = 0; i < it->second.size(); i++)
			cout << it->second[i] << '\t';
		cout << endl;
	}
	//fraction of rows whose prediction matches the stored label (column 8)
	double bingo = 0;
	for (int i = 0; i < 12960; i++)
	{
		if (retest[i] == Data[i][8])
			bingo++;
	}
	cout << "accrucy:" << bingo / 12960.00 << endl;
	system("pause");//Windows console: keep the window open
	return 0;
}
這個是決策樹的效果圖,用代碼生成的那個不好看,這個是用其他軟件生成的