ID3決策樹原理及其實現(簡單易懂,附測試函數)

第一次種樹:先向大佬學習:https://blog.csdn.net/yangliuy/article/details/7322015
話不多說,開始幹!
決策樹作爲最常用的機器學習方法,也是最容易理解的算法,顧名思義,就是對當前樣本做出決策。
舉個栗子:
晴天,空氣溼度正常–>可以外出活動
但是決策樹不是這麼簡單的照本宣科,它的一大功能:對未知屬性集合做出決策
這其實也暗示着:得到一棵決策樹不需要通過訓練所有屬性集合
決策樹例子
開始構建決策樹:
一棵決策樹有且僅有一個根節點(outlook),以及至少一個葉節點(NO/Yes)
構建決策樹的過程==挑選最優節點的過程

  1. 選擇最優節點的依據:我們希望決策樹的分支結點所包含的樣本儘可能屬於同一類別,即結點的"純度"(purity)儘可能的高。而信息熵(information entropy)是度量樣本集合純度最常用的一種指標,假定當前樣本集合S中第i類樣本所佔比例爲Pi,則S的信息熵定義爲信息熵
    若Ent(S)的值越小,則S的純度越高
  2. 假定離散屬性A有V個可能取值{A1,A2,A3,…AV},若使用A來對樣本S進行劃分,則會產生V個分支結點,其中第v個分支結點包含了S所有在屬性A上取值爲Av的樣本,記爲Sv。再根據樣本所佔比重賦予權重: |Sv| / |S|, 可得"信息增益"(information gain)信息增益
    一般而言,信息增益越大,說明使用屬性A劃分所獲得的“純度提升”越大。這篇博客就是以信息增益爲準則進行劃分屬性的 ID3決策樹

注意:由於這個決策樹只是爲了在項目要求的環境下運行,所以有些數據格式的轉化是必需的。還有博主爲了方便(畢竟ID3本身容易過擬合,不去測試也知道結果怎麼樣),沒有去分割測試集,直接拿訓練集來測試,所以測試集要自己分割,建議採用bagging法
數據集已經上傳,請自行下載
下面是完整的決策樹代碼:

 
#include <iostream>
#include <string>  
#include <vector>  
#include <map>  
#include <algorithm>  
#include <cmath>  
#include <fstream>
#include<sstream>
using namespace std;
// Dataset dimensions for the UCI "nursery" data set: 12960 records.
const int row = 12960;
// Columns per record: 8 attributes + 1 class label.
// NOTE(review): "feather" is presumably a misspelling of "feature" -- kept as-is for interface stability.
const int feather = 9;
int mmData[row][feather];//Raw integer-encoded data loaded from file (12960 records)
int toNum(string str)//Enclave cannot accept std::string data; convert a digit string to an int
{
	// Parses an unsigned decimal string; no sign, whitespace or overflow handling.
	int value = 0;
	for (char ch : str)
		value = value * 10 + (ch - '0');
	return value;
}
void loaddata(string path)//讀取文本數據,並存儲在二維數組
{
	ifstream Filein;
	try { Filein.open(path); }
	catch (exception e)
	{
		cout << "File open failed!";
	}

	string line;
	int data_num = 0;
	while (getline(Filein, line)) {
		int before = 0;
		int cnt = 0;
		data_num++;
		for (unsigned int i = 0; i < line.length(); i++) {
			if (line[i] == ',' || line[i] == '\n') {
				string sub = line.substr(before, i - before);
				before = i + 1;
				mmData[data_num - 1][cnt] = toNum(sub);
				cnt++;
			}
		}
		mmData[data_num - 1][cnt] = toNum(line.substr(before, line.length()));
	}
	cout << "data loading done.\nthe amount of data is: " << data_num << endl;
}
 

 
int tree_size = 0;  // Number of nodes freed by FreeTree (filled in after the tree is destroyed).
vector<vector<int>>Data;//Holds the instance set (one row per sample)
vector<string>attribute_row;//Holds the attribute (column) names
vector<int>item(feather);//Holds one full row of data
// Integer codes for the five class labels of the data set.
int not_recom(91);
int recommend(92);
int very_recom(93);
int priority(94);
int spec_prior(95);
int blank(0);  // Sentinel meaning "no arrived value".
map<string, vector < int > > map_attribute_values;//Maps each attribute name to all values it takes
struct Node {//Decision-tree node
	int attribute;//Class label stored at a leaf (91..95); 0 on internal nodes
	string bestAttribute;  // Attribute this internal node splits on.
	int arrived_value;//Attribute value on the edge from the parent that leads here
	bool LeafNode;  // True when this node is a leaf.
	vector<Node *> childs;//All children of this node
	Node() {
		attribute = 0;
		arrived_value = blank;
		bestAttribute = "";
		LeafNode = false;
	}
};
Node * root;  // Root of the decision tree built by BuildSgxTree.
void setAttribute()
{
	string att[9] = { "parents","has_nurs","form","children","housing","finance","socila","health","Distribution" };
	for (int i = 0; i < feather; i++)
		attribute_row.push_back(att[i]);
}
//Copy the raw 2-D array into the global Data vector, one row at a time.
void GetSgxData(int maindata[12960][9])
{
	for (int r = 0; r < row; r++)
	{
		vector<int> oneRow(maindata[r], maindata[r] + feather);
		Data.push_back(oneRow);
	}
}

//建立屬性map(字典)
//Build the attribute dictionary: attribute name -> every distinct value seen for it.
void ComputeMapFrom2DVector() {
	// Walk each attribute column (the last column is the label and is skipped)
	// and collect its distinct values in first-seen order.
	for (int col = 0; col < feather - 1; col++) {
		vector<int> values;
		for (unsigned int r = 0; r < Data.size(); r++) {
			int v = Data[r][col];
			if (find(values.begin(), values.end(), v) == values.end())
				values.push_back(v);
		}
		map_attribute_values[attribute_row[col]] = values;
		cout << values[0] << endl;  // Debug output kept from the original.
	}
}
//計算信息熵,values(91,92...)
//Compute the information entropy of the class-label distribution (labels 91..95).
//attribute: column name to filter on; value: attribute value to keep (ignored when ifparent);
//ifparent: when true, use every row (entropy of the parent node).
double ComputeEntropy(vector <vector <int> > remain_data, string attribute, int value, bool ifparent) {
	vector<int> count(5, 0);
	unsigned int i, j;
	bool done_flag = false;//sentinel: stop once the matching column has been processed
	for (j = 0; j < feather; j++) {
		if (done_flag) break;
		if (!attribute_row[j].compare(attribute)) {
			for (i = 0; i < remain_data.size(); i++) {
				if ((!ifparent && (remain_data[i][j] == value)) || ifparent) {//ifparent selects parent-node mode
					for (int k = 91; k < 96; k++) {//tally how often each label occurs
						if (remain_data[i][feather - 1] == k) {
							count[k - 91]++;
							break;
						}
					}
				}
			}
			done_flag = true;
		}
	}
	// BUGFIX: the original returned 0 whenever ANY class count was zero, which
	// wrongly reports zero entropy for mixed sets such as {6,6,0,0,0}
	// (true entropy 1.0). Entropy uses the convention 0*log(0) == 0, so
	// zero counts are skipped instead of aborting.
	double sum = count[0] + count[1] + count[2] + count[3] + count[4];
	if (sum == 0) return 0;//no samples matched this value
	double entropy = 0;
	for (int k = 0; k < 5; k++) {
		if (count[k] == 0) continue;//a 0*log(0) term contributes nothing
		//log base 2 via the change-of-base formula
		entropy += -count[k] / sum * log(count[k] / sum) / log(2.0);
	}
	return entropy;
}

//Information gain obtained by splitting remain_data on the given attribute.
double ComputeGain(vector <vector <int> > remain_data, string attribute) {
	//Entropy before any split.
	double parent_entropy = ComputeEntropy(remain_data, attribute, 0, true);
	//values holds every possible value of this attribute.
	vector<int> values = map_attribute_values[attribute];
	//Count how many samples take each value of the attribute.
	vector<int> count_values;
	for (unsigned int m = 0; m < values.size(); m++) {
		int occurrences = 0;
		for (unsigned int k = 0; k < feather - 1; k++) {
			if (!attribute_row[k].compare(attribute)) {
				for (unsigned int j = 0; j < remain_data.size(); j++) {
					if (remain_data[j][k] == values[m])
						occurrences++;
				}
			}
		}
		count_values.push_back(occurrences);
	}
	//Weighted sum of the children's entropies, weights |Sv| / |S|.
	double children_entropy = 0;
	for (unsigned int j = 0; j < values.size(); j++) {
		double weight = (double)count_values[j] / (double)(remain_data.size());
		children_entropy += weight * ComputeEntropy(remain_data, attribute, values[j], false);
	}
	return (parent_entropy - children_entropy);
}


//Return the column index of the named attribute, or 0 if it is unknown.
int FindAttriNumByName(string attri) {
	int index = 0;
	while (index < feather) {
		if (attribute_row[index].compare(attri) == 0) return index;
		index++;
	}
	return 0;//not found: fall back to column 0
}

//找出樣例中佔多數的結果(91,92,93...) 
//Return the class label (91..95) that occurs most often in remain_state.
int MostCommonLabel(vector <vector <int> > remain_state) {
	int p[5] = { 0 };//occurrence count for labels 91..95
	for (unsigned i = 0; i < remain_state.size(); i++) {
		for (int j = 0; j < 5; j++) {
			if (remain_state[i][feather - 1] == 91 + j) {
				p[j]++;
				break;
			}
		}
	}
	// BUGFIX: the original returned the highest COUNT instead of the label,
	// so callers stored a sample count (e.g. 4320) into Node::attribute where
	// a label code (91..95) was expected. Return the majority label instead.
	int best = 0;
	for (int i = 1; i < 5; i++) {
		if (p[i] > p[best])
			best = i;
	}
	return 91 + best;
}

//判斷樣例是否(正負)性都爲label  
//True when every sample in remain_state carries the given class label.
bool AllTheSameLabel(vector <vector <int> > remain_state, int label) {
	unsigned int matches = 0;
	for (unsigned int i = 0; i < remain_state.size(); i++) {
		if (remain_state[i][feather - 1] == label)
			matches++;
	}
	return matches == remain_state.size();
}

//計算信息增益,DFS構建決策樹  
//current_node爲當前的節點  
//remain_state爲剩餘待分類的樣例  
//remian_attribute爲剩餘還沒有考慮的屬性  
//返回根結點指針  
//Build the decision tree depth-first, choosing splits by information gain.
//p: current node (allocated here when NULL)
//remain_state: samples still to be classified
//remain_attribute: attributes not yet used on this path
//Returns the (sub)tree root pointer.
Node * BulidDecisionTreeDFS(Node * p, vector <vector <int> > remain_state, vector <string> remain_attribute) {
	if (p == NULL)
		p = new Node();
	//Leaf case 1: every remaining sample already has the same label.
	for (int i = 91; i < 96; i++) {
		if (AllTheSameLabel(remain_state, i)){
			p->attribute = i;
			p->LeafNode = true;
			return p;
		}
	}

	if (remain_attribute.size() == 0) {//Leaf case 2: attributes exhausted, labels still mixed.
		int label = MostCommonLabel(remain_state);
		p->attribute = label;
		// BUGFIX: this majority-vote leaf was never flagged, so predictTree
		// skipped it and fell back to a random guess.
		p->LeafNode = true;
		return p;
	}

	//Pick the attribute with the highest information gain.
	double max_gain = 0, temp_gain;
	vector <string>::iterator max_it = remain_attribute.begin();
	vector <string>::iterator it1;
	for (it1 = remain_attribute.begin(); it1 != remain_attribute.end(); it1++) {
		temp_gain = ComputeGain(remain_state, (*it1));
		if (temp_gain > max_gain) {
			max_gain = temp_gain;
			max_it = it1;
		}
	}
	//Partition the samples on max_it's attribute; build the reduced attribute set.
	vector <string> new_attribute;
	vector <vector <int> > new_state;
	for (vector <string>::iterator it2 = remain_attribute.begin(); it2 != remain_attribute.end(); it2++) {
		if ((*it2).compare(*max_it))
			new_attribute.push_back(*it2);
	}
	//Remember the chosen split attribute on this node.
	p->bestAttribute = *max_it;
	vector <int> values = map_attribute_values[*max_it];
	int attribue_num = FindAttriNumByName(*max_it);
	for (vector <int>::iterator it3 = values.begin(); it3 != values.end(); it3++) {
		//Collect the samples whose split attribute equals *it3.
		for (unsigned int i = 0; i < remain_state.size(); i++) {
			if (remain_state[i][attribue_num] == *it3) {
				new_state.push_back(remain_state[i]);
			}
		}
		Node * new_node = new Node();
		new_node->arrived_value = *it3;
		if (new_state.size() == 0) {//No samples reach this branch: make a majority-vote leaf.
			new_node->attribute = MostCommonLabel(remain_state);
			// BUGFIX: mark the empty-branch leaf so predictTree can return its label.
			new_node->LeafNode = true;
		}
		else
			BulidDecisionTreeDFS(new_node, new_state, new_attribute);
		//On backtrack: attach the child, then clear new_state before the next value.
		p->childs.push_back(new_node);
		new_state.erase(new_state.begin(), new_state.end());
	}
	return p;
}


//Pretty-print the tree, indenting one tab per depth level.
void PrintTree(Node *p, int depth) {
	int tabs;
	for (tabs = 0; tabs < depth; tabs++) printf("\t");
	if (p->arrived_value != 0) {
		printf("%d\n", p->arrived_value);//value on the edge from the parent
		for (tabs = 0; tabs < depth + 1; tabs++) printf("\t");
	}
	cout << p->bestAttribute << '\t';
	printf("%d\n", p->attribute);//leaf label (0 on internal nodes)

	for (unsigned int c = 0; c < p->childs.size(); c++) {
		PrintTree(p->childs[c], depth + 1);
	}
}
//Release the whole subtree post-order and count freed nodes in tree_size.
void FreeTree(Node *p) {
	if (p == NULL)
		return;
	for (unsigned int c = 0; c < p->childs.size(); c++)
		FreeTree(p->childs[c]);
	delete p;
	tree_size++;
}

//Map an attribute value code to its column index (e.g. 91 -> 8, 30 -> 2);
//single-digit codes map to column 3.
int getIndex(int num)
{
	int index = (num / 10) - 1;
	return (index == -1) ? 3 : index;
}

int num = 0;  // Last predicted label; kept because callers read it after predictTree.
//Walk the tree following the sample's attribute values.
//Returns the predicted label (91..95), or 0 when no leaf was reached.
int predictTree(Node *p, vector<int>test, int depth)
{
	if (p->LeafNode && depth != 0)
	{
		num = p->attribute;
		return num;
	}
	// Internal node: descend into the child whose edge value matches the sample.
	for (vector<Node*>::iterator it = p->childs.begin(); it != p->childs.end(); it++) {
		Node *child = *it;
		int indexAtt = getIndex(child->arrived_value);
		if (test[indexAtt] != child->arrived_value)
			continue;
		// BUGFIX: the original discarded the recursive result and always
		// returned 0 from non-leaf nodes, forcing callers to read the
		// global `num` instead. Propagate the found label upward.
		int result = predictTree(child, test, depth + 1);
		if (result != 0)
			return result;
	}
	return 0;//no matching path: caller falls back to a guess
}
 
vector<int>retest;  // Predicted label for every row of Data.
//Build the ID3 tree from the global Data set, then predict every row of the
//training data and store the predictions in retest.
void BuildSgxTree()
{
	setAttribute();
	vector <string> remain_attribute;
	string att[8] = { "parents","has_nurs","form","children","housing","finance","socila","health"};
	for (int i = 0; i < feather - 1; i++)
		remain_attribute.push_back(att[i]);

	vector <vector <int> > remain_state;
	for (unsigned int i = 0; i < Data.size(); i++) {
		remain_state.push_back(Data[i]);
	}
	ComputeMapFrom2DVector();
	root = BulidDecisionTreeDFS(root, remain_state, remain_attribute);
	//PrintTree(root, 0);

	vector<int>test;
	for (int x = 0; x < 12960; x++)
	{
		for (int i = 0; i < 8; i++)
		{
			// NOTE(review): this deliberately corrupts row 1's first attribute,
			// presumably to exercise the unseen-value fallback below -- TODO confirm.
			if (x == 1)
				Data[1][0] = 14;
			test.push_back(Data[x][i]);
		}
		int result = 0;
		result = predictTree(root, test, 0);
		result = num;  // The prediction is also published through the global `num`.
		test.clear();
		if (result == 0)
		{
			// No leaf reached (e.g. unseen attribute value): guess a random label.
			// NOTE(review): rand() is never seeded, so the guesses are deterministic.
			result = rand() % 5 + 91;
			// BUGFIX: corrected the misspelled "reslut" in the log message.
			cout << "result:" << result << endl;
		}
		retest.push_back(result);
	}
	FreeTree(root);
}


int main()
{
	loaddata("boring.txt");//Load the data set into the mmData array
	// BUGFIX: removed the unused locals `x` and `t` from the original.
	GetSgxData(mmData);//Copy mmData into the global Data vector
	BuildSgxTree();//Build the tree, predict every row, then free the tree
	cout << "tree_size:" << tree_size << endl;
	//Dump the value dictionary built for each attribute.
	map<string, vector<int>>::iterator it;
	for (it = map_attribute_values.begin(); it != map_attribute_values.end(); it++)
	{
		cout << it->first << ":";
		for (unsigned int i = 0; i < it->second.size(); i++)
			cout << it->second[i] << '\t';
		cout << endl;
	}
	//Compare predictions against the true labels (training-set accuracy).
	double bingo = 0;
	for (int i = 0; i < 12960; i++)
	{
		if (retest[i] == Data[i][8])
			bingo++;
	}
	// BUGFIX: corrected the misspelled "accrucy" in the output message.
	cout << "accuracy:" << bingo / 12960.00 << endl;
	system("pause");
	return 0;
}

這個是決策樹的效果圖,用代碼生成的那個不好看,這個是用其他軟件生成的
決策樹效果圖

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章