基于ID3算法的决策树研究与天气预测C++实现

目录

一、初识决策树

决策树（ Decision Tree ）又称为判定树，是对数据进行分类的一种树结构，并通过分类达到预测的目的。决策树分为分类树和回归树两种，分类树是对离散变量做决策树，回归树是对连续变量做决策树。构造决策树是采用自上而下的递归构造方法。

决策树构造的结果是一棵二叉或多叉树，它的输入是一组带有类别标记的训练数据。决策树中的每个内部结点代表对某个属性的一次测试，每条边代表一个测试结果，叶结点代表某个类或者类的分布，最上面的结点是根结点。二叉树的非叶结点一般表示为一个逻辑判断，如形为（a = b）的逻辑判断，其中 a 是属性，b 是该属性的某个属性值；树的边是逻辑判断的分支结果；树的叶结点都是类别标记。多叉树的内部结点是属性，边是该属性的所有取值，有几个属性值，就有几条边。

决策树的分类过程也就是决策树分类模型（简称决策树）的生成过程，如下图所示。从图中可知决策树分类的建立过程与用决策树分类模型进行预测的过程实际上是一种归纳-演绎过程。其中，由已分类数据得到决策树分类模型的过程称归纳过程，用决策树分类模型对未分类数据进行分类的过程称为演绎过程。需要强调的是：由训练集得到分类模型必须经过测试集测试达到一定要求才能用于预测。

二、理论基础

1.信息量：衡量信息多少的物理量。

若概率很大，人们事先已有所估计，则该消息信息量很小；若概率很小，人们感到很突然，则该消息所含信息量很大。

信息量的定义：若一个消息x出现的概率为p，则这一消息所含的信息量为

$$I(x) = -\log_n p$$

n=2时，单位为bit；n=e时，单位为nat；n=10时，单位为hart。一般计算中n常取2。

例：抛一枚均匀硬币，出现正面和反面的信息量是多少？

解：出现正反面概率均为0.5，则

$$I(\text{正面}) = I(\text{反面}) = -\log_2 0.5 = 1\ \text{bit}$$

2.信息熵

信源含有的信息量是信源发出的所有可能消息的平均不确定性，香农把信源所含有的信息量称为信息熵，是指每个属性所含信息量的统计平均值，即所有可能发生事件所带来的信息量的期望。信息论中一个离散型随机变量X的熵定义如下：

$$H(X) = -\sum_{x \in X} p(x)\log p(x)$$

信息熵的定义也可表示为：

$$H(X) = -\sum_{i=1}^{n} p_i \log_2 p_i$$

其中 $p_i$ 为第 $i$ 个类别出现的概率。

n为训练集X类别数，如子集结果类别为正面、反面，则n为2。

例：抛一枚均匀硬币的信息熵是多少？

解：

$$H(X) = -\left(0.5\log_2 0.5 + 0.5\log_2 0.5\right) = 1\ \text{bit}$$

（注：ID3算法中会为每一个属性计算按该属性划分之后的信息熵（即条件熵），具有最小条件熵的属性在本次迭代中用来划分数据集X。）

3.条件自信息量

在事件 $y_j$ 出现的条件下，随机事件 $x_i$ 发生的条件概率为 $p(x_i \mid y_j)$，则它的条件自信息量定义为条件概率对数的负值：

$$I(x_i \mid y_j) = -\log p(x_i \mid y_j)$$

4.条件熵

条件熵的定义是：在Y给定条件下，X的条件概率分布的熵对Y的数学期望。

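在给定 $y_j$ 条件下，$x_i$ 的条件自信息量为 $I(x_i \mid y_j)$，X集合的条件熵为：

$$H(X \mid y_j) = \sum_{i} p(x_i \mid y_j)\, I(x_i \mid y_j) = -\sum_{i} p(x_i \mid y_j)\log p(x_i \mid y_j)$$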

在给定Y（即各个 $y_j$）条件下，X集合的条件熵为：

$$H(X \mid Y) = \sum_{j} p(y_j)\, H(X \mid y_j)$$

注意：条件熵中Y也是一个变量，意思是在一个变量Y的条件下（变量Y的每个值都会取），另一个变量X熵对Y的期望。条件熵不是指在给定某个数（某个变量为某个值）的情况下，另一个变量的熵是多少，变量的不确定性是多少，而是整体X变量的不确定度或期望！

5.信息增益

信息增益描述了一个特征带来的信息量的多少，往往用于特征选择。一个特征往往会使一个随机变量Y的信息量减少，减少的部分就是信息增益。

信息增益 = 信息熵 - 条件熵

$$IG(X, Y) = H(X) - H(X \mid Y)$$

$IG(X, Y)$ 表示集合X被属性Y分类之前和之后熵的差异，即集合X原来的熵和已知属性Y之后的熵之差，也就是属性Y被固定前后集合X的不确定性降低了多少。

（注：ID3算法中会为每一个属性计算信息增益，具有最大信息增益的属性在本次迭代中用来划分数据集X。）

三、ID3算法

ID3 (Iterative Dichotomiser 3) 是由Quinlan提出的分类预测算法，用来给一个数据集创建决策树。该算法以信息论为基础，以信息熵和信息增益为衡量标准，从而实现对数据的归纳分类。ID3算法计算每个属性的信息增益，并选取具有最高增益的属性作为给定集合的测试属性。对被选取的测试属性创建一个节点，并以该节点的属性标记，对该属性的每个值创建一个分支以此划分样本。

算法流程[References 2]：

四、问题实现

问题描述：给定上述天气数据，将数据划分为训练集和测试集（最后四行），设计算法使用训练集构造决策树，然后使用测试集数据进行预测，看“是否适合打网球”一栏是否能够正确对应。

程序设计思想（ID3）：每次从数据集中根据最大信息增益选取一个最好特征，将数据进行划分，每次划分都会消耗一个特征，使得特征越来越少，当所有数据集都是同一类，或者消耗完所有特征时，划分结束。其中信息熵和信息增益使用前面理论基础部分的公式。

说明：为了区分最后两列的取值，使观察更方便，我将最后一列中“是”替换为“适合”，“否”替换为“不适合”。

C++实现：代码修改自References[4]

#include <iostream>
#include <string>
#include <vector>
#include <set>
#include <algorithm>
#include <map>
#include <math.h>
using namespace std;
 
// Number of sample rows stored in the raw table x below.
#define N 10
// Number of feature columns per sample; the class label is stored in the
// extra column that follows them (index `feature`).
#define feature 4
// Working copy of the training data as nested vectors; filled from x by createDataset().
vector< vector<string> > X;
// Raw training table. Columns are, in order, the four features named in
// `attribute` followed by the class label ("适合" / "不适合").
string x[N][feature+1] = 
{
	{"晴",    "热", "高", "否", "不适合"},   
	{"晴",    "热", "高", "是", "不适合"},
	{"阴",    "热", "高", "否",   "适合"},
	{"雨",    "温", "高", "否",   "适合"},
	{"雨",  "凉爽", "中", "否",   "适合"},
	{"雨",  "凉爽", "中", "是", "不适合"},
	{"阴",  "凉爽", "中", "是",   "适合"},
	{"晴",    "温", "高", "否", "不适合"},
	{"晴",  "凉爽", "中", "否",   "适合"},
	{"雨",    "温", "中", "否",   "适合"},
// The four rows below are commented out, so they are not part of the
// training table (the accompanying text reserves the last four rows for testing).
//	{"晴",    "温", "中", "是",   "适合"},
//	{"阴",    "温", "高", "是",   "适合"},
//	{"阴",    "热", "中", "否",   "适合"},
//	{"雨",    "温", "高", "是", "不适合"}
};

// Names of the four features, in column order. For example the first one,
// "天气", takes three values in the table above: "晴", "阴", "雨".
string attribute[] = {"天气", "温度", "湿度", "是否有风"};

// Feature names as a vector; filled from `attribute` by createDataset().
vector<string> attributes;
 
// Load the global training data.
//
// Copies the raw table `x` (N rows, feature+1 columns) into the global
// vector-of-vectors `X`, and the feature names from `attribute` into the
// global `attributes`. Both globals are replaced rather than appended to,
// so calling this function more than once leaves them in the same state
// as calling it once.
void createDataset() {
	// Rebuild the sample matrix row by row from the raw array.
	X.clear();
	X.reserve(N);
	for(int row=0; row<N; row++)
		X.push_back(vector<string>(x[row], x[row] + (feature+1)));
	// Replace (not extend) the feature-name list.
	attributes.assign(attribute, attribute + feature);
}
 
// Compute the Shannon entropy (base 2) of the class labels of a data set.
//
// data: sample rows; the class label of each row is its LAST column. The
//       label column index is taken from the first row.
// Returns -sum(p * log2(p)) over the distinct labels, where p is the
// fraction of rows carrying that label. An empty data set (or one whose
// first row has no columns) has no labels and yields 0.
double calcShanno(const vector< vector<string> > &data) {
	// Nothing to measure; also avoids reading data[0] of an empty vector.
	if(data.empty() || data[0].empty())
		return 0.0;
	const size_t label = data[0].size() - 1;
	// Count how many rows carry each label (operator[] starts new keys at 0).
	map<string, int> classCounts;
	for(size_t i=0; i<data.size(); i++)
		++classCounts[ data[i][label] ];
	// Accumulate the entropy; log(p)/log(2) converts the natural log to base 2.
	const double total = (double)data.size();
	const double ln2 = log(2.0);
	double shanno = 0;
	map<string, int>::const_iterator it;
	for(it = classCounts.begin(); it != classCounts.end(); ++it) {
		const double prob = (double)(it->second) / total;
		shanno -= prob * ( log(prob) / ln2 );
	}
	return shanno;
}
 
// Select the rows whose feature `axis` equals `value`, with that feature
// column removed, so every returned row is one column shorter.
//
// data : sample rows (taken by reference; the data set is not copied)
// axis : index of the feature column to test and drop
// value: feature value a row must have to be kept
// Rows that do not have a column at `axis` are skipped instead of being
// read out of bounds. The relative order of the kept rows is preserved.
vector< vector<string> > splitDataSet(const vector< vector<string> > &data, int axis, const string &value) {
	vector< vector<string> > result;
	for(size_t i=0; i<data.size(); i++) {
		const vector<string> &row = data[i];
		if(axis < 0 || (size_t)axis >= row.size())
			continue;
		if(row[axis] != value)
			continue;
		// Copy everything before and after the tested column.
		vector<string> removed;
		removed.reserve(row.size() - 1);
		removed.insert(removed.end(), row.begin(), row.begin()+axis);
		removed.insert(removed.end(), row.begin()+axis+1, row.end());
		result.push_back(removed);
	}
	return result;
}
 
// Collect the distinct values that column `axis` takes across all rows.
//
// data: sample rows
// axis: index of the column to inspect
// Returns each distinct value once, in the ordering of std::set<string>.
vector<string> createFeatureList(const vector< vector<string> > &data, int axis) {
	// A set both de-duplicates and orders the values.
	set<string> distinct;
	vector< vector<string> >::const_iterator row;
	for(row = data.begin(); row != data.end(); ++row)
		distinct.insert((*row)[axis]);
	return vector<string>(distinct.begin(), distinct.end());
}
 
// Pick the feature column whose split yields the largest information gain.
//
// data: sample rows; the last column of each row is the class label and
//       every other column is a candidate feature. The first row is read
//       to obtain the column count.
// Returns the index of the winning column. Index 0 is returned when no
// column achieves a gain strictly greater than zero.
int chooseBestFeatureToSplit(const vector< vector<string> > &data) {
	const int candidateCount = data[0].size() - 1;
	// Entropy of the labels before any split.
	const double baseEntropy = calcShanno(data);
	int winner = 0;
	double winnerGain = 0;
	for(int col=0; col<candidateCount; ++col) {
		// Entropy after splitting on this column: the size-weighted sum of
		// the entropies of the subsets, one subset per distinct value.
		const vector<string> values = createFeatureList(data, col);
		double splitEntropy = 0;
		vector<string>::const_iterator v;
		for(v = values.begin(); v != values.end(); ++v) {
			const vector< vector<string> > subset = splitDataSet(data, col, *v);
			const double weight = (double)subset.size() / (double)data.size();
			splitEntropy += weight * calcShanno(subset);
		}
		// Information gain = reduction in entropy achieved by the split.
		const double gain = baseEntropy - splitEntropy;
		if(gain > winnerGain) {
			winnerGain = gain;
			winner = col;
		}
	}
	return winner;
}
 
// Majority vote: return the class label that occurs most often.
//
// Used when every feature has been consumed but the remaining samples
// still carry more than one label.
// classList: the labels to vote over
// Returns the most frequent label; among equally frequent labels the one
// that comes first in std::map<string> key order wins. An empty list
// yields the empty string.
string majorityCnt(vector<string> &classList) {
	// Tally the occurrences (operator[] starts new keys at 0).
	map<string, int> tally;
	vector<string>::const_iterator label;
	for(label = classList.begin(); label != classList.end(); ++label)
		++tally[*label];
	// Keep the first entry with a strictly larger count than any before it.
	string winner;
	int highest = 0;
	map<string, int>::const_iterator entry;
	for(entry = tally.begin(); entry != tally.end(); ++entry) {
		if(entry->second <= highest)
			continue;
		highest = entry->second;
		winner = entry->first;
	}
	return winner;
}
 
// One node of the decision tree.
struct Node {
	// For an internal node: name of the feature the node splits on.
	// For a leaf: the predicted class label.
	string attribute;
	// Value of the parent's split feature that leads to this node
	// (left empty on a node that was not created as a branch).
	string val;
	// True when the node was marked as a leaf during tree construction.
	bool isLeaf;
	// One child per branch; the pointers are released by freeNode().
	vector<Node*> childs;
	// Strings start out empty and the node starts out as a non-leaf.
	Node() : attribute(), val(), isLeaf(false) {}
};
// Root of the global decision tree; NULL until a tree has been built.
Node *root = NULL;
 
// Recursively build an ID3 decision tree.
//
// root     : node to fill in; when NULL a new node is allocated
// data     : sample rows, class label in the last column
// attribute: names of the feature columns of `data`, in column order
// Returns the filled-in node (the newly allocated one when root was NULL).
//
// Recursion stops with a leaf when
//   * there are no samples (leaf with an empty label),
//   * all samples share one class label (leaf carries that label), or
//   * no feature columns remain (leaf carries the majority label).
// Otherwise the node records the feature with the largest information gain
// and gets one child per distinct value of that feature.
Node* createTree(Node *root, const vector< vector<string> > &data, vector<string> &attribute) {
	if(root == NULL)
		root = new Node();
	// No samples: nothing to learn from, and data[0] must not be read.
	if(data.empty() || data[0].empty()) {
		root->isLeaf = true;
		return root;
	}
	const size_t label = data[0].size() - 1;
	vector<string> classList;
	set<string> distinctClasses;
	classList.reserve(data.size());
	for(size_t i=0; i<data.size(); i++) {
		classList.push_back(data[i][label]);
		distinctClasses.insert(data[i][label]);
	}
	// All samples belong to one class: the leaf carries that class label.
	if(distinctClasses.size() == 1) {
		root->attribute = classList[0];
		root->isLeaf = true;
		return root;
	}
	// Only the label column is left: fall back to a majority vote. The node
	// has no children, so it must be marked as a leaf as well.
	if(data[0].size() == 1) {
		root->attribute = majorityCnt(classList);
		root->isLeaf = true;
		return root;
	}

	const int bestFeatureIndex = chooseBestFeatureToSplit(data);
	// All values the chosen feature takes in this subset.
	const vector<string> featureList = createFeatureList(data, bestFeatureIndex);
	const string bestFeature = attribute[bestFeatureIndex];
	// Record which feature this node splits on.
	root->attribute = bestFeature;
	// Feature names left for the subtrees; identical for every branch, so
	// it is built once outside the loop.
	vector<string> subAttribute;
	for(size_t j=0; j<attribute.size(); j++) {
		if(bestFeature != attribute[j])
			subAttribute.push_back(attribute[j]);
	}
	// One branch per value of the chosen feature.
	for(size_t i=0; i<featureList.size(); i++) {
		Node *newNode = new Node();
		newNode->val = featureList[i];   // value that selects this branch
		createTree(newNode, splitDataSet(data, bestFeatureIndex, featureList[i]), subAttribute);
		root->childs.push_back(newNode);
	}
	return root;
}
 
// Print the subtree rooted at `root`, indented with one tab per level.
//
// root : node to print (dereferenced without a NULL check)
// depth: indentation level of this node
// A node reached through a branch first prints its branch value at `depth`
// and then its attribute one level deeper; a node without a branch value
// prints only its attribute at `depth`. Children follow at depth+1.
void print(Node *root, int depth) {
	for(int pad = depth; pad > 0; --pad)
		cout << "\t";
	if(!root->val.empty()) {
		cout << root->val << endl;
		for(int pad = depth + 1; pad > 0; --pad)
			cout << "\t";
	}
	cout << root->attribute << endl;
	for(size_t c = 0; c < root->childs.size(); ++c)
		print(root->childs[c], depth + 1);
}
 
// Predict the class label of one sample by walking the decision tree.
//
// root     : (sub)tree to evaluate
// attribute: feature names, in the same order as the values in `test`
// test     : array holding the sample's feature values; it must have at
//            least attribute.size() elements
// Returns the label stored in the leaf that is reached. The empty string is
// returned when no prediction can be made: `root` is NULL, the node's
// feature name is not listed in `attribute`, or the sample's value for that
// feature matches none of the node's branches.
string classify(Node *root, vector<string> &attribute, string *test) {
	if(root == NULL)
		return "";
	// A node without children cannot be descended any further, so it is
	// treated as a leaf even when it is not flagged as one.
	if(root->isLeaf || root->childs.empty())
		return root->attribute;
	// Find which column of the sample this node tests.
	size_t column = 0;
	while(column < attribute.size() && attribute[column] != root->attribute)
		++column;
	if(column == attribute.size())
		return "";
	// Follow the branch whose value matches the sample.
	for(size_t i=0; i<root->childs.size(); i++) {
		if(test[column] == root->childs[i]->val)
			return classify(root->childs[i], attribute, test);
	}
	// Value not covered by any branch of this node.
	return "";
}

// Delete a subtree: every descendant first, then the node itself.
// A NULL pointer is accepted and ignored.
void freeNode(Node *root) {
	if(root != NULL) {
		for(size_t c = 0; c < root->childs.size(); ++c)
			freeNode(root->childs[c]);
		delete root;
	}
}
 
// Build the decision tree from the training table, print it, classify one
// hard-coded sample and print the prediction, then free the tree.
int main() {
	createDataset();
	root = createTree(root, X, attributes);
	print(root, 0);

	// Sample to classify; values are in the same order as `attributes`.
	string test[] = {"晴", "温", "中", "是"};

	cout << endl << "属性：";
	for(int col=0; col<feature; ++col)
		cout << attributes[col] << "\t";

	cout << endl << "例子：";
	for(int col=0; col<feature; ++col)
		cout << test[col] << "\t";

	const string prediction = classify(root, attributes, test);
	cout << endl << "预测：" << prediction << endl;

	freeNode(root);
	return 0;
}