有bug,一直未找到,閒暇下來在看看,存一檔
輸入格式
首先爲正整數n、m,分別代表特徵個數、訓練樣本個數。
隨後爲m行,每行有n+1個整數。其中(1<n<=100,1<m<=1000)。
在後續的m行中,每行代表一個樣本中的n個整數特徵值與樣本的實際觀測結果y(整數)。
輸出格式
決策樹的前序遍歷結果,若是一個內部節點則輸出inner(特徵id),若是一個葉節點,則輸出leaf(分類結果)。在對每個節點打印時,若該節點處在n層,需在前面打印n個---,並打印該分支的特徵值。
輸入樣例:
3 10
2 1 1 0
2 0 1 1
1 1 1 0
0 1 0 0
0 0 1 2
0 1 0 0
1 0 0 1
1 0 1 3
1 1 1 0
2 0 0 1
輸出樣例:
inner(1)
---0->inner(0)
------0->leaf(2)
------1->inner(2)
---------0->leaf(1)
---------1->leaf(3)
------2->leaf(1)
---1->leaf(0)
代碼如下:問題,在最後打印出決策樹時總是出錯
#include<iostream>
#include<vector>
#include<math.h>
#include<map>
#include<stdio.h>
using namespace std;
struct TNode{
// A decision-tree node.
// feature: id of the attribute tested at this node, -1 for a leaf.
// type:    class label stored at a leaf, -1 for an internal node.
// Exactly one of the two is expected to be != -1 after construction.
int feature;
int type;
// child[v] is the subtree followed when the tested attribute has value v.
TNode *child[32];
TNode() : feature(-1), type(-1) {
for(int i=0;i<32;i++) child[i]=0;
}
};
// Forward declaration: construct_decision_tree and create_branch_tree are
// mutually recursive, so the tree builder must be declared up front.
TNode* construct_decision_tree(vector<vector<int> > training_sample,vector<int> feature);
double compute_single_entropy(double p){
// Entropy contribution -p*log2(p) of a single class, given its
// probability p. By convention 0*log2(0) == 0, so p == 0 contributes 0.
if(p==0) return 0;
return -p*(log(p)/log(2));
}
double compute_double_entropy(vector<int> y){
// Entropy of a label multiset: sum over classes of -p*log2(p), where p is
// the class frequency among the y.size() labels.
map<int,int> freq;
// Tally how often each label occurs.
for(size_t i=0;i<y.size();i++){
freq[y[i]]++;
}
// Accumulate the per-class entropy terms.
double total=0.0;
for(map<int,int>::iterator it=freq.begin();it!=freq.end();++it){
total+=compute_single_entropy(double(it->second)/double(y.size()));
}
return total;
}
int find_the_best_division(vector<vector<int> > training_sample,vector<int>feature){
// Pick the feature whose split minimises the conditional entropy of y
// (the ID3 information-gain criterion).
// training_sample: rows of n feature values followed by the label y.
// feature: ids of the columns still available for splitting (feature[c]
//          is the id of column c of training_sample).
// Returns the id of the best feature.
double best_division=0.0;
int best_feature=-1;
map<int,int> x_count;           // feature value -> number of rows with it
map<int,vector<int> > y_value;  // feature value -> labels of those rows
// BUG FIX: the last column is the label y, so it must be excluded from
// the candidates (the original included it, and splitting on y itself
// always yields zero entropy, so it was always chosen).
for(int colnum=0;colnum<(int)training_sample[0].size()-1;colnum++){
double division=0.0;
for(int row=0;row<(int)training_sample.size();row++){
x_count[training_sample[row][colnum]]++;
y_value[training_sample[row][colnum]].push_back(training_sample[row][training_sample[0].size()-1]);
}
// Weighted sum of the entropies of each branch of this split.
map<int,int>::iterator beg=x_count.begin();
map<int,int>::iterator end=x_count.end();
while(beg!=end){
division+=double(beg->second)/double(training_sample.size())*compute_double_entropy(y_value[beg->first]);
beg++;
}
// BUG FIX: compare only after the whole conditional entropy has been
// summed; the original compared partial sums inside the loop (and read
// best_division before it was ever assigned).
if(best_feature==-1||division<best_division){
best_division=division;
best_feature=colnum;
}
x_count.clear();
y_value.clear();
}
return feature[best_feature];
}
int check_is_leaf(vector<vector<int> > training_sample,vector<int> feature){
// Decide whether this node should be a leaf.
// Returns -1 when the node must be split further (labels still mixed and
// features remain); otherwise returns the label to store in the leaf —
// the majority y (last column) among the samples.
if(training_sample.empty()) return -1;  // robustness: no samples to classify
map<int,int> y_count;
// Count occurrences of each label y.
for(size_t i=0;i<training_sample.size();i++){
y_count[training_sample[i][training_sample[0].size()-1]]++;
}
// BUG FIX: removed the debug print of y_count.size()/feature.size() that
// corrupted the required output format.
if(y_count.size()!=1&&feature.size()!=0) return -1;
// Majority vote over the label counts (renamed from `max`, which shadowed
// std::max).
map<int,int>::iterator beg=y_count.begin();
map<int,int>::iterator best=beg;
map<int,int>::iterator end=y_count.end();
while(beg!=end){
if(beg->second>best->second) best=beg;
beg++;
}
return best->first;
}
void create_branch_tree(vector<vector<int> >training_sample,TNode* root,int division_feature,vector<int> feature){
// Split training_sample on division_feature and recursively build one
// child subtree per observed value of that feature.
// division_feature is a feature ID; translate it to a column index first.
int division_location=0;
while(feature[division_location]!=division_feature) division_location++;
// The chosen feature is consumed: remove its id from the remaining list.
feature.erase(feature.begin()+division_location);
// Group the rows by their value in the split column, dropping that column.
map<int,vector<vector<int> > >classification;
for(int i=0;i<(int)training_sample.size();i++){
int key=training_sample[i][division_location];
vector<int> value;
for(int j=0;j<division_location;j++) value.push_back(training_sample[i][j]);
// BUG FIX: start at division_location+1 so the split column itself is
// removed; the original kept it, misaligning columns with `feature`.
for(int j=division_location+1;j<(int)training_sample[0].size();j++) value.push_back(training_sample[i][j]);
classification[key].push_back(value);
}
// BUG FIX: index children by the feature VALUE. The original used a
// counter `i` that was never incremented, so every subtree overwrote
// child[0]; keying by value also makes the preorder printer show the
// correct branch value (it prints the child's array index).
map<int,vector<vector<int> > >::iterator beg=classification.begin();
map<int,vector<vector<int> > >::iterator end=classification.end();
while(beg!=end){
root->child[beg->first]=construct_decision_tree(beg->second,feature);
beg++;
}
}
TNode* construct_decision_tree(vector<vector<int> > training_sample,vector<int> feature){
// Recursively build the ID3 decision tree for the given samples.
// A node becomes a leaf when check_is_leaf says so (labels uniform, or no
// features left — it then returns the label to store); otherwise the best
// feature is chosen and one subtree is built per feature value.
TNode *root=new TNode();
int type=check_is_leaf(training_sample,feature);
if(type!=-1){
root->type=type;
}
else{
// BUG FIX: removed the broken feature.size()==1 special case (it dropped
// the label column, never set root->feature, and never advanced its
// child index) and the debug prints that corrupted the required output.
// The general path already handles a single remaining feature: the
// recursion bottoms out in check_is_leaf once `feature` is empty.
int division_feature=find_the_best_division(training_sample,feature);
root->feature=division_feature;
create_branch_tree(training_sample,root,division_feature,feature);
}
return root;
}
void preorder_decision_tree(TNode* root,int value,int level){
// Preorder print of the tree: `level` copies of "---", then the branch
// value that led to this node (skipped at the root), then the node itself
// as inner(feature) or leaf(label).
for(int i=0;i<level;i++){
cout<<"---";
}
if(level) cout<<value<<"->";
if(root->feature==-1) cout<<"leaf("<<root->type<<")"<<endl;
else cout<<"inner("<<root->feature<<")"<<endl;
// BUG FIX: the original looped with i<=32 and read child[32], one past
// the end of the 32-element array (undefined behavior).
for(int i=0;i<32;i++){
if(root->child[i]){
preorder_decision_tree(root->child[i],i,level+1);
}
}
}
int main(){
// Read n (feature count) and m (sample count), then m rows of n feature
// values followed by the label y; build the tree over feature ids 0..n-1
// and print it in preorder.
// BUG FIX: removed the debug freopen("in.txt","r",stdin) — it breaks
// judge-style stdin input (and closes stdin if the file is missing).
int feature_num,training_num;
cin>>feature_num>>training_num;
vector<vector<int> >training_set;
for(int i=0;i<training_num;i++){
vector<int> row;
// n feature values plus the label => n+1 integers per row.
for(int j=0;j<=feature_num;j++){
int n;
cin>>n;
row.push_back(n);
}
training_set.push_back(row);
}
// Feature ids are simply the column indices 0..n-1.
vector<int> feature;
for(int i=0;i<feature_num;i++) feature.push_back(i);
TNode* root=construct_decision_tree(training_set,feature);
// BUG FIX: removed the debug print of root->feature / root->type that
// corrupted the required output format.
preorder_decision_tree(root,0,0);
return 0;
}