決策樹ID3算法C++代碼及測試用例(bug版)

有bug,一直未找到,閒暇下來再看看,存一檔

輸入格式

首先爲正整數n、m,分別代表特徵個數、訓練樣本個數。

隨後爲m行,每行有n+1個整數。其中(1<n<=100,1<m<=1000)。

在後續的m行中,每行代表一個樣本的n個整數特徵值與該樣本的實際觀測結果y(整數)。(原文此處標註的特徵值與y的取值範圍在轉載時遺失;由代碼可知每個特徵取值應在0~31之間。)

輸出格式

決策樹的前序遍歷結果,若是一個內部節點則輸出inner(特徵id),若是一個葉節點,則輸出leaf(類別值)。在對每個節點打印時,若該節點處在第n層,需在前面打印n個---,並打印該分支的特徵值。


輸入樣例:

3 10
2 1 1 0
2 0 1 1
1 1 1 0
0 1 0 0
0 0 1 2
0 1 0 0
1 0 0 1
1 0 1 3
1 1 1 0
2 0 0 1
輸出樣例:

inner(1)
---0->inner(0)
------0->leaf(2)
------1->inner(2)
---------0->leaf(1)
---------1->leaf(3)
------2->leaf(1)
---1->leaf(0)
代碼如下:問題,在最後打印出決策樹時總是出錯

#include<iostream>
#include<vector>
#include<math.h>
#include<map>
#include<stdio.h>
using namespace std;

struct TNode{
    // One node of the ID3 decision tree. After the tree is built, a node is
    // either an inner node (feature != -1: the original feature id it splits
    // on) or a leaf (type != -1: the class label it predicts); a freshly
    // constructed node has both set to -1 until the builder fills one in.
    int feature;
    int type;
    // child[v] is the subtree followed when the split feature takes value v.
    // NOTE(review): assumes every feature value is in [0, 32) — confirm
    // against the input constraints.
    TNode *child[32];
    TNode() : feature(-1), type(-1) {
        for (int idx = 0; idx < 32; ++idx)
            child[idx] = 0;
    }
};
TNode* construct_decision_tree(vector<vector<int> > training_sample,vector<int> feature);
double compute_single_entropy(double p){
    // One term -p*log2(p) of the Shannon entropy sum, where p is the
    // proportion of samples carrying a particular label.
    // By convention the term is 0 when p == 0 (lim p->0 of -p*log2(p)).
    if (p == 0)
        return 0;
    return -p * (log(p) / log(2));
}
double compute_double_entropy(std::vector<int> y){
    // Shannon entropy H(Y), in bits, of the multiset of labels `y`.
    // Count how often each label occurs.
    std::map<int,int> label_counts;
    for (size_t idx = 0; idx < y.size(); ++idx)
        ++label_counts[y[idx]];
    // Sum -p*log2(p) over the observed labels (the p == 0 term is 0 by
    // convention, matching compute_single_entropy).
    double entropy = 0.0;
    for (std::map<int,int>::iterator it = label_counts.begin();
         it != label_counts.end(); ++it) {
        double p = double(it->second) / double(y.size());
        if (p != 0)
            entropy += -p * (log(p) / log(2));
    }
    return entropy;
}
int find_the_best_division(vector<vector<int> > training_sample,vector<int>feature){
    //找出最好的分類方法,所需的是:整個訓練特徵向量組和每個特徵向量的結果標註
    double best_division;
    int best_feature=-1;
    map<int,int> x_count;
    map<int,vector<int> > y_value;
    //記錄每個x可能出現的y;
    for(int colnum=0;colnum<training_sample[0].size();colnum++){
        double division=0.0;
        for(int row=0;row<training_sample.size();row++){
            x_count[training_sample[row][colnum]]++;
            y_value[training_sample[row][colnum]].push_back(training_sample[row][training_sample[0].size()-1]);
        }
        map<int,int>::iterator beg=x_count.begin();
        map<int,int>::iterator end=x_count.end();
        while(beg!=end){
            division+=double(beg->second)/double(training_sample.size())*compute_double_entropy(y_value[beg->first]);
            if(best_feature==-1||best_division>division){
            best_division=division;
            best_feature=colnum;
            }
            beg++;
        }
        x_count.clear();
        y_value.clear();
    }
    return feature[best_feature];
}
int check_is_leaf(std::vector<std::vector<int> > training_sample,std::vector<int> feature){
    // Decide whether this sample subset should become a leaf.
    // Leaf when either (a) all labels agree (pure node) or (b) no feature is
    // left to split on; in both cases return the majority label (ties go to
    // the smallest label, since std::map iterates in key order).
    // Returns -1 when the node must be split further.
    std::map<int,int> y_count;
    int label_col = (int)training_sample[0].size() - 1;
    for (int i = 0; i < (int)training_sample.size(); i++)
        y_count[training_sample[i][label_col]]++;
    // BUG FIX: removed the debug print of y_count.size()/feature.size() that
    // interleaved garbage with the program's required output.
    if (y_count.size() != 1 && feature.size() != 0)
        return -1;  // impure AND features remain: keep splitting
    std::map<int,int>::iterator best = y_count.begin();
    for (std::map<int,int>::iterator it = y_count.begin(); it != y_count.end(); ++it)
        if (it->second > best->second)
            best = it;
    return best->first;
}
void create_branch_tree(vector<vector<int> >training_sample,TNode* root,int division_feature,vector<int> feature){
    int division_location=0;
    //先找到最好分類值的位置
    while(feature[division_location]!=division_feature) division_location++;
    vector<int>::iterator it=feature.begin()+division_location;
    feature.erase(it);
    map<int,vector<vector<int> > >classification;
    for(int i=0;i<training_sample.size();i++){
        int key=training_sample[i][division_location];
        vector<int> value;
        for(int j=0;j<division_location;j++) value.push_back(training_sample[i][j]);
        for(int j=division_location;j<training_sample[0].size();j++) value.push_back(training_sample[i][j]);
        classification[key].push_back(value);
        value.clear();
    }
    map<int,vector<vector<int> > >::iterator beg=classification.begin();
    map<int,vector<vector<int> > >::iterator end=classification.end();
    int i=0;
    while(beg!=end){
        root->child[i]=construct_decision_tree(beg->second,feature);
        beg++;
    }
}
TNode* construct_decision_tree(std::vector<std::vector<int> > training_sample,std::vector<int> feature){
    // Recursively build the ID3 tree for this sample subset.
    // `feature` holds the original ids of the columns still present in
    // training_sample (the label column is always last).
    TNode *root = new TNode();
    int type = check_is_leaf(training_sample, feature);
    if (type != -1) {
        // Pure node, or no features left: leaf carrying the (majority) label.
        // BUG FIX: removed the "besttype..." debug print that corrupted the
        // program's output.
        root->type = type;
    }
    else {
        // BUG FIX: dropped the broken feature.size()==1 special case. It
        // partitioned on column 0 but stored ONLY the feature value in each
        // row (discarding the label) and recursed without shrinking
        // `feature`. The general path below already handles a single
        // remaining feature: create_branch_tree erases it and the children
        // become leaves via check_is_leaf's majority vote.
        int division_feature = find_the_best_division(training_sample, feature);
        // BUG FIX: removed the "bestdivis..." debug print.
        root->feature = division_feature;  // inner node: remember split feature id
        create_branch_tree(training_sample, root, division_feature, feature);
    }
    return root;
}
void preorder_decision_tree(TNode* root,int value,int level){
    // Pre-order print of the tree: level*"---" indent, then "value->" for
    // non-root nodes (the feature value taken on this branch), then either
    // inner(feature id) or leaf(class label).
    for(int i=0;i<level;i++){
        std::cout<<"---";
    }
    if(level) std::cout<<value<<"->";
    if(root->feature==-1) std::cout<<"leaf("<<root->type<<")"<<std::endl;
    else std::cout<<"inner("<<root->feature<<")"<<std::endl;
    // BUG FIX: the original loop ran i <= 32 and read child[32], one element
    // past the end of the 32-slot array — undefined behavior that can crash
    // or recurse into garbage while printing.
    for(int i=0;i<32;i++){
        if(root->child[i]){
            preorder_decision_tree(root->child[i],i,level+1);
        }
    }
}
int main(){
    // Input: n (feature count), m (sample count), then m rows of n feature
    // values followed by the label y. Output: pre-order dump of the tree.
    // BUG FIX: removed freopen("in.txt","r",stdin) — the problem reads from
    // standard input, and the local-debug redirect breaks that (and fails
    // silently when in.txt is absent).
    int feature_num, training_num;
    if (!(std::cin >> feature_num >> training_num)) return 0;
    std::vector<std::vector<int> > training_set(training_num,
                                                std::vector<int>(feature_num + 1));
    for (int i = 0; i < training_num; i++)
        for (int j = 0; j <= feature_num; j++)
            std::cin >> training_set[i][j];
    // feature[i] = original id of column i (0..n-1); the label column is last.
    std::vector<int> feature(feature_num);
    for (int i = 0; i < feature_num; i++) feature[i] = i;
    TNode* root = construct_decision_tree(training_set, feature);
    // BUG FIX: removed the debug line printing root->feature and root->type,
    // which prepended a stray "<feature> <type>" line to the expected output.
    preorder_decision_tree(root, 0, 0);
    return 0;
}


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章