決策樹算法ID3算法源代碼&數據文件

//////////////////////////////////////////////////////////////////////////
/*
*
*
*
* 文件名稱：ID3.cpp
*
* 摘要：ID3算法實現

*
* 當前版本：1.0

* 完成日期：2011.01.13

*/////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <iostream>
#include <vector>
#include <math.h>
#include <string.h>

using namespace std;

// Fixed-size text cell: holds one attribute name or one attribute value
// as a NUL-terminated C string.
struct tnode
{
    char tdata[100]; // room for 99 characters plus the terminator
};

// One node of the generated decision tree.
struct Tree_Node
{
    char name[100];                 // attribute name on an inner node, class label on a leaf
    bool isLeaf;                    // true when the node is a leaf

    vector<tnode> att_list;         // attribute values labelling the outgoing branches

    vector<Tree_Node *> child_list; // child subtrees, pushed alongside att_list entries
};

// Pointer alias used by the tree-building code (spelling kept for compatibility).
typedef Tree_Node * pTreeNpde;

// One data row: the cells are stored in the order they were read.
struct dnode
{
    vector<tnode> row;
};

// The whole data set: data rows plus the header information.
struct D_Node
{
    vector<dnode> DB;        // data rows
    vector<tnode> attr_name; // column names taken from the header
    tnode class_name;        // name of the class (target) column
};

// Global data set; filled by readdata() and read by the ID3 routines below.
D_Node G_DB;
// Root of the decision tree; test() passes it to Generate_decision_tree().
pTreeNpde Root = NULL;

// Frequency record for one distinct value.
struct FreeQNode
{
    char name[100];     // the value itself
    int count;          // number of occurrences
    vector<int> Set_ID; // ids of the rows that carry this value
};

// Frequency record for one attribute value, broken down by class label.
struct FreeQNodeDouble
{
    char name[100];            // the attribute value
    int count;                 // number of rows with this attribute value
    vector<int> row_id;        // ids of those rows

    vector<FreeQNode> classes; // class labels seen with this value and their counts
};

// Distinct values of one attribute column together with their counts.
struct attr_node
{
    int attr_id;              // column index of the attribute
    vector<tnode> attr_name;  // distinct values found in that column
    vector<int> count_list;   // count_list[i] is the occurrence count of attr_name[i]
};

// Distinct values per candidate attribute; appended to by getAllTheAttribute().
vector<attr_node> G_Attr_List;

// Node of the binary search tree used to count distinct strings.
struct binNode
{
    char name[100];     // key
    int count;          // how many times the key was inserted
    vector<int> Set_ID; // row ids associated with the key

    binNode * lchild;
    binNode * rchild;
};

// Node of the binary search tree keyed by attribute value that also keeps
// a per-class breakdown of the rows carrying that value.
struct binNodeDouble
{
    char name[100];            // attribute value (key)
    int count;                 // number of rows with this value
    vector<int> row_id;        // ids of those rows
    binNodeDouble * lchild;
    binNodeDouble * rchild;

    vector<FreeQNode> classes; // class labels seen with this value and their counts
};

// Counts one occurrence of `str` in the search tree rooted at `r`.
// An existing key gets its counter bumped; otherwise a new node with
// count 1 is linked in. Keys that compare greater than a node go to its
// left child, so an in-order walk visits keys in descending order.
void insert_tree(binNode * & r, char str[100])
{
    // Walk down through the link that should hold the key.
    binNode ** slot = &r;
    while (*slot != NULL)
    {
        const int order = strcmp((*slot)->name,str);
        if (order == 0)
        {
            (*slot)->count ++;
            return;
        }
        slot = (order < 0) ? &(*slot)->lchild : &(*slot)->rchild;
    }

    binNode * fresh = new binNode;
    strcpy(fresh->name,str);
    fresh->count = 1;
    fresh->lchild = NULL;
    fresh->rchild = NULL;
    *slot = fresh;
}

// Frees every node of the tree rooted at `r` (children first) and resets
// `r` to NULL.
void delete_bin_tree(binNode *& r)
{
    if (NULL == r)
    {
        return;
    }

    delete_bin_tree(r->lchild);
    delete_bin_tree(r->rchild);
    delete r;
    r = NULL;
}

// In-order walk (left, node, right) that appends one FreeQNode per tree
// node to `Fq`, carrying the node's key, count and row ids.
void Bin_tree_inorder(binNode * r,vector<FreeQNode> & Fq)
{
    if (NULL == r)
    {
        return;
    }

    Bin_tree_inorder(r->lchild,Fq);

    FreeQNode entry;
    strcpy(entry.name,r->name);
    entry.count = r->count;
    entry.Set_ID = r->Set_ID; // row ids belonging to this value
    Fq.push_back(entry);

    Bin_tree_inorder(r->rchild,Fq);
}

// In-order walk that appends each key to attr.attr_name and its
// occurrence count to attr.count_list (same index in both vectors).
void Get_attr(binNode * r,attr_node & attr)
{
    if (NULL == r)
    {
        return;
    }

    Get_attr(r->lchild,attr);

    tnode value;
    strcpy(value.tdata,r->name);
    attr.attr_name.push_back(value);
    attr.count_list.push_back(r->count);

    Get_attr(r->rchild,attr);
}

// Records that row `DB_ID` has attribute value `attr_name` and class label
// `class_name` in the search tree rooted at `r`.
// Each tree node is one attribute value; it keeps the row ids that carry
// the value and, per class label, a counter plus the matching row ids.
// Values that compare greater than a node go to its left child.
void insert_tree_double(binNodeDouble *& r, int DB_ID,char attr_name[100],char class_name[100])
{
    binNodeDouble ** slot = &r;

    while (*slot != NULL)
    {
        binNodeDouble * cur = *slot;
        const int order = strcmp(cur->name,attr_name);

        if (order < 0)
        {
            slot = &cur->lchild;
            continue;
        }
        if (order > 0)
        {
            slot = &cur->rchild;
            continue;
        }

        // The attribute value already has a node: account for this row.
        cur->count ++;
        cur->row_id.push_back(DB_ID);

        for (size_t c = 0; c < cur->classes.size(); c++)
        {
            if (strcmp(cur->classes[c].name,class_name) == 0)
            {
                // Known class label: bump its counter and stop.
                cur->classes[c].count ++;
                cur->classes[c].Set_ID.push_back(DB_ID);
                return;
            }
        }

        // First time this class label shows up under this value.
        FreeQNode firstSeen;
        strcpy(firstSeen.name,class_name);
        firstSeen.count = 1;
        firstSeen.Set_ID.push_back(DB_ID);
        cur->classes.push_back(firstSeen);
        return;
    }

    // The attribute value is new: create its node with one class entry.
    binNodeDouble * fresh = new binNodeDouble;
    strcpy(fresh->name,attr_name);
    fresh->count = 1;
    fresh->row_id.push_back(DB_ID);
    fresh->lchild = NULL;
    fresh->rchild = NULL;

    FreeQNode firstClass;
    strcpy(firstClass.name,class_name);
    firstClass.count = 1;
    firstClass.Set_ID.push_back(DB_ID);
    fresh->classes.push_back(firstClass);

    *slot = fresh;
}

// Frees every node of the tree rooted at `r` (children first) and resets
// `r` to NULL.
void delete_bin_tree_double(binNodeDouble *& r)
{
    if (NULL == r)
    {
        return;
    }

    delete_bin_tree_double(r->lchild);
    delete_bin_tree_double(r->rchild);
    delete r;
    r = NULL;
}

// In-order walk (left, node, right) that appends one FreeQNodeDouble per
// tree node to `Fq`: the attribute value, its row count, its row ids and
// the per-class breakdown.
void Bin_tree_inorder_double(binNodeDouble *& r,vector<FreeQNodeDouble> &Fq)
{
    if (NULL == r)
    {
        return;
    }

    Bin_tree_inorder_double(r->lchild,Fq);

    FreeQNodeDouble group;
    strcpy(group.name,r->name); // the candidate attribute value
    group.count = r->count;
    group.row_id = r->row_id;
    group.classes = r->classes; // each entry carries name, count and row ids
    Fq.push_back(group);

    Bin_tree_inorder_double(r->rchild,Fq);
}

void getFqI(vector<int> S,int class_id,vector<FreeQNode> & Fq)
{
binNode * root = NULL;

for (int i = 0;i< S.size();i++)
{
insert_tree(root,G_DB.DB[S[i]].row[class_id].tdata);
}

Bin_tree_inorder(root,Fq);

delete_bin_tree(root);

}

void getFqIA(vector<int> S,int attr_id,int class_id,vector<FreeQNodeDouble> & Fq)
{
binNodeDouble * root = NULL;
printf("call getFqIA\n");

for (int i = 0;i< S.size();i++)
{
insert_tree_double(root,S[i],G_DB.DB[S[i]].row[attr_id].tdata,G_DB.DB[S[i]].row[class_id].tdata);
}

Bin_tree_inorder_double(root,Fq);

delete_bin_tree_double(root);

}
void readdata(char *filename)
{
char str[1000];
FILE * fp;
fp = fopen(filename,"r");
fgets(str,1000,fp);
int len = strlen(str);
int attr_no = 0; //屬性個數
int row_num = 0;
if (str != NULL)
{
row_num = 1;
}

for (int i = 0;i< len;i++)
{
  if (str[i] == '\t')
  {
   attr_no ++;
  }
}
attr_no ++;//最後一個是回車，整個屬性值+1

printf("%d\n",attr_no);
while(fgets(str,1000,fp) != NULL)
{
  row_num ++; //統計行數
}
fclose(fp);
fopen(filename,"r");
tnode t;
for (i = 0;i<attr_no;i++)
{
  fscanf(fp,"%s",t.tdata);
  G_DB.attr_name.push_back(t);
  printf("%s\n",t.tdata);
}
strcpy(G_DB.class_name.tdata,G_DB.attr_name[attr_no-1].tdata);
for (int j = 1;j< row_num;j++)
{
  dnode dt;
  tnode temp;

  for (int i = 0;i<attr_no;i++)
  {
   fscanf(fp,"%s",temp.tdata);
   dt.row.push_back(temp);
  }
  G_DB.DB.push_back(dt);
  dt.row.erase(dt.row.begin(),dt.row.end());
}

printf("%d\n",G_DB.DB.size());
for (i = 0;i< G_DB.DB.size();i++)
{
  for (int j = 0;j< G_DB.DB[i].row.size();j++)
  {
   printf("%s\t",G_DB.DB[i].row[j].tdata);
  }
  printf("\n");
}
}

double Fnc_I(vector<int> S,int class_id)
{
//給定一個子集，計算其按照class_id所對應的分類屬性進行分類時的期望I
// printf("called Fnc_I(%d)\n ",class_id);

vector<FreeQNode> Fq;

getFqI(S,class_id,Fq);
//調用getFqI獲取按照Class_id爲分類標準的分類結果,當Fq中爲一條數據時，則子集S都屬於一個分類
//否則，從中找到出現此時最大的，作爲返回結果

// printf("begin to compute I \n");
double total = 0;
for (int i = 0;i< Fq.size();i++)
{
total += Fq[i].count;
// printf("%s,%d\n",Fq[i].name,Fq[i].count);
}

double result = 0;

if (0 == total)
{
return 0;
}

for (i = 0;i< Fq.size();i++)
{
double p = Fq[i].count/total;
result += -1*(p * log(p)/log(2));
}

// printf("FNC_I return\n\n");

return result;

}

double Fnc_IA(vector<int> S,int attr_id,int class_id,vector<FreeQNodeDouble> & Fq)
{
//給定一個子集，計算其按照class_id所對應的分類屬性進行分類時的期望I

printf("給定一個子集，計算其按照class_id所對應的分類屬性進行分類時的期望I\n");

getFqIA(S,attr_id,class_id,Fq);

double total = 0;
for (int i = 0;i< Fq.size();i++)
{
  total += Fq[i].count;
}

double result = 0;

if (0 == total)
{
  return 0;
}
bool pr= true;
for (i = 0;i< Fq.size();i++)
{
  double stotal = Fq[i].count;
  double sresult = 0;
if (pr) printf("[%s,%d]\n",Fq[i].name,Fq[i].count);
  for (int j = 0;j < Fq[i].classes.size();j++)
  {
  if (pr) printf("%s,%d\n",Fq[i].classes[j].name,Fq[i].classes[j].count);
   for (int k = 0;k < Fq[i].classes[j].count;k++)
   {
   // printf("%d\t",Fq[i].classes[j].Set_ID[k]+1);
   }
      //printf("\n");
   double sp = Fq[i].classes[j].count/stotal; //計算子集的頻率
   sresult += -1*(sp*log(sp)/log(2));
  }

  result += (stotal/total) * sresult;
}
  if (pr) printf("\n");

return result;

}

// Picks the candidate attribute with the highest information gain, i.e.
// the one whose split leaves the lowest expected class entropy.
//
// Samples        - indices into G_DB.DB of the training rows
// attribute_list - column indices of the candidate attributes
// class_id       - column index of the class attribute
// Returns the column index of the best attribute (the first one wins on
// ties), or -1 when attribute_list is empty.
// Prints the winning position in attribute_list and its gain to stdout.
int SelectBestAttribute(vector<int> Samples,vector<int> attribute_list,int class_id)
{
    if (attribute_list.empty())
    {
        return -1;
    }

    // Entropy before splitting. The class column comes from the caller;
    // it used to be hard-coded to column 5.
    double fi = Fnc_I(Samples,class_id);

    double IA = 999999999;
    int best_attrib = -1;

    for (size_t i = 0;i < attribute_list.size();i++)
    {
        vector<FreeQNodeDouble> fqd;
        double tfa = Fnc_IA(Samples,attribute_list[i],class_id,fqd);

        if (IA > tfa)
        {
            IA = tfa;
            best_attrib = static_cast<int>(i);
        }
    }

    if (best_attrib < 0)
    {
        // No candidate scored below the sentinel; nothing to index.
        return -1;
    }

    printf("gain(%d) = %lf - %lf = %lf\n",best_attrib,fi,IA,fi - IA);

    return attribute_list[best_attrib];
}

void fnc_getattr(vector<int> Samples,int att_id,attr_node &at)
{
binNode * root = NULL;

for (int i = 0;i< Samples.size();i++)
{
insert_tree(root,G_DB.DB[Samples[i]].row[att_id].tdata);
}

Get_attr(root,at);

delete_bin_tree(root);

}

// Reports how many distinct class labels occur in a sample subset and
// which of them is the most frequent.
//
// Samples    - indices into G_DB.DB of the rows in the subset
// class_id   - column index of the class attribute
// class_num  - output: number of distinct class labels in the subset
// class_name - output: the most frequent label (the first one in
//              traversal order wins on ties); an empty string when the
//              subset is empty
void get_class_num_and_name(vector<int> Samples,int class_id,int & class_num,tnode & class_name)
{
    attr_node at;
    binNode * root = NULL;

    for (size_t i = 0;i < Samples.size();i++)
    {
        insert_tree(root,G_DB.DB[Samples[i]].row[class_id].tdata);
    }

    Get_attr(root,at);
    delete_bin_tree(root);

    class_num = static_cast<int>(at.attr_name.size());

    if (at.attr_name.empty())
    {
        // Empty subset: there is no label to report, and indexing
        // at.attr_name would read past the end of the vector.
        class_name.tdata[0] = '\0';
        return;
    }

    size_t best = 0;
    for (size_t j = 1;j < at.attr_name.size();j++)
    {
        if (at.count_list[j] > at.count_list[best])
        {
            best = j;
        }
    }

    strcpy(class_name.tdata,at.attr_name[best].tdata);
}

// Collects, for every candidate attribute, the distinct values it takes
// in the given samples and appends one attr_node per attribute to the
// global G_Attr_List. Afterwards the whole list is echoed to stdout.
//
// Samples        - indices into G_DB.DB of the rows to scan
// attribute_list - column indices of the candidate attributes
// class_id       - not used by this function
void getAllTheAttribute(vector<int> Samples,vector<int> attribute_list,int class_id)
{
    (void)class_id;

    printf("all the attribute are:\n");

    for (size_t i = 0;i < attribute_list.size();i++)
    {
        attr_node at;
        at.attr_id = attribute_list[i];
        fnc_getattr(Samples,attribute_list[i],at);

        G_Attr_List.push_back(at);
    }

    // Each loop declares its own index: reusing the previous loop's
    // variable after that loop has ended is not valid standard C++.
    for (size_t i = 0;i < G_Attr_List.size();i++)
    {
        printf("%d\n",G_Attr_List[i].attr_id);
        for (size_t j = 0;j < G_Attr_List[i].attr_name.size();j++)
        {
            printf("%s\t",G_Attr_List[i].attr_name[j].tdata);
        }
        printf("\n");
    }
}

// Builds a decision tree (ID3) for a set of training samples.
//
// root           - output: receives a newly allocated tree; the caller
//                  owns it
// Samples        - indices into G_DB.DB of the training rows
// attribute_list - column indices of the attributes still available
// class_id       - column index of the class attribute
//
// Algorithm:
//  (1) create node N;
//  (2) if all samples share one class C, return N as a leaf labelled C;
//  (3) if no candidate attribute is left, return N as a leaf labelled with
//      the most common class in the samples;
//  (4) otherwise pick the attribute with the highest information gain and
//      label N with it;
//  (5) for every value of that attribute that occurs in the samples, grow
//      a branch and attach the subtree built recursively from the matching
//      samples with the chosen attribute removed from the candidates.
// Values of the attribute that do not occur in the current samples get no
// branch, so no rule is produced for them.
//
// The distinct values of each attribute are taken from G_Attr_List, which
// getAllTheAttribute() must have filled beforehand. Progress is traced to
// stdout.
void Generate_decision_tree(Tree_Node * & root,vector<int> Samples, vector<int> attribute_list,int class_id)
{
    printf("begin to call Generate_decision_tree.\n");
    printf("the samples are:\n");
    for (size_t ts = 0;ts < Samples.size();ts++)
    {
        printf("%d\t",Samples[ts]);
    }
    printf("\nend\n");

    // (1) create node N; isLeaf is set explicitly because Tree_Node has
    // no constructor.
    root = new Tree_Node;
    root->isLeaf = false;
    root->name[0] = '\0';

    if (Samples.empty())
    {
        // Nothing to learn from: return an unlabelled leaf instead of
        // indexing into empty statistics further down.
        root->isLeaf = true;
        return;
    }

    // class_num == 1: all samples share one class and class_name is it;
    // otherwise class_name is the most common class.
    int class_num = 0;
    tnode class_name;
    get_class_num_and_name(Samples,class_id,class_num,class_name);

    if (1 == class_num)
    {
        printf("samples 都在同一個類【%s】 ,返回\n",class_name.tdata);
        // (2) every sample belongs to the same class: leaf labelled with it.
        strcpy(root->name,class_name.tdata);
        root->isLeaf = true;

        return;
    }

    if (attribute_list.size() == 0)
    {
        printf("attribute_list.size() == 0\n");
        // (3) no attribute left: leaf labelled with the most common class,
        // which was already computed above.
        strcpy(root->name,class_name.tdata);
        root->isLeaf = true;

        return;
    }

    // (4) find the best attribute to split on.
    int best_arrtib = SelectBestAttribute(Samples,attribute_list,class_id);

    // Look the attribute up by id rather than assuming that its position
    // in G_Attr_List equals its column index.
    const attr_node * best_values = NULL;
    for (size_t a = 0;a < G_Attr_List.size();a++)
    {
        if (G_Attr_List[a].attr_id == best_arrtib)
        {
            best_values = &G_Attr_List[a];
            break;
        }
    }

    if (best_arrtib < 0
        || static_cast<size_t>(best_arrtib) >= G_DB.attr_name.size()
        || NULL == best_values)
    {
        // No usable split attribute: fall back to a majority-class leaf.
        strcpy(root->name,class_name.tdata);
        root->isLeaf = true;
        return;
    }

    printf("the best attrib_id is %d,attribute_name is [%s]\n",best_arrtib,G_DB.attr_name[best_arrtib].tdata);

    // Partition the samples by the value of the chosen attribute; the
    // returned entropy is not needed here, only the partitions in Fq.
    vector<FreeQNodeDouble> Fq;
    Fnc_IA(Samples,best_arrtib,class_id,Fq);

    if (Fq.empty())
    {
        strcpy(root->name,class_name.tdata);
        root->isLeaf = true;
        return;
    }

    // Candidate attributes for the subtrees: everything but the chosen one.
    vector<int> att_list;
    for (size_t tt = 0;tt < attribute_list.size();tt++ )
    {
        if (attribute_list[tt] != best_arrtib)
        {
            att_list.push_back(attribute_list[tt]);
        }
    }

    // Largest partition (first one wins on ties); only reported.
    int id = 0;
    int num = Fq[id].count;

    for (size_t i = 1;i < Fq.size();i++)
    {
        if (num < Fq[i].count)
        {
            id = static_cast<int>(i);
            num = Fq[i].count;
        }
    }
    printf("最普通的類名:%s\n",Fq[id].name);

    printf("開始處理，將Sample按照%d,%s進行劃分\n",best_arrtib,G_DB.attr_name[best_arrtib].tdata);

    // Label N with the chosen attribute.
    strcpy(root->name,G_DB.attr_name[best_arrtib].tdata);

    const vector<tnode> & values = best_values->attr_name;

    printf("劃分共有%d個子集\n",static_cast<int>(Fq.size()));
    printf("G_Attr_List[best_arrtib].attr_name.size() = %d\n",static_cast<int>(values.size()));

    // (5) one branch per attribute value that occurs in the samples.
    for (int k = 0;k < static_cast<int>(values.size());k++)
    {
        // Find the partition holding this value, if any.
        int match = -1;
        for (int n = 0;n < static_cast<int>(Fq.size());n++)
        {
            if (strcmp(values[k].tdata,Fq[n].name) == 0)
            {
                match = n;
                break;
            }
        }

        if (match < 0)
        {
            // The value does not occur in the current samples. Growing a
            // branch for it would produce rules not backed by any data.
            continue;
        }

        root->att_list.push_back(values[k]); // branch label

        printf("\n開始獲取第[%d]個劃分\n",k);
        printf("[%d],屬性名 = %s\n",k,values[k].tdata);

        // Trace the partitions scanned up to and including the match.
        for (int n = 0;n <= match;n++)
        {
            printf("Fq[%d].name=%s\n",n,Fq[n].name);
        }
        printf("k=%d,n=%d\n",k,match);
        printf("%s\t%s\n",values[k].tdata,Fq[match].name);

        // The recursive call allocates the child itself; allocating one
        // here as well would leak it when the pointer is overwritten.
        pTreeNpde trnode = NULL;
        Generate_decision_tree(trnode,Fq[match].row_id,att_list,class_id);

        root->child_list.push_back(trnode);
    }
}

// Prints the decision tree as IF ... AND ... THEN ... rules, one rule per
// leaf, to stdout.
//
// r     - subtree to print; NULL is ignored
// rules - text of the rule built so far on the path from the root
// level - depth of r, 1 for the root
//
// At level 1 the node's branches start a rule with "IF"; deeper inner
// nodes extend it with " AND"; a leaf below level 1 prints the finished
// rule. Rule text is built with snprintf, so a rule that would not fit in
// the 1000-byte buffer is truncated rather than written past its end.
void generage_decision_rules(Tree_Node * r,char rules[1000],int level)
{
    if (NULL == r)
    {
        return;
    }

    if (1 != level && r->isLeaf)
    {
        printf("%s",rules);
        printf(" THEN %s = \"%s\"\n",G_DB.class_name.tdata,r->name);
        return;
    }

    // A branch needs both a label and a child; never index past either.
    size_t branches = r->child_list.size();
    if (r->att_list.size() < branches)
    {
        branches = r->att_list.size();
    }

    for (size_t i = 0;i < branches;i++)
    {
        char str[1000];

        if (1 == level)
        {
            snprintf(str,sizeof(str),"%sIF %s = \"%s\"",rules,r->name,r->att_list[i].tdata);
        }
        else
        {
            snprintf(str,sizeof(str),"%s AND %s = \"%s \"",rules,r->name,r->att_list[i].tdata);
        }

        generage_decision_rules(r->child_list[i],str,level +1 );
    }
}

// Runs ID3 on the data set held in G_DB: every row is a training sample,
// the last column is the class and all other columns are candidate
// attributes. Builds the tree into the global Root and prints the derived
// rules. Does nothing except report to stderr when no data is loaded.
void test()
{
    if (G_DB.DB.empty() || G_DB.attr_name.empty())
    {
        // Without a header, attr_name.size()-1 would wrap around to a huge
        // unsigned value; without rows there is nothing to learn from.
        fprintf(stderr,"test: no data loaded, nothing to do\n");
        return;
    }

    vector<int> s;
    for (size_t i = 0;i < G_DB.DB.size();i++)
    {
        s.push_back(static_cast<int>(i));
    }

    const int class_id = static_cast<int>(G_DB.attr_name.size()) - 1;

    vector<int> arrt_list;
    for (int i = 0;i < class_id;i++)
    {
        arrt_list.push_back(i);
    }

    getAllTheAttribute(s,arrt_list,class_id);

    Generate_decision_tree(Root,s,arrt_list,class_id);

    char rules[1000] = "";

    generage_decision_rules(Root,rules,1);
}

// Program entry point: loads data.txt from the working directory and runs
// the ID3 demo on it.
int main()
{
    // A writable array is accepted by readdata() whether its parameter is
    // declared as char* or const char*; a string literal is not.
    char data_file[] = "data.txt";

    readdata(data_file);
    test();

    return 0;
}
----------------------------------------------------------------------------------------

data.txt

由於數據文件以 Tab 鍵作爲分隔符，貼到這裏後 Tab 會變成空格，導致格式錯亂、代碼無法正常運行，請自行到 http://download.csdn.net/detail/liema2000/4231529 （決策樹之ID3 算法源碼及數據文件）下載。

決策樹算法ID3算法源代碼&數據文件

Fully generated SQL is a required output link property 錯誤解決

數據挖掘算法之 apriori

決策樹算法ID3算法源代碼&數據文件

數據挖掘十大經典算法之apriori算法&源代碼

互信息 Mutual Information

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結