//////////////////////////////////////////////////////////////////////////
/*
*
*
*
* 文件名稱:ID3.cpp
*
* 摘 要:ID3算法實現
*
* 當前版本:1.0
* 完成日期:2011.01.13
*/////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <iostream>
#include <vector>
#include <math.h>
#include <string.h>
using namespace std;
// Fixed-size text cell holding one NUL-terminated token (a column name,
// an attribute value or a class label) in a 100-byte buffer.
struct tnode
{
    char tdata[100]; // token text
};
// Node of the generated decision tree.
//   name       - attribute name for an internal node, class label for a leaf
//   isLeaf     - true when the node is a leaf
//   att_list   - attribute values labelling the outgoing branches
//   child_list - subtrees; child_list[i] is the branch taken for att_list[i]
struct Tree_Node
{
    char name[100];
    bool isLeaf;
    std::vector<tnode> att_list;
    std::vector<Tree_Node *> child_list;
};
// Pointer alias used throughout the file (identifier spelling kept as is).
typedef Tree_Node * pTreeNpde;
// One record of the data set: its cells in column order.
struct dnode
{
    std::vector<tnode> row;
};
// The whole data set as loaded from the input file.
struct D_Node
{
    std::vector<dnode> DB;        // data rows (the header line is not stored here)
    std::vector<tnode> attr_name; // column names, the class column included
    tnode class_name;             // name of the class column (the last column)
};
// Data set shared by every routine in this file; filled by readdata().
D_Node G_DB;
// Root of the decision tree; assigned by Generate_decision_tree() when test() runs.
pTreeNpde Root = NULL;
// Frequency record for one distinct value.
struct FreeQNode
{
    char name[100];          // the value itself
    int count;               // number of occurrences
    std::vector<int> Set_ID; // ids of the rows that carry this value
};
// Frequency record for one attribute value together with the class
// distribution of the rows that carry it.
struct FreeQNodeDouble
{
    char name[100];                 // attribute value
    int count;                      // number of rows with this value
    std::vector<int> row_id;        // ids of those rows
    std::vector<FreeQNode> classes; // class labels seen in those rows, with their counts
};
// Distinct values of one attribute column.
struct attr_node
{
    int attr_id;                  // column index of the attribute
    std::vector<tnode> attr_name; // distinct values found in that column
    std::vector<int> count_list;  // count_list[i] = occurrences of attr_name[i]
};
// Value catalogue of the candidate attributes; appended to by getAllTheAttribute()
// and indexed by attribute id in Generate_decision_tree().
vector<attr_node> G_Attr_List;
// Node of the binary search tree used to count distinct strings.
struct binNode
{
    char name[100];          // key
    int count;               // how many times the key was inserted
    std::vector<int> Set_ID; // row ids attached to the key
    binNode * lchild;        // left subtree
    binNode * rchild;        // right subtree
};
// Node of the binary search tree keyed by attribute value that also tracks
// the class distribution of the rows carrying that value.
struct binNodeDouble
{
    char name[100];                 // attribute value (key)
    int count;                      // rows carrying this value
    std::vector<int> row_id;        // ids of those rows
    binNodeDouble * lchild;         // left subtree
    binNodeDouble * rchild;         // right subtree
    std::vector<FreeQNode> classes; // per-class counts and row ids for this value
};
// Counts one occurrence of `str` in the search tree rooted at `r`.
// An existing key gets its counter incremented; otherwise a new node with
// count 1 is linked in. Keys that compare greater than a node's key go to
// that node's left subtree, smaller ones to the right.
void insert_tree(binNode * & r, char str[100])
{
    binNode ** slot = &r;
    while (*slot != NULL)
    {
        binNode * cur = *slot;
        const int order = strcmp(cur->name, str);
        if (0 == order)
        {
            cur->count++;
            return;
        }
        slot = (order < 0) ? &cur->lchild : &cur->rchild;
    }

    // Key not present: hang a fresh node on the empty link we stopped at.
    binNode * fresh = new binNode;
    strcpy(fresh->name, str);
    fresh->count = 1;
    fresh->lchild = NULL;
    fresh->rchild = NULL;
    *slot = fresh;
}
// Frees every node of the tree rooted at `r` (children first) and resets
// `r` to NULL.
void delete_bin_tree(binNode *& r)
{
    if (NULL == r)
    {
        return;
    }
    delete_bin_tree(r->lchild);
    delete_bin_tree(r->rchild);
    delete r;
    r = NULL;
}
// Walks the tree in order (left, node, right) and appends one FreeQNode per
// node to `Fq`, copying the key, its count and its row-id list.
void Bin_tree_inorder(binNode * r,vector<FreeQNode> & Fq)
{
    if (NULL == r)
    {
        return;
    }
    Bin_tree_inorder(r->lchild, Fq);

    FreeQNode entry;
    strcpy(entry.name, r->name);
    entry.count = r->count;
    entry.Set_ID = r->Set_ID; // row ids belonging to this key
    Fq.push_back(entry);

    Bin_tree_inorder(r->rchild, Fq);
}
// Walks the tree in order and appends every key to attr.attr_name and its
// occurrence count to attr.count_list (same index in both lists).
void Get_attr(binNode * r,attr_node & attr)
{
    if (NULL == r)
    {
        return;
    }
    Get_attr(r->lchild, attr);

    tnode value;
    strcpy(value.tdata, r->name);
    attr.attr_name.push_back(value);
    attr.count_list.push_back(r->count);

    Get_attr(r->rchild, attr);
}
// Records row `DB_ID`, whose attribute value is `attr_name` and whose class
// label is `class_name`, in the tree rooted at `r`.
// The tree is keyed by attribute value; every node keeps the ids of its rows
// plus a per-class breakdown (count and row ids for each class label).
void insert_tree_double(binNodeDouble *& r, int DB_ID,char attr_name[100],char class_name[100])
{
    binNodeDouble ** slot = &r;
    while (*slot != NULL)
    {
        binNodeDouble * cur = *slot;
        const int order = strcmp(cur->name, attr_name);
        if (order != 0)
        {
            // Greater keys live on the left, smaller ones on the right.
            slot = (order < 0) ? &cur->lchild : &cur->rchild;
            continue;
        }

        // Attribute value already known: account for this row.
        cur->count++;
        cur->row_id.push_back(DB_ID);
        for (size_t c = 0; c < cur->classes.size(); ++c)
        {
            if (strcmp(cur->classes[c].name, class_name) == 0)
            {
                cur->classes[c].count++;
                cur->classes[c].Set_ID.push_back(DB_ID);
                return;
            }
        }

        // First time this class label shows up under the value.
        FreeQNode firstSeen;
        strcpy(firstSeen.name, class_name);
        firstSeen.count = 1;
        firstSeen.Set_ID.push_back(DB_ID);
        cur->classes.push_back(firstSeen);
        return;
    }

    // Attribute value not present yet: create its node with a single class entry.
    binNodeDouble * fresh = new binNodeDouble;
    strcpy(fresh->name, attr_name);
    fresh->count = 1;
    fresh->row_id.push_back(DB_ID);
    fresh->lchild = NULL;
    fresh->rchild = NULL;

    FreeQNode onlyClass;
    strcpy(onlyClass.name, class_name);
    onlyClass.count = 1;
    onlyClass.Set_ID.push_back(DB_ID);
    fresh->classes.push_back(onlyClass);

    *slot = fresh;
}
// Frees every node of the tree rooted at `r` (children first) and resets
// `r` to NULL.
void delete_bin_tree_double(binNodeDouble *& r)
{
    if (NULL == r)
    {
        return;
    }
    delete_bin_tree_double(r->lchild);
    delete_bin_tree_double(r->rchild);
    delete r;
    r = NULL;
}
// Walks the tree in order (left, node, right) and appends one
// FreeQNodeDouble per node to `Fq`: the attribute value, its row count, its
// row ids and a copy of its per-class breakdown.
void Bin_tree_inorder_double(binNodeDouble *& r,vector<FreeQNodeDouble> &Fq)
{
    if (NULL == r)
    {
        return;
    }
    Bin_tree_inorder_double(r->lchild, Fq);

    FreeQNodeDouble entry;
    strcpy(entry.name, r->name); // attribute value
    entry.count = r->count;
    entry.row_id = r->row_id;
    for (size_t c = 0; c < r->classes.size(); ++c)
    {
        const FreeQNode & src = r->classes[c];
        FreeQNode classEntry;
        strcpy(classEntry.name, src.name);
        classEntry.count = src.count;
        classEntry.Set_ID = src.Set_ID; // row ids of this class within the value
        entry.classes.push_back(classEntry);
    }
    Fq.push_back(entry);

    Bin_tree_inorder_double(r->rchild, Fq);
}
void getFqI(vector<int> S,int class_id,vector<FreeQNode> & Fq)
{
binNode * root = NULL;
for (int i = 0;i< S.size();i++)
{
insert_tree(root,G_DB.DB[S[i]].row[class_id].tdata);
}
Bin_tree_inorder(root,Fq);
delete_bin_tree(root);
}
void getFqIA(vector<int> S,int attr_id,int class_id,vector<FreeQNodeDouble> & Fq)
{
binNodeDouble * root = NULL;
printf("call getFqIA\n");
for (int i = 0;i< S.size();i++)
{
insert_tree_double(root,S[i],G_DB.DB[S[i]].row[attr_id].tdata,G_DB.DB[S[i]].row[class_id].tdata);
}
Bin_tree_inorder_double(root,Fq);
delete_bin_tree_double(root);
}
void readdata(char *filename)
{
char str[1000];
FILE * fp;
fp = fopen(filename,"r");
fgets(str,1000,fp);
int len = strlen(str);
int attr_no = 0; //屬性個數
int row_num = 0;
if (str != NULL)
{
row_num = 1;
}
for (int i = 0;i< len;i++)
{
if (str[i] == '\t')
{
attr_no ++;
}
}
attr_no ++;//最後一個是回車,整個屬性值+1
printf("%d\n",attr_no);
while(fgets(str,1000,fp) != NULL)
{
row_num ++; //統計行數
}
fclose(fp);
fopen(filename,"r");
tnode t;
for (i = 0;i<attr_no;i++)
{
fscanf(fp,"%s",t.tdata);
G_DB.attr_name.push_back(t);
printf("%s\n",t.tdata);
}
strcpy(G_DB.class_name.tdata,G_DB.attr_name[attr_no-1].tdata);
for (int j = 1;j< row_num;j++)
{
dnode dt;
tnode temp;
for (int i = 0;i<attr_no;i++)
{
fscanf(fp,"%s",temp.tdata);
dt.row.push_back(temp);
}
G_DB.DB.push_back(dt);
dt.row.erase(dt.row.begin(),dt.row.end());
}
printf("%d\n",G_DB.DB.size());
for (i = 0;i< G_DB.DB.size();i++)
{
for (int j = 0;j< G_DB.DB[i].row.size();j++)
{
printf("%s\t",G_DB.DB[i].row[j].tdata);
}
printf("\n");
}
}
double Fnc_I(vector<int> S,int class_id)
{
//給定一個子集,計算其按照class_id所對應的分類屬性進行分類時的期望I
// printf("called Fnc_I(%d)\n ",class_id);
vector<FreeQNode> Fq;
getFqI(S,class_id,Fq);
//調用getFqI獲取按照Class_id爲分類標準的分類結果,當Fq中爲一條數據時,則子集S都屬於一個分類
//否則,從中找到出現此時最大的,作爲返回結果
// printf("begin to compute I \n");
double total = 0;
for (int i = 0;i< Fq.size();i++)
{
total += Fq[i].count;
// printf("%s,%d\n",Fq[i].name,Fq[i].count);
}
double result = 0;
if (0 == total)
{
return 0;
}
for (i = 0;i< Fq.size();i++)
{
double p = Fq[i].count/total;
result += -1*(p * log(p)/log(2));
}
// printf("FNC_I return\n\n");
return result;
}
double Fnc_IA(vector<int> S,int attr_id,int class_id,vector<FreeQNodeDouble> & Fq)
{
//給定一個子集,計算其按照class_id所對應的分類屬性進行分類時的期望I
printf("給定一個子集,計算其按照class_id所對應的分類屬性進行分類時的期望I\n");
getFqIA(S,attr_id,class_id,Fq);
double total = 0;
for (int i = 0;i< Fq.size();i++)
{
total += Fq[i].count;
}
double result = 0;
if (0 == total)
{
return 0;
}
bool pr= true;
for (i = 0;i< Fq.size();i++)
{
double stotal = Fq[i].count;
double sresult = 0;
if (pr) printf("[%s,%d]\n",Fq[i].name,Fq[i].count);
for (int j = 0;j < Fq[i].classes.size();j++)
{
if (pr) printf("%s,%d\n",Fq[i].classes[j].name,Fq[i].classes[j].count);
for (int k = 0;k < Fq[i].classes[j].count;k++)
{
// printf("%d\t",Fq[i].classes[j].Set_ID[k]+1);
}
//printf("\n");
double sp = Fq[i].classes[j].count/stotal; //計算子集的頻率
sresult += -1*(sp*log(sp)/log(2));
}
result += (stotal/total) * sresult;
}
if (pr) printf("\n");
return result;
}
// Chooses the attribute whose split leaves the lowest expected class
// entropy, i.e. the one with the highest information gain.
//
// Samples        - indices into G_DB.DB
// attribute_list - candidate attribute column ids
// class_id       - column holding the class label
// Returns the chosen column id, or -1 when attribute_list is empty.
// Prints the gain of the winner; the number shown in "gain(...)" is the
// winner's position inside attribute_list.
int SelectBestAttribute(vector<int> Samples,vector<int> attribute_list,int class_id)
{
    if (attribute_list.empty())
    {
        return -1;
    }

    // Entropy before splitting, measured on the class column that was passed in.
    const double fi = Fnc_I(Samples,class_id);

    double IA = 999999999;
    int best_attrib = -1;
    for (size_t i = 0;i < attribute_list.size();i++)
    {
        vector<FreeQNodeDouble> fqd;
        const double tfa = Fnc_IA(Samples,attribute_list[i],class_id,fqd);
        if (IA > tfa)
        {
            IA = tfa;
            best_attrib = static_cast<int>(i);
        }
    }
    if (best_attrib < 0)
    {
        // No candidate produced a usable value (e.g. NaN); report "none".
        return -1;
    }

    printf("gain(%d) = %lf - %lf = %lf\n",best_attrib,fi,IA,fi - IA);
    return attribute_list[best_attrib];
}
void fnc_getattr(vector<int> Samples,int att_id,attr_node &at)
{
binNode * root = NULL;
for (int i = 0;i< Samples.size();i++)
{
insert_tree(root,G_DB.DB[Samples[i]].row[att_id].tdata);
}
Get_attr(root,at);
delete_bin_tree(root);
}
// Reports how many distinct class labels occur in `Samples` and which one
// is the most frequent.
//
// Samples    - indices into G_DB.DB
// class_id   - column holding the class label
// class_num  - out: number of distinct labels (0 when Samples is empty)
// class_name - out: the most frequent label; on a tie the label that comes
//              first in the counting tree's in-order traversal wins; set to
//              the empty string when Samples is empty
void get_class_num_and_name(vector<int> Samples,int class_id,int & class_num,tnode & class_name)
{
    attr_node at;
    binNode * root = NULL;
    for (size_t i = 0;i < Samples.size();i++)
    {
        insert_tree(root,G_DB.DB[Samples[i]].row[class_id].tdata);
    }
    Get_attr(root,at);
    delete_bin_tree(root);

    class_num = static_cast<int>(at.attr_name.size());
    if (at.attr_name.empty())
    {
        // Nothing to inspect: report "no class" instead of reading past the end.
        class_name.tdata[0] = '\0';
        return;
    }

    // Strict '>' keeps the first label among equally frequent ones; with a
    // single label this simply selects index 0.
    size_t id = 0;
    int num = 0;
    for (size_t j = 0;j < at.attr_name.size();j++)
    {
        if (at.count_list[j] > num)
        {
            num = at.count_list[j];
            id = j;
        }
    }
    strcpy(class_name.tdata,at.attr_name[id].tdata);
}
// Builds the value catalogue of the candidate attributes: for every column
// id in `attribute_list`, the distinct values found in `Samples` are
// appended to G_Attr_List as one attr_node. The whole catalogue is then
// printed (attribute id on one line, its values tab-separated on the next).
//
// G_Attr_List is appended to, never cleared. Generate_decision_tree()
// indexes it by attribute id, which matches only when the ids are
// 0,1,2,... in order and the list was empty beforehand.
// `class_id` is accepted for symmetry with the other routines but not used.
void getAllTheAttribute(vector<int> Samples,vector<int> attribute_list,int class_id)
{
    (void)class_id;
    printf("all the attribute are:\n");
    for (size_t i = 0;i < attribute_list.size();i++)
    {
        attr_node at;
        at.attr_id = attribute_list[i];
        fnc_getattr(Samples,attribute_list[i],at);
        G_Attr_List.push_back(at);
    }
    for (size_t i = 0;i < G_Attr_List.size();i++)
    {
        printf("%d\n",G_Attr_List[i].attr_id);
        for (size_t j = 0;j < G_Attr_List[i].attr_name.size();j++)
        {
            printf("%s\t",G_Attr_List[i].attr_name[j].tdata);
        }
        printf("\n");
    }
}
// Builds an ID3 decision tree for `Samples` and stores its root in `root`.
//
// root           - out: newly allocated root of the (sub)tree; the caller owns it
// Samples        - indices into G_DB.DB
// attribute_list - candidate attribute column ids still available for splitting
// class_id       - column holding the class label
//
// Relies on G_Attr_List having been filled by getAllTheAttribute() so that
// G_Attr_List[id] describes attribute `id`. Progress is traced on stdout.
void Generate_decision_tree(Tree_Node * & root,vector<int> Samples, vector<int> attribute_list,int class_id)
{
    /* Algorithm: Generate_decision_tree(samples, attribute_list).
       Input : training samples with discrete-valued attributes; the set of
               candidate attributes.
       Output: a decision tree.
       Method:
       (1)  create node N;
       (2)  if all samples belong to the same class C then
       (3)      return N as a leaf labelled C;
       (4)  if attribute_list is empty then
       (5)      return N as a leaf labelled with the most common class in samples;
       (6)  select best_attribute, the attribute with the highest information gain;
       (7)  label N with best_attribute;
       (8)  for each known value a_i of best_attribute
       (9)      grow a branch from N for the condition best_attribute = a_i;
       (10)     let s_i be the samples with best_attribute = a_i;
       (11)     if s_i is empty then
       (12)         attach a leaf labelled with the most common class in samples;
       (13)     else attach the node returned by
                    Generate_decision_tree(s_i, attribute_list - best_attribute).
       This implementation deviates at (11)-(12): a value that does not occur
       in the current samples gets no branch at all, so no rule is produced
       for a combination that was never observed.
    */
    printf("begin to call Generate_decision_tree.\n");
    printf("the samples are:\n");
    for (size_t ts = 0;ts < Samples.size();ts++)
    {
        printf("%d\t",Samples[ts]);
    }
    printf("\nend\n");

    root = new Tree_Node; // (1) create node N
    root->isLeaf = false;
    root->name[0] = '\0';

    if (Samples.empty())
    {
        // Nothing to learn from: return an unlabelled leaf instead of
        // indexing into empty frequency tables further down.
        root->isLeaf = true;
        return;
    }

    // class_num == 1 : every sample has the same class, class_name is that class.
    // class_num  > 1 : class_name is the most common class among the samples.
    int class_num = 0;
    tnode class_name;
    get_class_num_and_name(Samples,class_id,class_num,class_name);

    if (1 == class_num)
    {
        // (2)-(3) pure node: leaf labelled with the single class
        printf("samples 都在同一個類【%s】 ,返回\n",class_name.tdata);
        strcpy(root->name,class_name.tdata);
        root->isLeaf = true;
        return;
    }
    if (attribute_list.size() == 0)
    {
        // (4)-(5) no attribute left: leaf labelled with the most common class
        printf("attribute_list.size() == 0\n");
        strcpy(root->name,class_name.tdata);
        root->isLeaf = true;
        return;
    }

    // (6) pick the attribute with the highest information gain
    int best_arrtib = SelectBestAttribute(Samples,attribute_list,class_id);
    if (best_arrtib < 0
        || best_arrtib >= static_cast<int>(G_DB.attr_name.size())
        || best_arrtib >= static_cast<int>(G_Attr_List.size()))
    {
        // No usable attribute, or its value catalogue was never built:
        // fall back to a leaf labelled with the most common class.
        strcpy(root->name,class_name.tdata);
        root->isLeaf = true;
        return;
    }
    printf("the best attrib_id is %d,attribute_name is [%s]\n",best_arrtib,G_DB.attr_name[best_arrtib].tdata);

    // Partition the samples by the chosen attribute; only Fq is needed here,
    // the entropy value returned by Fnc_IA is not.
    vector<FreeQNodeDouble> Fq;
    Fnc_IA(Samples,best_arrtib,class_id,Fq);

    // Candidate attributes for the subtrees: everything except the one just used.
    vector<int> att_list;
    for (size_t tt = 0;tt < attribute_list.size();tt++)
    {
        if (attribute_list[tt] != best_arrtib)
        {
            att_list.push_back(attribute_list[tt]);
        }
    }

    // Report the most common class of the samples (computed above).
    printf("最普通的類名:%s\n",class_name.tdata);

    printf("開始處理,將Sample按照%d,%s進行劃分\n",best_arrtib,G_DB.attr_name[best_arrtib].tdata);
    strcpy(root->name,G_DB.attr_name[best_arrtib].tdata); // (7) label N with best_attribute
    printf("劃分共有%d個子集\n",static_cast<int>(Fq.size()));

    // The recursion below never modifies G_Attr_List, so the reference stays valid.
    const vector<tnode> & values = G_Attr_List[best_arrtib].attr_name;
    printf("G_Attr_List[best_arrtib].attr_name.size() = %d\n",static_cast<int>(values.size()));

    const int partitions = static_cast<int>(Fq.size());
    for (int k = 0;k < static_cast<int>(values.size());k++)
    {
        // (8) locate the partition that holds the samples with this value
        int match = -1;
        for (int nn = 0;nn < partitions;nn++)
        {
            if (strcmp(values[k].tdata,Fq[nn].name) == 0)
            {
                match = nn;
                break;
            }
        }
        if (match < 0)
        {
            // The value does not occur in the current samples: no branch.
            continue;
        }

        root->att_list.push_back(values[k]); // (9) branch label
        printf("\n開始獲取第[%d]個劃分\n",k);
        printf("[%d],屬性名 = %s\n",k,values[k].tdata);
        for (int n = 0;n <= match;n++)
        {
            printf("Fq[%d].name=%s\n",n,Fq[n].name);
        }
        printf("k=%d,n=%d\n",k,match);
        printf("%s\t%s\n",values[k].tdata,Fq[match].name);

        // (10),(13) recurse on the partition; the callee allocates the node,
        // so nothing is allocated here (doing so would leak it).
        pTreeNpde trnode = NULL;
        Generate_decision_tree(trnode,Fq[match].row_id,att_list,class_id);
        root->child_list.push_back(trnode);
    }
}
// Prints one "IF ... AND ... THEN class = value" rule per leaf reachable
// from `r`.
//
// r     - current tree node
// rules - text of the rule accumulated on the way down to `r`
// level - depth of `r`, 1 for the root
//
// The root always expands its children and opens the rule with "IF ";
// deeper internal nodes append an " AND " clause; a leaf below the root
// prints the finished rule. A root without children prints nothing.
void generage_decision_rules(Tree_Node * r,char rules[1000],int level)
{
    const bool atRoot = (1 == level);

    if (!atRoot && r->isLeaf)
    {
        printf("%s",rules);
        printf(" THEN %s = \"%s\"\n",G_DB.class_name.tdata,r->name);
        return;
    }

    // Root and inner clauses differ only in the joiner and in the closing quote.
    const char * joiner = atRoot ? "IF " : " AND ";
    const char * closing = atRoot ? "\"" : " \"";

    for (size_t b = 0; b < r->child_list.size(); ++b)
    {
        char clause[1000];
        strcpy(clause, rules);
        strcat(clause, joiner);
        strcat(clause, r->name);
        strcat(clause, " = \"");
        strcat(clause, r->att_list[b].tdata);
        strcat(clause, closing);
        generage_decision_rules(r->child_list[b], clause, level + 1);
    }
}
// Runs the whole pipeline on the data already loaded into G_DB: uses every
// row as a training sample, every column but the last as a candidate
// attribute and the last column as the class, then builds the tree into
// Root and prints the resulting rules.
// Does nothing except report on stderr when no columns have been loaded.
void test()
{
    if (G_DB.attr_name.empty())
    {
        // size() - 1 below would wrap around on an empty column list.
        fprintf(stderr,"test: no data loaded\n");
        return;
    }

    vector<int> s;
    for (size_t i = 0;i < G_DB.DB.size();i++)
    {
        s.push_back(static_cast<int>(i));
    }

    const int class_id = static_cast<int>(G_DB.attr_name.size()) - 1;
    vector<int> arrt_list;
    for (int i = 0;i < class_id;i++)
    {
        arrt_list.push_back(i);
    }

    getAllTheAttribute(s,arrt_list,class_id);
    Generate_decision_tree(Root,s,arrt_list,class_id);
    char rules[1000] = "";
    generage_decision_rules(Root,rules,1);
}
// Program entry point: loads "data.txt" from the working directory and runs
// the ID3 pipeline on it.
int main()
{
    // readdata() takes a non-const char*, so hand it a writable buffer
    // rather than a string literal.
    char filename[] = "data.txt";
    readdata(filename);
    test();
    return 0;
}
/*
----------------------------------------------------------------------------------------
data.txt
由於數據文件以 Tab 鍵作爲分隔符,貼到這裏後 Tab 會變成空格,造成格式錯亂,代碼無法正常運行。
請自行到 http://download.csdn.net/detail/liema2000/4231529 (決策樹之ID3 算法源碼及數據文件)下載。
*/