數據挖掘 —— 課程小項目(MATLAB實現)

Data Preprocessing

clc; clear all;
%% Data Extraction
% Read Sheet1 of the diagnostic workbook as a raw cell array and convert
% everything from spreadsheet row 4 onwards into the numeric matrix rawdata2
% (row 4 of the sheet becomes row 1 of rawdata2).
readFilename  = 'Diagnostic Data of Tuberculous Pleural Effusion.xlsx';
writeFilename = 'Data Mining Results of Diagnostic Data of Tuberculous Pleural Effusion.xlsx';
[~, ~, rawdata] = xlsread(readFilename, 'Sheet1');

skippedRows  = 3;                         % rows 1-3 of the sheet are not converted
cellwiseRows = [99, 107, 111, 113, 124];  % sheet rows converted one cell at a time
% NOTE(review): presumably these rows hold cells that cell2mat cannot
% concatenate as a whole row -- verify against the spreadsheet.

rawdata2 = [];
for r = skippedRows + 1 : size(rawdata, 1)
    if ismember(r, cellwiseRows)
        % Convert this row cell by cell.
        for c = 1 : size(rawdata, 2)
            rawdata2(r - skippedRows, c) = cell2mat(rawdata(r, c));
        end
    else
        % Convert the whole row in one call.
        rawdata2(r - skippedRows, :) = cell2mat(rawdata(r, :));
    end
end

% Keep an unmodified copy; the gender column is restored from it later.
rawdata3 = rawdata2;

 %% Data De-Noising
% Three-sigma rule: for every column, compute the mean and standard deviation
% while ignoring NaN, then replace any value further than three standard
% deviations from the column mean with NaN so it is treated as missing later.
%
% nanstd takes (X, flag, dim). The flag (0 = normalise by N-1, 1 = normalise
% by N) must be given explicitly before the dimension; passing a lone 1 would
% be read as the flag, not as "along dimension 1".
%
% NOTE(review): apart from column 3, every column is processed, including any
% two-valued columns and the last (class-label) column -- confirm intended.
meandata = nanmean(rawdata2, 1);
stddata  = nanstd(rawdata2, 0, 1);
for j = 1 : size(rawdata2, 2)
    if (j == 3)
        continue;% the diabetes data is not suitable for Data De-Noising
    end
    upperLimit = meandata(j) + 3 * stddata(j);
    lowerLimit = meandata(j) - 3 * stddata(j);
    % Comparisons with NaN are false, so already-missing entries stay NaN.
    isOutlier = rawdata2(:, j) > upperLimit | rawdata2(:, j) < lowerLimit;
    rawdata2(isOutlier, j) = NaN;% the outliers are replaced by NaN
end

 %% Average Interpolation
% Fill the NaN entries of rawdata2 column by column:
%   column 3 (diabetes) -> constant 2
%   column 5 (gender)   -> left untouched in this section
%   any other column    -> NaN-ignoring mean of that column
meandata = nanmean(rawdata2, 1);

% Per-class column means (class taken from the last column). They are only
% needed by the disabled class-conditional variant described below.
classColumn = rawdata2(:, end);
meandata1 = nanmean(rawdata2(classColumn == 1, :), 1);% mean value of class 1
meandata2 = nanmean(rawdata2(classColumn == 2, :), 1);% mean value of class 2

for col = 1 : size(rawdata2, 2)
    isMissing = isnan(rawdata2(:, col));
    switch col
        case 3
            rawdata2(isMissing, col) = 2;% set NaN data of diabetes column to 2
        case 5
            % skip gender data
        otherwise
            rawdata2(isMissing, col) = meandata(col);% overall column mean
            % Disabled alternative: fill each missing entry with the mean of
            % the samples that share the row's class instead:
            %     for i = 1 : size(rawdata2, 1)
            %         if (isnan(rawdata2(i, col)))
            %             if (rawdata2(i, end) == 1)
            %                 rawdata2(i, col) = meandata1(col);
            %             else
            %                 rawdata2(i, col) = meandata2(col);
            %             end
            %         end
            %     end
    end
end

 %% using SVM to fill up NaN data of gender column
% Restore the gender column (column 5) from the backup taken before
% de-noising, then predict its missing entries with an SVM trained on the
% rows whose gender is known. All other columns are used as predictors.
rawdata2(:, 5) = rawdata3(:, 5);% recover gender data to its original state
genderMissing = isnan(rawdata2(:, 5));

% Only train and classify when there is something to impute; otherwise the
% classifier would be called with an empty test set.
if any(genderMissing)
    gendertestdata  = rawdata2(genderMissing, :);
    gendertraindata = rawdata2(~genderMissing, :);

    % Predictors: every column except gender. Target: the gender column.
    genderXtr = gendertraindata;
    genderXtr(:, 5) = [];
    genderYtr = gendertraindata(:, 5);
    genderXte = gendertestdata;
    genderXte(:, 5) = [];

    svmStruct = svmtrain(genderXtr, genderYtr);
    genderC = svmclassify(svmStruct, genderXte);
    rawdata2(genderMissing, 5) = genderC;
end

 %% Normalization
% Standardise every column except the last one with zscore.
% Disabled alternative (min-max normalisation via mapminmax, which works on
% rows, hence the transposes):
%     rawdata2 = rawdata2';
%     rawdata2(1:end-1, :) = mapminmax(rawdata2(1:end-1, :));
%     rawdata2 = rawdata2';
featureCols = 1 : size(rawdata2, 2) - 1;
rawdata2(:, featureCols) = zscore(rawdata2(:, featureCols));% z-score normalization

 %% Save Processed Data
% Store the processed matrix in ProcessedData2.mat for the cross-validation script.
save('ProcessedData2', 'rawdata2');

Cross Validation

clc; clear all; tic;

% Load the preprocessed matrix rawdata2 (last column = class label).
% Disabled alternative input:
%     load dataDE_x; load data_y; rawdata2 = [dataDE_x, data_y];
load('ProcessedData2');

% Recode class 2 as -1 so the labels are {1, -1}.
isClassTwo = (rawdata2(:, end) == 2);
rawdata2(isClassTwo, end) = -1;

 %% Ten Experiments
% One row per experiment: [accuracy, error rate, sensitivity, specificity, precision].
tenExperimentResult = zeros(10, 5);

% SVM: selected g and c values, one row per experiment, one column per fold.
TETFg = zeros(10, 10);
TETFc = zeros(10, 10);

% Global result arrays: experiments x folds x iterations, and
% experiments x folds x features.
global TETFcg_curves;
TETFcg_curves = zeros(10, 10, 1000);
global TETFfs;
TETFfs = zeros(10, 10, size(rawdata2, 2) - 1);
% Ten independent repetitions of 10-fold cross-validation, run in parallel.
% Per fold: split the data, let DE4PS search the SVM parameters (g, c) and a
% feature mask on an inner 80/20 split of the training part, train LIBSVM on
% the full training part with the selected features, and score the test part.
parfor experimentCnt = 1 : 10
    display(['----------------','Round',num2str(experimentCnt),'--------------------']);
    foldNum = 10;
    accuracy    = zeros(1, foldNum);
    errorrate   = zeros(1, foldNum);
    sensitivity = zeros(1, foldNum);
    specificity = zeros(1, foldNum);
    precision   = zeros(1, foldNum);
    % SVM: parameters chosen in each fold
    gArray = zeros(1, foldNum);
    cArray = zeros(1, foldNum);

    %% Ten Fold Cross Validation
    [m, n] = size(rawdata2);
    MaxIter = 1000;
    indices = crossvalind('Kfold', m, foldNum);
    FSindexArr = zeros(foldNum, n-1);     % selected-feature mask per fold
    cg_curves = zeros(foldNum, MaxIter);  % curve returned by DE4PS per fold
    for i = 1 : foldNum
        display(['----------------','Fold',num2str(i),'--------------------']);
        testindex = (indices == i);
        testdata  = rawdata2(testindex, :);
        traindata = rawdata2(~testindex, :);

        %% Separate the X part and the Y part
        Xte = testdata(:, 1:end-1);     Yte = testdata(:, end);
        Xtr = traindata(:, 1:end-1);    Ytr = traindata(:, end);

        %% Feature Selection and Parameter Selection
        % DE Search on an inner hold-out: 1 of 5 parts of the training data
        % is used for evaluation, the other 4 for fitting (80%-20%).
        PSindices = crossvalind('Kfold', size(traindata, 1), 5);
        PStestindex = (PSindices == 1);
        PStestdata  = traindata(PStestindex, :);
        PStraindata = traindata(~PStestindex, :);
        PSXte = PStestdata(:, 1:end-1);     PSYte = PStestdata(:, end);
        PSXtr = PStraindata(:, 1:end-1);    PSYtr = PStraindata(:, end);

        % SVM search space: x(1:2) are the two SVM parameters, bounded by
        % [2^-10, 2^10]; x(3:end) holds one value in [-1, 1] per feature.
        dim = 2 + size(Xtr, 2);
        lb = zeros(1, dim);     ub = zeros(1, dim);
        lb(1:2) = 2^(-10);      ub(1:2) = 2^(10);
        lb(3:end) = -1;         ub(3:end) = 1;
        popSize = 30;  % first argument of DE4PS (kept distinct from the negative count below)
        fobj = @LIBSVM4FSPS;
        [x, cg_curve] = DE4PS(popSize, MaxIter, lb, ub, dim, fobj, PSXtr, PSYtr, PSXte, PSYte);

        bestg = x(1);
        bestc = x(2);
        FSindex = x(3:end) > 0;  % a positive value keeps the feature
        Xtr = Xtr(:, FSindex);
        Xte = Xte(:, FSindex);
        FSindexArr(i, :) = FSindex;
        cg_curves(i, :) = cg_curve;

        %% Test
        % SVM (fixed-parameter variant kept for reference):
        %model = libsvmtrain(Ytr, Xtr, '-c 8.0 -g 0.008');
        model = libsvmtrain(Ytr, Xtr, ['-g ',num2str(bestg),' -c ',num2str(bestc),' -q']);
        [C, ~, ~] = svmpredict(Yte, Xte, model);

        % Confusion matrix with an explicit class order. By default
        % confusionmat sorts numeric labels ascending, which would put -1 in
        % row/column 1. Fixing the order to [1; -1] makes row/column 1 the
        % +1 class and always yields a 2x2 matrix, even if a class is absent
        % from this fold's labels and predictions.
        % NOTE(review): this treats +1 (original class 1) as the positive
        % class -- confirm against the study definition.
        conMat = confusionmat(Yte, C, 'order', [1; -1]);

        %% Evaluation of Classification Algorithms
        TP = conMat(1,1);   FN = conMat(1,2);
        FP = conMat(2,1);   TN = conMat(2,2);
        numPos = TP + FN;   % actual positives
        numNeg = FP + TN;   % actual negatives
        accuracy(i)    = (TP + TN) / (numPos + numNeg);
        errorrate(i)   = (FP + FN) / (numPos + numNeg);
        sensitivity(i) = TP / numPos;
        specificity(i) = TN / numNeg;
        precision(i)   = TP / (TP + FP);

        %% Save Parameters
        % SVM
        gArray(i) = bestg;
        cArray(i) = bestc;
    end

    %% Save Parameters
    TETFg(experimentCnt, :) = gArray;
    TETFc(experimentCnt, :) = cArray;
    % NOTE(review): parsave is defined elsewhere; if it writes to global
    % variables, those writes happen on the parfor workers and would not
    % reach the client workspace -- verify.
    parsave(cg_curves, FSindexArr, experimentCnt);
    tenExperimentResult(experimentCnt, :) = [mean(accuracy), mean(errorrate), mean(sensitivity), mean(specificity), mean(precision)];
end

 %% Save Results
% Echo the per-experiment metrics and their column means to the command
% window (no trailing semicolon on purpose).
tenExperimentResult
meanTER = mean(tenExperimentResult, 1)

% Persist the results to a MAT-file.
save('CrossValidationResults2SVM', 'tenExperimentResult', 'meanTER', 'TETFg', 'TETFc', 'TETFcg_curves', 'TETFfs');

% Export the metrics and the selected parameters ([g columns, c columns])
% to separate sheets of the results workbook.
writeFilename = 'Data Mining Results of Diagnostic Data of Tuberculous Pleural Effusion2 SVM.xlsx';
xlswrite(writeFilename, tenExperimentResult, 'tenExperimentResult');
TETFps = [TETFg, TETFc];
xlswrite(writeFilename, TETFps, 'TETFps');

toc;

DE (Differential Evolution) for Parameter Selection and Feature Selection

This is the DE4PS function called in the cross-validation script above.

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章