Data Preprocessing
% Preprocessing script.
% Reads the raw diagnostic spreadsheet, removes 3-sigma outliers, fills in
% missing values (column mean; fixed value for the diabetes column; SVM
% prediction for the gender column), z-scores every column except the last
% one (the class label) and saves the matrix rawdata2 to ProcessedData2.mat.
clc;clear all;
%% Data Extraction
readFilename='Diagnostic Data of Tuberculous Pleural Effusion.xlsx';
writeFilename='Data Mining Results of Diagnostic Data of Tuberculous Pleural Effusion.xlsx';% not used further in this script
[~,~,rawdata] = xlsread(readFilename,'Sheet1');

firstDataRow = 4;% spreadsheet rows 1-3 are skipped (presumably header rows - confirm against the sheet)
% Rows converted one cell at a time instead of with a single row-wise cell2mat.
% NOTE(review): presumably these rows hold cells of mixed type - confirm.
perCellRows = [99 107 111 113 124];

% Preallocate instead of growing the matrix row by row.
numDataRows = max(size(rawdata, 1) - (firstDataRow - 1), 0);
rawdata2 = NaN(numDataRows, size(rawdata, 2));
for i = firstDataRow : size(rawdata, 1)
    outRow = i - (firstDataRow - 1);
    if ismember(i, perCellRows)
        for j = 1 : size(rawdata, 2)
            rawdata2(outRow, j) = cell2mat(rawdata(i, j));
        end
    else
        rawdata2(outRow, :) = cell2mat(rawdata(i, :));
    end
end
rawdata3 = rawdata2;% data backup (values before de-noising / imputation)

%% Data De-Noising
% Values further than three standard deviations from the column mean are
% treated as abnormal and replaced by NaN.
meandata = nanmean(rawdata2, 1);
% nanstd takes (X, flag, dim): flag 0 = sample standard deviation (N-1),
% dim 1 = per column. Passing only "1" would be read as the flag, not the
% dimension, and silently switch to the N-normalised deviation.
stddata = nanstd(rawdata2, 0, 1);
for j = 1 : size(rawdata2, 2)
    if (j == 3)
        continue;% the diabetes data is not suitable for Data De-Noising
    end
    column = rawdata2(:, j);
    upperLimit = meandata(j) + 3*stddata(j);
    lowerLimit = meandata(j) - 3*stddata(j);
    rawdata2(column > upperLimit | column < lowerLimit, j) = NaN;% the outliers are replaced by NaN
end

%% Average Interpolation
meandata = nanmean(rawdata2, 1);% column means recomputed without the outliers
% Class-wise means; only needed by the commented-out class-wise alternative below.
meandata1 = nanmean(rawdata2(rawdata2(:, end) == 1, :), 1);% mean value of class 1
meandata2 = nanmean(rawdata2(rawdata2(:, end) == 2, :), 1);% mean value of class 2
for j = 1 : size(rawdata2, 2)
    if (j == 3)
        rawdata2(isnan(rawdata2(:,j)), j) = 2;% set NaN data of diabetes column to 2
        continue;
    elseif (j == 5)
        continue;% skip gender data (filled by the SVM step below)
    end
    rawdata2(isnan(rawdata2(:,j)), j) = meandata(j);% using mean value of each attribute to fill up vacancies
    % Alternative: fill with the mean of the samples of the same class.
    % for i = 1 : size(rawdata2, 1)
    %     if (isnan(rawdata2(i, j)))
    %         if (rawdata2(i, end) == 1)
    %             rawdata2(i, j) = meandata1(j);
    %         else
    %             rawdata2(i, j) = meandata2(j);
    %         end
    %     end
    % end
end

%% using SVM to fill up NaN data of gender column
rawdata2(:, 5) = rawdata3(:, 5);% restore the gender column to its state before de-noising / interpolation
genderMissing = isnan(rawdata2(:, 5));
% Only train and classify when there is something to fill in; classifying an
% empty test matrix is pointless and can raise an error.
if any(genderMissing)
    % Predictors are all columns except gender.
    % NOTE(review): this includes the last (class label) column - confirm that is intended.
    genderXtr = rawdata2(~genderMissing, :);
    genderYtr = genderXtr(:, 5);
    genderXtr(:, 5) = [];
    genderXte = rawdata2(genderMissing, :);
    genderXte(:, 5) = [];
    % NOTE(review): the (X, Y) argument order matches the Statistics Toolbox
    % svmtrain/svmclassify pair, not LIBSVM's svmtrain - confirm which one is on the path.
    svmStruct = svmtrain(genderXtr,genderYtr);
    genderC = svmclassify(svmStruct,genderXte);
    rawdata2(genderMissing, 5) = genderC;
end

%% Normalization
% rawdata2 = rawdata2';% min - max normalization
% rawdata2(1:end-1, :) = mapminmax(rawdata2(1:end-1, :));
% rawdata2 = rawdata2';
rawdata2(:, 1:end-1) = zscore(rawdata2(:, 1:end-1));% z-score normalization of every column except the class label

%% Save Processed Data
save ProcessedData2 rawdata2;
Cross Validation
% Cross-validation script.
% Loads rawdata2 from ProcessedData2.mat (last column = class label) and runs
% ten repetitions of ten-fold cross validation. In every fold DE4PS searches
% the SVM parameters (g, c) and a feature mask on an inner 80%/20% split of the
% training data; a LIBSVM model is then trained on the whole training fold and
% evaluated on the test fold. Results are saved to a .mat file and to Excel.
clc;clear all;tic;
load ProcessedData2;%load dataDE_x;load data_y;rawdata2 = [dataDE_x, data_y];
rawdata2(rawdata2(:, end)==2, end) = -1;% recode class 2 as -1, so the labels are {1, -1}

%% Ten Experiments
numExperiments = 10;% repetitions of the whole cross validation
foldNum = 10;% folds per repetition
MaxIter = 1000;% DE iterations; also the length of each convergence curve
tenExperimentResult = zeros(numExperiments, 5);% columns: accuracy, error rate, sensitivity, specificity, precision
% SVM
TETFg = zeros(numExperiments, foldNum);TETFc = zeros(numExperiments, foldNum);
% NOTE(review): these two globals are initialised to zeros here and never
% assigned in this script; presumably parsave fills them. Verify that this
% still works when parfor runs on pool workers, whose global workspace is
% separate from the client's - otherwise zeros are saved below.
global TETFcg_curves;TETFcg_curves = zeros(numExperiments, foldNum, MaxIter);global TETFfs;TETFfs = zeros(numExperiments, foldNum, size(rawdata2, 2)-1);
parfor experimentCnt = 1 : numExperiments
    display(['----------------','Round',num2str(experimentCnt),'--------------------']);
    accuracy=zeros(1,foldNum);errorrate=zeros(1,foldNum);sensitivity=zeros(1,foldNum);specificity=zeros(1,foldNum);precision=zeros(1,foldNum);
    % SVM
    gArray=zeros(1,foldNum);cArray=zeros(1,foldNum);
    %% Ten Fold Cross Validation
    [m, n] = size(rawdata2);
    indices = crossvalind('Kfold',m,foldNum); FSindexArr = zeros(foldNum, n-1); cg_curves = zeros(foldNum, MaxIter);
    for i = 1 : foldNum
        display(['----------------','Fold',num2str(i),'--------------------']);
        testindex = (indices==i); testdata = rawdata2(testindex, :);
        trainindex = ~testindex; traindata = rawdata2(trainindex, :);
        %% Separate the X part and the Y part
        Xte = testdata(:, 1:end-1); Yte = testdata(:, end);
        Xtr = traindata(:, 1:end-1); Ytr = traindata(:, end);
        %% Feature Selection and Parameter Selection
        % DE Search on an inner split of the training fold: part 1 of 5 is
        % held out for validation, the other four parts are used for fitting.
        PSindices = crossvalind('Kfold', size(traindata, 1), 5);% 80%-20%
        PStestindex = (PSindices==1); PStestdata = traindata(PStestindex, :);
        PStrainindex = ~PStestindex; PStraindata = traindata(PStrainindex, :);
        PSXte = PStestdata(:, 1:end-1); PSYte = PStestdata(:, end);
        PSXtr = PStraindata(:, 1:end-1); PSYtr = PStraindata(:, end);
        % SVM
        % Search vector: x(1) = g, x(2) = c, x(3:end) = one score per feature.
        dim = 2 + size(Xtr, 2);
        lb = zeros(1,dim); ub = zeros(1,dim);
        lb(1:2) = 2^(-10); ub(1:2) = 2^(10);
        lb(3:end) = -1; ub(3:end) = 1;
        popSize = 30;% first argument of DE4PS (presumably the population size - confirm against DE4PS)
        fobj=@LIBSVM4FSPS;
        [x,cg_curve]=DE4PS(popSize, MaxIter, lb, ub, dim, fobj, PSXtr, PSYtr, PSXte, PSYte);
        bestg = x(1); bestc = x(2);
        FSindex = x(3:end) > 0;% a feature is kept when its score is positive
        Xtr = Xtr(:, FSindex); Xte = Xte(:, FSindex);
        FSindexArr(i, :) = FSindex; cg_curves(i, :) = cg_curve;
        %% Test
        %SVM
        %model = libsvmtrain(Ytr, Xtr, '-c 8.0 -g 0.008');
        model = libsvmtrain(Ytr, Xtr, ['-g ',num2str(bestg),' -c ',num2str(bestc),' -q']);
        [C, accRate, decision_values] = svmpredict(Yte, Xte, model);
        % Fix the class order explicitly so the matrix is always 2x2, even when
        % a test fold (and its predictions) contains only one class; without
        % it confusionmat would return a 1x1 matrix and the indexing below
        % would fail. [-1 1] is the sorted order confusionmat uses by default.
        % Assumes every label is -1 or 1.
        conMat = confusionmat(Yte, C, 'Order', [-1 1]);% calculate the confusion matrix
        %% Evaluation of Classification Algorithms
        % Row/column 1 is label -1, so label -1 (original class 2) is the
        % "positive" class here. NOTE(review): confirm this is the intended positive class.
        TP = conMat(1,1);FP = conMat(2,1);TN = conMat(2,2);FN = conMat(1,2);
        numPos = TP + FN; numNeg = FP + TN;% actual positives / actual negatives
        accuracy(i) = (TP + TN) / (numPos + numNeg);errorrate(i) = (FP+FN)/(numPos+numNeg);sensitivity(i) = TP/numPos; specificity(i) = TN/numNeg; precision(i) = TP/(TP+FP);
        %% Save Parameters
        % SVM
        gArray(i) = bestg;cArray(i) = bestc;
    end
    %% Save Parameters
    TETFg(experimentCnt, :) = gArray; TETFc(experimentCnt, :) = cArray; parsave(cg_curves,FSindexArr,experimentCnt);
    tenExperimentResult(experimentCnt, :) = [mean(accuracy), mean(errorrate), mean(sensitivity), mean(specificity), mean(precision)];
end
%% Save Results
tenExperimentResult
meanTER = mean(tenExperimentResult, 1)
save CrossValidationResults2SVM tenExperimentResult meanTER TETFg TETFc TETFcg_curves TETFfs;
writeFilename='Data Mining Results of Diagnostic Data of Tuberculous Pleural Effusion2 SVM.xlsx';
xlswrite(writeFilename, tenExperimentResult, 'tenExperimentResult');TETFps = [TETFg, TETFc];xlswrite(writeFilename, TETFps, 'TETFps');toc;
DE (Differential Evolution) for Parameter Selection and Feature Selection
This is the DE4PS function called in the Cross Validation code above.