% Use parallel computing for SVM ten-fold cross validation
% This script trains and tests an SVM with ten-fold cross validation.
tic;
clc
clear
% Load the base feature matrix (one row per document: LDA topics + financial ratios).
allData=dlmread('features_LDA_ratios.txt');
% allData=importdata('experiment1.mat'); % allData=dlmread('experiment1.mat');
% generate the class label for all samples
% in experiment 1, nonfraud firm read first, then fraud firms
% note that 1 for fraud, -1 for nonfraud
numOfDocs = size(allData,1);
% NOTE(review): assumes an exact 50/50 split (first half nonfraud, second
% half fraud); numOfDocs/2 is non-integer for an odd row count -- confirm input.
labels = [-1*ones(numOfDocs/2,1);ones(numOfDocs/2,1)]; % experiment 1, 1 for fraud, -1 for nonfraud
first_allData = [allData,labels];
%% shuffle samples using a precomputed random permutation read from file
randomOrder = dlmread('order.txt');
first_allData = first_allData(randomOrder,:);
% Split the shuffled matrix back into features and labels (label is last column).
labels = first_allData(:,size(first_allData,2));
first_allData = first_allData(:,1:(size(first_allData,2)-1));
%% feature selection with t-test (disabled here; per-fold selection happens inside the CV loop)
%% T test for feature selection
% [h_unpair11,p_unpair11] = ttest2(X_topic,Y_topic,0.35,'both','unequal'); %independet sample t test
% [h_unpair22,p_unpair22] = ttest2(X_tfidf,Y_tfidf,0.35,'both','unequal'); %independet sample t test
%% generate the new data input
% % data_emotion = first_allData(:,1:7);
% % data_topic = first_allData(:,8:107);
% % first_allData = [data_emotion(:,emotion_index),data_topic(:,topic_index)]; % using features with significance in T test to generate the new data input
%% read tfidf features
% tfidf_fold1 = dlmread('tfidf_1_fold1.txt');
% tfidf_fold2 = dlmread('tfidf_1_fold2.txt');
% tfidf_fold3 = dlmread('tfidf_1_fold3.txt');
% tfidf_fold4 = dlmread('tfidf_1_fold4.txt');
% tfidf_fold5 = dlmread('tfidf_1_fold5.txt');
% tfidf_fold6 = dlmread('tfidf_1_fold6.txt');
% tfidf_fold7 = dlmread('tfidf_1_fold7.txt');
% tfidf_fold8 = dlmread('tfidf_1_fold8.txt');
% tfidf_fold9 = dlmread('tfidf_1_fold9.txt');
% tfidf_fold10 = dlmread('tfidf_1_fold10.txt');
% Load one tf-idf feature matrix per CV fold. Presumably each matrix was
% built from that fold's training split so test documents never influence
% the idf statistics -- TODO confirm against the preprocessing pipeline.
tfidf_fold1 = importdata('tfidf_1_fold1.mat');
tfidf_fold2 = importdata('tfidf_1_fold2.mat');
tfidf_fold3 = importdata('tfidf_1_fold3.mat');
tfidf_fold4 = importdata('tfidf_1_fold4.mat');
tfidf_fold5 = importdata('tfidf_1_fold5.mat');
tfidf_fold6 = importdata('tfidf_1_fold6.mat');
tfidf_fold7 = importdata('tfidf_1_fold7.mat');
tfidf_fold8 = importdata('tfidf_1_fold8.mat');
tfidf_fold9 = importdata('tfidf_1_fold9.mat');
tfidf_fold10 = importdata('tfidf_1_fold10.mat');
v=10;% If v=5,it means 5-fold cross validation.
step=floor(size(allData,1)/v); % fold size; the last fold absorbs any remainder rows
% Per-fold metric accumulators. Each is grown by concatenation inside the
% parfor loop (a valid parfor reduction), one entry per fold.
train_accuracy_ave=[];
train_precision_ave = [];
train_recall_ave = [];
train_F1_ave = [];
train_auc_ave = [];
test_accuracy_ave=[];
test_precision_ave=[];
test_recall_ave = [];
test_F1_ave =[];
test_auc_ave=[];
%% parallel computing setting
%Initialize Matlab Parallel Computing Enviornment
% NOTE(review): matlabpool was removed in MATLAB R2015a; on newer releases
% this must be replaced with parpool/gcp. Left as-is for the targeted release.
CoreNum = 8;
if matlabpool('size')<=0 % judge whether the parallel computing environment has already ready
matlabpool('open','local',CoreNum); % if not, start the parallel computing environment
else
disp('Already initialized');
end
% Ten-fold cross validation, one parfor iteration per fold. For fold j:
% pick the j-th tf-idf matrix, hold out rows cv_p as the test set,
% t-test-select tf-idf features on the TRAINING split only, scale both
% splits, grid-search (c,g), then train and evaluate the SVM.
parfor j =1:v
    % Test-set row range for this fold; the last fold takes the remainder.
    if j~= v
        startpoint=(j-1)*step+1;
        endpoint=(j)*step;
    else
        startpoint=(j-1)*step+1;
        endpoint= numOfDocs;
    end
    cv_p=startpoint:endpoint; % test set position
    % Select the per-fold tf-idf matrix. parfor cannot index
    % dynamically-named workspace variables, hence the explicit switch.
    tfidf_matrix = [];
    switch j
        case 1
            tfidf_matrix = tfidf_fold1;
        case 2
            tfidf_matrix = tfidf_fold2;
        case 3
            tfidf_matrix = tfidf_fold3;
        case 4
            tfidf_matrix = tfidf_fold4;
        case 5
            tfidf_matrix = tfidf_fold5;
        case 6
            tfidf_matrix = tfidf_fold6;
        case 7
            tfidf_matrix = tfidf_fold7;
        case 8
            tfidf_matrix = tfidf_fold8;
        case 9
            tfidf_matrix = tfidf_fold9;
        case 10
            tfidf_matrix = tfidf_fold10;
        otherwise
            disp('other value')
    end
    % Assemble [base features, tf-idf features, label] for this fold.
    allData = [first_allData,tfidf_matrix];
    allData = [allData,labels];
    numOfColumns = size(allData,2);
    testData=allData(cv_p,:);
    trainData=allData;
    trainData(cv_p,:)=[]; % idiomatic row deletion (original assigned '')
    %% split the training rows into features and labels
    train_features=trainData(:,1:numOfColumns-1);
    train_labels=trainData(:,numOfColumns);
    %% feature selection: independent-sample t-test on the training split only
    index_p = find(train_labels>0); % fraud rows (+1)
    index_n = find(train_labels<0); % nonfraud rows (-1)
    X = train_features(index_p,:);
    Y = train_features(index_n,:);
    % Columns 1:122 are the base (LDA/ratio) features and are always kept;
    % columns 123:end are tf-idf features subject to selection.
    X_tfidf = X(:,123:size(train_features,2));
    Y_tfidf = Y(:,123:size(train_features,2));
    [h_unpair2,~] = ttest2(X_tfidf,Y_tfidf,0.05,'both','unequal'); %independent-sample t-test, unequal variances
    tfidf_index = find(h_unpair2); % tf-idf columns significant at alpha = 0.05
    train_features = [train_features(:,1:122),train_features(:,tfidf_index+122)];
    test_features=testData(:,1:numOfColumns-1);
    test_features = [test_features(:,1:122),test_features(:,tfidf_index+122)];
    test_labels=testData(:,numOfColumns);
    %% scale both splits to [0,1]
    [train_final,test_final] = scaleForSVM(train_features,test_features,0,1);
    %% find best c and g by grid search (5-fold CV inside the training split)
    [bestCVaccuracy,bestc,bestg] = SVMcgForClass(train_labels,train_final,-9,9,-9,9,5,0.5,0.5,4.5);
    % Alternative (c,g) tuning (previously kept here commented out):
    % gaSVMcgForClass (genetic algorithm) or psoSVMcgForClass (PSO).
    cmd = ['-c ',num2str(bestc),' -g ',num2str(bestg)];
    %% train the final model and evaluate on both splits
    train_final=sparse(train_final);
    test_final=sparse(test_final);
    model = svmtrain(train_labels, train_final,cmd);
    % svmpredict's second output is [accuracy; MSE; r^2]; element 1 is kept.
    [ptrain_label, train_accuracy,~] = svmpredict(train_labels, train_final, model);
    AUC_train = plotroc(train_labels,train_final,model);
    train_auc_ave = [train_auc_ave,AUC_train];
    % Confusion counts from label sums/differences (labels are +/-1):
    % sum>=2  -> both +1            (TP)
    % sum<=-2 -> both -1            (TN)
    % diff>=2 -> true +1, pred -1   (FN)
    % diff<=-2-> true -1, pred +1   (FP)
    temp_plus_train = train_labels + ptrain_label;
    temp_minus_train = train_labels - ptrain_label;
    TP_train = length(find(temp_plus_train>=2));
    TN_train = length(find(temp_plus_train<=-2));
    FN_train = length(find(temp_minus_train>=2));
    FP_train = length(find(temp_minus_train<=-2));
    % NOTE(review): precision/recall/F1 become NaN when their denominator
    % is 0, which would propagate into the fold means; acceptable for the
    % balanced data this script targets.
    recall_temp_train = TP_train/(TP_train+FN_train);
    precision_temp_train = TP_train/(TP_train+FP_train);
    f1_measure_temp_train = (2*precision_temp_train*recall_temp_train)/(precision_temp_train+recall_temp_train);
    train_accuracy_ave=[train_accuracy_ave,train_accuracy(1)];
    train_precision_ave = [train_precision_ave,precision_temp_train];
    train_recall_ave = [train_recall_ave,recall_temp_train];
    train_F1_ave = [train_F1_ave,f1_measure_temp_train];
    % Same evaluation on the held-out test split.
    [ptest_label, test_accuracy,~] = svmpredict(test_labels, test_final, model);
    AUC_test = plotroc(test_labels,test_final,model);
    test_auc_ave = [test_auc_ave,AUC_test];
    temp_plus = test_labels + ptest_label;
    temp_minus = test_labels - ptest_label;
    TP = length(find(temp_plus>=2));
    TN = length(find(temp_plus<=-2));
    FN = length(find(temp_minus>=2));
    FP = length(find(temp_minus<=-2));
    recall_temp = TP/(TP+FN);
    precision_temp = TP/(TP+FP);
    f1_measure_temp = (2*precision_temp*recall_temp)/(precision_temp+recall_temp);
    test_accuracy_ave=[test_accuracy_ave,test_accuracy(1)];
    test_precision_ave = [test_precision_ave,precision_temp];
    test_recall_ave = [test_recall_ave,recall_temp];
    test_F1_ave = [test_F1_ave,f1_measure_temp];
end
% Report per-fold metrics, then overwrite each accumulator with its mean
% across folds (so each *_ave variable ends up a scalar average).
disp('all training accuracy.');
disp(train_accuracy_ave);
disp('average training accuracy.');
train_accuracy_ave=mean(train_accuracy_ave);
disp(train_accuracy_ave);
disp('all training precision.');
disp(train_precision_ave);
disp('average training precision.');
train_precision_ave=mean(train_precision_ave);
disp(train_precision_ave);
disp('all training recall.');
disp(train_recall_ave);
disp('average training recall.');
train_recall_ave=mean(train_recall_ave);
disp(train_recall_ave);
disp('all training F1 measure.');
disp(train_F1_ave);
disp('average training F1 measure.');
train_F1_ave=mean(train_F1_ave);
disp(train_F1_ave);
disp('all training AUC.');
disp(train_auc_ave);
disp('average training AUC.');
train_auc_ave=mean(train_auc_ave);
disp(train_auc_ave);
disp('all testing accuracy.');
disp(test_accuracy_ave);
disp('average testing accuracy.');
test_accuracy_ave=mean(test_accuracy_ave);
disp(test_accuracy_ave);
disp('all testing precision.');
disp(test_precision_ave);
disp('average testing precision.');
test_precision_ave=mean(test_precision_ave);
disp(test_precision_ave);
disp('all testing recall.');
disp(test_recall_ave);
disp('average testing recall.');
test_recall_ave=mean(test_recall_ave);
disp(test_recall_ave);
disp('all testing F1 measure.');
disp(test_F1_ave);
disp('average testing F1 measure.');
test_F1_ave=mean(test_F1_ave);
disp(test_F1_ave);
disp('all testing AUC.');
disp(test_auc_ave);
disp('average testing AUC.');
test_auc_ave=mean(test_auc_ave);
disp(test_auc_ave);
% Shut down the worker pool and report elapsed time.
matlabpool close
toc;