MATLAB SVM parallel computing


Use parallel computing when running SVM ten-fold cross validation.
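The script below parallelizes the ten cross-validation folds with a parfor loop. It requires the Parallel Computing Toolbox (for matlabpool/parfor) and appears to rely on LIBSVM's MATLAB interface (svmtrain, svmpredict, plotroc) together with helper functions such as SVMcgForClass and scaleForSVM, presumably from Faruto's LIBSVM add-on toolbox.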


%  This script is for SVM training and testing
tic;
clc
clear
allData=dlmread('features_LDA_ratios.txt'); 
% allData=importdata('experiment1.mat');  % allData=dlmread('experiment1.mat'); 
% generate the class label for all samples
% in experiment 1, nonfraud firms are read first, then fraud firms
% note: 1 for fraud, -1 for nonfraud
numOfDocs = size(allData,1);

labels = [-1*ones(numOfDocs/2,1);ones(numOfDocs/2,1)]; % experiment 1, 1 for fraud, -1 for nonfraud

first_allData = [allData,labels];

%% read the random ordering of the samples
randomOrder = dlmread('order.txt'); 
first_allData = first_allData(randomOrder,:);
labels = first_allData(:,size(first_allData,2));
first_allData = first_allData(:,1:(size(first_allData,2)-1));
 
%% here is for feature selection with t-test






%% T test for feature selection
% [h_unpair11,p_unpair11] = ttest2(X_topic,Y_topic,0.35,'both','unequal'); % independent-sample t test
% [h_unpair22,p_unpair22] = ttest2(X_tfidf,Y_tfidf,0.35,'both','unequal'); % independent-sample t test

%% generate the new data input
% % data_emotion = first_allData(:,1:7);
% % data_topic = first_allData(:,8:107);
% % first_allData = [data_emotion(:,emotion_index),data_topic(:,topic_index)]; % build the new data input from the features that are significant in the t-test

%% read tfidf features

% tfidf_fold1 = dlmread('tfidf_1_fold1.txt'); 
% tfidf_fold2 = dlmread('tfidf_1_fold2.txt'); 
% tfidf_fold3 = dlmread('tfidf_1_fold3.txt'); 
% tfidf_fold4 = dlmread('tfidf_1_fold4.txt'); 
% tfidf_fold5 = dlmread('tfidf_1_fold5.txt'); 
% tfidf_fold6 = dlmread('tfidf_1_fold6.txt'); 
% tfidf_fold7 = dlmread('tfidf_1_fold7.txt'); 
% tfidf_fold8 = dlmread('tfidf_1_fold8.txt'); 
% tfidf_fold9 = dlmread('tfidf_1_fold9.txt'); 
% tfidf_fold10 = dlmread('tfidf_1_fold10.txt'); 

tfidf_fold1 = importdata('tfidf_1_fold1.mat'); 
tfidf_fold2 = importdata('tfidf_1_fold2.mat'); 
tfidf_fold3 = importdata('tfidf_1_fold3.mat'); 
tfidf_fold4 = importdata('tfidf_1_fold4.mat'); 
tfidf_fold5 = importdata('tfidf_1_fold5.mat'); 
tfidf_fold6 = importdata('tfidf_1_fold6.mat'); 
tfidf_fold7 = importdata('tfidf_1_fold7.mat'); 
tfidf_fold8 = importdata('tfidf_1_fold8.mat'); 
tfidf_fold9 = importdata('tfidf_1_fold9.mat'); 
tfidf_fold10 = importdata('tfidf_1_fold10.mat'); 

v = 10; % number of folds; v = 10 gives ten-fold cross validation
step=floor(size(allData,1)/v);
train_accuracy_ave=[];
train_precision_ave = [];
train_recall_ave = [];
train_F1_ave = [];
train_auc_ave = [];
test_accuracy_ave=[];
test_precision_ave=[];
test_recall_ave = [];
test_F1_ave =[];
test_auc_ave=[];

%% parallel computing setting
% Initialize the MATLAB parallel computing environment
CoreNum = 8;
if matlabpool('size') <= 0 % check whether a parallel pool is already open
    matlabpool('open','local',CoreNum); % if not, start one with CoreNum workers
else
    disp('Already initialized');
end
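
% Each fold is processed on a separate worker. The per-fold metric vectors
% (train_accuracy_ave, test_accuracy_ave, etc.) are grown by concatenation,
% which parfor handles as reduction variables. allData is assigned before it
% is read inside each iteration, so parfor treats it as a temporary variable
% local to each worker.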

parfor j =1:v
    if j~= v
        startpoint=(j-1)*step+1;
        endpoint=(j)*step;
    else
        startpoint=(j-1)*step+1;
        endpoint= numOfDocs;
    end
    cv_p=startpoint:endpoint; %%%% test set position
    
    tfidf_matrix = [];
    switch j
    case 1
        tfidf_matrix = tfidf_fold1;
    case 2
        tfidf_matrix = tfidf_fold2;
    case 3
        tfidf_matrix = tfidf_fold3;
    case 4
        tfidf_matrix = tfidf_fold4;
    case 5
        tfidf_matrix = tfidf_fold5;
    case 6
        tfidf_matrix = tfidf_fold6;
    case 7
        tfidf_matrix = tfidf_fold7;
    case 8
        tfidf_matrix = tfidf_fold8;
    case 9
        tfidf_matrix = tfidf_fold9;
    case 10
        tfidf_matrix = tfidf_fold10;
    otherwise
        disp('other value')
    end
    
    % generate all data
    allData = [first_allData,tfidf_matrix];    
    allData = [allData,labels]; 
    [numOfRows,numOfColumns]=size(allData);
    testData = allData(cv_p,:);
    trainData = allData;
    trainData(cv_p,:) = [];      % delete the test-fold rows from the training set
    %% split into features and labels
    train_features = trainData(:,1:numOfColumns-1);
    train_labels = trainData(:,numOfColumns);
    %% feature selection using T test in the training sample
    index_p = find(train_labels>0);
    index_n = find(train_labels<0);
    X = train_features(index_p,:);
    Y = train_features(index_n,:);
%     X_emotion = X(:,1:7);
%     Y_emotion = Y(:,1:7);
%     X_topic = X(:,8:107);
%     Y_topic = Y(:,8:107);
    X_tfidf = X(:,123:size(train_features,2));
    Y_tfidf = Y(:,123:size(train_features,2));

%     [h_unpair0,p_unpair0] = ttest2(X_emotion,Y_emotion,0.05,'both','unequal'); % independent-sample t test
%     [h_unpair1,p_unpair1] = ttest2(X_topic,Y_topic,0.05,'both','unequal'); % independent-sample t test
    [h_unpair2,p_unpair2] = ttest2(X_tfidf,Y_tfidf,0.05,'both','unequal'); % independent-sample t test
%     emotion_index = find(h_unpair0);
%     topic_index = find(h_unpair1);
    tfidf_index = find(h_unpair2);
    train_features = [train_features(:,1:122),train_features(:,tfidf_index+122)];
    
    
    test_features=testData(:,1:numOfColumns-1);
    test_features = [test_features(:,1:122),test_features(:,tfidf_index+122)];
    test_labels=testData(:,numOfColumns);
    %% scale features to [0,1]
    [train_final,test_final] = scaleForSVM(train_features,test_features,0,1);
   
    %% find best c and g by Grid search
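    % The arguments below (assuming the usual SVMcgForClass signature from
    % Faruto's LIBSVM toolbox: cmin, cmax, gmin, gmax, v, cstep, gstep, accstep)
    % search log2(c) and log2(g) over [-9,9] in steps of 0.5, using an inner
    % 5-fold cross validation; 4.5 is the accuracy step for the contour display.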
    [bestCVaccuracy,bestc,bestg] = SVMcgForClass(train_labels,train_final,-9,9,-9,9,5,0.5,0.5,4.5);
    
    %% find best c and g by genetic algorithm
%     ga_option.maxgen = 100;
%     ga_option.sizepop = 20; 
%     ga_option.ggap = 0.9;
%     ga_option.cbound = [0,100];
%     ga_option.gbound = [0,100];
%     ga_option.v = 5;
%     [bestacc,bestc,bestg] = gaSVMcgForClass(train_labels,train_final,ga_option);

    %% find best c and g by PSO
    % pso_option.c1 = 1.5;
    % pso_option.c2 = 1.7;
    % pso_option.maxgen = 100;
    % pso_option.sizepop = 20;
    % pso_option.k = 0.6;
    % pso_option.wV = 1;
    % pso_option.wP = 1;
    % pso_option.v = 3;
    % pso_option.popcmax = 100;
    % pso_option.popcmin = 0.1;
    % pso_option.popgmax = 100;
    % pso_option.popgmin = 0.1;
    % [bestacc,bestc,bestg] = psoSVMcgForClass(train_labels,train_final,pso_option)
    
    
    
    
    cmd = ['-c ',num2str(bestc),' -g ',num2str(bestg)];

  
    %% convert to sparse format, then train the SVM and predict
    train_final=sparse(train_final);
    test_final=sparse(test_final);
    model = svmtrain(train_labels, train_final,cmd);
    [ptrain_label, train_accuracy,~] = svmpredict(train_labels, train_final, model);
    AUC_train = plotroc(train_labels,train_final,model);
    train_auc_ave = [train_auc_ave,AUC_train];
%     The second output train_accuracy, is a vector including accuracy (for classification), mean
% squared error, and squared correlation coefficient (for regression)
% The third is a matrix containing decision values or probability
% estimates (if '-b 1' is specified). If k is the number of classes, for decision values, 
% each row includes results of predicting k(k-1)/2 binary-class SVMs. For probabilities, 
% each row contains k values indicating the probability that the testing instance is in
% each class. 
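    % With +/-1 labels, the sum and difference of true and predicted labels
    % pick out the confusion-matrix cells:
    %   true +1, predicted +1  ->  sum  =  2   (TP)
    %   true -1, predicted -1  ->  sum  = -2   (TN)
    %   true +1, predicted -1  ->  diff =  2   (FN)
    %   true -1, predicted +1  ->  diff = -2   (FP)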
    temp_plus_train = train_labels + ptrain_label;
    temp_minus_train = train_labels - ptrain_label;
    TP_train = length(find(temp_plus_train>=2));
    TN_train = length(find(temp_plus_train<=-2));
    FN_train = length(find(temp_minus_train>=2));
    FP_train = length(find(temp_minus_train<=-2));
    accuracy_temp_train = (TP_train+TN_train)/(TP_train+FN_train+FP_train+TN_train);
    recall_temp_train = TP_train/(TP_train+FN_train);
    precision_temp_train = TP_train/(TP_train+FP_train);
    f1_measure_temp_train = (2*precision_temp_train*recall_temp_train)/(precision_temp_train+recall_temp_train);
    
    train_accuracy_ave=[train_accuracy_ave,train_accuracy(1)];
    train_precision_ave = [train_precision_ave,precision_temp_train];
    train_recall_ave = [train_recall_ave,recall_temp_train];
    train_F1_ave = [train_F1_ave,f1_measure_temp_train];
    
    [ptest_label, test_accuracy,~] = svmpredict(test_labels, test_final, model);
    
    AUC_test = plotroc(test_labels,test_final,model);
    test_auc_ave = [test_auc_ave,AUC_test];
    
    temp_plus = test_labels + ptest_label;
    temp_minus = test_labels - ptest_label;
    TP = length(find(temp_plus>=2));
    TN = length(find(temp_plus<=-2));
    FN = length(find(temp_minus>=2));
    FP = length(find(temp_minus<=-2));
    accuracy_temp = (TP+TN)/(TP+FN+FP+TN);
    recall_temp = TP/(TP+FN);
    precision_temp = TP/(TP+FP);
    f1_measure_temp = (2*precision_temp*recall_temp)/(precision_temp+recall_temp);
    
     test_accuracy_ave=[test_accuracy_ave,test_accuracy(1)];
     test_precision_ave = [test_precision_ave,precision_temp];
     test_recall_ave = [test_recall_ave,recall_temp];
     test_F1_ave = [test_F1_ave,f1_measure_temp];

end
disp('all training accuracy.');
disp(train_accuracy_ave);
disp('average training accuracy.');
train_accuracy_ave=mean(train_accuracy_ave);
disp(train_accuracy_ave);
disp('all training precision.');
disp(train_precision_ave);
disp('average training precision.');
train_precision_ave=mean(train_precision_ave);
disp(train_precision_ave);
disp('all training recall.');
disp(train_recall_ave);
disp('average training recall.');
train_recall_ave=mean(train_recall_ave);
disp(train_recall_ave);
disp('all training F1 measure.');
disp(train_F1_ave);
disp('average training F1 measure.');
train_F1_ave=mean(train_F1_ave);
disp(train_F1_ave);
disp('all training AUC.');
disp(train_auc_ave);
disp('average training AUC.');
train_auc_ave=mean(train_auc_ave);
disp(train_auc_ave);


disp('all testing accuracy.');
disp(test_accuracy_ave);
disp('average testing accuracy.');
test_accuracy_ave=mean(test_accuracy_ave);
disp(test_accuracy_ave);
disp('all testing precision.');
disp(test_precision_ave);
disp('average testing precision.');
test_precision_ave=mean(test_precision_ave);
disp(test_precision_ave);
disp('all testing recall.');
disp(test_recall_ave);
disp('average testing recall.');
test_recall_ave=mean(test_recall_ave);
disp(test_recall_ave);
disp('all testing F1 measure.');
disp(test_F1_ave);
disp('average testing F1 measure.');
test_F1_ave=mean(test_F1_ave);
disp(test_F1_ave);
disp('all testing AUC.');
disp(test_auc_ave);
disp('average testing AUC.');
test_auc_ave=mean(test_auc_ave);
disp(test_auc_ave);

matlabpool close
toc;
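
Note: matlabpool was removed in MATLAB R2015a. On newer releases the same pool management can be done with parpool/gcp; a minimal sketch (the parfor loop itself is unchanged):

% open a local pool with CoreNum workers if none is running yet
pool = gcp('nocreate');      % returns the current pool, or [] if none exists
if isempty(pool)
    parpool('local', CoreNum);
else
    disp('Already initialized');
end

% ... parfor loop and metric aggregation as above ...

delete(gcp('nocreate'));     % replaces "matlabpool close"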


Reposted from blog.csdn.net/dongweionly/article/details/45130813