%% Improving Model Accuracy by Evaluating Architecture & Feature Choices
% Most of the predictive models discussed here have numerous parameters
% that can be tuned to improve performance on a particular problem.
% Selecting the right features is also a very important step in the
% machine learning workflow. This script explores methods for making
% these decisions.

% Copyright 2015 The MathWorks, Inc.
%% Start with the Bagged Decision Tree Model

% Import data
faultData = importFaultData('faultData.xlsx');
faultData.Fault = categorical(faultData.Fault);
names = faultData.Properties.VariableNames;
% Filter data: remove observations labeled 'lost' or 'moved'
faultData(faultData.Fault == 'lost',:) = [];
faultData(faultData.Fault == 'moved',:) = [];
faultData.Fault = removecats(faultData.Fault);
% Extract the predictor matrix and response, then train a bagged ensemble
% of 100 classification trees with out-of-bag variable importance enabled
X = table2array(faultData(:,1:end-1));
Y = faultData.Fault;
retrain = true;

opts = statset('UseParallel',true);
tbL = TreeBagger(100,X,Y,'Method','classification',...
    'Options',opts,'OOBVarImp','on');
%% Estimating a Good Ensemble Size for Bagged Classification Trees
% Because each tree in the bag is trained on a bootstrap sample of the
% training data, the out-of-bag observations can be used to estimate the
% out-of-sample misclassification rate of the ensemble. Examining how the
% out-of-bag error changes as trees are added gives insight into choosing
% a good ensemble size.

clf
plot(oobError(tbL)); grid on;
xlabel('Number of Grown Trees');
ylabel('Out-of-Bag Classification Error (Misclassification Probability)');
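% The out-of-bag error curve typically levels off well before all 100 trees
% are grown. As a rough heuristic (an illustrative assumption, not part of
% the original workflow), one could pick the smallest ensemble whose
% out-of-bag error is within a small tolerance of the full ensemble's error:
oobErr = oobError(tbL);                           % cumulative OOB error for 1..100 trees
errTol = 0.005;                                   % hypothetical tolerance
nTrees = find(oobErr <= oobErr(end) + errTol, 1); % first ensemble size within tolerance
fprintf('Roughly %d trees appear to be sufficient.\n', nTrees);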
%% Estimating Feature Importance
% Feature importance measures the increase in prediction error when the
% values of a given feature are permuted across the out-of-bag
% observations.

figure;
[~,idxvarimp] = sort(tbL.OOBPermutedVarDeltaError,'ascend');
barh(tbL.OOBPermutedVarDeltaError(idxvarimp));
title('Out-of-Bag Feature Importance');
xlabel('Relative Feature Importance Score');
set(gca,'YTick',1:length(names)-1,'YTickLabel',names(idxvarimp),'XGrid','on');
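% A possible follow-up (hypothetical, not part of the original script): keep
% only the features whose permutation-based importance exceeds a chosen
% cutoff. The cutoff value below is an illustrative assumption.
impCutoff = 0.5;                                  % hypothetical importance cutoff
predictorNames = names(1:end-1);                  % predictor columns only
keepIdx = tbL.OOBPermutedVarDeltaError > impCutoff;
disp(predictorNames(keepIdx).')                   % names of features above the cutoff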
%% Sequential Feature Selection
% Feature selection reduces the dimensionality of the data by selecting
% only a subset of the measured features (predictor variables) to create a
% model. The selection criterion is the minimization of a chosen measure
% of predictive error across models fit to different feature subsets. In
% this case we look for the feature group that minimizes the sum of the
% false positive and false negative rates.
%
% Sequential feature selection can be computationally intensive. It can
% benefit significantly from parallel computing.
gcp; % open a pool of workers

opts = statset('Display','iter','UseParallel',true,'TolFun',1e-2);

% The 'keepin' option can force specific columns to always be included, and
% a cvpartition object (for example 3-fold cross-validation) can be supplied
% via the 'cv' option; by default sequentialfs uses 10-fold cross-validation.
fs = sequentialfs(@featureTest,X,Y,...
    'options',opts);
% Display
disp(fs)
disp(names(fs).')
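% The retrain flag defined earlier suggests rebuilding the model with only
% the selected features. The block below is a hedged sketch of that step,
% assuming fs is a logical index into the columns of X; it is not part of
% the original script.
if retrain
    tbSelected = TreeBagger(100,X(:,fs),Y,'Method','classification',...
        'Options',opts,'OOBVarImp','on');
    fprintf('OOB error with selected features: %.4f\n',...
        oobError(tbSelected,'Mode','ensemble'));
end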
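% featureTest is the criterion function passed to sequentialfs; it is assumed
% to exist on the path (e.g. as featureTest.m) and is not defined in the
% original script. The sketch below is one plausible implementation of the
% criterion described above (misclassification count, i.e. false positives
% plus false negatives, which sequentialfs averages over its cross-validation
% folds). In R2016b or later it can live at the end of this script as a local
% function; otherwise save it in its own file.
function crit = featureTest(Xtrain,Ytrain,Xtest,Ytest)
    % Fit a simple classification tree to the candidate feature subset
    mdl = fitctree(Xtrain,Ytrain);
    % Criterion: number of misclassified test observations
    crit = sum(predict(mdl,Xtest) ~= Ytest);
end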