%% Improving Model Accuracy by Evaluating Architecture & Feature Choices
% Most of the predictive models discussed here have numerous parameters
% that can be tuned to improve the model's performance on a particular
% problem. Selecting the right features is also a very important step in
% the machine learning workflow. This script explores methods for making
% these decisions.

% Copyright 2015 The MathWorks, Inc.

%% Start with the Bagged Decision Tree model
% Import data
faultData = importFaultData('faultData.xlsx');
faultData.Fault = categorical(faultData.Fault);
names = faultData.Properties.VariableNames;

% Filter out the 'lost' and 'moved' fault classes
faultData(faultData.Fault == 'lost',:) = [];
faultData(faultData.Fault == 'moved',:) = [];
faultData.Fault = removecats(faultData.Fault);

% Extract the predictor matrix and the response vector
X = table2array(faultData(:,1:end-1));
Y = faultData.Fault;

% Train a bag of 100 classification trees with out-of-bag
% variable importance enabled
retrain = true;
opts = statset('UseParallel', true);
tbL = TreeBagger(100,X,Y,'method','classification',...
    'Options',opts,'OOBVarImp','on');
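%% Check Out-of-Bag Predictions
% As a quick sanity check (an illustrative sketch, not part of the original
% workflow), the out-of-bag predictions can be compared against the true
% labels with a confusion matrix; predOOB and confOOB are names introduced
% here for illustration.
predOOB = categorical(oobPredict(tbL));      % out-of-bag predicted labels
[confOOB, classOrder] = confusionmat(Y, predOOB);
disp(classOrder.');                          % class order of the rows/columns
disp(confOOB);                               % rows: true class, columns: predicted class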
%% Estimating a Good Ensemble Size for Bagged Classification Trees
% Because each tree in the bag is trained on a subset of the training data,
% the out-of-bag observations can be used to estimate the out-of-sample
% misclassification rate of the ensemble. Examining how the out-of-bag
% error evolves as trees are added gives insight into choosing a good
% ensemble size.
clf
plot(oobError(tbL)); grid on;
xlabel('Number of Grown Trees');
ylabel('Out-of-Bag Classification Error/Misclassification Probability');
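%% Choosing a Smaller Ensemble
% A minimal sketch (the 0.01 tolerance is an assumption, not part of the
% original script): keep the smallest number of trees whose cumulative
% out-of-bag error is within that tolerance of the full 100-tree ensemble.
errOOB = oobError(tbL);                              % cumulative OOB error vs. number of trees
nTrees = find(errOOB <= errOOB(end) + 0.01, 1, 'first');
fprintf('%d trees reach within 0.01 of the full ensemble''s OOB error.\n', nTrees);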
%% Estimating Feature Importance
% Feature importance measures the increase in prediction error when the
% values of a feature are permuted across the out-of-bag observations.
figure;
[~,idxvarimp] = sort(tbL.OOBPermutedVarDeltaError, 'ascend');
barh(tbL.OOBPermutedVarDeltaError(idxvarimp));
title('Out-Of-Bag Feature Importance');
xlabel('Relative Feature Importance Score');
set(gca,'YTick',1:length(names)-1,'YTickLabel', names(idxvarimp), 'XGrid', 'on');
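%% Retraining on the Most Important Features
% A minimal sketch, assuming we keep only the five highest-ranked features
% (an arbitrary cutoff) and compare the out-of-bag error of the reduced
% ensemble against the full one.
[~,idxDesc] = sort(tbL.OOBPermutedVarDeltaError, 'descend');
topIdx = idxDesc(1:min(5, numel(idxDesc)));          % columns of the top-ranked features
tbTop = TreeBagger(100, X(:,topIdx), Y, 'method', 'classification', ...
    'Options', opts, 'OOBVarImp', 'on');
fprintf('OOB error -- all features: %.3f, top features only: %.3f\n', ...
    oobError(tbL, 'mode', 'ensemble'), oobError(tbTop, 'mode', 'ensemble'));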
%% Sequential Feature Selection
% Feature selection reduces the dimensionality of the data by selecting
% only a subset of the measured features (predictor variables) to create a
% model. The selection criterion is a specific measure of predictive error
% that is minimized over models fit to different feature subsets. In this
% case we look for the feature group that minimizes the sum of the false
% positive and false negative rates.
%
% Sequential feature selection can be computationally intensive, and it
% benefits significantly from parallel computing.
gcp; % open a parallel pool of workers if one is not already running

opts = statset('display','iter','UseParallel',true,'TolFun',1e-2);

% featureTest is the criterion function that sequentialfs minimizes; by
% default sequentialfs evaluates it with 10-fold cross-validation.
[fs,~] = sequentialfs(@featureTest,X,Y,...
    'options',opts);

% Display the selected feature mask and the corresponding feature names
disp(fs)
disp(names(fs).')
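%% Criterion Function for sequentialfs (illustrative sketch)
% featureTest is defined in its own file and is not shown in this script.
% The local function below is only a minimal sketch of a compatible
% criterion, assuming a hypothetical 'normal' class acts as the negative
% class: it fits a single classification tree on each training fold and
% returns the number of false positives plus false negatives on the test
% fold, which sequentialfs accumulates across folds. (Local functions in
% scripts require R2016b or later; the sketch can also be saved in its own file.)
function crit = featureTestSketch(Xtrain, ytrain, Xtest, ytest)
    mdl   = fitctree(Xtrain, ytrain);        % per-fold classification tree
    yhat  = predict(mdl, Xtest);             % predicted labels for the test fold
    isNeg = ytest == 'normal';               % assumed negative ('no fault') class
    fp    = sum( isNeg & yhat ~= 'normal');  % negatives predicted as faults
    fn    = sum(~isNeg & yhat == 'normal');  % faults predicted as normal
    crit  = fp + fn;                         % count that sequentialfs minimizes
end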