%% Predicting Fuel Economy
% In this demo, we use regression trees to predict the fuel economy of
% vehicles.
% ---------------------------------
%
% Copyright 2015 The MathWorks, Inc.

%% Loading Excel file representing Fuel Economy Data

CarData = importCarTable('2004dat.xlsx');

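% importCarTable is an import function generated for this demo (not shown
% here). A rough, hypothetical equivalent, assuming the data lives on the
% first sheet of the workbook:
%
% CarData = readtable('2004dat.xlsx');
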
%% Categorical Variables
% Many of the variables represent discrete items, or categories: a car or a
% truck, front-wheel or rear-wheel drive, etc. To conserve memory and
% handle these correctly, we'll convert them to |categorical| variables.

CarData.Car_Truck = categorical(CarData.Car_Truck);
CarData.Police = categorical(CarData.Police);
CarData.Transmission = categorical(CarData.Transmission);
CarData.Drive = categorical(CarData.Drive);
CarData.AC = categorical(CarData.AC);
CarData.City_Highway = categorical(CarData.City_Highway);

% Manufacturer and Car Line have too few samples per category (hundreds of
% unique values), and Year is the same for every row, so remove them.
CarData.MfrName = [];
CarData.CarLine = [];
CarData.Year = [];

% %% Test with 10% of the Data
% % Several of the techniques below use random numbers. We are going
% % to set the random number generator here to ensure repeatability.
% rng(5)

%% Partition Data for Cross Validation
% cvpartition helps us create a cross-validation partition for the data. We
% create a test set (10% of the data) and a training set (90% of the data).

% Build cross-validation partition (default holdout fraction is 10%)
c = cvpartition(height(CarData),'holdout');

% Extract data at indices
mdlTrain = CarData(training(c),:);
mdlTest = CarData(test(c),:);

% Extract predictors and response
X = CarData;
X.MPG = []; % Remove MPG
Y = CarData.MPG;

%% Multiple Linear Regression
% First, try multiple linear regression.

% Fit linear model
modelLR = fitlm(mdlTrain,'ResponseVar','MPG');
disp(modelLR)

% Make prediction on test set
yfitLR = predict(modelLR,mdlTest);

% Show Results
showFit(mdlTest.MPG, yfitLR)

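% showFit is a helper function provided with this demo (not shown here).
% A minimal, hypothetical sketch of what it might do, assuming it plots
% predicted vs. actual values and reports the RMSE:
%
% function showFit(yActual, yPredicted)
% % Scatter actual vs. predicted with a perfect-prediction reference line
% figure
% scatter(yActual, yPredicted, 'filled')
% hold on
% plot(xlim, xlim, 'r--')
% hold off
% xlabel('Actual MPG'), ylabel('Predicted MPG')
% title(sprintf('RMSE = %.2f', sqrt(mean((yPredicted - yActual).^2))))
% end
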
%% Stepwise Linear Regression
% Stepwise linear regression adds and removes terms one at a time, keeping
% the ones that reduce the error the most.
modelSW = stepwiselm(mdlTrain,'ResponseVar','MPG','Upper','linear');
disp(modelSW)
disp(mdlTrain.Properties.VariableNames(modelSW.VariableInfo.InModel).');

% Make prediction on test set
yfitSW = predict(modelSW,mdlTest);

% Show Results
showFit(mdlTest.MPG, yfitSW)

% %% Support Vector Regression
% % Support vector regression (fitrsvm) fits a model that ignores errors
% % within a small margin and penalizes larger deviations.
% modelsvm = fitrsvm(mdlTrain,'MPG');
% disp(modelsvm)
%
% % Make prediction on test set
% yfitsvm = predict(modelsvm,mdlTest);
%
% % Show Results
% showFit(mdlTest.MPG, yfitsvm)

%% Regression Trees: Train the Tree
% In many cases, the form of the relationship between predictors and a
% response is unknown. Decision trees offer a nonparametric alternative
% for regression.

t = fitrtree(mdlTrain,'MPG');
t2 = prune(t,'level',250);
% view(t2); % textual
view(t2,'mode','graph'); % as a tree

% Regression Trees: Evaluate the Tree
yfitT = predict(t, mdlTest);

% Show Results
showFit(mdlTest.MPG, yfitT)

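% The prune level above is chosen by hand. Optionally, a cross-validated
% choice could be made with cvloss (a minimal sketch, not run here):
%
% [~,~,~,bestLevel] = cvloss(t,'Subtrees','all','TreeSize','min');
% t2 = prune(t,'level',bestLevel);
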
%% Bagged Decision Trees
% Bagging stands for bootstrap aggregation. Every tree in the ensemble is
% grown on an independently drawn bootstrap sample of the input data. To
% compute a prediction for the ensemble of trees, fitensemble
% takes an average of the predictions from the individual trees. Ensemble
% techniques such as bagging combine many weak learners to produce a
% strong learner.
%
% To use default values:
%
% tbfit = fitensemble(Xtrain,Ytrain,'Bag',100,'tree','type','regression');
%
% To determine how many trees to use in your ensemble:
%
% treeLoss = oobLoss(tbfit,'mode','cumulative')
% plot(1:length(treeLoss),treeLoss)
%
% oobLoss (out-of-bag regression error) computes MSE versus the number of
% grown trees. You can use a similar technique to figure out the best
% minimum leaf size.

ttemp = templateTree('MinLeaf',1);
tbfit = fitensemble(mdlTrain,'MPG','Bag',100,ttemp,'type','regression');

% Predict
yfitTB = predict(tbfit,mdlTest);

% Show Results
showFit(mdlTest.MPG, yfitTB)

%% Bagged Decision Trees: Predictor Importance
% Predictor importance offers insight into the relative importance of each
% predictor in the model. It is calculated by summing the changes in
% mean squared error (MSE) due to splits on every predictor and
% dividing the sum by the number of branch nodes.

figure
pI = predictorImportance(tbfit);
barh(pI)
set(gca,'YTick',1:numel(X.Properties.VariableNames));
set(gca,'YTickLabel',strrep(X.Properties.VariableNames,'_','\_'));
xlabel('Predictor Importance')

%% Sequential Feature Selection
% Sequential feature selection selects a subset of features from the data
% matrix X that best predict the data in Y by sequentially adding
% features until there is no further improvement in prediction. We use
% 3-fold cross-validation so we can use the whole data set here without
% having to split it into a training set and a test set. Normally, you
% may want to use a higher number of folds, but we are keeping it small
% for demo purposes. We can then see which features to keep in our model.

% Data needs to be numeric: dummytable is a helper provided with this demo
% that expands each categorical variable into numeric dummy columns.
Xdummy = dummytable(X);
XNumeric = table2array(Xdummy);

gcp; % open a pool of workers

opts = statset('Display','iter','UseParallel',true,'TolFun',1e-2);

% Use a cvpartition object with 3-fold cross-validation
cv = cvpartition(height(X),'KFold',3);

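% sequentialfs below uses featureTest as its criterion function.
% featureTest is provided with this demo (not shown here); sequentialfs
% expects a function of the form crit = fun(XTRAIN,ytrain,XTEST,ytest)
% that returns a loss to minimize. A minimal, hypothetical sketch,
% assuming it fits a regression tree on the candidate features and
% returns the test-fold squared error:
%
% function crit = featureTest(Xtrain, ytrain, Xtest, ytest)
% % Fit a regression tree on the candidate feature subset
% mdl = fitrtree(Xtrain, ytrain);
% % Sum of squared errors on the held-out fold
% crit = sum((predict(mdl, Xtest) - ytest).^2);
% end
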
% Determine important features
fs = sequentialfs(@featureTest,XNumeric,Y,'options',opts,'cv',cv);

% Display
disp(Xdummy.Properties.VariableNames(fs).')

%% Bagged Trees with New Predictor Set
% We get about the same answer with a smaller set of predictors. This is
% important for computational speed, avoiding overfitting, and general
% simplicity. It could yield a more accurate result as well.

ttemp = templateTree('MinLeaf',1);
tbfit = fitensemble(XNumeric(training(c),fs),Y(training(c)),...
    'Bag',100,ttemp,'type','regression');

yfitFinal = predict(tbfit,XNumeric(test(c),fs));

% Show results
showFit(Y(test(c)), yfitFinal)

%% Neural Networks
% Use the neural network fitting app to train a network on _XNumeric_ and
% _Y_. Then generate the script and turn it into a function:

net = trainRegressionNetwork(XNumeric,Y);
yfitNN = net(XNumeric.');

% Show results
showFit(Y,yfitNN.');

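% trainRegressionNetwork is the function generated from the app (not shown
% here). A minimal, hypothetical sketch of what such a function might
% contain, assuming a feedforward network with 10 hidden neurons:
%
% function net = trainRegressionNetwork(X, Y)
% % fitnet expects samples as columns, so transpose the data
% net = fitnet(10);
% net = train(net, X.', Y.');
% end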