%% Predicting Fuel Economy
% In this demo, we use several regression techniques, including
% regression trees, to predict the fuel economy of vehicles.
% ---------------------------------
%
% Copyright 2015 The MathWorks, Inc.

%% Load the Fuel Economy Data from an Excel File

% importCarTable is a project helper function that reads the spreadsheet
% into a table
CarData = importCarTable('2004dat.xlsx');

%% Categorical Variables
% Many of the variables represent discrete items, or categories: car vs.
% truck, front- vs. rear-wheel drive, and so on. To conserve memory and
% ensure they are treated as discrete values, we'll convert them to
% |categorical| variables.

CarData.Car_Truck = categorical(CarData.Car_Truck);
CarData.Police = categorical(CarData.Police);
CarData.Transmission = categorical(CarData.Transmission);
CarData.Drive = categorical(CarData.Drive);
CarData.AC = categorical(CarData.AC);
CarData.City_Highway = categorical(CarData.City_Highway);
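
% A quick optional check: |summary| of a categorical variable lists its
% categories and their counts, e.g.
%
% summary(CarData.Drive)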

% Remove Manufacturer and Car Line: too few samples per category
% (hundreds of unique values). Year is the same for every vehicle.
CarData.MfrName = [];
CarData.CarLine = [];
CarData.Year = [];

% %% Set the Random Number Generator for Repeatability
% % Several of the techniques below use random numbers. Setting the
% % generator here ensures repeatable results.
% rng(5)

%% Partition Data for Cross Validation
% cvpartition helps us create a cross-validation partition for the data.
% We create a test set (10% of the data, the default holdout fraction)
% and a training set (the remaining 90%).

% Build cross-validation partition
c = cvpartition(height(CarData),'holdout');

% Extract data at indices
mdlTrain = CarData(training(c),:);
mdlTest = CarData(test(c),:);
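
% Optional sanity check on the split: cvpartition objects expose
% TrainSize and TestSize properties.
%
% fprintf('Training set: %d rows, test set: %d rows\n', c.TrainSize, c.TestSize)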

% Extract predictors and response from the full data set (used later for
% feature selection and the neural network)
X = CarData;
X.MPG = []; % Remove the response (MPG) from the predictor table
Y = CarData.MPG;

%% Multiple Linear Regression
% As a baseline, first try multiple linear regression.

% Fit linear model
modelLR = fitlm(mdlTrain,'ResponseVar', 'MPG');
disp(modelLR)

% Make prediction on test set
yfitLR = predict(modelLR,mdlTest);

% Show Results
showFit(mdlTest.MPG, yfitLR)
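
% |showFit| is a project helper function (not shown here). A minimal
% sketch of what such a helper might look like, assuming it plots
% predicted vs. actual MPG and reports the root-mean-square error:
%
%   function showFit(yTrue, yFit)
%   % SHOWFIT Plot predicted vs. actual response and report the RMSE.
%   figure
%   scatter(yTrue, yFit, 'filled')
%   hold on
%   plot(xlim, xlim, 'r--') % perfect-prediction reference line
%   hold off
%   xlabel('Actual MPG'), ylabel('Predicted MPG')
%   title(sprintf('RMSE = %.2f', sqrt(mean((yTrue - yFit).^2))))
%   end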


%% Stepwise Linear Regression
% Stepwise linear regression adds and removes terms one at a time,
% keeping only those that significantly improve the fit.
modelSW = stepwiselm(mdlTrain,'ResponseVar', 'MPG','Upper','linear');
disp(modelSW)
disp(mdlTrain.Properties.VariableNames(modelSW.VariableInfo.InModel).');

% Make prediction on test set
yfitSW = predict(modelSW,mdlTest);

% Show Results
showFit(mdlTest.MPG, yfitSW)

% %% Support Vector Regression
% % Support vector regression fits a model that penalizes only those
% % residuals larger than a small margin (epsilon-insensitive loss).
% modelsvm = fitrsvm(mdlTrain,'MPG');
% disp(modelsvm)
%
% % Make prediction on test set
% yfitsvm = predict(modelsvm,mdlTest);
%
% % Show Results
% showFit(mdlTest.MPG, yfitsvm)

%% Regression Trees: Train the Tree
% In many cases, the form of the relationship between predictors and a
% response is unknown. Decision trees offer a nonparametric alternative
% for regression.

t = fitrtree(mdlTrain,'MPG');
t2 = prune(t,'level',250); % prune deep levels so the displayed tree is readable
% view(t2); % textual display
view(t2,'mode','graph'); % display as a tree
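
% The pruning level above is hard-coded. Optionally, cross-validation can
% suggest one instead (cvloss returns the best pruning level as its
% fourth output):
%
% [~,~,~,bestLevel] = cvloss(t,'Subtrees','all');
% t2 = prune(t,'Level',bestLevel);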

%% Regression Trees: Evaluate the Tree

% Predict with the full (unpruned) tree
yfitT = predict(t, mdlTest);

% Show Results
showFit(mdlTest.MPG, yfitT)

%% Bagged Decision Trees
% Bagging stands for bootstrap aggregation. Every tree in the ensemble is
% grown on an independently drawn bootstrap sample of the input data. To
% compute a prediction for the ensemble of trees, fitensemble averages
% the predictions from the individual trees. Ensemble techniques such as
% bagging combine many weak learners to produce a strong learner.
%
% To use default values:
%
% tbfit = fitensemble(Xtrain,Ytrain,'Bag',100,'tree','type','regression');
%
% To determine how many trees to use in your ensemble:
%
% treeLoss = oobLoss(tbfit,'mode','cumulative')
% plot(1:length(treeLoss),treeLoss)
%
% oobLoss (out-of-bag regression error) computes the MSE versus the
% number of grown trees. You can use a similar technique to find the best
% minimum leaf size.
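
% As a concrete optional check with this data set, train a bagged
% ensemble and plot the out-of-bag error; the curve typically flattens
% well before 100 trees:
%
% tbCheck = fitensemble(mdlTrain,'MPG','Bag',100,'tree','type','regression');
% figure
% plot(oobLoss(tbCheck,'mode','cumulative'))
% xlabel('Number of grown trees'), ylabel('Out-of-bag MSE')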

ttemp = templateTree('MinLeaf',1);
tbfit = fitensemble(mdlTrain,'MPG','Bag',100,ttemp,'type','regression');

% Predict
yfitTB = predict(tbfit,mdlTest);

% Show Results
showFit(mdlTest.MPG, yfitTB)


%% Bagged Decision Trees: Predictor Importance
% Predictor importance offers insight into the relative importance of each
% predictor in the model. It is calculated by summing changes in
% the mean squared error (MSE) due to splits on every predictor and
% dividing the sum by the number of branch nodes.

figure
pI = predictorImportance(tbfit);
barh(pI)
set(gca,'YTick',1:numel(X.Properties.VariableNames));
set(gca,'YTickLabel',strrep(X.Properties.VariableNames,'_','\_'));
xlabel('Predictor Importance')
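
% Optional: list the predictors from most to least important.
%
% [~,idx] = sort(pI,'descend');
% disp(X.Properties.VariableNames(idx).')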


%% Sequential Feature Selection
% Sequential feature selection chooses a subset of features from the data
% matrix X that best predicts the data in Y by adding features one at a
% time until prediction no longer improves. We use 3-fold
% cross-validation, so we can use the whole data set here without
% splitting off separate training and test sets. Normally you might use
% more folds, but we keep it small for demo purposes. We can then see
% which features to keep in our model.

% sequentialfs requires a numeric matrix; dummytable is a project helper
% that expands the categorical variables into dummy (indicator) columns
Xdummy = dummytable(X);
XNumeric = table2array(Xdummy);

gcp; % start a parallel pool if one is not already open

opts = statset('display','iter','UseParallel','always','TolFun',1e-2);

% Create a 3-fold cross-validation partition over the full data set
cv = cvpartition(height(X),'k',3);

% Determine important features
fs = sequentialfs(@featureTest,XNumeric,Y,'options',opts,'cv',cv);

% Display
disp(Xdummy.Properties.VariableNames(fs).')
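
% |featureTest| is a project helper (not shown here). sequentialfs
% expects a criterion function of the form fun(Xtrain,ytrain,Xtest,ytest)
% that returns a loss to minimize. A minimal sketch, assuming each
% candidate feature subset is scored by refitting a bagged tree ensemble:
%
%   function crit = featureTest(Xtrain, ytrain, Xtest, ytest)
%   % FEATURETEST Sum of squared test-set errors of a bagged ensemble
%   % fit on the candidate feature subset (sequentialfs minimizes this).
%   tt = templateTree('MinLeaf',1);
%   mdl = fitensemble(Xtrain, ytrain, 'Bag', 50, tt, 'type', 'regression');
%   crit = sum((ytest - predict(mdl, Xtest)).^2);
%   end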


%% Bagged Trees with the Selected Predictor Set
% Retraining on only the selected predictors gives about the same answer
% with a smaller predictor set. That matters for computational speed, for
% avoiding overfitting, and for general simplicity, and it can yield a
% more accurate result as well.

ttemp = templateTree('MinLeaf',1);
tbfit = fitensemble(XNumeric(training(c),fs),Y(training(c)),...
    'Bag',100,ttemp,'type','regression');

yfitFinal = predict(tbfit,XNumeric(test(c),fs));

% Show results
showFit(Y(test(c)), yfitFinal)

%% Neural Networks
% Use the Neural Net Fitting app to train a network on _XNumeric_ and
% _Y_, then generate the training script and turn it into a function:

net = trainRegressionNetwork(XNumeric,Y);
yfitNN = net(XNumeric.'); % the network expects observations in columns

% Show results
showFit(Y,yfitNN.');
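
% |trainRegressionNetwork| is the function generated from the app. A
% minimal sketch of what such generated code might look like, assuming a
% fitting network with 10 hidden neurons (the app's default):
%
%   function net = trainRegressionNetwork(X, Y)
%   % TRAINREGRESSIONNETWORK Train a fitting network on predictors X
%   % (observations in rows) and response Y.
%   net = fitnet(10);           % one hidden layer with 10 neurons
%   net = train(net, X.', Y.'); % the toolbox expects samples in columns
%   end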

