root/MachineLearningWithMATLAB/Regression_FuelEconomy/FuelEconomyPrediction.m

%% Predicting Fuel Economy
% In this demo, we use several regression techniques, including regression
% trees, to predict the fuel economy of vehicles.
% ---------------------------------
%
% Copyright 2015 The MathWorks, Inc.

%% Loading Excel file representing Fuel Economy Data
CarData = importCarTable('2004dat.xlsx');
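
% |importCarTable| is an import function generated for this demo (not shown
% in this file). Assuming the data sits on the first worksheet with column
% headers, a plain alternative would be something like:
%
%   CarData = readtable('2004dat.xlsx');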

%% Categorical Variables
% Many of the variables represent discrete items, or categories: a car or a
% truck, front-wheel or rear-wheel drive, etc. To conserve memory and to
% ensure the models handle these as categories, we'll convert them to
% |categorical| variables.
CarData.Car_Truck = categorical(CarData.Car_Truck);
CarData.Police = categorical(CarData.Police);
CarData.Transmission = categorical(CarData.Transmission);
CarData.Drive = categorical(CarData.Drive);
CarData.AC = categorical(CarData.AC);
CarData.City_Highway = categorical(CarData.City_Highway);

% Remove variables that are not useful predictors: there are not enough
% samples of each Manufacturer and Car Line (hundreds of unique values),
% and Year is the same for every vehicle.
CarData.MfrName = [];
CarData.CarLine = [];
CarData.Year = [];

% %% Set the Random Seed for Repeatability
% % Several of the techniques below use random numbers. We are going
% % to set the random number generator here to ensure repeatability.
% rng(5)

%% Partition Data for Cross Validation
% cvpartition creates a cross-validation partition for the data. We create
% a test set (10% of the data) and a training set (90% of the data).

% Build cross-validation partition (10% holdout)
c = cvpartition(height(CarData),'holdout',0.1);

% Extract data at indices
mdlTrain = CarData(training(c),:);
mdlTest = CarData(test(c),:);

% Extract predictors and response
X = CarData;
X.MPG = []; % Remove the response (MPG) from the predictor table
Y = CarData.MPG;

%% Multiple Linear Regression
% First try multiple linear regression

% Fit linear model
modelLR = fitlm(mdlTrain,'ResponseVar', 'MPG');
disp(modelLR)

% Make prediction on test set
yfitLR = predict(modelLR,mdlTest);

% Show Results
showFit(mdlTest.MPG, yfitLR)
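
% |showFit| is a helper that ships with this demo and is reused after each
% model below; its implementation is not shown in this file. A minimal
% sketch of what it might do (an assumption, not the actual helper) is to
% plot predicted vs. actual MPG with a reference line and report the RMSE:
%
%   function showFit(yActual, yPredicted)
%   % Compare model predictions against the actual response values
%   figure
%   scatter(yActual, yPredicted, 'filled')
%   hold on
%   plot(xlim, xlim, 'r--')            % perfect-prediction reference line
%   hold off
%   xlabel('Actual MPG'); ylabel('Predicted MPG')
%   rmse = sqrt(mean((yActual - yPredicted).^2));
%   title(sprintf('RMSE = %.2f', rmse))
%   end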

%% Stepwise Linear Regression
% Stepwise linear regression adds and removes terms one at a time, keeping
% only the terms that significantly improve the fit.
modelSW = stepwiselm(mdlTrain,'ResponseVar', 'MPG','Upper','linear');
disp(modelSW)

% List the variables that made it into the model
disp(mdlTrain.Properties.VariableNames(modelSW.VariableInfo.InModel).');

% Make prediction on test set
yfitSW = predict(modelSW,mdlTest);

% Show Results
showFit(mdlTest.MPG, yfitSW)

% %% Support Vector Regression
% % Support vector machine (SVM) regression fits a model that depends only
% % on a subset of the training observations (the support vectors) and can
% % capture nonlinear relationships through kernels. fitrsvm requires
% % R2015b or later.
% modelsvm = fitrsvm(mdlTrain,'MPG');
% disp(modelsvm)
%
% % Make prediction on test set
% yfitsvm = predict(modelsvm,mdlTest);
%
% % Show Results
% showFit(mdlTest.MPG, yfitsvm)

%% Regression Trees: Train the Tree
% In many cases, the form of the relationship between predictors and a
% response is unknown. Decision trees offer a nonparametric alternative
% for regression.
t = fitrtree(mdlTrain,'MPG');

% Prune the tree so the graphical display is easier to read
t2 = prune(t,'level',250);
% view(t2); % textual
view(t2,'mode','graph'); % as a tree

%% Regression Trees: Evaluate the Tree
% Predict on the test set with the full (unpruned) tree
yfitT = predict(t, mdlTest);

% Show Results
showFit(mdlTest.MPG, yfitT)

%% Bagged Decision Trees
% Bagging stands for bootstrap aggregation. Every tree in the ensemble is
% grown on an independently drawn bootstrap sample of the input data. To
% compute a prediction for the ensemble of trees, fitensemble
% takes an average of the predictions from the individual trees. Ensemble
% techniques such as bagging combine many weak learners to produce a
% strong learner.
%
% To use default values:
%
% tbfit = fitensemble(Xtrain,Ytrain,'Bag',100,'tree','type','regression');
%
% To determine how many trees to use in your ensemble:
%
% treeLoss = oobLoss(tbfit,'mode','cumulative')
% plot(1:length(treeLoss),treeLoss)
%
% oobLoss (out-of-bag regression error) computes the MSE versus the number
% of grown trees. You can use a similar technique to figure out the best
% minimum leaf size.
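% For instance, a rough sketch of such a sweep (illustrative values, not
% part of the original demo) might look like:
%
%   leafSizes = [1 5 10 20 50];
%   oobErr = zeros(size(leafSizes));
%   for k = 1:numel(leafSizes)
%       tt = templateTree('MinLeafSize',leafSizes(k));
%       ens = fitensemble(mdlTrain,'MPG','Bag',100,tt,'type','regression');
%       oobErr(k) = oobLoss(ens);     % out-of-bag MSE at this leaf size
%   end
%   figure, plot(leafSizes,oobErr,'-o')
%   xlabel('Minimum leaf size'), ylabel('Out-of-bag MSE')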

ttemp = templateTree('MinLeafSize',1);
tbfit = fitensemble(mdlTrain,'MPG','Bag',100,ttemp,'type','regression');

% Predict
yfitTB = predict(tbfit,mdlTest);

% Show Results
showFit(mdlTest.MPG, yfitTB)

%% Bagged Decision Trees: Predictor Importance
% Predictor importance offers insight into the relative importance of each
% predictor in the model. It is calculated by summing changes in
% the mean squared error (MSE) due to splits on every predictor and
% dividing the sum by the number of branch nodes.
figure
pI = predictorImportance(tbfit);
barh(pI)
set(gca,'YTick',1:numel(X.Properties.VariableNames));
set(gca,'YTickLabel',strrep(X.Properties.VariableNames,'_','\_'));
xlabel('Predictor Importance')

%% Sequential Feature Selection
% Sequential feature selection selects a subset of features from the data
% matrix X that best predict the data in Y by sequentially selecting
% features until there is no further improvement in prediction. We are
% using 3-fold cross-validation, so we can use the whole data set here
% without having to break it up into a training set and a test set.
% Normally, you may want to use a larger number of folds, but we are
% keeping it small for demo purposes. We can then see which features to
% keep in our model.

% The data needs to be numeric: expand the categorical variables into dummy
% (indicator) variables, then convert the table to a matrix
Xdummy = dummytable(X);
XNumeric = table2array(Xdummy);
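
% |dummytable| is a helper included with this demo (not shown in this
% file). A rough sketch of what such a helper might do (an assumption, not
% the actual implementation) is to expand each categorical variable into
% one indicator column per category and keep numeric variables unchanged:
%
%   function D = dummytable(T)
%   % Expand categorical table variables into indicator (dummy) columns
%   D = table;
%   for k = 1:width(T)
%       name = T.Properties.VariableNames{k};
%       v = T.(name);
%       if iscategorical(v)
%           dv = dummyvar(v);                 % one column per category
%           cats = categories(v);
%           for j = 1:numel(cats)
%               newName = matlab.lang.makeValidName([name '_' cats{j}]);
%               D.(newName) = dv(:,j);
%           end
%       else
%           D.(name) = v;
%       end
%   end
%   end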

gcp; % open a pool of workers for parallel feature selection
opts = statset('Display','iter','UseParallel',true,'TolFun',1e-2);

% Use 3-fold cross-validation for the feature selection
cv = cvpartition(height(X),'kfold',3);

% Determine important features
fs = sequentialfs(@featureTest,XNumeric,Y,'options',opts,'cv',cv);

% Display the selected features
disp(Xdummy.Properties.VariableNames(fs).')
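
% |featureTest| is the criterion function handed to sequentialfs; its
% implementation is not shown in this file. sequentialfs expects a function
% of the form criterion = fun(Xtrain,ytrain,Xtest,ytest) that returns a
% scalar loss, typically the sum of squared errors on the test fold. A
% minimal sketch of what it might look like (an assumption, not the actual
% helper):
%
%   function criterion = featureTest(Xtrain, ytrain, Xtest, ytest)
%   % Fit a simple model on the training fold, score it on the test fold
%   mdl = fitlm(Xtrain, ytrain);
%   yfit = predict(mdl, Xtest);
%   criterion = sum((ytest - yfit).^2);   % summed squared error
%   end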

%% Bagged Trees with New Predictor Set
% We get about the same answer with a smaller set of predictors. This
% matters for computational speed, for avoiding overfitting, and for
% general simplicity. It could yield a more accurate result as well.
ttemp = templateTree('MinLeafSize',1);
tbfit = fitensemble(XNumeric(training(c),fs),Y(training(c)),...
    'Bag',100,ttemp,'type','regression');
yfitFinal = predict(tbfit,XNumeric(test(c),fs));

% Show results
showFit(Y(test(c)), yfitFinal)

%% Neural Networks
% Use the neural network fitting app (|nftool|) to train a network on
% _XNumeric_ and _Y_. Then generate the script and turn it into a function:
net = trainRegressionNetwork(XNumeric,Y);

% Evaluate the trained network (it expects observations as columns)
yfitNN = net(XNumeric.');

% Show results
showFit(Y,yfitNN.');
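
% |trainRegressionNetwork| is the function generated from the app and is
% not shown in this file. A minimal sketch of what such a generated
% function might contain (an assumption; the exact code depends on the app
% settings):
%
%   function net = trainRegressionNetwork(X, Y)
%   % X is observations-by-features; the network expects columns as samples
%   x = X.';
%   t = Y.';
%   net = fitnet(10);                 % one hidden layer with 10 neurons
%   net.divideParam.trainRatio = 70/100;
%   net.divideParam.valRatio   = 15/100;
%   net.divideParam.testRatio  = 15/100;
%   net = train(net, x, t);
%   end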