% In this demo, we use regression trees to predict the fuel economy of
% vehicles.
%% Loading Excel file representing Fuel Economy Data
% Read the spreadsheet through the project's import helper.
CarData = importCarTable('2004dat.xlsx');
%% Categorical Variables
% Several columns hold discrete items or categories (car vs. truck, front
% vs. rear wheel drive, ...). Converting them to |categorical| conserves
% memory and lets them be handled as categories rather than raw values.
discreteVars = {'Car_Truck','Police','Transmission','Drive','AC','City_Highway'};
for k = 1:numel(discreteVars)
    CarData.(discreteVars{k}) = categorical(CarData.(discreteVars{k}));
end
% Drop columns that are not useful as predictors: there are hundreds of
% unique manufacturers and car lines with too few samples of each, and
% Year is the same for every row.
CarData(:,{'MfrName','CarLine','Year'}) = [];
% %% Test with 10% of the Data
% % Several of the techniques below use random numbers. We are going
% % to set the random number generator here to ensure repeatability.
% rng(5)
%% Partition Data for Cross Validation
% cvpartition builds a holdout split of the rows into a test set and a
% training set. No holdout fraction is passed, so cvpartition's default
% applies (described in this demo as 10% test / 90% training).
c = cvpartition(height(CarData),'holdout');
% Pull the test and training rows out of the table using the partition
mdlTest = CarData(test(c),:);
mdlTrain = CarData(training(c),:);
% Separate the response (MPG) from the predictors for the full data set
Y = CarData.MPG;
X = CarData;
X(:,'MPG') = []; % predictors are everything except MPG
%% Multiple Linear Regression
% Start with a plain multiple linear regression: fit a linear model to the
% training table with MPG as the response.
modelLR = fitlm(mdlTrain,'ResponseVar','MPG');
% Predict MPG for the held-out rows and compare against the measured values
yfitLR = modelLR.predict(mdlTest);
showFit(mdlTest.MPG, yfitLR)
%% Stepwise Linear Regression
% Stepwise linear regression tries terms one at a time to see which ones
% decrease the error the most. The 'Upper','linear' option is passed to
% bound the models that the search considers.
modelSW = stepwiselm(mdlTrain,'Upper','linear','ResponseVar','MPG');
% Predict MPG for the held-out rows and compare against the measured values
yfitSW = modelSW.predict(mdlTest);
showFit(mdlTest.MPG, yfitSW)
% %% Support Vector Regression
% % Support vector regression fits a model that tolerates residuals inside
% % a small margin and penalizes only the points that fall outside it.
% modelsvm = fitrsvm(mdlTrain,'MPG');
% disp(modelsvm)
% % Make prediction on test set
% yfitsvm = predict(modelsvm,mdlTest);
% % Show Results
% showFit(mdlTest.MPG, yfitsvm)
%% Regression Trees: Train the Tree
% When the form of the relationship between the predictors and the
% response is not known, a decision tree offers a nonparametric
% alternative for regression.
t = fitrtree(mdlTrain,'MPG');
% Make a pruned copy of the tree and draw it as a graph. Enable the
% commented call below instead to get the textual listing.
t2 = t.prune('level',250);
% view(t2); % textual
view(t2,'mode','graph'); % as a tree
% Regression Trees: Evaluate the Tree
% Note that the predictions come from the full tree t, not the pruned
% copy t2 that is displayed above.
yfitT = t.predict(mdlTest);
% Compare predicted and measured MPG on the held-out rows
showFit(mdlTest.MPG, yfitT)
%% Bagged Decision Trees
% Bagging is short for bootstrap aggregation: every tree in the ensemble
% is grown on its own independently drawn sample of the input data, and
% the ensemble prediction is the average of the individual trees'
% predictions. Ensemble techniques such as bagging combine many weak
% learners to produce a strong learner.
%
% To use default values:
% tbfit = fitensemble(Xtrain,Ytrain,'Bag',100,'tree','type','regression');
% To determine how many trees to use in your ensemble:
% treeLoss = oobLoss(tbfit,'mode','cumulative')
% plot(1:length(treeLoss),treeLoss)
% oobLoss (out-of-bag regression error) computes MSE versus the number of
% grown trees. A similar technique can be used to find the best minimum
% leaf size.
numTrees = 100;
ttemp = templateTree('MinLeaf',1);
tbfit = fitensemble(mdlTrain,'MPG','Bag',numTrees,ttemp,'type','regression');
% Predict MPG for the held-out rows
yfitTB = tbfit.predict(mdlTest);
% Compare predicted and measured MPG
showFit(mdlTest.MPG, yfitTB)
%% Bagged Decision Trees: Predictor Importance
% Predictor importance offers insight into the relative importance of each
% predictor in the model. It is calculated by summing changes in
% the mean squared error (MSE) due to splits on every predictor and
% dividing the sum by the number of branch nodes.
pI = predictorImportance(tbfit);
% Draw the importances as a horizontal bar chart in a new figure. Without
% this plot the xlabel call below would only relabel whatever axes happen
% to be current and pI would never be shown.
figure
barh(pI)
% Label each bar with its predictor name; the 'none' interpreter keeps
% underscores in names such as Car_Truck from rendering as subscripts.
set(gca,'YTick',1:numel(pI),'YTickLabel',tbfit.PredictorNames,...
    'TickLabelInterpreter','none')
xlabel('Predictor Importance')
%% Sequential Feature Selection
% Sequential feature selection selects a subset of features from the data
% matrix X that best predict the data in Y by sequentially selecting
% features until there is no improvement in prediction. We are using 3
% fold cross-validation so we can use the whole data set here without
% having to break up a training set and a test set. Normally, you
% may want to use a higher fold, but we are keeping it small for demo
% purposes. We can then see which features to keep in our model.
% Data needs to be numeric: expand the predictor table with the project's
% dummytable helper, then flatten the resulting table to a plain array.
Xdummy = dummytable(X);
XNumeric = table2array(Xdummy);
gcp; % open a pool of workers
% UseParallel is given as a logical rather than the legacy 'always' string
opts = statset('display','iter','UseParallel',true,'TolFun',1e-2);
% use our cv partition object with 3-fold cross validation
cv = cvpartition(height(X),'k',3);
% Determine important features; fs marks the selected columns of XNumeric
fs = sequentialfs(@featureTest,XNumeric,Y,'options',opts,'cv',cv);
% Display the names of the selected features, one per line
disp(Xdummy.Properties.VariableNames(fs)')
%% Treebagger with New Predictor Set
% About the same answer but smaller set of predictors. Important for
% computational speed, avoiding overtraining, and for general simplicity.
% Could yield a more accurate result, as well.
ttemp = templateTree('MinLeaf',1);
% Train on the training rows of the holdout partition c, restricted to the
% columns chosen by sequential feature selection (fs). The ensemble
% settings mirror the bagged-tree fit earlier in this script; the original
% call was cut off after the line continuation and could not be parsed.
tbfit = fitensemble(XNumeric(training(c),fs),Y(training(c)),...
    'Bag',100,ttemp,'type','regression');
% Predict on the held-out rows using the same selected columns
yfitFinal = predict(tbfit,XNumeric(test(c),fs));
% Show results
showFit(Y(test(c)), yfitFinal)
%% Neural Networks
% Use the app to fit a network to _XNumeric_ and _Y_. Then generate the
% script from the app and turn it into a function (called here as
% trainRegressionNetwork):
net = trainRegressionNetwork(XNumeric,Y);
% Evaluate the returned network on the full predictor matrix. The matrix
% is transposed first (presumably the network expects one observation per
% column - TODO confirm against trainRegressionNetwork).
yfitNN = net(XNumeric.');
% Show results