%% Predicting Fuel Economy
% In this demo, we use several regression techniques, including
% regression trees, to predict the fuel economy of vehicles.
% ---------------------------------
%
% Copyright 2015 The MathWorks, Inc.

%% Load the Fuel Economy Data from an Excel File

% importCarTable is a project helper function that reads the spreadsheet
% into a table
CarData = importCarTable('2004dat.xlsx');

%% Categorical Variables
% Many of the variables represent discrete items, or categories: car vs.
% truck, front- vs. rear-wheel drive, and so on. To conserve memory and
% ensure they are treated as discrete values, we'll convert them to
% |categorical| variables.

CarData.Car_Truck = categorical(CarData.Car_Truck);
CarData.Police = categorical(CarData.Police);
CarData.Transmission = categorical(CarData.Transmission);
CarData.Drive = categorical(CarData.Drive);
CarData.AC = categorical(CarData.AC);
CarData.City_Highway = categorical(CarData.City_Highway);
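
% A quick optional check: |summary| of a categorical variable lists its
% categories and their counts, e.g.
%
% summary(CarData.Drive)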

% Remove Manufacturer and Car Line: too few samples per category
% (hundreds of unique values). Year is the same for every vehicle.
CarData.MfrName = [];
CarData.CarLine = [];
CarData.Year = [];

% %% Set the Random Number Generator for Repeatability
% % Several of the techniques below use random numbers. Setting the
% % generator here ensures repeatable results.
% rng(5)

%% Partition Data for Cross Validation
% cvpartition helps us create a cross-validation partition for the data.
% We create a test set (10% of the data, the default holdout fraction)
% and a training set (the remaining 90%).

% Build cross-validation partition
c = cvpartition(height(CarData),'holdout');

% Extract data at indices
mdlTrain = CarData(training(c),:);
mdlTest = CarData(test(c),:);
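
% Optional sanity check on the split: cvpartition objects expose
% TrainSize and TestSize properties.
%
% fprintf('Training set: %d rows, test set: %d rows\n', c.TrainSize, c.TestSize)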

% Extract predictors and response from the full data set (used later for
% feature selection and the neural network)
X = CarData;
X.MPG = []; % Remove the response (MPG) from the predictor table
Y = CarData.MPG;

%% Multiple Linear Regression
% As a baseline, first try multiple linear regression.

% Fit linear model
modelLR = fitlm(mdlTrain,'ResponseVar', 'MPG');
disp(modelLR)

% Make prediction on test set
yfitLR = predict(modelLR,mdlTest);

% Show Results
showFit(mdlTest.MPG, yfitLR)
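
% |showFit| is a project helper function (not shown here). A minimal
% sketch of what such a helper might look like, assuming it plots
% predicted vs. actual MPG and reports the root-mean-square error:
%
%   function showFit(yTrue, yFit)
%   % SHOWFIT Plot predicted vs. actual response and report the RMSE.
%   figure
%   scatter(yTrue, yFit, 'filled')
%   hold on
%   plot(xlim, xlim, 'r--') % perfect-prediction reference line
%   hold off
%   xlabel('Actual MPG'), ylabel('Predicted MPG')
%   title(sprintf('RMSE = %.2f', sqrt(mean((yTrue - yFit).^2))))
%   end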


%% Stepwise Linear Regression
% Stepwise linear regression adds and removes terms one at a time,
% keeping only those that significantly improve the fit.
modelSW = stepwiselm(mdlTrain,'ResponseVar', 'MPG','Upper','linear');
disp(modelSW)
disp(mdlTrain.Properties.VariableNames(modelSW.VariableInfo.InModel).');

% Make prediction on test set
yfitSW = predict(modelSW,mdlTest);

% Show Results
showFit(mdlTest.MPG, yfitSW)

% %% Support Vector Regression
% % Support vector regression fits a model that penalizes only those
% % residuals larger than a small margin (epsilon-insensitive loss).
% modelsvm = fitrsvm(mdlTrain,'MPG');
% disp(modelsvm)
%
% % Make prediction on test set
% yfitsvm = predict(modelsvm,mdlTest);
%
% % Show Results
% showFit(mdlTest.MPG, yfitsvm)

%% Regression Trees: Train the Tree
% In many cases, the form of the relationship between predictors and a
% response is unknown. Decision trees offer a nonparametric alternative
% for regression.

t = fitrtree(mdlTrain,'MPG');
t2 = prune(t,'level',250); % prune deep levels so the displayed tree is readable
% view(t2); % textual display
view(t2,'mode','graph'); % display as a tree
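
% The pruning level above is hard-coded. Optionally, cross-validation can
% suggest one instead (cvloss returns the best pruning level as its
% fourth output):
%
% [~,~,~,bestLevel] = cvloss(t,'Subtrees','all');
% t2 = prune(t,'Level',bestLevel);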

%% Regression Trees: Evaluate the Tree

% Predict with the full (unpruned) tree
yfitT = predict(t, mdlTest);

% Show Results
showFit(mdlTest.MPG, yfitT)

%% Bagged Decision Trees
% Bagging stands for bootstrap aggregation. Every tree in the ensemble is
% grown on an independently drawn bootstrap sample of the input data. To
% compute a prediction for the ensemble of trees, fitensemble averages
% the predictions from the individual trees. Ensemble techniques such as
% bagging combine many weak learners to produce a strong learner.
%
% To use default values:
%
% tbfit = fitensemble(Xtrain,Ytrain,'Bag',100,'tree','type','regression');
%
% To determine how many trees to use in your ensemble:
%
% treeLoss = oobLoss(tbfit,'mode','cumulative')
% plot(1:length(treeLoss),treeLoss)
%
% oobLoss (out-of-bag regression error) computes the MSE versus the
% number of grown trees. You can use a similar technique to find the best
% minimum leaf size.
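
% As a concrete optional check with this data set, train a bagged
% ensemble and plot the out-of-bag error; the curve typically flattens
% well before 100 trees:
%
% tbCheck = fitensemble(mdlTrain,'MPG','Bag',100,'tree','type','regression');
% figure
% plot(oobLoss(tbCheck,'mode','cumulative'))
% xlabel('Number of grown trees'), ylabel('Out-of-bag MSE')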

ttemp = templateTree('MinLeaf',1);
tbfit = fitensemble(mdlTrain,'MPG','Bag',100,ttemp,'type','regression');

% Predict
yfitTB = predict(tbfit,mdlTest);

% Show Results
showFit(mdlTest.MPG, yfitTB)


%% Bagged Decision Trees: Predictor Importance
% Predictor importance offers insight into the relative importance of each
% predictor in the model. It is calculated by summing changes in
% the mean squared error (MSE) due to splits on every predictor and
% dividing the sum by the number of branch nodes.

figure
pI = predictorImportance(tbfit);
barh(pI)
set(gca,'YTick',1:numel(X.Properties.VariableNames));
set(gca,'YTickLabel',strrep(X.Properties.VariableNames,'_','\_'));
xlabel('Predictor Importance')
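
% Optional: list the predictors from most to least important.
%
% [~,idx] = sort(pI,'descend');
% disp(X.Properties.VariableNames(idx).')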


%% Sequential Feature Selection
% Sequential feature selection chooses a subset of features from the data
% matrix X that best predicts the data in Y by adding features one at a
% time until prediction no longer improves. We use 3-fold
% cross-validation, so we can use the whole data set here without
% splitting off separate training and test sets. Normally you might use
% more folds, but we keep it small for demo purposes. We can then see
% which features to keep in our model.

% sequentialfs requires a numeric matrix; dummytable is a project helper
% that expands the categorical variables into dummy (indicator) columns
Xdummy = dummytable(X);
XNumeric = table2array(Xdummy);

gcp; % start a parallel pool if one is not already open

opts = statset('display','iter','UseParallel','always','TolFun',1e-2);

% Create a 3-fold cross-validation partition over the full data set
cv = cvpartition(height(X),'k',3);

% Determine important features
fs = sequentialfs(@featureTest,XNumeric,Y,'options',opts,'cv',cv);

% Display
disp(Xdummy.Properties.VariableNames(fs).')
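
% |featureTest| is a project helper (not shown here). sequentialfs
% expects a criterion function of the form fun(Xtrain,ytrain,Xtest,ytest)
% that returns a loss to minimize. A minimal sketch, assuming each
% candidate feature subset is scored by refitting a bagged tree ensemble:
%
%   function crit = featureTest(Xtrain, ytrain, Xtest, ytest)
%   % FEATURETEST Sum of squared test-set errors of a bagged ensemble
%   % fit on the candidate feature subset (sequentialfs minimizes this).
%   tt = templateTree('MinLeaf',1);
%   mdl = fitensemble(Xtrain, ytrain, 'Bag', 50, tt, 'type', 'regression');
%   crit = sum((ytest - predict(mdl, Xtest)).^2);
%   end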


%% Bagged Trees with the Selected Predictor Set
% Retraining on only the selected predictors gives about the same answer
% with a smaller predictor set. That matters for computational speed, for
% avoiding overfitting, and for general simplicity, and it can yield a
% more accurate result as well.

ttemp = templateTree('MinLeaf',1);
tbfit = fitensemble(XNumeric(training(c),fs),Y(training(c)),...
    'Bag',100,ttemp,'type','regression');

yfitFinal = predict(tbfit,XNumeric(test(c),fs));

% Show results
showFit(Y(test(c)), yfitFinal)

%% Neural Networks
% Use the Neural Net Fitting app to train a network on _XNumeric_ and
% _Y_, then generate the training script and turn it into a function:

net = trainRegressionNetwork(XNumeric,Y);
yfitNN = net(XNumeric.'); % the network expects observations in columns

% Show results
showFit(Y,yfitNN.');
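
% |trainRegressionNetwork| is the function generated from the app. A
% minimal sketch of what such generated code might look like, assuming a
% fitting network with 10 hidden neurons (the app's default):
%
%   function net = trainRegressionNetwork(X, Y)
%   % TRAINREGRESSIONNETWORK Train a fitting network on predictors X
%   % (observations in rows) and response Y.
%   net = fitnet(10);           % one hidden layer with 10 neurons
%   net = train(net, X.', Y.'); % the toolbox expects samples in columns
%   end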

