Classification of data set #1 and data set #3

This example is from Section IV.C of Tabassum and Ollila (2019). By running this code you can reproduce the results for data set #1 (Isolet vowels) and data set #3 (Khan et al.) given in Table 1 of the paper referenced below.

Reference:

[1] M.N. Tabassum and E. Ollila (2019), "A Compressive Classification Framework for High-Dimensional Data," Preprint, Submitted for publication, Oct. 2019.

(c) E. Ollila and M.N. Tabassum, CompressiveRDA MATLAB toolbox.

Contents

Initialize

% Reset the workspace and set up the simulation parameters.
clear; clc;

Q = 3;   % number of CRDA approaches compared
L = 10;  % number of Monte Carlo splits of the data into training / test sets

cntrX             = true;  % center the data matrix
use_uniform_prior = true;  % use uniform class priors instead of empirical ones

% Per-split performance metrics: one row per MC split, one column per method.
% CT = computation times, FSR = feature selection rates, TER = test error rates.
[CT, FSR, TER] = deal(zeros(L,Q));

print_info = true;  % print per-split results to the command window

Load and check the data

pt = 1/3; % percentage of training observations
dsname = 'IsoletVowels';

% Uncomment these two lines to compute the results for data set #3 (Khan et al.)
%dsname = 'khan2001';
%pt = 0.6;

% Load the data matrix Xo and the class labels yo, then validate the data
% (crda_check_data reports e.g. the percentage of missing values).
load(sprintf('%s.mat',dsname), 'Xo','yo');
Xo = crda_check_data(Xo,yo,print_info);

yo = double(yo);
G  = max(yo);    % number of classes (assumes labels are 1,...,G -- verify against data)
p = size(Xo,1);  % data dimension (presumably features are rows -- see FSR = K/p below)
percentage of missing values = 0.00

Start the simulations

rng('default'); % for reproducibility

% CRDA variants evaluated (see Table 1 of the referenced paper):
%   crda1 : Ell1-RSCM, {K,q} chosen by cross-validation
%   crda2 : Ell2-RSCM, {K,q} chosen by cross-validation
%   crda3 : PSCM, K = Kub, q chosen by cross-validation
% The three evaluations share identical logic, so they are run in a single
% inner loop instead of three copy-pasted sections.
methods = {'crda1','crda2','crda3'};

for mc=1:L  % Simulation with L splits of data for training-test sets

    % Random split into training set (X,y) and test set (Xt,yt)
    [yt,Xt,y,X,mu,prior] = crda_create_data(Xo,yo,pt,cntrX);

    if use_uniform_prior
        prior = (1/G)*ones(1,G);
    end
    Nt = length(yt); % number of test observations

    fprintf('Data-split# %d ...\n', mc);
    scurr = mc*1e3; % per-split seed, reset before each method below

    for algo = 1:Q
        % Reset the seed so every method sees the same random CV folds
        rng(scurr);
        tic;
        [yhat,~,~,K,~] = CRDA(Xt,X,y,'method',methods{algo},'prior',prior,'mu',mu);
        CT(mc,algo)  = toc;                 % computation time (seconds)
        TER(mc,algo) = sum(yhat ~= yt)/Nt;  % test error rate
        FSR(mc,algo) = K/p;                 % feature selection rate

        if print_info
            fprintf('\tCRDA%d : {TER, FSR} = {%5.2f, %5.2f} | CT = %.2f\n', ...
                algo, 100*TER(mc,algo),100*FSR(mc,algo), CT(mc,algo));
        end
    end

end
Data-split# 1 ...
	CRDA1 : {TER, FSR} = { 3.00,  9.89} | CT = 0.94
	CRDA2 : {TER, FSR} = { 2.00,  7.78} | CT = 0.44
	CRDA3 : {TER, FSR} = { 1.50, 46.68} | CT = 1.44
Data-split# 2 ...
	CRDA1 : {TER, FSR} = { 3.00, 14.91} | CT = 0.75
	CRDA2 : {TER, FSR} = { 3.00, 14.75} | CT = 0.41
	CRDA3 : {TER, FSR} = { 1.50, 46.35} | CT = 0.80
Data-split# 3 ...
	CRDA1 : {TER, FSR} = { 1.00, 27.55} | CT = 0.86
	CRDA2 : {TER, FSR} = { 2.50, 17.67} | CT = 0.68
	CRDA3 : {TER, FSR} = { 0.00, 43.44} | CT = 0.83
Data-split# 4 ...
	CRDA1 : {TER, FSR} = { 2.50,  9.40} | CT = 0.29
	CRDA2 : {TER, FSR} = { 1.00, 11.67} | CT = 0.22
	CRDA3 : {TER, FSR} = { 1.00, 46.19} | CT = 0.49
Data-split# 5 ...
	CRDA1 : {TER, FSR} = { 4.50, 14.59} | CT = 0.32
	CRDA2 : {TER, FSR} = { 1.50, 11.67} | CT = 0.30
	CRDA3 : {TER, FSR} = { 0.50, 47.65} | CT = 0.59
Data-split# 6 ...
	CRDA1 : {TER, FSR} = { 2.00, 36.14} | CT = 0.30
	CRDA2 : {TER, FSR} = { 2.00, 28.85} | CT = 0.22
	CRDA3 : {TER, FSR} = { 0.50, 48.46} | CT = 0.56
Data-split# 7 ...
	CRDA1 : {TER, FSR} = { 4.00,  9.40} | CT = 0.27
	CRDA2 : {TER, FSR} = { 4.00,  9.40} | CT = 0.20
	CRDA3 : {TER, FSR} = { 1.50, 45.54} | CT = 0.50
Data-split# 8 ...
	CRDA1 : {TER, FSR} = { 2.50, 12.32} | CT = 0.34
	CRDA2 : {TER, FSR} = { 3.50,  9.89} | CT = 0.21
	CRDA3 : {TER, FSR} = { 1.50, 45.54} | CT = 0.58
Data-split# 9 ...
	CRDA1 : {TER, FSR} = { 1.00, 26.74} | CT = 0.29
	CRDA2 : {TER, FSR} = { 4.50, 17.83} | CT = 0.21
	CRDA3 : {TER, FSR} = { 1.50, 44.57} | CT = 0.56
Data-split# 10 ...
	CRDA1 : {TER, FSR} = { 3.00, 11.83} | CT = 0.29
	CRDA2 : {TER, FSR} = { 2.50,  9.56} | CT = 0.27
	CRDA3 : {TER, FSR} = { 1.50, 46.19} | CT = 0.57

Calculate the naive TER

% Naive classifier baseline: always predict the majority class of the
% (last) training split. Explicit bin edges (1:G+1)-0.5 guarantee that bin k
% counts occurrences of class label k; the default automatic binning of
% histcounts is not guaranteed to align bins with integer labels.
n = histcounts(y, (1:G+1) - 0.5);
[~,tmp_indx] = max(n); % majority class label
% Naive test error rate in percent (computed for reference)
avgNaiveTER = 100 * ( sum(repmat(tmp_indx,size(yt)) ~= yt) / length(yt) );

Make a table

% Average each metric over the MC splits (dimension 1) and report the
% results, in percent, as one column vector per metric.
avgFSR = 100 * mean(FSR, 1).';
avgTER = 100 * mean(TER, 1).';
avgCT  = mean(CT, 1).';

fprintf('\nResults for %s dataset\n',dsname)
methodNames = {'CRDA1';'CRDA2';'CRDA3'};
table(methodNames, round(avgTER,2), round(avgFSR,2), round(avgCT,2), ...
      'VariableNames',{'method','TER','FSR','CT'})
Results for IsoletVowels dataset

ans =

  3×4 table

    method     TER      FSR      CT 
    _______    ____    _____    ____

    'CRDA1'    2.65    17.28    0.46
    'CRDA2'    2.65    13.91    0.32
    'CRDA3'     1.1    46.06    0.69

Make a bar plot

% Bar plots of the average test error rate and feature selection rate.
figure(1); clf;
names = categorical({'CRDA1','CRDA2','CRDA3'});

subplot(1,2,1)
bar(names,avgTER);
% TER is the *test* error rate (computed on the held-out test labels yt),
% so label the plot accordingly; FontSize matches the second subplot.
title('Test error rate','FontSize',18);
grid on;
set(gca,'FontSize',16,'LineWidth',1.3)

subplot(1,2,2);
bar(names,avgFSR);
title('Feature selection rate','FontSize',18);
grid on;
set(gca,'FontSize',16,'LineWidth',1.3)