%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Comparison of WAIC and DIC in a normal mixture
%%% MATLAB program. Sumio Watanabe.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% This program may need several hours.
%%% If you find something wrong, please let the author know it.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% In order to evaluate information criteria in Bayes estimation, we have to
%%% construct the accurate posterior distribution.
%%% We have to pay attention to this accuracy.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% This program studies a normal mixture on a 3 dimensional space.
%%% N(b,STD^2) : 3 dimensional Normal distribution, average b and variance STD^2
%%% Statistical Model : p=(1-aa) N(0,STD^2) + aa N(b,STD^2) : b=(b1,b2,b3)
%%% True : q=(1-AA)N(0,STD^2) + AA N(BB,STD^2), BB=(B1,B2,B3)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Reset graphics and workspace before the experiment starts.
clf;
clear;
close all;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Experiment-size constants (read throughout the script).
TRIAL_N=50; %%% Independent Trial Number
DISPLAY_N=5; %%% Display number (progress is printed every DISPLAY_N trials)
NNN=200; %%% Number of training samples %%% NNN=20, 100, 200, ...
TTT=5000; %%% Number of testing samples (used for the Monte Carlo estimate of GE)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% In a regular case, DIC1=DIC2=WAIC,
%%% In singular and delicate cases, they are different.
%%% Case selects the true distribution: 1=regular, 2=singular, 3=delicate.
Case=2;
%%%%%%%%%%%%%%%%%% Regular Case
%%% Case 1 (regular): the true center (2,2,2) is far from the origin, so the
%%% parameter is identifiable and regular asymptotics give an expected
%%% generalization error of d/(2n) with d=4 parameters (aa,b1,b2,b3).
if(Case==1)
AA=0.5; %%% True mixture ratio
B1=2; %%% True b1
B2=2; %%% True b2
B3=2; %%% True b3
%%% Message typo fixed: 'Theoretial' -> 'Theoretical' (matches Cases 2 and 3).
fprintf('Regular case. Sample=%g. Theoretical Gen Err=%f\n',NNN,(3+1)/(2*NNN));
end
%%%%%%%%%%%%%%%%% Singular Case
%%% Case 2 (singular): the true center is the origin, so both mixture
%%% components coincide and (aa,b) is unidentifiable; the printed value
%%% 1/(2*NNN) is the theoretical generalization error for this case.
if(Case==2)
AA=0.5; %%% True mixture ratio
B1=0; %%% True b1
B2=0; %%% True b2
B3=0; %%% True b3
fprintf('Singular case. Sample=%g. Theoretical Gen Err=%f\n',NNN,1/(2*NNN));
end
%%%%%%%%%%%%%%%%%% Delicate Case
%%% Case 3 (delicate): the true center (0.3,0.3,0.3) is close to, but not at,
%%% the singularity; the theoretical generalization error is only known to lie
%%% in the printed interval [1/(2*NNN), 2*(3+1)/(2*NNN)].
if(Case==3)
AA=0.5; %%% True mixture ratio
B1=0.3; %%% True b1
B2=0.3; %%% True b2
B3=0.3; %%% True b3
fprintf('Delicate case. Sample=%g. Theoretical Gen Err = %f ~ %f \n',NNN,1/(2*NNN),2*(3+1)/(2*NNN));
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% B1=B2=B3=2 : Regular Case
%%% B1=B2=B3=0 : Singular Case
%%% B1=B2=B3=0.3 : Delicate Case
%%% In practical applications, model selection and hypothesis test
%%% are conducted in delicate cases.
%%% Let B=|B1|=|B2|=|B3|. Then
%%% in this situation (NNN=200), it seems that
%%% 0<=B<=0.1 Singular
%%% 0.1<=B<=0.5 Delicate
%%% 0.5<=B : Regular
%%% NOTE(review): the header of this conditional appears to have been lost or
%%% garbled just above this excerpt (presumably "if(aa>1)", a wrap-around
%%% reflection keeping the proposed mixture ratio aa inside [0,1]) -- confirm
%%% against the original source file.
aa=aa-1;
end
%%% Metropolis accept/reject for the proposed state (aa,bb1,bb2,bb3).
%%% HHH is defined outside this excerpt; from the acceptance rule below it is
%%% presumably an energy (minus log of likelihood times prior) -- TODO confirm.
hh1=HHH(aa,bb1,bb2,bb3,XX1,XX2,XX3);
DD=hh1-hh0; %%% energy difference: proposal minus current state
if(exp(-DD)>rand(1,1)) %%% Metropolis probability (always accepts when DD<=0)
aa0=aa; %%% accept: the proposal becomes the current state
b10=bb1;
b20=bb2;
b30=bb3;
hh0=hh1;
if(t>BURNIN)
acceptance=acceptance+1; %%% count acceptances only after burn-in
end
end
%%% Record every INTER-th post-burn-in state as a posterior sample.
%%% Idiom fix: '&' -> '&&' (short-circuit operator is the recommended form
%%% for scalar logical conditions in MATLAB; behavior is unchanged here).
if(mod(t,INTER)==0&&t>BURNIN) %%% MCMC parameters sampled
k=k+1;
waa(1,k)=aa0; %%% posterior sample of the mixture ratio
wb1(1,k)=b10; %%% posterior samples of the center b=(b1,b2,b3)
wb2(1,k)=b20;
wb3(1,k)=b30;
EEE(1,k)=hh0; %%% energy of the stored state
end
end %%% closes the MCMC time-step loop over t (loop header lies above this excerpt)
ACCEPT(trial)=acceptance/(MCMC-BURNIN); %%% acceptance ratio over post-burn-in steps
if(mod(trial,DISPLAY_N)==0)
fprintf('WAIC %g times done.\n',trial); %%% Display trial number (progress report)
end
%%%%%%%%%%%%%%%%% If exchange rate is not appropriate, then MC step should be improved.
%%%%%%%%%%%%%%%%% recommended exchange rate : 0.05 < exchange_rate < 0.6
%%%%%%%%%%%%%%%%% It seems that there is no theory about the best exchange rate for MCMC.
%%%%%%%%%%%%%%%%%%%%%%%% Calculation of Training Error %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Training error: TE = (1/n) * sum_i log( q(X_i) / E_w[p(X_i|w)] ),
%%% where E_w[.] is the posterior-predictive mean over the stored MCMC sample.
phat=zeros(1,NNN);
for n=1:NNN
phat(n)=mean(pmodel1(XX1(1,n),XX2(1,n),XX3(1,n),waa,wb1,wb2,wb3));
end
qtrue=pmodel2(XX1,XX2,XX3,AA,B1,B2,B3); %%% true density at the training points
TE(trial)=mean(log(qtrue./phat));
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Calculation of WAIC and DIC %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% WAIC %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% S.Watanabe, Algebraic Geometry and Statistical Learning Theory, Cambridge University Press, 2009.
%%% WAIC is always the unbiased estimator of the generalization error.
%%% Variance of WAIC is equal to that of GE, even in singular cases.
%%% Algebraic geometry is necessary to prove such a fact.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Original DIC1 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% D.J.Spiegelhalter, N.G.Best,B.P.Carlin, A. Linde, Journal of Royal Statistical Society, B
%%% 64(4),583-639,2002
%%% DIC1 assumes that the posterior distribution can be approximated by
%%% some normal distribution.
%%%%%%%%%%%%%%%%%% DIC2 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% A.Gelman,J.B.Carlin,H.S.Stern,D.B.Rubin, Bayesian Data Analysis, 2nd Edition Chapman& Hall, 2004.
%%% DIC2 assumes that likelihood function is subject to chi-square distribution.
%%% Andrew Gelman John B. Carlin, Hal S. Stern, David B. Dunson, Aki Vehtari, Donald B. Rubin
%%% Bayesian Data Analysis, 3rd Edition Chapman & Hall, 2013,
%%% Description of DIC2 was improved.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Posterior-mean parameters, used as the plug-in point for DIC1.
avaa=mean(waa); %%% DIC1 average parameter (posterior mean of mixture ratio)
avb1=mean(wb1); %%% DIC1 average parameter
avb2=mean(wb2); %%% DIC1 average parameter
avb3=mean(wb3); %%% DIC1 average parameter
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Per-point posterior moments of the log likelihood:
%%% power1(i)=E_w[log p(X_i|w)], power2(i)=E_w[(log p(X_i|w))^2].
power1=zeros(1,NNN);
power2=zeros(1,NNN);
for i=1:1:NNN
logpr=log(pmodel1(XX1(1,i),XX2(1,i),XX3(1,i),waa,wb1,wb2,wb3));
power1(i)=mean(logpr);
power2(i)=mean(logpr.*logpr);
end
VV=sum(power2-power1.*power1); %%%%%%%%% WAIC Functional Variance: sum_i Var_w[log p(X_i|w)]
dic0=log(pmodel2(XX1,XX2,XX3,avaa,avb1,avb2,avb3)); %%%%% log likelihood at the posterior-mean parameter
eff_num=2.0*sum( - power1 + dic0 ); %%%%%%% DIC1 effective number of parameters
%%% VV is the effective number of parameters in WAIC. This is not equal to the real log canonical threshold.
%%% eff_num is the effective number of parameters defined in DIC1.
DIC1(trial)=TE(trial)+eff_num/NNN; %%% Training error + effective number of parameters / training sample number
WAIC(trial)=TE(trial)+VV/NNN; %%% Training error + functional variance / training sample number
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% DIC2 (Gelman et al.): effective number of parameters is twice the
%%% posterior variance of the total log likelihood sum_i log p(X_i|w).
loglik=zeros(1,SIZEM);
for s=1:SIZEM
loglik(s)=sum(log(pmodel2(XX1,XX2,XX3,waa(s),wb1(s),wb2(s),wb3(s))));
end
m1=mean(loglik); %%% posterior mean of the total log likelihood
m2=mean(loglik.*loglik); %%% posterior second moment
DIC2(trial)=TE(trial)+2*(m2-m1*m1)/NNN; %%% Training error + effective number / training sample number
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%% Calculation of Generalization Error %%%%%%%%%%%%%%%%%%%%
%%% Generalization error on TTT fresh test points:
%%% KL(q||p) estimated by the mean of log(q/p)+p/q-1, whose integrand is
%%% nonnegative, which reduces the Monte Carlo variance of the estimate.
ppred=zeros(1,TTT);
for j=1:TTT
ppred(j)=mean(pmodel1(XT1(1,j),XT2(1,j),XT3(1,j),waa,wb1,wb2,wb3));
end
qdens=pmodel2(XT1,XT2,XT3,AA,B1,B2,B3); %%% true density at the test points
GE(trial)=mean(log(qdens./ppred)+ppred./qdens-1);
end %%% closes the loop over independent trials (loop header lies above this excerpt)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Experiments END %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Display Results %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% 2x2 summary figure; the first three panels compare per-trial errors.
subplot(2,2,1); %%% generalization vs training error
plot(TRI,GE,'r-',TRI,TE,'b-');
xlabel('Independent Trials');
title('Red=G Error, Blue=T Error.');
subplot(2,2,2); %%% generalization error vs DIC1 vs WAIC
plot(TRI,GE,'r-',TRI,DIC1,'g-',TRI,WAIC,'b-');
xlabel('Independent Trials');
title('Red=G Error, Green=DIC1, Blue=WAIC.');
subplot(2,2,3); %%% generalization error vs DIC2 vs WAIC
plot(TRI,GE,'r-',TRI,DIC2,'g-',TRI,WAIC,'b-');
xlabel('Independent Trials');
title('Red=G Error, Green=DIC2, Blue=WAIC.');
subplot(2,2,4); %%% posterior scatter of (aa,b1) -- samples from the LAST trial only
plot(waa,wb1,'b.',avaa,avb1,'rs');
title('(a,b1): Blue:Posterior, Red:Mean ');
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Record Results %%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Record per-trial results, one trial per line, in mc2ver3.txt.
fp=fopen('mc2ver3.txt','w'); %%% File mc2ver3.txt is generated.
fprintf(fp,'GE TE DIC1 DIC2 WAIC AcceptRatio\n');
%%% fprintf consumes a matrix argument column by column, so stacking the six
%%% row vectors prints exactly one trial per output line, as a loop would.
fprintf(fp,'%f %f %f %f %f %f\n',[GE;TE;DIC1;DIC2;WAIC;ACCEPT]);
fclose(fp);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% GE : generalization error
%%% TE : training error
%%% DIC1 : original deviance information criterion
%%% DIC2 : Gelman's deviance information criterion
%%% WAIC : widely applicable information criterion
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Console summary: mean and standard deviation of each criterion
%%% over the TRIAL_N independent trials.
fprintf(' GE TE DIC1 DIC2 WAIC Accept Ratio\n');
fprintf('Average: %f %f %f %f %f %f\n',mean(GE),mean(TE),mean(DIC1),mean(DIC2),mean(WAIC),mean(ACCEPT));
fprintf('Std dev: %f %f %f %f %f %f\n',std(GE),std(TE),std(DIC1),std(DIC2),std(WAIC),std(ACCEPT));
fprintf('WAIC made file, mc2ver3.txt.\n');
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Remarks %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% (1) By singular learning theory, expectation of GE is asymptotically equal to that of WAIC
%% even if the true distribution is singular for or unrealizable by the statistical model.
%% They are equal to each other, even if the set of parameters is restricted.
%% (2) Variance of GE is equal to that of WAIC. Please compare the variances of TE and WAIC.
%% Also please compare the variances of WAIC, DIC1, and DIC2.
%% (3) There are a lot of improved DICs, however, there has been no theoretical
%% support. WAIC has the theoretical support of algebraic geometry.
%% WAIC requires neither Laplace approximation nor Fisher asymptotic theory.
%% Therefore WAIC is very different from DICs.
%% (4) If the posterior distribution is accurately approximated, then
%% GE+WAIC=2*RLCT/NNN holds, where RLCT is the real log canonical threshold.
%% In other words, GE+WAIC is a constant.
%% RLCT is an important birational invariant in algebraic geometry.
%% (5) Half of the expectation value of the functional variance is
%% the singular fluctuation (SF). SF is a new birational invariant which was
%% found in statistics. In general, RLCT is not equal to SF.
%% (6) If the true distribution is regular for and realizable by the statistical model,
%% then both RLCT and the singular fluctuation are the half of the number of parameters.
%% (7) Widely Applicable information criterion is asymptotically equivalent to
%% Bayes leave-one-out cross validation in both regular and singular cases.
%% By this equivalence, it is proved that the variance of cross validation is
%% asymptotically equal to that of GE. Without singular learning theory,
%% we can not know such equivalence and the variance of the cross validation.
%% Sumio Watanabe, ``Asymptotic Equivalence of Bayes Cross Validation and Widely
%% Applicable Information Criterion in Singular Learning Theory." Journal of Machine
%% Learning Research, 11, 3571-3594, 2010.
%% (8) To obtain the Bayes marginal likelihood in both regular and singular cases,
%% the widely applicable Bayesian information criterion (WBIC) is useful.
%% Sumio Watanabe, "A widely applicable Bayesian information criterion," Journal of
%% Machine learning Research, vol.14, pp. 867-897, 2013.
%% If you employ Bayes estimation, then the generalization error is made small even if
%% the model is redundant for the true. Thus information criterion is also small.
%% If you want to select a model with consistency, you had better use WBIC.
%% However, WBIC is not an unbiased estimator of the generalization error.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%