gappcalg<-function(Xmat,kvec,B) { # computes the gap statistic # for PAM clustering # required input: # Xmat = data matrix # kvec = number of clusters # B = number of null data set generated # pcsim<-function(Xmat) { Xmm<-apply(Xmat,2,mean) for (k in (1:dim(Xmat)[2])) { Xmat[,k]<-Xmat[,k]-Xmm[k] } ss<-svd(Xmat) Xs<-Xmat%*%ss$v Xnew<-apply(Xs,2,simgap) Xt<-Xnew%*%t(ss$v) for (k in (1:dim(Xmat)[2])) { Xt[,k]<-Xt[,k]+Xmm[k] } return(Xt) } simgap<-function(Xvec) { ma<-max(Xvec) mi<-min(Xvec) Xout<-runif(length(Xvec),min=mi,max=ma) return(Xout) } # Within sum of squares for clusterdata Wk0<-rep(0,length(kvec)) WkB<-matrix(0,length(kvec),B) for (bb in (1:B)) { Xnew<-pcsim(Xmat) for (kl in (1:length(kvec))) { if (bb==1) { pp<-pam(Xmat,kvec[kl]) pp2<-pam(Xnew,kvec[kl]) if (kvec[kl]>1) { for (zz in (1:kvec[kl])) { Xuse<-Xmat[pp$cluster==zz,] Wk0[kl]<-Wk0[kl]+sum(diag(var(Xuse)))*(length(pp$cluster[pp$cluster==zz])-1)/(dim(Xmat)[1]-kvec[kl]) Xuse2<-Xnew[pp2$cluster==zz,] WkB[kl,bb]<-WkB[kl,bb]+sum(diag(var(Xuse2)))*(length(pp2$cluster[pp2$cluster==zz])-1)/(dim(Xmat)[1]-kvec[kl]) } } if (kvec[kl]==1) { Wk0[kl]<-sum(diag(var(Xmat))) WkB[kl,bb]<-sum(diag(var(Xnew))) } } if (bb>1) { pp2<-pam(Xnew,kvec[kl]) if (kvec[kl]>1) { for (zz in (1:kvec[kl])) { Xuse2<-Xnew[pp2$cluster==zz,] WkB[kl,bb]<-WkB[kl,bb]+sum(diag(var(Xuse2)))*length(pp2$cluster[pp2$cluster==zz])/(dim(Xmat)[1]-kvec[kl]) } } if (kvec[kl]==1) { WkB[kl,bb]<-sum(diag(var(Xnew))) } } } } Sgap<-rep(0,length(kvec)) Sdgap<-Sgap for (kk in (1:length(kvec))) { Sgap[kk]<-mean(log(WkB[kk,]))-log(Wk0[kk]) Sdgap[kk]<-sqrt(1+1/B)*sqrt(var(log(WkB[kk,])))*sqrt((B-1)/B) } diffk<-Sgap[1:(length(kvec)-1)]-(Sgap[2:length(kvec)]-Sdgap[2:length(kvec)]) if (length(diffk[diffk>0])==0) { ksel<-kvec[length(kvec)] } if (length(diffk[diffk>0])>0) { Sgapstar<-diffk[diffk>0] ksel<-kvec[diffk>0] ksel<-ksel[1] } plot(kvec,Sgap) lines(kvec,Sgap+Sdgap,lty=2) lines(kvec,Sgap-Sdgap,lty=2) return(Sgap,Sdgap,diffk,ksel) }