gapalg<-function(Xmat,kvec,B=10) { # computes the gap statistic # for PAM clustering # can substitute another clustering algorithm here # # don't forget library(cluster) before run # # required input: # Xmat = data matrix (clusters rows of Xmat) # kvec = vector of number of clusters to try # B = number of null data set generated (default=10) # # Within sum of squares for clusterdata # # function needed simgap<-function(Xvec) { ma<-max(Xvec) mi<-min(Xvec) Xout<-runif(length(Xvec),min=mi,max=ma) return(Xout) } # Wk0<-rep(0,length(kvec)) WkB<-matrix(0,length(kvec),B) for (bb in (1:B)) { Xnew<-apply(Xmat,2,simgap) for (kl in (1:length(kvec))) { if (bb==1) { pp<-pam(Xmat,kvec[kl]) pp2<-pam(Xnew,kvec[kl]) # # here can use another algorithm that outputs cluster labels # if (kvec[kl]>1) { for (zz in (1:kvec[kl])) { Xuse<-Xmat[pp$cluster==zz,] Wk0[kl]<-Wk0[kl]+sum(diag(var(Xuse)))*(length(pp$cluster[pp$cluster==zz])-1)/(dim(Xmat)[1]-kvec[kl]) Xuse2<-Xnew[pp2$cluster==zz,] WkB[kl,bb]<-WkB[kl,bb]+sum(diag(var(Xuse2)))*(length(pp2$cluster[pp2$cluster==zz])-1)/(dim(Xmat)[1]-kvec[kl]) } } if (kvec[kl]==1) { Wk0[kl]<-sum(diag(var(Xmat))) WkB[kl,bb]<-sum(diag(var(Xnew))) } } if (bb>1) { pp2<-pam(Xnew,kvec[kl]) if (kvec[kl]>1) { for (zz in (1:kvec[kl])) { Xuse2<-Xnew[pp2$cluster==zz,] WkB[kl,bb]<-WkB[kl,bb]+sum(diag(var(Xuse2)))*length(pp2$cluster[pp2$cluster==zz])/(dim(Xmat)[1]-kvec[kl]) } } if (kvec[kl]==1) { WkB[kl,bb]<-sum(diag(var(Xnew))) } } } } Sgap<-rep(0,length(kvec)) Sdgap<-Sgap for (kk in (1:length(kvec))) { Sgap[kk]<-mean(log(WkB[kk,]))-log(Wk0[kk]) Sdgap[kk]<-sqrt(1+1/B)*sqrt(var(log(WkB[kk,])))*sqrt((B-1)/B) } diffk<-Sgap[1:(length(kvec)-1)]-(Sgap[2:length(kvec)]-Sdgap[2:length(kvec)]) if (length(diffk[diffk>0])==0) { ksel<-kvec[length(kvec)] } if (length(diffk[diffk>0])>0) { Sgapstar<-diffk[diffk>0] ksel<-kvec[diffk>0] ksel<-ksel[1] } plot(kvec,Sgap) lines(kvec,Sgap+Sdgap,lty=2) lines(kvec,Sgap-Sdgap,lty=2) return(Sgap,Sdgap,diffk,ksel) }