# October 22, 2006; DM added -s option; fixed -p 1 bug $commandString = join " ",@ARGV; if ($commandString =~ /-h/) { print "Here is how to use this program:\n\n"; print "perl logisticBugs.pl [options] training_data_file\n\n"; print "where the options are:\n\n"; print "-N , Number of MCMC iterations (default is 10000)\n"; print "-B , Number of MCMC iterations discarded for burn-in (default is 1000)\n"; print "-p <[1,2]>, Type of prior, 1-Laplace 2-Gaussian (default is 2)\n"; print "-s <[0,1]>, Standardize the predictors (default is 0)\n"; print "-v , Prior variance for each coefficient (default is 1)\n"; print "-V , Prior variance for the intercept (default is 1)\n\n\n"; print "Here's an example:\n"; print "perl logisticBugs.pl -N 10000 -B 1000 -p 2 -v 1 -V 1 c:\\cdc\\pima.bbr\n\n"; print "NOTE: for some reason, this program has be in the same directory as,\n"; print "or a subdirectory of, WinBUGS.\n\n"; die; } # perl logisticBugs.pl -N 10000 -B 1000 -p 2 -v 1 -V 1 c:\cdc\pima.bbr # get the filenames $commandString =~ /([^ ]+) *$/; $INFILE = $1; $NUMITER = 10000; if ($commandString =~ /-N (\d+)/) { if ($1 > 0) { $NUMITER = $1; } } $BURNIN = 1000; if ($commandString =~ /-B (\d+)/) { if ($1 > 0) { $BURNIN = $1; } } $PRIOR = 2; if ($commandString =~ /-p (\d)/) { if ($1 == 1) { $PRIOR = 1; } } $VAR = 1.0; if ($commandString =~ /-v ([\d\.]+)/) { if ($1 > 0) { $VAR = $1; } } $VAR0 = 1.0; if ($commandString =~ /-V ([\d\.]+)/) { if ($1 > 0) { $VAR0 = $1; } } $STANDARD = 0; if ($commandString =~ /-s (\d)/) { if ($1 == 1) { $STANDARD = 1; } } # first find out how many predictors there are $d = 0; $n = 0; open(RAW,$INFILE); while () { chomp; $n++; @temp = split /\s/; shift @temp; foreach $temp (@temp) { @temp1 = split ":",$temp; if ($temp1[0] > $d) {$d = $temp1[0];} } } close(RAW); # now read the data into memory... $i = -1; open(RAW,$INFILE); while () { chomp; $i++; @temp = split /\s/; if ($temp[0] > 0) {$y[$i]=1;} else {$y[$i]=0;} shift @temp; undef %tempHash; foreach $temp (@temp) { @temp1 = split ":",$temp; $tempHash{$temp1[0]}=$temp1[1]; } for ($j = 1; $j <= $d; $j++) { if (defined $tempHash{$j}) { $x[$j]->[$i] = $tempHash{$j}; } else { $x[$j]->[$i] = 0; } } } close(RAW); if ($STANDARD == 1) { for ($i=1; $i <= $d; $i++) { $sum = 0; $sumSqr = 0; for ($j=0; $j < $n; $j++) { $sum += $x[$i][$j]; $sumSqr += $x[$i][$j]*$x[$i][$j]; } $temp = (1/($n-1)) * ($sumSqr - ($sum*$sum/$n)); if ($temp > 0.0001) { $stDev[$i] = sqrt($temp); } else { $stDev[$i]=0; } $mean[$i] = $sum/$n; if ($stDev[$i] > 0) { for ($j=0; $j < $n; $j++) { $x[$i][$j] = ($x[$i][$j]-$mean[$i])/$stDev[$i]; } } } } # now write out the data for BUGS open(DATA,">logistic_dat.txt"); print DATA "list(\n"; print DATA "n=$n,var=$VAR,var0=$VAR0,\n"; print DATA "y=c(",join(",",@y),"),\n"; for ($i=1; $i <= $d; $i++) { $temp=$x[$i]; @temp = @$temp; print DATA "x$i=c("; for ($j=0; $j < @temp; $j++) { printf DATA "%.6f",$temp[$j]; if ($j < (@temp-1)) {print DATA ",";} } print DATA "),\n"; } print DATA ")\n"; close(DATA); # write the code open(CODE,">logistic_mod.txt"); print CODE "model; \n"; print CODE "{ \n"; print CODE " for( i in 1 : n ) { \n"; print CODE " \n"; print CODE " y[i] ~ dbern(p[i]) \n"; if ($n <= 25) { print CODE " lik[i] <- exp(b0 "; for ($i=1; $i <= $d; $i++) { print CODE "+ (b$i * x$i\[i\])"; } print CODE ") \n"; } else { $j=0; while (($j*25) < $d) { print CODE " lik$j\[i\] <- "; for ($i=($j*25) + 1; $i <= min($d,($j*25) + 25); $i++) { print CODE "(b$i * x$i\[i\])"; if ($i < min($d,($j*25) + 25)) { print CODE " + "; } } print CODE "\n"; $j++; } print CODE " lik[i] <- exp(b0 "; for ($i=0; $i < $j; $i++) { print CODE "+ lik$i\[i\]"} print CODE ") \n"; } print CODE " p[i] <- lik[i] / (1 + lik[i]) \n"; print CODE "} \n"; if ($PRIOR == 2) { print CODE "b0 ~ dnorm(0.0,prec0)\n"; for ($i=1; $i <= $d; $i++) { print CODE " b$i ~ dnorm(0.0,prec) \n"; } print CODE "prec <- 1 / var\n"; print CODE "prec0 <- 1 / var0\n"; } else { # note: for the ddexp the variance is actually 2 * sqr(var) print CODE "b0 ~ ddexp(0.0,prec0)\n"; for ($i=1; $i <= $d; $i++) { print CODE " b$i ~ ddexp(0.0,prec) \n"; } print CODE "prec <- sqrt(2 / var)\n"; print CODE "prec0 <- sqrt(2 / var0)\n"; } print CODE "} \n"; close(CODE); # write the inits open(INITS,">logistic_in.txt"); print INITS "list("; for ($i=0; $i<=$d; $i++) { print INITS "b$i=0,"; } print INITS ");"; close(INITS); # write the script open(SCRIPT,">logistic_script.txt"); print SCRIPT "display(\'log\') \n"; print SCRIPT "check(\'cdc\\logistic_mod.txt\') \n"; print SCRIPT "data(\'cdc\\logistic_dat.txt\') \n"; print SCRIPT "compile(1) \n"; print SCRIPT "inits(1, \'cdc\\logistic_in.txt\') \n"; print SCRIPT "gen.inits() \n"; print SCRIPT "over.relax() \n"; print SCRIPT "update($BURNIN) \n"; for ($i=0; $i<=$d;$i++) { print SCRIPT "set(b$i) \n"; } print SCRIPT "update($NUMITER) \n"; print SCRIPT "stats(*) \n"; for ($i=0; $i<=$d; $i++) { print SCRIPT "coda(b$i,\'cdc\\b$i\') \n"; } print SCRIPT "history(*) \n"; print SCRIPT "save(\'cdc\\logistic_Log.txt\') \n"; print SCRIPT "save(\'cdc\\logistic_Log\') \n"; print SCRIPT "quit() \n"; close(SCRIPT); # now run BUGS system "c:\\winbugs14\\winbugs14.exe /PAR cdc\\logistic_script.txt"; # now process the output open(BUGS,"logistic_Log.txt"); # first get pass the header crap do { $temp = ; } until ($temp =~ /MC error/); # then pick up the coefficients's for ($i = 0; $i <= $d; $i++) { $temp = ; @temp = split "\t", $temp; $temp[1] =~ /b(.*)/; $index = $1; $b[$index] = $temp[2]; $bsd[$index] = $temp[3]; } close(BUGS); for ($i = 0; $i <= $d; $i++) { $temp = "b$i"."1.txt"; open(CODA,$temp); @tempBeta = (); while () { chomp; @temp = split "\t"; push @tempBeta,$temp[1]; } close(CODA); $bmode[$i] = &mode(\@tempBeta) } for ($i=0; $i<=$d; $i++) { printf "%.0f mean=%.2f mode=%.2f sd=%.2f\n",$i,$b[$i],$bmode[$i],$bsd[$i]; } if ($STANDARD == 1) { print "original scale\n"; for ($i=1; $i<=$d; $i++) { if ($stDev[$i] > 0) { printf "%.0f mean=%.2f mode=%.2f sd=%.2f\n",$i,($b[$i]/$stDev[$i]),($bmode[$i]/$stDev[$i]),($bsd[$i]/$stDev[$i]); } else { printf "%.0f mean=%.2f mode=%.2f sd=%.2f\n",$i,$b[$i],$bmode[$i],$bsd[$i]; } } } sub min { my ($a,$b); $a = shift; $b = shift; if ($a < $b) { return $a; } else { return $b; } } sub mode { # iterated half-range mode # http://interstat.stat.vt.edu/interstat/articles/2001/articles/N01001.pdf my (@x,$x, $length, $j); $x = shift; @x = @$x; @x = sort {$a <=> $b} @x; do { $length = @x; $halfLength = int($length/2); $minLength = 100000000; $minIndex = -1; for ($j=0; $j < $halfLength; $j++) { if (($x[$j+$halfLength]-$x[$j]) < $minLength) { $minLength = ($x[$j+$halfLength]-$x[$j]); $minIndex = $j; } } @x = @x[$minIndex..($minIndex+$halfLength)]; # print $x[int(@x/2)],"\n"; } until (@x < 5); return $x[int(@x/2)]; }