options pagesize=47 linesize=64; /********************************************************/ /* Computation for Cancer Risk Among Nickel Smelters */ /* We will explore a data set reflecting the health out-*/ /* comes among employees at a nickel smelting plant. */ /* Information collected on each employee was */ /* id=an id number */ /* icd=code for cause of death, if died, or 0 if alive. */ /* exp=a measure of exposure to carcinogens */ /* dob=date of birth */ /* agefemp=age at first employment */ /* aff=age at first follow-up */ /* deathage=age at death */ /* First, read in data, from Breslow and Day Vol II. */ /********************************************************/ data nickel; infile 'nickel.dat'; input id icd exp dob agefemp aff deathage; /********************************************************/ /* Create variables nasal for whether the person has */ /* nasal cancer or not, and exposure for whether they */ /* have low or high exposure. */ /********************************************************/ nasal=0; if icd=160 then nasal=1; exposure=1; if exp>=1 then exposure=2; newexp=0; if exp>0 then newexp=1; if exp>2 then newexp=2; if exp>4 then newexp=3; time=deathage-aff; run; proc freq data=nickel; table exposure*nasal; run; /********************************************************/ /* Exposed smelters appear to have higher risk than non-*/ /* exposed smelters. This might reflect changes in the */ /* smelting operation. Maybe workers are less exposed */ /* to toxins now, and so more recent hires would tend to*/ /* be unexposed. More work is required to check this. */ /* We will group data to illustrate concepts of person-*/ /* years at risk. Grouped data is also given directly */ /* by Breslow and Day Vol II Appendix IV. */ /********************************************************/ data bigset; set nickel; /* Put age at first employment into groups*/ afe=1; do ii=20,27.5,35; if agefemp>=ii then afe=afe+1; end; /* Put year of first employment into groups*/ yfe=0; do ii=1902,1910,1915,1920; if (dob+agefemp)>=ii then yfe=afe+1; end; const=1; beg=0; /*********************************************************/ /* For each time interval, beg is year interval begins. */ /* tfed:time from employment to death or loss to followup*/ /* tfef:time from employment to beginning of followup */ /*********************************************************/ do yend=20,30,40,50,200; tfed=deathage-agefemp; tfef=aff-agefemp; rskyrs=max(min(tfed,yend)-max(tfef,beg),0); case=0; if tfed>=beg and tfed0 then rate=case/rskyrs; run; proc print data=sumset noobs; run; /*******************************************************/ /* Now we'll calculate the raw rate, both for the whole*/ /* group and separately for exposed and nonexposed. */ /*******************************************************/ proc sort data=bigset; by beg exposure ; run; proc means data=bigset sum noprint; by beg exposure ; var rskyrs case const; output out=sumbyex sum=; run; data sumbyex; set sumbyex; if rskyrs>0 then rate=case/rskyrs; run; proc means data=sumbyex sum noprint; by beg; var case rskyrs const; output out=standset sum=; run; data unstand; set standset sumbyex; run; proc sort data=unstand; by exposure; run; proc means data=unstand sum noprint; by exposure; output out=ussum sum=; run; data ussum; set ussum; rate=case/rskyrs; avgpyrs=rskyrs/const; run; title 'Unstandardized rate'; proc print data=ussum noobs; var exposure case rskyrs rate const avgpyrs; run; proc sort data=unstand ; by beg exposure; run; /**********************************************************/ /* Now we'll compare to rates in the general population. */ /* The nickel smelters will end up having much more nasal */ /* cancer than the general population; furthermore, we've */ /* used duration of exposure as our time measurement. The*/ /* general population isn't exposed, and so the closest */ /* thing to duration of exposure available is age. Nasal */ /* cancer rates in the genral population increase with */ /* age, and so the general population cancer rate */ /* corresponding to an employee with x years of service is*/ /* no higher than the general population rate for age */ /* x+the maximal age at first employment, or 30. */ /**********************************************************/ data compare; set unstand; if beg=0 then srate=.000012444; if beg=20 then srate=.000020444; if beg>20 then srate=.000042556; if beg=0 then srskyrs=100; if beg=20 then srskyrs=100; if beg=30 then srskyrs=70; if beg=40 then srskyrs=50; if beg=50 then srskyrs=30; cmftop=srskyrs*case/rskyrs; expect=srate*rskyrs; expect1=srate*srskyrs; run; proc sort data=compare ; by beg descending exposure; run; data relrisk; set compare; retain r2; if exposure=1 then r1=rate; else do r2=rate; delete; enddo; rr=r2/r1; run; proc sort data=compare; by descending exposure; run; proc means data=compare noprint sum; by exposure notsorted; output out=smr sum=; var case expect cmftop srskyrs expect1; run; data smr; set smr; drop _type_ _freq_; smr=case/expect; cmf=cmftop/expect1; direct=cmftop/srskyrs; smru=smr*exp(1.96/case); smrl=smr/exp(1.96/case); run; proc print data=smr noobs; title 'SMR computation'; run; proc print data=relrisk noobs; title 'Data relative risk'; var beg r1 r2 rr; run; /********************************************************/ /* Material for Lecture 4. Input data set has 3 lines, */ /* corresponding to low, high, and joint exposure */ /* groups. The only variables we use are expect and */ /* case. Last line is ignored, and we recode case as o0*/ /* and o1, and expect as e0 and e1. First calculate */ /* standard errors for the CI and the test, as sepi and */ /* sepi0 respectively. Pick a z value corresponding to */ /* the size of the test and 1-confidence level; the size*/ /* is .05, and the z value is 1.96 here. */ data smrci; set smr; retain o1 e0 e1 o0; m=-1; if exposure=2 then do; o1=case; e1=expect; delete; end; if exposure=1 then do; o0=case; e0=expect; m=1; end; if exposure=. then do; pi0=e1/(e0+e1); sepi0=sqrt(pi0*(1-pi0)/(o0+o1)); test=abs(o1/(o0+o1)-pi0)/sepi0; pv=2*(1-probnorm(test)); epvl=probbnml(pi0,(o0+o1),o1); epvu=probbnml(1-pi0,(o0+o1),o0); end; sepi=sqrt(o1*o0/(o0+o1)**3); zv=1.96; cipi=o1/(o0+o1)+m*zv*sepi; cirelr=e0*cipi/(e1*(1-cipi)); run; title 'Tests and CIs for pi and smr ratio'; proc print data=smrci noobs; var pi0 sepi0 test pv epvl epvu sepi cipi cirelr; run; /******************************************************/ /* Here are tests for ordered exposures, for lecture 1*/ /* nickel is the data set as before, except that I've */ /* created a new variable newexp that takes values 0, */ /* 1,2,3 indicating increasing exposure. Do chi- */ /* square tests. First calculate sums of times and */ /* nasal cancer cases by exposure group, and put into */ /* the data set ordered. We will perform three tests.*/ /* One will test whether the rates in the various exp-*/ /* posure groups are the same as the population rate, */ /* vs the general alternative, the second tests */ /* whether the rates are the same as each other (but */ /* not necessarily any a priori value) vs. the general*/ /* alternative, adn the third tests the same null vs */ /* the ordered alternative. To do the first test, I */ /* have to specify an a priori rate; this is called */ /* srate. To do the second and third, I need the */ /* total time and total number of cases; these are put*/ /* in the data set total. */ /******************************************************/ proc sort data=nickel; by newexp; proc means data=nickel sum noprint; by newexp; output out=ordered sum=; var nasal time; run; proc means data=nickel sum noprint; output out=total sum=; var nasal time; run; data total; set total; stime=time;snasal=nasal; drop time nasal; run; /******************************************************/ /* The following data set takes the information on */ /* numbers of cases and total times, totaled by exp. */ /* group, and hooks in the total number of cases and */ /* time over all. Then I calculate the two chi-sq */ /* tests with the two different expectations, and the */ /* test for the ordered alternative. */ /******************************************************/ data ordered; merge ordered total; retain ttime tnasal; if _n_=1 then do; ttime=stime; tnasal=snasal; end; drop stime snasal; srate=.000042556; drop _freq_ _type_; expect=time*srate; expect1=time*tnasal/ttime; test1=(nasal-expect)**2/expect; test2=(nasal-expect1)**2/expect1; test3=newexp*(nasal-expect1); v1=newexp**2*expect1; v2=newexp*expect1; run; run; proc print data=ordered noobs; run; proc means data=ordered sum noprint; output out=chisq sum=; run; data chisq; set chisq; test3=test3/sqrt(v1-v2**2/nasal); pv1=1-CDF('chisq',test1,4); pv2=1-CDF('chisq',test2,3); pv3=2*(1-CDF('normal',test3)); run; proc print data=chisq noobs; var test1 test2 test3 pv1 pv2 pv3; run;