/* Compare three types of correlation measures for paired data (X_i,Y_i) (like height and weight) (Pearson rho, Spearman R, and Kendall tau) Also, jackknife and bootstrap confidence intervals for all three */ #include #include #include /* Prototypes for functions in statlib.c : Compile by e.g. gcc -o ProgName Progname.c statlib.c with ProgName.c, statlib.c, and statlib.h in the default directory. */ #include "statlib.h" #define DTOL 1e-12 /* Minimum floating-point error */ #define MAXN 50 /* Allow for up to this many pairs */ /* The `const' says that the program should not change these values */ /* without a compiler crash or at least a very strong warning. */ /* Text for this example */ const char *title_data= "Prepacked Lubricants and Survival Times for Engines\n" "(Source of data unknown.)"; /* Number of pairs (subjects) */ const int nn=20; const char *xname="Lubricant", *yname="Survival"; const double xx[] = { 29, 21, 30, 45, 48, 92, 74, 61, 99, 24, 84, 37, 94, 56, 67, 79, 70, 93, 76, 72 }; const double yy[] = { 23, 36, 67, 104, 147, 164, 247, 304, 355, 408, 429, 489, 504, 925, 980, 1254, 2961, 5351, 9915, 11301 }; /* Default starting random-number seed and the number of */ /* permutations in permutation procedures */ unsigned long wseed=123456; int nsims=100000; /* Number of bootstrap replications in bootstrap procedures */ #define NBOOT 10000 int nboot=NBOOT; /* Prototypes to calculate the three types of correlation coefficients */ void do_pearson(int nn, const double *xx, const double *yy); /* rx[] and ry[] are ranks within each sample */ void do_spearman(int nn, const double *rx, const double *ry); /* Kendall correlation coefficient and P-values */ /* Returns the large-sample Z statistic and the permutation P-value CI */ void do_kendall(int nn, const double *xx, const double *yy, double *zkendptr, double *p1ptr, double *p2ptr, double *p3ptr); /* Pearson correlation coefficient revisited */ void do_pearson_revisited(int nn, const double *xx, const double *logyy); /* Jackknife and Bootstrap for Pearson correlation coefficient */ void do_jackknife_pearson(int nn, const double *xx, const double *yy); void do_bootstrap_pearson(int nn, const double *xx, const double *yy); /* Bootstrap for Kendall tau */ void do_bootstrap_kendall(int nn, const double *xx, const double *yy, double zkendstat, double p1kend, double p2kend, double p3kend); void show_data (int nn, const double *xx, const double *yy, const char *xname, const char *yname); void show_data_and_midranks (int nn, const double *xx, const double *yy, const double *rx, const double *ry, const char *xname, const char *yname); void show_logdata (int nn, const double *xx, const double *yy, const char *xname, const char *yname); int main(int argc, char **argv) { int i; /* Midranks and tiegroups for each sample */ double rx[MAXN], ry[MAXN], logyy[MAXN]; double zkendstat, p1kend,p2kend,p3kend; /* Helpful for segmentation faults and some other fatal errors */ /* This function is in statlib.c */ set_newsignals(); printf ( "Three different types of correlation coefficients:\n" " Pearson rho, Spearman R, and Kendall tau\n" "Also, jackknife and bootstrap confidence intervals for all three.\n"); /* Initialize random-number seed */ /* Enter `(ProgramName) ' to set starting RN seed= */ if (argc>=2) wseed=atol(argv[1]); wseed=setrand(wseed); /* If wseed=0, wseed=value from system clock */ printf ("\nStarting random-number seed: %lu\n",wseed); printf ("(Enter a number on the command line " "to specify a different seed.)\n"); /* Describe and display the data */ printf ("\n%s\n\n", title_data); show_data(nn,xx,yy, xname,yname); /* Make sure that nn is big enough */ if (nn<4) { printf ("\nN=%d is too small!\n",nn); exit(1); } /* Calculate and analyze Pearson correlation coefficient: */ do_pearson(nn,xx,yy); /* Find midranks and tiegroup sizes for both xx[] and yy[] */ /* makeranks() is in statlib.c */ /* Tie-group information not needed */ makeranks(rx, xx,nn, NULL,NULL); makeranks(ry, yy,nn, NULL,NULL); printf ("\nData and within-sample midranks for both samples:\n\n"); show_data_and_midranks(nn, xx,yy, rx,ry, xname,yname); /* Spearman correlation coefficient: */ do_spearman(nn,rx,ry); /* Kendall correlation coefficient: */ /* Returns the large-sample Kendall Z statistic to avoid recomputing it */ /* as well as the confidence interval for the permutation-test P-value */ do_kendall(nn,xx,yy, &zkendstat, &p1kend, &p2kend, &p3kend); /* Pearson correlation coefficient, revisited */ printf ("\nREVISITING the Pearson correlation coefficient:\n"); printf ("For Xs and log(Y)s:\n"); /* logyy[] is log(Y) */ for (i=0; i=fabs(obs_rho)-DTOL) nbig++; } /* Display permutation test P-value and 95% CI */ pp=(double)nbig/nsims; pwid=1.960*sqrt(pp*(1-pp)/nsims); printf (" P=%d/%s=%g 95%% CI for true P: (%6.4f, %6.4f, %6.4f)\n", nbig,commnum(nsims), pp, pp-pwid,pp,pp+pwid); } void do_pearson(int nn, const double *xx, const double *yy) { double rho,tt; printf ("\nPearson correlation coefficient (n=%d):\n", nn); rho=get_pearson_corr(nn,xx,yy); tt=rho*sqrt((nn-2)/(1-rho*rho)); printf (" Rho=%g T=%g Classical P=%7.5f (Student's t, df=%d)\n", rho, tt, pvttest(nn-2,tt), nn-2); /* Do `nsims' permutations */ printf ("Permutation test based on %s permutations:\n", commnum(nsims)); printf (" Each permutation fixes the Xs, permutes the Ys\n"); shuffle_pearson_display(rho, nn,xx,yy); } void do_spearman(int nn, const double *rx, const double *ry) { double rr,zz; /* The Spearman correlation coefficient (with tie corrections) */ /* is the same as the Pearson correlation for within-sample ranks */ /* The large-sample approximation is different, however */ printf ("\nSpearman correlation coefficient (n=%d):\n", nn); rr=get_pearson_corr(nn,rx,ry); zz=rr*sqrt(nn-1); printf (" R=%g Z=%g Large-sample P=%7.5f (two-sided normal)\n", rr, zz, normpval(zz)); /* Permutation test based on permutations of the Y-values, */ /* keeping the X-values fixed: */ printf ("Doing %s permutations fixing RankXs, permuting RankYs:\n", commnum(nsims)); /* Everything after this point is the same with the within-sample */ /* ranks/midranks replacing the original sample */ shuffle_pearson_display(rr, nn,rx,ry); } /* The Pearson correlation coefficient revisited */ void do_pearson_revisited(int nn, const double *xx, const double *logyy) { double rho,tt; rho=get_pearson_corr(nn,xx,logyy); tt=rho*sqrt((nn-2)/(1-rho*rho)); printf (" Rho=%g T=%g Classical P=%7.5f (Student's t, df=%d)\n", rho, tt, pvttest(nn-2,tt), nn-2); /* Do `nsims' permutations */ printf (" Doing %s permutations fixing Xs, permuting log(Y)s:\n", commnum(nsims)); shuffle_pearson_display(rho, nn,xx,logyy); } int get_kendall_kk(int nn, const double *xx, const double *yy, int *ntieptr) { int i,j, kk,nties; kk=nties=0; for (i=0; i 0.0) kk++; else kk--; }} if (ntieptr!=NULL) *ntieptr=nties; return kk; } /* The next function is probably the easiest way to handle tie corrections */ /* for the large-sample approximation of the Kendall K statistic */ /* Note that it fills three integers in an integer list with tie sums */ void get_kendall_tiesums (int *kt, int ntg, const int *tielist) { int i; kt[0]=kt[1]=kt[2]=0; /* Initialize at zero */ for (i=0; i=abs(kk)) nbig++; } /* Display permutation test P-value and 95% CI */ /* Display permutation test P-value and 95% CI */ pp=(double)nbig/nsims; pwid=1.960*sqrt(pp*(1-pp)/nsims); printf (" P=%d/%s=%g 95%% CI for true P: (%6.4f, %6.4f, %6.4f)\n", nbig,commnum(nsims), pp, pp-pwid,pp,pp+pwid); if (zkendptr!=NULL) *zkendptr=zz; if (p1ptr!=NULL) *p1ptr=pp-pwid; if (p2ptr!=NULL) *p2ptr=pp; if (p3ptr!=NULL) *p3ptr=pp+pwid; } void do_jackknife_pearson(int nn, const double *xx, const double *yy) { int i,j,r; /* Jackknife delete-1 and pseudo-values */ double rho, del1value[MAXN], psvalue[MAXN]; double jsum, jmean,jvar,jstd,tt, tcrit,jzwid,jtwid; rho=get_pearson_corr(nn,xx,yy); printf ("Jackknife of Pearson rho (n=%d, original rho=%6.4f)\n", nn,rho); /* Construct the jackknife delete-1 and pseudo-values */ for (i=0; i0 && (i%5)==0) printf("\n"); printf (" %7.4f(%d)%s",del1value[i],i+1,(i<10)?" ":""); } printf("\n"); printf ("Jackknife pseudo-values for rho:\n"); for (i=0; i0 && (i%5)==0) printf("\n"); printf (" %7.4f(%d)%s",psvalue[i],i+1,(i<10)?" ":""); } printf("\n"); /* Find mean and standard deviation of pseudo-values */ for (jsum=0.0, i=0; i1.0) ? 1.0 : pp; } void show_bootstrap_output (const char *varname, int nboot, double *zboot, double val) { int nhits,ns, blo,bhi; double bootmedian, zlo, pp,pwid; /* First, sort zboot[]: qsort() is a standard C function that implements `quick-sort' that sorts a list of size n in O(n log n) steps One of qsort's arguments is a pointer to a `comparison function' that qsort uses to do the sorting `dubsort' in statlib.c is for sorting doubles in increasing order */ qsort(zboot,nboot, sizeof(double),dubsort); /* Find the median */ if (nboot%2) bootmedian=zboot[nboot/2]; else bootmedian = (zboot[nboot/2] + zboot[(nboot/2)-1])/2; /* The 95% percentile bootstrap CI is the middle 95% of the bootstrap */ /* sample, based on order statistics */ /* We follow the suggestion in the text (p388-389), following */ /* Efron and Tibshirani (1993), about the best way to calculate these */ zlo=0.250*nboot; if (zlo-floor(zlo)<0.001) /* If zlo is an integer */ blo=(int)floor(zlo)-1; /* The zlo-th array entry */ else blo=(int)floor(0.025*(nboot+1))-1; bhi=nboot-1-blo; /* Number k from the top if blo is #k from the bottom */ printf (" Bootstrap median and 95%% CI for %s: (%6.4f, %6.4f, %6.4f)\n", varname, zboot[blo], bootmedian, zboot[bhi]); /* Bootstrap test of var!=0: */ /* Count bootstrap replications where newval has the opposite sign as val */ for (nhits=ns=0; ns