Example R programs and commands 22. Binomial and hypergeometric distributions # All lines preceded by the "#" character are my comments. # All other left-justified lines are my input. # All other indented lines are the R program output. # BINOMIAL probability density # # Example: probability of x=5 Heads in N=7 tosses of a fair coin. x<-5 # event: number of Heads N<-7 # action: number of tosses prob<-0.5 # situation: probability of "Head" in one toss for a fair coin dbinom(x, N, prob) # probability of exactly x Heads in N tosses # Complete list of probabilities for x=0,1,2,...,N: dbinom(0:N, N, prob) # probabilities of x Heads, all x=0,1,...,N # Cumulative binomial density function: pbinom(x, N, prob) # probability of x or fewer Heads in N tosses pbinom(x, N, prob, lower.tail=FALSE) # probability of more than x Heads 1-pbinom(x, N, prob) # ...same result as the previous line. # Binomial test: p-value is probability of the result or worse binom.test(x,N,prob) # direct calculation, noting that x>N*prob: above <- x:N; below <- 0:(N-x); worse <- c(above,below); pval <- sum(dbinom(worse,N,prob)); pval # CHI-SQUARED test for binomial density given observed frequencies. # # CASE 1: test with a known "prob" # N <- 4 # number of trials, == max number of successes fobs <- c(30,51,33,10,2) # observed frequencies at x=0,1,...,N prob <- 0.30 # H0 expected success probability phat <- dbinom(0:N, N, prob) # H0 expected probabilities (NOT frequencies) chisq.test(fobs,p=phat) # uses DF=N, appropriate for known prob. # Result: p-value = 0.9123, DO NOT REJECT H0 phat <- dbinom(0:N, N, 0.40) # H0 expected probabilities for "wrong" prob chisq.test(fobs,p=phat) # uses DF=N, appropriate for known prob. # Result: p-value = 0.0004436, REJECT H0 # Alternative method, for comparison: # Concatenate to a single long trial and use binom.test() trials <- sum(N*fobs); trials # total number of trials success <- sum( (0:N)*fobs ); success # total successes binom.test(success,trials,prob) # p=0.7337 ==> DO NOT REJECT H0 # # ...and with the wrong success probability 0.40: binom.test(success,trials,0.40) # p=1.839e-05 ==> REJECT H0 # # CASE 2: test with unknown "prob" # N <- 4 # number of trials, == max number of successes fobs <- c(30,51,31,11,2) # observed frequencies at x=0,1,...,N tot <- sum(fobs); # total number of observations pbar <- sum( (0:N)*fobs )/tot # average number of successes per trial prob <- pbar/N; prob # compute and show the H0 estimated Prob(success) phat <- dbinom(0:N, N, prob) # H0 expected probabilities (NOT frequencies) chi2<-chisq.test(fobs,p=phat)$statistic # extract the Chi-squared value pchisq(chi2,N-1,lower.tail=FALSE) # p-value with DF=N-1 for computed "prob" # Repeat with a perfect uniform frequency table, not binomial: N <- 4 # number of trials, == max number of successes fobs <- rep(12,N+1) # observed frequencies at x=0,1,...,N tot <- sum(fobs); # total number of observations pbar <- sum( (0:N)*fobs )/tot # average number of successes per trial prob <- pbar/N; prob # compute and show the H0 estimated Prob(success) phat <- dbinom(0:N, N, prob) # H0 expected probabilities (NOT frequencies) chi2<-chisq.test(fobs,p=phat)$statistic # extract the Chi-squared value pchisq(chi2,N-1,lower.tail=FALSE) # p-value with DF=N-1 for computed "prob" # HYPERGEOMETRIC probability density # # Example: probability of drawing x=5 white balls when k=7 balls are drawn # from an urn containing m=10 white and n=15 black balls. x<-5 # event: get x white balls k<-7 # action: draw k balls without replacement m<-10 # situation: m white balls in the urn n<-15 # situation: n black balls in the urn dhyper(x, m,n, k) # probability of exactly x white balls among the k drawn # Complete list of probabilities for x=0,1,2,...,k: dhyper(0:k, m,n, k) # probabilities of x white, all x=0,1,...,k # Cumulative hypergeometric density function: phyper(x, m,n, k) # probability of x or fewer white balls among the k drawn phyper(x, m,n, k, lower.tail=FALSE) # probability of more than x white balls 1-phyper(x, m,n, k) # ...same result as the previous line. ##### Application: compute probabilities of 2x2 contingency tables: # # Frequencies fij: # # f11 f12 | r1 row sums # f21 f22 | r2 # ---------------- # c1 c2 | n total # column sums # # Necessary condition: n = r1+r2 = c1+c2 = f11+f12+f21+f22 # # Then the probability of the table, assuming H0: rows and columns are # independent, is given by using # # dhyper(f11, r1,r2, c1) # namely x=f11, m=r1, n=r2, k=c1 # # or equivalently # # dhyper(f11, c1,c2, r1) # namely x=f11, m=c1, n=c2, k=r1 # # For example, with row sums 2,4 and column sums 3,3, there are three # possible 2x2 contingency tables indexed by f11=0,1,2, and their # probabilities are computed as follows: r <- c(2,4); c <- c(3,3); f11 <- c(0,1,2); dhyper(f11, r[1],r[2],c[1]) # computed one way dhyper(f11, c[1],c[2],r[1]) # computed the other way