Example R programs and commands Visualize the Dirichlet pdf on p1+p2+p3=1 # All lines preceded by the "#" character are my comments. # All other left-justified lines are my input. # All other indented lines are the R program output. ###### VISUALIZE DIRICHLET # The Dirichlet density is not implemented in base R. # It is necessary to get a contributed package. # Several are available, here is one: install.packages('gtools') require('gtools') # ...provides functions # ddirichlet(x,alpha) # Density function at (x1,...,xk) # rdirichlet(n,alpha) # n random samples ###### Dirichlet density on 3 variables x=(x1,x2,x3), ###### with x1+x2+x3=1, and shape parameters a=(a1,a2,a3) # # Generate a grid of x1,x2 values: x1 <- seq(0,1, by=0.01) x2 <- seq(0,1, by=0.01) # Initialize a matrix to hold the pdf value: z <- matrix(0, nrow=length(x1), ncol=length(x2)) # Choose some reasonable shape parameters: alpha <- c( 3,5,12 ) # Fill z by looping over all valid (x1,x2) pairs, putting in # x3=1-x1-x2 as the third variable in ddirichlet(): for(i in 1:length(x1)) { for(j in 1:length(x2) ) { if( x1[i]+x2[j] < 1) { x <- c(x1[i], x2[j], 1-x1[i]-x2[j]) # so x1+x2+x3=1 z[i,j] <- ddirichlet(x,alpha); } else { z[i,j] <- 0 } } } # View the results as a perpective plot: persp(x1,x2,z) # View the results as a contour plot: contour(x1,x2,z) ######## MULTIPLE DIRICHLET PLOTS as a function of shape parameters dirichlet3persp <- function(alpha1=2, alpha2=2, alpha3=2) { alpha<-c(alpha1,alpha2,alpha3) x1 <- seq(0,1, by=0.01) x2 <- seq(0,1, by=0.01) z <- matrix(0, nrow=length(x1), ncol=length(x2)) # Fill z by looping over all valid (x1,x2) pairs, putting in # x3=1-x1-x2 as the third variable in ddirichlet(): for(i in 1:length(x1)) { for(j in 1:length(x2) ) { if( x1[i]+x2[j] < 1) { x <- c(x1[i], x2[j], 1-x1[i]-x2[j]) # so x1+x2+x3=1 z[i,j] <- ddirichlet(x,alpha); } else { z[i,j] <- 0 } } } persp(x1,x2,z) } dirichlet3contour <- function(alpha1=2, alpha2=2, alpha3=2) { alpha<-c(alpha1,alpha2,alpha3) x1 <- seq(0,1, by=0.01) x2 <- seq(0,1, by=0.01) z <- matrix(0, nrow=length(x1), ncol=length(x2)) # Fill z by looping over all valid (x1,x2) pairs, putting in # x3=1-x1-x2 as the third variable in ddirichlet(): for(i in 1:length(x1)) { for(j in 1:length(x2) ) { if( x1[i]+x2[j] < 1) { x <- c(x1[i], x2[j], 1-x1[i]-x2[j]) # so x1+x2+x3=1 z[i,j] <- ddirichlet(x,alpha); } else { z[i,j] <- 0 } } } contour(x1,x2,z) } ### # Examples of use: dirichlet3persp(12,5,32) dirichlet3contour(12,5,32) ##### COMPOSITION OF DIRICHLET AND HARDY-WEINBERG # # Goal: estimate the distribution of alleles (genotypes) from the # distribution of expressed traits (phenotypes) # # Setup: # A,B,O alleles for the blood type gene are present # in a population with respective proportions # pA, pB, and pO, satisfying # pA+pB+pO = 1 and pA>0, pB>0, pO>0 # # Phenotypes A,B,O,AB are assumed to be in Hardy-Weinberg # equilibrium for these allele proportions, so they are found in # proportions sA,sB,sO, and sAB satisfying # sA = pA^2 + 2*pA*pO # sB = pB^2 + 2*pB*pO # sO = pO^2 # sAB = 2*pA*pB # so sA+sB+sO+sAB=1 with sA>0, sB>0, sO>0, sAB>0 # # An experiment finds blood types for n individuals and gets # counts nA,nB,nO,nAB, respectively, with n=nA+nB+nO+nAB. # # The likelihood of this result is given by the multinomial pdf # Pr(nA,nB,nO,nAB | sA,sB,sO,sAB) = # (n ; nA,nB,nO,nAB) * sA^nA * sB^nB * sO^nO * sAB^nAB, # where (n ; nA,nB,nO,nAB) is the multinomial coefficient. # ## Method: Bayesian analysis. Compose the p's into the s's and plot # the likelihood as a function of pA and pB. # Generate a grid of pA,pB values: pA <- seq(0,1, by=0.01) pB <- seq(0,1, by=0.01) # Initialize a matrix to hold the genotypes pdf value: z <- matrix(0, nrow=length(pA), ncol=length(pB)) # Fix some particular counts nA <- 12; nB<-7; nO<-5; nAB<-3; n <- nA+nB+nO+nAB # total number of indivduals blood-typed # Fill z by looping over all valid (x1,x2) pairs, putting in # pO=1-pA-pB as the third success parameter: for(i in 1:length(pA)) { for(j in 1:length(pB) ) { a <- pA[i] # shorthand b <- pB[j] # shorthand if( a+b < 1) { # Otherwise not in the domain. c <- 1-a-b # Shorthand for pO=1-pA[i]-pB[j]. sA <- a^2 + 2*a*c sB <- b^2 + 2*b*c sO <- c^2 sAB <- 2*a*b z[i,j] <- sA^nA * sB^nB * sO^nO * sAB^nAB # unnormalized Dirichlet ## Equivalently, use dmultinom() to compute z[i,j] as follows: # z[i,j] <- dmultinom(c(nA,nB,nO,nAB), prob=c(sA,sB,sO,sAB)) } else { z[i,j] <- 0 } } } # View the results as a perpective plot: persp(pA,pB,z) # View the results as a contour plot: contour(pA,pB,z) # The maximum likelihood estimator is the (pA,pB) coordinates of the # peak of the bump. ####### MULTIPLE DIRICHLET-HARDY-WEINBERG PLOTS ## Implement the previous section as a function: dhw<-function(nA=1, nB=2, nO=3, nAB=4, tol=0.01) { # Generate a grid of pA,pB values: pA <- seq(0,1, by=tol) pB <- seq(0,1, by=tol) # Initialize a matrix to hold the genotypes pdf value: z <- matrix(0, nrow=length(pA), ncol=length(pB)) n <- nA+nB+nO+nAB # total number of indivduals blood-typed # Fill z by looping over all valid (pA,pB) pairs, putting in # pO=1-pA-pB as the third success parameter: for(i in 1:length(pA)) { for(j in 1:length(pB) ) { a <- pA[i] # shorthand b <- pB[j] # shorthand if( a+b < 1) { # Otherwise not in the domain. c <- 1-a-b # Shorthand for pO=1-pA[i]-pB[j]. sA <- a^2 + 2*a*c sB <- b^2 + 2*b*c sO <- c^2 sAB <- 2*a*b z[i,j] <- sA^nA * sB^nB * sO^nO * sAB^nAB # unnormalized Dirichlet } else { z[i,j] <- 0 } } } # View the results as a contour plot: contour(pA,pB,z, xlab="pA",ylab="pB",main="Pdf on A,B allele proportions") ijmax <- which.max(z) imax <- 1+ ijmax%%length(pA) jmax <- 1+ ijmax%/%length(pA) pAmax <- pA[imax] pBmax <- pB[jmax] pOmax <- 1-pAmax-pBmax print("Approximate MLE: (pA,pB,pO) = ") print( c(pAmax,pBmax,pOmax)) return(z) } # Example use: zout <- dhw(20, 25, 13,5) ## Log likelihood is better for plotting: ldhw<-function(nA=1, nB=2, nO=3, nAB=4, tol=0.01) { # Generate a grid of pA,pB values: pA <- seq(0,1, by=tol) pB <- seq(0,1, by=tol) # Initialize a matrix to hold the genotypes pdf value: lz <- matrix(0, nrow=length(pA), ncol=length(pB)) n <- nA+nB+nO+nAB # total number of indivduals blood-typed # Fill z by looping over all valid (pA,pB) pairs, putting in # pO=1-pA-pB as the third success parameter: for(i in 1:length(pA)) { for(j in 1:length(pB) ) { a <- pA[i] # shorthand b <- pB[j] # shorthand if( a+b < 1) { # Otherwise not in the domain. c <- 1-a-b # Shorthand for pO=1-pA[i]-pB[j]. lsA <- log(a^2 + 2*a*c) lsB <- log(b^2 + 2*b*c) lsO <- log(c^2) lsAB <- log(2*a*b) lz[i,j] <- lsA*nA + lsB*nB + lsO*nO + lsAB*nAB # log unnormalized Dirichlet } else { lz[i,j] <- -Inf # log(0) = -Inf } } } # View the results as a contour plot: contour(pA,pB,lz, xlab="pA",ylab="pB",main="Log pdf on A,B allele proportions") ijmax <- which.max(lz) imax <- 1+ ijmax%%length(pA) jmax <- 1+ ijmax%/%length(pA) pAmax <- pA[imax] pBmax <- pB[jmax] pOmax <- 1-pAmax-pBmax print("Approximate MLE: (pA,pB,pO) = ") print( c(pAmax,pBmax,pOmax)) return(lz) } # Example use: lzout <- ldhw(nA=750, nB=250, nO=925, nAB=75, tol=0.001)