# Math 322 Biostatistics # Lecture, February 2, 2009 # R examples for Chapter 6.5-6.7 (Estimation) # Geir Arne Hjelle (hjelle@math.wustl.edu) # Read in self-made functions source("http://www.math.wustl.edu/~hjelle/m322/r/090202_functions.R") # Read in population data pop <- scan("http://www.math.wustl.edu/~hjelle/m322/r/birthweights.txt") # # Show different estimators # par(mfrow = c(2, 2)) # par is used to set graphical parameters, mfrow # is used to put several plots into one graph # Pick 200 samples, each of size 10. Estimate the mean of the population, # using sample mean, sample median, average of min and max in sample, and # max in sample estimate.mean(pop, 200, 10, breaks = seq(80, 150, 10)) estimate.median(pop, 200, 10, breaks = seq(80, 150, 10)) estimate.minmax(pop, 200, 10, breaks = seq(80, 150, 10)) estimate.max(pop, 200, 10, breaks = seq(100, 170, 10)) # # Show effect of sample size # # Pick 200 samples, each of size 20, 100, 300. Estimate the mean of the # population. The number returned is the standard error of the mean par(mfrow = c(3, 1)) estimate.mean(pop, 200, 20, breaks = seq(100, 130, 2), main = "Sample size 20") estimate.mean(pop, 200, 100, breaks = seq(100, 130, 2), main = "Sample size 100") estimate.mean(pop, 200, 300, breaks = seq(100, 130, 2), main = "Sample size 300") # If we could sample all the members of the study population, we would find # the exact value of mu. Notice that the standard error of the mean is zero. par(mfrow = c(1, 1)) estimate.mean(pop, 200, length(pop), breaks = seq(114, 115, .1)) # # Comparison of the normal distribution and the t distribution with 5 and 50 # degrees of freedom # curve(dnorm(x), xlim = c(-3, 3), ylab = "Density") curve(dt(x, 5), add = TRUE, col = "red") curve(dt(x, 50), add = TRUE, col = "green") title("Comparison of normal and t distributions") legend("topright", legend = c("Normal", "t with 5 df", "t with 50 df"), fill = c("black", "red", "green")) # # Constructing a confidence interval by hand # # Pick a sample of size 20, and use the sample to give a 95% confidence # interval for the real value of mu x <- sample(pop, 20) xbar <- mean(x) cat("Estimate for mu:", xbar, "\n") # Write estimate to screen with some # helpful text, could also just inquire # for the value of xbar. \n gives a # new line. sem <- sd(x) / sqrt(20) # 95% confidence interval xbar + qt( c(.025, .975), 19) * sem # 99% confidence interval xbar + qt( c(.005, .995), 19) * sem # 95% confidence interval with sample size 100 x <- sample(pop, 100) xbar <- mean(x) sem <- sd(x) / sqrt(100) xbar + qt( c(.025, .975), 99) * sem # Finally, just a demonstration of what 95% means. We take 10000 samples, # each of size 20, compute the 95% confidence interval and record whether # or not the true value of mu is inside the interval. t.19 <- qt(.975, 19) true.mu <- mean(pop) x <- matrix(nrow = 20, ncol = 10000) for (i in 1:10000) x[,i] <- sample(pop, 20) xbar <- colMeans(x) sem <- sd(x) / sqrt(20) inside= xbar - t.19 * sem < true.mu & true.mu < xbar + t.19 * sem # Now inside is a vector of TRUEs and FALSEs. An easy way to find the ratio # of TRUEs is just to calculate the mean. This works because R converts # TRUE to 1 and FALSE to 0 before calculating the mean (or before any # calculation where R expects numbers and is given TRUEs and FALSEs). cat("Mu is inside the 95% confidence interval ", 100 * mean(inside), "% of the time\n", sep = "")