# Math 322 Biostatistics # Lecture, January 14, 2009 # Introduction to R # Geir Arne Hjelle (hjelle@math.wustl.edu) # # This file contains a bit more than we had time to cover in class. Play # around with the extra commands as well, and see if you can figure out # how they work # # Some useful tricks: # . use the up and down arrows to recall previous commands, this will save # you a lot of typing # . Pressing Ctrl and L clears your screen, may help you relax :) # . Pressing Tab tries to complete what you are writing. Pressing Tab twice # shows you a list of all possible completions of what you are writing. # . When you exit R, R asks if you want to save your workspace. If you answer # yes, all your variables (vectors, data frames etc.) will be there when # you reopen R. # R as a calculator 2 + 2 exp(-1) pi # Precision of R (~53 binary digits) # # See also item 7.31 in the R FAQ # http://cran.r-project.org/doc/FAQ/R-FAQ.html pi - 3.141593 pi - 3.1415926535898 pi - 3.14159265358979323846 sin(pi) options(digits = 3) pi pi * 1000 pi / 1000 options(digits = 14) pi options(digits = 7) options() # Note that functions always need parenthesis sin ls() ls args(sin) # "Everything" is a vector # # List 20 random numbers from the standard normal distribution rnorm(20) # Plot 20 random numbers (not the same ones) from the standard # normal distribution plot(rnorm(20)) # Assign variables x <- rnorm(20) x plot(x) y <- rnorm(20, mean = 10, sd = 5) y plot(y) plot(x, y) # We can do math with our vectors x + y 2 * x y - 10 cos(x) sum(x) length(x) sum(x) / length(x) mean(x) sd(x) var(x) # To access one or several particular elements of our vector x[1] x[3] + x[5] x[c(1, 3, 5)] # The c() function is the most basic way of constructing a vector c(1, 3, 5) c(T, F, T) c("biostatistics", "is", "fun") # There are three types of vectors: Logical, Numerical, Character. All # elements in a vector have to be of the same type c(TRUE, 3, 4) c(FALSE, pi, 3) c(pi, "math") # Other ways of constructing vectors 3:22 seq(3, 22) seq(5, 6, 0.05) rep("Head", 10) rep(c("H", "T"), 4) rep(c("H", "T"), c(4, 4)) rep(c("H", "T"), c(2, 7)) # Different ways of inputing data wh <- c(64, 67, 65, 65, 70, 60, 69, 59, 66, 65, 62) wh summary(wh) plot(wh) mh <- scan() # 62 74 64 73 70 70 72 73 70 69 69 # scan() can also read from files etc. See help(scan). mh summary(mh) plot(mh) h <- c(wh, mh) h summary(h) plot(h) # Read from a file. The file can be online or on the local file system # See also help(read.table) and help.search("read") for more alternatives heights <- read.table("http://www.math.wustl.edu/~hjelle/m322/r/heights.txt", header = TRUE) heights # read.table can also read from a local file in the same way. Then you will # need to give it the file name with full path (e.g. # "C:/funny/directories/file.txt") or relative to the working directory. # On Windows the working directory is typically your My documents-directory. # This can be checked with getwd() and changed with setwd(). getwd() # heights is now a "data frame" which is a very powerful structure for # storing data. A data frame is essentially a list of vectors all of the # same length summary(heights) plot(heights) # Data frames can be created from scratch df <- data.frame() df # They can be edited. Note that edit() does NOT overwrite the original data # frame. If you want to overwrite your original data, use fix() instead. df.new <- edit(df) df.new df fix(df) df # Data frames can also be constructed from existing vectors h s <- rep(c("w", "m"), c(length(wh), length(mh))) s sh <- data.frame(s, h) sh summary(sh) plot(sh) # To access one particular column in a data frame (and any other list of # vectors), we use a $ notation heights heights$sex heights$height heights$height[3] # If we are constantly refering to a particular data frame, we can attach # its columns to R's search path. This means that we can access the columns # directly sex attach(heights) sex height[3] # This may cause crashes with already existing variables attach(sh) # We can check the order R looks for variables (the search path) search() # When we are done with a data frame we can clean up the search path by # detaching it (by default detach() detaches the first data frame in the # search path) detach(sh) search() detach() search() # Here are some more fun ways to play with our vectors and data frames attach(heights) height >= 70 sex == "w" height > 65 & sex != "m" height height[height >= 70] sex[height >= 70] height[sex == "w"] plot(height[sex == "m"]) # Commonly used constructs may be saved to a new variable height.m <- height[sex == "m"] height.w <- height[sex == "w"] # There are many ways of presenting data graphically demo(graphics) help.search("plot") help(boxplot) example(pie) hist(height) par(mfrow = c(2, 1)) hist(height.m) hist(height.w) hist(height.m, xlim = c(59, 74)) hist(height.w, xlim = c(59, 74)) hist(height.m, xlim = c(58, 75), breaks = 58:75, main = "Heights of men", col = "blue", right = F) hist(height.w, xlim = c(58, 75), breaks = 58:75, main = "Heights of women", col = "red", right = F) par(mfrow = c(1, 2)) boxplot(height.m) boxplot(height.w) boxplot(height.m, ylim = c(59, 74)) boxplot(height.w, ylim = c(59, 74)) par(mfrow = c(1, 1)) boxplot(height.m, height.w) boxplot(height ~ sex) stripchart(height ~ sex) stripchart(height ~ sex, method = "stack") # We can find frequency tables etc. of our data table(height) table(sex, height) margin.table(table(sex, height), 1) margin.table(table(sex, height), 2) prop.table(table(sex, height), 2) prop.table(table(sex, height), 2) * 100 # Let us also do some basic statistical tests on our data # # For instance a two-sample t-test to check the hypothesis that our # two samples come from distributions with the same mean (under the # extra assumption that the distributions are normal) t.test(height ~ sex, var.equal = T) # The t.test function creates a type of list (called htest = hypothesis test), # so we can access the individual properties with the $ notation t.test(height ~ sex, var.equal = T)$p.value t.test(height ~ sex, var.equal = T, conf.level = 0.99)$conf.int # The option var.equal = T tells R that we assume the variances of the two # samples are the same. By default R does not make that assumption. t.test(height ~ sex) # If we do not want to make the normal assumption on the distributions # of our samples, we can use a nonparametric test like the two-sample # Wilcoxon rank test, again checking the hypothesis that the two distributions # have the same mean. wilcox.test(height ~ sex) # Finally, let us look at a quick example of regression and analysis of # variance. For regression a linear model is used lm(height ~ sex) summary(lm(height ~ sex)) # Then an ANOVA can be performed on that linear model anova(lm(height ~ sex)) # R comes with several built-in data sets airquality help(airquality) attach(airquality) plot(Temp ~ Month) boxplot(Ozone ~ Month) # Let us finish with a few more elaborate plotting examples. Use the help() # function to understand them. hbar <- tapply(height, sex, mean) s <- tapply(height, sex, sd) stripchart(height ~ sex, method = "jitter", jitter = 0.04, pch = 1, vert = T) arrows(1:length(hbar), hbar + s, 1:length(hbar), hbar - s, angle = 90, code = 3, length = .1) lines(1:length(hbar), hbar, pch = 4, type = "b", cex = 2) sexcol = c("blue", "red") par(mfrow = c(2, 1)) plot(height, col = sexcol[sex], pch = 15) segments(1, mean(height), length(height), mean(height)) segments(1:length(height), height, 1:length(height), mean(height)) plot(height, col = sexcol[sex], pch = 15) segments(1, hbar, length(height), hbar) segments(1:length(height), height, 1:length(height), hbar[sex])