################################################################################
####              Example of Multiple Regression Model                      ####
################################################################################
#### By Jimin Ding, 04/26/2017

######################## Example 3. Labor Data ################################
## In this example, we will learn how to deal with categorical predictors.
##
## Columns are always reached through `mydata` (data = mydata, mydata$...,
## with()) instead of attach(), so re-running the script does not stack
## masked copies of the data frame on the search path.

mydata = read.csv(file = "LaborData.csv", header = TRUE)
head(mydata)

regfit = lm(WEIGHT ~ GESTATIONAL.AGE + Child.GENDER, data = mydata)
summary(regfit)
plot(regfit)  # diagnostic plots for the fitted model

## Note here gender is a categorical variable with two levels: Female and Male.
## By default, R selects "Female" (the first level) as the baseline group,
## and creates a dummy variable X = 0 for Female and X = 1 for Male.
## Below the same dummy variable is built by hand; regfit2 should reproduce
## regfit as long as gender only takes these two values and is never missing.
male = rep(1, nrow(mydata))
male[mydata$Child.GENDER == "Female"] = 0
male
## `male` lives in the workspace, not in mydata; lm() finds it through the
## environment of the formula after looking in `data`.
regfit2 = lm(WEIGHT ~ GESTATIONAL.AGE + male, data = mydata)
summary(regfit2)

## How does the regression change when we use Male (instead of Female)
## as the baseline group?
female = rep(1, nrow(mydata))
female[mydata$Child.GENDER == "Male"] = 0
regfit3 = lm(WEIGHT ~ GESTATIONAL.AGE + female, data = mydata)
summary(regfit3)

## Now let's consider delivery type.
summary(lm(
  WEIGHT ~ GESTATIONAL.AGE + DELIVERY.TYPE + Child.GENDER,
  data = mydata
))
with(mydata, table(DELIVERY.TYPE))

## Now we combine some delivery types and create a new categorical variable:
## Dtype = 0 for natural delivery,
##       = 1 for vaginal delivery with help (vacuum extractor or forceps),
##       = 2 for C-section (every delivery type whose label starts with "C").
Dtype = rep(0, nrow(mydata))
assisted_types = c("Vaginal, Vacuum (Extractor)", "Vaginal, Forceps")
Dtype[mydata$DELIVERY.TYPE %in% assisted_types] = 1
Dtype[substr(mydata$DELIVERY.TYPE, 1, 1) == "C"] = 2
table(Dtype)
regfit4 = lm(WEIGHT ~ GESTATIONAL.AGE + Child.GENDER + Dtype, data = mydata)
summary(regfit4)

## The problem with the regression above is that it treats Dtype as a
## numerical variable instead of a categorical one: a single slope forces the
## change from Dtype = 0 to Dtype = 1 to equal the change from Dtype = 1 to
## Dtype = 2, while the actual differences between the groups need not agree.
## Hence, we should consider Dtype as a "factor" in R.
regfit5 = lm(
  WEIGHT ~ GESTATIONAL.AGE + Child.GENDER + factor(Dtype),
  data = mydata
)
summary(regfit5)

## Equivalently, we may create two dummy variables for Dtype = 1 and Dtype = 2
## (Dtype = 0 is the baseline group).
Dtype1 = I(Dtype == 1)
Dtype2 = I(Dtype == 2)
table(Dtype1)
table(Dtype2)
regfit6 = lm(
  WEIGHT ~ GESTATIONAL.AGE + Child.GENDER + Dtype1 + Dtype2,
  data = mydata
)
summary(regfit6)

## We have seen that the delivery types were insignificant.
## To test this claim, we construct an F-test with the anova function in R,
## comparing the model without delivery type (regfit) against the model that
## includes it (regfit5).
summary(regfit)
summary(regfit5)
anova(regfit, regfit5)

## To add a quadratic term, use I(x^2); I() makes ^ mean arithmetic power
## inside the formula.
regfit7 = lm(
  WEIGHT ~ GESTATIONAL.AGE + I(GESTATIONAL.AGE^2) + Child.GENDER,
  data = mydata
)
summary(regfit7)

## Or equivalently, compute the squared term first.
Gagesq = mydata$GESTATIONAL.AGE^2
regfit8 = lm(WEIGHT ~ GESTATIONAL.AGE + Gagesq + Child.GENDER, data = mydata)
summary(regfit8)

## To add an interaction term, use x1*x2 (main effects plus interaction)
## or x1:x2 (the interaction term only).
regfit9 = lm(WEIGHT ~ GESTATIONAL.AGE * Child.GENDER, data = mydata)
summary(regfit9)

## Or equivalently, spell out the main effects and the interaction.
regfit10 = lm(
  WEIGHT ~ GESTATIONAL.AGE + Child.GENDER + GESTATIONAL.AGE:Child.GENDER,
  data = mydata
)
summary(regfit10)