Example R programs and commands 19. Chi-squared and log-likelihood goodness of fit tests # All lines preceded by the "#" character are my comments. # All other left-justified lines are my input. # All other indented lines are the R program output. # CHI-SQUARED test syntax # # chisq.test(x, y = NULL, correct = TRUE, # p = rep(1/length(x), length(x)), rescale.p = FALSE, ...) # # Arguments # x a vector or matrix. # y a vector; ignored if x is a matrix. # correct TRUE ==> apply continuity correction for 2x2 tables. # p vector of probabilities, same length as x. # rescale.p TRUE ==> p is rescaled (if necessary) to sum to 1. # FALSE ==> if p does not sum to 1, then complain. ####################################################################### # # Goodness-of-fit: # # Test if an observed frequency table is samples from a uniform density: # obs1 <- c(29, 23, 17, 25, 22, 15, 27) # observed frequencies chisq.test(obs1) # default p=c(1/7,...,1/7) is appropriate here Chi-squared test for given probabilities data: obs X-squared = 6.8987, df = 6, p-value = 0.3303 # ==> do not reject H0 at the .05 level. # Test if another observed frequency table is samples from a uniform density: obs2 <- c(29, 23, 10, 25, 22, 9, 27) # observed frequencies chisq.test(obs2) # default p=c(1/7,...,1/7) is appropriate here Chi-squared test for given probabilities data: obs X-squared = 18.6069, df = 6, p-value = 0.004882 # ==> reject H0 at the .05 level. ####################################################################### # # LOG-LIKELIHOOD computation # # Extract the observed and expected values. obs2 <- c(29, 23, 10, 25, 22, 9, 27) # observed frequencies fit<-chisq.test(obs2) # default p=c(1/7,...,1/7) is appropriate here summary(fit) Length Class Mode statistic 1 -none- numeric # Chi squared (X2) parameter 1 -none- numeric # Degrees of freedom (df) p.value 1 -none- numeric method 1 -none- character data.name 1 -none- character observed 7 -none- numeric expected 7 -none- numeric residuals 7 -none- numeric fit$expected [1] 20.71429 20.71429 20.71429 20.71429 20.71429 20.71429 20.71429 fit$observed [1] 29 23 10 25 22 9 27 fit$observed/fit$expected [1] 1.4000000 1.1103448 0.4827586 1.2068966 1.0620690 0.4344828 1.3034483 log(fit$observed/fit$expected) [1] 0.33647224 0.10467062 -0.72823850 0.18805223 0.06021886 -0.83359902 [7] 0.26501327 g<-2*sum(fit$observed*log(fit$observed/fit$expected)) g [1] 21.12364 # Compute the p-value of g assuming it has the Chi-squared density: nu<-6; 1-pchisq(g,nu) [1] 0.001743084 ####################################################################### # # Heterogeneity Chi Squared (Method 1): # # Test if two frequency tables are samples from the uniform density # obs1 <- c(29, 23, 17, 25, 22, 15, 27) # observed frequencies, table 1 obs2 <- c(29, 23, 10, 25, 22, 9, 27) # observed frequencies, table 2 # # Individual chi-squareds sum to give the total chi-squared deviation # from the expected density p={1/K,...,1/K}: fit1<-chisq.test(obs1) fit2<-chisq.test(obs2) total.chisq <- fit1$statistic + fit2$statistic total.df <- fit1$parameter + fit2$parameter # Pooled chi-squared deviation from expected density p={1/K,...,1/K}: fitp<-chisq.test(obs1+obs2) pooled.chisq <- fitp$statistic; pooled.chisq pooled.df <- fitp$parameter; pooled.df # Compare deviations from expected: heterogeneity.chisq <- total.chisq - pooled.chisq; heterogeneity.chisq heterogeneity.df <- total.df - pooled.df; heterogeneity.df pchisq( heterogeneity.chisq, heterogeneity.df, lower.tail=FALSE) # NOTE: the uniform density "p=c(1/K,...,1/K)" with K=length(x) is R's # default, but any expected density may be used by specifying # "p=c(p1,p2,...pK)" in chisq.test(). ####################################################################### # # Heterogeneity Chi Squared (Method 2 -- Pearson's Chi Squared): # # Test if two frequency tables are samples from the pooled density # obs1 <- c(29, 23, 17, 25, 22, 15, 27) # observed frequencies, table 1 obs2 <- c(29, 23, 10, 25, 22, 9, 27) # observed frequencies, table 2 # # Test if the frequency tables are linearly dependent: fit <- chisq.test(cbind(obs1,obs2)); fit # NOTE: this is the same as the Method 1 test with the expected # density p=c(p1,...,pK) determined by the pooled frequencies. pooled.density <- (obs1+obs2)/sum(obs1+obs2) fit1 <- chisq.test(obs1,p=pooled.density); fit1 fit2 <- chisq.test(obs2,p=pooled.density); fit2 fitp <- chisq.test(obs1+obs2,p=pooled.density); fitp # NOTE: the pooled chi squared is exactly 0, so there is no need to # subtract it: heterogeneity.chisq <- fit1$statistic + fit2$statistic; heterogeneity.chisq # ....but the pooled DF is nonzero and must still be subtracted: heterogeneity.df <- fit1$parameter + fit2$parameter - fitp$parameter; heterogeneity.df # Find the p-value of this test statistic manually: pchisq( heterogeneity.chisq, heterogeneity.df, lower.tail=FALSE ) ####################################################################### # # Kolmogorov-Smirnov Goodness-of-Fit: # # Test if two frequency tables comes from the same density # freq1 <- c(22, 25, 27, 15, 26, 24, 19) # observed frequencies, table 1 freq2 <- c(26, 23, 10, 25, 18, 29, 27) # observed frequencies, table 2 # For R's K-S test, the inputs are observations not frequencies: x1<-rep(1:length(freq1),freq1); x1 # generate observations from freq1 x2<-rep(1:length(freq2),freq2); x2 # generate observations from freq2 ks.test(x1,x2) # test whether x1,x2 come from the same density # # Test if a sequence of observations comes from a specified density # # Specify the uniform density on the discrete set {1,2,...,7}: ks.test(x1,"punif",min=0.5,max=7.5) # {1,...,7} <===> [0.5,7.5] ks.test(x2,"punif",min=0.5,max=7.5) # {1,...,7} <===> [0.5,7.5] # # REMARK: if p is large so H_0 is not rejected, then the frequency # tables may be added (pooled.freq <- freq1+freq2) and the observations # may be concatenated (pooled.x <- c(x1,x2) ).