Example R programs and commands
	  19. Chi-squared and log-likelihood goodness of fit tests

# All lines preceded by the "#" character are my comments.
# All other left-justified lines are my input.
# All other indented lines are the R program output.

# CHI-SQUARED test syntax
#
#  chisq.test(x, y = NULL, correct = TRUE,
#           p = rep(1/length(x), length(x)), rescale.p = FALSE, ...)
#
# Arguments
#   x	 	a vector or matrix.
#   y		a vector; ignored if x is a matrix.
#   correct 	TRUE ==>  apply continuity correction for 2x2 tables.
#   p	 	vector of probabilities, same length as x.
#   rescale.p 	TRUE ==> p is rescaled (if necessary) to sum to 1.
#		FALSE ==> if p does not sum to 1, then complain.

#######################################################################
#
# Goodness-of-fit:
#
# Test if an observed frequency table is samples from a uniform density:
#
obs1 <- c(29, 23, 17, 25, 22, 15, 27)    # observed frequencies
chisq.test(obs1)     # default p=c(1/7,...,1/7) is appropriate here

	  Chi-squared test for given probabilities

  data:  obs
  X-squared = 6.8987, df = 6, p-value = 0.3303
# ==> do not reject H0 at the .05 level.

# Test if another observed frequency table is samples from a uniform density:
obs2 <- c(29, 23, 10, 25, 22, 9, 27)    # observed frequencies
chisq.test(obs2)     # default p=c(1/7,...,1/7) is appropriate here

	  Chi-squared test for given probabilities

  data:  obs
  X-squared = 18.6069, df = 6, p-value = 0.004882
# ==> reject H0 at the .05 level.

#######################################################################
#
# LOG-LIKELIHOOD computation
#
# Extract the observed and expected values.
obs2 <- c(29, 23, 10, 25, 22, 9, 27)    # observed frequencies
fit<-chisq.test(obs2)     # default p=c(1/7,...,1/7) is appropriate here
summary(fit)
	      Length Class  Mode
    statistic 1      -none- numeric     # Chi squared        (X2)
    parameter 1      -none- numeric     # Degrees of freedom (df)
    p.value   1      -none- numeric
    method    1      -none- character
    data.name 1      -none- character
    observed  7      -none- numeric
    expected  7      -none- numeric
    residuals 7      -none- numeric
fit$expected
  [1] 20.71429 20.71429 20.71429 20.71429 20.71429 20.71429 20.71429
fit$observed
  [1] 29 23 10 25 22  9 27
fit$observed/fit$expected
  [1] 1.4000000 1.1103448 0.4827586 1.2068966 1.0620690 0.4344828 1.3034483
log(fit$observed/fit$expected)
  [1]  0.33647224  0.10467062 -0.72823850  0.18805223  0.06021886 -0.83359902
  [7]  0.26501327
g<-2*sum(fit$observed*log(fit$observed/fit$expected))
g
  [1] 21.12364
# Compute the p-value of g assuming it has the Chi-squared density:
nu<-6; 1-pchisq(g,nu)
  [1] 0.001743084


#######################################################################
#
# Heterogeneity Chi Squared (Method 1):
#
# Test if two frequency tables are samples from the uniform density
#
obs1 <- c(29, 23, 17, 25, 22, 15, 27)   # observed frequencies, table 1
obs2 <- c(29, 23, 10, 25, 22, 9, 27)    # observed frequencies, table 2
# 
# Individual chi-squareds sum to give the total chi-squared deviation
# from the expected density p={1/K,...,1/K}:
fit1<-chisq.test(obs1)
fit2<-chisq.test(obs2)
total.chisq <- fit1$statistic + fit2$statistic
total.df <- fit1$parameter + fit2$parameter

# Pooled chi-squared deviation from expected density  p={1/K,...,1/K}: 
fitp<-chisq.test(obs1+obs2)
pooled.chisq <- fitp$statistic; pooled.chisq
pooled.df    <- fitp$parameter; pooled.df

# Compare deviations from expected:
heterogeneity.chisq <- total.chisq - pooled.chisq; heterogeneity.chisq
heterogeneity.df    <- total.df - pooled.df;       heterogeneity.df
pchisq( heterogeneity.chisq, heterogeneity.df, lower.tail=FALSE)

# NOTE: the uniform density "p=c(1/K,...,1/K)" with K=length(x) is R's
# default, but any expected density may be used by specifying
# "p=c(p1,p2,...pK)" in chisq.test().



#######################################################################
#
# Heterogeneity Chi Squared (Method 2 -- Pearson's Chi Squared):
#
# Test if two frequency tables are samples from the pooled density
#
obs1 <- c(29, 23, 17, 25, 22, 15, 27)   # observed frequencies, table 1
obs2 <- c(29, 23, 10, 25, 22, 9, 27)    # observed frequencies, table 2
# 
# Test if the frequency tables are linearly dependent:
fit <- chisq.test(cbind(obs1,obs2)); fit

# NOTE: this is the same as the Method 1 test with the expected
# density p=c(p1,...,pK) determined by the pooled frequencies.
pooled.density <- (obs1+obs2)/sum(obs1+obs2)
fit1 <- chisq.test(obs1,p=pooled.density); fit1
fit2 <- chisq.test(obs2,p=pooled.density); fit2
fitp <- chisq.test(obs1+obs2,p=pooled.density); fitp
# NOTE: the pooled chi squared is exactly 0, so there is no need to
# subtract it:
heterogeneity.chisq <- fit1$statistic + fit2$statistic;  heterogeneity.chisq 
# ....but the pooled DF is nonzero and must still be subtracted:
heterogeneity.df <- fit1$parameter + fit2$parameter - fitp$parameter;
heterogeneity.df
# Find the p-value of this test statistic manually:
pchisq( heterogeneity.chisq, heterogeneity.df, lower.tail=FALSE )


#######################################################################
#
# Kolmogorov-Smirnov Goodness-of-Fit:
#
# Test if two frequency tables comes from the same density
#
freq1 <- c(22, 25, 27, 15, 26, 24, 19)   # observed frequencies, table 1
freq2 <- c(26, 23, 10, 25, 18, 29, 27)   # observed frequencies, table 2

# For R's K-S test, the inputs are observations not frequencies:
x1<-rep(1:length(freq1),freq1);  x1   # generate observations from freq1
x2<-rep(1:length(freq2),freq2);  x2   # generate observations from freq2
ks.test(x1,x2)    # test whether x1,x2 come from the same density
#
# Test if a sequence of observations comes from a specified density
#
# Specify the uniform density on the discrete set {1,2,...,7}:
ks.test(x1,"punif",min=0.5,max=7.5) #  {1,...,7} <===> [0.5,7.5]
ks.test(x2,"punif",min=0.5,max=7.5) #  {1,...,7} <===> [0.5,7.5]
#
# REMARK: if p is large so H_0 is not rejected, then the frequency
# tables may be added (pooled.freq <- freq1+freq2) and the observations
# may be concatenated (pooled.x <- c(x1,x2) ).