CRG Master-PhD course. Module III. Tests on normality & data transformation. Non-parameteric tests.

# install.packages("MASS") - run this code if you do not have MASS package
library(MASS)


3. Tests on Normality of the distirbution and data transformation.

3.1. Tests for normality.


Let's examine a randomly generated sample from a normal distribution.

set.seed(42)
x <- rnorm(100)

shapiro.test(x)
## 
##  Shapiro-Wilk normality test
## 
## data:  x
## W = 0.98122, p-value = 0.1654
qqnorm(x)
qqline(x, col = "red", lwd=3)

plot of chunk unnamed-chunk-2



Q3.1. Repeat the above procedure for the sample of size 10000.

set.seed(42)
x <- rnorm(10000)
qqnorm(x)
qqline(x, col = "red", lwd=3)
shapiro.test(x)


Kolmogorov-Smirnov test for normality (aka, KS test). This is non-parametric test, less powerful than Shapiro-Wilk test.

ks.test(x,"pnorm",mean(x),sqrt(var(x))) #compare sample x with the normal distirbution having mean(x) and var(x)


Other tests on data distirbution

#install.packages("moments")
library(moments)

# skewness and kurtosis, they should be around 0 and 3, respectively, for normally distributed data
x <- rnorm(100)
skewness(x)
## [1] 0.3090857
kurtosis(x)
## [1] 3.032129
#install.packages("car")
library(car)

#qqplot with confidence interval
qqPlot(x)

plot of chunk unnamed-chunk-5



Q3.2. Test the normality of a random sample of size n = 1000 taken from a Gamma distribution; use rgamma(n, shape, scale), for shape (k) = 2, and scale (theta) = 2.

shape <- 2
scale <- 2
n <- 1000
x <- rgamma(n, shape, scale)
hist(x)

plot of chunk unnamed-chunk-6

qqPlot(x) #library(car)

plot of chunk unnamed-chunk-6

skewness(x)
## [1] 1.363373
kurtosis(x)
## [1] 5.849921
shapiro.test(x)
## 
##  Shapiro-Wilk normality test
## 
## data:  x
## W = 0.90418, p-value < 2.2e-16


3.2. Kolmogorov-Smirnov test for two samples.


Kolmogorov-Smirnov test can be used to test if two samples are from the same distirbutions

ks.test(rnorm(10000), rnorm(100))

par(mfrow = c(2,1))
hist(rnorm(1000))
hist(rgamma(1000,2,2))
ks.test(rnorm(1000), rgamma(1000,2,2))

hist(rgamma(1000,1,1))
hist(rgamma(1000,2,2))
ks.test(rgamma(1000,1,1), rgamma(1000,2,2))

par(mfrow = c(1,1))


3.3. Tools to explore data sets: boxplot(), hist(), summary(), qqPlot()

set.seed(400)

x <- rnorm(50, mean=1)
summary(x)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.5120  0.4014  0.7756  0.9503  1.6030  2.6350
y <- rgamma(50,1,1)
summary(y)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.002572 0.127700 0.384200 0.837300 1.164000 4.315000
z <- c(8.11, 8.11, 8.09, 8.08, 8.06, 8.02, 7.99, 7.99, 7.97, 7.95, 7.92, 7.92, 7.92, 7.89, 7.87, 7.84, 7.79, 7.79, 7.77, 7.76, 7.72, 7.71, 7.66, 7.62, 7.61, 7.59, 7.55, 7.53, 7.50, 7.50, 7.42, 7.38, 7.38, 7.26, 7.25, 7.08, 6.96, 6.84, 6.55, 6.50, 6.45, 6.40, 6.38, 6.32, 6.30, 6.05, 6.00, 5.55, 5.50) # men jumps
summary(z)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   5.500   6.960   7.610   7.355   7.920   8.110
par(mfrow = c(3,3))

boxplot(x, main="Normal")
stripchart(x, vertical=TRUE, add=TRUE, method="jitter", pch=20)
loc <- 1:length(mean(x))
segments(loc-0.2, mean(x), loc+0.2, mean(x), lwd=2, col="red")

boxplot(y, main="Gamma")
stripchart(y, vertical=TRUE, add=TRUE, method="jitter", pch=20)
loc <- 1:length(mean(y))
segments(loc-0.2, mean(y), loc+0.2, mean(y), lwd=2, col="red")

boxplot(z, main="Left-skewed")
stripchart(z, vertical=TRUE, add=TRUE, method="jitter", pch=20)
loc <- 1:length(mean(z))
segments(loc-0.2, mean(z), loc+0.2, mean(z), lwd=2, col="red")

hist(x)
segments(median(x), 0, median(x), 12, lwd=2, col="blue")
segments(mean(x), 0, mean(x), 12, lwd=2, col="red")

hist(y)
segments(median(y), 0, median(y), 30, lwd=2, col="blue")
segments(mean(y), 0, mean(y), 30, lwd=2, col="red")

hist(z)
segments(median(z), 0, median(z), 20, lwd=2, col="blue")
segments(mean(z), 0, mean(z), 20, lwd=2, col="red")

qqPlot(x) #library(car)
qqPlot(y) #library(car)
qqPlot(z) #library(car)

plot of chunk unnamed-chunk-8

par(mfrow = c(1,1))


3.4. Data transformation.


Log-tranformation of right-skewed data (long right tail; positive skewness).

Q3.3. Do log-transformation of a gamma-sample (3,2) and run normality tests on transformed data. Which transformation was the best taking into account the Shapiro test? Looking at qqPlot(x)?

shape <- 3
scale <- 2
x <- rgamma(100, shape, scale)
skewness(x)
## [1] 1.257593
kurtosis(x)
## [1] 5.088152
shapiro.test(x)
## 
##  Shapiro-Wilk normality test
## 
## data:  x
## W = 0.91019, p-value = 4.402e-06
y <- log(x + 1) # try also log10, log2
skewness(y)
## [1] 0.3548871
kurtosis(y)
## [1] 2.795442
shapiro.test(y)
## 
##  Shapiro-Wilk normality test
## 
## data:  y
## W = 0.98435, p-value = 0.2849
par(mfrow = c(2,2))
hist(x)
hist(y)

qqPlot(x) #library(car)
qqPlot(y) #library(car)

plot of chunk unnamed-chunk-9

par(mfrow = c(1,1))


Optimal log-transformation can be found as follows:

#install.packages("e1071")
library(e1071)
library(datasets)

data(airquality)

par(mfrow = c(3,2))

x <- airquality$Ozone
x <- x[!is.na(x)]
skewness(x)
hist(x)
qqPlot(x) #library(car)
shapiro.test(x)

y1 <- log(x+1)
skewness(y1)
hist(y1)
qqPlot(y1) #library(car)
shapiro.test(y1)

skew.score <- function(c, x) (skewness(log(x + c)))^2
coef <- optimise(skew.score, c(0, 20), x = x)$minimum
y2 <- log(x + coef)

skewness(y2)
hist(y2)
qqPlot(y2) #library(car)
shapiro.test(y2)

par(mfrow = c(1,1))


Power transformation for left-skewed data (long left tail)

x <- c(8.11, 8.11, 8.09, 8.08, 8.06, 8.02, 7.99, 7.99, 7.97, 7.95, 7.92, 7.92, 7.92, 7.89, 7.87, 7.84, 7.79, 7.79, 7.77, 7.76, 7.72, 7.71, 7.66, 7.62, 7.61, 7.59, 7.55, 7.53, 7.50, 7.50, 7.42, 7.38, 7.38, 7.26, 7.25, 7.08, 6.96, 6.84, 6.55) # London Olympic 2012, men jumps

par(mfrow = c(3,2))

skewness(x)
hist(x)
qqPlot(x)
shapiro.test(x)

y1 <- x^2
skewness(y1)
hist(y1)
qqPlot(y1)
shapiro.test(y1)

y2 <- x^10
hist(y2)
qqPlot(y2)
shapiro.test(y2)

par(mfrow = c(1,1))


Poisson distribution (count data). Do not transform! Use suitable models, e.g., Poisson regression.

lambda <- 4
x <- rpois(100, lambda)

par(mfrow = c(2,1))
hist(x)
qqPlot(x) #library(car)
par(mfrow = c(1,1))


See also http://stats.stackexchange.com/questions/101274/how-to-interpret-a-qq-plot?rq=1



4. Non-parametric tests (distribution-free tests: no assumption on the population distribution or sample size).


Non-parametric tests are for testing medians, while parametric, means.

Non-parametric test Parametric test Charactiristics Usage
One-sample t test (variance uknown & n<40) or z test (otherwise) One-sample Sign test Test on the median for non-symmetric distribution SIGN.test(x) from package “BSDA”
One-sample Wilcoxon test Test on the median for symmetric distribution wilcox.test(x)
Two sample t test Paired Wilcoxon signed-rank test Test on the medians of two paired samples wilcox.test(x,y, paired=TRUE)
Two sample t test Unpaired Mann-Whitney test on ranks Test on the medians of two independent samples using ranks wilcox.test(x,y)
One-way ANOVA Kruskal-Wallis test Test on equality of medians from two or more populations kruskal.test()
Mood's median test ibid. Less robust to outliers but more powerful than Kruskal-Wallis mood.test()

Reasons to use parametric tests rather than non-parametric:

* They suitable for samples 30 =< n (and even 15 =< n) of skewed data with no outliers.

* And for two samples even if the variances are different (t.test() assumes unequal variances by default).

* They have greater power – larger probability to reject H0 when Ha is in fact true.


Reasons to use non-parametric tests:

* Data are better represented by the median rather than the mean.

* A very small sample size.

* Many outliers that cannnot be removed

* Ordinal data (e.g., the size defined as “XS”, “S”, “M”, “L”, “XL”) or ranked data (e.g., cells in the sample ranked by the size as 1st, 2nd, 3rd, etc.).

* Data cannot be transformed (or no need to transform them)

* The distribution is unknown and cannot be transformed to normal.

* Very skewed distribution (e.g., home price in New York)


4.1. Sign test (one sample, non-symmetrical distrbution).

The sign test is a method to find consistent ordinal differences between pairs of observations. It determines if one member in the pair of observations tends to be greater than the other member. Unlike t-test, there is no assumption of normality for small samples, neither any other assumption about the nature of the random variable.

#install.packages("BSDA")
library(BSDA)


Q4.1. The patients were treated by the weight loss drug. 10 patients, 5 reduced weight, 3 gained. Do these data suggest that the weight-control treatment works (that is, can we reject the null hypothesis that there is no difference in patients' weight between before and after treatment?

x1 <- c(200,202,194,188,166,196,180,188,180,210)
x2 <- c(202,204,167,192,166,190,176,182,180,202)
x <- x2 - x1
hist(x)

SIGN.test(x, md=0, alternative = "less")

#p-value can be calculated directly 
size <- length(x)
win <- sum(x<0)
p <- 1 - pbinom(win, size, p=0.5)

p-value = 0.3633 => There is no enough evidence to reject H0.



Q4.2. Test H0 that the weight in group x1 is above 180 pounds. Do we have enough evidence to reject this H0?

hist(x1)

size <- length(x1)
win <- sum(x1 > 180)

p <- pbinom(win, size, p=0.5)
SIGN.test(x1 - 180, md=0, alternative = "less")



Q4.3. Test H0 that the weight in group x2 is below 167 pounds. Do we have enough evidence to reject this H0?

hist(x2)

size <- length(x2)
win <- sum(x2 < 167)

p <- pbinom(win, size, p=0.5) #P(X ≥ 6) = P(X = 6)+P(X = 7)+···+P(X = 10)
SIGN.test(x1 - 167, md=0, alternative = "greater")


4.2. One-sample Wilcoxon test (symmetric distribution) or Two sample paired Wilcoxon signed-rank test

(the test runs with continuity correction by default, can be switched off).
The test is used to assess whether the differences are symmetric and centered around zero. H0: differences from the median (one-sample test) or differences in paired measuments follow a symmetrical distribution.
Ha: differences don’t follow a symmetric distribution around zero
The problem. The effect of a drug on the hormon level was measured in 12 patients before and after the treatment. Was the drug effective?

x1 <- c(125.3, 101, 117.2, 133.7, 96.4, 124.5, 118.7, 106.2, 116.3, 120.2, 125, 128.8)
x2 <- c(127.3, 120.2, 126.2, 125.4, 115.1, 118.5, 135.5, 118.2, 122.9, 120.1, 120.8, 130.7)

df <- data.frame(cbind(x1,x2))
boxplot(df)

wilcox.test(x1,x2, paired=T, alternative="less")

H0 is rejected at the 5% significance level.


Q4.4. Some physiological test measurements were conducted on 9 patients taken at the first (x) and second (y) visits after initiation of a therapy. Do you have enough evidence to reject H0 that the therapy did not work if it was intended to lower that physiological parameter?

x <- c(1.83,  0.50,  1.62,  2.48, 1.68, 1.88, 1.55, 3.06, 1.30)
y <- c(0.878, 0.647, 0.598, 2.05, 1.06, 1.29, 1.06, 3.14, 1.29)
df <- data.frame(cbind(x,y))
boxplot(df)

wilcox.test(y-x, alternative = "less") #one-sample test for differences

H0 is rejected at the 5% significance level (p-value < 0.02).


4.3. Two independent sample Mann-Whitney-Wilcoxon test on ranks.

(the test runs with continuity correction by default, can be switched off). H0: samples come from the same population.



Q4.5. In two independent groups of pregnant women a level of some hormon was measured at two different periods of pregnancy (x and y). H0: the hormon level is the same; Ha: the hormon level is greater in the second group.

x <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46)
y <- c(1.15, 0.88, 0.90, 0.74, 1.21)

df <- data.frame(cbind(x,y))
boxplot(df)

wilcox.test(x, y, alternative = "greater")        

There is no enough evidence to reject H0 at the 5% significance level.


4.4. Kruskal-Wallis test on equality of medians from two or more independent populations

x <- c(2.9, 3.0, 2.5, 2.6, 3.2) # normal subjects
y <- c(3.8, 2.7, 4.0, 2.4)      # with obstructive airway disease
z <- c(2.8, 3.4, 3.7, 2.2, 2.0) # with asbestosis

df <- data.frame(cbind(x,y,z))
boxplot(df)

kruskal.test(list(x, y, z))


Let's add one more group

w <- c(2.2, 1.8, 2.1, 1.5) # lung cancer subjects

df <- data.frame(cbind(x,y,z,w))
boxplot(df)

kruskal.test(list(x, y, z, w))