# install.packages("MASS") - run this code if you do not have MASS package
library(MASS)
Let's examine a randomly generated sample from a normal distribution.
set.seed(42)
x <- rnorm(100)
shapiro.test(x)
##
## Shapiro-Wilk normality test
##
## data: x
## W = 0.98122, p-value = 0.1654
qqnorm(x)
qqline(x, col = "red", lwd=3)
Q3.1. Repeat the above procedure for the sample of size 10000.
set.seed(42)
x <- rnorm(10000)
qqnorm(x)
qqline(x, col = "red", lwd=3)
shapiro.test(x)
ks.test(x,"pnorm",mean(x),sqrt(var(x))) #compare sample x with the normal distirbution having mean(x) and var(x)
#install.packages("moments")
library(moments)
# skewness and kurtosis, they should be around 0 and 3, respectively, for normally distributed data
x <- rnorm(100)
skewness(x)
## [1] 0.3090857
kurtosis(x)
## [1] 3.032129
#install.packages("car")
library(car)
#qqplot with confidence interval
qqPlot(x)
Q3.2. Test the normality of a random sample of size n = 1000 taken from a Gamma distribution; use rgamma(n, shape, scale), for shape (k) = 2, and scale (theta) = 2.
shape <- 2
scale <- 2
n <- 1000
x <- rgamma(n, shape, scale)
hist(x)
qqPlot(x) #library(car)
skewness(x)
## [1] 1.363373
kurtosis(x)
## [1] 5.849921
shapiro.test(x)
##
## Shapiro-Wilk normality test
##
## data: x
## W = 0.90418, p-value < 2.2e-16
ks.test(rnorm(10000), rnorm(100))
par(mfrow = c(2,1))
hist(rnorm(1000))
hist(rgamma(1000,2,2))
ks.test(rnorm(1000), rgamma(1000,2,2))
hist(rgamma(1000,1,1))
hist(rgamma(1000,2,2))
ks.test(rgamma(1000,1,1), rgamma(1000,2,2))
par(mfrow = c(1,1))
set.seed(400)
x <- rnorm(50, mean=1)
summary(x)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.5120 0.4014 0.7756 0.9503 1.6030 2.6350
y <- rgamma(50,1,1)
summary(y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.002572 0.127700 0.384200 0.837300 1.164000 4.315000
z <- c(8.11, 8.11, 8.09, 8.08, 8.06, 8.02, 7.99, 7.99, 7.97, 7.95, 7.92, 7.92, 7.92, 7.89, 7.87, 7.84, 7.79, 7.79, 7.77, 7.76, 7.72, 7.71, 7.66, 7.62, 7.61, 7.59, 7.55, 7.53, 7.50, 7.50, 7.42, 7.38, 7.38, 7.26, 7.25, 7.08, 6.96, 6.84, 6.55, 6.50, 6.45, 6.40, 6.38, 6.32, 6.30, 6.05, 6.00, 5.55, 5.50) # men jumps
summary(z)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.500 6.960 7.610 7.355 7.920 8.110
par(mfrow = c(3,3))
boxplot(x, main="Normal")
stripchart(x, vertical=TRUE, add=TRUE, method="jitter", pch=20)
loc <- 1:length(mean(x))
segments(loc-0.2, mean(x), loc+0.2, mean(x), lwd=2, col="red")
boxplot(y, main="Gamma")
stripchart(y, vertical=TRUE, add=TRUE, method="jitter", pch=20)
loc <- 1:length(mean(y))
segments(loc-0.2, mean(y), loc+0.2, mean(y), lwd=2, col="red")
boxplot(z, main="Left-skewed")
stripchart(z, vertical=TRUE, add=TRUE, method="jitter", pch=20)
loc <- 1:length(mean(z))
segments(loc-0.2, mean(z), loc+0.2, mean(z), lwd=2, col="red")
hist(x)
segments(median(x), 0, median(x), 12, lwd=2, col="blue")
segments(mean(x), 0, mean(x), 12, lwd=2, col="red")
hist(y)
segments(median(y), 0, median(y), 30, lwd=2, col="blue")
segments(mean(y), 0, mean(y), 30, lwd=2, col="red")
hist(z)
segments(median(z), 0, median(z), 20, lwd=2, col="blue")
segments(mean(z), 0, mean(z), 20, lwd=2, col="red")
qqPlot(x) #library(car)
qqPlot(y) #library(car)
qqPlot(z) #library(car)
par(mfrow = c(1,1))
Q3.3. Do log-transformation of a gamma-sample (3,2) and run normality tests on transformed data. Which transformation was the best taking into account the Shapiro test? Looking at qqPlot(x)?
shape <- 3
scale <- 2
x <- rgamma(100, shape, scale)
skewness(x)
## [1] 1.257593
kurtosis(x)
## [1] 5.088152
shapiro.test(x)
##
## Shapiro-Wilk normality test
##
## data: x
## W = 0.91019, p-value = 4.402e-06
y <- log(x + 1) # try also log10, log2
skewness(y)
## [1] 0.3548871
kurtosis(y)
## [1] 2.795442
shapiro.test(y)
##
## Shapiro-Wilk normality test
##
## data: y
## W = 0.98435, p-value = 0.2849
par(mfrow = c(2,2))
hist(x)
hist(y)
qqPlot(x) #library(car)
qqPlot(y) #library(car)
par(mfrow = c(1,1))
#install.packages("e1071")
library(e1071)
library(datasets)
data(airquality)
par(mfrow = c(3,2))
x <- airquality$Ozone
x <- x[!is.na(x)]
skewness(x)
hist(x)
qqPlot(x) #library(car)
shapiro.test(x)
y1 <- log(x+1)
skewness(y1)
hist(y1)
qqPlot(y1) #library(car)
shapiro.test(y1)
skew.score <- function(c, x) (skewness(log(x + c)))^2
coef <- optimise(skew.score, c(0, 20), x = x)$minimum
y2 <- log(x + coef)
skewness(y2)
hist(y2)
qqPlot(y2) #library(car)
shapiro.test(y2)
par(mfrow = c(1,1))
x <- c(8.11, 8.11, 8.09, 8.08, 8.06, 8.02, 7.99, 7.99, 7.97, 7.95, 7.92, 7.92, 7.92, 7.89, 7.87, 7.84, 7.79, 7.79, 7.77, 7.76, 7.72, 7.71, 7.66, 7.62, 7.61, 7.59, 7.55, 7.53, 7.50, 7.50, 7.42, 7.38, 7.38, 7.26, 7.25, 7.08, 6.96, 6.84, 6.55) # London Olympic 2012, men jumps
par(mfrow = c(3,2))
skewness(x)
hist(x)
qqPlot(x)
shapiro.test(x)
y1 <- x^2
skewness(y1)
hist(y1)
qqPlot(y1)
shapiro.test(y1)
y2 <- x^10
hist(y2)
qqPlot(y2)
shapiro.test(y2)
par(mfrow = c(1,1))
lambda <- 4
x <- rpois(100, lambda)
par(mfrow = c(2,1))
hist(x)
qqPlot(x) #library(car)
par(mfrow = c(1,1))
See also http://stats.stackexchange.com/questions/101274/how-to-interpret-a-qq-plot?rq=1
Non-parametric test | Parametric test | Charactiristics | Usage |
---|---|---|---|
One-sample t test (variance uknown & n<40) or z test (otherwise) | One-sample Sign test | Test on the median for non-symmetric distribution | SIGN.test(x) from package “BSDA” |
One-sample Wilcoxon test | Test on the median for symmetric distribution | wilcox.test(x) | |
Two sample t test Paired | Wilcoxon signed-rank test | Test on the medians of two paired samples | wilcox.test(x,y, paired=TRUE) |
Two sample t test Unpaired | Mann-Whitney test on ranks | Test on the medians of two independent samples using ranks | wilcox.test(x,y) |
One-way ANOVA | Kruskal-Wallis test | Test on equality of medians from two or more populations | kruskal.test() |
Mood's median test | ibid. Less robust to outliers but more powerful than Kruskal-Wallis | mood.test() |
The sign test is a method to find consistent ordinal differences between pairs of observations. It determines if one member in the pair of observations tends to be greater than the other member. Unlike t-test, there is no assumption of normality for small samples, neither any other assumption about the nature of the random variable.
#install.packages("BSDA")
library(BSDA)
Q4.1. The patients were treated by the weight loss drug. 10 patients, 5 reduced weight, 3 gained. Do these data suggest that the weight-control treatment works (that is, can we reject the null hypothesis that there is no difference in patients' weight between before and after treatment?
x1 <- c(200,202,194,188,166,196,180,188,180,210)
x2 <- c(202,204,167,192,166,190,176,182,180,202)
x <- x2 - x1
hist(x)
SIGN.test(x, md=0, alternative = "less")
#p-value can be calculated directly
size <- length(x)
win <- sum(x<0)
p <- 1 - pbinom(win, size, p=0.5)
p-value = 0.3633 => There is no enough evidence to reject H0.
Q4.2. Test H0 that the weight in group x1 is above 180 pounds. Do we have enough evidence to reject this H0?
hist(x1)
size <- length(x1)
win <- sum(x1 > 180)
p <- pbinom(win, size, p=0.5)
SIGN.test(x1 - 180, md=0, alternative = "less")
Q4.3. Test H0 that the weight in group x2 is below 167 pounds. Do we have enough evidence to reject this H0?
hist(x2)
size <- length(x2)
win <- sum(x2 < 167)
p <- pbinom(win, size, p=0.5) #P(X ≥ 6) = P(X = 6)+P(X = 7)+···+P(X = 10)
SIGN.test(x1 - 167, md=0, alternative = "greater")
(the test runs with continuity correction by default, can be switched off).
The test is used to assess whether the differences are symmetric and centered around zero.
H0: differences from the median (one-sample test) or differences in paired measuments follow a symmetrical distribution.
Ha: differences don’t follow a symmetric distribution around zero
The problem. The effect of a drug on the hormon level was measured in 12 patients before and after the treatment. Was the drug effective?
x1 <- c(125.3, 101, 117.2, 133.7, 96.4, 124.5, 118.7, 106.2, 116.3, 120.2, 125, 128.8)
x2 <- c(127.3, 120.2, 126.2, 125.4, 115.1, 118.5, 135.5, 118.2, 122.9, 120.1, 120.8, 130.7)
df <- data.frame(cbind(x1,x2))
boxplot(df)
wilcox.test(x1,x2, paired=T, alternative="less")
H0 is rejected at the 5% significance level.
Q4.4. Some physiological test measurements were conducted on 9 patients taken at the first (x) and second (y) visits after initiation of a therapy. Do you have enough evidence to reject H0 that the therapy did not work if it was intended to lower that physiological parameter?
x <- c(1.83, 0.50, 1.62, 2.48, 1.68, 1.88, 1.55, 3.06, 1.30)
y <- c(0.878, 0.647, 0.598, 2.05, 1.06, 1.29, 1.06, 3.14, 1.29)
df <- data.frame(cbind(x,y))
boxplot(df)
wilcox.test(y-x, alternative = "less") #one-sample test for differences
H0 is rejected at the 5% significance level (p-value < 0.02).
(the test runs with continuity correction by default, can be switched off). H0: samples come from the same population.
Q4.5. In two independent groups of pregnant women a level of some hormon was measured at two different periods of pregnancy (x and y). H0: the hormon level is the same; Ha: the hormon level is greater in the second group.
x <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46)
y <- c(1.15, 0.88, 0.90, 0.74, 1.21)
df <- data.frame(cbind(x,y))
boxplot(df)
wilcox.test(x, y, alternative = "greater")
There is no enough evidence to reject H0 at the 5% significance level.
x <- c(2.9, 3.0, 2.5, 2.6, 3.2) # normal subjects
y <- c(3.8, 2.7, 4.0, 2.4) # with obstructive airway disease
z <- c(2.8, 3.4, 3.7, 2.2, 2.0) # with asbestosis
df <- data.frame(cbind(x,y,z))
boxplot(df)
kruskal.test(list(x, y, z))
Let's add one more group
w <- c(2.2, 1.8, 2.1, 1.5) # lung cancer subjects
df <- data.frame(cbind(x,y,z,w))
boxplot(df)
kruskal.test(list(x, y, z, w))