set.seed(2016)
#install.packages("ggplot2")
#install.packages("lattice")
#install.packages("latticeExtra")
#install.packages("MASS")
#install.packages("car")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.4
library(lattice)
library(latticeExtra)
## Warning: package 'latticeExtra' was built under R version 3.2.3
## Loading required package: RColorBrewer
## 
## Attaching package: 'latticeExtra'
## The following object is masked from 'package:ggplot2':
## 
##     layer
library(MASS)
library(car)
## Warning: package 'car' was built under R version 3.2.4

Problems on linear regression

In this part of the seminar we are gonna to examine data about self-reported and true heights and weights in males and females.

summary(Davis)
##  sex         weight          height          repwt            repht      
##  F:112   Min.   : 39.0   Min.   : 57.0   Min.   : 41.00   Min.   :148.0  
##  M: 88   1st Qu.: 55.0   1st Qu.:164.0   1st Qu.: 55.00   1st Qu.:160.5  
##          Median : 63.0   Median :169.5   Median : 63.00   Median :168.0  
##          Mean   : 65.8   Mean   :170.0   Mean   : 65.62   Mean   :168.5  
##          3rd Qu.: 74.0   3rd Qu.:177.2   3rd Qu.: 73.50   3rd Qu.:175.0  
##          Max.   :166.0   Max.   :197.0   Max.   :124.00   Max.   :200.0  
##                                          NA's   :17       NA's   :17
males <- Davis[which(Davis$sex == "M"),]
females <- Davis[which(Davis$sex == "F"),]

lm_height_estim_vs_height_male <- lm(males$height ~ males$repht)
plot(jitter(males$height, factor=2) ~ jitter(males$repht, factor=2), xlab="Self-reported height (males)", ylab="Height (males)", cex=0.5, main="Red line = true (y = x), black line = fitted")
abline(lm_height_estim_vs_height_male)
abline(b = 1, a = 0, col="red")

summary(lm_height_estim_vs_height_male)
## 
## Call:
## lm(formula = males$height ~ males$repht)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.3397 -1.5701  0.1267  1.1939  5.7536 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 18.19803    5.78046   3.148  0.00231 ** 
## males$repht  0.90672    0.03277  27.669  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.058 on 80 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.9054, Adjusted R-squared:  0.9042 
## F-statistic: 765.6 on 1 and 80 DF,  p-value: < 2.2e-16
plot(lm_height_estim_vs_height_male  )

Q: Let’s assume that one guy made a joke and reported his height as too high. Exectute males[1,]$repht=300. Make the same analysis, interpret linear model’s summary. In order to solve this issue: change lm function to rlm.

Q1: Repeat the analysis for female’s weight. Interpret results. Find ouliers and influental points. Apply rlm.

ANOVA

(Example from http://homepages.inf.ed.ac.uk/bwebb/statistics/ANOVA_in_R.pdf)

attach(InsectSprays)
data(InsectSprays)
str(InsectSprays)
## 'data.frame':    72 obs. of  2 variables:
##  $ count: num  10 7 20 14 14 12 10 23 17 20 ...
##  $ spray: Factor w/ 6 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...

Examine the data.

At first, read about the data using ?InsectSprays command.

tapply(count, spray, mean)
##         A         B         C         D         E         F 
## 14.500000 15.333333  2.083333  4.916667  3.500000 16.666667
tapply(count, spray, var)
##         A         B         C         D         E         F 
## 22.272727 18.242424  3.901515  6.265152  3.000000 38.606061
tapply(count, spray, length)
##  A  B  C  D  E  F 
## 12 12 12 12 12 12
boxplot(count ~ spray)

Q: What can you say based on these plots?

One-way ANOVA

oneway.test(count~spray)
## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  count and spray
## F = 36.065, num df = 5.000, denom df = 30.043, p-value = 7.999e-12

Default is equal variances (i.e. homogeneity of variance) not assumed – i.e. Welch’s correction applied (and this explains why the denom df (which is normally k*(n-1)) is not a whole number in the output). To change this, set "var.equal=" option to TRUE.

Q: What can be concluded from this result? Compare result with the following:

aov.out = aov(count ~ spray, data=InsectSprays)
summary(aov.out)
##             Df Sum Sq Mean Sq F value Pr(>F)    
## spray        5   2669   533.8    34.7 <2e-16 ***
## Residuals   66   1015    15.4                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary.lm(aov.out)
## 
## Call:
## aov(formula = count ~ spray, data = InsectSprays)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.333 -1.958 -0.500  1.667  9.333 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  14.5000     1.1322  12.807  < 2e-16 ***
## sprayB        0.8333     1.6011   0.520    0.604    
## sprayC      -12.4167     1.6011  -7.755 7.27e-11 ***
## sprayD       -9.5833     1.6011  -5.985 9.82e-08 ***
## sprayE      -11.0000     1.6011  -6.870 2.75e-09 ***
## sprayF        2.1667     1.6011   1.353    0.181    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.922 on 66 degrees of freedom
## Multiple R-squared:  0.7244, Adjusted R-squared:  0.7036 
## F-statistic:  34.7 on 5 and 66 DF,  p-value: < 2.2e-16

Post-hoc tests

TukeyHSD(aov.out)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = count ~ spray, data = InsectSprays)
## 
## $spray
##            diff        lwr       upr     p adj
## B-A   0.8333333  -3.866075  5.532742 0.9951810
## C-A -12.4166667 -17.116075 -7.717258 0.0000000
## D-A  -9.5833333 -14.282742 -4.883925 0.0000014
## E-A -11.0000000 -15.699409 -6.300591 0.0000000
## F-A   2.1666667  -2.532742  6.866075 0.7542147
## C-B -13.2500000 -17.949409 -8.550591 0.0000000
## D-B -10.4166667 -15.116075 -5.717258 0.0000002
## E-B -11.8333333 -16.532742 -7.133925 0.0000000
## F-B   1.3333333  -3.366075  6.032742 0.9603075
## D-C   2.8333333  -1.866075  7.532742 0.4920707
## E-C   1.4166667  -3.282742  6.116075 0.9488669
## F-C  14.5833333   9.883925 19.282742 0.0000000
## E-D  -1.4166667  -6.116075  3.282742 0.9488669
## F-D  11.7500000   7.050591 16.449409 0.0000000
## F-E  13.1666667   8.467258 17.866075 0.0000000

Test assumptions

Homogenity of variance

bartlett.test(count ~ spray, data=InsectSprays)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  count by spray
## Bartlett's K-squared = 25.96, df = 5, p-value = 9.085e-05

Check of linear regression assumptions

plot(aov.out)

Q: Interpret this plots and result of the test.

Two-way ANOVA

df <- Salaries
summary(df)
##         rank     discipline yrs.since.phd    yrs.service        sex     
##  AsstProf : 67   A:181      Min.   : 1.00   Min.   : 0.00   Female: 39  
##  AssocProf: 64   B:216      1st Qu.:12.00   1st Qu.: 7.00   Male  :358  
##  Prof     :266              Median :21.00   Median :16.00               
##                             Mean   :22.31   Mean   :17.61               
##                             3rd Qu.:32.00   3rd Qu.:27.00               
##                             Max.   :56.00   Max.   :60.00               
##      salary      
##  Min.   : 57800  
##  1st Qu.: 91000  
##  Median :107300  
##  Mean   :113706  
##  3rd Qu.:134185  
##  Max.   :231545
par(mfrow=c(1,2))
plot(salary ~ sex + rank, data=df)

results = lm(salary ~ sex + rank + sex:rank, data=df)
anova(results)
## Analysis of Variance Table
## 
## Response: salary
##            Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## sex         1 6.9800e+09 6.9800e+09  12.4515 0.0004673 ***
## rank        2 1.3709e+11 6.8546e+10 122.2787 < 2.2e-16 ***
## sex:rank    2 4.3603e+07 2.1802e+07   0.0389 0.9618588    
## Residuals 391 2.1918e+11 5.6057e+08                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
qqnorm(results$res)
model.aov <- aov(results)
TukeyHSD(model.aov,
         conf.level = 0.95)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = results)
## 
## $sex
##                 diff      lwr      upr     p adj
## Male-Female 14088.01 6238.682 21937.34 0.0004673
## 
## $rank
##                        diff       lwr      upr     p adj
## AssocProf-AsstProf 12988.75  3252.618 22724.88 0.0051802
## Prof-AsstProf      44636.49 37022.332 52250.65 0.0000000
## Prof-AssocProf     31647.74 23892.343 39403.14 0.0000000
## 
## $`sex:rank`
##                                       diff         lwr      upr     p adj
## Male:AsstProf-Female:AsstProf     3261.555 -19101.0778 25624.19 0.9983696
## Female:AssocProf-Female:AsstProf 10462.891 -19164.2127 40089.99 0.9140328
## Male:AssocProf-Female:AsstProf   16819.795  -5610.7256 39250.31 0.2652311
## Female:Prof-Female:AsstProf      43917.702  17967.4073 69868.00 0.0000267
## Male:Prof-Female:AsstProf        49070.913  28177.7795 69964.05 0.0000000
## Female:AssocProf-Male:AsstProf    7201.336 -16077.1028 30479.77 0.9497493
## Male:AssocProf-Male:AsstProf     13558.239    625.7736 26490.71 0.0336773
## Female:Prof-Male:AsstProf        40656.147  22283.9333 59028.36 0.0000000
## Male:Prof-Male:AsstProf          45809.358  35777.2357 55841.48 0.0000000
## Male:AssocProf-Female:AssocProf   6356.904 -16986.7589 29700.57 0.9708114
## Female:Prof-Female:AssocProf     33454.811   6711.2859 60198.34 0.0051114
## Male:Prof-Female:AssocProf       38608.023  16737.4626 60478.58 0.0000098
## Female:Prof-Male:AssocProf       27097.907   8643.1217 45552.69 0.0004615
## Male:Prof-Male:AssocProf         32251.119  22068.5667 42433.67 0.0000000
## Male:Prof-Female:Prof             5153.211 -11398.9463 21705.37 0.9484003

Q: Try the same with applied (B) or theoretical (A) departments (df$discipline) instead of sex.

Data transformation

The need of data transformation is depending on what model do you want to use. It is a powerful technique, however, I do not recommend to use it blindly. Even if your diagnostic plots are looking good for transformed data, sometimes it is better to use more sophisticated model.

Data transformation is an application of function to the data for different purposes:

  1. Normality of data

  2. Visualisation

  3. Variance stabilization

Mathematically: you have set of points x, you apply function f() to the data and obtain new dataset z = f(x). Desired properties of f: it should be continuous and reversible, so you can make transformation, perform some tests/other actions and then reverse it back.

Normality

Simplest thing - Z-score, in R it does scale() function:

set.seed(912)
vect <- rpois(10000, lambda=20)
plot(density(scale(vect)))
x   <- seq(-5,5,length=1000)
lines(x, dnorm(x), type="l",col="red")

qqnorm(scale(vect))
qqline(scale(vect), col="red")
abline(v=c(-2,2),h=c(-2,2))

Q: Do the same with sqrt(vect + 3/8) (Anscombe’s transformation).

Q: Change bandwidth of density: add to the third line plot(density(scale(vect))) parameter width=0.01.

set.seed(1)
hist(replicate(10000, shapiro.test(rpois(100, lambda=80))$p.value))

Q: Change to sqrt(x + 3/8) and look at the histogram of p-values.

vect <- rnbinom(10000, size=5, mu=80)
plot(density((vect)))

plot(density(scale(vect)))

plot(density(scale(sqrt(vect + 3/8))))

However, for more skewed distributions sqrt transformation are not as effective.

set.seed(1)
hist(replicate(10000, shapiro.test(rnbinom(100, size=5, mu=80))$p.value))

hist(replicate(10000, shapiro.test(sqrt(rnbinom(100, size=5, mu=80) + 3/8))$p.value))

Log-normal distribution:

vect <- exp(rnorm(10000))
plot(density(vect))

plot(density(log(vect)))

Variance stabilization

Suppose you have started an experiment: you divided your bacterial culture into 100 parts of equal size, added chemicals with different concentration that affect DNA and started sequencing it in order to estimate mutation rate with conentration.

set.seed(6)
x <- runif(100, 1, 50)
y <- x + rnorm(100, mean=0, sd=x/4)
y_vs_x <- lm(y ~ x)
plot(y ~ x)
abline(y_vs_x)

plot(y_vs_x)

vect_big <- rpois(10000, 1000)
vect_small <- rpois(10000, 10)
plot(density(vect_small), xlim=c(min(vect_big, vect_small), max(vect_big, vect_small)))
lines(density(vect_big))

plot(density(sqrt(vect_small + 3/8)), xlim=c(min(sqrt(vect_big), sqrt(vect_small)), max(sqrt(vect_big), sqrt(vect_small))))
lines(density(sqrt(vect_big + 3/8)))

Box-Cox

Univariate Box-Cox:

set.seed(1926)
data <- rnbinom(1000, size=5, mu=80)
plot(density(data))

summary(p1 <- powerTransform(data))
## bcPower Transformation to Normality 
## 
##      Est.Power Std.Err. Wald Lower Bound Wald Upper Bound
## data    0.3809   0.0526           0.2779            0.484
## 
## Likelihood ratio tests about transformation parameters
##                             LRT df         pval
## LR test, lambda = (0)  56.42637  1 5.839773e-14
## LR test, lambda = (1) 125.81142  1 0.000000e+00
transformedY <- bcPower(data, lambda <- p1$lambda)
plot(density(transformedY))

Box-Cox for regression:

N <- 1000
x <- rnorm(N, mean = 10, sd = 1.5)
beta0 <- 1
beta1 <- 2
beta1_2 <- 0.5
y <- (beta0 + beta1 * x + rnorm(N, sd = 0.5)) ** 10
plot(y ~ x)

boxcox(y ~ x)

p1 <- powerTransform(y ~ x)
p1$lambda
##         Y1 
## 0.09872552

Take lambda and transform y according to it.

Plots residuals vs. fitted

panel <- function(...) {
    panel.xyplot(...)
        panel.lmline(...)
}
N <- 1000
x <- rnorm(N)
beta0 <- 1
beta1 <- 2
beta1_2 <- 0.5
y <- exp(beta0 + beta1 * x + rnorm(N, sd = 0.5))
df <- data.frame(y = y, x = x)
l <- lm(y ~ x, data = df)
l2 <- lm(log(y) ~ x, data = df)
p <- xyplot(y ~ x, panel=panel)
p0 <- xyplot(log(y) ~ x, panel=panel)
plot(c(original = p, logarithm = p0))

p1 <- xyplot(residuals(l) ~ fitted(l), panel = panel)
p2 <- xyplot(residuals(l2) ~ fitted(l2),
    panel = panel)
plot(c(original = p1, logarithm = p2))

y <- beta0 + beta1 * x + beta1_2 * x^2 +
    rnorm(N, sd = 0.1)
df <- data.frame(y = y, x = x)
l <- lm(y ~ x, data = df)
l2 <- lm(y ~ poly(x, degree = 2), data = df)
p1 <- xyplot(residuals(l) ~ fitted(l), panel = panel)
p2 <- xyplot(residuals(l2) ~ fitted(l2),
    panel = panel)
plot(c(linear = p1, quadratic = p2))

Regression: interpretation

I.e., coverage depth of your sequencing experiment. Let us assume that we work with IonTorrent and PCR. You have 48 samples with equal library sizes and you want to study relationship between coverage of two regions. Region_1 is taken from houskeeping gene, Region_2 - one of interest.

plot(density(samples[,2]))

qqnorm(samples[,2])

m <- lm(Region_2 ~ Region_1, data = samples) 
samples_without_log <- cbind(samples, predict(m, interval = "prediction")) 
## Warning in predict.lm(m, interval = "prediction"): predictions on current data refer to _future_ responses
ggplot(samples_without_log, aes(x = Region_1)) + 
   geom_ribbon(aes(ymin = lwr, ymax = upr), 
               fill = "blue", alpha = 0.2) + 
   geom_point(aes(y = Region_2)) + 
   geom_line(aes(y = fit), colour = "blue", size = 1) 

qqnorm(m$residuals)

qqnorm(m$residuals[which(m$residuals < 1000)])

samples_log <- data.frame(apply(samples,2,log))
m <- lm(Region_2 ~ Region_1, data = samples_log) 
samples_log <- cbind(samples_log, predict(m, interval = "prediction")) 
## Warning in predict.lm(m, interval = "prediction"): predictions on current data refer to _future_ responses
ggplot(samples_log, aes(x = Region_1)) + 
   geom_ribbon(aes(ymin = lwr, ymax = upr), 
               fill = "blue", alpha = 0.2) + 
   geom_point(aes(y = Region_2)) + 
   geom_line(aes(y = fit), colour = "blue", size = 1) 

qqnorm(m$residuals)

Comparison of methods

Suppose you developed a new method for measuring some value. You have gold standard method and your method and you need to compare them.

Let us use correlations.

true_value <- runif(100, 20, 100)
gold_standard <- true_value + rnorm(100, mean=0, sd=1)
your_method <- true_value + rnorm(100, mean=log(true_value), sd=2)
plot(gold_standard ~ your_method)
abline(lm(gold_standard ~ your_method))
cor(gold_standard, your_method)
## [1] 0.9961077
abline(a=0,b=1,col="red")

Correlation are related to any linear relationship, but for method’s agreement you are interested only in one particular line y = x. Solution: Bland-Altman plots.

sum_of_methods <- gold_standard + your_method
diff_of_methods <- gold_standard - your_method
plot(sum_of_methods / 2, diff_of_methods)
abline(h=0)

print(c(mean(diff_of_methods) - 1.96 * sd(diff_of_methods), mean(diff_of_methods) + 1.96 * sd(diff_of_methods)))
## [1] -8.0024921 -0.3200175

Coordinates transformation

Log-Log plots are required for relationships of y = ax^k. You take log of both parts and get log(y) = log(a) + k * log(x).

paste(as.character(x), sep="' '", collapse=", ")
## [1] "93.7164649583865, 45.0379004671704, 69.1131034689024, 78.7675003104378, 21.1970542396884, 14.5570967763197, 5.99949701619335, 45.8251067476813, 22.8144320445135, 73.6253506531939, 44.5422760935035, 4.25461317854933, 21.1077077765949, 3.68404087773524, 72.2796415891498, 54.3618900952861, 99.2183673812542, 24.0932481547352, 76.6281159408391, 13.2686588761862"
paste(as.character(y), sep="' '", collapse=", ")
## [1] "1.04517491914865e+21, 686770075795499776, 49731773407922241536, 1.83865661695524e+20, 366256302464860, 8546291226648.34, 1208310118.77278, 816704312308425472, 764058501449566, 93606015182095712256, 614828585516686592, 38871455.7500388, 351108000575968, 9210313.22665865, 77837682928566091776, 4507912500192198656, 1.84905896614405e+21, 1318206756988135, 1.39608375092409e+20, 3382986660106.39"
plot(x, y)

Fit regression value and interpret the coefficients. Find a and k.