Язык R

и его применение в биоинформатике

Лекция 12

Анастасия Жарикова

22 ноября 2024

Несколько гистограмм

library(tidyverse)
iris %>%
  ggplot(aes(x = Sepal.Length, fill = Species)) + 
  geom_histogram() 

Несколько гистограмм

library(tidyverse)
iris %>%
  ggplot(aes(x = Sepal.Length, fill = Species), alpha = 0.7) + 
  geom_histogram() +
  theme_bw() +
  xlab("Sepal length") + ylab("Count") +
  theme(axis.text = element_text(size=16),
        axis.title = element_text(size = 20),
        legend.text = element_text(size = 14),
        legend.title = element_text(size = 20))

ridgeline

# remotes::install_github("R-CoderDotCom/ridgeline@main")
library(ridgeline)

ridgeline(iris$Sepal.Length, iris$Species)

ridgeline

ridgeline(iris$Sepal.Length, iris$Species, mode = TRUE)

ggplot + geom_density_ridges

library(ggridges)
theme_set(theme_minimal())
ggplot(iris, aes(x = Sepal.Length, y = Species, group = Species)) + 
  geom_density_ridges(fill = "coral")

ggplot + geom_density_ridges2

ggplot(iris, aes(x = Sepal.Length, y = Species, group = Species)) + 
  geom_density_ridges2(fill = "coral")

ggplot + geom_density_ridges

ggplot(iris, aes(x = Sepal.Length, y = Species, group = Species)) + 
  geom_density_ridges(fill = "coral", rel_min_height = 0.01)

Зубы

library(ggpubr)
library(rstatix)
df <- ToothGrowth
df$dose <- as.factor(df$dose)
head(df, 3)
   len supp dose
1  4.2   VC  0.5
2 11.5   VC  0.5
3  7.3   VC  0.5
table(df$supp)

OJ VC 
30 30 
table(df$dose)

0.5   1   2 
 20  20  20 

The response is the length of odontoblasts (cells responsible for tooth growth) in 60 guinea pigs. Each animal received one of three dose levels of vitamin C (0.5, 1, and 2 mg/day) by one of two delivery methods, orange juice or ascorbic acid (a form of vitamin C and coded as VC).

df
    len supp dose
1   4.2   VC  0.5
2  11.5   VC  0.5
3   7.3   VC  0.5
4   5.8   VC  0.5
5   6.4   VC  0.5
6  10.0   VC  0.5
7  11.2   VC  0.5
8  11.2   VC  0.5
9   5.2   VC  0.5
10  7.0   VC  0.5
11 16.5   VC    1
12 16.5   VC    1
13 15.2   VC    1
14 17.3   VC    1
15 22.5   VC    1
16 17.3   VC    1
17 13.6   VC    1
18 14.5   VC    1
19 18.8   VC    1
20 15.5   VC    1
21 23.6   VC    2
22 18.5   VC    2
23 33.9   VC    2
24 25.5   VC    2
25 26.4   VC    2
26 32.5   VC    2
27 26.7   VC    2
28 21.5   VC    2
29 23.3   VC    2
30 29.5   VC    2
31 15.2   OJ  0.5
32 21.5   OJ  0.5
33 17.6   OJ  0.5
34  9.7   OJ  0.5
35 14.5   OJ  0.5
36 10.0   OJ  0.5
37  8.2   OJ  0.5
38  9.4   OJ  0.5
39 16.5   OJ  0.5
40  9.7   OJ  0.5
41 19.7   OJ    1
42 23.3   OJ    1
43 23.6   OJ    1
44 26.4   OJ    1
45 20.0   OJ    1
46 25.2   OJ    1
47 25.8   OJ    1
48 21.2   OJ    1
49 14.5   OJ    1
50 27.3   OJ    1
51 25.5   OJ    2
52 26.4   OJ    2
53 22.4   OJ    2
54 24.5   OJ    2
55 24.8   OJ    2
56 30.9   OJ    2
57 26.4   OJ    2
58 27.3   OJ    2
59 29.4   OJ    2
60 23.0   OJ    2

stat test

stat.test <- df %>%
  group_by(dose) %>%
  t_test(len ~ supp) %>%
  adjust_pvalue(method = "bonferroni") %>%
  add_significance("p.adj")
stat.test
# A tibble: 3 × 11
  dose  .y.   group1 group2    n1    n2 statistic    df       p   p.adj
  <fct> <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>   <dbl>   <dbl>
1 0.5   len   OJ     VC        10    10    3.17    15.0 0.00636 0.0191 
2 1     len   OJ     VC        10    10    4.03    15.4 0.00104 0.00312
3 2     len   OJ     VC        10    10   -0.0461  14.0 0.964   1      
# ℹ 1 more variable: p.adj.signif <chr>

bxp <- ggboxplot(
  df, x = "dose", y = "len", 
  color = "supp", palette = c("darkgreen", "#E7B800")
  )
bxp

stat.test <- stat.test %>%
  add_xy_position(x = "dose", dodge = 0.8)
bxp + stat_pvalue_manual(
  stat.test,  label = "p.adj", tip.length = 0.05
  )

bxp + stat_pvalue_manual(
  stat.test,  label = "p.adj", tip.length = 0.01
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.5)))

bxp + stat_pvalue_manual(
  stat.test,  label = "p.adj", tip.length = 0,
  remove.bracket = TRUE
  )

bxp + stat_pvalue_manual(
  stat.test,  label = "{p.adj}{p.adj.signif}", 
  tip.length = 0, hide.ns = TRUE
  )

stat.test2 <- df %>%
  t_test(len ~ dose, p.adjust.method = "bonferroni")
stat.test2
# A tibble: 3 × 10
  .y.   group1 group2    n1    n2 statistic    df        p    p.adj p.adj.signif
* <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>    <dbl>    <dbl> <chr>       
1 len   0.5    1         20    20     -6.48  38.0 1.27e- 7 3.81e- 7 ****        
2 len   0.5    2         20    20    -11.8   36.9 4.40e-14 1.32e-13 ****        
3 len   1      2         20    20     -4.90  37.1 1.91e- 5 5.73e- 5 ****        

stat.test <- stat.test %>%
  add_xy_position(x = "dose", dodge = 0.8)
bxp.complex <- bxp + stat_pvalue_manual(
  stat.test,  label = "p.adj", tip.length = 0
  )
bxp.complex

stat.test2 <- stat.test2 %>% add_xy_position(x = "dose")
bxp.complex <- bxp.complex + 
  stat_pvalue_manual(
    stat.test2,  label = "p.adj", tip.length = 0.02,
    step.increase = 0.05
  ) +
  scale_y_continuous(expand = expansion(mult = c(0.05, 0.1)))
bxp.complex

bp <- ggbarplot(
  df, x = "dose", y = "len", add = "mean_sd", 
  color= "supp", palette = c("darkgreen", "#E7B800"),
  position = position_dodge(0.8)
  )
bp

stat.test <- stat.test %>%
  add_xy_position(fun = "mean_sd", x = "dose", dodge = 0.8) 
bp + stat_pvalue_manual(
  stat.test,  label = "p.adj.signif", tip.length = 0.01
  )

bp + stat_pvalue_manual(
  stat.test,  label = "p.adj.signif", tip.length = 0,
  bracket.nudge.y = -2
  )

lp <- ggline(
  df, x = "dose", y = "len", add = "mean_sd", 
  color = "supp", palette = c("darkgreen", "#E7B800")
  )
lp

stat.test <- stat.test %>%
  add_xy_position(fun = "mean_sd", x = "dose") 
lp + stat_pvalue_manual(
  stat.test,  label = "p.adj.signif", 
  tip.length = 0, linetype  = "blank"
  )

lp + stat_pvalue_manual(
  stat.test,  label = "p.adj.signif", 
  linetype  = "blank", vjust = 2
  )

pwc <- df %>%
  group_by(supp) %>%
  t_test(len ~ dose, p.adjust.method = "bonferroni")
pwc
# A tibble: 6 × 11
  supp  .y.   group1 group2    n1    n2 statistic    df            p      p.adj
* <fct> <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>        <dbl>      <dbl>
1 OJ    len   0.5    1         10    10     -5.05  17.7 0.0000878    0.000263  
2 OJ    len   0.5    2         10    10     -7.82  14.7 0.00000132   0.00000396
3 OJ    len   1      2         10    10     -2.25  15.8 0.039        0.118     
4 VC    len   0.5    1         10    10     -7.46  17.9 0.000000681  0.00000204
5 VC    len   0.5    2         10    10    -10.4   14.3 0.0000000468 0.00000014
6 VC    len   1      2         10    10     -5.47  13.6 0.0000916    0.000275  
# ℹ 1 more variable: p.adj.signif <chr>

pwc <- pwc %>% add_xy_position(x = "dose")
bxp +
  stat_pvalue_manual(
    pwc, color = "supp", step.group.by = "supp",
    tip.length = 0, step.increase = 0.1
    )

pwc <- pwc %>% add_xy_position(x = "dose", group = "supp")
bxp +
  stat_pvalue_manual(
    pwc, color = "supp", step.group.by = "supp",
    tip.length = 0, step.increase = 0.1
  )

Данные про рюкзаки

library(Stat2Data)
data(Backpack)
head(Backpack)
  BackpackWeight BodyWeight     Ratio BackProblems      Major Year    Sex
1              9        125 0.0720000            1        Bio    3 Female
2              8        195 0.0410256            0 Philosophy    5   Male
3             10        120 0.0833333            1        GRC    4 Female
4              6        155 0.0387097            0        CSC    6   Male
5              8        180 0.0444444            0         EE    2 Female
6              5        240 0.0208333            0    History    0   Male
  Status Units
1      U    13
2      U    12
3      U    14
4      G     0
5      U    14
6      G     0

Веса указаны в фунтах

1 фунт = 0,453592 кг

Переведите веса из фунтов в кг

back <- Backpack %>%
  mutate(backpack_kg = 0.45359237 * BackpackWeight,
         body_kg = 0.45359237 * BodyWeight)
head(back)
  BackpackWeight BodyWeight     Ratio BackProblems      Major Year    Sex
1              9        125 0.0720000            1        Bio    3 Female
2              8        195 0.0410256            0 Philosophy    5   Male
3             10        120 0.0833333            1        GRC    4 Female
4              6        155 0.0387097            0        CSC    6   Male
5              8        180 0.0444444            0         EE    2 Female
6              5        240 0.0208333            0    History    0   Male
  Status Units backpack_kg   body_kg
1      U    13    4.082331  56.69905
2      U    12    3.628739  88.45051
3      U    14    4.535924  54.43108
4      G     0    2.721554  70.30682
5      U    14    3.628739  81.64663
6      G     0    2.267962 108.86217

cor.test(back$backpack_kg, back$body_kg)

    Pearson's product-moment correlation

data:  back$backpack_kg and back$body_kg
t = 1.9088, df = 98, p-value = 0.05921
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.007360697  0.371918344
sample estimates:
      cor 
0.1893312 

Зависит ли размер рюкзака от массы тела?

Рисуем

back %>%
  ggplot(aes(y = backpack_kg, x = body_kg)) + 
  geom_point(size = 6, shape = 21, alpha = 0.7, fill = 'darkgreen', color = 'black') + 
  theme_bw() +
  xlab("Body (kg)") + ylab("Backpack (kg)") +
  theme(axis.text = element_text(size=16),
        axis.title = element_text(size = 20),
        legend.text = element_text(size = 14),
        legend.title = element_text(size = 20))

Простая линейная регрессия

Предиктор (независимая переменная) ~ объясняемая переменная

lm()

model <- lm(backpack_kg ~ body_kg, data = back)
model

Call:
lm(formula = backpack_kg ~ body_kg, data = back)

Coefficients:
(Intercept)      body_kg  
    2.71125      0.03713  

ggplot(data = back,aes(x = body_kg, y = backpack_kg))+
  geom_point(size = 6, shape = 21, alpha = 0.7, fill = 'darkgreen', color = 'black')+
  geom_abline(slope = model$coefficients[2], intercept = model$coefficients[1], size = 2,color = 'darkred') +
  theme_bw() + 
  xlab("Body (kg)") + ylab("Backpack (kg)") +
  theme(axis.text = element_text(size=16),
        axis.title = element_text(size = 20),
        legend.text = element_text(size = 14),
        legend.title = element_text(size = 20))

Сколько будет весить рюкзак у студента весом в 100 кг?

predict(model, newdata = data.frame(body_kg = 100))
       1 
6.424229 

Множественная линейная регрессия

set.seed(42)
u <- rnorm(100)
v <- rnorm(100, mean = 3, sd = 2)
w <- rnorm(100, mean = -3, sd = 1)
e <- rnorm(100, mean = 0, sd = 3)

y <- 5 + 4*u + 3*v + 2*w + e

lm()

m <- lm(y ~ u + v + w)
m

Call:
lm(formula = y ~ u + v + w)

Coefficients:
(Intercept)            u            v            w  
      4.770        4.173        3.013        1.905  

summary(m)

Call:
lm(formula = y ~ u + v + w)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.3832 -1.7601 -0.3115  1.8565  6.9840 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   4.7703     0.9691   4.922 3.55e-06 ***
u             4.1733     0.2597  16.069  < 2e-16 ***
v             3.0132     0.1484  20.311  < 2e-16 ***
w             1.9052     0.2665   7.150 1.71e-10 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.66 on 96 degrees of freedom
Multiple R-squared:  0.8851,    Adjusted R-squared:  0.8815 
F-statistic: 246.6 on 3 and 96 DF,  p-value: < 2.2e-16

Модельные коэффициенты

coef(m)
(Intercept)           u           v           w 
   4.770280    4.173262    3.013244    1.905167 

Доверительные интервалы коэффициентов модели

confint(m)
               2.5 %   97.5 %
(Intercept) 2.846622 6.693939
u           3.657753 4.688771
v           2.718762 3.307726
w           1.376234 2.434101

Остатки

resid(m)
          1           2           3           4           5           6 
-0.56747105  2.28803716  0.09719916  2.14740500 -0.71688199 -0.36171794 
          7           8           9          10          11          12 
 1.03495927  2.80400382 -4.24963443 -0.20484262 -0.64673080 -2.57715137 
         13          14          15          16          17          18 
-2.93387274 -1.93302285  1.78001250 -1.43998837 -2.39887037  0.92448198 
         19          20          21          22          23          24 
-3.36631495  2.68902334 -1.41903066  0.78712300  0.03550952 -0.38058349 
         25          26          27          28          29          30 
 5.04594297 -2.50106882  3.45163338  0.33710226 -2.70985131 -0.07610591 
         31          32          33          34          35          36 
 2.02608607 -1.39023547 -2.70405128  0.39534037  2.72010944 -0.02540898 
         37          38          39          40          41          42 
-3.98866378 -3.90109157 -1.94580315 -1.77007062 -0.26135548  2.09770391 
         43          44          45          46          47          48 
-1.39859792 -3.19102607  1.84391208  0.82178224  3.62728794 -5.38322002 
         49          50          51          52          53          54 
 0.29046722  3.78783896  1.91936090 -2.41061043  1.68549007 -2.79641208 
         55          56          57          58          59          60 
-1.33479747  3.35493092 -1.15246322  2.40117746 -0.53197500 -4.94341180 
         61          62          63          64          65          66 
-2.48991423 -3.27182922 -1.61611118 -1.51194409 -0.44932034 -0.98689884 
         67          68          69          70          71          72 
 5.62731823 -4.46257657 -1.75679992  0.80989421  5.03204632  0.16886524 
         73          74          75          76          77          78 
 3.57606086 -4.86679075  4.27809940 -2.13863250 -0.97390956 -3.63799681 
         79          80          81          82          83          84 
 0.57878900  5.56644767  6.98402323 -3.51188419  1.28423254  4.14446484 
         85          86          87          88          89          90 
-0.46300546 -0.78667310 -0.75653327  1.63836456  3.75781677  1.89415984 
         91          92          93          94          95          96 
 0.55421813 -0.86617701  1.20412400 -1.74009673 -0.72614713  3.27014630 
         97          98          99         100 
 1.40115444  0.94762951 -0.91404098  2.42783988 

Дисперсионный анализ

anova(m)
Analysis of Variance Table

Response: y
          Df  Sum Sq Mean Sq F value    Pr(>F)    
u          1 1776.24 1776.24 251.005 < 2.2e-16 ***
v          1 3097.08 3097.08 437.655 < 2.2e-16 ***
w          1  361.74  361.74  51.118 1.707e-10 ***
Residuals 96  679.35    7.08                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

plot

par(mfrow = c(2,2))
plot(m)

Конец!