Язык R

и его применение в биоинформатике

Лекция 11

Анастасия Жарикова

17 ноября 2023

library(tidyverse)
library(ggpubr)
library(Stat2Data)
library(hexSticker)

library(hexSticker)

sticker("img/sticker.webp", package = '', 
        p_size=20, s_x=1, s_y=0.95, s_width=0.8, s_height=0.8,
        filename="img/sticker_hex.png", h_fill="white", h_color="#1d3557")

Данные про рюкзаки

data(Backpack)
head(Backpack)
  BackpackWeight BodyWeight     Ratio BackProblems      Major Year    Sex
1              9        125 0.0720000            1        Bio    3 Female
2              8        195 0.0410256            0 Philosophy    5   Male
3             10        120 0.0833333            1        GRC    4 Female
4              6        155 0.0387097            0        CSC    6   Male
5              8        180 0.0444444            0         EE    2 Female
6              5        240 0.0208333            0    History    0   Male
  Status Units
1      U    13
2      U    12
3      U    14
4      G     0
5      U    14
6      G     0

Веса указаны в фунтах

1 фунт = 0,453592 кг

Переведите веса из фунтов в кг

back <- Backpack %>%
  mutate(backpack_kg = 0.45359237 * BackpackWeight,
         body_kg = 0.45359237 * BodyWeight)
head(back)
  BackpackWeight BodyWeight     Ratio BackProblems      Major Year    Sex
1              9        125 0.0720000            1        Bio    3 Female
2              8        195 0.0410256            0 Philosophy    5   Male
3             10        120 0.0833333            1        GRC    4 Female
4              6        155 0.0387097            0        CSC    6   Male
5              8        180 0.0444444            0         EE    2 Female
6              5        240 0.0208333            0    History    0   Male
  Status Units backpack_kg   body_kg
1      U    13    4.082331  56.69905
2      U    12    3.628739  88.45051
3      U    14    4.535924  54.43108
4      G     0    2.721554  70.30682
5      U    14    3.628739  81.64663
6      G     0    2.267962 108.86217

cor.test(back$backpack_kg, back$body_kg)

    Pearson's product-moment correlation

data:  back$backpack_kg and back$body_kg
t = 1.9088, df = 98, p-value = 0.05921
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.007360697  0.371918344
sample estimates:
      cor 
0.1893312 

Зависит ли размер рюкзака от массы тела?

Простая линейная регрессия

Предиктор (независимая переменная) ~ объясняемая переменная

lm()

model <- lm(backpack_kg ~ body_kg, data = back)
model

Call:
lm(formula = backpack_kg ~ body_kg, data = back)

Coefficients:
(Intercept)      body_kg  
    2.71125      0.03713  

ggplot(data = back,aes(x = body_kg, y = backpack_kg))+
  geom_point(alpha = 0.3)+
  geom_abline(slope = model$coefficients[2], intercept = model$coefficients[1]) +
  theme_bw()

Сколько будет весить рюкзак у студента весом в 100 кг?

predict(model, newdata = data.frame(body_kg = 100))
       1 
6.424229 

summary(model)

Call:
lm(formula = backpack_kg ~ body_kg, data = back)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.4853 -1.7629 -0.4681  1.2893  9.8803 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)  
(Intercept)  2.71125    1.37483   1.972   0.0514 .
body_kg      0.03713    0.01945   1.909   0.0592 .
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.581 on 98 degrees of freedom
Multiple R-squared:  0.03585,   Adjusted R-squared:  0.02601 
F-statistic: 3.644 on 1 and 98 DF,  p-value: 0.05921

plot(model)

model_mult <- lm(backpack_kg ~ body_kg + Units, data = back)
summary(model_mult)

Call:
lm(formula = backpack_kg ~ body_kg + Units, data = back)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.6221 -1.8347 -0.5023  1.2519 10.0623 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)  
(Intercept)  0.28481    2.16170   0.132   0.8955  
body_kg      0.04391    0.01990   2.207   0.0297 *
Units        0.13703    0.09456   1.449   0.1505  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.566 on 97 degrees of freedom
Multiple R-squared:  0.05628,   Adjusted R-squared:  0.03682 
F-statistic: 2.892 on 2 and 97 DF,  p-value: 0.06025

ggline(iris, x = "Species", y = "Petal.Length", 
       add = c("mean_se", "jitter"), 
       ylab = "Weight", xlab = "Treatment")

res.aov <- aov(Petal.Length ~ Species, data = iris)
summary(res.aov)
             Df Sum Sq Mean Sq F value Pr(>F)    
Species       2  437.1  218.55    1180 <2e-16 ***
Residuals   147   27.2    0.19                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

TukeyHSD(res.aov)
  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = Petal.Length ~ Species, data = iris)

$Species
                      diff     lwr     upr p adj
versicolor-setosa    2.798 2.59422 3.00178     0
virginica-setosa     4.090 3.88622 4.29378     0
virginica-versicolor 1.292 1.08822 1.49578     0

pairwise.t.test(iris$Petal.Length, iris$Species)

    Pairwise comparisons using t tests with pooled SD 

data:  iris$Petal.Length and iris$Species 

           setosa versicolor
versicolor <2e-16 -         
virginica  <2e-16 <2e-16    

P value adjustment method: holm 

Конец!