Задание 0

(1 балл) Установите seed согласно вашему номеру в ведомости курса.

set.seed(000)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Задание 1

(1 балл) Прочитайте набор данных про лосося (salmon_10k.csv, лежит на kodomo) средствами tidyverse. Отберите из прочитанной таблицы случайные шестьсот строк. Далее работайте с полученной на этом этапе таблицей с шестьюстами строками.

salmon_full <- read_csv('https://kodomo.fbb.msu.ru/FBB/year_22/lectures/salmon_10k.csv')
## Rows: 10000 Columns: 27
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (15): Species, Length.Measurement.Type, ASLProjectType, LocationID, Sex...
## dbl  (11): sampleYear, Length, Weight, Salt.Water.Age, fishNum, Fresh.Water....
## date  (1): sampleDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(salmon_full) 
## # A tibble: 6 × 27
##   Species Length.Measurement.T…¹ sampleYear ASLProjectType LocationID sampleDate
##   <chr>   <chr>                       <dbl> <chr>          <chr>      <date>    
## 1 chum    mid-eye to fork of ta…       2014 test fishing   Pilot Sta… 2014-06-29
## 2 sockeye mid-eye to fork of ta…       2009 commercial ca… Upper Cla… 2009-08-10
## 3 sockeye <NA>                         1998 commercial ca… Volcano B… 1998-07-18
## 4 sockeye mid-eye to fork of ta…       2000 commercial ca… Point Ard… 2000-07-31
## 5 chinook mid-eye to fork of ta…       1983 escapement     Anvik Riv… 1983-08-08
## 6 sockeye mid-eye to fork of ta…       1982 commercial ca… Alsek Riv… 1982-06-29
## # ℹ abbreviated name: ¹​Length.Measurement.Type
## # ℹ 21 more variables: Length <dbl>, Weight <dbl>, Sex <chr>,
## #   Salt.Water.Age <dbl>, DataSource <chr>, cardNo <chr>, fishNum <dbl>,
## #   Age.Error <chr>, Fresh.Water.Age <dbl>, Sex.Determination.Method <chr>,
## #   subSystem <chr>, Flag <chr>, Gear <chr>, SASAP.Region <chr>,
## #   LocationUnique <chr>, DistrictID <dbl>, Sub.DistrictID <dbl>,
## #   Stat.area <dbl>, Lat <dbl>, Lon <dbl>, AWC_CODE <chr>
dim(salmon_full)
## [1] 10000    27
str(salmon_full)
## spc_tbl_ [10,000 × 27] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Species                 : chr [1:10000] "chum" "sockeye" "sockeye" "sockeye" ...
##  $ Length.Measurement.Type : chr [1:10000] "mid-eye to fork of tail" "mid-eye to fork of tail" NA "mid-eye to fork of tail" ...
##  $ sampleYear              : num [1:10000] 2014 2009 1998 2000 1983 ...
##  $ ASLProjectType          : chr [1:10000] "test fishing" "commercial catch" "commercial catch" "commercial catch" ...
##  $ LocationID              : chr [1:10000] "Pilot Station" "Upper Clarence/Steamer Bay/Quiet Harbor" "Volcano Bay" "Point Arden to Midway Islands" ...
##  $ sampleDate              : Date[1:10000], format: "2014-06-29" "2009-08-10" ...
##  $ Length                  : num [1:10000] 663 NA NA NA 742 605 508 675 425 520 ...
##  $ Weight                  : num [1:10000] NA NA NA NA NA NA NA NA NA NA ...
##  $ Sex                     : chr [1:10000] "male" "female" NA "male" ...
##  $ Salt.Water.Age          : num [1:10000] NA 3 3 2 NA 3 2 3 2 NA ...
##  $ DataSource              : chr [1:10000] "ADFG AYK" "ADFG Southeast and Westward" "ADFG Southeast and Westward" "ADFG Southeast and Westward" ...
##  $ cardNo                  : chr [1:10000] NA NA NA NA ...
##  $ fishNum                 : num [1:10000] NA NA NA NA 6 NA NA NA NA NA ...
##  $ Age.Error               : chr [1:10000] NA NA NA "1    6" ...
##  $ Fresh.Water.Age         : num [1:10000] 0 1 2 1 NA 1 2 0 1 0 ...
##  $ Sex.Determination.Method: chr [1:10000] "External" NA NA NA ...
##  $ subSystem               : chr [1:10000] NA NA NA NA ...
##  $ Flag                    : chr [1:10000] NA NA NA NA ...
##  $ Gear                    : chr [1:10000] "gillnet" "gillnet" NA "gillnet" ...
##  $ SASAP.Region            : chr [1:10000] "Yukon" "Southeast" "Alaska Peninsula and Aleutian Islands" "Southeast" ...
##  $ LocationUnique          : chr [1:10000] "Pilot Station-test fishing-33423" "Upper Clarence/Steamer Bay/Quiet Harbor-commercial catch-10630" "Volcano Bay-commercial catch-28436" "Point Arden to Midway Islands-commercial catch-11131" ...
##  $ DistrictID              : num [1:10000] 334 106 284 111 334 182 324 115 111 NA ...
##  $ Sub.DistrictID          : num [1:10000] 3342 30 36 31 3344 ...
##  $ Stat.area               : num [1:10000] 33423 10630 28436 11131 33447 ...
##  $ Lat                     : num [1:10000] 62 NA NA NA 63 ...
##  $ Lon                     : num [1:10000] -163 NA NA NA -161 ...
##  $ AWC_CODE                : chr [1:10000] NA NA NA NA ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Species = col_character(),
##   ..   Length.Measurement.Type = col_character(),
##   ..   sampleYear = col_double(),
##   ..   ASLProjectType = col_character(),
##   ..   LocationID = col_character(),
##   ..   sampleDate = col_date(format = ""),
##   ..   Length = col_double(),
##   ..   Weight = col_double(),
##   ..   Sex = col_character(),
##   ..   Salt.Water.Age = col_double(),
##   ..   DataSource = col_character(),
##   ..   cardNo = col_character(),
##   ..   fishNum = col_double(),
##   ..   Age.Error = col_character(),
##   ..   Fresh.Water.Age = col_double(),
##   ..   Sex.Determination.Method = col_character(),
##   ..   subSystem = col_character(),
##   ..   Flag = col_character(),
##   ..   Gear = col_character(),
##   ..   SASAP.Region = col_character(),
##   ..   LocationUnique = col_character(),
##   ..   DistrictID = col_double(),
##   ..   Sub.DistrictID = col_double(),
##   ..   Stat.area = col_double(),
##   ..   Lat = col_double(),
##   ..   Lon = col_double(),
##   ..   AWC_CODE = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
salmon <- salmon_full %>%
  slice_sample(n = 600)

head(salmon) 
## # A tibble: 6 × 27
##   Species Length.Measurement.T…¹ sampleYear ASLProjectType LocationID sampleDate
##   <chr>   <chr>                       <dbl> <chr>          <chr>      <date>    
## 1 coho    mid-eye to fork of ta…       1998 test fishing   Pilot Sta… 1998-09-07
## 2 sockeye mid-eye to fork of ta…       1994 commercial ca… Cohoe-Nin… 1994-07-15
## 3 chum    mid-eye to fork of ta…       2011 commercial ca… Emmonak (… 2011-07-06
## 4 pink    mid-eye to fork of ta…       1969 escapement     Susitna R… 1969-08-03
## 5 sockeye <NA>                         2005 commercial ca… Moser - O… 2005-07-20
## 6 sockeye mid-eye to fork of ta…       1985 escapement     Kvichak R… 1985-07-16
## # ℹ abbreviated name: ¹​Length.Measurement.Type
## # ℹ 21 more variables: Length <dbl>, Weight <dbl>, Sex <chr>,
## #   Salt.Water.Age <dbl>, DataSource <chr>, cardNo <chr>, fishNum <dbl>,
## #   Age.Error <chr>, Fresh.Water.Age <dbl>, Sex.Determination.Method <chr>,
## #   subSystem <chr>, Flag <chr>, Gear <chr>, SASAP.Region <chr>,
## #   LocationUnique <chr>, DistrictID <dbl>, Sub.DistrictID <dbl>,
## #   Stat.area <dbl>, Lat <dbl>, Lon <dbl>, AWC_CODE <chr>
dim(salmon)
## [1] 600  27
str(salmon)
## spc_tbl_ [600 × 27] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Species                 : chr [1:600] "coho" "sockeye" "chum" "pink" ...
##  $ Length.Measurement.Type : chr [1:600] "mid-eye to fork of tail" "mid-eye to fork of tail" "mid-eye to fork of tail" "mid-eye to fork of tail" ...
##  $ sampleYear              : num [1:600] 1998 1994 2011 1969 2005 ...
##  $ ASLProjectType          : chr [1:600] "test fishing" "commercial catch" "commercial catch" "escapement" ...
##  $ LocationID              : chr [1:600] "Pilot Station" "Cohoe-Ninilchik" "Emmonak (Village/City)" "Susitna River" ...
##  $ sampleDate              : Date[1:600], format: "1998-09-07" "1994-07-15" ...
##  $ Length                  : num [1:600] 545 558 610 518 NA 540 457 508 493 560 ...
##  $ Weight                  : num [1:600] NA NA NA NA NA NA NA NA NA NA ...
##  $ Sex                     : chr [1:600] "female" "male" "male" "female" ...
##  $ Salt.Water.Age          : num [1:600] NA 3 4 2 3 2 2 NA 2 3 ...
##  $ DataSource              : chr [1:600] "ADFG AYK" "Upper Cook Inlet" "ADFG AYK" "Upper Cook Inlet" ...
##  $ cardNo                  : chr [1:600] NA "70" "2" "11" ...
##  $ fishNum                 : num [1:600] NA 7 26 12 NA 7 29 NA 30 5 ...
##  $ Age.Error               : chr [1:600] NA "-999" NA "-999" ...
##  $ Fresh.Water.Age         : num [1:600] NA 1 0 0 2 2 1 NA 1 0 ...
##  $ Sex.Determination.Method: chr [1:600] "External" NA "External" NA ...
##  $ subSystem               : chr [1:600] NA NA NA NA ...
##  $ Flag                    : chr [1:600] NA NA NA NA ...
##  $ Gear                    : chr [1:600] "gillnet" "gillnet" "gillnet" NA ...
##  $ SASAP.Region            : chr [1:600] "Yukon" "Cook Inlet" "Yukon" "Cook Inlet" ...
##  $ LocationUnique          : chr [1:600] "Pilot Station-test fishing-33423" "Cohoe-Ninilchik-commercial catch-244" "Emmonak (Village/City)-commercial catch-33414" "Susitna River-escapement-24741" ...
##  $ DistrictID              : num [1:600] 334 244 334 247 257 324 325 252 244 334 ...
##  $ Sub.DistrictID          : num [1:600] 3342 NA 3341 41 NA ...
##  $ Stat.area               : num [1:600] 33423 NA 33414 24741 NA ...
##  $ Lat                     : num [1:600] 62 NA NA 61.5 NA ...
##  $ Lon                     : num [1:600] -163 NA NA -151 NA ...
##  $ AWC_CODE                : chr [1:600] NA NA NA "247-41-10200" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Species = col_character(),
##   ..   Length.Measurement.Type = col_character(),
##   ..   sampleYear = col_double(),
##   ..   ASLProjectType = col_character(),
##   ..   LocationID = col_character(),
##   ..   sampleDate = col_date(format = ""),
##   ..   Length = col_double(),
##   ..   Weight = col_double(),
##   ..   Sex = col_character(),
##   ..   Salt.Water.Age = col_double(),
##   ..   DataSource = col_character(),
##   ..   cardNo = col_character(),
##   ..   fishNum = col_double(),
##   ..   Age.Error = col_character(),
##   ..   Fresh.Water.Age = col_double(),
##   ..   Sex.Determination.Method = col_character(),
##   ..   subSystem = col_character(),
##   ..   Flag = col_character(),
##   ..   Gear = col_character(),
##   ..   SASAP.Region = col_character(),
##   ..   LocationUnique = col_character(),
##   ..   DistrictID = col_double(),
##   ..   Sub.DistrictID = col_double(),
##   ..   Stat.area = col_double(),
##   ..   Lat = col_double(),
##   ..   Lon = col_double(),
##   ..   AWC_CODE = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

Задание 2

Изучите данные на наличие пропущенных значений.

(1 балл) Определите, есть ли хотя бы одно пропущенное значение в датасете (выведите TRUE или FALSE).

anyNA(salmon)
## [1] TRUE

(1 балл) Посчитайте, сколько пропущенных значений содержится в каждом столбце таблицы.

col <- c()
na_num <- c()

for(i in 1:ncol(salmon)){
  col[i] = colnames(salmon)[i]
  na_num[i] = sum(is.na(salmon[,i]))
}

tbl <- tibble(
    col = col,
    na_num = na_num
  )

print(tbl, n = nrow(tbl))
## # A tibble: 27 × 2
##    col                      na_num
##    <chr>                     <int>
##  1 Species                       0
##  2 Length.Measurement.Type      63
##  3 sampleYear                    0
##  4 ASLProjectType                7
##  5 LocationID                    0
##  6 sampleDate                    0
##  7 Length                      165
##  8 Weight                      573
##  9 Sex                          67
## 10 Salt.Water.Age              106
## 11 DataSource                    0
## 12 cardNo                      354
## 13 fishNum                     340
## 14 Age.Error                   432
## 15 Fresh.Water.Age              87
## 16 Sex.Determination.Method    515
## 17 subSystem                   598
## 18 Flag                        598
## 19 Gear                         59
## 20 SASAP.Region                  0
## 21 LocationUnique                0
## 22 DistrictID                    4
## 23 Sub.DistrictID              212
## 24 Stat.area                   212
## 25 Lat                         386
## 26 Lon                         386
## 27 AWC_CODE                    402

(0.5 балла) Выведите название столбца с максимальным числом пропущенных значений.

tbl %>%
  arrange(desc(na_num)) 
## # A tibble: 27 × 2
##    col                      na_num
##    <chr>                     <int>
##  1 subSystem                   598
##  2 Flag                        598
##  3 Weight                      573
##  4 Sex.Determination.Method    515
##  5 Age.Error                   432
##  6 AWC_CODE                    402
##  7 Lat                         386
##  8 Lon                         386
##  9 cardNo                      354
## 10 fishNum                     340
## # ℹ 17 more rows
tbl %>% 
  slice(which.max(tbl$na_num)) %>%
  pull(col)
## [1] "subSystem"
tbl %>% 
  filter(na_num == max(na_num)) %>%
  pull(col)
## [1] "subSystem" "Flag"

Задание 3

Проведите фильтрацию вашего набора данных

sal_filt <- salmon %>%
  filter(Sex == "female") %>%
  select(matches("^s"), Length, Weight) %>%
  mutate(YAgo = (2024 - sampleYear)) %>% 
  rename(SDM = Sex.Determination.Method)

(1 балл) Оставьте рыб только одного пола на ваш выбор.

table(salmon$Sex)
## 
##                             1                             2 
##                             2                             7 
## examined but did not identify                        female 
##                            18                           250 
##                          male                       unknown 
##                           251                             5
table(sal_filt$Sex)
## 
## female 
##    250

(1 балл) Оставьте только столбцы, названия которых начинаются на S в любом регистре, а также столбцы с длиной и весом рыб.

colnames(salmon)
##  [1] "Species"                  "Length.Measurement.Type" 
##  [3] "sampleYear"               "ASLProjectType"          
##  [5] "LocationID"               "sampleDate"              
##  [7] "Length"                   "Weight"                  
##  [9] "Sex"                      "Salt.Water.Age"          
## [11] "DataSource"               "cardNo"                  
## [13] "fishNum"                  "Age.Error"               
## [15] "Fresh.Water.Age"          "Sex.Determination.Method"
## [17] "subSystem"                "Flag"                    
## [19] "Gear"                     "SASAP.Region"            
## [21] "LocationUnique"           "DistrictID"              
## [23] "Sub.DistrictID"           "Stat.area"               
## [25] "Lat"                      "Lon"                     
## [27] "AWC_CODE"
colnames(sal_filt)
##  [1] "Species"        "sampleYear"     "sampleDate"     "Sex"           
##  [5] "Salt.Water.Age" "SDM"            "subSystem"      "SASAP.Region"  
##  [9] "Sub.DistrictID" "Stat.area"      "Length"         "Weight"        
## [13] "YAgo"

(1 балл) Добавьте столбец, показывающий, сколько лет назад была выловлена рыба.

sal_filt %>%
  select(Species, sampleYear, YAgo)
## # A tibble: 250 × 3
##    Species sampleYear  YAgo
##    <chr>        <dbl> <dbl>
##  1 coho          1998    26
##  2 pink          1969    55
##  3 sockeye       2004    20
##  4 sockeye       2008    16
##  5 chum          2006    18
##  6 sockeye       1996    28
##  7 chum          1990    34
##  8 sockeye       1991    33
##  9 sockeye       2001    23
## 10 sockeye       1995    29
## # ℹ 240 more rows

(1 балл) Переименуйте столбец Sex.Determination.Method в SDM.

clmns <- c('Sex.Determination.Method', 'SDM')

colnames(salmon)[colnames(salmon) %in% clmns]
## [1] "Sex.Determination.Method"
colnames(sal_filt)[colnames(sal_filt) %in% clmns]
## [1] "SDM"

(1 балл) Удалите все пропущенные значения. Сколько строк осталось?

drop_na(sal_filt)
## # A tibble: 0 × 13
## # ℹ 13 variables: Species <chr>, sampleYear <dbl>, sampleDate <date>,
## #   Sex <chr>, Salt.Water.Age <dbl>, SDM <chr>, subSystem <chr>,
## #   SASAP.Region <chr>, Sub.DistrictID <dbl>, Stat.area <dbl>, Length <dbl>,
## #   Weight <dbl>, YAgo <dbl>

Задание 4

Изучите распределение веса тела рыб из вашего набора данных.

(1 балл) Рассчитайте основные статистики: среднее, стандартное отклонение, медиану, минимальное и максимальное значения. Они должны оказаться в одном векторе.

a <- c(mean(sal_filt$Weight), 
       sd(sal_filt$Weight),
       median(sal_filt$Weight),
       min(sal_filt$Weight),
       max(sal_filt$Weight))
a
## [1] NA NA NA NA NA
a <- c(mean(sal_filt$Weight, na.rm = T), 
       sd(sal_filt$Weight, na.rm = T),
       median(sal_filt$Weight, na.rm = T),
       min(sal_filt$Weight, na.rm = T),
       max(sal_filt$Weight, na.rm = T))
a
## [1] 116.7273 161.1826   0.0000   0.0000 376.0000

(2 балла) На одном графике визуализируйте распределение веса рыб для каждого вида.

p1 <- sal_filt %>%
  ggplot(aes(x = Weight, fill = Species)) +
  geom_histogram()
p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 239 rows containing non-finite outside the scale range
## (`stat_bin()`).

p2 <- sal_filt %>%
  ggplot(aes(x = Species, y = Weight, fill = Species)) +
  geom_boxplot()

p2
## Warning: Removed 239 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

(0.5 балла) Добавьте название графика.

p1 + labs(title = 'New plot title')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 239 rows containing non-finite outside the scale range
## (`stat_bin()`).

(0.5 балла) Увеличьте шрифт для названия OY.

p1 + 
  theme(axis.title.y = element_text(size=22))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 239 rows containing non-finite outside the scale range
## (`stat_bin()`).

(0.5 балла) Замените подпись для OX.

p1 + labs(x = 'Вес рыбов')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 239 rows containing non-finite outside the scale range
## (`stat_bin()`).

(1 балл) Измените палитру.

p1 + scale_fill_manual(values = c('green', 'brown'))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 239 rows containing non-finite outside the scale range
## (`stat_bin()`).

(0.5 балла) Измените название легенды.

p1 + labs(fill = 'Вид')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 239 rows containing non-finite outside the scale range
## (`stat_bin()`).

(1 балл) Добавьте на график элементы, показывающие общее среднее значение признака. Не забудьте добавить на график пояснения, что именно вы изобразили.

p1 + geom_vline(xintercept = a[1], linetype = "dashed", color = "firebrick", linewidth = 1) +
  annotate(
    "text", x = 250, y = 50,
    label = str_wrap("Среднее значение веса всех рыб", 20))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 239 rows containing non-finite outside the scale range
## (`stat_bin()`).