# Fix the random-number seed so that the random row sample drawn with
# slice_sample() later in this script is reproducible.
set.seed(000)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the salmon CSV straight from its URL into a tibble.
salmon_full <- read_csv(
  file = "https://kodomo.fbb.msu.ru/FBB/year_22/lectures/salmon_10k.csv"
)
## Rows: 10000 Columns: 27
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): Species, Length.Measurement.Type, ASLProjectType, LocationID, Sex...
## dbl (11): sampleYear, Length, Weight, Salt.Water.Age, fishNum, Fresh.Water....
## date (1): sampleDate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of the full table.
head(salmon_full)
## # A tibble: 6 × 27
## Species Length.Measurement.T…¹ sampleYear ASLProjectType LocationID sampleDate
## <chr> <chr> <dbl> <chr> <chr> <date>
## 1 chum mid-eye to fork of ta… 2014 test fishing Pilot Sta… 2014-06-29
## 2 sockeye mid-eye to fork of ta… 2009 commercial ca… Upper Cla… 2009-08-10
## 3 sockeye <NA> 1998 commercial ca… Volcano B… 1998-07-18
## 4 sockeye mid-eye to fork of ta… 2000 commercial ca… Point Ard… 2000-07-31
## 5 chinook mid-eye to fork of ta… 1983 escapement Anvik Riv… 1983-08-08
## 6 sockeye mid-eye to fork of ta… 1982 commercial ca… Alsek Riv… 1982-06-29
## # ℹ abbreviated name: ¹Length.Measurement.Type
## # ℹ 21 more variables: Length <dbl>, Weight <dbl>, Sex <chr>,
## # Salt.Water.Age <dbl>, DataSource <chr>, cardNo <chr>, fishNum <dbl>,
## # Age.Error <chr>, Fresh.Water.Age <dbl>, Sex.Determination.Method <chr>,
## # subSystem <chr>, Flag <chr>, Gear <chr>, SASAP.Region <chr>,
## # LocationUnique <chr>, DistrictID <dbl>, Sub.DistrictID <dbl>,
## # Stat.area <dbl>, Lat <dbl>, Lon <dbl>, AWC_CODE <chr>
# Number of rows and columns in the full table.
dim(salmon_full)
## [1] 10000 27
# Compact structure listing: the type and first values of every column.
str(salmon_full)
## spc_tbl_ [10,000 × 27] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Species : chr [1:10000] "chum" "sockeye" "sockeye" "sockeye" ...
## $ Length.Measurement.Type : chr [1:10000] "mid-eye to fork of tail" "mid-eye to fork of tail" NA "mid-eye to fork of tail" ...
## $ sampleYear : num [1:10000] 2014 2009 1998 2000 1983 ...
## $ ASLProjectType : chr [1:10000] "test fishing" "commercial catch" "commercial catch" "commercial catch" ...
## $ LocationID : chr [1:10000] "Pilot Station" "Upper Clarence/Steamer Bay/Quiet Harbor" "Volcano Bay" "Point Arden to Midway Islands" ...
## $ sampleDate : Date[1:10000], format: "2014-06-29" "2009-08-10" ...
## $ Length : num [1:10000] 663 NA NA NA 742 605 508 675 425 520 ...
## $ Weight : num [1:10000] NA NA NA NA NA NA NA NA NA NA ...
## $ Sex : chr [1:10000] "male" "female" NA "male" ...
## $ Salt.Water.Age : num [1:10000] NA 3 3 2 NA 3 2 3 2 NA ...
## $ DataSource : chr [1:10000] "ADFG AYK" "ADFG Southeast and Westward" "ADFG Southeast and Westward" "ADFG Southeast and Westward" ...
## $ cardNo : chr [1:10000] NA NA NA NA ...
## $ fishNum : num [1:10000] NA NA NA NA 6 NA NA NA NA NA ...
## $ Age.Error : chr [1:10000] NA NA NA "1 6" ...
## $ Fresh.Water.Age : num [1:10000] 0 1 2 1 NA 1 2 0 1 0 ...
## $ Sex.Determination.Method: chr [1:10000] "External" NA NA NA ...
## $ subSystem : chr [1:10000] NA NA NA NA ...
## $ Flag : chr [1:10000] NA NA NA NA ...
## $ Gear : chr [1:10000] "gillnet" "gillnet" NA "gillnet" ...
## $ SASAP.Region : chr [1:10000] "Yukon" "Southeast" "Alaska Peninsula and Aleutian Islands" "Southeast" ...
## $ LocationUnique : chr [1:10000] "Pilot Station-test fishing-33423" "Upper Clarence/Steamer Bay/Quiet Harbor-commercial catch-10630" "Volcano Bay-commercial catch-28436" "Point Arden to Midway Islands-commercial catch-11131" ...
## $ DistrictID : num [1:10000] 334 106 284 111 334 182 324 115 111 NA ...
## $ Sub.DistrictID : num [1:10000] 3342 30 36 31 3344 ...
## $ Stat.area : num [1:10000] 33423 10630 28436 11131 33447 ...
## $ Lat : num [1:10000] 62 NA NA NA 63 ...
## $ Lon : num [1:10000] -163 NA NA NA -161 ...
## $ AWC_CODE : chr [1:10000] NA NA NA NA ...
## - attr(*, "spec")=
## .. cols(
## .. Species = col_character(),
## .. Length.Measurement.Type = col_character(),
## .. sampleYear = col_double(),
## .. ASLProjectType = col_character(),
## .. LocationID = col_character(),
## .. sampleDate = col_date(format = ""),
## .. Length = col_double(),
## .. Weight = col_double(),
## .. Sex = col_character(),
## .. Salt.Water.Age = col_double(),
## .. DataSource = col_character(),
## .. cardNo = col_character(),
## .. fishNum = col_double(),
## .. Age.Error = col_character(),
## .. Fresh.Water.Age = col_double(),
## .. Sex.Determination.Method = col_character(),
## .. subSystem = col_character(),
## .. Flag = col_character(),
## .. Gear = col_character(),
## .. SASAP.Region = col_character(),
## .. LocationUnique = col_character(),
## .. DistrictID = col_double(),
## .. Sub.DistrictID = col_double(),
## .. Stat.area = col_double(),
## .. Lat = col_double(),
## .. Lon = col_double(),
## .. AWC_CODE = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Draw a random subset of 600 rows from the full table to work with.
salmon <- slice_sample(salmon_full, n = 600)
# Preview the first rows of the 600-row sample.
head(salmon)
## # A tibble: 6 × 27
## Species Length.Measurement.T…¹ sampleYear ASLProjectType LocationID sampleDate
## <chr> <chr> <dbl> <chr> <chr> <date>
## 1 coho mid-eye to fork of ta… 1998 test fishing Pilot Sta… 1998-09-07
## 2 sockeye mid-eye to fork of ta… 1994 commercial ca… Cohoe-Nin… 1994-07-15
## 3 chum mid-eye to fork of ta… 2011 commercial ca… Emmonak (… 2011-07-06
## 4 pink mid-eye to fork of ta… 1969 escapement Susitna R… 1969-08-03
## 5 sockeye <NA> 2005 commercial ca… Moser - O… 2005-07-20
## 6 sockeye mid-eye to fork of ta… 1985 escapement Kvichak R… 1985-07-16
## # ℹ abbreviated name: ¹Length.Measurement.Type
## # ℹ 21 more variables: Length <dbl>, Weight <dbl>, Sex <chr>,
## # Salt.Water.Age <dbl>, DataSource <chr>, cardNo <chr>, fishNum <dbl>,
## # Age.Error <chr>, Fresh.Water.Age <dbl>, Sex.Determination.Method <chr>,
## # subSystem <chr>, Flag <chr>, Gear <chr>, SASAP.Region <chr>,
## # LocationUnique <chr>, DistrictID <dbl>, Sub.DistrictID <dbl>,
## # Stat.area <dbl>, Lat <dbl>, Lon <dbl>, AWC_CODE <chr>
# Confirm the sample's dimensions (rows, columns).
dim(salmon)
## [1] 600 27
# Compact structure listing of the sample: type and first values per column.
str(salmon)
## spc_tbl_ [600 × 27] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Species : chr [1:600] "coho" "sockeye" "chum" "pink" ...
## $ Length.Measurement.Type : chr [1:600] "mid-eye to fork of tail" "mid-eye to fork of tail" "mid-eye to fork of tail" "mid-eye to fork of tail" ...
## $ sampleYear : num [1:600] 1998 1994 2011 1969 2005 ...
## $ ASLProjectType : chr [1:600] "test fishing" "commercial catch" "commercial catch" "escapement" ...
## $ LocationID : chr [1:600] "Pilot Station" "Cohoe-Ninilchik" "Emmonak (Village/City)" "Susitna River" ...
## $ sampleDate : Date[1:600], format: "1998-09-07" "1994-07-15" ...
## $ Length : num [1:600] 545 558 610 518 NA 540 457 508 493 560 ...
## $ Weight : num [1:600] NA NA NA NA NA NA NA NA NA NA ...
## $ Sex : chr [1:600] "female" "male" "male" "female" ...
## $ Salt.Water.Age : num [1:600] NA 3 4 2 3 2 2 NA 2 3 ...
## $ DataSource : chr [1:600] "ADFG AYK" "Upper Cook Inlet" "ADFG AYK" "Upper Cook Inlet" ...
## $ cardNo : chr [1:600] NA "70" "2" "11" ...
## $ fishNum : num [1:600] NA 7 26 12 NA 7 29 NA 30 5 ...
## $ Age.Error : chr [1:600] NA "-999" NA "-999" ...
## $ Fresh.Water.Age : num [1:600] NA 1 0 0 2 2 1 NA 1 0 ...
## $ Sex.Determination.Method: chr [1:600] "External" NA "External" NA ...
## $ subSystem : chr [1:600] NA NA NA NA ...
## $ Flag : chr [1:600] NA NA NA NA ...
## $ Gear : chr [1:600] "gillnet" "gillnet" "gillnet" NA ...
## $ SASAP.Region : chr [1:600] "Yukon" "Cook Inlet" "Yukon" "Cook Inlet" ...
## $ LocationUnique : chr [1:600] "Pilot Station-test fishing-33423" "Cohoe-Ninilchik-commercial catch-244" "Emmonak (Village/City)-commercial catch-33414" "Susitna River-escapement-24741" ...
## $ DistrictID : num [1:600] 334 244 334 247 257 324 325 252 244 334 ...
## $ Sub.DistrictID : num [1:600] 3342 NA 3341 41 NA ...
## $ Stat.area : num [1:600] 33423 NA 33414 24741 NA ...
## $ Lat : num [1:600] 62 NA NA 61.5 NA ...
## $ Lon : num [1:600] -163 NA NA -151 NA ...
## $ AWC_CODE : chr [1:600] NA NA NA "247-41-10200" ...
## - attr(*, "spec")=
## .. cols(
## .. Species = col_character(),
## .. Length.Measurement.Type = col_character(),
## .. sampleYear = col_double(),
## .. ASLProjectType = col_character(),
## .. LocationID = col_character(),
## .. sampleDate = col_date(format = ""),
## .. Length = col_double(),
## .. Weight = col_double(),
## .. Sex = col_character(),
## .. Salt.Water.Age = col_double(),
## .. DataSource = col_character(),
## .. cardNo = col_character(),
## .. fishNum = col_double(),
## .. Age.Error = col_character(),
## .. Fresh.Water.Age = col_double(),
## .. Sex.Determination.Method = col_character(),
## .. subSystem = col_character(),
## .. Flag = col_character(),
## .. Gear = col_character(),
## .. SASAP.Region = col_character(),
## .. LocationUnique = col_character(),
## .. DistrictID = col_double(),
## .. Sub.DistrictID = col_double(),
## .. Stat.area = col_double(),
## .. Lat = col_double(),
## .. Lon = col_double(),
## .. AWC_CODE = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
Изучите данные на наличие пропущенных значений.
# TRUE when at least one value anywhere in the sample is missing.
anyNA(salmon)
## [1] TRUE
# Count the missing values in every column of `salmon`.
# `col` holds the column names and `na_num` the matching NA counts; both stay
# available as top-level variables and are combined into the summary `tbl`.
col <- colnames(salmon)
# vapply() visits each column once and guarantees an integer result. Unlike a
# `1:ncol(salmon)` loop that grows its output, it also works for a table with
# no columns. USE.NAMES = FALSE keeps the counts unnamed, as before.
na_num <- vapply(
  salmon,
  function(column) sum(is.na(column)),
  integer(1),
  USE.NAMES = FALSE
)
tbl <- tibble(
  col = col,
  na_num = na_num
)
# Print every row of the summary rather than the default truncated view.
print(tbl, n = nrow(tbl))
## # A tibble: 27 × 2
## col na_num
## <chr> <int>
## 1 Species 0
## 2 Length.Measurement.Type 63
## 3 sampleYear 0
## 4 ASLProjectType 7
## 5 LocationID 0
## 6 sampleDate 0
## 7 Length 165
## 8 Weight 573
## 9 Sex 67
## 10 Salt.Water.Age 106
## 11 DataSource 0
## 12 cardNo 354
## 13 fishNum 340
## 14 Age.Error 432
## 15 Fresh.Water.Age 87
## 16 Sex.Determination.Method 515
## 17 subSystem 598
## 18 Flag 598
## 19 Gear 59
## 20 SASAP.Region 0
## 21 LocationUnique 0
## 22 DistrictID 4
## 23 Sub.DistrictID 212
## 24 Stat.area 212
## 25 Lat 386
## 26 Lon 386
## 27 AWC_CODE 402
# Show the columns ordered from most to fewest missing values.
arrange(tbl, desc(na_num))
## # A tibble: 27 × 2
## col na_num
## <chr> <int>
## 1 subSystem 598
## 2 Flag 598
## 3 Weight 573
## 4 Sex.Determination.Method 515
## 5 Age.Error 432
## 6 AWC_CODE 402
## 7 Lat 386
## 8 Lon 386
## 9 cardNo 354
## 10 fishNum 340
## # ℹ 17 more rows
# Name of the column with the most missing values.
# which.max() returns only the first position of the maximum, so a tie is
# not reported here.
tbl$col[which.max(tbl$na_num)]
## [1] "subSystem"
# Names of all columns that share the highest missing-value count (ties kept).
with(tbl, col[na_num == max(na_num)])
## [1] "subSystem" "Flag"
Проведите фильтрацию вашего набора данных.
# Build the working subset:
#   1. keep only rows whose Sex is "female" (rows with missing Sex are dropped);
#   2. keep the columns whose name starts with "s" (case-insensitive),
#      plus Length and Weight;
#   3. add YAgo, the number of years between the sample year and 2024;
#   4. shorten Sex.Determination.Method to SDM.
sal_filt <- salmon %>%
  filter(Sex %in% "female") %>%
  select(starts_with("s"), Length, Weight) %>%
  mutate(YAgo = 2024 - sampleYear) %>%
  rename(SDM = Sex.Determination.Method)
# Frequency of each Sex value in the unfiltered sample
# (table() leaves NA out by default).
table(salmon$Sex)
##
## 1 2
## 2 7
## examined but did not identify female
## 18 250
## male unknown
## 251 5
# Same tabulation after filtering: only "female" should remain.
table(sal_filt$Sex)
##
## female
## 250
# Column names before the column selection, for comparison.
colnames(salmon)
## [1] "Species" "Length.Measurement.Type"
## [3] "sampleYear" "ASLProjectType"
## [5] "LocationID" "sampleDate"
## [7] "Length" "Weight"
## [9] "Sex" "Salt.Water.Age"
## [11] "DataSource" "cardNo"
## [13] "fishNum" "Age.Error"
## [15] "Fresh.Water.Age" "Sex.Determination.Method"
## [17] "subSystem" "Flag"
## [19] "Gear" "SASAP.Region"
## [21] "LocationUnique" "DistrictID"
## [23] "Sub.DistrictID" "Stat.area"
## [25] "Lat" "Lon"
## [27] "AWC_CODE"
# Column names after the selection, the added YAgo column and the SDM rename.
colnames(sal_filt)
## [1] "Species" "sampleYear" "sampleDate" "Sex"
## [5] "Salt.Water.Age" "SDM" "subSystem" "SASAP.Region"
## [9] "Sub.DistrictID" "Stat.area" "Length" "Weight"
## [13] "YAgo"
# Check the derived YAgo column next to the year it was computed from.
select(sal_filt, Species, sampleYear, YAgo)
## # A tibble: 250 × 3
## Species sampleYear YAgo
## <chr> <dbl> <dbl>
## 1 coho 1998 26
## 2 pink 1969 55
## 3 sockeye 2004 20
## 4 sockeye 2008 16
## 5 chum 2006 18
## 6 sockeye 1996 28
## 7 chum 1990 34
## 8 sockeye 1991 33
## 9 sockeye 2001 23
## 10 sockeye 1995 29
## # ℹ 240 more rows
# Old and new name of the renamed column; report which one the original
# sample carries. base:: is spelled out so an attached package cannot mask it.
clmns <- c("Sex.Determination.Method", "SDM")
base::intersect(colnames(salmon), clmns)
## [1] "Sex.Determination.Method"
# Same check on the filtered table: only the new name should be present.
base::intersect(colnames(sal_filt), clmns)
## [1] "SDM"
# Drop every row that has a missing value in any column. The result is only
# printed, not stored, so sal_filt itself is left unchanged.
drop_na(sal_filt)
## # A tibble: 0 × 13
## # ℹ 13 variables: Species <chr>, sampleYear <dbl>, sampleDate <date>,
## # Sex <chr>, Salt.Water.Age <dbl>, SDM <chr>, subSystem <chr>,
## # SASAP.Region <chr>, Sub.DistrictID <dbl>, Stat.area <dbl>, Length <dbl>,
## # Weight <dbl>, YAgo <dbl>
Изучите распределение веса тела рыб из вашего набора данных.
# Mean, standard deviation, median, minimum and maximum of Weight, in that
# order. Missing values are not removed here, so any NA in Weight makes every
# statistic NA.
a <- with(
  sal_filt,
  c(mean(Weight), sd(Weight), median(Weight), min(Weight), max(Weight))
)
a
## [1] NA NA NA NA NA
# Mean, standard deviation, median, minimum and maximum of Weight in the
# filtered table, in that order, with missing values ignored.
# `na.rm = TRUE` is spelled out: `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
a <- c(
  mean(sal_filt$Weight, na.rm = TRUE),
  sd(sal_filt$Weight, na.rm = TRUE),
  median(sal_filt$Weight, na.rm = TRUE),
  min(sal_filt$Weight, na.rm = TRUE),
  max(sal_filt$Weight, na.rm = TRUE)
)
a
## [1] 116.7273 161.1826 0.0000 0.0000 376.0000
(2 балла) На одном графике визуализируйте распределение веса рыб для каждого вида.
# Histogram of Weight with bars filled by Species; stored as p1 so the
# variants below can build on it, then drawn.
p1 <- ggplot(data = sal_filt, mapping = aes(x = Weight, fill = Species)) +
  geom_histogram()
p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 239 rows containing non-finite outside the scale range
## (`stat_bin()`).
# One box plot of Weight per Species, filled by Species; stored as p2 and drawn.
p2 <- ggplot(data = sal_filt, mapping = aes(Species, Weight, fill = Species)) +
  geom_boxplot()
p2
## Warning: Removed 239 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Same histogram with a plot title added.
p1 + ggtitle("New plot title")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 239 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Same histogram with the y-axis title text enlarged to size 22.
p1 +
theme(axis.title.y = element_text(size=22))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 239 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Same histogram with a custom x-axis label.
p1 + labs(x = 'Вес рыбов')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 239 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Same histogram with a hand-picked fill palette.
# NOTE(review): only two colours are supplied; presumably just two species
# have non-missing Weight in this sample. Confirm, since a manual scale needs
# one colour per fill level and a different sample could need more.
p1 + scale_fill_manual(values = c('green', 'brown'))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 239 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Same histogram with the fill legend retitled.
p1 + labs(fill = 'Вид')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 239 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Mark the mean weight (first element of `a`) with a dashed vertical line and
# place a wrapped text label on the plot at x = 250, y = 50.
p1 +
  geom_vline(
    xintercept = a[1],
    colour = "firebrick",
    linetype = "dashed",
    linewidth = 1
  ) +
  annotate(
    geom = "text",
    x = 250,
    y = 50,
    label = str_wrap("Среднее значение веса всех рыб", width = 20)
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 239 rows containing non-finite outside the scale range
## (`stat_bin()`).