library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
# Данные assembly report
contig_data <- data.frame(
Sequence = c('I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X',
'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI', 'MT'),
Length = c(230218, 813184, 316620, 1531933, 576874, 270161, 1090940, 562643,
439888, 745751, 666816, 1078177, 924431, 784333, 1091291, 948066, 85779)
)
# Сортировка по убыванию длины
contig_sorted <- contig_data %>%
arrange(desc(Length)) %>%
mutate(Rank = row_number(),
Cumulative_Length = cumsum(Length),
Cumulative_Percent = Cumulative_Length / sum(Length) * 100)
# Вычисление N50 и L50
total_length <- sum(contig_sorted$Length)
half_total <- total_length / 2
cumulative_sum <- 0
n50_value <- 0
l50_value <- 0
n50_contig <- ""
for (i in 1:nrow(contig_sorted)) {
cumulative_sum <- cumulative_sum + contig_sorted$Length[i]
if (cumulative_sum >= half_total && n50_value == 0) {
n50_value <- contig_sorted$Length[i]
l50_value <- i
n50_contig <- contig_sorted$Sequence[i]
break
}
}
# ГРАФИК 1: Убывание длины контигов с N50/L50
p1 <- ggplot(contig_sorted, aes(x = Rank, y = Length)) +
geom_line(color = "blue", linewidth = 1) +
geom_point(color = "blue", size = 2) +
geom_point(data = contig_sorted %>% filter(Rank == l50_value),
aes(x = Rank, y = Length),
color = "red", size = 5, shape = 18) +
geom_label(data = contig_sorted %>% filter(Rank == l50_value),
aes(x = Rank, y = Length,
label = paste("N50/L50\n",
"Контиг:", n50_contig, "\n",
"N50 =", format(n50_value, big.mark = ","), "bp\n",
"L50 =", l50_value)),
nudge_x = 2, nudge_y = n50_value * 0.2,
size = 3.5, color = "red", fill = "lightyellow") +
geom_text(data = contig_sorted %>% head(6),
aes(label = Sequence),
nudge_y = max(contig_sorted$Length) * 0.05,
size = 3, color = "darkgreen") +
scale_x_continuous(breaks = 1:nrow(contig_sorted)) +
scale_y_continuous(labels = scales::comma) +
labs(
title = "Убывание длины контигов генома Saccharomyces cerevisiae S288C",
subtitle = paste("С указанием N50 и L50 (общая длина:", format(total_length, big.mark = ","), "bp)"),
x = "Ранг контига (по убыванию длины)",
y = "Длина (bp)"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
plot.subtitle = element_text(hjust = 0.5, size = 12),
panel.grid.major = element_line(color = "grey80"),
panel.grid.minor = element_line(color = "grey90"),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12)
)
png("contig_length_distribution.png", width = 12, height = 8, units = "in", res = 300)
print(p1)
dev.off()
## quartz_off_screen
## 2
# ГРАФИК 2: Кумулятивное покрытие генома
p2 <- ggplot(contig_sorted, aes(x = Rank, y = Cumulative_Percent)) +
geom_line(color = "purple", linewidth = 1.5) +
geom_point(color = "purple", size = 2) +
geom_hline(yintercept = 50, linetype = "dashed", color = "red", linewidth = 1) +
geom_vline(xintercept = l50_value, linetype = "dashed", color = "red", linewidth = 1) +
geom_point(aes(x = l50_value, y = 50), color = "red", size = 4, shape = 18) +
geom_label(aes(x = l50_value, y = 50,
label = paste("L50 =", l50_value)),
nudge_x = 2, nudge_y = 10,
size = 4, color = "red", fill = "lightyellow") +
scale_x_continuous(breaks = seq(1, nrow(contig_sorted), by = 2)) +
labs(
title = "Кумулятивное покрытие генома Saccharomyces cerevisiae",
subtitle = "Процент генома, покрываемый N крупнейшими контигами",
x = "Количество контигов (по убыванию длины)",
y = "Процент покрытия генома (%)"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
plot.subtitle = element_text(hjust = 0.5, size = 12),
panel.grid.major = element_line(color = "grey80"),
panel.grid.minor = element_line(color = "grey90")
)
png("cumulative_genome_coverage.png", width = 10, height = 7, units = "in", res = 300)
print(p2)
## Warning in geom_point(aes(x = l50_value, y = 50), color = "red", size = 4, : All aesthetics have length 1, but the data has 17 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
## Warning in geom_label(aes(x = l50_value, y = 50, label = paste("L50 =", : All aesthetics have length 1, but the data has 17 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
dev.off()
## quartz_off_screen
## 2
# ГРАФИК 3: Логарифмическая шкала
p3 <- ggplot(contig_sorted, aes(x = Rank, y = Length)) +
geom_line(color = "darkgreen", linewidth = 1) +
geom_point(color = "darkgreen", size = 2) +
geom_point(data = contig_sorted %>% filter(Rank == l50_value),
aes(x = Rank, y = Length),
color = "red", size = 5, shape = 18) +
geom_label(data = contig_sorted %>% filter(Rank == l50_value),
aes(x = Rank, y = Length,
label = paste("N50/L50\nКонтиг:", n50_contig)),
nudge_x = 2, nudge_y = 0.2,
size = 3.5, color = "red", fill = "lightyellow") +
scale_y_log10(
labels = scales::comma,
breaks = 10^(3:7)
) +
labs(
title = "Убывание длины контигов (логарифмическая шкала)",
subtitle = paste("Saccharomyces cerevisiae S288C | N50 =", format(n50_value, big.mark = ","), "bp"),
x = "Ранг контига",
y = "Длина (bp, log scale)"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
plot.subtitle = element_text(hjust = 0.5, size = 12)
)
png("contig_length_log_scale.png", width = 10, height = 7, units = "in", res = 300)
print(p3)
dev.off()
## quartz_off_screen
## 2
# Вывод статистики
cat("СТАТИСТИКА ГЕНОМА:\n")
## СТАТИСТИКА ГЕНОМА:
cat("Общая длина генома:", format(total_length, big.mark = ","), "bp\n")
## Общая длина генома: 12,157,105 bp
cat("Количество контигов:", nrow(contig_sorted), "\n")
## Количество контигов: 17
cat("N50:", format(n50_value, big.mark = ","), "bp\n")
## N50: 924,431 bp
cat("L50:", l50_value, "\n")
## L50: 6
cat("Контиг, определяющий N50:", n50_contig, "\n")
## Контиг, определяющий N50: XIII
cat("Самый длинный контиг:", contig_sorted$Sequence[1], "(",
format(contig_sorted$Length[1], big.mark = ","), "bp)\n")
## Самый длинный контиг: IV ( 1,531,933 bp)
cat("Самый короткий контиг:", contig_sorted$Sequence[nrow(contig_sorted)], "(",
format(contig_sorted$Length[nrow(contig_sorted)], big.mark = ","), "bp)\n")
## Самый короткий контиг: MT ( 85,779 bp)
stats_text <- paste(
"СТАТИСТИКА ГЕНОМА Saccharomyces cerevisiae S288C",
"================================================",
paste("Общая длина генома:", format(total_length, big.mark = ","), "bp"),
paste("Количество контигов:", nrow(contig_sorted)),
paste("N50:", format(n50_value, big.mark = ","), "bp"),
paste("L50:", l50_value),
paste("Контиг, определяющий N50:", n50_contig),
paste("Самый длинный контиг:", contig_sorted$Sequence[1], "(",
format(contig_sorted$Length[1], big.mark = ","), "bp)"),
paste("Самый короткий контиг:", contig_sorted$Sequence[nrow(contig_sorted)], "(",
format(contig_sorted$Length[nrow(contig_sorted)], big.mark = ","), "bp)"),
sep = "\n"
)
writeLines(stats_text, "genome_statistics.txt")
cat("\nГрафики:\n")
##
## Графики:
cat("1. contig_length_distribution.png - Основной график убывания длин\n")
## 1. contig_length_distribution.png - Основной график убывания длин
cat("2. cumulative_genome_coverage.png - Кумулятивное покрытие\n")
## 2. cumulative_genome_coverage.png - Кумулятивное покрытие
cat("3. contig_length_log_scale.png - Логарифмическая шкала\n")
## 3. contig_length_log_scale.png - Логарифмическая шкала
cat("4. genome_statistics.txt - Статистика генома\n")
## 4. genome_statistics.txt - Статистика генома