library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

# Данные assembly report
contig_data <- data.frame(
  Sequence = c('I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 
               'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI', 'MT'),
  Length = c(230218, 813184, 316620, 1531933, 576874, 270161, 1090940, 562643, 
             439888, 745751, 666816, 1078177, 924431, 784333, 1091291, 948066, 85779)
)

# Сортировка по убыванию длины
contig_sorted <- contig_data %>%
  arrange(desc(Length)) %>%
  mutate(Rank = row_number(),
         Cumulative_Length = cumsum(Length),
         Cumulative_Percent = Cumulative_Length / sum(Length) * 100)

# Вычисление N50 и L50
total_length <- sum(contig_sorted$Length)
half_total <- total_length / 2

cumulative_sum <- 0
n50_value <- 0
l50_value <- 0
n50_contig <- ""

for (i in 1:nrow(contig_sorted)) {
  cumulative_sum <- cumulative_sum + contig_sorted$Length[i]
  if (cumulative_sum >= half_total && n50_value == 0) {
    n50_value <- contig_sorted$Length[i]
    l50_value <- i
    n50_contig <- contig_sorted$Sequence[i]
    break
  }
}

# ГРАФИК 1: Убывание длины контигов с N50/L50
p1 <- ggplot(contig_sorted, aes(x = Rank, y = Length)) +
  geom_line(color = "blue", linewidth = 1) +
  geom_point(color = "blue", size = 2) +
  geom_point(data = contig_sorted %>% filter(Rank == l50_value),
             aes(x = Rank, y = Length), 
             color = "red", size = 5, shape = 18) +
  geom_label(data = contig_sorted %>% filter(Rank == l50_value),
            aes(x = Rank, y = Length, 
                label = paste("N50/L50\n", 
                             "Контиг:", n50_contig, "\n",
                             "N50 =", format(n50_value, big.mark = ","), "bp\n",
                             "L50 =", l50_value)),
            nudge_x = 2, nudge_y = n50_value * 0.2,
            size = 3.5, color = "red", fill = "lightyellow") +
  geom_text(data = contig_sorted %>% head(6),
            aes(label = Sequence), 
            nudge_y = max(contig_sorted$Length) * 0.05,
            size = 3, color = "darkgreen") +
  scale_x_continuous(breaks = 1:nrow(contig_sorted)) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Убывание длины контигов генома Saccharomyces cerevisiae S288C",
    subtitle = paste("С указанием N50 и L50 (общая длина:", format(total_length, big.mark = ","), "bp)"),
    x = "Ранг контига (по убыванию длины)",
    y = "Длина (bp)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    plot.subtitle = element_text(hjust = 0.5, size = 12),
    panel.grid.major = element_line(color = "grey80"),
    panel.grid.minor = element_line(color = "grey90"),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12)
  )


png("contig_length_distribution.png", width = 12, height = 8, units = "in", res = 300)
print(p1)
dev.off()
## quartz_off_screen 
##                 2
# ГРАФИК 2: Кумулятивное покрытие генома
p2 <- ggplot(contig_sorted, aes(x = Rank, y = Cumulative_Percent)) +
  geom_line(color = "purple", linewidth = 1.5) +
  geom_point(color = "purple", size = 2) +
  geom_hline(yintercept = 50, linetype = "dashed", color = "red", linewidth = 1) +
  geom_vline(xintercept = l50_value, linetype = "dashed", color = "red", linewidth = 1) +
  geom_point(aes(x = l50_value, y = 50), color = "red", size = 4, shape = 18) +
  geom_label(aes(x = l50_value, y = 50, 
                label = paste("L50 =", l50_value)),
            nudge_x = 2, nudge_y = 10,
            size = 4, color = "red", fill = "lightyellow") +
  scale_x_continuous(breaks = seq(1, nrow(contig_sorted), by = 2)) +
  labs(
    title = "Кумулятивное покрытие генома Saccharomyces cerevisiae",
    subtitle = "Процент генома, покрываемый N крупнейшими контигами",
    x = "Количество контигов (по убыванию длины)",
    y = "Процент покрытия генома (%)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    plot.subtitle = element_text(hjust = 0.5, size = 12),
    panel.grid.major = element_line(color = "grey80"),
    panel.grid.minor = element_line(color = "grey90")
  )


png("cumulative_genome_coverage.png", width = 10, height = 7, units = "in", res = 300)
print(p2)
## Warning in geom_point(aes(x = l50_value, y = 50), color = "red", size = 4, : All aesthetics have length 1, but the data has 17 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.
## Warning in geom_label(aes(x = l50_value, y = 50, label = paste("L50 =", : All aesthetics have length 1, but the data has 17 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.
dev.off()
## quartz_off_screen 
##                 2
# ГРАФИК 3: Логарифмическая шкала
p3 <- ggplot(contig_sorted, aes(x = Rank, y = Length)) +
  geom_line(color = "darkgreen", linewidth = 1) +
  geom_point(color = "darkgreen", size = 2) +
  geom_point(data = contig_sorted %>% filter(Rank == l50_value),
             aes(x = Rank, y = Length), 
             color = "red", size = 5, shape = 18) +
  geom_label(data = contig_sorted %>% filter(Rank == l50_value),
            aes(x = Rank, y = Length, 
                label = paste("N50/L50\nКонтиг:", n50_contig)),
            nudge_x = 2, nudge_y = 0.2,
            size = 3.5, color = "red", fill = "lightyellow") +
  scale_y_log10(
    labels = scales::comma,
    breaks = 10^(3:7)
  ) +
  labs(
    title = "Убывание длины контигов (логарифмическая шкала)",
    subtitle = paste("Saccharomyces cerevisiae S288C | N50 =", format(n50_value, big.mark = ","), "bp"),
    x = "Ранг контига",
    y = "Длина (bp, log scale)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    plot.subtitle = element_text(hjust = 0.5, size = 12)
  )


png("contig_length_log_scale.png", width = 10, height = 7, units = "in", res = 300)
print(p3)
dev.off()
## quartz_off_screen 
##                 2
# Вывод статистики
cat("СТАТИСТИКА ГЕНОМА:\n")
## СТАТИСТИКА ГЕНОМА:
cat("Общая длина генома:", format(total_length, big.mark = ","), "bp\n")
## Общая длина генома: 12,157,105 bp
cat("Количество контигов:", nrow(contig_sorted), "\n")
## Количество контигов: 17
cat("N50:", format(n50_value, big.mark = ","), "bp\n")
## N50: 924,431 bp
cat("L50:", l50_value, "\n")
## L50: 6
cat("Контиг, определяющий N50:", n50_contig, "\n")
## Контиг, определяющий N50: XIII
cat("Самый длинный контиг:", contig_sorted$Sequence[1], "(", 
    format(contig_sorted$Length[1], big.mark = ","), "bp)\n")
## Самый длинный контиг: IV ( 1,531,933 bp)
cat("Самый короткий контиг:", contig_sorted$Sequence[nrow(contig_sorted)], "(", 
    format(contig_sorted$Length[nrow(contig_sorted)], big.mark = ","), "bp)\n")
## Самый короткий контиг: MT ( 85,779 bp)
stats_text <- paste(
  "СТАТИСТИКА ГЕНОМА Saccharomyces cerevisiae S288C",
  "================================================",
  paste("Общая длина генома:", format(total_length, big.mark = ","), "bp"),
  paste("Количество контигов:", nrow(contig_sorted)),
  paste("N50:", format(n50_value, big.mark = ","), "bp"),
  paste("L50:", l50_value),
  paste("Контиг, определяющий N50:", n50_contig),
  paste("Самый длинный контиг:", contig_sorted$Sequence[1], "(", 
        format(contig_sorted$Length[1], big.mark = ","), "bp)"),
  paste("Самый короткий контиг:", contig_sorted$Sequence[nrow(contig_sorted)], "(", 
        format(contig_sorted$Length[nrow(contig_sorted)], big.mark = ","), "bp)"),
  sep = "\n"
)

writeLines(stats_text, "genome_statistics.txt")

cat("\nГрафики:\n")
## 
## Графики:
cat("1. contig_length_distribution.png - Основной график убывания длин\n")
## 1. contig_length_distribution.png - Основной график убывания длин
cat("2. cumulative_genome_coverage.png - Кумулятивное покрытие\n")
## 2. cumulative_genome_coverage.png - Кумулятивное покрытие
cat("3. contig_length_log_scale.png - Логарифмическая шкала\n")
## 3. contig_length_log_scale.png - Логарифмическая шкала
cat("4. genome_statistics.txt - Статистика генома\n")
## 4. genome_statistics.txt - Статистика генома