Влияние микрогравитации на экспрессию генов

Чтение исходных данных (таблица метаданных была изменена)

counts <- read_tsv("06_marotta_2024_counts.tsv.gz")
## Rows: 39376 Columns: 37
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## dbl (37): GeneID, GSM8115892, GSM8115893, GSM8115894, GSM8115895, GSM8115896...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
metadata <- read_tsv("metadata_advanced.tsv")
## Rows: 42 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): sample, treatment, subject, diagnosis, organoid, microglia, condition
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Проведение диффэкспрессии Космос vs Земля (для здоровых клеток, кортикальных органоидов)

# другие библиотеки до запуска этого скрипта лучше не импортировать а то они любят конфликтовать
library(tidyverse)
library(DESeq2)
library(clusterProfiler)

# НУЖНО САМОСТОЯТЕЛЬНО ЗАДАТЬ: 
# 1) отфильтрованные метаданные в переменную metadata_doph 
# 2) вектор из нужных ID образцов в переменную samplesID
# 3) колонку по которой будет строиться дизайн в переменную design
# 4) в переменную control задать значение которое будет использовано как контрольный эксперимент (из колонки, по которой строится дизайн) - т.е. относительно чего мы будем мерить обогащение экспрессии

################## меняем тут
metadata_doph <- metadata %>%
  filter(organoid == 'Cortical organoid' & subject == 'Subject1')


samplesID <- metadata_doph$sample

design <- 'treatment'

control <- 'ground'
##################

metadata_doph
## # A tibble: 11 × 7
##    sample     treatment    subject  diagnosis organoid       microglia condition
##    <chr>      <chr>        <chr>    <chr>     <chr>          <chr>     <chr>    
##  1 GSM8115892 ground       Subject1 Healthy   Cortical orga… without … Ground1  
##  2 GSM8115893 ground       Subject1 Healthy   Cortical orga… without … Ground2  
##  3 GSM8115894 ground       Subject1 Healthy   Cortical orga… without … Ground3  
##  4 GSM8115895 ground       Subject1 Healthy   Cortical orga… with mic… Ground1  
##  5 GSM8115896 ground       Subject1 Healthy   Cortical orga… with mic… Ground2  
##  6 GSM8115897 ground       Subject1 Healthy   Cortical orga… with mic… Ground3  
##  7 GSM8115902 microgravity Subject1 Healthy   Cortical orga… without … LEO1     
##  8 GSM8115903 microgravity Subject1 Healthy   Cortical orga… without … LEO2     
##  9 GSM8115904 microgravity Subject1 Healthy   Cortical orga… with mic… LEO1     
## 10 GSM8115905 microgravity Subject1 Healthy   Cortical orga… with mic… LEO2     
## 11 GSM8115906 microgravity Subject1 Healthy   Cortical orga… with mic… LEO3
################## Это просто запустить
samplesID <- c('GeneID', samplesID)
# отбираем только гены у которых больше 10 каунтов на все образцы
counts_doph <- counts[, names(counts) %in% samplesID]
counts_doph$total_sum <- rowSums(counts_doph[, -1])
counts_doph <- counts_doph[counts_doph$total_sum >= 20, ]
counts_doph <- counts_doph[, -ncol(counts_doph)]

# проверка чтобы в counts и metadata содержались одинаковые образцы
counts_exps <- names(counts_doph)
intersect_IDs <- intersect(samplesID, counts_exps)
counts_doph <- counts[, names(counts) %in% intersect_IDs]
metadata_doph <- metadata %>%
  filter(sample %in% intersect_IDs)
# преобразуем в матрицу каунты
counts_doph <- as.data.frame(counts_doph)
rownames(counts_doph) <- counts_doph$GeneID
counts_mtx <- counts_doph %>% dplyr::select(where(is.numeric)) %>% dplyr::select(-GeneID) %>% as.matrix()

metadata_doph <- as.data.frame(metadata_doph)
rownames(metadata_doph) <- metadata_doph$sample

# доработать, а пока меняю вручную
metadata_doph <- metadata_doph %>%
  mutate(treatment = as.factor(treatment))

# control - базовое условие относительно которого сичтается fold change
metadata_doph <- metadata_doph %>%
  mutate(treatment = relevel(as.factor(treatment), ref = control))

dds <- DESeqDataSetFromMatrix(
  countData = counts_mtx,
  colData = metadata_doph,
  design= as.formula(paste("~", design)))
## converting counts to integer mode
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
name_var <- resultsNames(dds)[2]

res <- results(dds, name = name_var)

diff_expression_data <- res %>% as.data.frame() %>% 
  mutate(gene = rownames(.)) %>% 
  select(gene, log2FoldChange, padj) %>% 
  drop_na(padj) %>% 
  arrange(desc(abs(log2FoldChange)))

diff_expression_data <- bitr(
  geneID = diff_expression_data$gene,           # вектор идентификаторов генов
  fromType = "ENTREZID",               # откуда
  toType = c("SYMBOL", "ENTREZID"), # куда
  OrgDb = "org.Hs.eg.db") %>% 
  inner_join(diff_expression_data, by = c("ENTREZID" = "gene"))
## 
## 'select()' returned 1:1 mapping between keys and columns
## Warning in bitr(geneID = diff_expression_data$gene, fromType = "ENTREZID", :
## 2.5% of input gene IDs are fail to map...
library(ggplot2)
transformed_dds <- rlog(dds)
#transformed_dds <- vst(dds)
pca_plot <- plotPCA(transformed_dds, intgroup=c("treatment")) +
  geom_label(aes(label = name))
## using ntop=500 top features by variance
ggsave("pca_plot_cortic.png", plot = pca_plot, width = 8, height = 6, dpi = 300)

pca_plot

library(EnhancedVolcano)
## Loading required package: ggrepel
volcano_plot <- EnhancedVolcano(diff_expression_data,
    lab = diff_expression_data$SYMBOL,
    x = 'log2FoldChange',
    y = 'padj',
    legendPosition = 'top',
    pCutoff = 0.1,
    FCcutoff = 1,
    title = 'Space vs Ground')

ggsave("volcano_plot_cortic.png", plot = volcano_plot, width = 7, height = 7, dpi = 300)

volcano_plot

detach("package:EnhancedVolcano", unload=TRUE)

ORA

Помогала делать Тимошкова ## Activated

diff_expression_data_up <- diff_expression_data %>% filter(padj <= 0.05, log2FoldChange > 0.5)

ora_up_go <- enrichGO(
  gene = diff_expression_data_up$SYMBOL,  # вектор идентификаторов генов
  OrgDb = "org.Hs.eg.db",    # модельный организм
  keyType = "SYMBOL",        # тип идентификатора генов
  ont = "ALL",               # аннотация GO BP+CC+MF
  universe = diff_expression_data$SYMBOL,
  qvalueCutoff = 0.1)
go_plot <- barplot(ora_up_go,
                   ) +
  labs(title = "Activated \n GO categories in space")

ggsave("Activated categories in space.png", plot = go_plot, width = 8, height = 5, dpi = 300)

go_plot

dotplot(ora_up_go) +
  labs(title = "Activated \n GO categories")

cnetplot_goup <- cnetplot(ora_up_go) 

ggsave("cnetplot_Activated categories in space.png", plot = cnetplot_goup, width = 5, height = 8, dpi = 300)

cnetplot_goup

ora_up_go %>% filter(ONTOLOGY == "BP") %>% 
  enrichplot::pairwise_termsim() %>%  # сходство между категориями
  emapplot()

Suppressed

diff_expression_data_down <- diff_expression_data %>% filter(padj <= 0.05, log2FoldChange < 0)

ora_up_go <- enrichGO(
  gene = diff_expression_data_down$SYMBOL,  # вектор идентификаторов генов
  OrgDb = "org.Hs.eg.db",    # модельный организм
  keyType = "SYMBOL",        # тип идентификатора генов
  ont = "ALL",               # аннотация GO BP+CC+MF
  universe = diff_expression_data$SYMBOL,
  qvalueCutoff = 0.05)
go_plot <- barplot(ora_up_go, colorByPvalue=TRUE) +
  labs(title = "Suppressed \n GO categories in space")

ggsave("Suppressed categories in space.png", plot = go_plot, width = 8, height = 5, dpi = 300)

go_plot

cne_plot <- cnetplot(ora_up_go) 

ggsave("cnetplot Suppressed categories in space.png", plot = cne_plot, width = 5, height = 8, dpi = 300)
cne_plot
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# GSEA kegg

gene_list <- diff_expression_data$log2FoldChange
names(gene_list) <- diff_expression_data$ENTREZID
gene_list <- sort(gene_list, decreasing = TRUE)

gsea_kegg <- gseKEGG(
  geneList = gene_list, # список генов
  organism = "hsa",     # модельный организм
  pvalueCutoff = 0.01)   # фильтр по p-значению
## Reading KEGG annotation online: "https://rest.kegg.jp/link/hsa/pathway"...
## Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/hsa"...
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.01% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : There were 4 pathways for which P-values were not calculated
## properly due to unbalanced (positive and negative) gene-level statistic values.
## For such pathways pval, padj, NES, log2err are set to NA. You can try to
## increase the value of the argument nPermSimple (for example set it nPermSimple
## = 10000)
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some of the pathways the P-values were likely overestimated. For
## such pathways log2err is set to NA.
## leading edge analysis...
## done...
ridgeplot(gsea_kegg, showCategory=10)
## Picking joint bandwidth of 0.0949

gseaplot(gsea_kegg, geneSetID = 1, title = gsea_kegg$Description[1])

WGCNA

counts_WGCNA <- counts[, names(counts) %in% samplesID]
metadata_doph_WGCNA <- metadata_doph %>%
  mutate(word = gsub("\\d+", "", condition)) %>%  # Удаляем цифры из condition
  group_by(word) %>%  # Группируем по извлеченному слову
  mutate(name = paste0(word, '-', sample)) %>%  # Добавляем номер строки для каждой группы
  ungroup() %>%  # Убираем группировку
  select(-word)  # Удаляем временный столбец word
wgcan_colnames <- metadata_doph_WGCNA$name
wgcan_colnames <- c('GeneID', wgcan_colnames)
names(counts_WGCNA) <- wgcan_colnames
counts_WGCNA
## # A tibble: 39,376 × 11
##       GeneID `Ground-GSM8115892` `Ground-GSM8115893` `Ground-GSM8115894`
##        <dbl>               <dbl>               <dbl>               <dbl>
##  1 100287102                  22                   4                  17
##  2    653635                 308                 188                 379
##  3 102466751                   2                   0                   0
##  4 107985730                   4                  13                   8
##  5 100302278                   1                   0                   3
##  6    645520                   2                   3                   7
##  7     79501                   0                   0                   0
##  8 100996442                  10                  28                  25
##  9    729737                  50                 102                  80
## 10 102725121                  30                  18                  37
## # ℹ 39,366 more rows
## # ℹ 7 more variables: `Ground-GSM8115895` <dbl>, `Ground-GSM8115896` <dbl>,
## #   `Ground-GSM8115897` <dbl>, `LEO-GSM8115902` <dbl>, `LEO-GSM8115903` <dbl>,
## #   `LEO-GSM8115904` <dbl>, `LEO-GSM8115906` <dbl>
mdata <- counts_WGCNA %>%
  pivot_longer( !GeneID, names_to = "name", values_to = "value" ) %>%
  mutate( group = gsub("-.*","", name) %>% gsub("[.].*","", .)) %>%
  mutate(new_column = sub(".*-", "", name))
mdata
## # A tibble: 393,760 × 5
##       GeneID name              value group  new_column
##        <dbl> <chr>             <dbl> <chr>  <chr>     
##  1 100287102 Ground-GSM8115892    22 Ground GSM8115892
##  2 100287102 Ground-GSM8115893     4 Ground GSM8115893
##  3 100287102 Ground-GSM8115894    17 Ground GSM8115894
##  4 100287102 Ground-GSM8115895    13 Ground GSM8115895
##  5 100287102 Ground-GSM8115896    29 Ground GSM8115896
##  6 100287102 Ground-GSM8115897    20 Ground GSM8115897
##  7 100287102 LEO-GSM8115902       21 LEO    GSM8115902
##  8 100287102 LEO-GSM8115903       13 LEO    GSM8115903
##  9 100287102 LEO-GSM8115904       15 LEO    GSM8115904
## 10 100287102 LEO-GSM8115906       23 LEO    GSM8115906
## # ℹ 393,750 more rows
p <- mdata %>%
#  head(10000) %>%
  ggplot(aes(x = new_column, y = value)) +
  geom_violin() + 
  geom_point(alpha = 0.2) + 
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90,size=10),
        text = element_text(size = 20)) +
  labs(x = "Treatment Groups", y = "RNA Seq Counts", title = 'Row counts') +
  facet_grid(cols = vars(group), drop = TRUE, scales = "free_x")

ggsave("whitiout_norm.png", plot = p, width = 8, height = 6, dpi = 300)
p

de_input = as.matrix(counts_WGCNA[,-1])
row.names(de_input) = counts_WGCNA$GeneID

meta_df <- data.frame( Sample = names(counts_WGCNA[-1])) %>%
  mutate(Type = gsub("-.*","", Sample) %>% 
           gsub("[.].*","", .))
dds <- DESeqDataSetFromMatrix(round(de_input),
                              meta_df,
                              design = ~Type)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
vsd <- varianceStabilizingTransformation(dds)

wpn_vsd <- getVarianceStabilizedData(dds)
wpn_vsd[1:5, 1:5]
##           Ground-GSM8115892 Ground-GSM8115893 Ground-GSM8115894
## 100287102          6.334068          5.535064          6.100980
## 653635             8.838685          7.920377          8.894648
## 102466751          5.445491          5.045074          5.045074
## 107985730          5.609560          5.919234          5.777795
## 100302278          5.328664          5.045074          5.496763
##           Ground-GSM8115895 Ground-GSM8115896
## 100287102          5.854474          6.427703
## 653635             7.963320          8.717918
## 102466751          5.045074          5.045074
## 107985730          5.721198          5.835222
## 100302278          5.045074          5.421160
rv_wpn <- rowVars(wpn_vsd)
q95_wpn <- quantile( rowVars(wpn_vsd), .95) 

expr_normalized <- wpn_vsd[ rv_wpn > q95_wpn, ]
expr_normalized[1:5,1:5]
##           Ground-GSM8115892 Ground-GSM8115893 Ground-GSM8115894
## 113219467          9.052877         10.673510         10.358306
## 107985728          6.248051          5.535064          6.264256
## 57801             12.525696         13.147242         12.042059
## 105378604          5.609560          5.045074          6.070734
## 7161               6.919040          7.286699          7.304778
##           Ground-GSM8115895 Ground-GSM8115896
## 113219467         10.878194          9.313557
## 107985728          5.272282          6.057071
## 57801             13.342494         12.147071
## 105378604          5.823483          5.505039
## 7161               7.167813          6.701383
expr_normalized_df <- data.frame(expr_normalized) %>%
  mutate(
    Gene_id = row.names(expr_normalized)
  ) %>%
  pivot_longer(-Gene_id) %>%
  mutate( group = gsub("-.*","", name) %>% gsub("[.].*","", .)) %>%
  mutate(name_2 = sub(".*\\.", "", name))

head(expr_normalized_df)
## # A tibble: 6 × 5
##   Gene_id   name              value group  name_2    
##   <chr>     <chr>             <dbl> <chr>  <chr>     
## 1 113219467 Ground.GSM8115892  9.05 Ground GSM8115892
## 2 113219467 Ground.GSM8115893 10.7  Ground GSM8115893
## 3 113219467 Ground.GSM8115894 10.4  Ground GSM8115894
## 4 113219467 Ground.GSM8115895 10.9  Ground GSM8115895
## 5 113219467 Ground.GSM8115896  9.31 Ground GSM8115896
## 6 113219467 Ground.GSM8115897 10.3  Ground GSM8115897
p <- expr_normalized_df %>%
#  head(10000) %>%
  ggplot(aes(x = name_2, y = value)) +
  geom_violin() + 
  geom_point(alpha = 0.2) + 
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, size=10),
        text = element_text(size = 20)) +
  ylim(0, NA) +
  labs(title = 'Normalized and 95 quantile Expression', x = "Treatment Groups", y = "RNA Seq Counts") +
  facet_grid(cols = vars(group), drop = TRUE, scales = "free_x")

ggsave("normalized_counts.png", plot = p, width = 8, height = 6, dpi = 300)
p

input_mat = t(expr_normalized)

input_mat[1:5,1:4]
##                   113219467 107985728    57801 105378604
## Ground-GSM8115892  9.052877  6.248051 12.52570  5.609560
## Ground-GSM8115893 10.673510  5.535064 13.14724  5.045074
## Ground-GSM8115894 10.358306  6.264256 12.04206  6.070734
## Ground-GSM8115895 10.878194  5.272282 13.34249  5.823483
## Ground-GSM8115896  9.313557  6.057071 12.14707  5.505039
powers = c(c(1:10), seq(from = 12, to = 20, by = 2))
sft <- pickSoftThreshold(
  input_mat, 
  powerVector = powers,
  verbose = 5
)
## pickSoftThreshold: will use block size 1969.
##  pickSoftThreshold: calculating connectivity for given powers...
##    ..working on genes 1 through 1969 of 1969
## Warning: executing %dopar% sequentially: no parallel backend registered
##    Power SFT.R.sq   slope truncated.R.sq mean.k. median.k. max.k.
## 1      1  0.72900  2.6600          0.938   921.0     963.0 1260.0
## 2      2  0.45800  0.9220          0.847   555.0     583.0  914.0
## 3      3  0.09240  0.2710          0.693   374.0     386.0  703.0
## 4      4  0.00728 -0.0641          0.666   269.0     275.0  559.0
## 5      5  0.13800 -0.2950          0.663   202.0     202.0  455.0
## 6      6  0.27100 -0.4460          0.704   157.0     153.0  377.0
## 7      7  0.39400 -0.5310          0.768   125.0     118.0  317.0
## 8      8  0.48400 -0.6190          0.800   101.0      93.6  270.0
## 9      9  0.56600 -0.6950          0.834    83.5      75.2  235.0
## 10    10  0.63200 -0.7530          0.878    69.9      61.8  207.0
## 11    12  0.67400 -0.8850          0.900    50.6      43.0  165.0
## 12    14  0.71000 -0.9940          0.922    38.0      31.0  136.0
## 13    16  0.77200 -1.0500          0.961    29.4      22.9  114.0
## 14    18  0.80800 -1.1200          0.979    23.3      17.3   97.5
## 15    20  0.82400 -1.2100          0.985    18.8      13.3   85.8
sft
## $powerEstimate
## [1] NA
## 
## $fitIndices
##    Power    SFT.R.sq       slope truncated.R.sq   mean.k. median.k.     max.k.
## 1      1 0.728779870  2.66096877      0.9382001 920.83114 963.01609 1257.37659
## 2      2 0.457685689  0.92163682      0.8470379 555.47554 582.50603  914.19154
## 3      3 0.092368042  0.27123343      0.6929240 374.10370 386.23628  702.77926
## 4      4 0.007278128 -0.06405253      0.6658327 268.87362 274.50500  558.83134
## 5      5 0.137690580 -0.29508552      0.6627614 201.95561 202.00735  455.00717
## 6      6 0.270872931 -0.44572802      0.7040691 156.68635 152.85207  377.17961
## 7      7 0.393983294 -0.53056837      0.7684007 124.64809 118.45379  317.17494
## 8      8 0.483946654 -0.61892093      0.8003009 101.17287  93.63633  270.46229
## 9      9 0.565699046 -0.69472855      0.8337424  83.48897  75.22531  235.32851
## 10    10 0.631539306 -0.75321757      0.8778638  69.86162  61.79687  206.54549
## 11    12 0.674076485 -0.88546941      0.9003671  50.61456  43.04831  165.46153
## 12    14 0.710464928 -0.99417614      0.9221527  38.02940  30.99294  136.02176
## 13    16 0.772403372 -1.04941935      0.9610752  29.40487  22.88995  113.90616
## 14    18 0.807704547 -1.12178881      0.9792095  23.27130  17.28731   97.52804
## 15    20 0.824071445 -1.21279633      0.9849861  18.77579  13.28135   85.84917
par(mfrow = c(1,2));
cex1 = 0.9;

plot(sft$fitIndices[, 1],
     -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
     xlab = "Soft Threshold (power)",
     ylab = "Scale Free Topology Model Fit, signed R^2",
     main = paste("Scale independence")
)
text(sft$fitIndices[, 1],
     -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
     labels = powers, cex = cex1, col = "red"
)
abline(h = 0.90, col = "red")
plot(sft$fitIndices[, 1],
     sft$fitIndices[, 5],
     xlab = "Soft Threshold (power)",
     ylab = "Mean Connectivity",
     type = "n",
     main = paste("Mean connectivity")
)
text(sft$fitIndices[, 1],
     sft$fitIndices[, 5],
     labels = powers,
     cex = cex1, col = "red")

#10
picked_power = 10
temp_cor <- cor       
cor <- WGCNA::cor 
netwk <- blockwiseModules(input_mat,
                          power = picked_power,   
                          networkType = "signed",
                          deepSplit = 2,
                          pamRespectsDendro = F,
                          minModuleSize = 30,
                          maxBlockSize = 4000,
                          reassignThreshold = 0,
                          mergeCutHeight = 0.25,
                          saveTOMs = T,
                          saveTOMFileBase = "ER",
                          numericLabels = T,
                          verbose = 3)
##  Calculating module eigengenes block-wise from all genes
##    Flagging genes and samples with too many missing values...
##     ..step 1
##  ..Working on block 1 .
##     TOM calculation: adjacency..
##     ..will not use multithreading.
##      Fraction of slow calculations: 0.000000
##     ..connectivity..
##     ..matrix multiplication (system BLAS)..
##     ..normalization..
##     ..done.
##    ..saving TOM for block 1 into file ER-block.1.RData
##  ....clustering..
##  ....detecting modules..
##  ....calculating module eigengenes..
##  ....checking kME in modules..
##      ..removing 1 genes from module 2 because their KME is too low.
##      ..removing 1 genes from module 4 because their KME is too low.
##      ..removing 2 genes from module 5 because their KME is too low.
##      ..removing 1 genes from module 7 because their KME is too low.
##      ..removing 1 genes from module 8 because their KME is too low.
##  ..merging modules that are too close..
##      mergeCloseModules: Merging modules whose distance is less than 0.25
##        Calculating new MEs...
mergedColors = labels2colors(netwk$colors)
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(
  netwk$dendrograms[[1]],
  mergedColors[netwk$blockGenes[[1]]],
  "Module colors",
  dendroLabels = FALSE,
  hang = 0.03,
  addGuide = TRUE,
  guideHang = 0.05 )

module_df <- data.frame(
  gene_id = names(netwk$colors),
  colors = labels2colors(netwk$colors)
)
table(module_df$colors)
## 
##     black      blue     brown     green      grey      pink       red turquoise 
##        85       443       284       201         6        53       128       527 
##    yellow 
##       242
MEs0 <- moduleEigengenes(input_mat, mergedColors)$eigengenes
MEs0 <- orderMEs(MEs0)
module_order = names(MEs0) %>% gsub("ME","", .)
MEs0$treatment = row.names(MEs0)

mME = MEs0 %>%
  pivot_longer(-treatment) %>%
  mutate(
    name = gsub("ME", "", name),
    name = factor(name, levels = module_order)
  )
mME %>% ggplot(aes(x=treatment, y=name, fill=value)) +
  geom_tile() +
  theme_bw() +
  scale_fill_gradient2(
    low = "blue",
    high = "red",
    mid = "white",
    midpoint = 0,
    limit = c(-1,1)) +
  theme(axis.text.x = element_text(angle=90)) +
  labs(title = "Module-trait Relationships", y = "Modules", fill="corr")

mME
## # A tibble: 90 × 3
##    treatment         name        value
##    <chr>             <fct>       <dbl>
##  1 Ground-GSM8115892 turquoise  0.124 
##  2 Ground-GSM8115892 yellow    -0.189 
##  3 Ground-GSM8115892 blue      -0.342 
##  4 Ground-GSM8115892 red       -0.434 
##  5 Ground-GSM8115892 black      0.455 
##  6 Ground-GSM8115892 green      0.493 
##  7 Ground-GSM8115892 brown     -0.0113
##  8 Ground-GSM8115892 pink       0.365 
##  9 Ground-GSM8115892 grey      -0.172 
## 10 Ground-GSM8115893 turquoise -0.621 
## # ℹ 80 more rows
correlation_matrix <- mME %>%
  pivot_wider(names_from = treatment, values_from = value) %>%
  column_to_rownames('name') %>%
  as.matrix()
correlation_matrix
##           Ground-GSM8115892 Ground-GSM8115893 Ground-GSM8115894
## turquoise        0.12425687       -0.62126077        0.13785724
## yellow          -0.18891558       -0.45294506        0.51227834
## blue            -0.34226568       -0.44083811        0.09444174
## red             -0.43430775       -0.12839683       -0.05957096
## black            0.45482062       -0.36536788        0.31851792
## green            0.49304791        0.26137267        0.01299616
## brown           -0.01126801        0.50594690       -0.49105596
## pink             0.36460089        0.17774576       -0.33727510
## grey            -0.17248158        0.02399165       -0.33475927
##           Ground-GSM8115895 Ground-GSM8115896 Ground-GSM8115897 LEO-GSM8115902
## turquoise       -0.63116885        0.18343456        0.17035777     0.16191022
## yellow          -0.39662421       -0.06360123        0.32762108     0.04507156
## blue            -0.38752227       -0.13435287       -0.10828996     0.47202334
## red             -0.21905874       -0.36163887        0.04017425     0.12208362
## black           -0.21249528        0.50665632        0.14585193    -0.09405857
## green            0.28051968        0.33199763       -0.04205212    -0.27154333
## brown            0.61851099        0.02267231       -0.07742515    -0.27995249
## pink             0.11071709        0.16384063       -0.47403400     0.30948827
## grey            -0.02320097        0.45731933        0.09939372     0.22537264
##           LEO-GSM8115903 LEO-GSM8115904 LEO-GSM8115906
## turquoise     0.21680762      0.2066660     0.05113931
## yellow       -0.02284437      0.4335778    -0.19361836
## blue          0.12381467      0.3527469     0.37024223
## red           0.60635964      0.4760698    -0.04171417
## black        -0.40722177     -0.1614138    -0.18528952
## green        -0.47296345     -0.4068235    -0.18655166
## brown        -0.09634056     -0.1579061    -0.03318192
## pink         -0.15860347     -0.4811835     0.32470341
## grey          0.50193870     -0.2603227    -0.51725158
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.22.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##     genomic data. Bioinformatics 2016.
## 
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
set.seed(123)
data_matrix <- matrix(rnorm(100), nrow=10, ncol=10)
rownames(data_matrix) <- paste0("Row", 1:10)
colnames(data_matrix) <- paste0("Col", 1:10)
Heatmap(correlation_matrix, cluster_rows = TRUE, cluster_columns = TRUE)

#меняем
modules_of_interest = c("red", "blue", 'yellow', 'turquoise')
submod = module_df %>%
  subset(colors %in% modules_of_interest)

row.names(module_df) = module_df$gene_id

subexpr = expr_normalized[submod$gene_id,]

submod_df = data.frame(subexpr) %>%
  mutate(
    gene_id = row.names(.)
  ) %>%
  pivot_longer(-gene_id) %>%
  mutate(
    module = module_df[gene_id,]$colors
  )

submod_df %>% ggplot(aes(x=name, y=value, group=gene_id)) +
  geom_line(aes(color = module),
            alpha = 0.2) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90)
  ) +
  facet_grid(rows = vars(module)) +
  labs(x = "treatment",
       y = "normalized expression")

modules_of_interest = c("green", "black", 'pink', 'brown', 'grey')
submod = module_df %>%
  subset(colors %in% modules_of_interest)

row.names(module_df) = module_df$gene_id

subexpr = expr_normalized[submod$gene_id,]

submod_df = data.frame(subexpr) %>%
  mutate(
    gene_id = row.names(.)
  ) %>%
  pivot_longer(-gene_id) %>%
  mutate(
    module = module_df[gene_id,]$colors
  )

submod_df %>% ggplot(aes(x=name, y=value, group=gene_id)) +
  geom_line(aes(color = module),
            alpha = 0.2) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90)
  ) +
  facet_grid(rows = vars(module)) +
  labs(x = "treatment",
       y = "normalized expression")

Проанализируем выбранный модуль генов

modules_of_interest = c("green")
submod = module_df %>%
  subset(colors %in% modules_of_interest)

row.names(module_df) = module_df$gene_id

subexpr = expr_normalized[submod$gene_id,]

submod_df = data.frame(subexpr) %>%
  mutate(
    gene_id = row.names(.)
  ) %>%
  pivot_longer(-gene_id) %>%
  mutate(
    module = module_df[gene_id,]$colors
  )

genes_of_interest = module_df %>%
  subset(colors %in% modules_of_interest)

expr_of_interest = expr_normalized[genes_of_interest$gene_id,]

target_list_genes <- rownames(expr_of_interest)

length(target_list_genes)
## [1] 201

ORA для модуля генов

Activated

ora_up_go2 <- enrichGO(
  gene = target_list_genes,  # вектор идентификаторов генов
  OrgDb = "org.Hs.eg.db",    # модельный организм
  keyType = "ENTREZID",        # тип идентификатора генов
  ont = "ALL",               # аннотация GO BP+CC+MF
  universe = diff_expression_data$ENTREZID,
  qvalueCutoff = 0.1)
ora_up_go2
## #
## # over-representation test
## #
## #...@organism     Homo sapiens 
## #...@ontology     GOALL 
## #...@keytype      ENTREZID 
## #...@gene     chr [1:201] "374946" "100379251" "4878" "3925" "3151" "113452" "55268" ...
## #...pvalues adjusted by 'BH' with cutoff <0.05 
## #...105 enriched terms found
## 'data.frame':    105 obs. of  13 variables:
##  $ ONTOLOGY      : chr  "BP" "BP" "BP" "BP" ...
##  $ ID            : chr  "GO:0030900" "GO:0033555" "GO:0050804" "GO:0099177" ...
##  $ Description   : chr  "forebrain development" "multicellular organismal response to stress" "modulation of chemical synaptic transmission" "regulation of trans-synaptic signaling" ...
##  $ GeneRatio     : chr  "19/175" "9/175" "20/175" "20/175" ...
##  $ BgRatio       : chr  "391/14640" "85/14640" "469/14640" "470/14640" ...
##  $ RichFactor    : num  0.0486 0.1059 0.0426 0.0426 0.0549 ...
##  $ FoldEnrichment: num  4.07 8.86 3.57 3.56 4.6 ...
##  $ zScore        : num  6.76 7.99 6.22 6.2 6.6 ...
##  $ pvalue        : num  2.26e-07 7.66e-07 8.48e-07 8.77e-07 9.89e-07 ...
##  $ p.adjust      : num  0.000521 0.000521 0.000521 0.000521 0.000521 ...
##  $ qvalue        : num  0.00046 0.00046 0.00046 0.00046 0.00046 ...
##  $ geneID        : chr  "374946/3398/2016/4929/4760/56978/51176/84709/2890/63974/9760/4781/6585/6506/2290/64919/26153/8851/6752" "4929/348980/2890/117/5179/157753/5024/41/4761" "79012/590/2890/10590/5179/84628/5024/4908/2904/41/1138/1136/1143/5579/4761/4804/81832/24141/5413/91851" "79012/590/2890/10590/5179/84628/5024/4908/2904/41/1138/1136/1143/5579/4761/4804/81832/24141/5413/91851" ...
##  $ Count         : int  19 9 20 20 15 20 6 6 7 6 ...
## #...Citation
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R Wang, W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize multiomics data. Nature Protocols. 2024, doi:10.1038/s41596-024-01020-z
barplot(ora_up_go2) +
  labs(title = "Enriched \n GO categories for green genes")

cnetplot(ora_up_go, showCategory = 10, showCategorySize = 10) 
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Вершины графа. Нахождение драйверных генов.

TOM = TOMsimilarityFromExpr(t(expr_of_interest),
                            power = picked_power)
## TOM calculation: adjacency..
## ..will not use multithreading.
##  Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
row.names(TOM) = row.names(expr_of_interest)
colnames(TOM) = row.names(expr_of_interest)

edge_list = data.frame(TOM) %>%
  mutate(
    gene1 = row.names(.)
  ) %>%
  pivot_longer(-gene1) %>%
  dplyr::rename(gene2 = name, correlation = value) %>%
  unique() %>%
  subset(!(gene1==gene2)) %>%
  mutate(
    module1 = module_df[gene1,]$colors,
    module2 = module_df[gene2,]$colors
  )
edge_list
## # A tibble: 40,401 × 5
##    gene1  gene2      correlation module1 module2
##    <chr>  <chr>            <dbl> <chr>   <chr>  
##  1 374946 X374946         1      green   <NA>   
##  2 374946 X100379251      0.0275 green   <NA>   
##  3 374946 X4878           0.121  green   <NA>   
##  4 374946 X3925           0.127  green   <NA>   
##  5 374946 X3151           0.252  green   <NA>   
##  6 374946 X113452         0.226  green   <NA>   
##  7 374946 X55268          0.149  green   <NA>   
##  8 374946 X3725           0.0553 green   <NA>   
##  9 374946 X4774           0.212  green   <NA>   
## 10 374946 X8543           0.299  green   <NA>   
## # ℹ 40,391 more rows
degree_counts <- edge_list %>%
  pivot_longer(cols = c(gene1, gene2), names_to = "gene", values_to = "vertex") %>%
  group_by(vertex) %>%
  summarise(degree = sum(correlation)) %>%
  arrange(desc(degree))

degree_counts <- degree_counts[!grepl("^X", degree_counts$vertex), ]
degree_counts
## # A tibble: 201 × 2
##    vertex degree
##    <chr>   <dbl>
##  1 3344     49.7
##  2 3166     48.7
##  3 4188     46.3
##  4 53335    46.0
##  5 63974    45.6
##  6 8543     45.0
##  7 374946   44.7
##  8 10276    44.4
##  9 9124     43.7
## 10 26035    42.8
## # ℹ 191 more rows
ggplot(degree_counts, aes(x = degree)) +
geom_histogram(binwidth = 0.7, fill = "blue", color = "black") +
  labs(title = "Гистограмма степеней вершин для желтого кластера", x = "Степень", y = "Частота") +
  theme_minimal()

Получение GO terms для драйверных генов

gene_of_interest <- 4188
library(biomaRt)

# Создание объекта biomaRt для доступа к базе данных Ensembl
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")


# Извлечение GO аннотаций для гена
go_annotations <- getBM(
  attributes = c('external_gene_name', "name_1006", "definition_1006"),
  filters = "entrezgene_id",
  values = gene_of_interest,
  mart = ensembl
)


go_annotations
##    external_gene_name                                                 name_1006
## 1                MDFI                                                          
## 2                MDFI                                                   nucleus
## 3                MDFI                                                 cytoplasm
## 4                MDFI                                           protein binding
## 5                MDFI                                      cell differentiation
## 6                MDFI                                 identical protein binding
## 7                MDFI              negative regulation of Wnt signaling pathway
## 8                MDFI negative regulation of transcription by RNA polymerase II
## 9                MDFI          cytoplasmic sequestering of transcription factor
## 10               MDFI                transcription regulator inhibitor activity
## 11               MDFI                        negative regulation of DNA binding
## 12               MDFI                         dorsal/ventral axis specification
## 13               MDFI                   embryonic skeletal system morphogenesis
## 14               MDFI                    trophoblast giant cell differentiation
##                                                                                                                                                                                                                                                                                                                                                   definition_1006
## 1                                                                                                                                                                                                                                                                                                                                                                
## 2          A membrane-bounded organelle of eukaryotic cells in which chromosomes are housed and replicated. In most cells, the nucleus contains all of the cell's chromosomes except the organellar chromosomes, and is the site of RNA synthesis and processing. In some species, or in specialized cell types, RNA metabolism or DNA replication may be absent.
## 3                                                                                                                                                                                                                                                   The contents of a cell excluding the plasma membrane and nucleus, but including other subcellular structures.
## 4                                                                                                                                                                                                                                                                                                                                           Binding to a protein.
## 5  The cellular developmental process in which a relatively unspecialized cell, e.g. embryonic or regenerative cell, acquires specialized structural and/or functional features that characterize a specific cell. Differentiation includes the processes involved in commitment of a cell to a specific fate and its subsequent development to the mature state.
## 6                                                                                                                                                                                                                                                                                                                    Binding to an identical protein or proteins.
## 7                                                                                                                                                                                                                                                        Any process that stops, prevents, or reduces the frequency, rate or extent of the Wnt signaling pathway.
## 8                                                                                                                                                                                                                                      Any process that stops, prevents, or reduces the frequency, rate or extent of transcription mediated by RNA polymerase II.
## 9                                                                                                                                                                                                            The selective interaction of a transcription factor with specific molecules in the cytoplasm, thereby inhibiting its translocation into the nucleus.
## 10                                                                                                                                                                                                              A molecular function regulator that inhibits the activity of a transcription regulator via direct binding and/or post-translational modification.
## 11                                                                                                                                                                     Any process that stops or reduces the frequency, rate or extent of DNA binding. DNA binding is any process in which a gene product interacts selectively with DNA (deoxyribonucleic acid).
## 12                     The establishment, maintenance and elaboration of the dorsal/ventral axis. The dorsal/ventral axis is defined by a line that runs orthogonal to both the anterior/posterior and left/right axes. The dorsal end is defined by the upper or back side of an organism. The ventral end is defined by the lower or front side of an organism.
## 13                                                                                                                                                                                                                                         The process in which the anatomical structures of the skeleton are generated and organized during the embryonic phase.
## 14                                                                                                                                           The process in which a relatively unspecialized cell acquires specialized features of a trophoblast giant cell of the placenta. Trophoblast giant cells are the cell of the placenta that line the maternal decidua.
hgnc_symbols <- getBM(
  attributes = c("entrezgene_id", "hgnc_symbol"),
  filters = "entrezgene_id",
  values = gene_of_interest,
  mart = ensembl
)
hgnc_symbols$hgnc_symbol
## [1] "MDFI"