counts <- read_tsv("06_marotta_2024_counts.tsv.gz")
## Rows: 39376 Columns: 37
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## dbl (37): GeneID, GSM8115892, GSM8115893, GSM8115894, GSM8115895, GSM8115896...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
metadata <- read_tsv("metadata_advanced.tsv")
## Rows: 42 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): sample, treatment, subject, diagnosis, organoid, microglia, condition
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# другие библиотеки до запуска этого скрипта лучше не импортировать а то они любят конфликтовать
library(tidyverse)
library(DESeq2)
library(clusterProfiler)
# НУЖНО САМОСТОЯТЕЛЬНО ЗАДАТЬ:
# 1) отфильтрованные метаданные в переменную metadata_doph
# 2) вектор из нужных ID образцов в переменную samplesID
# 3) колонку по которой будет строиться дизайн в переменную design
# 4) в переменную control задать значение которое будет использовано как контрольный эксперимент (из колонки, по которой строится дизайн) - т.е. относительно чего мы будем мерить обогащение экспрессии
################## меняем тут
metadata_doph <- metadata %>%
filter(organoid == 'Cortical organoid' & subject == 'Subject1')
samplesID <- metadata_doph$sample
design <- 'treatment'
control <- 'ground'
##################
metadata_doph
## # A tibble: 11 × 7
## sample treatment subject diagnosis organoid microglia condition
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 GSM8115892 ground Subject1 Healthy Cortical orga… without … Ground1
## 2 GSM8115893 ground Subject1 Healthy Cortical orga… without … Ground2
## 3 GSM8115894 ground Subject1 Healthy Cortical orga… without … Ground3
## 4 GSM8115895 ground Subject1 Healthy Cortical orga… with mic… Ground1
## 5 GSM8115896 ground Subject1 Healthy Cortical orga… with mic… Ground2
## 6 GSM8115897 ground Subject1 Healthy Cortical orga… with mic… Ground3
## 7 GSM8115902 microgravity Subject1 Healthy Cortical orga… without … LEO1
## 8 GSM8115903 microgravity Subject1 Healthy Cortical orga… without … LEO2
## 9 GSM8115904 microgravity Subject1 Healthy Cortical orga… with mic… LEO1
## 10 GSM8115905 microgravity Subject1 Healthy Cortical orga… with mic… LEO2
## 11 GSM8115906 microgravity Subject1 Healthy Cortical orga… with mic… LEO3
################## Это просто запустить
samplesID <- c('GeneID', samplesID)
# отбираем только гены у которых больше 10 каунтов на все образцы
counts_doph <- counts[, names(counts) %in% samplesID]
counts_doph$total_sum <- rowSums(counts_doph[, -1])
counts_doph <- counts_doph[counts_doph$total_sum >= 20, ]
counts_doph <- counts_doph[, -ncol(counts_doph)]
# проверка чтобы в counts и metadata содержались одинаковые образцы
counts_exps <- names(counts_doph)
intersect_IDs <- intersect(samplesID, counts_exps)
counts_doph <- counts[, names(counts) %in% intersect_IDs]
metadata_doph <- metadata %>%
filter(sample %in% intersect_IDs)
# преобразуем в матрицу каунты
counts_doph <- as.data.frame(counts_doph)
rownames(counts_doph) <- counts_doph$GeneID
counts_mtx <- counts_doph %>% dplyr::select(where(is.numeric)) %>% dplyr::select(-GeneID) %>% as.matrix()
metadata_doph <- as.data.frame(metadata_doph)
rownames(metadata_doph) <- metadata_doph$sample
# доработать, а пока меняю вручную
metadata_doph <- metadata_doph %>%
mutate(treatment = as.factor(treatment))
# control - базовое условие относительно которого сичтается fold change
metadata_doph <- metadata_doph %>%
mutate(treatment = relevel(as.factor(treatment), ref = control))
dds <- DESeqDataSetFromMatrix(
countData = counts_mtx,
colData = metadata_doph,
design= as.formula(paste("~", design)))
## converting counts to integer mode
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
name_var <- resultsNames(dds)[2]
res <- results(dds, name = name_var)
diff_expression_data <- res %>% as.data.frame() %>%
mutate(gene = rownames(.)) %>%
select(gene, log2FoldChange, padj) %>%
drop_na(padj) %>%
arrange(desc(abs(log2FoldChange)))
diff_expression_data <- bitr(
geneID = diff_expression_data$gene, # вектор идентификаторов генов
fromType = "ENTREZID", # откуда
toType = c("SYMBOL", "ENTREZID"), # куда
OrgDb = "org.Hs.eg.db") %>%
inner_join(diff_expression_data, by = c("ENTREZID" = "gene"))
##
## 'select()' returned 1:1 mapping between keys and columns
## Warning in bitr(geneID = diff_expression_data$gene, fromType = "ENTREZID", :
## 2.5% of input gene IDs are fail to map...
library(ggplot2)
transformed_dds <- rlog(dds)
#transformed_dds <- vst(dds)
pca_plot <- plotPCA(transformed_dds, intgroup=c("treatment")) +
geom_label(aes(label = name))
## using ntop=500 top features by variance
ggsave("pca_plot_cortic.png", plot = pca_plot, width = 8, height = 6, dpi = 300)
pca_plot
library(EnhancedVolcano)
## Loading required package: ggrepel
volcano_plot <- EnhancedVolcano(diff_expression_data,
lab = diff_expression_data$SYMBOL,
x = 'log2FoldChange',
y = 'padj',
legendPosition = 'top',
pCutoff = 0.1,
FCcutoff = 1,
title = 'Space vs Ground')
ggsave("volcano_plot_cortic.png", plot = volcano_plot, width = 7, height = 7, dpi = 300)
volcano_plot
detach("package:EnhancedVolcano", unload=TRUE)
Помогала делать Тимошкова ## Activated
diff_expression_data_up <- diff_expression_data %>% filter(padj <= 0.05, log2FoldChange > 0.5)
ora_up_go <- enrichGO(
gene = diff_expression_data_up$SYMBOL, # вектор идентификаторов генов
OrgDb = "org.Hs.eg.db", # модельный организм
keyType = "SYMBOL", # тип идентификатора генов
ont = "ALL", # аннотация GO BP+CC+MF
universe = diff_expression_data$SYMBOL,
qvalueCutoff = 0.1)
go_plot <- barplot(ora_up_go,
) +
labs(title = "Activated \n GO categories in space")
ggsave("Activated categories in space.png", plot = go_plot, width = 8, height = 5, dpi = 300)
go_plot
dotplot(ora_up_go) +
labs(title = "Activated \n GO categories")
cnetplot_goup <- cnetplot(ora_up_go)
ggsave("cnetplot_Activated categories in space.png", plot = cnetplot_goup, width = 5, height = 8, dpi = 300)
cnetplot_goup
ora_up_go %>% filter(ONTOLOGY == "BP") %>%
enrichplot::pairwise_termsim() %>% # сходство между категориями
emapplot()
diff_expression_data_down <- diff_expression_data %>% filter(padj <= 0.05, log2FoldChange < 0)
ora_up_go <- enrichGO(
gene = diff_expression_data_down$SYMBOL, # вектор идентификаторов генов
OrgDb = "org.Hs.eg.db", # модельный организм
keyType = "SYMBOL", # тип идентификатора генов
ont = "ALL", # аннотация GO BP+CC+MF
universe = diff_expression_data$SYMBOL,
qvalueCutoff = 0.05)
go_plot <- barplot(ora_up_go, colorByPvalue=TRUE) +
labs(title = "Suppressed \n GO categories in space")
ggsave("Suppressed categories in space.png", plot = go_plot, width = 8, height = 5, dpi = 300)
go_plot
cne_plot <- cnetplot(ora_up_go)
ggsave("cnetplot Suppressed categories in space.png", plot = cne_plot, width = 5, height = 8, dpi = 300)
cne_plot
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# GSEA kegg
gene_list <- diff_expression_data$log2FoldChange
names(gene_list) <- diff_expression_data$ENTREZID
gene_list <- sort(gene_list, decreasing = TRUE)
gsea_kegg <- gseKEGG(
geneList = gene_list, # список генов
organism = "hsa", # модельный организм
pvalueCutoff = 0.01) # фильтр по p-значению
## Reading KEGG annotation online: "https://rest.kegg.jp/link/hsa/pathway"...
## Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/hsa"...
## using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.01% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : There were 4 pathways for which P-values were not calculated
## properly due to unbalanced (positive and negative) gene-level statistic values.
## For such pathways pval, padj, NES, log2err are set to NA. You can try to
## increase the value of the argument nPermSimple (for example set it nPermSimple
## = 10000)
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some of the pathways the P-values were likely overestimated. For
## such pathways log2err is set to NA.
## leading edge analysis...
## done...
ridgeplot(gsea_kegg, showCategory=10)
## Picking joint bandwidth of 0.0949
gseaplot(gsea_kegg, geneSetID = 1, title = gsea_kegg$Description[1])
counts_WGCNA <- counts[, names(counts) %in% samplesID]
metadata_doph_WGCNA <- metadata_doph %>%
mutate(word = gsub("\\d+", "", condition)) %>% # Удаляем цифры из condition
group_by(word) %>% # Группируем по извлеченному слову
mutate(name = paste0(word, '-', sample)) %>% # Добавляем номер строки для каждой группы
ungroup() %>% # Убираем группировку
select(-word) # Удаляем временный столбец word
wgcan_colnames <- metadata_doph_WGCNA$name
wgcan_colnames <- c('GeneID', wgcan_colnames)
names(counts_WGCNA) <- wgcan_colnames
counts_WGCNA
## # A tibble: 39,376 × 11
## GeneID `Ground-GSM8115892` `Ground-GSM8115893` `Ground-GSM8115894`
## <dbl> <dbl> <dbl> <dbl>
## 1 100287102 22 4 17
## 2 653635 308 188 379
## 3 102466751 2 0 0
## 4 107985730 4 13 8
## 5 100302278 1 0 3
## 6 645520 2 3 7
## 7 79501 0 0 0
## 8 100996442 10 28 25
## 9 729737 50 102 80
## 10 102725121 30 18 37
## # ℹ 39,366 more rows
## # ℹ 7 more variables: `Ground-GSM8115895` <dbl>, `Ground-GSM8115896` <dbl>,
## # `Ground-GSM8115897` <dbl>, `LEO-GSM8115902` <dbl>, `LEO-GSM8115903` <dbl>,
## # `LEO-GSM8115904` <dbl>, `LEO-GSM8115906` <dbl>
mdata <- counts_WGCNA %>%
pivot_longer( !GeneID, names_to = "name", values_to = "value" ) %>%
mutate( group = gsub("-.*","", name) %>% gsub("[.].*","", .)) %>%
mutate(new_column = sub(".*-", "", name))
mdata
## # A tibble: 393,760 × 5
## GeneID name value group new_column
## <dbl> <chr> <dbl> <chr> <chr>
## 1 100287102 Ground-GSM8115892 22 Ground GSM8115892
## 2 100287102 Ground-GSM8115893 4 Ground GSM8115893
## 3 100287102 Ground-GSM8115894 17 Ground GSM8115894
## 4 100287102 Ground-GSM8115895 13 Ground GSM8115895
## 5 100287102 Ground-GSM8115896 29 Ground GSM8115896
## 6 100287102 Ground-GSM8115897 20 Ground GSM8115897
## 7 100287102 LEO-GSM8115902 21 LEO GSM8115902
## 8 100287102 LEO-GSM8115903 13 LEO GSM8115903
## 9 100287102 LEO-GSM8115904 15 LEO GSM8115904
## 10 100287102 LEO-GSM8115906 23 LEO GSM8115906
## # ℹ 393,750 more rows
p <- mdata %>%
# head(10000) %>%
ggplot(aes(x = new_column, y = value)) +
geom_violin() +
geom_point(alpha = 0.2) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90,size=10),
text = element_text(size = 20)) +
labs(x = "Treatment Groups", y = "RNA Seq Counts", title = 'Row counts') +
facet_grid(cols = vars(group), drop = TRUE, scales = "free_x")
ggsave("whitiout_norm.png", plot = p, width = 8, height = 6, dpi = 300)
p
de_input = as.matrix(counts_WGCNA[,-1])
row.names(de_input) = counts_WGCNA$GeneID
meta_df <- data.frame( Sample = names(counts_WGCNA[-1])) %>%
mutate(Type = gsub("-.*","", Sample) %>%
gsub("[.].*","", .))
dds <- DESeqDataSetFromMatrix(round(de_input),
meta_df,
design = ~Type)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
vsd <- varianceStabilizingTransformation(dds)
wpn_vsd <- getVarianceStabilizedData(dds)
wpn_vsd[1:5, 1:5]
## Ground-GSM8115892 Ground-GSM8115893 Ground-GSM8115894
## 100287102 6.334068 5.535064 6.100980
## 653635 8.838685 7.920377 8.894648
## 102466751 5.445491 5.045074 5.045074
## 107985730 5.609560 5.919234 5.777795
## 100302278 5.328664 5.045074 5.496763
## Ground-GSM8115895 Ground-GSM8115896
## 100287102 5.854474 6.427703
## 653635 7.963320 8.717918
## 102466751 5.045074 5.045074
## 107985730 5.721198 5.835222
## 100302278 5.045074 5.421160
rv_wpn <- rowVars(wpn_vsd)
q95_wpn <- quantile( rowVars(wpn_vsd), .95)
expr_normalized <- wpn_vsd[ rv_wpn > q95_wpn, ]
expr_normalized[1:5,1:5]
## Ground-GSM8115892 Ground-GSM8115893 Ground-GSM8115894
## 113219467 9.052877 10.673510 10.358306
## 107985728 6.248051 5.535064 6.264256
## 57801 12.525696 13.147242 12.042059
## 105378604 5.609560 5.045074 6.070734
## 7161 6.919040 7.286699 7.304778
## Ground-GSM8115895 Ground-GSM8115896
## 113219467 10.878194 9.313557
## 107985728 5.272282 6.057071
## 57801 13.342494 12.147071
## 105378604 5.823483 5.505039
## 7161 7.167813 6.701383
expr_normalized_df <- data.frame(expr_normalized) %>%
mutate(
Gene_id = row.names(expr_normalized)
) %>%
pivot_longer(-Gene_id) %>%
mutate( group = gsub("-.*","", name) %>% gsub("[.].*","", .)) %>%
mutate(name_2 = sub(".*\\.", "", name))
head(expr_normalized_df)
## # A tibble: 6 × 5
## Gene_id name value group name_2
## <chr> <chr> <dbl> <chr> <chr>
## 1 113219467 Ground.GSM8115892 9.05 Ground GSM8115892
## 2 113219467 Ground.GSM8115893 10.7 Ground GSM8115893
## 3 113219467 Ground.GSM8115894 10.4 Ground GSM8115894
## 4 113219467 Ground.GSM8115895 10.9 Ground GSM8115895
## 5 113219467 Ground.GSM8115896 9.31 Ground GSM8115896
## 6 113219467 Ground.GSM8115897 10.3 Ground GSM8115897
p <- expr_normalized_df %>%
# head(10000) %>%
ggplot(aes(x = name_2, y = value)) +
geom_violin() +
geom_point(alpha = 0.2) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, size=10),
text = element_text(size = 20)) +
ylim(0, NA) +
labs(title = 'Normalized and 95 quantile Expression', x = "Treatment Groups", y = "RNA Seq Counts") +
facet_grid(cols = vars(group), drop = TRUE, scales = "free_x")
ggsave("normalized_counts.png", plot = p, width = 8, height = 6, dpi = 300)
p
input_mat = t(expr_normalized)
input_mat[1:5,1:4]
## 113219467 107985728 57801 105378604
## Ground-GSM8115892 9.052877 6.248051 12.52570 5.609560
## Ground-GSM8115893 10.673510 5.535064 13.14724 5.045074
## Ground-GSM8115894 10.358306 6.264256 12.04206 6.070734
## Ground-GSM8115895 10.878194 5.272282 13.34249 5.823483
## Ground-GSM8115896 9.313557 6.057071 12.14707 5.505039
powers = c(c(1:10), seq(from = 12, to = 20, by = 2))
sft <- pickSoftThreshold(
input_mat,
powerVector = powers,
verbose = 5
)
## pickSoftThreshold: will use block size 1969.
## pickSoftThreshold: calculating connectivity for given powers...
## ..working on genes 1 through 1969 of 1969
## Warning: executing %dopar% sequentially: no parallel backend registered
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.72900 2.6600 0.938 921.0 963.0 1260.0
## 2 2 0.45800 0.9220 0.847 555.0 583.0 914.0
## 3 3 0.09240 0.2710 0.693 374.0 386.0 703.0
## 4 4 0.00728 -0.0641 0.666 269.0 275.0 559.0
## 5 5 0.13800 -0.2950 0.663 202.0 202.0 455.0
## 6 6 0.27100 -0.4460 0.704 157.0 153.0 377.0
## 7 7 0.39400 -0.5310 0.768 125.0 118.0 317.0
## 8 8 0.48400 -0.6190 0.800 101.0 93.6 270.0
## 9 9 0.56600 -0.6950 0.834 83.5 75.2 235.0
## 10 10 0.63200 -0.7530 0.878 69.9 61.8 207.0
## 11 12 0.67400 -0.8850 0.900 50.6 43.0 165.0
## 12 14 0.71000 -0.9940 0.922 38.0 31.0 136.0
## 13 16 0.77200 -1.0500 0.961 29.4 22.9 114.0
## 14 18 0.80800 -1.1200 0.979 23.3 17.3 97.5
## 15 20 0.82400 -1.2100 0.985 18.8 13.3 85.8
sft
## $powerEstimate
## [1] NA
##
## $fitIndices
## Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1 1 0.728779870 2.66096877 0.9382001 920.83114 963.01609 1257.37659
## 2 2 0.457685689 0.92163682 0.8470379 555.47554 582.50603 914.19154
## 3 3 0.092368042 0.27123343 0.6929240 374.10370 386.23628 702.77926
## 4 4 0.007278128 -0.06405253 0.6658327 268.87362 274.50500 558.83134
## 5 5 0.137690580 -0.29508552 0.6627614 201.95561 202.00735 455.00717
## 6 6 0.270872931 -0.44572802 0.7040691 156.68635 152.85207 377.17961
## 7 7 0.393983294 -0.53056837 0.7684007 124.64809 118.45379 317.17494
## 8 8 0.483946654 -0.61892093 0.8003009 101.17287 93.63633 270.46229
## 9 9 0.565699046 -0.69472855 0.8337424 83.48897 75.22531 235.32851
## 10 10 0.631539306 -0.75321757 0.8778638 69.86162 61.79687 206.54549
## 11 12 0.674076485 -0.88546941 0.9003671 50.61456 43.04831 165.46153
## 12 14 0.710464928 -0.99417614 0.9221527 38.02940 30.99294 136.02176
## 13 16 0.772403372 -1.04941935 0.9610752 29.40487 22.88995 113.90616
## 14 18 0.807704547 -1.12178881 0.9792095 23.27130 17.28731 97.52804
## 15 20 0.824071445 -1.21279633 0.9849861 18.77579 13.28135 85.84917
par(mfrow = c(1,2));
cex1 = 0.9;
plot(sft$fitIndices[, 1],
-sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
xlab = "Soft Threshold (power)",
ylab = "Scale Free Topology Model Fit, signed R^2",
main = paste("Scale independence")
)
text(sft$fitIndices[, 1],
-sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
labels = powers, cex = cex1, col = "red"
)
abline(h = 0.90, col = "red")
plot(sft$fitIndices[, 1],
sft$fitIndices[, 5],
xlab = "Soft Threshold (power)",
ylab = "Mean Connectivity",
type = "n",
main = paste("Mean connectivity")
)
text(sft$fitIndices[, 1],
sft$fitIndices[, 5],
labels = powers,
cex = cex1, col = "red")
#10
picked_power = 10
temp_cor <- cor
cor <- WGCNA::cor
netwk <- blockwiseModules(input_mat,
power = picked_power,
networkType = "signed",
deepSplit = 2,
pamRespectsDendro = F,
minModuleSize = 30,
maxBlockSize = 4000,
reassignThreshold = 0,
mergeCutHeight = 0.25,
saveTOMs = T,
saveTOMFileBase = "ER",
numericLabels = T,
verbose = 3)
## Calculating module eigengenes block-wise from all genes
## Flagging genes and samples with too many missing values...
## ..step 1
## ..Working on block 1 .
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
## ..saving TOM for block 1 into file ER-block.1.RData
## ....clustering..
## ....detecting modules..
## ....calculating module eigengenes..
## ....checking kME in modules..
## ..removing 1 genes from module 2 because their KME is too low.
## ..removing 1 genes from module 4 because their KME is too low.
## ..removing 2 genes from module 5 because their KME is too low.
## ..removing 1 genes from module 7 because their KME is too low.
## ..removing 1 genes from module 8 because their KME is too low.
## ..merging modules that are too close..
## mergeCloseModules: Merging modules whose distance is less than 0.25
## Calculating new MEs...
mergedColors = labels2colors(netwk$colors)
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(
netwk$dendrograms[[1]],
mergedColors[netwk$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE,
hang = 0.03,
addGuide = TRUE,
guideHang = 0.05 )
module_df <- data.frame(
gene_id = names(netwk$colors),
colors = labels2colors(netwk$colors)
)
table(module_df$colors)
##
## black blue brown green grey pink red turquoise
## 85 443 284 201 6 53 128 527
## yellow
## 242
MEs0 <- moduleEigengenes(input_mat, mergedColors)$eigengenes
MEs0 <- orderMEs(MEs0)
module_order = names(MEs0) %>% gsub("ME","", .)
MEs0$treatment = row.names(MEs0)
mME = MEs0 %>%
pivot_longer(-treatment) %>%
mutate(
name = gsub("ME", "", name),
name = factor(name, levels = module_order)
)
mME %>% ggplot(aes(x=treatment, y=name, fill=value)) +
geom_tile() +
theme_bw() +
scale_fill_gradient2(
low = "blue",
high = "red",
mid = "white",
midpoint = 0,
limit = c(-1,1)) +
theme(axis.text.x = element_text(angle=90)) +
labs(title = "Module-trait Relationships", y = "Modules", fill="corr")
mME
## # A tibble: 90 × 3
## treatment name value
## <chr> <fct> <dbl>
## 1 Ground-GSM8115892 turquoise 0.124
## 2 Ground-GSM8115892 yellow -0.189
## 3 Ground-GSM8115892 blue -0.342
## 4 Ground-GSM8115892 red -0.434
## 5 Ground-GSM8115892 black 0.455
## 6 Ground-GSM8115892 green 0.493
## 7 Ground-GSM8115892 brown -0.0113
## 8 Ground-GSM8115892 pink 0.365
## 9 Ground-GSM8115892 grey -0.172
## 10 Ground-GSM8115893 turquoise -0.621
## # ℹ 80 more rows
correlation_matrix <- mME %>%
pivot_wider(names_from = treatment, values_from = value) %>%
column_to_rownames('name') %>%
as.matrix()
correlation_matrix
## Ground-GSM8115892 Ground-GSM8115893 Ground-GSM8115894
## turquoise 0.12425687 -0.62126077 0.13785724
## yellow -0.18891558 -0.45294506 0.51227834
## blue -0.34226568 -0.44083811 0.09444174
## red -0.43430775 -0.12839683 -0.05957096
## black 0.45482062 -0.36536788 0.31851792
## green 0.49304791 0.26137267 0.01299616
## brown -0.01126801 0.50594690 -0.49105596
## pink 0.36460089 0.17774576 -0.33727510
## grey -0.17248158 0.02399165 -0.33475927
## Ground-GSM8115895 Ground-GSM8115896 Ground-GSM8115897 LEO-GSM8115902
## turquoise -0.63116885 0.18343456 0.17035777 0.16191022
## yellow -0.39662421 -0.06360123 0.32762108 0.04507156
## blue -0.38752227 -0.13435287 -0.10828996 0.47202334
## red -0.21905874 -0.36163887 0.04017425 0.12208362
## black -0.21249528 0.50665632 0.14585193 -0.09405857
## green 0.28051968 0.33199763 -0.04205212 -0.27154333
## brown 0.61851099 0.02267231 -0.07742515 -0.27995249
## pink 0.11071709 0.16384063 -0.47403400 0.30948827
## grey -0.02320097 0.45731933 0.09939372 0.22537264
## LEO-GSM8115903 LEO-GSM8115904 LEO-GSM8115906
## turquoise 0.21680762 0.2066660 0.05113931
## yellow -0.02284437 0.4335778 -0.19361836
## blue 0.12381467 0.3527469 0.37024223
## red 0.60635964 0.4760698 -0.04171417
## black -0.40722177 -0.1614138 -0.18528952
## green -0.47296345 -0.4068235 -0.18655166
## brown -0.09634056 -0.1579061 -0.03318192
## pink -0.15860347 -0.4811835 0.32470341
## grey 0.50193870 -0.2603227 -0.51725158
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.22.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
##
##
## The new InteractiveComplexHeatmap package can directly export static
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
set.seed(123)
data_matrix <- matrix(rnorm(100), nrow=10, ncol=10)
rownames(data_matrix) <- paste0("Row", 1:10)
colnames(data_matrix) <- paste0("Col", 1:10)
Heatmap(correlation_matrix, cluster_rows = TRUE, cluster_columns = TRUE)
#меняем
modules_of_interest = c("red", "blue", 'yellow', 'turquoise')
submod = module_df %>%
subset(colors %in% modules_of_interest)
row.names(module_df) = module_df$gene_id
subexpr = expr_normalized[submod$gene_id,]
submod_df = data.frame(subexpr) %>%
mutate(
gene_id = row.names(.)
) %>%
pivot_longer(-gene_id) %>%
mutate(
module = module_df[gene_id,]$colors
)
submod_df %>% ggplot(aes(x=name, y=value, group=gene_id)) +
geom_line(aes(color = module),
alpha = 0.2) +
theme_bw() +
theme(
axis.text.x = element_text(angle = 90)
) +
facet_grid(rows = vars(module)) +
labs(x = "treatment",
y = "normalized expression")
modules_of_interest = c("green", "black", 'pink', 'brown', 'grey')
submod = module_df %>%
subset(colors %in% modules_of_interest)
row.names(module_df) = module_df$gene_id
subexpr = expr_normalized[submod$gene_id,]
submod_df = data.frame(subexpr) %>%
mutate(
gene_id = row.names(.)
) %>%
pivot_longer(-gene_id) %>%
mutate(
module = module_df[gene_id,]$colors
)
submod_df %>% ggplot(aes(x=name, y=value, group=gene_id)) +
geom_line(aes(color = module),
alpha = 0.2) +
theme_bw() +
theme(
axis.text.x = element_text(angle = 90)
) +
facet_grid(rows = vars(module)) +
labs(x = "treatment",
y = "normalized expression")
modules_of_interest = c("green")
submod = module_df %>%
subset(colors %in% modules_of_interest)
row.names(module_df) = module_df$gene_id
subexpr = expr_normalized[submod$gene_id,]
submod_df = data.frame(subexpr) %>%
mutate(
gene_id = row.names(.)
) %>%
pivot_longer(-gene_id) %>%
mutate(
module = module_df[gene_id,]$colors
)
genes_of_interest = module_df %>%
subset(colors %in% modules_of_interest)
expr_of_interest = expr_normalized[genes_of_interest$gene_id,]
target_list_genes <- rownames(expr_of_interest)
length(target_list_genes)
## [1] 201
ora_up_go2 <- enrichGO(
gene = target_list_genes, # вектор идентификаторов генов
OrgDb = "org.Hs.eg.db", # модельный организм
keyType = "ENTREZID", # тип идентификатора генов
ont = "ALL", # аннотация GO BP+CC+MF
universe = diff_expression_data$ENTREZID,
qvalueCutoff = 0.1)
ora_up_go2
## #
## # over-representation test
## #
## #...@organism Homo sapiens
## #...@ontology GOALL
## #...@keytype ENTREZID
## #...@gene chr [1:201] "374946" "100379251" "4878" "3925" "3151" "113452" "55268" ...
## #...pvalues adjusted by 'BH' with cutoff <0.05
## #...105 enriched terms found
## 'data.frame': 105 obs. of 13 variables:
## $ ONTOLOGY : chr "BP" "BP" "BP" "BP" ...
## $ ID : chr "GO:0030900" "GO:0033555" "GO:0050804" "GO:0099177" ...
## $ Description : chr "forebrain development" "multicellular organismal response to stress" "modulation of chemical synaptic transmission" "regulation of trans-synaptic signaling" ...
## $ GeneRatio : chr "19/175" "9/175" "20/175" "20/175" ...
## $ BgRatio : chr "391/14640" "85/14640" "469/14640" "470/14640" ...
## $ RichFactor : num 0.0486 0.1059 0.0426 0.0426 0.0549 ...
## $ FoldEnrichment: num 4.07 8.86 3.57 3.56 4.6 ...
## $ zScore : num 6.76 7.99 6.22 6.2 6.6 ...
## $ pvalue : num 2.26e-07 7.66e-07 8.48e-07 8.77e-07 9.89e-07 ...
## $ p.adjust : num 0.000521 0.000521 0.000521 0.000521 0.000521 ...
## $ qvalue : num 0.00046 0.00046 0.00046 0.00046 0.00046 ...
## $ geneID : chr "374946/3398/2016/4929/4760/56978/51176/84709/2890/63974/9760/4781/6585/6506/2290/64919/26153/8851/6752" "4929/348980/2890/117/5179/157753/5024/41/4761" "79012/590/2890/10590/5179/84628/5024/4908/2904/41/1138/1136/1143/5579/4761/4804/81832/24141/5413/91851" "79012/590/2890/10590/5179/84628/5024/4908/2904/41/1138/1136/1143/5579/4761/4804/81832/24141/5413/91851" ...
## $ Count : int 19 9 20 20 15 20 6 6 7 6 ...
## #...Citation
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R Wang, W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize multiomics data. Nature Protocols. 2024, doi:10.1038/s41596-024-01020-z
barplot(ora_up_go2) +
labs(title = "Enriched \n GO categories for green genes")
cnetplot(ora_up_go, showCategory = 10, showCategorySize = 10)
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
TOM = TOMsimilarityFromExpr(t(expr_of_interest),
power = picked_power)
## TOM calculation: adjacency..
## ..will not use multithreading.
## Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
row.names(TOM) = row.names(expr_of_interest)
colnames(TOM) = row.names(expr_of_interest)
edge_list = data.frame(TOM) %>%
mutate(
gene1 = row.names(.)
) %>%
pivot_longer(-gene1) %>%
dplyr::rename(gene2 = name, correlation = value) %>%
unique() %>%
subset(!(gene1==gene2)) %>%
mutate(
module1 = module_df[gene1,]$colors,
module2 = module_df[gene2,]$colors
)
edge_list
## # A tibble: 40,401 × 5
## gene1 gene2 correlation module1 module2
## <chr> <chr> <dbl> <chr> <chr>
## 1 374946 X374946 1 green <NA>
## 2 374946 X100379251 0.0275 green <NA>
## 3 374946 X4878 0.121 green <NA>
## 4 374946 X3925 0.127 green <NA>
## 5 374946 X3151 0.252 green <NA>
## 6 374946 X113452 0.226 green <NA>
## 7 374946 X55268 0.149 green <NA>
## 8 374946 X3725 0.0553 green <NA>
## 9 374946 X4774 0.212 green <NA>
## 10 374946 X8543 0.299 green <NA>
## # ℹ 40,391 more rows
degree_counts <- edge_list %>%
pivot_longer(cols = c(gene1, gene2), names_to = "gene", values_to = "vertex") %>%
group_by(vertex) %>%
summarise(degree = sum(correlation)) %>%
arrange(desc(degree))
degree_counts <- degree_counts[!grepl("^X", degree_counts$vertex), ]
degree_counts
## # A tibble: 201 × 2
## vertex degree
## <chr> <dbl>
## 1 3344 49.7
## 2 3166 48.7
## 3 4188 46.3
## 4 53335 46.0
## 5 63974 45.6
## 6 8543 45.0
## 7 374946 44.7
## 8 10276 44.4
## 9 9124 43.7
## 10 26035 42.8
## # ℹ 191 more rows
ggplot(degree_counts, aes(x = degree)) +
geom_histogram(binwidth = 0.7, fill = "blue", color = "black") +
labs(title = "Гистограмма степеней вершин для желтого кластера", x = "Степень", y = "Частота") +
theme_minimal()
gene_of_interest <- 4188
library(biomaRt)
# Создание объекта biomaRt для доступа к базе данных Ensembl
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
# Извлечение GO аннотаций для гена
go_annotations <- getBM(
attributes = c('external_gene_name', "name_1006", "definition_1006"),
filters = "entrezgene_id",
values = gene_of_interest,
mart = ensembl
)
go_annotations
## external_gene_name name_1006
## 1 MDFI
## 2 MDFI nucleus
## 3 MDFI cytoplasm
## 4 MDFI protein binding
## 5 MDFI cell differentiation
## 6 MDFI identical protein binding
## 7 MDFI negative regulation of Wnt signaling pathway
## 8 MDFI negative regulation of transcription by RNA polymerase II
## 9 MDFI cytoplasmic sequestering of transcription factor
## 10 MDFI transcription regulator inhibitor activity
## 11 MDFI negative regulation of DNA binding
## 12 MDFI dorsal/ventral axis specification
## 13 MDFI embryonic skeletal system morphogenesis
## 14 MDFI trophoblast giant cell differentiation
## definition_1006
## 1
## 2 A membrane-bounded organelle of eukaryotic cells in which chromosomes are housed and replicated. In most cells, the nucleus contains all of the cell's chromosomes except the organellar chromosomes, and is the site of RNA synthesis and processing. In some species, or in specialized cell types, RNA metabolism or DNA replication may be absent.
## 3 The contents of a cell excluding the plasma membrane and nucleus, but including other subcellular structures.
## 4 Binding to a protein.
## 5 The cellular developmental process in which a relatively unspecialized cell, e.g. embryonic or regenerative cell, acquires specialized structural and/or functional features that characterize a specific cell. Differentiation includes the processes involved in commitment of a cell to a specific fate and its subsequent development to the mature state.
## 6 Binding to an identical protein or proteins.
## 7 Any process that stops, prevents, or reduces the frequency, rate or extent of the Wnt signaling pathway.
## 8 Any process that stops, prevents, or reduces the frequency, rate or extent of transcription mediated by RNA polymerase II.
## 9 The selective interaction of a transcription factor with specific molecules in the cytoplasm, thereby inhibiting its translocation into the nucleus.
## 10 A molecular function regulator that inhibits the activity of a transcription regulator via direct binding and/or post-translational modification.
## 11 Any process that stops or reduces the frequency, rate or extent of DNA binding. DNA binding is any process in which a gene product interacts selectively with DNA (deoxyribonucleic acid).
## 12 The establishment, maintenance and elaboration of the dorsal/ventral axis. The dorsal/ventral axis is defined by a line that runs orthogonal to both the anterior/posterior and left/right axes. The dorsal end is defined by the upper or back side of an organism. The ventral end is defined by the lower or front side of an organism.
## 13 The process in which the anatomical structures of the skeleton are generated and organized during the embryonic phase.
## 14 The process in which a relatively unspecialized cell acquires specialized features of a trophoblast giant cell of the placenta. Trophoblast giant cells are the cell of the placenta that line the maternal decidua.
hgnc_symbols <- getBM(
attributes = c("entrezgene_id", "hgnc_symbol"),
filters = "entrezgene_id",
values = gene_of_interest,
mart = ensembl
)
hgnc_symbols$hgnc_symbol
## [1] "MDFI"