labbook part 8 DEA between 2 patients + GSEA of each DEA
Thu Jan 23 2025 20:11:47 GMT+0000 (Coordinated Universal Time)
Saved by @eho135
# DEA between BC1 BC6 #### # DEA of each community between BC1 and BC6 to see differences in gene expression # i look particularly at immune communities 5-7 (defined at resolution 0.25) # ensure assay is integrated DefaultAssay(so_donorint) <- "integrated" # combine cluster number and condition info into new variable 'celltype.condition' # then set it as identity class of the cells # consider - how cell clusters behave under different conditions so_donorint$celltype.condition <- paste(so_donorint$seurat_clusters, so_donorint$Conditions, sep="_") Idents(so_donorint) <- "celltype.condition" # run DEA for 2 sets of cells BC1, BC6 - compare expression profiles uro_BC1vsBC6 <- FindMarkers(so_donorint, ident.1 = "3_BC1", ident.2 = "3_BC6", test.use = "bimod") # check output head(uro_BC1vsBC6) # prepare data needed for volcano plot, as previous uro_BC1vsBC6["p_val_adj"] <- lapply(uro_BC1vsBC6["p_val_adj"], pmax, 1e-300) uro_BC1vsBC6$DEA <- "NO" uro_BC1vsBC6$DEA[uro_BC1vsBC6$avg_log2FC > 1 & uro_BC1vsBC6$p_val_adj < 0.05] <- "UP" uro_BC1vsBC6$DEA[uro_BC1vsBC6$avg_log2FC < -1 & uro_BC1vsBC6$p_val_adj < 0.05] <- "DOWN" uro_BC1vsBC6$genes <- rownames(uro_BC1vsBC6) # Volcano plot to visualize DEA results to compare between BC1 and BC6 ggplot(uro_BC1vsBC6, aes(x=avg_log2FC, y=-log10(p_val_adj))) + geom_point(aes(colour = DEA), show.legend = FALSE) + scale_colour_manual(values = c("blue", "gray", "red")) + geom_hline(yintercept = -log10(0.05), linetype = "dotted") + geom_vline(xintercept = c(-1,1), linetype = "dotted") + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), plot.title = element_text(hjust = 0.5)) + geom_text_repel(data=subset(uro_BC1vsBC6, (abs(avg_log2FC) > 7.3 & p_val_adj < 0.05)| (avg_log2FC < -3.5 & p_val_adj < 0.05)), aes(x=avg_log2FC, y=-log10(p_val_adj), label=genes), max.overlaps = 1000, size=4.5) + ggtitle("DEA of community 3 between BC1 and BC6 (BC1 in red)") ggsave("plots5/BC1vsBC6_3.png") # ident.1 = BC1 community 0 shown red // ident.2 = BC6 community 0 shown blue # Extract upregulated genes (log fold change > 3, p-value < 0.05 ) upreg_0 <- uro_BC1vsBC6[uro_BC1vsBC6$avg_log2FC > 7 & uro_BC1vsBC6$p_val_adj < 0.05, "genes"] # Extract downregulated genes (log fold change > 4, p-value < 0.05 ) downreg_0 <- uro_BC1vsBC6[uro_BC1vsBC6$avg_log2FC < -3 & uro_BC1vsBC6$p_val_adj < 0.05, "genes"] print(upreg_0) print(downreg_0) # for ident.1 = 0_BC1 shown red, genes expressed are ranked in descending: # majority for cell signalling and transcription regulation # structural proteins and cytoskeleton # metabolism and detoxification # immune response and inflammation # cellular transport and membrane proteins # MANY non-coding RNAs, uncharacterized genes, miscellaneous functions # may be linked to age/ health factors again # for ident.2 = 0_BC6 shown blue, genes expressed are ranked in descending: # most significant gene PLD4 (immune response) # extracellular matrix and structural proteins (8/16 genes) # cell signalling and regulation (4/16 genes) # membrane transport and transporters (2/16 genes) # cell cycle and division (1/16) # synaptic function and signalling (1/16 genes) # unknown or uncharacterized (2/16 genes) # create pi values # add new column to data frame 'uroG1_T1vsT3' where each gene is assigned a pi score (calc below) # pi score allows me to rank genes - based on measure of log2fold change + stats significance, can then use to identify top genes for further analysis uro_BC1vsBC6 <- mutate(uro_BC1vsBC6, pi = (avg_log2FC * (-1 * log10(p_val_adj)))) # check data set - look at the top hit genes !! head(uro_BC1vsBC6) # select top 20 genes (for Ta6) based on pi score in descending order - fr highest pi scores # these genes are most significant and differentially expressed when comparing between Ta1, Ta6 for G1 urothelial cluster uro_BC1vsBC6 %>% arrange(desc(pi)) %>% slice(1:20) # select top 20 genes (for Ta1) based on pi score in ascending order - fr low pi score # these genes are either of smaller fold changes or less stats significant p-values uro_BC1vsBC6 %>% arrange(pi) %>% slice(1:20) # prep for GSEA # extract 'genes', 'pi' from data frame and convert 'prerank' to numeric vector prerank <- uro_BC1vsBC6[c("genes", "pi")] prerank <- setNames(prerank$pi, prerank$genes) str(prerank) # run GSEA and store results in fgseaRes genesets = gmtPathways("data5/c2.cp.v2024.1.Hs.symbols.gmt") fgseaRes <- fgsea(pathways = genesets, stats = prerank, minSize=15, maxSize=300, eps=0) # filter results from GSEA stored in fgseaRes # checking top hits for either side of volcano plot fgseaRes %>% arrange(padj) %>% filter(NES<0) %>% slice(1:20) fgseaRes %>% arrange(padj) %>% filter(NES>0) %>% slice(1:20) # add bar plot to show the differences following GSEA # Barplot of top GSEA results fgseaRes %>% arrange(desc(NES)) %>% slice(1:20) %>% ggplot(aes(x = reorder(pathway, NES), y = NES, fill = NES)) + geom_bar(stat = "identity") + coord_flip() + labs(title = "Top 20 Pathways by NES, BC1vsBC6_3 ", x = "Pathway", y = "Normalized Enrichment Score (NES)") + theme_minimal() ggsave("plots5/BC1vsBC6_3_GSEA.png", bg = "white")
Comments