labbook part 8 DEA between 2 patients + GSEA of each DEA

PHOTO EMBED

Thu Jan 23 2025 20:11:47 GMT+0000 (Coordinated Universal Time)

Saved by @eho135

# DEA between BC1 BC6 ####
# DEA of each community between BC1 and BC6 to see differences in gene expression
# i look particularly at immune communities 5-7 (defined at resolution 0.25)

# ensure assay is integrated 
DefaultAssay(so_donorint) <- "integrated"

# combine cluster number and condition info into new variable 'celltype.condition'
# then set it as identity class of the cells 
# consider - how cell clusters behave under different conditions
so_donorint$celltype.condition <- paste(so_donorint$seurat_clusters, so_donorint$Conditions, sep="_")
Idents(so_donorint) <- "celltype.condition"

# run DEA for 2 sets of cells BC1, BC6 - compare expression profiles 
uro_BC1vsBC6 <- FindMarkers(so_donorint, ident.1 = "3_BC1", ident.2 = "3_BC6", test.use = "bimod")

# check output 
head(uro_BC1vsBC6)

# prepare data needed for volcano plot, as previous 
uro_BC1vsBC6["p_val_adj"] <- lapply(uro_BC1vsBC6["p_val_adj"], pmax, 1e-300)
uro_BC1vsBC6$DEA <- "NO"
uro_BC1vsBC6$DEA[uro_BC1vsBC6$avg_log2FC > 1 & uro_BC1vsBC6$p_val_adj < 0.05] <- "UP"
uro_BC1vsBC6$DEA[uro_BC1vsBC6$avg_log2FC < -1 & uro_BC1vsBC6$p_val_adj < 0.05] <- "DOWN"
uro_BC1vsBC6$genes <- rownames(uro_BC1vsBC6)

# Volcano plot to visualize DEA results to compare between BC1 and BC6
ggplot(uro_BC1vsBC6, aes(x=avg_log2FC, y=-log10(p_val_adj))) + 
  geom_point(aes(colour = DEA), show.legend = FALSE) + 
  scale_colour_manual(values = c("blue", "gray", "red")) +
  geom_hline(yintercept = -log10(0.05), linetype = "dotted") + 
  geom_vline(xintercept = c(-1,1), linetype = "dotted") + 
  theme_bw() + 
  theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), plot.title = element_text(hjust = 0.5)) +
  geom_text_repel(data=subset(uro_BC1vsBC6, (abs(avg_log2FC) > 7.3 & p_val_adj < 0.05)| (avg_log2FC < -3.5 & p_val_adj < 0.05)), aes(x=avg_log2FC, y=-log10(p_val_adj), label=genes), max.overlaps = 1000, size=4.5) +
  ggtitle("DEA of community 3 between BC1 and BC6 (BC1 in red)") 

 ggsave("plots5/BC1vsBC6_3.png")
# ident.1 = BC1 community 0 shown red // ident.2 = BC6 community 0 shown blue

# Extract upregulated genes (log fold change > 3, p-value < 0.05 )
upreg_0 <- uro_BC1vsBC6[uro_BC1vsBC6$avg_log2FC > 7 & uro_BC1vsBC6$p_val_adj < 0.05, "genes"]

# Extract downregulated genes (log fold change > 4, p-value < 0.05 )
downreg_0 <- uro_BC1vsBC6[uro_BC1vsBC6$avg_log2FC < -3 & uro_BC1vsBC6$p_val_adj < 0.05, "genes"]

print(upreg_0) 
print(downreg_0)

# for ident.1 = 0_BC1 shown red, genes expressed are ranked in descending:
# majority for cell signalling and transcription regulation 
# structural proteins and cytoskeleton
# metabolism and detoxification
# immune response and inflammation
# cellular transport and membrane proteins
# MANY non-coding RNAs, uncharacterized genes, miscellaneous functions
# may be linked to age/ health factors again

# for ident.2 = 0_BC6 shown blue, genes expressed are ranked in descending:
# most significant gene PLD4 (immune response)
# extracellular matrix and structural proteins (8/16 genes)
# cell signalling and regulation (4/16 genes)
# membrane transport and transporters (2/16 genes)
# cell cycle and division (1/16)
# synaptic function and signalling (1/16 genes)
# unknown or uncharacterized (2/16 genes)

# create pi values
# add new column to data frame 'uroG1_T1vsT3' where each gene is assigned a pi score (calc below)
# pi score allows me to rank genes - based on measure of log2fold change + stats significance, can then use to identify top genes for further analysis 
uro_BC1vsBC6 <- mutate(uro_BC1vsBC6, pi = (avg_log2FC * (-1 * log10(p_val_adj))))

# check data set - look at the top hit genes !!
head(uro_BC1vsBC6)

# select top 20 genes (for Ta6) based on pi score in descending order - fr highest pi scores
# these genes are most significant and differentially expressed when comparing between Ta1, Ta6 for G1 urothelial cluster
uro_BC1vsBC6 %>% arrange(desc(pi)) %>% slice(1:20) 

# select top 20 genes (for Ta1) based on pi score in ascending order - fr low pi score
# these genes are either of smaller fold changes or less stats significant p-values 
uro_BC1vsBC6 %>% arrange(pi) %>% slice(1:20) 

# prep for GSEA 
# extract 'genes', 'pi' from data frame and convert 'prerank' to numeric vector 
prerank <- uro_BC1vsBC6[c("genes", "pi")]
prerank <- setNames(prerank$pi, prerank$genes)
str(prerank)

# run GSEA and store results in fgseaRes
genesets = gmtPathways("data5/c2.cp.v2024.1.Hs.symbols.gmt")
fgseaRes <- fgsea(pathways = genesets, stats = prerank, minSize=15, maxSize=300, eps=0)

# filter results from GSEA stored in fgseaRes
# checking top hits for either side of volcano plot 
fgseaRes %>% arrange(padj) %>% filter(NES<0) %>% slice(1:20)
fgseaRes %>% arrange(padj) %>% filter(NES>0) %>% slice(1:20)

# add bar plot to show the differences following GSEA

# Barplot of top GSEA results
fgseaRes %>%
  arrange(desc(NES)) %>%
  slice(1:20) %>%
  ggplot(aes(x = reorder(pathway, NES), y = NES, fill = NES)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Top 20 Pathways by NES, BC1vsBC6_3 ",
       x = "Pathway",
       y = "Normalized Enrichment Score (NES)") +
  theme_minimal()

ggsave("plots5/BC1vsBC6_3_GSEA.png", bg = "white")
content_copyCOPY