thiscodeWorks | thiscodeWorks

# 6 Res 0.25 ####

# ensure assay is integrated 
DefaultAssay(so_donorint) <- "integrated"

# set resolution at 0.25 which is higher than previously used 0.05 for clustering cells 
res <- 0.25

# now assign cells to a cluster and run UMAP - reduce dimensions so I can visualize
# UMAP allows visualizing clusters 
# 2D representation of cells to visualize similarities/differences based on high dimensional features (i.e gene expression) - based on first PCA
so_donorint <- FindClusters(so_donorint, resolution = res)
so_donorint <- RunUMAP(so_donorint, dims=1:pcs, return.model = TRUE, verbose = FALSE)

# UMAP plot - Visualization of my clustered data using UMAP plot 
DimPlot(so_donorint, reduction = "umap", label = TRUE, label.size = 6)
# as expected, further subdivision of community X shown

ggsave("plots5/UMAP_res0.25.png")

# split UMAP plot - based on 'conditions' + add labels 
# shows me how clusters are distributed across different experimental conditions 
DimPlot(so_donorint, reduction = "umap", split.by = "Conditions", label = TRUE, label.size = 6)
# similar to UMAP at res 0.05, community 0 shown to lack large proportion of cells 

ggsave("plots5/split_UMAP_res0.25.png") 

# % distribution of each cell cluster across different conditions 
# allows me to understand how cell populations are organized in relation to experimental conditions 
round(prop.table(table(Idents(so_donorint), so_donorint$Conditions))*100,3)


# DEA for cell communities at resolution 0.25

# now prepare DEA
# switch back to integrated assay for comparisons 
DefaultAssay(so_donorint) <- "integrated"

# run DEA to identify differentially expressed genes between community 1, 0
uro.markers <- FindMarkers(so_donorint, ident.1 = 7, ident.2 = 5)
# graph description is based on ident.1 = community x
# but I have repeated the above code with different 'ident' to compare gene expression between communities

# check data frame 
head(uro.markers)

# avoid errors caused by small p-values downstream by setting p-value limits
# all adjusted p-value > or = to (1e-300)
uro.markers["p_val_adj"] <- lapply(uro.markers["p_val_adj"], pmax, 1e-300)

# preparation for volcano plot
# categorize genes (in uro.markers) based on their DEA results
# create new column 'DEA' to classify each gene as up, down, no based on log fold-change and adjusted p-value
uro.markers$DEA <- "NO"
uro.markers$DEA[uro.markers$avg_log2FC > 1 & uro.markers$p_val_adj < 0.05] <- "UP"
uro.markers$DEA[uro.markers$avg_log2FC < -1 & uro.markers$p_val_adj < 0.05] <- "DOWN"
uro.markers$genes <- rownames(uro.markers)

# Volcano plot to visualize DEA results for comparison between the conditions (Ta1, Ta6)
ggplot(uro.markers, aes(x=avg_log2FC, y=-log10(p_val_adj))) + 
  geom_point(aes(colour = DEA), show.legend = FALSE) + 
  scale_colour_manual(values = c("blue", "gray", "red")) +
  geom_hline(yintercept = -log10(0.05), linetype = "dotted") + 
  geom_vline(xintercept = c(-1,1), linetype = "dotted") + 
  theme_bw() + 
  theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), plot.title = element_text(hjust = 0.5)) +
  geom_text_repel(data=subset(uro.markers, (abs(avg_log2FC) > 9 & p_val_adj < 0.05)| (avg_log2FC < -5 & p_val_adj < 0.05)), aes(x=avg_log2FC, y=-log10(p_val_adj), label=genes), max.overlaps = 1000, size=4.5) +
  ggtitle("DEA between community 7 and 5 (7 in red)") 

# Extract upregulated genes (log fold change > 4, p-value < 0.05 )
upreg_res_0.25 <- uro.markers[uro.markers$avg_log2FC > 2 & uro.markers$p_val_adj < 0.05, "genes"]

# Extract downregulated genes (log fold change > 4, p-value < 0.05 )
downreg_res_0.25<- uro.markers[uro.markers$avg_log2FC < -3 & uro.markers$p_val_adj < 0.05, "genes"]

print(upreg_res_0.25) 

print(downreg_res_0.25)

ggsave("plots5/volcano_integrated_7vs5.png")