labbook part 4-5

PHOTO EMBED

Thu Jan 23 2025 20:07:12 GMT+0000 (Coordinated Universal Time)

Saved by @eho135

# 4 Find clusters of cell types ####

# Data quality is sorted following QC (section 2) and integration (section 3) - data is single analysis object!

# I can now look at biology - try identify communities of cells alike. genes can work in pathways so not equally informative
# use PCA to reduce dimension and identify patterns of shared variance then derive (potential) communities 
# also work out how many clusters are (potentially!) informative 

# run PCA on integrated data set 
# recap PCA reduces dimensions in data, only capture max variance in data so easy to visualize
# in context of scRNAseq, I'm using PCA to get main sources of variation in gene expression
so_donorint <- RunPCA(so_donorint)

# top 5 PCs shown, biggest contributor in each direction are : 
# gene info obtained via uniprot
# PC1 positive = HLA-DRB6 - antigen presenting to CD4+ T cells, immune response
# PC1 negative = ADIRF - regulates adipogenesis and has a role in fat metabolism 
# PC2 positive = TXNIP - negative regulator of thioredoxin, tumour suppressor gene 
# PC2 negative = MT-CO1 - component of cytochrome c oxidase for respiration

# pick the most informative PC
# elbow plot to show me percentage variance explained by each PC
ElbowPlot(so_donorint)

# As number of PC increases, the variance explained by each subsequent PC decrease more gradually
# The "elbow" is the point where the curve flattens out - stop considering additional PC as they contribute little to explaining data's variance.

# after running PCA to assess how much variance in each PC
# calc % variance explained by each PC and cumulative % variance across all PCs 
# helps me determine number of PCs to keep for downstream analysis
pct <- so_donorint[["pca"]]@stdev / sum(so_donorint[["pca"]]@stdev) * 100
cumu <- cumsum(pct)

# identify point/PC where cumulative variance > 90%, % variance explained < 5% (does not contribute much to variance)
co1 <- which(cumu > 90 & pct < 5)[1]
co1 # 43

# identify point/PC where difference in % variance explained (contribution to variance) between 2 consecutive PCs is > 0.1%
# I can find the point of significant drop in contribution of variance explained by each PC
co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1]
co2 # 12

# select minimum value between the 2 measures (co1, co2)
# provides criteria to decide when to stop including additional PCs
# co1 - point where a PC has cumulative variance > 90%, variance explained < 5%
# co2 - point where there's significant drop in variance explained between 2 consecutive PCs
pcs <- min(co1, co2)
pcs # 12

# Here is comparison of point at 90% variance explained and where PCAs stops being informative
# create data frame containing 3 variables
# pct - % variance contributed by each PC
# cumu - cumulative % variance contributed by PCs 
# rank - ranking of each PC showing its order in analysis
cumu_df <- data.frame(pct = pct, cumu = cumu, rank = 1:length(pct))

# Elbow plot to visualize relation between cumulative variance explained, % variance contribution by each PC
ggplot(cumu_df, aes(cumu, pct, label = rank, color = rank > pcs)) + 
  geom_text() +
  theme_bw()

# clean up - remove objects unnecessary for downstream analysis 
rm(pct, cumu, cumu_df, co1, co2)

# PCA-based scatter plot to visualize 4 selected features/genes - check if PCA successful
# cells expressing these markers are shaded from dark blue (most expression) to pale pink (least/none)
FeaturePlot(so_donorint, reduction = "pca", dims = c(1, 2), 
            features = c("HLA-DRB6", "ADIRF", "TXNIP", "MT-CO1"), cols = c("#fcd5ce", "#4886af"), 
            pt.size = 2.5, alpha = 0.7, order = TRUE,)

ggsave("plots5/6_featureplot.png")

# look at first 2 PCs
# gene 'MT-CO1' shown to have greatest expression across PC1 PC2, closely followed by gene 'ADIRF'
# MT-CO1 involved in respiration and could be linked to providing energy required for proliferation
# since BC1 BC6 = non-malignant, proliferation of the communities highly expressing MT-CO1 should be a contributor to non-malignancy
# but need to perform more analysis before making any conclusions
# gene 'HLA-DRB6' have least expression across PC1 PC2 

# Neighbor analysis - to find relationships between cells (calc neighborhoods) based on first PC
DefaultAssay(so_donorint) <- "integrated"
so_donorint <- FindNeighbors(so_donorint, dims = 1:pcs)

# problems with neighbor analysis - finds potential communities but does not give reference of how many communities should be present
# community detection starts with all cells in 1 community - followed by successive subdivisions
# using cluster tree = can visualize how communities derived + subdivided 

# so, perform clustering analysis across multiple resolution values
# then visualize resulting clusters by cluster tree plot 
so_donorint_orgs <- FindClusters(so_donorint, resolution = seq(0, 0.3, 0.05))
clustree(so_donorint_orgs, show_axis = TRUE) + theme(legend.position = "none") + ylab("Resolution")

ggsave("plots5/7_clustertree_lineage.png")

# cluster tree plot shows successive community divisions as resolution increases
# issue is - most numerous type often results in most subgroups. Purely due to its high abundance and not biologically different
# I will therefore start my analysis at low resolution to monitor closely the main community lineages

# OK before that clear up first
rm(so_donorint_orgs)

# set initial resolution to 0.05 for clustering cells - I should see 3 communities 
res <- 0.05

# now assign cells to a cluster and run UMAP - reduce dimensions so I can visualize
# UMAP allows me to visualize clusters based on first PCA 
# 2D representation of cells to visualize similarities/differences based on high dimensional features (i.e gene expression) - based on first PCA
so_donorint <- FindClusters(so_donorint, resolution = res)
so_donorint <- RunUMAP(so_donorint, dims=1:pcs, return.model = TRUE, verbose = FALSE)

# UMAP plot - Visualization of my clustered data 
DimPlot(so_donorint, reduction = "umap", label = TRUE, label.size = 6)

# based on cluster tree plot I should see 3 communities 
# however population 1 and 2 each shown as 2 separate groups suggesting further subdivision
# I will test with higher resolution later in this analysis to see subdivisions for different cell types (see section 6)

# save plot first
ggsave("plots5/8_UMAP_res0.05.png")

# split UMAP plot - based on 'conditions' + add labels 
# shows how clusters are distributed across conditions BC1 BC6
DimPlot(so_donorint, reduction = "umap", split.by = "Conditions", label = TRUE, label.size = 6)

ggsave("plots5/9_splitUMAP_res0.05.png")

# samples are identical - same grade, same non-malignant NMIBC tumour pattern should be the same
# differences are observed and could be due to person differences. Samples are from individuals of different ages, and possibly very different lifestyles, health status
# genetic differences, mutations can also contribute to these differences
# community 0 from BC1 lacks a great number of cells in comparison to BC6
# community 2 is suggested to further subdivide and I will look at a higher resolution later (see section 6)
# community 1 is also suggested to further subdivide, and BC6 is shown to have less cells here compared to BC1
# please see section 6 - Vary resolution

# % distribution of each cell cluster across different conditions 
round(prop.table(table(Idents(so_donorint), so_donorint$Conditions))*100,3)
# community 0 = 22.6% (BC1) 52.3% (BC6) differs most significantly



############################################################

# 5 Annotate clusters ####

# set default assay to 'RNA' and normalize gene expression data in 'RNA'
DefaultAssay(so_donorint) <- "RNA"
so_donorint_dp_norm <- NormalizeData(so_donorint, verbose = TRUE)

# now define list of genes for biological markers - likely communities in urothelium
# likely urothelial markers + others 
features <- c(
              "TERT",                               # telomerase
              "UPK1A",                              # marker for urothelial differentiation
              
              "UPK2", "EPCAM", "KRT18", "KRT13",    # urothelial markers
              "COL1A1", "CALD1",                    # muscle markers
              "PECAM1",                             # endothelial
              "DCN", "PDPN", "TAGLN",               # fibroblasts
              "CD2", "CD3D", "CD3E",                # T cells
              "C1QC", "CD14", "CSF1R",             # macrophages/myeloid
            "MRC1")

# DotPlot to visualize expression of list of genes across different clusters of cells
# look at intensity/ proportion of cells expressing each marker in each community
# list of genes were defined previously
DotPlot(so_donorint_dp_norm, features = features, dot.scale=8) + 
  theme(axis.text.x = element_text(size = 11)) + 
  RotatedAxis ()

ggsave("plots5/10_Dotplot.png", bg = "white")
# urothelial markers are shown to have high % expression - expected 

# FeaturePlot to visualize expression of selected genes across UMAP-reduced data set
# check how specific the markers are 
FeaturePlot(so_donorint_dp_norm, reduction = "umap", 
            pt.size = 2, alpha = 0.7, order = TRUE, 
            features = c("UPK2", "EPCAM","PECAM1", "CD2", 
                         "C1QC", "TERT", "CD3D", 
                         "CD3E", "MRC1", "CD19", "CD20", "CD22"))

ggsave("plots5/11_featureplot.png")

# community 2 (although later subdivides as suggested by clustertree) both appears to be immune cells
# C1QC highly expressed = macrophages // CD3D highly expressed = T cells
# communities 1,0 contains urothelial markers but subdivision was shown in cluster tree
# community 1, 0 important as significant difference observed between BC1, BC6
# find differences with differential expression analysis then visualize with volcano
# visualize genes expressed in each community, and attempt to categorize them to a theme i.e immune/ cell cycle related

# now prepare DEA
# switch back to integrated assay for comparisons (not for plotting)
DefaultAssay(so_donorint) <- "integrated"

# run DEA to identify differentially expressed genes between community 0, 1
uro.markers <- FindMarkers(so_donorint, ident.1 = 0, ident.2 = 1)

# check data frame 
head(uro.markers)

# avoid errors caused by small p-values downstream by setting p-value limits
# all adjusted p-value > or = to (1e-300)
uro.markers["p_val_adj"] <- lapply(uro.markers["p_val_adj"], pmax, 1e-300)

# preparation for volcano plot
# categorize genes (in uro.markers) based on their DEA results
# create new column 'DEA' to classify each gene as up, down, no based on log fold-change and adjusted p-value
uro.markers$DEA <- "NO"
uro.markers$DEA[uro.markers$avg_log2FC > 1 & uro.markers$p_val_adj < 0.05] <- "UP"
uro.markers$DEA[uro.markers$avg_log2FC < -1 & uro.markers$p_val_adj < 0.05] <- "DOWN"
uro.markers$genes <- rownames(uro.markers)

# Volcano plot to visualize DEA results for comparison between ident.1 = 0, ident.2 = 1 at res 0.05
ggplot(uro.markers, aes(x=avg_log2FC, y=-log10(p_val_adj))) + 
  geom_point(aes(colour = DEA), show.legend = FALSE) + 
  scale_colour_manual(values = c("blue", "gray", "red")) +
  geom_hline(yintercept = -log10(0.05), linetype = "dotted") + 
  geom_vline(xintercept = c(-1,1), linetype = "dotted") + 
  theme_bw() + 
  theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) +
  geom_text_repel(data=subset(uro.markers, (abs(avg_log2FC) > 4 & p_val_adj < 0.05)), aes(x=avg_log2FC, y=-log10(p_val_adj), label=genes), max.overlaps = 1000, size=4.5)

ggsave("plots5/12_volcano_res0.05.png")

# Extract upregulated genes (log fold change > 4, p-value < 0.05 )
upreg_res_0.05 <- uro.markers[uro.markers$avg_log2FC > 4 & uro.markers$p_val_adj < 0.05, "genes"]

# Extract downregulated genes (log fold change > 4, p-value < 0.05 )
downreg_res_0.05<- uro.markers[uro.markers$avg_log2FC < -3 & uro.markers$p_val_adj < 0.05, "genes"]

print(upreg_res_0.05) 
print(downreg_res_0.05)

# upreg genes red (expressed in ident.1 = 0) 
# IGHG4 (immune), HS3ST2 (biosynthesis), BGN (inflammation), PALDI (cell migration + cancer), SCN10A (Na channel subunit, neuronal)
# cannot really define community 0 just yet and is suggested to subdivide based on cluster tree plot
# Will look again at higher resolution (see section 7)

# downreg genes blue (expressed in ident.2 = 1)
# majority linked to cell cycle and division in community 1 
# Not suggested to further subdivide. Go ahead with cell cycle scoring 


######################################################

content_copyCOPY

4) Find clusters 5) Annotate clusters