labbook part 4-5
Thu Jan 23 2025 20:07:12 GMT+0000 (Coordinated Universal Time)
Saved by @eho135
# 4 Find clusters of cell types #### # Data quality is sorted following QC (section 2) and integration (section 3) - data is single analysis object! # I can now look at biology - try identify communities of cells alike. genes can work in pathways so not equally informative # use PCA to reduce dimension and identify patterns of shared variance then derive (potential) communities # also work out how many clusters are (potentially!) informative # run PCA on integrated data set # recap PCA reduces dimensions in data, only capture max variance in data so easy to visualize # in context of scRNAseq, I'm using PCA to get main sources of variation in gene expression so_donorint <- RunPCA(so_donorint) # top 5 PCs shown, biggest contributor in each direction are : # gene info obtained via uniprot # PC1 positive = HLA-DRB6 - antigen presenting to CD4+ T cells, immune response # PC1 negative = ADIRF - regulates adipogenesis and has a role in fat metabolism # PC2 positive = TXNIP - negative regulator of thioredoxin, tumour suppressor gene # PC2 negative = MT-CO1 - component of cytochrome c oxidase for respiration # pick the most informative PC # elbow plot to show me percentage variance explained by each PC ElbowPlot(so_donorint) # As number of PC increases, the variance explained by each subsequent PC decrease more gradually # The "elbow" is the point where the curve flattens out - stop considering additional PC as they contribute little to explaining data's variance. # after running PCA to assess how much variance in each PC # calc % variance explained by each PC and cumulative % variance across all PCs # helps me determine number of PCs to keep for downstream analysis pct <- so_donorint[["pca"]]@stdev / sum(so_donorint[["pca"]]@stdev) * 100 cumu <- cumsum(pct) # identify point/PC where cumulative variance > 90%, % variance explained < 5% (does not contribute much to variance) co1 <- which(cumu > 90 & pct < 5)[1] co1 # 43 # identify point/PC where difference in % variance explained (contribution to variance) between 2 consecutive PCs is > 0.1% # I can find the point of significant drop in contribution of variance explained by each PC co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] co2 # 12 # select minimum value between the 2 measures (co1, co2) # provides criteria to decide when to stop including additional PCs # co1 - point where a PC has cumulative variance > 90%, variance explained < 5% # co2 - point where there's significant drop in variance explained between 2 consecutive PCs pcs <- min(co1, co2) pcs # 12 # Here is comparison of point at 90% variance explained and where PCAs stops being informative # create data frame containing 3 variables # pct - % variance contributed by each PC # cumu - cumulative % variance contributed by PCs # rank - ranking of each PC showing its order in analysis cumu_df <- data.frame(pct = pct, cumu = cumu, rank = 1:length(pct)) # Elbow plot to visualize relation between cumulative variance explained, % variance contribution by each PC ggplot(cumu_df, aes(cumu, pct, label = rank, color = rank > pcs)) + geom_text() + theme_bw() # clean up - remove objects unnecessary for downstream analysis rm(pct, cumu, cumu_df, co1, co2) # PCA-based scatter plot to visualize 4 selected features/genes - check if PCA successful # cells expressing these markers are shaded from dark blue (most expression) to pale pink (least/none) FeaturePlot(so_donorint, reduction = "pca", dims = c(1, 2), features = c("HLA-DRB6", "ADIRF", "TXNIP", "MT-CO1"), cols = c("#fcd5ce", "#4886af"), pt.size = 2.5, alpha = 0.7, order = TRUE,) ggsave("plots5/6_featureplot.png") # look at first 2 PCs # gene 'MT-CO1' shown to have greatest expression across PC1 PC2, closely followed by gene 'ADIRF' # MT-CO1 involved in respiration and could be linked to providing energy required for proliferation # since BC1 BC6 = non-malignant, proliferation of the communities highly expressing MT-CO1 should be a contributor to non-malignancy # but need to perform more analysis before making any conclusions # gene 'HLA-DRB6' have least expression across PC1 PC2 # Neighbor analysis - to find relationships between cells (calc neighborhoods) based on first PC DefaultAssay(so_donorint) <- "integrated" so_donorint <- FindNeighbors(so_donorint, dims = 1:pcs) # problems with neighbor analysis - finds potential communities but does not give reference of how many communities should be present # community detection starts with all cells in 1 community - followed by successive subdivisions # using cluster tree = can visualize how communities derived + subdivided # so, perform clustering analysis across multiple resolution values # then visualize resulting clusters by cluster tree plot so_donorint_orgs <- FindClusters(so_donorint, resolution = seq(0, 0.3, 0.05)) clustree(so_donorint_orgs, show_axis = TRUE) + theme(legend.position = "none") + ylab("Resolution") ggsave("plots5/7_clustertree_lineage.png") # cluster tree plot shows successive community divisions as resolution increases # issue is - most numerous type often results in most subgroups. Purely due to its high abundance and not biologically different # I will therefore start my analysis at low resolution to monitor closely the main community lineages # OK before that clear up first rm(so_donorint_orgs) # set initial resolution to 0.05 for clustering cells - I should see 3 communities res <- 0.05 # now assign cells to a cluster and run UMAP - reduce dimensions so I can visualize # UMAP allows me to visualize clusters based on first PCA # 2D representation of cells to visualize similarities/differences based on high dimensional features (i.e gene expression) - based on first PCA so_donorint <- FindClusters(so_donorint, resolution = res) so_donorint <- RunUMAP(so_donorint, dims=1:pcs, return.model = TRUE, verbose = FALSE) # UMAP plot - Visualization of my clustered data DimPlot(so_donorint, reduction = "umap", label = TRUE, label.size = 6) # based on cluster tree plot I should see 3 communities # however population 1 and 2 each shown as 2 separate groups suggesting further subdivision # I will test with higher resolution later in this analysis to see subdivisions for different cell types (see section 6) # save plot first ggsave("plots5/8_UMAP_res0.05.png") # split UMAP plot - based on 'conditions' + add labels # shows how clusters are distributed across conditions BC1 BC6 DimPlot(so_donorint, reduction = "umap", split.by = "Conditions", label = TRUE, label.size = 6) ggsave("plots5/9_splitUMAP_res0.05.png") # samples are identical - same grade, same non-malignant NMIBC tumour pattern should be the same # differences are observed and could be due to person differences. Samples are from individuals of different ages, and possibly very different lifestyles, health status # genetic differences, mutations can also contribute to these differences # community 0 from BC1 lacks a great number of cells in comparison to BC6 # community 2 is suggested to further subdivide and I will look at a higher resolution later (see section 6) # community 1 is also suggested to further subdivide, and BC6 is shown to have less cells here compared to BC1 # please see section 6 - Vary resolution # % distribution of each cell cluster across different conditions round(prop.table(table(Idents(so_donorint), so_donorint$Conditions))*100,3) # community 0 = 22.6% (BC1) 52.3% (BC6) differs most significantly ############################################################ # 5 Annotate clusters #### # set default assay to 'RNA' and normalize gene expression data in 'RNA' DefaultAssay(so_donorint) <- "RNA" so_donorint_dp_norm <- NormalizeData(so_donorint, verbose = TRUE) # now define list of genes for biological markers - likely communities in urothelium # likely urothelial markers + others features <- c( "TERT", # telomerase "UPK1A", # marker for urothelial differentiation "UPK2", "EPCAM", "KRT18", "KRT13", # urothelial markers "COL1A1", "CALD1", # muscle markers "PECAM1", # endothelial "DCN", "PDPN", "TAGLN", # fibroblasts "CD2", "CD3D", "CD3E", # T cells "C1QC", "CD14", "CSF1R", # macrophages/myeloid "MRC1") # DotPlot to visualize expression of list of genes across different clusters of cells # look at intensity/ proportion of cells expressing each marker in each community # list of genes were defined previously DotPlot(so_donorint_dp_norm, features = features, dot.scale=8) + theme(axis.text.x = element_text(size = 11)) + RotatedAxis () ggsave("plots5/10_Dotplot.png", bg = "white") # urothelial markers are shown to have high % expression - expected # FeaturePlot to visualize expression of selected genes across UMAP-reduced data set # check how specific the markers are FeaturePlot(so_donorint_dp_norm, reduction = "umap", pt.size = 2, alpha = 0.7, order = TRUE, features = c("UPK2", "EPCAM","PECAM1", "CD2", "C1QC", "TERT", "CD3D", "CD3E", "MRC1", "CD19", "CD20", "CD22")) ggsave("plots5/11_featureplot.png") # community 2 (although later subdivides as suggested by clustertree) both appears to be immune cells # C1QC highly expressed = macrophages // CD3D highly expressed = T cells # communities 1,0 contains urothelial markers but subdivision was shown in cluster tree # community 1, 0 important as significant difference observed between BC1, BC6 # find differences with differential expression analysis then visualize with volcano # visualize genes expressed in each community, and attempt to categorize them to a theme i.e immune/ cell cycle related # now prepare DEA # switch back to integrated assay for comparisons (not for plotting) DefaultAssay(so_donorint) <- "integrated" # run DEA to identify differentially expressed genes between community 0, 1 uro.markers <- FindMarkers(so_donorint, ident.1 = 0, ident.2 = 1) # check data frame head(uro.markers) # avoid errors caused by small p-values downstream by setting p-value limits # all adjusted p-value > or = to (1e-300) uro.markers["p_val_adj"] <- lapply(uro.markers["p_val_adj"], pmax, 1e-300) # preparation for volcano plot # categorize genes (in uro.markers) based on their DEA results # create new column 'DEA' to classify each gene as up, down, no based on log fold-change and adjusted p-value uro.markers$DEA <- "NO" uro.markers$DEA[uro.markers$avg_log2FC > 1 & uro.markers$p_val_adj < 0.05] <- "UP" uro.markers$DEA[uro.markers$avg_log2FC < -1 & uro.markers$p_val_adj < 0.05] <- "DOWN" uro.markers$genes <- rownames(uro.markers) # Volcano plot to visualize DEA results for comparison between ident.1 = 0, ident.2 = 1 at res 0.05 ggplot(uro.markers, aes(x=avg_log2FC, y=-log10(p_val_adj))) + geom_point(aes(colour = DEA), show.legend = FALSE) + scale_colour_manual(values = c("blue", "gray", "red")) + geom_hline(yintercept = -log10(0.05), linetype = "dotted") + geom_vline(xintercept = c(-1,1), linetype = "dotted") + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + geom_text_repel(data=subset(uro.markers, (abs(avg_log2FC) > 4 & p_val_adj < 0.05)), aes(x=avg_log2FC, y=-log10(p_val_adj), label=genes), max.overlaps = 1000, size=4.5) ggsave("plots5/12_volcano_res0.05.png") # Extract upregulated genes (log fold change > 4, p-value < 0.05 ) upreg_res_0.05 <- uro.markers[uro.markers$avg_log2FC > 4 & uro.markers$p_val_adj < 0.05, "genes"] # Extract downregulated genes (log fold change > 4, p-value < 0.05 ) downreg_res_0.05<- uro.markers[uro.markers$avg_log2FC < -3 & uro.markers$p_val_adj < 0.05, "genes"] print(upreg_res_0.05) print(downreg_res_0.05) # upreg genes red (expressed in ident.1 = 0) # IGHG4 (immune), HS3ST2 (biosynthesis), BGN (inflammation), PALDI (cell migration + cancer), SCN10A (Na channel subunit, neuronal) # cannot really define community 0 just yet and is suggested to subdivide based on cluster tree plot # Will look again at higher resolution (see section 7) # downreg genes blue (expressed in ident.2 = 1) # majority linked to cell cycle and division in community 1 # Not suggested to further subdivide. Go ahead with cell cycle scoring ######################################################
4) Find clusters 5) Annotate clusters
Comments