labbook part 1-3
Thu Jan 23 2025 19:57:35 GMT+0000 (Coordinated Universal Time)
Saved by @eho135
# I am working on scRNAseq data derived from human bladder cancer 'Urothelial carcinoma' # Urothelial carcinoma has 2 subtypes - muscle invasive (MIBC), non-muscle invasive (NMIBC) # NMIBC has recurrence rate 70%-80% if untreated and approx 50% progress to MIBC (Aya T Shalata et al. 2022), where MIBC has a 50-70% mortality rate! (Jong Chul Park et al. 2014) # I will look at urothelial heterogeneity within non-malignant NMIBC papilloma as intratumoural heterogeneity is a major driver to drug resistance (NA Saunders et al. 2012) # Data comes from 2 individuals. (M 67yrs, M 58yrs) # sample BC1 = low grade Ta non-malignant NMIBC cells from (M, 67yrs) # sample BC6 = low grade Ta non-malignant NMIBC cells from (M, 58yrs) # differences between the two cell sets could be, but not limited to person differences (age, genetic mutations, lifestyle) # Firstly, I will access the Genomics shared libraries .libPaths("libs/R_4.3.3") # load relevant libraries library(tidyverse) citation("tidyverse") library(Seurat) citation("Seurat") library(clustree) citation("clustree") library(ggplot2) citation("ggplot2") library(ggrepel) citation("ggrepel") library(fgsea) citation("fgsea") # 1 Load, preview and explore data #### counts <- Read10X("data5/02_assessment_options/assessment02_BC1-BC6") # check dimensions of data (i.e number of genes - rows, cells - columns) dim(counts) # number of rows = 38606, columns = 16155 # quick preview of data. Structure, type, first few values str(counts) # setting IDs and conditions (here = cancer T stages) for counts ids <- c("BC1", "BC6") con <- c("BC1", "BC6") # create a table 'cell_counts' based on column names from 'counts' # create metadata df, and annotate my data with labels cell_counts <- table(substring(colnames(counts),18)) # construct two vectors 'sample', 'conditions' from 'ids', 'con' based on 'cell_counts' # set up sample and condition labels for the backgrounds samples <- c() for (s in 1:length(ids)) { samples <- c(samples, (rep(paste(ids)[s], cell_counts[s]))) } conditions <- c() for (c in 1:length(con)) { conditions <- c(conditions, (rep(paste(con)[c], cell_counts[c]))) } # now create a metadata df metadata <- data.frame(Sample=samples, Conditions=conditions, row.names=colnames(counts)) # then check metadata df cell numbers match expected metadata |> group_by(Sample) |> summarise(Cells=n()) metadata |> group_by(Conditions) |> summarise(Cells=n()) # create seurat object 'so' from data - R package used for scRNAseq analysis that stores, organise scRNAseq data, metadata, results # rows = genes, columns = cells # 'meta.data' specifies a metadata df which has additional info of each cell (ie cell types) so <- CreateSeuratObject(counts=counts, project="Genomics_W5assessment", meta.data=metadata) # assign cell identities based on 'conditions' column from metadata # for further analysis to consider cells belonging to different conditions # i.e later DEA, clustering etc can be done based on 'conditions' Idents(so) <- so@meta.data$Conditions # check seurat version packageVersion("Seurat") # remove unused variables to tidy up memory rm(c, s, cell_counts, con, conditions, ids, samples, counts, metadata) # ################################### # # 2 Data QC check #### # removing low quality data from data set # high Mt% indicates dying cells, often correlated with low number of features # high number of features suggests 2 cells are present in sample instead of single cell # calc %MT gene expression # add a new metadata column 'percent.mt' of %MT for each cell so <- PercentageFeatureSet(so, pattern="^MT-", col.name="percent.mt") # Visualize distribution of MT gene percentages VlnPlot(so, features = "percent.mt") # violin plot to visualize and compare distribution of 2 selected features - number of features, MT% # across different cell groups (i.e sample or condition) # assists QC by easing detection of any outliers in my data (low quality/stressed/dying cells) VlnPlot(so, features=c("nFeature_RNA","percent.mt"), ncol=2, pt.size=0, group.by="Sample", raster=FALSE) # distribution pattern is different for BC1 BC6 in nFeature_RNA # BC1 data is too evenly distributed, no median possible to be observed # In contrast BC6 has nice violin shaped distribution making it relatively easier to decide cut off points to remove low quality data # so I decide data cut off threshold based on BC6 # save plot ggsave("plots5/1_violinplot_preQC.png") # Summarize key stats based on distribution above, related to %MT and number of features # Create a summary table summary_metadata <- so@meta.data |> group_by(Sample) |> summarize( mt_med = quantile(percent.mt, 0.5), mt_q95 = quantile(percent.mt, 0.95), mt_q75_plus_IQR = quantile(percent.mt, 0.75) + (1.5 * IQR(percent.mt)), feat_med = median(nFeature_RNA), feat_q95 = quantile(nFeature_RNA, 0.95), feat_q75_plus_IQR = quantile(nFeature_RNA, 0.75) + (1.5 * IQR(nFeature_RNA)), count = n() ) # print the table summary_metadata # scatter plot number of features VS %MT for each cell # assists my QC by identifying cells with high MT%, low features (indicative of dying cells/ empty droplets) ggplot(so@meta.data, aes(nFeature_RNA, percent.mt, colour=Sample)) + geom_point(alpha=0.2) + facet_grid(.~Sample) + theme_bw() ggsave("plots5/2_nfeatures_vs_%MT_preQC.png") # now remove low feature/high MT% cells using UQ + (1.5*IQR) rule # find the upper threshold limit for MT% in both samples, then take the lowest value mt_filt <- min(summary_metadata$mt_q75_plus_IQR) # find upper threshold limit for feature number in both, then take the lowest value feat_max_filt <- min(summary_metadata$feat_q75_plus_IQR) # print proposed thresholds mt_filt # 6.370424 feat_max_filt # 5279.75 # These are proposed threshold, but I need to take closer look at data first to decide cut off points # Taking a closer look at low quality data first - I will use 2 graphs, one for each feature # this is in preparation to filter data after visualization # 1 - zoomed-in version scatter plot focusing on cells < 10% MT content (clear up feature 1) # I can exclude cells that are potentially stressed/ dying more clearly since zoomed in view ggplot(subset(so@meta.data, subset = percent.mt < 10), aes(nFeature_RNA, percent.mt, colour=Sample)) + geom_point(alpha=0.2) + facet_grid(.~Sample) + theme_bw() + xlim(0,8000) + # set limits to x axis to zoom in when viewing graph to decide cut off point ggsave("plots5/3_%MT_zoomedlowqual_QC.png") # 2 - density plot for number of features detected per cell (clear up feature 2) # across different samples in the seurat object 'so' I created earlier # from plot I can identify potential problems like under-sampling or a variability in number of genes detected ggplot(so@meta.data, aes(x=nFeature_RNA, colour=Sample)) + geom_density() + facet_grid(.~Sample) + theme_bw() + xlim(0,3500) # set limits for x axis to zoom in when viewing graph to decide cut off point, the 'dip' in graph # graph 'dip' is approx at 1800 on x axis ggsave("plots5/4_nfeature_zoomedlowqual_QC.png") # Now I can begin data filtering process! - filter based on 3 conditions # 1 - cells < 1800 features (genes) detected # 2 - cells with more than number of features defined by 'feat_max_filt' # 3 - % MT gene expression. Threshold 6.370424 prev defined by 'mt_filt'. Again, filtering out cells with high MT content as indicative of low quality cells (i.e dying) so <- subset(so, subset = nFeature_RNA >= 1800 & nFeature_RNA <= feat_max_filt & percent.mt <= mt_filt) # after filtering I have new data set - post QC # so generate new summary data for filtered data set so@meta.data |> group_by(Sample) |> summarize(mt_med = quantile(percent.mt, 0.5), mt_q95 = quantile(percent.mt, 0.95), mt_q75_plus_IQR = quantile(percent.mt, 0.75) + (1.5 * IQR(percent.mt)), feat_med = median(nFeature_RNA), feat_q95 = quantile(nFeature_RNA, 0.95), feat_q75_plus_IQR = quantile(nFeature_RNA, 0.75) + (1.5 * IQR(nFeature_RNA)), count = n()) # now visualize new summary data post QC - violin plot like previous # violin plot to visualize and compare distribution of 2 selected features - number of features, MT% # across different cell groups (i.e sample or condition) VlnPlot(so, features=c("nFeature_RNA", "percent.mt"), ncol=2, pt.size=0, group.by="Sample", raster=FALSE) # Recap - I have chosen thresholds based on BC6 as it had better initial distribution to decide data cut off points # filtered data only shows cells of good quality facilitating my downstream analysis ggsave("plots5/5_violinplot_postQC.png") # some points to consider about the QC process is that : # same filters are applied to both data sets BC1 BC6, which would surely have differences (i.e starting distributions, sequencing depth across data sets) # a threshold that works well for one data set may i.e discard good quality cells in another # but can only try our best when deciding cut-off point - i.e zooming into graph. It is better than including bad quality data! # create save point save.image("savepointA_QC-complete.RData") # ############################################## # # 3 Data Integration #### # merge data sets after filtering each data sets individually. Need to ensure filtering doesn't bias data # potential problem - imbalanced data set (i.e one was aggressively filtered = has far fewer cells than other, could cause bias downstream) # use stats to account for these differences (i.e normalization, batch correction, merging strategies) # here I use normalization # Merging data is crucial for comparison # different cell types from one person can be more similar than same cell type from 2 people # this means same cell type from different donors would be separated during analysis/ clustering - not useful # integration needed to allow comparison # split objects by conditions defined in section 1 con.list <- SplitObject(so, split.by = "Conditions") # normalize and identify variable features for each data set independently # also ensure the modified object is returned so con.list is updated # top 3000 most variable genes selected for downstream analysis con.list <- lapply(X = con.list, FUN = function(x) { x <- NormalizeData(x, normalization.method = "LogNormalize") x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 3000) return(x) }) # Scale the data and perform PCA for each condition con.list <- lapply(X = con.list, FUN = function(x) { x <- ScaleData(x) x <- RunPCA(x) return(x) }) # after normalization and variable feature selection - standard thing to do is integrate the 2 conditions based on shared variable features # then can proceed with other analyses such as scaling, PCA, clustering, or differential expression # I prev selected top 3000 most variable genes to look at (nfeature) # when shared variable features close to 3000 = likely lose biological difference # but some similarities/ shared variables (to certain extent) needed to merge 2 data sets = ~600-2400, closer halfway the better # check overlap between most variable features in 2 conditions (BC1,BC6) length(intersect(VariableFeatures(con.list$BC1), VariableFeatures(con.list$BC6))) # 1385 # select features repeatedly variable across data sets for integration intfeats <- SelectIntegrationFeatures(object.list = con.list, nfeatures = 3000) # integration - finally! Now combine data sets using anchors int.anchors <- FindIntegrationAnchors(object.list = con.list, anchor.features = intfeats) so_donorint <- IntegrateData(anchorset = int.anchors) # scaling data to ensure genes with higher variance don't dominate analysis following integration, each feature contributes equally # set default assay to 'integrated' so downstream analysis uses integrated data (corrected values) # instead of data before integration/ raw counts etc DefaultAssay(so_donorint) <- "integrated" so_donorint <- ScaleData(so_donorint, features = rownames(so_donorint)) # clear up after integration # splitted conditions, anchors used to integrate datasets, features selected for integration, original 'so' object before processing rm(con.list, int.anchors, intfeats, so) # create save point save.image("savepointB_QC-integration-complete.RData")
1) Load and preview of data 2) data QC 3_ data integration
Comments