labbook part 1-3

PHOTO
Thu Jan 23 2025 19:57:35 GMT+0000 (Coordinated Universal Time)
# I am working on scRNAseq data derived from human bladder cancer 'Urothelial carcinoma'
# Urothelial carcinoma has 2 subtypes - muscle invasive (MIBC), non-muscle invasive (NMIBC)
# NMIBC has recurrence rate 70%-80% if untreated and approx 50% progress to MIBC (Aya T Shalata et al. 2022), where MIBC has a 50-70% mortality rate! (Jong Chul Park et al. 2014)
# I will look at urothelial heterogeneity within non-malignant NMIBC papilloma as intratumoural heterogeneity is a major driver to drug resistance (NA Saunders et al. 2012)

# Data comes from 2 individuals. (M 67yrs, M 58yrs)
# sample BC1 = low grade Ta non-malignant NMIBC cells from (M, 67yrs)
# sample BC6 = low grade Ta non-malignant NMIBC cells from (M, 58yrs)
# differences between the two cell sets could be, but not limited to person differences (age, genetic mutations, lifestyle)

# Firstly, I will access the Genomics shared libraries 
.libPaths("libs/R_4.3.3")

# load relevant libraries
library(tidyverse)
citation("tidyverse")
library(Seurat)
citation("Seurat")
library(clustree)
citation("clustree")
library(ggplot2)
citation("ggplot2")
library(ggrepel)
citation("ggrepel")
library(fgsea)
citation("fgsea")

# 1 Load, preview and explore data ####
counts <- Read10X("data5/02_assessment_options/assessment02_BC1-BC6")

# check dimensions of data (i.e number of genes - rows, cells - columns)
dim(counts)
# number of rows = 38606, columns = 16155

# quick preview of data. Structure, type, first few values
str(counts)

# setting IDs and conditions (here = cancer T stages) for counts
ids <- c("BC1", "BC6")
con <- c("BC1", "BC6")

# create a table 'cell_counts' based on column names from 'counts'
# create metadata df, and annotate my data with labels
cell_counts <- table(substring(colnames(counts),18))

# construct two vectors 'sample', 'conditions' from 'ids', 'con' based on 'cell_counts'
# set up sample and condition labels for the backgrounds
samples <- c()
for (s in 1:length(ids)) { 
  samples <- c(samples, (rep(paste(ids)[s], cell_counts[s])))
} 
conditions <- c()
for (c in 1:length(con)) { 
  conditions <- c(conditions, (rep(paste(con)[c], cell_counts[c])))
} 

# now create a metadata df
metadata <- data.frame(Sample=samples, Conditions=conditions, row.names=colnames(counts))

# then check metadata df cell numbers match expected 
metadata |> group_by(Sample) |> summarise(Cells=n())
metadata |> group_by(Conditions) |> summarise(Cells=n())

# create seurat object 'so' from data - R package used for scRNAseq analysis that stores, organise scRNAseq data, metadata, results 
# rows = genes, columns = cells
# 'meta.data' specifies a metadata df which has additional info of each cell (ie cell types)
so <- CreateSeuratObject(counts=counts, project="Genomics_W5assessment", meta.data=metadata)

# assign cell identities based on 'conditions' column from metadata
# for further analysis to consider cells belonging to different conditions
# i.e later DEA, clustering etc can be done based on 'conditions'
Idents(so) <- so@meta.data$Conditions

# check seurat version
packageVersion("Seurat")

# remove unused variables to tidy up memory
rm(c, s, cell_counts, con, conditions, ids, samples, counts, metadata)

# ################################### #

# 2 Data QC check ####

# removing low quality data from data set 
# high Mt% indicates dying cells, often correlated with low number of features
# high number of features suggests 2 cells are present in sample instead of single cell

# calc %MT gene expression  
# add a new metadata column 'percent.mt' of %MT for each cell
so <- PercentageFeatureSet(so, pattern="^MT-", col.name="percent.mt")

# Visualize distribution of MT gene percentages 
VlnPlot(so, features = "percent.mt")

# violin plot to visualize and compare distribution of 2 selected features - number of features, MT%
# across different cell groups (i.e sample or condition)
# assists QC by easing detection of any outliers in my data (low quality/stressed/dying cells)
VlnPlot(so, features=c("nFeature_RNA","percent.mt"), 
        ncol=2, pt.size=0, group.by="Sample", raster=FALSE)

# distribution pattern is different for BC1 BC6 in nFeature_RNA
# BC1 data is too evenly distributed, no median possible to be observed
# In contrast BC6 has nice violin shaped distribution making it relatively easier to decide cut off points to remove low quality data
# so I decide data cut off threshold based on BC6 

# save plot
ggsave("plots5/1_violinplot_preQC.png")

# Summarize key stats based on distribution above, related to %MT and number of features 
# Create a summary table 
summary_metadata <- so@meta.data |> 
  group_by(Sample) |> 
  summarize(
    mt_med = quantile(percent.mt, 0.5),
    mt_q95 = quantile(percent.mt, 0.95),
    mt_q75_plus_IQR = quantile(percent.mt, 0.75) + (1.5 * IQR(percent.mt)),
    feat_med = median(nFeature_RNA),
    feat_q95 = quantile(nFeature_RNA, 0.95),
    feat_q75_plus_IQR = quantile(nFeature_RNA, 0.75) + (1.5 * IQR(nFeature_RNA)),
    count = n()
  )

# print the table
summary_metadata

# scatter plot number of features VS %MT for each cell 
# assists my QC by identifying cells with high MT%, low features (indicative of dying cells/ empty droplets)
ggplot(so@meta.data, aes(nFeature_RNA, percent.mt, colour=Sample)) +
  geom_point(alpha=0.2) + 
  facet_grid(.~Sample) + 
  theme_bw()

ggsave("plots5/2_nfeatures_vs_%MT_preQC.png")

# now remove low feature/high MT% cells using UQ + (1.5*IQR) rule
# find the upper threshold limit for MT% in both samples, then take the lowest value
mt_filt <- min(summary_metadata$mt_q75_plus_IQR)

# find upper threshold limit for feature number in both, then take the lowest value
feat_max_filt <- min(summary_metadata$feat_q75_plus_IQR)

# print proposed thresholds
mt_filt # 6.370424
feat_max_filt # 5279.75
# These are proposed threshold, but I need to take closer look at data first to decide cut off points  

# Taking a closer look at low quality data first - I will use 2 graphs, one for each feature
# this is in preparation to filter data after visualization 

# 1 - zoomed-in version scatter plot focusing on cells < 10% MT content (clear up feature 1)
# I can exclude cells that are potentially stressed/ dying more clearly since zoomed in view
ggplot(subset(so@meta.data, subset = percent.mt < 10), aes(nFeature_RNA, percent.mt, colour=Sample)) + 
  geom_point(alpha=0.2) + 
  facet_grid(.~Sample) + 
  theme_bw() + 
  xlim(0,8000) + # set limits to x axis to zoom in when viewing graph to decide cut off point

ggsave("plots5/3_%MT_zoomedlowqual_QC.png")

# 2 - density plot for number of features detected per cell (clear up feature 2) 
# across different samples in the seurat object 'so' I created earlier
# from plot I can identify potential problems like under-sampling or a variability in number of genes detected
ggplot(so@meta.data, aes(x=nFeature_RNA, colour=Sample)) + 
  geom_density() + 
  facet_grid(.~Sample) + 
  theme_bw() + 
  xlim(0,3500) # set limits for x axis to zoom in when viewing graph to decide cut off point, the 'dip' in graph 
# graph 'dip' is approx at 1800 on x axis

ggsave("plots5/4_nfeature_zoomedlowqual_QC.png")

# Now I can begin data filtering process! - filter based on 3 conditions
# 1 - cells < 1800 features (genes) detected
# 2 - cells with more than number of features defined by 'feat_max_filt' 
# 3 - % MT gene expression. Threshold 6.370424 prev defined by 'mt_filt'. Again, filtering out cells with high MT content as indicative of low quality cells (i.e dying)
so <- subset(so, subset = nFeature_RNA >= 1800 & nFeature_RNA <= feat_max_filt & percent.mt <= mt_filt)

# after filtering I have new data set - post QC
# so generate new summary data for filtered data set 
so@meta.data |> group_by(Sample) |> summarize(mt_med = quantile(percent.mt, 0.5),
                                              mt_q95 = quantile(percent.mt, 0.95),
                                              mt_q75_plus_IQR = quantile(percent.mt, 0.75) + (1.5 * IQR(percent.mt)),
                                              feat_med = median(nFeature_RNA),
                                              feat_q95 = quantile(nFeature_RNA, 0.95),
                                              feat_q75_plus_IQR = quantile(nFeature_RNA, 0.75) + (1.5 * IQR(nFeature_RNA)),
                                              count = n())

# now visualize new summary data post QC - violin plot like previous
# violin plot to visualize and compare distribution of 2 selected features - number of features, MT%
# across different cell groups (i.e sample or condition)
VlnPlot(so, features=c("nFeature_RNA", "percent.mt"), 
        ncol=2, pt.size=0, group.by="Sample", raster=FALSE)

# Recap - I have chosen thresholds based on BC6 as it had better initial distribution to decide data cut off points
# filtered data only shows cells of good quality facilitating my downstream analysis

ggsave("plots5/5_violinplot_postQC.png")

# some points to consider about the QC process is that : 
# same filters are applied to both data sets BC1 BC6, which would surely have differences (i.e starting distributions, sequencing depth across data sets)
# a threshold that works well for one data set may i.e discard good quality cells in another
# but can only try our best when deciding cut-off point - i.e zooming into graph. It is better than including bad quality data!

# create save point
save.image("savepointA_QC-complete.RData")

# ############################################## #

# 3 Data Integration ####

# merge data sets after filtering each data sets individually. Need to ensure filtering doesn't bias data
# potential problem - imbalanced data set (i.e one was aggressively filtered = has far fewer cells than other, could cause bias downstream)
# use stats to account for these differences (i.e normalization, batch correction, merging strategies)
# here I use normalization 

# Merging data is crucial for comparison 
# different cell types from one person can be more similar than same cell type from 2 people 
# this means same cell type from different donors would be separated during analysis/ clustering - not useful
# integration needed to allow comparison

# split objects by conditions defined in section 1 
con.list <- SplitObject(so, split.by = "Conditions")

# normalize and identify variable features for each data set independently
# also ensure the modified object is returned so con.list is updated
# top 3000 most variable genes selected for downstream analysis
con.list <- lapply(X = con.list, FUN = function(x) {
  x <- NormalizeData(x, normalization.method = "LogNormalize")
  x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 3000)
  return(x)  
})

# Scale the data and perform PCA for each condition
con.list <- lapply(X = con.list, FUN = function(x) {
  x <- ScaleData(x)
  x <- RunPCA(x)
  return(x)
})

# after normalization and variable feature selection - standard thing to do is integrate the 2 conditions based on shared variable features
# then can proceed with other analyses such as scaling, PCA, clustering, or differential expression
# I prev selected top 3000 most variable genes to look at (nfeature)
# when shared variable features close to 3000 = likely lose biological difference
# but some similarities/ shared variables (to certain extent) needed to merge 2 data sets = ~600-2400, closer halfway the better

# check overlap between most variable features in 2 conditions (BC1,BC6)
length(intersect(VariableFeatures(con.list$BC1), VariableFeatures(con.list$BC6)))
# 1385

# select features repeatedly variable across data sets for integration
intfeats <- SelectIntegrationFeatures(object.list = con.list, nfeatures = 3000)

# integration - finally! Now combine data sets using anchors 
int.anchors <- FindIntegrationAnchors(object.list = con.list, anchor.features = intfeats)
so_donorint <- IntegrateData(anchorset = int.anchors)

# scaling data to ensure genes with higher variance don't dominate analysis following integration, each feature contributes equally
# set default assay to 'integrated' so downstream analysis uses integrated data (corrected values)
# instead of data before integration/ raw counts etc
DefaultAssay(so_donorint) <- "integrated"
so_donorint <- ScaleData(so_donorint, features = rownames(so_donorint))

# clear up after integration 
# splitted conditions, anchors used to integrate datasets, features selected for integration, original 'so' object before processing
rm(con.list, int.anchors, intfeats, so)

# create save point 
save.image("savepointB_QC-integration-complete.RData")
COPY
1) Load and preview of data 2) data QC 3_ data integration
labbook part 1-3

Save snippets that work from anywhere online with our extensions

Comments

More like this

Browse more snippets >>

labbook part 1-3

Save snippets that work from anywhere online with our extensions

Comments

More like this

Browse more snippets >>

Embed code snippet