# I am working on scRNAseq data derived from human bladder cancer 'Urothelial carcinoma'
# Urothelial carcinoma has 2 subtypes - muscle invasive (MIBC), non-muscle invasive (NMIBC)
# NMIBC has recurrence rate 70%-80% if untreated and approx 50% progress to MIBC (Aya T Shalata et al. 2022), where MIBC has a 50-70% mortality rate! (Jong Chul Park et al. 2014)
# I will look at urothelial heterogeneity within non-malignant NMIBC papilloma as intratumoural heterogeneity is a major driver to drug resistance (NA Saunders et al. 2012)
# Data comes from 2 individuals. (M 67yrs, M 58yrs)
# sample BC1 = low grade Ta non-malignant NMIBC cells from (M, 67yrs)
# sample BC6 = low grade Ta non-malignant NMIBC cells from (M, 58yrs)
# differences between the two cell sets could be, but not limited to person differences (age, genetic mutations, lifestyle)
# Firstly, I will access the Genomics shared libraries
.libPaths("libs/R_4.3.3")
# load relevant libraries
library(tidyverse)
citation("tidyverse")
library(Seurat)
citation("Seurat")
library(clustree)
citation("clustree")
library(ggplot2)
citation("ggplot2")
library(ggrepel)
citation("ggrepel")
library(fgsea)
citation("fgsea")
# 1 Load, preview and explore data ####
counts <- Read10X("data5/02_assessment_options/assessment02_BC1-BC6")
# check dimensions of data (i.e number of genes - rows, cells - columns)
dim(counts)
# number of rows = 38606, columns = 16155
# quick preview of data. Structure, type, first few values
str(counts)
# setting IDs and conditions (here = cancer T stages) for counts
ids <- c("BC1", "BC6")
con <- c("BC1", "BC6")
# create a table 'cell_counts' based on column names from 'counts'
# create metadata df, and annotate my data with labels
cell_counts <- table(substring(colnames(counts),18))
# construct two vectors 'sample', 'conditions' from 'ids', 'con' based on 'cell_counts'
# set up sample and condition labels for the backgrounds
samples <- c()
for (s in 1:length(ids)) {
samples <- c(samples, (rep(paste(ids)[s], cell_counts[s])))
}
conditions <- c()
for (c in 1:length(con)) {
conditions <- c(conditions, (rep(paste(con)[c], cell_counts[c])))
}
# now create a metadata df
metadata <- data.frame(Sample=samples, Conditions=conditions, row.names=colnames(counts))
# then check metadata df cell numbers match expected
metadata |> group_by(Sample) |> summarise(Cells=n())
metadata |> group_by(Conditions) |> summarise(Cells=n())
# create seurat object 'so' from data - R package used for scRNAseq analysis that stores, organise scRNAseq data, metadata, results
# rows = genes, columns = cells
# 'meta.data' specifies a metadata df which has additional info of each cell (ie cell types)
so <- CreateSeuratObject(counts=counts, project="Genomics_W5assessment", meta.data=metadata)
# assign cell identities based on 'conditions' column from metadata
# for further analysis to consider cells belonging to different conditions
# i.e later DEA, clustering etc can be done based on 'conditions'
Idents(so) <- so@meta.data$Conditions
# check seurat version
packageVersion("Seurat")
# remove unused variables to tidy up memory
rm(c, s, cell_counts, con, conditions, ids, samples, counts, metadata)
# ################################### #
# 2 Data QC check ####
# removing low quality data from data set
# high Mt% indicates dying cells, often correlated with low number of features
# high number of features suggests 2 cells are present in sample instead of single cell
# calc %MT gene expression
# add a new metadata column 'percent.mt' of %MT for each cell
so <- PercentageFeatureSet(so, pattern="^MT-", col.name="percent.mt")
# Visualize distribution of MT gene percentages
VlnPlot(so, features = "percent.mt")
# violin plot to visualize and compare distribution of 2 selected features - number of features, MT%
# across different cell groups (i.e sample or condition)
# assists QC by easing detection of any outliers in my data (low quality/stressed/dying cells)
VlnPlot(so, features=c("nFeature_RNA","percent.mt"),
ncol=2, pt.size=0, group.by="Sample", raster=FALSE)
# distribution pattern is different for BC1 BC6 in nFeature_RNA
# BC1 data is too evenly distributed, no median possible to be observed
# In contrast BC6 has nice violin shaped distribution making it relatively easier to decide cut off points to remove low quality data
# so I decide data cut off threshold based on BC6
# save plot
ggsave("plots5/1_violinplot_preQC.png")
# Summarize key stats based on distribution above, related to %MT and number of features
# Create a summary table
summary_metadata <- so@meta.data |>
group_by(Sample) |>
summarize(
mt_med = quantile(percent.mt, 0.5),
mt_q95 = quantile(percent.mt, 0.95),
mt_q75_plus_IQR = quantile(percent.mt, 0.75) + (1.5 * IQR(percent.mt)),
feat_med = median(nFeature_RNA),
feat_q95 = quantile(nFeature_RNA, 0.95),
feat_q75_plus_IQR = quantile(nFeature_RNA, 0.75) + (1.5 * IQR(nFeature_RNA)),
count = n()
)
# print the table
summary_metadata
# scatter plot number of features VS %MT for each cell
# assists my QC by identifying cells with high MT%, low features (indicative of dying cells/ empty droplets)
ggplot(so@meta.data, aes(nFeature_RNA, percent.mt, colour=Sample)) +
geom_point(alpha=0.2) +
facet_grid(.~Sample) +
theme_bw()
ggsave("plots5/2_nfeatures_vs_%MT_preQC.png")
# now remove low feature/high MT% cells using UQ + (1.5*IQR) rule
# find the upper threshold limit for MT% in both samples, then take the lowest value
mt_filt <- min(summary_metadata$mt_q75_plus_IQR)
# find upper threshold limit for feature number in both, then take the lowest value
feat_max_filt <- min(summary_metadata$feat_q75_plus_IQR)
# print proposed thresholds
mt_filt # 6.370424
feat_max_filt # 5279.75
# These are proposed threshold, but I need to take closer look at data first to decide cut off points
# Taking a closer look at low quality data first - I will use 2 graphs, one for each feature
# this is in preparation to filter data after visualization
# 1 - zoomed-in version scatter plot focusing on cells < 10% MT content (clear up feature 1)
# I can exclude cells that are potentially stressed/ dying more clearly since zoomed in view
ggplot(subset(so@meta.data, subset = percent.mt < 10), aes(nFeature_RNA, percent.mt, colour=Sample)) +
geom_point(alpha=0.2) +
facet_grid(.~Sample) +
theme_bw() +
xlim(0,8000) + # set limits to x axis to zoom in when viewing graph to decide cut off point
ggsave("plots5/3_%MT_zoomedlowqual_QC.png")
# 2 - density plot for number of features detected per cell (clear up feature 2)
# across different samples in the seurat object 'so' I created earlier
# from plot I can identify potential problems like under-sampling or a variability in number of genes detected
ggplot(so@meta.data, aes(x=nFeature_RNA, colour=Sample)) +
geom_density() +
facet_grid(.~Sample) +
theme_bw() +
xlim(0,3500) # set limits for x axis to zoom in when viewing graph to decide cut off point, the 'dip' in graph
# graph 'dip' is approx at 1800 on x axis
ggsave("plots5/4_nfeature_zoomedlowqual_QC.png")
# Now I can begin data filtering process! - filter based on 3 conditions
# 1 - cells < 1800 features (genes) detected
# 2 - cells with more than number of features defined by 'feat_max_filt'
# 3 - % MT gene expression. Threshold 6.370424 prev defined by 'mt_filt'. Again, filtering out cells with high MT content as indicative of low quality cells (i.e dying)
so <- subset(so, subset = nFeature_RNA >= 1800 & nFeature_RNA <= feat_max_filt & percent.mt <= mt_filt)
# after filtering I have new data set - post QC
# so generate new summary data for filtered data set
so@meta.data |> group_by(Sample) |> summarize(mt_med = quantile(percent.mt, 0.5),
mt_q95 = quantile(percent.mt, 0.95),
mt_q75_plus_IQR = quantile(percent.mt, 0.75) + (1.5 * IQR(percent.mt)),
feat_med = median(nFeature_RNA),
feat_q95 = quantile(nFeature_RNA, 0.95),
feat_q75_plus_IQR = quantile(nFeature_RNA, 0.75) + (1.5 * IQR(nFeature_RNA)),
count = n())
# now visualize new summary data post QC - violin plot like previous
# violin plot to visualize and compare distribution of 2 selected features - number of features, MT%
# across different cell groups (i.e sample or condition)
VlnPlot(so, features=c("nFeature_RNA", "percent.mt"),
ncol=2, pt.size=0, group.by="Sample", raster=FALSE)
# Recap - I have chosen thresholds based on BC6 as it had better initial distribution to decide data cut off points
# filtered data only shows cells of good quality facilitating my downstream analysis
ggsave("plots5/5_violinplot_postQC.png")
# some points to consider about the QC process is that :
# same filters are applied to both data sets BC1 BC6, which would surely have differences (i.e starting distributions, sequencing depth across data sets)
# a threshold that works well for one data set may i.e discard good quality cells in another
# but can only try our best when deciding cut-off point - i.e zooming into graph. It is better than including bad quality data!
# create save point
save.image("savepointA_QC-complete.RData")
# ############################################## #
# 3 Data Integration ####
# merge data sets after filtering each data sets individually. Need to ensure filtering doesn't bias data
# potential problem - imbalanced data set (i.e one was aggressively filtered = has far fewer cells than other, could cause bias downstream)
# use stats to account for these differences (i.e normalization, batch correction, merging strategies)
# here I use normalization
# Merging data is crucial for comparison
# different cell types from one person can be more similar than same cell type from 2 people
# this means same cell type from different donors would be separated during analysis/ clustering - not useful
# integration needed to allow comparison
# split objects by conditions defined in section 1
con.list <- SplitObject(so, split.by = "Conditions")
# normalize and identify variable features for each data set independently
# also ensure the modified object is returned so con.list is updated
# top 3000 most variable genes selected for downstream analysis
con.list <- lapply(X = con.list, FUN = function(x) {
x <- NormalizeData(x, normalization.method = "LogNormalize")
x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 3000)
return(x)
})
# Scale the data and perform PCA for each condition
con.list <- lapply(X = con.list, FUN = function(x) {
x <- ScaleData(x)
x <- RunPCA(x)
return(x)
})
# after normalization and variable feature selection - standard thing to do is integrate the 2 conditions based on shared variable features
# then can proceed with other analyses such as scaling, PCA, clustering, or differential expression
# I prev selected top 3000 most variable genes to look at (nfeature)
# when shared variable features close to 3000 = likely lose biological difference
# but some similarities/ shared variables (to certain extent) needed to merge 2 data sets = ~600-2400, closer halfway the better
# check overlap between most variable features in 2 conditions (BC1,BC6)
length(intersect(VariableFeatures(con.list$BC1), VariableFeatures(con.list$BC6)))
# 1385
# select features repeatedly variable across data sets for integration
intfeats <- SelectIntegrationFeatures(object.list = con.list, nfeatures = 3000)
# integration - finally! Now combine data sets using anchors
int.anchors <- FindIntegrationAnchors(object.list = con.list, anchor.features = intfeats)
so_donorint <- IntegrateData(anchorset = int.anchors)
# scaling data to ensure genes with higher variance don't dominate analysis following integration, each feature contributes equally
# set default assay to 'integrated' so downstream analysis uses integrated data (corrected values)
# instead of data before integration/ raw counts etc
DefaultAssay(so_donorint) <- "integrated"
so_donorint <- ScaleData(so_donorint, features = rownames(so_donorint))
# clear up after integration
# splitted conditions, anchors used to integrate datasets, features selected for integration, original 'so' object before processing
rm(con.list, int.anchors, intfeats, so)
# create save point
save.image("savepointB_QC-integration-complete.RData")