# I am working on scRNAseq data derived from human bladder cancer 'Urothelial carcinoma'
# Urothelial carcinoma has 2 subtypes - muscle invasive (MIBC), non-muscle invasive (NMIBC)
# NMIBC has high likelihood of recurrence 50-70% and progression to MIBC (cite 1), where MIBC has a concerning 50-70% mortality rate! (cite 2)
# Hence progression is a useful point of target, and I will look at urothelial heterogeneity within non-malignant NMIBC papilloma
# to see what could possibly be contributing to the process, as well as what makes it non-malignant?
# Data comes from 2 individuals. (M 67yrs, M 58yrs)
# sample BC1 = low grade Ta non-malignant NMIBC cells from (M, 67yrs)
# sample BC6 = low grade Ta non-malignant NMIBC cells from (M, 58yrs)
# differences between the two cell sets should be due to person differences (let's see if it's true!)
# Firstly, I will access the Genomics shared libraries
.libPaths("libs/R_4.3.3")
# load relevant libraries
library(tidyverse)
library(Seurat)
library(clustree)
library(ggplot2)
library(ggrepel)
library(fgsea)
# Questions to ask ####
# REFERENCES????
# 2 - filters cells with more than number of features defined by 'feat_max_filt' defined in line 130
# find out the feat_max threshold and then put in numerical values
# end of section 2 data QC - does 2 data sets refer to pre QC and post QC or the 2 features (nfeature, MT%)
# 1 Load, preview and explore data ####
counts <- Read10X("data5/02_assessment_options/assessment02_BC1-BC6")
# check dimensions of data (i.e number of genes - rows, cells - columns)
dim(counts)
# number of rows = 38606, columns = 16155
# quick preview of data. Structure, type, first few values
str(counts)
# setting IDs and conditions (here = cancer T stages) for counts
# reminder that all data originates from one person (80 yrs, M)
# sample BC4 = high grade T2 malignant MIBC cells
# sample BCN = paracancerous normal urothelial cells (hence T0 since not defined by T stages)
ids <- c("BC1", "BC6")
con <- c("Ta1", "Ta6")
# create a table 'cell_counts' based on column names from 'counts'
# create metadata df, and annotate my data with labels
cell_counts <- table(substring(colnames(counts),18))
# construct two vectors 'sample', 'conditions' from 'ids', 'con' based on 'cell_counts'
# set up sample and condition labels for the backgrounds
samples <- c()
for (s in 1:length(ids)) {
samples <- c(samples, (rep(paste(ids)[s], cell_counts[s])))
}
conditions <- c()
for (c in 1:length(con)) {
conditions <- c(conditions, (rep(paste(con)[c], cell_counts[c])))
}
# now I will create a metadata df
metadata <- data.frame(Sample=samples, Conditions=conditions, row.names=colnames(counts))
# then check metadata df cell numbers match expected
metadata |> group_by(Sample) |> summarise(Cells=n())
metadata |> group_by(Conditions) |> summarise(Cells=n())
# create seurat object 'so' from data
# an R package used for scRNAseq analysis that stores + organise scRNAseq data, metadata, results
# rows = genes, columns = cells
# 'meta.data' specifies a metadata df which has additional info of each cell (ie cell types)
so <- CreateSeuratObject(counts=counts, project="Genomics_W5assessment", meta.data=metadata)
# assign cell identities based on 'conditions' column from metadata
# this allows further analysis to consider cells belonging to different conditions
# i.e DEA, clustering etc can be done based on 'conditions'
Idents(so) <- so@meta.data$Conditions
# remove unused variables to tidy up memory
rm(c, s, cell_counts, con, conditions, ids, samples, counts, metadata)
# ################################### #
# 2 Data QC check ####
# removing low quality data from dataset
# Main areas to lookout in scRNAseq - contamination of data
# high Mt% indicates dying cells, often correlated with low number of features
# high number of features suggests 2 cells are present in sample instead of single cell
# calc %MT gene expression
# adds a new metadata column 'percent.mt' of %MT for each cell
so <- PercentageFeatureSet(so, pattern="^MT-", col.name="percent.mt")
# Visualize the distribution of mitochondrial gene percentages
VlnPlot(so, features = "percent.mt")
# violin plot to visualize and compare distribution of 2 selected features - number of features, MT%
# across different cell groups (i.e sample or condition)
# assists QC by easing detection of any outliers in my data (low quality/stressed/dying cells)
VlnPlot(so, features=c("nFeature_RNA","percent.mt"),
ncol=2, pt.size=0, group.by="Sample", raster=FALSE)
# distribution pattern is different for BC1 BC6 in nFeature_RNA
# BC1 data is too evenly distributed, no median possible to be observed
# In contrast BC6 has nice violin shaped distribution making it relatively easier to decide cut off points to remove low quality data
# so I decide data cut off threshold based on BC6
# save plot
ggsave("plots5/1_violinplot_preQC.png")
# Summarize key stats based on distribution above, related to %MT and number of features
# Create a summary table
summary_metadata <- so@meta.data |>
group_by(Sample) |>
summarize(
mt_med = quantile(percent.mt, 0.5),
mt_q95 = quantile(percent.mt, 0.95),
mt_q75_plus_IQR = quantile(percent.mt, 0.75) + (1.5 * IQR(percent.mt)),
feat_med = median(nFeature_RNA),
feat_q95 = quantile(nFeature_RNA, 0.95),
feat_q75_plus_IQR = quantile(nFeature_RNA, 0.75) + (1.5 * IQR(nFeature_RNA)),
count = n()
)
# print the table
summary_metadata
# scatter plot number of features VS %MT for each cell
# assists my QC by identifying cells with high MT%, low features (indicative of dying cells/ empty droplets)
# also can identify cell clusters - useful for downstream analysis
ggplot(so@meta.data, aes(nFeature_RNA, percent.mt, colour=Sample)) +
geom_point(alpha=0.2) +
facet_grid(.~Sample) +
theme_bw()
ggsave("plots5/2_nfeatures_vs_%MT_preQC.png")
# removing low feature/high MT% cells using UQ + (1.5*IQR) rule
# find the upper threshold limit for MT% in both samples, then take the lowest value
mt_filt <- min(summary_metadata$mt_q75_plus_IQR)
# find upper threshold limit for feature number in both, then take the lowest value
feat_max_filt <- min(summary_metadata$feat_q75_plus_IQR)
# print proposed thresholds
mt_filt # 6.370424
feat_max_filt # 5279.75
# These are proposed threshold, but I need to take closer look at data first to decide cut off points
# Taking a closer look at low quality data first - I will use 2 graphs, one for each feature
# this is in preparation to filter data after visualization
# 1 - zoomed-in version scatter plot focusing on cells < 10% MT content (clear up feature 1)
# I can exclude cells that are potentially stressed/ dying more clearly since zoomed in view
ggplot(subset(so@meta.data, subset = percent.mt < 10), aes(nFeature_RNA, percent.mt, colour=Sample)) +
geom_point(alpha=0.2) +
facet_grid(.~Sample) +
theme_bw() +
xlim(0,8000) + # set limits to x axis to zoom in when viewing graph to decide cut off point
ggsave("plots5/3_%MT_zoomedlowqual_QC.png")
# 2 - density plot for number of features detected per cell (clear up feature 2)
# across different samples in the seurat object 'so' I created earlier
# from plot I can identify potential problems like under-sampling or a variability in number of genes detected
ggplot(so@meta.data, aes(x=nFeature_RNA, colour=Sample)) +
geom_density() +
facet_grid(.~Sample) +
theme_bw() +
xlim(0,3500) # set limits for x axis to zoom in when viewing graph to decide cut off point, the 'dip' in graph
# graph 'dip' is approx at 1800 on x axis
ggsave("plots5/4_nfeature_zoomedlowqual_QC.png")
# Now I can begin data filtering process! - filter based on 3 conditions
# 1 - cells < 1800 features (genes) detected
# 2 - cells with more than number of features defined by 'feat_max_filt' defined in line 130 - an upper limit
# 3 - % MT gene expression. Threshold 6.370424 prev defined by 'mt_filt'. Again, filtering out cells with high MT content as indicative of low quality cells (i.e dying)
so <- subset(so, subset = nFeature_RNA >= 1800 & nFeature_RNA <= feat_max_filt & percent.mt <= mt_filt)
# after filtering I have new data set - post QC
# so generate new summary data for filtered data set
so@meta.data |> group_by(Sample) |> summarize(mt_med = quantile(percent.mt, 0.5),
mt_q95 = quantile(percent.mt, 0.95),
mt_q75_plus_IQR = quantile(percent.mt, 0.75) + (1.5 * IQR(percent.mt)),
feat_med = median(nFeature_RNA),
feat_q95 = quantile(nFeature_RNA, 0.95),
feat_q75_plus_IQR = quantile(nFeature_RNA, 0.75) + (1.5 * IQR(nFeature_RNA)),
count = n())
# now visualize new summary data post QC - violin plot like previous
# violin plot to visualize and compare distribution of 2 selected features - number of features, MT%
# across different cell groups (i.e sample or condition)
VlnPlot(so, features=c("nFeature_RNA", "percent.mt"),
ncol=2, pt.size=0, group.by="Sample", raster=FALSE)
# Recap - I have chosen thresholds based on BC6 as it had better initial distribution to decide data cut off points
# filtered data only shows cells of good quality facilitating my downstream analysis
ggsave("plots5/5_violinplot_postQC.png")
# some points to consider about the QC process is that :
# same filters are applied to both data sets BC1 BC6, which would surely have differences (i.e starting distributions, sequencing depth across data sets)
# a threshold that works well for one data set may i.e discard good quality cells in another
# but can only try our best when deciding cut-off point - i.e zooming into graph. It is better than including bad quality data!
# create save point
save.image("savepointA_QC-complete.RData")
# ############################################## #
# 3 Data Integration ####
# merge data sets after filtering each data sets individually. Need to ensure filtering doesn't bias data
# potential problem - imbalanced data set (i.e one was aggressively filtered = has far fewer cells than other, could cause bias downstream)
# can use stats to account for these differences (i.e normalization, batch correction, merging strategies)
# here I use normalization
# Merging data is crucial for comparison
# different cell types from one person can be more similar than same cell type from 2 people
# this means same cell type from different donors would be separated during analysis/ clustering - not useful, hence integration needed
# split objects by conditions defined in section 1 (Ta1, Ta6)
con.list <- SplitObject(so, split.by = "Conditions")
# normalize and identify variable features for each data set independently
# also ensure the modified object is returned so con.list is updated
# top 3000 most variable genes selected for downstream analysis
con.list <- lapply(X = con.list, FUN = function(x) {
x <- NormalizeData(x, normalization.method = "LogNormalize")
x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 3000)
return(x)
})
# Scale the data and perform PCA for each condition
con.list <- lapply(X = con.list, FUN = function(x) {
x <- ScaleData(x)
x <- RunPCA(x)
return(x)
})
# (extra from workshop)####
# after normalization and variable feature selection, can proceed with other analyses such as scaling, PCA, clustering, or differential expression
# after normalization and variable feature selection - standard thing to do is integrate the 2 conditions based on shared variable features
# I prev selected top 3000 most variable genes to look at (nfeature)
# when shared variable features close to 3000 = likely lose biological difference
# but some similarities/ shared variables (to certain extent) needed to merge 2 data sets = ~600-2400, closer halfway the better
# check overlap between most variable features in 2 conditions (Ta1,Ta6)
length(intersect(VariableFeatures(con.list$Ta1), VariableFeatures(con.list$Ta6)))
# select features repeatedly variable across data sets for integration
intfeats <- SelectIntegrationFeatures(object.list = con.list, nfeatures = 3000)
# integration - finally! Now combine data sets using anchors
int.anchors <- FindIntegrationAnchors(object.list = con.list, anchor.features = intfeats)
so_donorint <- IntegrateData(anchorset = int.anchors)
# scaling data to ensure genes with higher variance don't dominate analysis following integration, each feature contributes equally
# set default assay to 'integrated' so downstream analysis uses integrated data (corrected values)
# instead of data before integration/ raw counts etc
DefaultAssay(so_donorint) <- "integrated"
so_donorint <- ScaleData(so_donorint, features = rownames(so_donorint))
# clear up objects no longer needed after integration
# splitted conditions, anchors used to integrate datasets, features selected for integration, original 'so' object before processing
rm(con.list, int.anchors, intfeats, so)
# create save point
save.image("savepointB_QC-integration-complete.RData")
# #################################### #
# 4 Find clusters of cell types ####
# Data quality is sorted following QC (section 1) and integration (section 3)
# data is now a single analysis object!
# can now look at biology - try identify communities of cells alike
# genes can work in pathways so not equally informative
# use PCA to reduce dimension and identify patterns of shared variance then derive (potential) communities
# also work out how many clusters are (potentially!) informative
# run PCA on integrated data set
# recap PCA reduces dimensions in data, only capture max variance in data so easy to visualize
# in context of scRNAseq, I'm using PCA to get main sources of variation in gene expression
so_donorint <- RunPCA(so_donorint)
# top 5 PCs shown, biggest contributor in each direction are :
# COSMIC to see if these genes contain mutations that is implicated in cancer?
# PC1 positive = HLA-DRB6 - antigen processing + presenting, immune response
# PC1 negative = ADIRF - positive regulation of fat cell differentiation
# PC2 positive = TXNIP - tumour suppressor gene (uniprot)
# PC2 negative = MT-CO1 -
# PC3 positive = DHRS2
# PC3 negative = CD3D
# PC4 positive = SGK1
# PC4 negative = DHRS2
# PC5 positive = MT-CO3
# PC5 negative = HSPA1A
# pick the most informative PC
# elbow plot to show me percentage variance explained by each PC
ElbowPlot(so_donorint)
# As number of PC increases, the variance explained by each subsequent PC decrease more gradually
# The "elbow" is the point where the curve flattens out - stop considering additional PC as they contribute little to explaining data's variance.
# after running PCA to assess how much variance in each PC
# calc % variance explained by each PC and cumulative % variance across all PCs
# helps me determine number of PCs to keep for downstream analysis
pct <- so_donorint[["pca"]]@stdev / sum(so_donorint[["pca"]]@stdev) * 100
cumu <- cumsum(pct)
# identify point/PC where cumulative variance > 90%, % variance explained < 5% (does not contribute much to variance)
co1 <- which(cumu > 90 & pct < 5)[1]
co1 # 43
# identify point/PC where difference in % variance explained (contribution to variance) between 2 consecutive PCs is > 0.1%
# I can find the point of significant drop in contribution of variance explained by each PC
co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1]
co2 # 12
# select minimum value between the 2 measures (co1, co2)
# provides criteria to decide when to stop including additional PCs
# co1 - point where a PC has cumulative variance > 90%, variance explained < 5%
# co2 - point where there's significant drop in variance explained between 2 consecutive PCs
pcs <- min(co1, co2)
pcs # 12
# create data frame containing 3 variables
# pct - % variance contributed by each PC
# cumu - cumulative % variance contributed by PCs
# rank - ranking of each PC showing its order in analysis
cumu_df <- data.frame(pct = pct, cumu = cumu, rank = 1:length(pct))
# Elbow plot to visualize relation between cumulative variance explained, % variance contribution by each PC
ggplot(cumu_df, aes(cumu, pct, label = rank, color = rank > pcs)) +
geom_text() +
theme_bw()
# clean up - remove objects unnecessary for downstream analysis
rm(pct, cumu, cumu_df, co1, co2)
# PCA-based scatter plot to visualize 4 selected features/genes - check if PCA successful
# cells expressing these markers are shaded from dark blue (most expression) to pale pink (least/none)
FeaturePlot(so_donorint, reduction = "pca", dims = c(1, 2),
features = c("HLA-DRB6", "ADIRF", "TXNIP", "MT-CO1"), cols = c("#fcd5ce", "#4886af"),
pt.size = 2.5, alpha = 0.7, order = TRUE,)
ggsave("plots5/6_featureplot.png")
# data is spread across first 2 main PCs only
# gene 'MT-CO1' shown to have greatest expression across PC1 PC2, closely followed by gene 'ADIRF'
# gene 'HLA-DRB6' have least expression across PC1 PC2
# add comment on - Are the genes are typically markers of different cell types present in tumors
# Neighbor analysis
# to find relationships between cells (calc neighborhoods) based on first PC (pcs)
DefaultAssay(so_donorint) <- "integrated"
so_donorint <- FindNeighbors(so_donorint, dims = 1:pcs)
# problems with neighbour analysis
# Neighbour analysis finds potential communities but does not give reference of how many communities should be present
# community detection starts with all cells in 1 community - followed by successive subdivisions
# using cluster tree = can visualize how communities derived + subdivided
# so, perform clustering analysis across multiple resolution values
# then visualize resulting clusters by cluster tree plot
so_donorint_orgs <- FindClusters(so_donorint, resolution = seq(0, 0.3, 0.05))
clustree(so_donorint_orgs, show_axis = TRUE) + theme(legend.position = "none") + ylab("Resolution")
ggsave("plots5/7_clustertree_lineage.png")
# so above cluster tree plot shows successive community divisions as resolution increases
# an issue is - most numerous type often results in most subgroups. Purely due to its high abundance and not biologically different
# I will therefore start my analysis at low resolution to monitor closely the main community lineages
# clear up unnecessary objects for downstream analysis
rm(so_donorint_orgs)
# set initial resolution to 0.05 for clustering cells
# based on cluster tree plot I should see 3 communities
res <- 0.05
# now assign cells to a cluster and run UMAP - reduce dimensions so I can visualize
# UMAP allows me to visualize clusters - based on first PCA
# 2D representation of cells to visualize similarities/differences based on high dimensional features (i.e gene expression) - based on first PCA
so_donorint <- FindClusters(so_donorint, resolution = res)
so_donorint <- RunUMAP(so_donorint, dims=1:pcs, return.model = TRUE, verbose = FALSE)
# UMAP plot - Visualization of my clustered data using UMAP plot
DimPlot(so_donorint, reduction = "umap", label = TRUE, label.size = 6)
# based on cluster tree plot I should see 3 communities
# however population 1 (green) is shown to be in 2 separate groups suggesting further subdivision
# I will test with higher resolution later in this analysis to see subdivisions for different cell types
# save plot first
ggsave("plots5/8_UMAP_res0.05.png")
# split UMAP plot - based on 'conditions' + add labels
# shows how clusters are distributed across conditions Ta1 Ta6
DimPlot(so_donorint, reduction = "umap", split.by = "Conditions", label = TRUE, label.size = 6)
ggsave("plots5/9_splitUMAP_res0.05.png")
# since Ta1 Ta6 are identical - same grade, same non-malignant NMIBC tumour pattern should be the same
# any differences technically should be due to person differences
# distribution is roughly the same, but xxxxxxxxxxxxxx
# % distribution of each cell cluster across different conditions
round(prop.table(table(Idents(so_donorint), so_donorint$Conditions))*100,3)
# add comments of graph pls - how organization of cell populations are in relation to exp conditions
# which clusters are more abundant in certain conditions
# are any clusters enriched n one condition over another
# how evenly or unevenly distributed are the clusters across the different conditions
# ################################## #
# 5 Annotate clusters ####
# set default assay to 'RNA' and normalize gene expression data in 'RNA'
DefaultAssay(so_donorint) <- "RNA"
so_donorint_dp_norm <- NormalizeData(so_donorint, verbose = TRUE)
# now define list of genes for biological markers - likely communities
# can find genes that are related to cancer and see if its present via dotplot
features <- c("TP53", "PTEN", # tumour suppressing - but if mutated = MIBC characteristic markers
"FGFR3", # low grade NIBC tumour development markers
"UPK2", "KRT13", "EPCAM", "KRT18", # urothelial markers
"COL1A1", "CALD1", # muscle markers
"PECAM1", # endothelial
"CD2", "CD3D", "CD3E", # T cells
"C1QC", "CD14", "CSF1R") # macrophages/myeloid
# DotPlot to visualize expression of list of genes across different clusters of cells
# look at intensity/ proportion of cells expressing each marker in each community
# list of genes were defined previously
DotPlot(so_donorint_dp_norm, features = features, dot.scale=8) +
theme(axis.text.x = element_text(size = 11)) +
RotatedAxis ()
ggsave("plots5/Dotplot.png")
# again comment on the plot
# what are the intensity/ proportion of cells expresker in each community
# FeaturePlot to visualize expression of 6 specific genes across UMAP-reduced data set
# check how specific the markers are
FeaturePlot(so_donorint_dp_norm, reduction = "umap",
pt.size = 2, alpha = 0.7, order = TRUE,
features = c("UPK2", "EPCAM", "TP53", "PECAM1", "CD2", "C1QC", "PTEN"))
# again comment on plot
# cells more similar = placed closer together
# colour of cells corresponds to expression level of specified genes. Higher expression = warmer colours (red/yellow). lower = blue/purple etc
# by specify multiple features = generate separate plot for each gene where colour intensity refelct expression of that gene across all cells
# can use plot to - identify cell types - which clusters of cells express markers I've specified
# ie. UPK2 might be strongly expressed in one cluster (urothelial cells), while PECAM1 might be more expressed in endothelial cells
# put into your own context!
# can use plot to - assess whether gene-co-expression/ distinct expression across different cell clusters
# what communities contain what ?
# switch back to integrated assay for comparisons (not for plotting)
DefaultAssay(so_donorint) <- "integrated"
# run DEA to identify differentially expressed genes between the 2 sets of cells
uro.markers <- FindMarkers(so_donorint, ident.1 = 0, ident.2 = 1)
# check data frame
head(uro.markers)
# avoid errors caused by small p-values downstream by setting p-value limits
# all adjusted p-value > or = to (1e-300)
uro.markers["p_val_adj"] <- lapply(uro.markers["p_val_adj"], pmax, 1e-300)
# preparation for volcano plot
# categorize genes (in uro.markers) based on their DEA results
# create new column 'DEA' to classify each gene as up, down, no based on log fold-change and adjusted p-value
uro.markers$DEA <- "NO"
uro.markers$DEA[uro.markers$avg_log2FC > 1 & uro.markers$p_val_adj < 0.05] <- "UP"
uro.markers$DEA[uro.markers$avg_log2FC < -1 & uro.markers$p_val_adj < 0.05] <- "DOWN"
uro.markers$genes <- rownames(uro.markers)
# Volcano plot to visualize DEA results for comparison between the conditions (T1, T3)
ggplot(uro.markers, aes(x=avg_log2FC, y=-log10(p_val_adj))) +
geom_point(aes(colour = DEA), show.legend = FALSE) +
scale_colour_manual(values = c("blue", "gray", "red")) +
geom_hline(yintercept = -log10(0.05), linetype = "dotted") +
geom_vline(xintercept = c(-1,1), linetype = "dotted") +
theme_bw() +
theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) +
geom_text_repel(data=subset(uro.markers, (abs(avg_log2FC) > 4 & p_val_adj < 0.05)), aes(x=avg_log2FC, y=-log10(p_val_adj), label=genes), max.overlaps = 1000, size=4.5)
# comment on volcano plot
# Cell cycle analysis - cell cycle scoring
# to assign cell cycle phase (G1, S, G2M) based on expression of predefined sets of genes
so_donorint.cc <- CellCycleScoring(so_donorint, s.features = cc.genes$s.genes,
g2m.features = cc.genes$g2m.genes, set.ident = TRUE)
# check output to see newly assigned cell cycle phase for each cell
head(so_donorint.cc)
# visualize data in UMAP, split by cell cycle phase
DimPlot(so_donorint.cc, reduction="umap", split.by="Phase", label=TRUE, label.size=10)
# comment on the plots
# Validate the above interpretations with GSEA
# add a new column 'pi' to the 'uro.markers' data frame
# pi is indicative of fold change and significance of each gene
uro.markers <- mutate(uro.markers, pi = (avg_log2FC * (-1 * log10(p_val_adj))))
# set data frame to include only 2 column genes, pi
prerank <- uro.markers[c("genes", "pi")]
prerank <- setNames(prerank$pi, prerank$genes)
# check structure
str(prerank)
# prepare for GSEA - load gene set from GMT file
genesets = gmtPathways("data/h.all.v2024.1.Hs.symbols.gmt")
# need to copy from workshop 5 data file
# check output
str(genesets)
# run GSEA on 'prerank'. Results stored in fgseaRes
# stats method to determine whether a set of genes is statistically enriched in a ranked gene list (from DEA)
fgseaRes <- fgsea(pathways = genesets, stats = prerank, minSize=15, maxSize=500)
# check output
head(fgseaRes)
# extract top 10 significant pathways from GSEA results based on smallest p-value
top10_fgseaRes <- head(fgseaRes[order(pval), ], 10)
top10_fgseaRes
# visualize enrichment of gene set in ranked gene list 'prerank'
plotEnrichment(genesets[["HALLMARK_G2M_CHECKPOINT"]], prerank) + labs(title="G2M checkpoint", subtitle="Gene set enrichment analysis", x="Ranked Genes", y="Enrichment Score")
# comment on the plot
# plot should help me understand where in the ranked gene list the genes associated with the G2/M checkpoint pathway are concentrated
# possibly relate to any biological insights related to cell cycle
# Now I've identified cell communities with marker genes, differential expression, cell cycle analysis, GSEA
# add names to UMAP - some communities can do further subdivision (highlighted)
# zoom in by changing resolution to see lineages
# add RES ####
# so prev xxx on UMAP shows community (X) in X colour is separately into 2 populations, indicative by its different positions
# this suggests further subdivision of community (X) could be seen at higher resolution
# Since I am looking at heterogeneity I want to identify different cell types present
# set resolution at 0.03 which is higher than previously used 0.05 for clustering cells
res <- 0.1
# now assign cells to a cluster and run UMAP - reduce dimensions so I can visualize
# UMAP allows visualizing clusters
# 2D representation of cells to visualize similarities/differences based on high dimensional features (i.e gene expression) - based on first PCA
so_donorint <- FindClusters(so_donorint, resolution = res)
so_donorint <- RunUMAP(so_donorint, dims=1:pcs, return.model = TRUE, verbose = FALSE)
# UMAP plot - Visualization of my clustered data using UMAP plot
DimPlot(so_donorint, reduction = "umap", label = TRUE, label.size = 6)
# comment on interpret the relationships between different clusters visually.
# It is useful for quickly assessing how well the clusters are separated and how distinct they are in the UMAP space.
# split UMAP plot - based on 'conditions' + add labels
# shows me how clusters are distributed across different experimental conditions
DimPlot(so_donorint, reduction = "umap", split.by = "Conditions", label = TRUE, label.size = 6)
# comment on split UMAP plot pls - the distributions and stuff
# % distribution of each cell cluster across different conditions
# allows me to understand how cell populations are organized in relation to experimental conditions
round(prop.table(table(Idents(so_donorint), so_donorint$Conditions))*100,3)
# add comments of graph pls - how organization of cell populations are in relation to exp conditions
# which clusters are more abundant in certain conditions
# are any clusters enriched n one condition over another
# how evenly or unevenly distributed are the clusters across the different conditions
# rename each cluster to make them biologically relevant
cluster_names <- c("urothelial_G1", "urothelial_G2M", "immune", "fibroblasts", "endothelial")
names(cluster_names) <- levels(so_donorint)
so_donorint <- RenameIdents(so_donorint, cluster_names)
# generate UMAP for 'so_donorint' to visualize
DimPlot(so_donorint, reduction = "umap", label = TRUE, label.size = 6) + NoLegend()
# comment on plot
# save point
save.image("savepointC_QC-integration-annotation-complete.RData")
# 6 Are Ta1, Ta6 cells different? ####
# In both my data sets, cells are of same grade and cancer T stage
# since I am looking at heterogeneity, all cells will be assessed as a whole
# differences should originate from person differences
# ensure assay is integrated
DefaultAssay(so_donorint) <- "integrated"
# combine cluster number and condition info into new variable 'celltype.condition'
# then set it as identity class of the cells
# consider - how cell clusters behave under different conditions
so_donorint$celltype.condition <- paste(so_donorint$seurat_clusters, so_donorint$Conditions, sep="_")
Idents(so_donorint) <- "celltype.condition"
# run DEA for 2 sets of cells Ta1, Ta6 - compare expression profiles
uroG1_T1vsT3 <- FindMarkers(so_donorint, ident.1 = "0_T3", ident.2 = "0_T1", test.use = "bimod")
# check output
head(uroG1_T1vsT3)
# prepare data needed for volcano plot, as previous
uroG1_T1vsT3["p_val_adj"] <- lapply(uroG1_T1vsT3["p_val_adj"], pmax, 1e-300)
uroG1_T1vsT3$DEA <- "NO"
uroG1_T1vsT3$DEA[uroG1_T1vsT3$avg_log2FC > 1 & uroG1_T1vsT3$p_val_adj < 0.05] <- "UP"
uroG1_T1vsT3$DEA[uroG1_T1vsT3$avg_log2FC < -1 & uroG1_T1vsT3$p_val_adj < 0.05] <- "DOWN"
uroG1_T1vsT3$genes <- rownames(uroG1_T1vsT3)
# Volcano plot to visualize DEA results to compare between Ta1, Ta6
ggplot(uroG1_T1vsT3, aes(x=avg_log2FC, y=-log10(p_val_adj))) +
geom_point(aes(colour = DEA), show.legend = FALSE) +
scale_colour_manual(values = c("blue", "gray", "red")) +
geom_hline(yintercept = -log10(0.05), linetype = "dotted") +
geom_vline(xintercept = c(-1,1), linetype = "dotted") +
theme_bw() +
theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) +
geom_text_repel(data=subset(uroG1_T1vsT3, (abs(avg_log2FC) > 8 & p_val_adj < 0.05)), aes(x=avg_log2FC, y=-log10(p_val_adj), label=genes), max.overlaps = 1000, size=4.5)
# comment on volcano plot and what it shows and why
# technically patterns should be the same as both same cell type, grade etc
# only differences should only be due to person differences
# create pi values
# add new column to data frame 'uroG1_T1vsT3' where each gene is assigned a pi score (calc below)
# pi score allows me to rank genes - based on measure of log2fold change + stats significance, can then use to identify top genes for further analysis
uroG1_T1vsT3 <- mutate(uroG1_T1vsT3, pi = (avg_log2FC * (-1 * log10(p_val_adj))))
# check data set - look at the top hit genes !!
head(uroG1_T1vsT3)
# select top 20 genes (for Ta6) based on pi score in descending order - fr highest pi scores
# these genes are most significant and differentially expressed when comparing between Ta1, Ta6 for G1 urothelial cluster
uroG1_T1vsT3 %>% arrange(desc(pi)) %>% slice(1:20)
# select top 20 genes (for Ta1) based on pi score in ascending order - fr low pi score
# these genes are either of smaller fold changes or less stats significant p-values
uroG1_T1vsT3 %>% arrange(pi) %>% slice(1:20)
# prep for GSEA
# extract 'genes', 'pi' from data frame and convert 'prerank' to numeric vector
prerank <- uroG1_T1vsT3[c("genes", "pi")]
prerank <- setNames(prerank$pi, prerank$genes)
str(prerank)
# run GSEA and store results in fgseaRes
genesets = gmtPathways("data/c2.cp.v2024.1.Hs.symbols.gmt")
fgseaRes <- fgsea(pathways = genesets, stats = prerank, minSize=15, maxSize=300, eps=0)
# filter results from GSEA stored in fgseaRes
# checking top hits for either side of volcano plot
fgseaRes %>% arrange(padj) %>% filter(NES<0) %>% slice(1:20)
fgseaRes %>% arrange(padj) %>% filter(NES>0) %>% slice(1:20)
# final save point!
save.image("savepointD_QC-integration-annotation-biocomp-complete.RData")
# Extra analysis (1,2,3,4) ####
# ######################################################### #
# Single-cell gene regulatory network inference
# how TF regulate gene expression within different cell types or states
# provides insight to mechanisms driving cellular heterogeneity
# GENIE3 - infers gene regulatory networks from single cell exp data
library(GENIE3)
grn <- GENIE3(expression_data)
# SCENIC - identifies regulons (set of genes co-reg by common TF) + assess activity
library(SCENIC)
scenic_result <- runSCENIC(expression_data)
# ######################################################### #
# Trajectory (pseudotime) analysis
# follow progression of cells through differentiation, study transitions between different cell states
# monocle - construct single cell trajectory + orders cells based on gene expression profiles
library(monocle3)
cds <- as.cell_data_set(so)
cds <- preprocess_cds(cds, num_dim = 50)
cds <- reduce_dimension(cds)
plot_cells(cds, color_cells_by = "pseudotime")
# slingshot - orders cells along a trajectory + visualizes developmental/ differentiation pathways
library(Slingshot)
slingshot_data <- slingshot(so, clusterLabels = "seurat_clusters", reducedDim = "UMAP")
plot(slingshot_data, type = "line")
######################################################## #
# Cell-cell interaction analysis using Cellchat
# study communication between cells based on expression profiles
# I use this to identify interactions between different cell type to understand cell behaviour in disease (cancer)
# identify the most abundant interaction as it may be crucial for determining tumour as 'non-malignant'
# can possibly induce similar interactions in malignant communities if identical cell types are also present
# Install Bioconductor
install.packages("BiocManager")
BiocManager::install()
# Install CellChat package
devtools::install_github("sqjin/CellChat")
# Load the required libraries
library(CellChat)
library(Seurat)
# prepare single cell data
# Load Seurat object containing single-cell RNA-seq data
so_donorint <- readRDS("path_to_your_seurat_object.rds")
# Ensure Seurat object has cell type information, stored in the "ident" column
so_donorint$celltype <- Idents(so_donorint)
# preprocess data for CellChat - extract gene data, identify cell types
# Extract the expression matrix from the Seurat object
expression_data <- as.matrix(so_donorint@assays$RNA@counts)
# Identify cell types/conditions (you should have this in your Seurat object metadata)
cell_types <- so_donorint$celltype
# create CellChat object after processing data - stores necessary info for interaction analysis
cellchat <- createCellChat(object = expression_data, meta = so_donorint@meta.data, group.by = "celltype")
# Preprocess data for the interaction analysis
cellchat <- subsetData(cellchat) # Subset the data (e.g., remove lowly expressed genes)
# run cell-cell interaction analysis
# using pre-defined ligand-receptor interactions to compute potential cell-cell communication activities
cellchat <- computeCommunProb(cellchat)
# Identify significant interactions and pathways
cellchat <- computeCommunProbPathway(cellchat)
# Identify important ligand-receptor pairs
cellchat <- computeNetSimilarity(cellchat)
# Compute the signaling pathways' activity
cellchat <- aggregateNet(cellchat)
# now visualize cell-cell interaction networks (results)
# Plot network of cell-cell interactions/ communication
netVisual_aggregate(cellchat, signaling = "Ligand-Receptor", layout = "fr")
# visualize interactions for a specific ligand-receptor pair
netVisual_bubble(cellchat, sources = "T_cells", targets = "Tumor_cells", signaling = "TNF")
# visualize cell communication pathways
pathwayPlot(cellchat, signaling = "TNF")
# add comments - how much activity in the given cell types or conditions
# heatmap of communication strength - visualize strength of communication between different cell types
netVisual_heatmap(cellchat)
# comment on heatmap
# cell-cell interaction analysis is complete. Now decide which interactions are significant
# Differential cell-cell interaction to identify conditions that have significantly changed between conditions (Ta1 vs Ta6)
# in my research context would be same cells between 2 individuals. However results/ patterns are expected to be same as same cell community, just different individual
# so there should not be significant changes - if correct
# Further analysis - Perform differential analysis between two conditions (e.g., "Condition1" and "Condition2")
cellchat <- identifyOverExpressedGenes(cellchat)
cellchat <- computeDifferentialNet(cellchat, group.by = "Condition")
# Further analysis - Compare interaction strength across different cell types between conditions
netVisual_compare(cellchat, group.by = "Condition")
# Export cell communication network results as a data frame
interaction_results <- extractNet(cellchat)
write.csv(interaction_results, "cell_cell_interaction_happy_results.csv")
############################################################ #
# Copy number variation (CNV) analysis
# identify genomic regions where gene copy number differs from normal diploid state
# Install necessary R packages
install.packages("BiocManager")
BiocManager::install(c("DNAcopy", "GenomicRanges", "DESeq2", "infercnv"))
# Load the libraries
library(DNAcopy)
library(GenomicRanges)
library(DESeq2)
library(infercnv)
# preparing data from seurat
library(Seurat)
# Load Seurat object with RNA-seq data
so <- readRDS("path_to_your_seurat_object.rds")
# Extract gene expression matrix
expression_matrix <- as.matrix(so@assays$RNA@counts)
# check output
head(expression_matrix)
# prepare data input for CNV analysis by inferCNV. Need:
# Expression data: A gene expression matrix (cells as columns and genes as rows).
# Cell annotations: A vector indicating cell type or condition
# Reference cells: Cells that are used as a reference for determining CNVs
# Assume you have cell type annotations in the Seurat object metadata
cell_types <- so$celltype # This assumes cell type is in the metadata
# Prepare the CNV input (matrix of gene expression counts)
expression_data <- as.data.frame(t(expression_matrix)) # Transpose so that cells are rows
# Create a cell annotation file (indicating which cells are reference cells)
# Example: Create a vector where "1" indicates normal cells, "2" indicates tumor cells
cell_annotations <- ifelse(cell_types == "normal", 1, 2)
names(cell_annotations) <- colnames(expression_data)
# run CNV analysis on prepared data
infercnv_obj <- CreateInfercnvObject(
raw_counts_matrix = expression_data, # Input gene expression data matrix
annotations_file = cell_annotations, # Cell type annotations
delim = "\t", # Delimiter if annotations file is a text file
gene_order_file = "path_to_gene_order_file", # File containing the gene order (for human genome, hg38 gene order)
ref_group_names = c("normal") # Define the reference group (normal cells)
)
# Now run CNV analysis with default settings
infercnv_obj <- infercnv::run(infercnv_obj,
cutoff = 1, # Cutoff for filtering genes
num_threads = 4, # Number of threads
window_length = 101, # Window length for CNV smoothing
plot_steps = TRUE) # Optionally plot the intermediate steps
# visualize CNV results after inferCNV - Plot the CNV results
infercnv::plot_cnv(infercnv_obj,
output_filename = "cnv_results.png",
color_by = "celltype") # Optional color by cell type
# comment on the plot
# Perform differential CNV analysis across conditions Ta1, Ta6
# however here I am using it to hopefully prove same pattern across the 2 conditions
# due to same cells, just originate from different people
# well but this is generally used to identify significant copy number changes between conditions
dds <- DESeqDataSetFromMatrix(countData = cn_data,
colData = data.frame(condition = c("normal", "tumor")),
design = ~ condition)
# Run DESeq2 to find differential CNVs
dds <- DESeq(dds)
res <- results(dds)
# View differential CNV results
head(res)
# Export CNV results as CSV
write.csv(segments$output, "cnv_segments_results.csv")
# Save differential CNV results
write.csv(res, "differential_cnv_results.csv")
############################################################ #