setwd("//files.wustl.edu/Shares/DOM/ONC/Hirbe_Lab/Diana/UBR5 KO RNASeq/analysis/HOM VS WT_2")
Counts <- read.csv("Counts.csv")
# Remove duplicate rows from Counts
Counts <- Counts[!duplicated(Counts[, 1]), ]
rownames(Counts) <- Counts[, 1]
Counts<- Counts [, -1]

# Calculate row means
row_means <- rowMeans(Counts)

# Order genes by row means in descending order
ordered_counts <- Counts[rev(order(row_means)), ]
#alternative code to the above is (ordered_counts <- Counts[order(row_means, decreasing = TRUE), ])
# Filter out rows with row means less than 10
filtered_counts <- ordered_counts[rowMeans(ordered_counts) >= 10, ]
#save filtered data frame
write.csv(filtered_counts, "filtered_counts.csv")

#prepare metadata telling R the conditions (columns) 
metadata <- data.frame( 
  
  sample_id = colnames(filtered_counts),  # Assuming you have loaded the filtered expression data 
  
  condition = c(rep("UBR5 WT", 3), rep("UBR5 HOM", 3)),  # Treatment conditions 
  
  replicate = c(1, 2, 3, 1, 2, 3)  # Sample replicates 
  
) 

metadata$condition <- factor(metadata$condition, levels = c("UBR5 WT", "UBR5 HOM"))

#Load DESEQ2 for normalization
library(DESeq2) 

#Use the DESeqDataSetFromMatrix function from DESeq2 to create a DESeqDataSet object
dds <- DESeqDataSetFromMatrix(countData = filtered_counts,
                              colData = metadata,
                              design = ~ condition)



#Perform normalization and estimation of dispersions: Use the DESeq() function to perform normalization and estimation of dispersions.

dds <- DESeq(dds)
results <- results(dds, alpha = 0.05)
DEGs <- subset(results, abs(log2FoldChange) > 1 & padj < 0.05)


#save the de_genes data frame
write.csv(DEGs, file = "DEG_HOM_VS_WT.csv",)
write.csv(results, file = "DeseqResults_HOM_VS_WT.csv",)



#create volcano plot
library(ggplot2)

# Add column to classify genes as DEG or not
results_df <- as.data.frame(results)
results_df$gene <- rownames(results_df)
results_df$threshold <- "Unchanged"
results_df$threshold[results_df$padj < 0.05 & abs(results_df$log2FoldChange) > 1] <- "DEG"


library(ggrepel)


# Volcano plot
ggplot(results_df, aes(x = log2FoldChange, y = -log10(padj), color = threshold)) +
  geom_point(alpha = 0.6, size = 1.5) +
  scale_color_manual(values = c("Unchanged" = "grey", "DEG" = "red")) +
  theme_minimal(base_size = 14) +
  labs(title = "Volcano Plot: HOM vs WT",
       x = "log2 Fold Change (HOM vs WT)",
       y = "-log10 Adjusted p-value",
       color = "Gene status") +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "black") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black")


# Select top 50 significant genes by padj
top100 <- results_df[order(results_df$padj), ][1:100, ]

# Volcano plot
ggplot(results_df, aes(x = log2FoldChange, y = -log10(padj), color = threshold)) +
  geom_point(alpha = 0.6, size = 1.5) +
  scale_color_manual(values = c("Unchanged" = "grey", "DEG" = "red")) +
  theme_minimal(base_size = 14) +
  labs(title = "Volcano Plot: HOM vs WT",
       x = "log2 Fold Change (HOM vs WT)",
       y = "-log10 Adjusted p-value",
       color = "Gene status") +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "black") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black") +
  geom_text_repel(data = top100,
                  aes(label = gene),
                  size = 3,
                  max.overlaps = Inf,
                  box.padding = 0.3,
                  point.padding = 0.2,
                  segment.color = "grey50")

write.csv(top100, file = "Top100_HOM_vs_WT.csv", row.names = FALSE)


# Define genes of interest
genes_of_interest <- c("Egfr", "Hsp90ab1", "Map2k2", "Cerk", "Pdgfra", "Tyk2", "Jak1",
                       "Yap1", "Taz", "Kdr", "Aurka", "Pten", "Csf1r","Ptch1", "Smo", "Gli2", "Gli3", "Wnt10a", "Rac2", "Rspo2", "Apc",
                       "Cd274", "Pdcd1", "Id1", "Id3", "Cdh1", "Cdc73", "Hrpt2","Csf1","Golph3", "Cdk1", "Acsl4", "Ptk2b", "Akt1", "Akt2", "Akt3", "Pik3ca", "Pik3c2a", "Pik3cb" , "Pik3c3", "Pik3c2b", "Pik3cd", "Atmin", "Cdkn1a", "Cdk9", "Rela", "Nfkb1", "Nfkb2", "Capza1", "Stat1", "Stat3", "Irf1", "Irf3")

# Subset DEGs for these genes (case-sensitive match!)
genes_subset <- results_df[rownames(results_df) %in% genes_of_interest, ]

# Save genes of interest with stats
write.csv(genes_subset, file = "GenesOfInterest_HOM_vs_WT.csv", row.names = TRUE)

ggplot(results_df, aes(x = log2FoldChange, y = -log10(padj), color = threshold)) +
  geom_point(alpha = 0.6, size = 1.5) +
  scale_color_manual(values = c("Unchanged" = "grey", "DEG" = "red")) +
  theme_minimal(base_size = 14) +
  labs(title = "Volcano Plot: HET vs WT",
       x = "log2 Fold Change (HET vs WT)",
       y = "-log10 Adjusted p-value",
       color = "Gene status") +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "black") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black") +
  geom_text_repel(
    data = genes_subset,
    aes(label = gene),
    color = "black",       # <-- force label text to black
    size = 3,
    max.overlaps = Inf,
    box.padding = 0.3,
    point.padding = 0.2,
    segment.color = "grey50"
  )


# Subset DEGs only
DEGs_df <- as.data.frame(DEGs)
DEGs_df$gene <- rownames(DEGs_df)

# Find overlap between DEGs and genes of interest
genes_subset <- DEGs_df[rownames(DEGs_df) %in% genes_of_interest, ]

# Save overlapping genes with stats
write.csv(genes_subset, file = "GenesOfInterest_DEGs_HOM_vs_WT.csv", row.names = TRUE)

# Volcano plot with labels ONLY for genes of interest that are DEGs
ggplot(results_df, aes(x = log2FoldChange, y = -log10(padj), color = threshold)) +
  geom_point(alpha = 0.6, size = 1.5) +
  scale_color_manual(values = c("Unchanged" = "grey", "DEG" = "red")) +
  theme_minimal(base_size = 14) +
  labs(title = "Volcano Plot: HET vs WT",
       x = "log2 Fold Change (HET vs WT)",
       y = "-log10 Adjusted p-value",
       color = "Gene status") +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "black") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "black") +
  geom_text_repel(
    data = genes_subset,
    aes(label = gene),
    color = "black",       # labels in black
    size = 5,
    max.overlaps = Inf,
    box.padding = 0.3,
    point.padding = 0.2,
    segment.color = "grey50"
  )





if (!requireNamespace("clusterProfiler", quietly = TRUE)) {
  BiocManager::install("clusterProfiler")
}
if (!requireNamespace("msigdbr", quietly = TRUE)) {
  install.packages("msigdbr")
}
library(clusterProfiler)
library(msigdbr)


# Convert results to dataframe
res_df <- as.data.frame(results)

# Remove NA log2FC
res_df <- res_df[!is.na(res_df$log2FoldChange), ]

# Create named vector: names = gene symbols, values = log2FC
gene_list <- res_df$log2FoldChange
names(gene_list) <- rownames(res_df)

# Sort decreasing for clusterProfiler
gene_list <- sort(gene_list, decreasing = TRUE)


# Mouse Hallmark gene sets
hallmark_sets <- msigdbr(species = "Mus musculus", category = "H")  # H = Hallmark

# Use as two-column dataframe: gs_name (pathway), gene_symbol
term2gene <- hallmark_sets[, c("gs_name", "gene_symbol")]

# Make sure your DESeq2 results have no NA log2FC
res_df <- as.data.frame(results)
res_df <- res_df[!is.na(res_df$log2FoldChange), ]

# Named vector: names = gene symbols, values = log2FC
gene_list <- res_df$log2FoldChange
names(gene_list) <- rownames(res_df)
gene_list <- sort(gene_list, decreasing = TRUE)


gsea_res <- GSEA(
  geneList = gene_list,
  TERM2GENE = term2gene,  # <- must be dataframe, not list
  pvalueCutoff = 0.1,
  verbose = FALSE
)

# View top pathways
head(as.data.frame(gsea_res))

# Save results
write.csv(as.data.frame(gsea_res), "GSEA_Hallmark_Mouse_HOM_vs_WT.csv", row.names = FALSE)

library(enrichplot)

# Convert GSEA results to dataframe
gsea_df <- as.data.frame(gsea_res)

# Suppose the top (and only) enriched pathway:
top_pathway <- gsea_df$ID[5]  # or use $Description if you prefer

# Classic GSEA plot for the top pathway
gseaplot2(
  gsea_res,
  geneSetID = top_pathway,   # pathway ID
  title = gsea_df$Description[5],  # nice descriptive title
  color = "red"
)
#PLOT HALLMARK PATHWAYS
library(ggplot2)

# Convert GSEA results to dataframe
gsea_df <- as.data.frame(gsea_res)

# Order pathways by NES (normalized enrichment score)
gsea_df <- gsea_df[order(gsea_df$NES, decreasing = TRUE), ]

# Plot ALL enriched pathways
ggplot(gsea_df, aes(x = reorder(Description, NES), y = NES, fill = -log10(p.adjust))) +
  geom_col() +
  coord_flip() +
  labs(
    title = "GSEA: All Enriched Hallmark Pathways",
    x = "Pathway",
    y = "Normalized Enrichment Score (NES)",
    fill = "-log10 adj p-value"
  ) +
  theme_minimal(base_size = 14)


#plot GSEA KEGG, GO, REACTOME --------------------------
# Prepare ranked gene list with Entrez IDs for GSEA
# --------------------------
library(clusterProfiler)
library(org.Mm.eg.db)
library(ReactomePA)

# Convert gene symbols to Entrez IDs
entrez_map <- bitr(names(gene_list), fromType="SYMBOL", toType="ENTREZID", OrgDb=org.Mm.eg.db)
gene_list_df <- merge(entrez_map, data.frame(log2FC = gene_list), by.x="SYMBOL", by.y="row.names")
gene_list_df <- gene_list_df[!duplicated(gene_list_df$ENTREZID), ]
gene_list_named <- gene_list_df$log2FC
names(gene_list_named) <- gene_list_df$ENTREZID

# Sort decreasing for GSEA
gene_list_named <- sort(gene_list_named, decreasing = TRUE)

# --------------------------
# 1) GSEA: KEGG Pathways
# --------------------------
gsea_kegg <- gseKEGG(
  geneList = gene_list_named,
  organism = "mmu",
  minGSSize = 10,
  pvalueCutoff = 0.1,
  verbose = TRUE
)

# Save KEGG GSEA results
write.csv(as.data.frame(gsea_kegg), "GSEA_KEGG_HOM_vs_WT.csv", row.names = FALSE)

# Top 30 KEGG pathways barplot
library(enrichplot)
library(ggplot2)

# Convert gseaResult to dataframe to see top pathways
gsea_df <- as.data.frame(gsea_kegg)

# Select top 30 pathways by NES or pvalue
top30 <- gsea_df[order(gsea_df$NES, decreasing = TRUE)[1:30], ]

# Ridgeplot (shows enrichment distribution for multiple pathways)
ridgeplot(gsea_kegg, showCategory = 30) +
  ggtitle("GSEA: KEGG Top 30 Pathways") +
  theme_minimal(base_size = 14)

# Optional: classic GSEA plot for the top pathway
top_pathway <- top30$ID[1]
gseaplot2(gsea_kegg, geneSetID = top_pathway,
          title = top30$Description[1], color = "red")


# --------------------------
# 2) GSEA: GO Biological Process (BP)
# --------------------------
gsea_go_bp <- gseGO(
  geneList = gene_list_named,
  OrgDb = org.Mm.eg.db,
  ont = "ALL",
  keyType = "ENTREZID",
  minGSSize = 10,
  maxGSSize = 500,
  pvalueCutoff = 0.1,
  verbose = TRUE
)

# Save GO BP GSEA results
write.csv(as.data.frame(gsea_go_bp), "GSEA_GO_BP_HOM_vs_WT.csv", row.names = FALSE)

# Top 30 GO BP pathways barplot
barplot(gsea_go_bp, showCategory = 30, title = "GSEA: GO BP Top 30 Pathways")

# --------------------------
# 3) GSEA: Reactome Pathways
# --------------------------
gsea_reactome <- gsePathway(
  geneList = gene_list_named,
  organism = "mouse",
  minGSSize = 10,
  pvalueCutoff = 0.1,
  verbose = TRUE
)

# Save Reactome GSEA results
write.csv(as.data.frame(gsea_reactome), "GSEA_Reactome_HOM_vs_WT.csv", row.names = FALSE)

# Top 30 Reactome pathways barplot
barplot(gsea_reactome, showCategory = 30, title = "GSEA: Reactome Top 30 Pathways")

# --------------------------


library(ggplot2)
library(dplyr)

# Convert GSEA Reactome results to dataframe
gsea_reactome_df <- as.data.frame(gsea_reactome)

# Select top 30 pathways by NES magnitude
top30_reactome <- gsea_reactome_df %>%
  arrange(desc(abs(NES))) %>%
  slice(1:30)

# Reorder for plotting (highest NES on top)
top30_reactome$Description <- factor(top30_reactome$Description, levels = rev(top30_reactome$Description))

# Plot barplot: NES on x-axis, pathways on y-axis, fill by -log10(padj)
ggplot(top30_reactome, aes(x = NES, y = Description, fill = -log10(p.adjust))) +
  geom_bar(stat = "identity") +
  scale_fill_gradient(low = "red", high = "darkred") +
  theme_minimal(base_size = 14) +
  labs(title = "GSEA: Top 30 Reactome Pathways",
       x = "Normalized Enrichment Score (NES)",
       y = "",
       fill = "-log10(adj.p)")



#ORA Enrichment analysis
### --- DEG-based Over-Representation Analysis (ORA) ---

library(clusterProfiler)
library(org.Mm.eg.db)
library(msigdbr)
library(ReactomePA)
library(enrichplot)
library(ggplot2)
library(dplyr)

# Subset DEGs you already defined
DEGs_df <- as.data.frame(DEGs)
DEGs_df$gene <- rownames(DEGs_df)

# Convert SYMBOL -> ENTREZ
deg_entrez <- bitr(
  DEGs_df$gene,
  fromType = "SYMBOL",
  toType   = "ENTREZID",
  OrgDb    = org.Mm.eg.db
)

# Remove duplicates
deg_entrez <- deg_entrez[!duplicated(deg_entrez$ENTREZID), ]


### --- ORA: GO Biological Process ---
ego_bp <- enrichGO(
  gene          = deg_entrez$ENTREZID,
  OrgDb         = org.Mm.eg.db,
  keyType       = "ENTREZID",
  ont           = "BP",   # Biological Process
  pAdjustMethod = "BH",
  pvalueCutoff  = 0.05,
  qvalueCutoff  = 0.2,
  readable      = TRUE    # back to SYMBOL
)

# Save results
write.csv(as.data.frame(ego_bp), "ORA_GO_BP_HET_vs_WT.csv", row.names = FALSE)

# Plot top 20
dotplot(ego_bp, showCategory = 20) + ggtitle("ORA: GO Biological Process (DEGs)")


### --- ORA: KEGG Pathways ---
ekegg <- enrichKEGG(
  gene          = deg_entrez$ENTREZID,
  organism      = "mmu",
  pvalueCutoff  = 0.05
)

# Save results
write.csv(as.data.frame(ekegg), "ORA_KEGG_HET_vs_WT.csv", row.names = FALSE)

# Plot top 20
dotplot(ekegg, showCategory = 20) + ggtitle("ORA: KEGG Pathways (DEGs)")


### --- ORA: Reactome Pathways ---
ereact <- enrichPathway(
  gene          = deg_entrez$ENTREZID,
  organism      = "mouse",
  pvalueCutoff  = 0.05,
  readable      = TRUE
)

# Save results
write.csv(as.data.frame(ereact), "ORA_Reactome_HET_vs_WT.csv", row.names = FALSE)

# Plot top 20
dotplot(ereact, showCategory = 20) + ggtitle("ORA: Reactome Pathways (DEGs)")


### --- ORA: Hallmark Gene Sets ---
hallmark_sets <- msigdbr(species = "Mus musculus", category = "H")
term2gene <- hallmark_sets[, c("gs_name", "entrez_gene")]
term2gene$entrez_gene <- as.character(term2gene$entrez_gene)

ora_hallmark <- enricher(
  gene          = deg_entrez$ENTREZID,
  TERM2GENE     = term2gene,
  pvalueCutoff  = 0.05
)

# Save results
write.csv(as.data.frame(ora_hallmark), "ORA_Hallmark_HET_vs_WT.csv", row.names = FALSE)

# Plot top 20
dotplot(ora_hallmark, showCategory = 20) + ggtitle("ORA: Hallmark Pathways (DEGs)")


### --- DEG-based ORA: Up vs Downregulated DEGs ---


# Split DEGs
up_DEGs   <- rownames(DEGs[DEGs$log2FoldChange > 1, ])
down_DEGs <- rownames(DEGs[DEGs$log2FoldChange < -1, ])

# Convert SYMBOL -> ENTREZ for both
up_entrez <- bitr(up_DEGs, fromType="SYMBOL", toType="ENTREZID", OrgDb=org.Mm.eg.db)
up_entrez <- up_entrez[!duplicated(up_entrez$ENTREZID), ]

down_entrez <- bitr(down_DEGs, fromType="SYMBOL", toType="ENTREZID", OrgDb=org.Mm.eg.db)
down_entrez <- down_entrez[!duplicated(down_entrez$ENTREZID), ]


### --- ORA: GO Biological Process ---
ego_up <- enrichGO(gene=up_entrez$ENTREZID, OrgDb=org.Mm.eg.db, keyType="ENTREZID",
                   ont="BP", pAdjustMethod="BH", pvalueCutoff=0.05, qvalueCutoff=0.2, readable=TRUE)
ego_down <- enrichGO(gene=down_entrez$ENTREZID, OrgDb=org.Mm.eg.db, keyType="ENTREZID",
                     ont="BP", pAdjustMethod="BH", pvalueCutoff=0.05, qvalueCutoff=0.2, readable=TRUE)

write.csv(as.data.frame(ego_up), "ORA_GO_BP_Upregulated.csv", row.names=FALSE)
write.csv(as.data.frame(ego_down), "ORA_GO_BP_Downregulated.csv", row.names=FALSE)

dotplot(ego_up, showCategory=20) + ggtitle("ORA: GO BP (Upregulated)")
dotplot(ego_down, showCategory=20) + ggtitle("ORA: GO BP (Downregulated)")
                     

### --- ORA: Hallmark Gene Sets ---
hallmark_sets <- msigdbr(species="Mus musculus", category="H")
term2gene <- hallmark_sets[, c("gs_name", "entrez_gene")]
term2gene$entrez_gene <- as.character(term2gene$entrez_gene)

ora_hallmark_up <- enricher(gene=up_entrez$ENTREZID, TERM2GENE=term2gene, pvalueCutoff=0.05)
ora_hallmark_down <- enricher(gene=down_entrez$ENTREZID, TERM2GENE=term2gene, pvalueCutoff=0.05)

write.csv(as.data.frame(ora_hallmark_up), "ORA_Hallmark_Upregulated.csv", row.names=FALSE)
write.csv(as.data.frame(ora_hallmark_down), "ORA_Hallmark_Downregulated.csv", row.names=FALSE)

dotplot(ora_hallmark_up, showCategory=20) + ggtitle("ORA: Hallmark (Upregulated)")
dotplot(ora_hallmark_down, showCategory=20) + ggtitle("ORA: Hallmark (Downregulated)")


### --- ORA: Reactome ---
ereact_up <- enrichPathway(gene=up_entrez$ENTREZID, organism="mouse", pvalueCutoff=0.05, readable=TRUE)
ereact_down <- enrichPathway(gene=down_entrez$ENTREZID, organism="mouse", pvalueCutoff=0.05, readable=TRUE)

write.csv(as.data.frame(ereact_up), "ORA_Reactome_Upregulated.csv", row.names=FALSE)
write.csv(as.data.frame(ereact_down), "ORA_Reactome_Downregulated.csv", row.names=FALSE)

dotplot(ereact_up, showCategory=20) + ggtitle("ORA: Reactome (Upregulated)")
dotplot(ereact_down, showCategory=20) + ggtitle("ORA: Reactome (Downregulated)")



### --- ORA: KEGG ---
ekegg_up <- enrichKEGG(gene=up_entrez$ENTREZID, organism="mmu", pvalueCutoff=0.05)
ekegg_down <- enrichKEGG(gene=down_entrez$ENTREZID, organism="mmu", pvalueCutoff=0.05)

write.csv(as.data.frame(ekegg_up), "ORA_KEGG_Upregulated.csv", row.names=FALSE)
write.csv(as.data.frame(ekegg_down), "ORA_KEGG_Downregulated.csv", row.names=FALSE)

dotplot(ekegg_up, showCategory=20) + ggtitle("ORA: KEGG (Upregulated)")
dotplot(ekegg_down, showCategory=20) + ggtitle("ORA: KEGG (Downregulated)")



#Heatmap
ifna_genes <- c(
  "Serpine2","Sntb1","Mmp14","Tnc","Fstl1","Vcan","Il15","Scg2","Ecm1","Vegfa",
  "Cxcl12","Dst","Tgfbr3","Ptx3","Pcolce","Spock1","Adam12","Tagln","Loxl1","Cdh6",
  "Pvr","Gadd45b","Gadd45a","Rhob","Rgs4","Fzd8","Tfpi2","Vegfc","Oxtr","Dab2",
  "Lum","Col16a1","Tnfrsf11b","Fap","Matn2","Snai2","Cxcl5","Calu","Capg","Emp3",
  "Nnmt","Gpc1","Tnfaip3","Nid2","Mcm7","Slit3","Slit2","Matn3","Fmod","Edil3",
  "Dkk1","Qsox1","Copa","Cxcl15","Crlf1","Grem1","Fbln5","Sgcb","Sgcg","Sgcd",
  "Plod2","Plod3","Foxc2","Sdc1","Sdc4","Dpysl3","Tnfrsf12a","Pdlim4","Mfap5","Col5a3",
  "P3h1","Cadm1","Fstl3","Efemp2","Cap2","Gpx7","Cthrc1","Basp1","Glipr1","Lrrc15",
  "Pcolce2","Colgalt1","Postn","Htra1","Pmepa1","Myl9","Slc6a8","Magee1","Wipf1","Fermt2",
  "Abi3bp","Ntm","Tpm4","Ecm2","Anpep","Gm21451","Acta2","Aplp1","Areg","Bdnf",
  "Bgn","Bmp1","Cald1","Serpinh1","Cd44","Cdh2","Col11a1","Col12a1","Col3a1","Col4a1",
  "Col4a2","Col5a1","Col5a2","Col6a2","Col6a3","Col7a1","Col8a2","Col1a1","Col1a2","Comp",
  "Ccn1","Sfrp4","Sfrp1","Mylk","Dcn","Eln","Eno2","Fas","Fbln1","Fbln2",
  "Fbn1","Fbn2","Fgf2","Ccn2","Flna","Fn1","Fuca1","Gas1","Gja1","Id2",
  "Igfbp2","Igfbp3","Igfbp4","Il6","Inhba","Itga2","Itga5","Itgav","Itgb1","Itgb3",
  "Itgb5","Jun","Lgals1","Lox","Lrp1","Mest","Mgp","Mmp2","Mmp3","Msx1","Notch2",
  "Pdgfrb","Pfn2","Serpine1","Plaur","Pmp22","Prrx1","Ppib","Pthlh","Sat1","Sparc",
  "Spp1","Tgfb1","Tgm2","Thbs1","Thbs2","Thy1","Timp1","Timp3","Tpm1","Tpm2",
  "Vcam1","Vim","Wnt5a","Cdh11","Nt5e","Gem","Lama1","Plod1","Lama3","Lama2",
  "Lamc2","Lamc1","Tgfbi"
)



# Get normalized counts from DESeq2
norm_counts <- counts(dds, normalized=TRUE)

# Subset for IFN-α response genes (keep only genes present in your dataset)
ifna_counts <- norm_counts[rownames(norm_counts) %in% ifna_genes, ]

# Optionally, z-score normalize each gene for heatmap visualization
ifna_counts_z <- t(scale(t(ifna_counts)))


# Create annotation for columns
ann_col <- data.frame(
  Condition = metadata$condition
)
rownames(ann_col) <- metadata$sample_id


library(pheatmap)

# Make sure columns are in your desired order
desired_order <- c("sample.WT_1", "sample.WT_2", "sample.WT_3",
                   "sample.neg.neg_1", "sample.neg.neg_2", "sample.neg.neg_3")
ifna_counts_z <- ifna_counts_z[, desired_order]

# Make sure annotation matches
ann_col <- ann_col[desired_order, , drop = FALSE]

# Heatmap
pheatmap(ifna_counts_z,
         annotation_col = ann_col,
         show_rownames = TRUE,
         show_colnames = TRUE,
         cluster_rows = TRUE,
         cluster_cols = FALSE,   # keep the column order fixed
         scale = "row",
         fontsize_row = 8,
         main = "Interferon Alpha Response Genes")