-
Notifications
You must be signed in to change notification settings - Fork 131
/
Copy pathannotateSingleR_multipleRefs.R
136 lines (94 loc) · 4.51 KB
/
annotateSingleR_multipleRefs.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# script to annotate cell types from 20k Human PBMCs from a healthy female donor
# setwd("~/Desktop/demo/singleCell_singleR_part2/scripts")
library(SingleR)
library(celldex)
library(Seurat)
library(tidyverse)
library(pheatmap)
# 10X CellRanger .HDF5 format ---------
hdf5_obj <- Read10X_h5(filename = "../data/20k_PBMC_3p_HT_nextgem_Chromium_X_filtered_feature_bc_matrix.h5",
use.names = TRUE,
unique.features = TRUE)
pbmc.seurat <- CreateSeuratObject(counts = hdf5_obj)
pbmc.seurat
# QC and Filtering ----------
# explore QC
pbmc.seurat$mitoPercent <- PercentageFeatureSet(pbmc.seurat, pattern = '^MT-')
VlnPlot(pbmc.seurat, features = c("nFeature_RNA", "nCount_RNA", "mitoPercent"), ncol = 3)
pbmc.seurat.filtered <- subset(pbmc.seurat, subset = nCount_RNA > 800 &
nFeature_RNA > 500 &
mitoPercent < 10)
# It is a good practice to filter out cells with non-sufficient genes identified and genes with non-sufficient expression across cells.
# pre-process standard workflow ---------------
pbmc.seurat.filtered <- NormalizeData(object = pbmc.seurat.filtered)
pbmc.seurat.filtered <- FindVariableFeatures(object = pbmc.seurat.filtered)
pbmc.seurat.filtered <- ScaleData(object = pbmc.seurat.filtered)
pbmc.seurat.filtered <- RunPCA(object = pbmc.seurat.filtered)
ElbowPlot(pbmc.seurat.filtered)
pbmc.seurat.filtered <- FindNeighbors(object = pbmc.seurat.filtered, dims = 1:20)
pbmc.seurat.filtered <- FindClusters(object = pbmc.seurat.filtered)
pbmc.seurat.filtered <- RunUMAP(object = pbmc.seurat.filtered, dims = 1:20)
# running steps above to get clusters
DimPlot(pbmc.seurat.filtered, reduction = "umap")
View([email protected])
# run SingleR with multiple reference datasets (default mode) ---------
# for pbmc data, we will use two datasets
hpca <- celldex::HumanPrimaryCellAtlasData()
dice <- celldex::DatabaseImmuneCellExpressionData()
# ...1. Strategy 1: Using reference-specific labels ----------
hpca$label.main
dice$label.main
# adding ref info to labels
hpca$label.main <- paste0('HPCA.', hpca$label.main)
dice$label.main <- paste0('DICE.', dice$label.main)
# create a combined ref based on shared genes
shared <- intersect(rownames(hpca), rownames(dice))
combined <- cbind(hpca[shared,], dice[shared,])
combined
combined$label.main
# run singleR using combined ref
# savings counts into a separate object
pbmc_counts <- GetAssayData(pbmc.seurat.filtered, slot = 'counts')
com.res1 <- SingleR(test = pbmc_counts, ref = combined, labels = combined$label.main)
table(com.res1$labels)
pbmc.seurat.filtered$com.res1.labels <- com.res1[match(rownames([email protected]), rownames(com.res1)), 'labels']
View([email protected])
DimPlot(pbmc.seurat.filtered, reduction = 'umap', group.by = 'com.res1.labels', label = TRUE)
# ...2. Strategy 2: Comparing scores across references ----------
hpca$label.main
dice$label.main
hpca$label.main <- gsub('HPCA\\.','', hpca$label.main)
dice$label.main <- gsub('DICE\\.','', dice$label.main)
com.res2 <- SingleR(test = pbmc_counts,
ref = list(HPCA = hpca, DICE = dice),
labels = list(hpca$label.main, dice$label.main))
# Check the final label from the combined assignment.
table(com.res2$labels)
# which reference scored best for which label?
grouping <- paste0(com.res2$labels,'.', com.res2$reference)
best_ref <- as.data.frame(split(com.res2, grouping))
# get de. genes from each individual references
metadata(com.res2$orig.results$HPCA)$de.genes
metadata(com.res2$orig.results$DICE)$de.genes
# Combined diagnostics
plotScoreHeatmap(com.res2)
# ...3. Strategy 3: Using Harmonized Labels ----------
hpca.ont <- celldex::HumanPrimaryCellAtlasData(cell.ont = 'nonna')
dice.ont <- celldex::DatabaseImmuneCellExpressionData(cell.ont = 'nonna')
# Using the same sets of genes:
shared <- intersect(rownames(hpca.ont), rownames(dice.ont))
hpca.ont <- hpca.ont[shared,]
dice.ont <- dice.ont[shared,]
# Showing the top 10 most frequent terms:
tail(sort(table(hpca.ont$label.ont)),10)
tail(sort(table(dice.ont$label.ont)), 10)
# using label.ont instead on label.main while running SingleR
com.res3 <- SingleR(test = pbmc_counts,
ref = list(HPCA = hpca.ont, DICE = dice.ont),
labels = list(hpca.ont$label.ont, dice.ont$label.ont))
table(com.res3$labels)
# How to map cell ontology terms? ----------------
colData(hpca.ont)
colData(dice.ont)
hpca.fle <- system.file("mapping","hpca.tsv", package = "celldex")
hpca.mapping <- read.delim(hpca.fle, header = F)