cd14_clusterizations_comparisons.qmd

---
title: "CD14+ clusterizations comparisons with CellTypist"
---

```{r}
#library(SingleCellExperiment)
#library(DuoClustering2018)
#library(tidyr)
library(rlang)
library(ggplot2)
library(ggsankey) # remotes::install_github("davidsjoberg/ggsankey")
library(tibble)
library(zeallot)
library(assertthat)
library(COTAN)
library(caret)
theme_set(theme_bw())
library(stringr)
library(nnet)
#devtools::load_all("~/dev/COTAN/COTAN/")

# Enable forked processing in the parallelly backend
options(parallelly.fork.enable = TRUE)

# Folder that receives the outputs of this notebook (log file, saved object)
outDir <- "Results/Clusterization/"

# Make sure the output folder exists before the log file is opened inside it
if (!dir.exists(outDir)) {
  dir.create(outDir, recursive = TRUE)
}

setLoggingLevel(2)
setLoggingFile(file.path(outDir, "CD14_Monocytes_ClusterizationsComparisons.log"))
```

```{r}
# Load the cleaned CD14+ monocytes COTAN object from disk
cd14Path <- file.path("Data/CD14Cleaned/", "CD14_Monocytes.cotan.RDS")
cd14Obj <- readRDS(cd14Path)

# Condition tag stored in the object's metadata (printed for reference)
condTag <- datasetTags()[["cond"]]
sampleCondition <- getMetadataElement(cd14Obj, condTag)
sampleCondition

# Print the clusterizations reported by the object
getClusterizations(cd14Obj)

# Keep the per-cell metadata at hand
metaC <- getMetadataCells(cd14Obj)
```

```{r}
# Retrieve the cluster assignments stored in the COTAN object under the
# "split" and "merge" clusterizations; they are compared below against the
# "majority-voting" labels.
splitClusters <- getClusters(cd14Obj, "split")
mergedClusters <- getClusters(cd14Obj, "merge")
```

```{r, echo=TRUE,eval=FALSE}
# Read the predicted labels table; column "X" holds the cell identifiers
labelsFile <- file.path("Data/CD14Cleaned/",
                        "CD14Cleaned_Immune_All_Low_predicted_labels.csv")
labelsDF <- column_to_rownames(read.csv(labelsFile, header = TRUE), var = "X")

# Replace every "." in the identifiers with "-"
# (presumably to match the cell names used by the COTAN object)
rownames(labelsDF) <- gsub("[.]", "-", rownames(labelsDF))

# The labelled cells must be exactly the cells of the object, in the same order
labelledCells <- rownames(labelsDF)
cells_to_keep <- labelledCells[labelledCells %in% getCells(cd14Obj)]
assert_that(identical(cells_to_keep, getCells(cd14Obj)))

# Named vector: cell identifier -> majority-voting label
majorityVotingClusters <- setNames(labelsDF[cells_to_keep, "majority_voting"],
                                   cells_to_keep)

# Differential expression analysis for the new clusterization
majorityVotingCoexDF <- DEAOnClusters(cd14Obj,
                                      clName = "majority-voting",
                                      clusters = majorityVotingClusters)

# Store the clusterization, with its coex data frame, in the COTAN object
cd14Obj <- addClusterization(cd14Obj,
                             clName = "majority-voting",
                             clusters = majorityVotingClusters,
                             coexDF = majorityVotingCoexDF)
```

```{r,echo=FALSE}
# Fetch the cluster labels stored under the "majority-voting" clusterization
majorityVotingData <- getClusterizationData(cd14Obj, clName = "majority-voting")
majorityVotingClusters <- majorityVotingData[["clusters"]]

```

Save the COTAN object

```{r,eval=FALSE, echo=TRUE}
# Write the COTAN object to outDir, in a file named after the sample condition
cotanFileName <- paste0(sampleCondition, ".cotan.RDS")
saveRDS(cd14Obj, file = file.path(outDir, cotanFileName))
```

```{r eval=FALSE}
# For each clusterization: number of clusters, then the ten largest clusters

# COTAN "split" clusterization
nlevels(splitClusters)
splitSizes <- table(splitClusters)
head(sort(splitSizes, decreasing = TRUE), 10L)

# COTAN "merge" clusterization
nlevels(mergedClusters)
mergedSizes <- table(mergedClusters)
head(sort(mergedSizes, decreasing = TRUE), 10L)

# "majority-voting" clusterization
nlevels(majorityVotingClusters)
majorityVotingSizes <- table(majorityVotingClusters)
head(sort(majorityVotingSizes, decreasing = TRUE), 10L)
```

```{r}
# Convert a named cluster vector into a two-column data frame.
#
# clusters: cluster labels; the element names become the row names of the
#           result and are copied into the "cell" column.
# colName:  name given to the column holding the cluster labels.
#
# Returns a data frame with columns <colName> and "cell", rows ordered by
# cluster label.
clustersToDF <- function(clusters, colName) {
  clustersDF <- as.data.frame(clusters)
  clustersDF[["cell"]] <- rownames(clustersDF)
  colnames(clustersDF)[[1L]] <- colName
  clustersDF[order(clustersDF[[colName]]), ]
}

# One data frame per clusterization; the shared "cell" column is the key
# used to join them afterwards
splitClustersDF <- clustersToDF(splitClusters, "COTAN.split.cluster")
mergedClustersDF <- clustersToDF(mergedClusters, "COTAN.merged.cluster")
majorityVotingClustersDF <- clustersToDF(majorityVotingClusters,
                                         "majority.voting.cluster")
```

```{r}
# Full outer join of the majority-voting and split tables on the cell name
mjvt_split.table <- merge(majorityVotingClustersDF, splitClustersDF,
                          by = "cell", all = TRUE)

# Contingency table: majority-voting label (rows) vs COTAN split cluster (columns)
table(mjvt_split.table[, c("majority.voting.cluster", "COTAN.split.cluster")])
```

```{r}
#| label: COTAN.split.cluster.sankey.maj.voting
# Reshape to the x / next_x / node / next_node layout used by the sankey geoms
mjvt_split.table2 <- make_long(mjvt_split.table,
                               majority.voting.cluster, COTAN.split.cluster)

# Aesthetics shared by the flows and the node labels
splitSankeyAes <- aes(x = x,
                      next_x = next_x,
                      node = node,
                      next_node = next_node,
                      fill = factor(node),
                      label = node)

# Sankey diagram: majority-voting labels on one side, split clusters on the other
ggplot(mjvt_split.table2, splitSankeyAes) +
  geom_sankey(flow.alpha = 0.75, node.color = 1) +
  geom_sankey_label(size = 3.5, color = 1, fill = "white") +
  scale_fill_viridis_d(option = "A", alpha = 0.95) +
  theme_sankey(base_size = 16) +
  theme(legend.position = "none")
```

```{r}
# Full outer join of the majority-voting and merged tables on the cell name
mjvt_merged.table <- merge.data.frame(x = majorityVotingClustersDF, y = mergedClustersDF,
                                      by = "cell", all.x = TRUE, all.y = TRUE)

# Contingency table: majority-voting label vs COTAN merged cluster
table(mjvt_merged.table[, c(2L, 3L)])
```

```{r}
#| label: COTAN.merged.cluster.sankey.maj.voting
# The label belongs on this chunk: it is the one that draws the sankey figure,
# mirroring the labelled chunk of the split clusterization.
mjvt_merged.table2 <- mjvt_merged.table %>% make_long(majority.voting.cluster, COTAN.merged.cluster)

ggplot(mjvt_merged.table2,
       aes(x = x,
           next_x = next_x,
           node = node,
           next_node = next_node,
           fill = factor(node),
           label = node)) +
  geom_sankey(flow.alpha = 0.75, node.color = 1) +
  geom_sankey_label(size = 3.5, color = 1, fill = "white") +
  scale_fill_viridis_d(option = "A", alpha = 0.95) +
  theme_sankey(base_size = 16) +
  theme(legend.position = "none")
```

```{r}
# Marker genes of the "merge" clusterization (n = 100, method = "BH")
markersCD14 <- findClustersMarkers(cd14Obj,
                                   clName = "merge",
                                   n = 100,
                                   method = "BH")

# Export the markers table, then preview its first rows
markersFile <- "Data/CD14Cleaned/ClusterMarkerGenes.csv"
write.csv(markersCD14, file = markersFile)
head(markersCD14)
```

```{r}
cd14GDI <- calculateGDI(cd14Obj)

# Keep only the genes whose sum.raw.norm exceeds 7, then rank them by
# decreasing GDI
subsetGDI <- cd14GDI[cd14GDI$sum.raw.norm > 7, ]
rankedGDI <- subsetGDI[order(subsetGDI$GDI, decreasing = TRUE), ]

# head() stops at the available rows, so the result holds no NA entries when
# fewer than 50 genes pass the filter
top.GDI.genes <- head(rownames(rankedGDI), 50L)

GDIPlot(cd14Obj, genes = "", GDIIn = cd14GDI)
```

```{r}
data <- getNormalizedData(cd14Obj)

# Drop the genes whose total normalized expression is below 1, then apply
# the log(10000 * x + 1) transform
data <- data[rowSums(as.matrix(data)) >= 1, ]
data <- log(data * 10000 + 1)

# Per-gene standard deviation, most variable genes first
row_stdev <- apply(data, 1, sd, na.rm = TRUE)
row_stdev <- row_stdev[order(row_stdev, decreasing = TRUE)]

# Features: the 100 most variable genes plus the top GDI genes.
# head() avoids NA entries when fewer than 100 genes are available.
genes.to.keep <- c(head(names(row_stdev), 100L), top.GDI.genes)

# Cells in rows, selected genes in columns
data.small <- data[rownames(data) %in% genes.to.keep, ]
data.small <- t(as.matrix(data.small))

# Outcome: the "merge" cluster of each cell, stored as a numeric code
# (as.numeric() on a factor yields the integer level codes, not the labels)
COTAN_Cl.code <- getClusterizationData(cd14Obj, clName = "merge")[[1]]
COTAN_Cl.code <- as.numeric(COTAN_Cl.code)

# NOTE(review): cbind() pairs cells and cluster codes by position; this
# assumes the clusters are listed in the same cell order as the columns of
# the normalized data - confirm.
data.small <- cbind(data.small, COTAN_Cl.code)
data.small <- as.data.frame(data.small)

# Split the data into training (80%) and test (20%) sets
# NOTE(review): the outcome is passed to createDataPartition() as numeric
# codes rather than as a factor - confirm this gives the intended
# stratification.
set.seed(123)
training.samples <- createDataPartition(data.small[, "COTAN_Cl.code"],
                                        p = 0.8, list = FALSE)
train.data <- data.small[training.samples, ]
test.data <- data.small[-training.samples, ]
head(train.data)
```

```{r}
# The outcome must be a factor so that multinom() treats it as a class label
train.data$COTAN_Cl.code <- as.factor(train.data$COTAN_Cl.code)
test.data$COTAN_Cl.code <- as.factor(test.data$COTAN_Cl.code)

# Fit a multinomial logistic regression of the cluster code on all the
# selected genes
model <- multinom(COTAN_Cl.code ~ ., data = train.data, maxit = 500)

# Per-class probabilities for the held-out cells, kept for inspection
probabilities <- predict(model, newdata = test.data, type = "probs")

# Predicted class for each held-out cell. Asking predict() for the class
# directly also works when there are only two clusters: in that case
# type = "probs" returns a plain vector, on which a row-wise which.max
# cannot be applied.
predicted.classes <- as.character(
  predict(model, newdata = test.data, type = "class")
)

# Model accuracy: share of held-out cells whose cluster is predicted correctly
accuracy <- mean(predicted.classes == as.character(test.data$COTAN_Cl.code))
print(accuracy)
```

So the multinomial logistic regression predicts the three COTAN merged clusters with quite good accuracy on the held-out cells.

![Cluster 1 - top enriched genes. In red: genes of the enriched Reactome pathway Innate Immune System.](Data/CD14Cleaned/FromStringrEnrichr/CD14Cluster1.png){fig-align="center"}

![Cluster 2 - 100 top enriched genes. In red: Antigen processing and presentation of exogenous peptide antigen via MHC class II genes; in violet: Regulation of leukocyte cell-cell adhesion (GO Biological Process) genes; in green: Adaptive Immune System genes.](Data/CD14Cleaned/FromStringrEnrichr/CD14Cluster2string_vector_graphic.png){fig-align="center"}

![Cluster 3 - 100 top enriched genes. In violet: Cytokine Signaling in Immune system genes; in red: Immunoregulatory interactions between a Lymphoid and a non-Lymphoid cell genes (enriched Reactome Pathways).](Data/CD14Cleaned/FromStringrEnrichr/CD14Cluster3.png){fig-align="center"}

The cells in cluster 1 seem to be classical CD14 monocytes: they express CD14 and are depleted in the MHC class II protein complex, which is instead enriched in cluster 2, whose cells seem to be intermediate monocytes.

Cluster 3 does not seem to be a monocyte cluster: according to the Enrichr website, it is enriched in plasmacytoid dendritic cell marker genes.

```{r}
sessionInfo()
```