Here, we’re going to load a CyTOF fcs file, and we’re going to test the Julia implementation of UMAP to that of R, and see which is faster.
Let’s initialize Julia.
library(JuliaConnectoR)
library(JuliaCall)
julia_setup(JULIA_HOME = "/Applications/Julia-1.8.app/Contents/Resources/julia/bin/")
julia_command("using Pkg; Pkg.add(\"RCall\")")
julia_command("using RCall")
julia_command("using Pkg; Pkg.add(\"DataFrames\")")
julia_command("using DataFrames")
julia_command("using Pkg; Pkg.add(\"UMAP\")")
julia_command("using UMAP")
julia_command("using LinearAlgebra")
And now let’s get the fcs files imported.
library(flowCore)
library(tidyverse)
set.seed(1)
setwd(here::here("data", 'mike_LISLinnegCD45pos-101917-relabeled-CORRECT'))
ff <- flowCore::read.FCS('Center_1_A_1_MATLAB_LiveIntactSing_Linneg_CD45pos.fcs')
ff
## flowFrame object 'Center_1_A_1_MATLAB_LiveIntactSing_Linneg_CD45pos.fcs'
## with 93816 cells and 30 observables:
## name desc range minRange maxRange
## $P1 Ce140Di beads1 2911 0 7
## $P2 Ce142Di beads2 1509 0 1
## $P3 Er167Di CD27 781 0 17
## $P4 Er170Di CD3 1664 0 27
## $P5 Eu151Di CD123 2814 0 16
## ... ... ... ... ... ...
## $P26 Tm169Di CD45RA 12000 0 23
## $P27 Yb172Di CD38 5629 0 18
## $P28 Yb174Di HLADR 6990 0 11
## $P29 Yb176Di CD56 1349 0 12
## $P30 Time Time 0 0 1
## 261 keywords are stored in the 'description' slot
Let’s tidy the data.
mass_tags <- ff@parameters@data$name
params <- ff@parameters@data$desc
# All NA values in marker names will revert to mass tags names
params <- ifelse(is.na(params), mass_tags, params)
colnames(ff) <- params
We do our asinh transform.
exp <- flowCore::exprs(ff)
as_tibble(exp)
## # A tibble: 93,816 × 30
## beads1 beads2 CD27 CD3 CD123 beads3 `Event #` Event_length CD33
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0 0 1.74 2.35 2.09 0 24.3 3.41
## 2 0 0 0 56.1 3.39 16.7 0 25.2 0.686
## 3 0 0 0.437 2.93 6.23 228. 0 17.7 2.23
## 4 0 0 108. 288. 5.61 91.3 0 15.9 2.84
## 5 2.48 0 9.69 7.18 3.35 22.2 0 20.5 5.29
## 6 1.49 0 27.6 191. 2.44 2.25 0 24.3 0
## 7 1.57 0 26.0 160. 2.30 46.6 0 27.1 0.894
## 8 0 0 1.29 42.2 2.56 22.9 0 23.3 0
## 9 0 0 4.12 9.30 0 14.1 0 19.6 8.39
## 10 0 0 0.715 0 3.28 1.21 0 19.6 70.8
## # ℹ 93,806 more rows
## # ℹ 21 more variables: CD14 <dbl>, CD61 <dbl>, cells1 <dbl>, cells2 <dbl>,
## # beads4 <dbl>, beads5 <dbl>, CD19 <dbl>, CD4 <dbl>, CD8A <dbl>, CD16 <dbl>,
## # CD235 <dbl>, livedead <dbl>, CD20 <dbl>, CD66 <dbl>, CD45 <dbl>,
## # CD11C <dbl>, CD45RA <dbl>, CD38 <dbl>, HLADR <dbl>, CD56 <dbl>, Time <dbl>
exp <- asinh(exp/5)
as_tibble(exp)
## # A tibble: 93,816 × 30
## beads1 beads2 CD27 CD3 CD123 beads3 `Event #` Event_length CD33 CD14
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0 0 0.342 0.454 0.407 0 2.28 0.638 1.59
## 2 0 0 0 3.11 0.635 1.92 0 2.32 0.137 1.35
## 3 0 0 0.0872 0.557 1.05 4.51 0 1.98 0.432 1.08
## 4 0 0 3.76 4.75 0.965 3.60 0 1.87 0.542 0.446
## 5 0.478 0 1.42 1.16 0.627 2.19 0 2.12 0.922 1.88
## 6 0.294 0 2.41 4.34 0.470 0.435 0 2.28 0 0.0199
## 7 0.308 0 2.35 4.16 0.444 2.93 0 2.39 0.178 0.384
## 8 0 0 0.255 2.83 0.492 2.22 0 2.24 0 1.60
## 9 0 0 0.752 1.38 0 1.76 0 2.07 1.29 2.20
## 10 0 0 0.142 0 0.617 0.240 0 2.07 3.34 2.01
## # ℹ 93,806 more rows
## # ℹ 20 more variables: CD61 <dbl>, cells1 <dbl>, cells2 <dbl>, beads4 <dbl>,
## # beads5 <dbl>, CD19 <dbl>, CD4 <dbl>, CD8A <dbl>, CD16 <dbl>, CD235 <dbl>,
## # livedead <dbl>, CD20 <dbl>, CD66 <dbl>, CD45 <dbl>, CD11C <dbl>,
## # CD45RA <dbl>, CD38 <dbl>, HLADR <dbl>, CD56 <dbl>, Time <dbl>
flowCore::exprs(ff) <- exp
We get the surface markers of interest.
params <- ff@parameters@data$desc
# Markers to use
cd <- grep("CD", params)
ig <- grep("Ig", params)
hla <- grep("HLA", params)
ccr <- c(grep("CCR", params), grep("CXC", params))
surface <- c(cd, ig, hla, ccr)
surface
## [1] 3 4 5 9 10 11 16 17 18 19 20 22 23 24 25 26 27 29 28
Let’s sample it.
# Dev
# ff <- ff[sample(nrow(ff), 20000),]
And now to put it into Julia.
# We have to transpose the expression matrix for Julia to read it properly
out <- ff@exprs[,surface] %>% t()
out[1:5, 1:5]
## [,1] [,2] [,3] [,4] [,5]
## CD27 0.0000000 0.0000000 0.0872166 3.7627048 1.4154413
## CD3 0.3421098 3.1133225 0.5565164 4.7460117 1.1584713
## CD123 0.4537044 0.6348307 1.0454967 0.9651255 0.6274731
## CD33 0.6384434 0.1367997 0.4320906 0.5417258 0.9220192
## CD14 1.5862250 1.3474719 1.0755468 0.4458862 1.8761743
Ok, let’s get Julia up and running.
# Retrieve "out" from R
@rget out
## 19×93816 Matrix{Float64}:
## 0.0 0.0 0.0872166 … 0.368193 0.500623 0.0
## 0.34211 3.11332 0.556516 3.03859 0.596295 0.175523
## 0.453704 0.634831 1.0455 1.15862 0.752023 0.545209
## 0.638443 0.1368 0.432091 0.28775 1.26376 3.43226
## 1.58622 1.34747 1.07555 2.08884 1.37103 2.34549
## 0.198598 0.0704348 1.03031 … 1.39157 0.0 1.42908
## 4.1842 0.416731 0.0 0.0966332 0.452087 0.0
## 0.845268 1.21637 0.621818 1.35861 0.579154 1.25884
## 1.11282 4.45331 1.79687 4.92435 1.58902 0.00159929
## 2.32197 2.41456 4.24653 2.74927 3.05129 0.454894
## 1.15779 2.20775 1.68733 … 2.6857 2.01022 1.41758
## 3.60714 2.09793 2.4869 2.17349 1.10625 0.946835
## 1.00422 1.90886 2.29907 1.53119 1.3649 0.637822
## 5.52877 4.70376 5.48747 4.77042 4.7392 3.64652
## 0.0 0.710323 0.0662841 0.666844 4.28743 2.83439
## 5.22602 0.511767 6.05748 … 4.95054 5.2411 1.46861
## 3.53151 1.52569 0.397969 3.33941 0.320734 3.15861
## 0.520374 0.934533 2.98138 0.927598 0.0996453 0.317896
## 3.93252 0.988068 0.457855 1.65632 3.01901 1.70374
# Run UMAP, and also time it
@time begin
dimr = umap(out)
end
## 108.653283 seconds (41.50 M allocations: 4.073 GiB, 0.63% gc time, 4.97% compilation time)
## 2×93816 Matrix{Float64}:
## 10.5461 -4.83923 6.83231 -1.65526 … -1.40113 8.43417 4.00585
## 4.98304 -1.74717 -7.79605 9.8458 -2.83664 -13.4154 -15.5386
# Place the output back into R
@rput dimr
## 2×93816 Matrix{Float64}:
## 10.5461 -4.83923 6.83231 -1.65526 … -1.40113 8.43417 4.00585
## 4.98304 -1.74717 -7.79605 9.8458 -2.83664 -13.4154 -15.5386
Ok, now we get it back into our environment.
# Tidy the umap output from Julia
dimr_julia <- as_tibble(t(dimr))
names(dimr_julia) <- c("umap1", "umap2")
dimr_julia
## # A tibble: 93,816 × 2
## umap1 umap2
## <dbl> <dbl>
## 1 10.5 4.98
## 2 -4.84 -1.75
## 3 6.83 -7.80
## 4 -1.66 9.85
## 5 4.61 -7.54
## 6 -6.86 9.70
## 7 -6.87 9.51
## 8 -6.22 6.80
## 9 5.92 -7.63
## 10 6.53 -15.0
## # ℹ 93,806 more rows
library(umap)
# Run UMAP in R, and time it
start_time <- Sys.time()
dimr_r <- umap::umap(ff@exprs[,surface])$layout %>% as_tibble()
names(dimr_r) <- c("umap1", "umap2")
end_time <- Sys.time()
elapsed_time <- difftime(end_time, start_time, units = "secs")
cat("Elapsed time:", elapsed_time, "seconds\n")
## Elapsed time: 408.0728 seconds
dimr_r
## # A tibble: 93,816 × 2
## umap1 umap2
## <dbl> <dbl>
## 1 3.07 -11.4
## 2 -1.24 5.43
## 3 -5.74 -3.35
## 4 8.08 -1.40
## 5 -4.85 -5.28
## 6 8.83 3.81
## 7 8.87 3.89
## 8 7.40 5.28
## 9 -5.46 -4.13
## 10 -12.7 -6.45
## # ℹ 93,806 more rows
# The Julia UMAP
ggplot(dimr_julia, aes(x = umap1, y = umap2)) + geom_point()
# The R UMAP
ggplot(dimr_r, aes(x = umap1, y = umap2)) + geom_point()