Home

Here, we’re going to load a CyTOF fcs file, and we’re going to test the Julia implementation of UMAP to that of R, and see which is faster.

Let’s initialize Julia.

library(JuliaConnectoR)
library(JuliaCall)
julia_setup(JULIA_HOME = "/Applications/Julia-1.8.app/Contents/Resources/julia/bin/")
julia_command("using Pkg; Pkg.add(\"RCall\")")
julia_command("using RCall")
julia_command("using Pkg; Pkg.add(\"DataFrames\")")
julia_command("using DataFrames")
julia_command("using Pkg; Pkg.add(\"UMAP\")")
julia_command("using UMAP")
julia_command("using LinearAlgebra")

Pre-processing

And now let’s get the fcs files imported.

library(flowCore)
library(tidyverse)
set.seed(1)
setwd(here::here("data", 'mike_LISLinnegCD45pos-101917-relabeled-CORRECT'))

ff <- flowCore::read.FCS('Center_1_A_1_MATLAB_LiveIntactSing_Linneg_CD45pos.fcs')
ff
## flowFrame object 'Center_1_A_1_MATLAB_LiveIntactSing_Linneg_CD45pos.fcs'
## with 93816 cells and 30 observables:
##         name   desc     range  minRange  maxRange
## $P1  Ce140Di beads1      2911         0         7
## $P2  Ce142Di beads2      1509         0         1
## $P3  Er167Di   CD27       781         0        17
## $P4  Er170Di    CD3      1664         0        27
## $P5  Eu151Di  CD123      2814         0        16
## ...      ...    ...       ...       ...       ...
## $P26 Tm169Di CD45RA     12000         0        23
## $P27 Yb172Di   CD38      5629         0        18
## $P28 Yb174Di  HLADR      6990         0        11
## $P29 Yb176Di   CD56      1349         0        12
## $P30    Time   Time         0         0         1
## 261 keywords are stored in the 'description' slot

Let’s tidy the data.

mass_tags <- ff@parameters@data$name
params <- ff@parameters@data$desc

# All NA values in marker names will revert to mass tags names
params <- ifelse(is.na(params), mass_tags, params)
colnames(ff) <- params

We do our asinh transform.

exp <- flowCore::exprs(ff)
as_tibble(exp)
## # A tibble: 93,816 × 30
##    beads1 beads2    CD27    CD3 CD123 beads3 `Event #` Event_length   CD33
##     <dbl>  <dbl>   <dbl>  <dbl> <dbl>  <dbl>     <dbl>        <dbl>  <dbl>
##  1   0         0   0       1.74  2.35   2.09         0         24.3  3.41 
##  2   0         0   0      56.1   3.39  16.7          0         25.2  0.686
##  3   0         0   0.437   2.93  6.23 228.           0         17.7  2.23 
##  4   0         0 108.    288.    5.61  91.3          0         15.9  2.84 
##  5   2.48      0   9.69    7.18  3.35  22.2          0         20.5  5.29 
##  6   1.49      0  27.6   191.    2.44   2.25         0         24.3  0    
##  7   1.57      0  26.0   160.    2.30  46.6          0         27.1  0.894
##  8   0         0   1.29   42.2   2.56  22.9          0         23.3  0    
##  9   0         0   4.12    9.30  0     14.1          0         19.6  8.39 
## 10   0         0   0.715   0     3.28   1.21         0         19.6 70.8  
## # ℹ 93,806 more rows
## # ℹ 21 more variables: CD14 <dbl>, CD61 <dbl>, cells1 <dbl>, cells2 <dbl>,
## #   beads4 <dbl>, beads5 <dbl>, CD19 <dbl>, CD4 <dbl>, CD8A <dbl>, CD16 <dbl>,
## #   CD235 <dbl>, livedead <dbl>, CD20 <dbl>, CD66 <dbl>, CD45 <dbl>,
## #   CD11C <dbl>, CD45RA <dbl>, CD38 <dbl>, HLADR <dbl>, CD56 <dbl>, Time <dbl>
exp <- asinh(exp/5)
as_tibble(exp)
## # A tibble: 93,816 × 30
##    beads1 beads2   CD27   CD3 CD123 beads3 `Event #` Event_length  CD33   CD14
##     <dbl>  <dbl>  <dbl> <dbl> <dbl>  <dbl>     <dbl>        <dbl> <dbl>  <dbl>
##  1  0          0 0      0.342 0.454  0.407         0         2.28 0.638 1.59  
##  2  0          0 0      3.11  0.635  1.92          0         2.32 0.137 1.35  
##  3  0          0 0.0872 0.557 1.05   4.51          0         1.98 0.432 1.08  
##  4  0          0 3.76   4.75  0.965  3.60          0         1.87 0.542 0.446 
##  5  0.478      0 1.42   1.16  0.627  2.19          0         2.12 0.922 1.88  
##  6  0.294      0 2.41   4.34  0.470  0.435         0         2.28 0     0.0199
##  7  0.308      0 2.35   4.16  0.444  2.93          0         2.39 0.178 0.384 
##  8  0          0 0.255  2.83  0.492  2.22          0         2.24 0     1.60  
##  9  0          0 0.752  1.38  0      1.76          0         2.07 1.29  2.20  
## 10  0          0 0.142  0     0.617  0.240         0         2.07 3.34  2.01  
## # ℹ 93,806 more rows
## # ℹ 20 more variables: CD61 <dbl>, cells1 <dbl>, cells2 <dbl>, beads4 <dbl>,
## #   beads5 <dbl>, CD19 <dbl>, CD4 <dbl>, CD8A <dbl>, CD16 <dbl>, CD235 <dbl>,
## #   livedead <dbl>, CD20 <dbl>, CD66 <dbl>, CD45 <dbl>, CD11C <dbl>,
## #   CD45RA <dbl>, CD38 <dbl>, HLADR <dbl>, CD56 <dbl>, Time <dbl>
flowCore::exprs(ff) <- exp

We get the surface markers of interest.

params <- ff@parameters@data$desc

# Markers to use
cd <- grep("CD", params)
ig <- grep("Ig", params)
hla <- grep("HLA", params)
ccr <- c(grep("CCR", params), grep("CXC", params))

surface <- c(cd, ig, hla, ccr)
surface
##  [1]  3  4  5  9 10 11 16 17 18 19 20 22 23 24 25 26 27 29 28

Let’s sample it.

# Dev
# ff <- ff[sample(nrow(ff), 20000),]

UMAP

Julia

And now to put it into Julia.

# We have to transpose the expression matrix for Julia to read it properly
out <- ff@exprs[,surface] %>% t() 
out[1:5, 1:5]
##            [,1]      [,2]      [,3]      [,4]      [,5]
## CD27  0.0000000 0.0000000 0.0872166 3.7627048 1.4154413
## CD3   0.3421098 3.1133225 0.5565164 4.7460117 1.1584713
## CD123 0.4537044 0.6348307 1.0454967 0.9651255 0.6274731
## CD33  0.6384434 0.1367997 0.4320906 0.5417258 0.9220192
## CD14  1.5862250 1.3474719 1.0755468 0.4458862 1.8761743

Ok, let’s get Julia up and running.

# Retrieve "out" from R
@rget out
## 19×93816 Matrix{Float64}:
##  0.0       0.0        0.0872166  …  0.368193   0.500623   0.0
##  0.34211   3.11332    0.556516      3.03859    0.596295   0.175523
##  0.453704  0.634831   1.0455        1.15862    0.752023   0.545209
##  0.638443  0.1368     0.432091      0.28775    1.26376    3.43226
##  1.58622   1.34747    1.07555       2.08884    1.37103    2.34549
##  0.198598  0.0704348  1.03031    …  1.39157    0.0        1.42908
##  4.1842    0.416731   0.0           0.0966332  0.452087   0.0
##  0.845268  1.21637    0.621818      1.35861    0.579154   1.25884
##  1.11282   4.45331    1.79687       4.92435    1.58902    0.00159929
##  2.32197   2.41456    4.24653       2.74927    3.05129    0.454894
##  1.15779   2.20775    1.68733    …  2.6857     2.01022    1.41758
##  3.60714   2.09793    2.4869        2.17349    1.10625    0.946835
##  1.00422   1.90886    2.29907       1.53119    1.3649     0.637822
##  5.52877   4.70376    5.48747       4.77042    4.7392     3.64652
##  0.0       0.710323   0.0662841     0.666844   4.28743    2.83439
##  5.22602   0.511767   6.05748    …  4.95054    5.2411     1.46861
##  3.53151   1.52569    0.397969      3.33941    0.320734   3.15861
##  0.520374  0.934533   2.98138       0.927598   0.0996453  0.317896
##  3.93252   0.988068   0.457855      1.65632    3.01901    1.70374

# Run UMAP, and also time it
@time begin
    dimr = umap(out)
end
## 108.653283 seconds (41.50 M allocations: 4.073 GiB, 0.63% gc time, 4.97% compilation time)
## 2×93816 Matrix{Float64}:
##  10.5461   -4.83923   6.83231  -1.65526  …  -1.40113    8.43417    4.00585
##   4.98304  -1.74717  -7.79605   9.8458      -2.83664  -13.4154   -15.5386

# Place the output back into R
@rput dimr
## 2×93816 Matrix{Float64}:
##  10.5461   -4.83923   6.83231  -1.65526  …  -1.40113    8.43417    4.00585
##   4.98304  -1.74717  -7.79605   9.8458      -2.83664  -13.4154   -15.5386

Ok, now we get it back into our environment.

# Tidy the umap output from Julia
dimr_julia <- as_tibble(t(dimr))
names(dimr_julia) <- c("umap1", "umap2")
dimr_julia
## # A tibble: 93,816 × 2
##    umap1  umap2
##    <dbl>  <dbl>
##  1 10.5    4.98
##  2 -4.84  -1.75
##  3  6.83  -7.80
##  4 -1.66   9.85
##  5  4.61  -7.54
##  6 -6.86   9.70
##  7 -6.87   9.51
##  8 -6.22   6.80
##  9  5.92  -7.63
## 10  6.53 -15.0 
## # ℹ 93,806 more rows

R

library(umap)

# Run UMAP in R, and time it
start_time <- Sys.time()
dimr_r <- umap::umap(ff@exprs[,surface])$layout %>% as_tibble()
names(dimr_r) <- c("umap1", "umap2")
end_time <- Sys.time()

elapsed_time <- difftime(end_time, start_time, units = "secs")
cat("Elapsed time:", elapsed_time, "seconds\n")
## Elapsed time: 408.0728 seconds
dimr_r
## # A tibble: 93,816 × 2
##     umap1  umap2
##     <dbl>  <dbl>
##  1   3.07 -11.4 
##  2  -1.24   5.43
##  3  -5.74  -3.35
##  4   8.08  -1.40
##  5  -4.85  -5.28
##  6   8.83   3.81
##  7   8.87   3.89
##  8   7.40   5.28
##  9  -5.46  -4.13
## 10 -12.7   -6.45
## # ℹ 93,806 more rows

Plotting

# The Julia UMAP
ggplot(dimr_julia, aes(x = umap1, y = umap2)) + geom_point()

# The R UMAP
ggplot(dimr_r, aes(x = umap1, y = umap2)) + geom_point()