The purpose of this project is to make synthetic data, and then make distance matrices using multiple distance metrics, and correlate them as we increase dimensionality of the synthetic data.
DistMatrix <- function(df, dist.measure, p = 2) {
    # A distance matrix wrapper that includes cosine distance
    # Args:
    #   df: data frame or tibble of interest
    #   dist.measure: the distance measure of interest between two vectors
    #   p: the p of the Lp norm, only relevant if minkowski is used 
    # Returns:
    #   result: a pairwise distance matrix for the rows of the data frame 
    #       used as input
    if(dist.measure == "cosine") {
        result <- lsa::cosine(t(df))
        result <- 1-result # To make it cosine distance rather than sim
    } else {
        result <- dist(x = df, method = dist.measure, p = p) %>% as.matrix %>% unname
    }
    return(result)
}
MakeSynthData <- function(dims, ncells = 1000, mean = 5, sd = 2, randomize_mean_and_sd = FALSE) {
    # Generates a synthetic data tibbel
    # Args:
    #   dims: the number of columns (features)
    #   ncells: the number of rows (cells)
    # Returns:
    #   result: a tibble of cells by features
    result <- lapply(1:dims, function(i) {
        if(randomize_mean_and_sd) {
            curr <- rnorm(n = ncells, mean = rnorm(1, 5, 10), sd = abs(rnorm(1, 2, 10)))
        } else {
            curr <- rnorm(n = ncells, mean, sd)
        }
        return(curr)
    })
    result <- do.call(cbind, result) %>% as_tibble()
    return(result)
}
DimTitration <- function(dims, to_compare = "cosine") {
    # A wrapper around the titration that I do in this markdown
    # Args:
    #   dims: a numeric vector of dimesnions you're going to test
    # Returns
    #   out: a tibble with columns for dimension tested and spearman correlation
    out <- lapply(dims, function(dim) {
        print(dim)
        dat <- MakeSynthData(dims = dim, ncells = 1000, mean = 5, sd = 2)
        d1 <- DistMatrix(dat, dist.measure = "euclidean")
        
        if(to_compare == "cosine") {
            d2 <- DistMatrix(dat, dist.measure = "cosine")
        } else if(to_compare == "manhattan") {
            d2 <- DistMatrix(dat, dist.measure = "manhattan")
        } else if (to_compare == "l3") {
            d2 <- DistMatrix(dat, dist.measure = "minkowski", p = 3)
        }
        
        result <- cor(c(d1), c(d2), method = "spearman")
  
        # Visualize
        print(qplot(c(d1), c(d2)) + ggtitle(paste(dim, "dimensions")))
        return(result)
    }) %>% unlist()
    names(out) <- dims
    out <- tibble(dims = as.numeric(names(out)), correlation = out)
    return(out)
}library(tidyverse)## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()library(lsa)## Loading required package: SnowballCdat <- MakeSynthData(dims = 30, ncells = 1000, mean = 5, sd = 2)## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.l2 <- DistMatrix(dat, dist.measure = "euclidean", p = 2)
cos <- DistMatrix(dat, dist.measure = "cosine", p = 2)
cor(c(l2), c(cos), method = "spearman")## [1] 0.9171096dims <- c(2, 5, 10, 20, 50, 100, 200, 500, 1000)And now for the experiment
dim_cor <- DimTitration(dims, to_compare = "cosine")## [1] 2## Warning: `qplot()` was deprecated in ggplot2 3.4.0.## [1] 5## [1] 10## [1] 20## [1] 50## [1] 100## [1] 200## [1] 500## [1] 1000dim_cor## # A tibble: 9 × 2
##    dims correlation
##   <dbl>       <dbl>
## 1     2       0.675
## 2     5       0.834
## 3    10       0.897
## 4    20       0.916
## 5    50       0.919
## 6   100       0.931
## 7   200       0.925
## 8   500       0.930
## 9  1000       0.923ggplot(dim_cor, aes(x = dims, y = correlation)) + geom_point()dim_cor <- DimTitration(dims, to_compare = "manhattan")## [1] 2## [1] 5## [1] 10## [1] 20## [1] 50## [1] 100## [1] 200## [1] 500## [1] 1000dim_cor## # A tibble: 9 × 2
##    dims correlation
##   <dbl>       <dbl>
## 1     2       0.982
## 2     5       0.956
## 3    10       0.945
## 4    20       0.939
## 5    50       0.933
## 6   100       0.932
## 7   200       0.930
## 8   500       0.930
## 9  1000       0.929ggplot(dim_cor, aes(x = dims, y = correlation)) + geom_point()dim_cor <- DimTitration(dims, to_compare = "l3")## [1] 2## [1] 5## [1] 10## [1] 20## [1] 50## [1] 100## [1] 200## [1] 500## [1] 1000dim_cor## # A tibble: 9 × 2
##    dims correlation
##   <dbl>       <dbl>
## 1     2       0.997
## 2     5       0.989
## 3    10       0.980
## 4    20       0.970
## 5    50       0.964
## 6   100       0.959
## 7   200       0.957
## 8   500       0.957
## 9  1000       0.956ggplot(dim_cor, aes(x = dims, y = correlation)) + geom_point()