The purpose of this project is to make synthetic data, and then make distance matrices using multiple distance metrics, and correlate them as we increase dimensionality of the synthetic data.

DistMatrix <- function(df, dist.measure, p = 2) {
    # A distance matrix wrapper that includes cosine distance
    # Args:
    #   df: data frame or tibble of interest
    #   dist.measure: the distance measure of interest between two vectors
    #   p: the p of the Lp norm, only relevant if minkowski is used 
    # Returns:
    #   result: a pairwise distance matrix for the rows of the data frame 
    #       used as input
    if(dist.measure == "cosine") {
        result <- lsa::cosine(t(df))
        result <- 1-result # To make it cosine distance rather than sim
    } else {
        result <- dist(x = df, method = dist.measure, p = p) %>% as.matrix %>% unname
    }
    return(result)
}


MakeSynthData <- function(dims, ncells = 1000, mean = 5, sd = 2, randomize_mean_and_sd = FALSE) {
    # Generates a synthetic data tibbel
    # Args:
    #   dims: the number of columns (features)
    #   ncells: the number of rows (cells)
    # Returns:
    #   result: a tibble of cells by features
    result <- lapply(1:dims, function(i) {
        if(randomize_mean_and_sd) {
            curr <- rnorm(n = ncells, mean = rnorm(1, 5, 10), sd = abs(rnorm(1, 2, 10)))
        } else {
            curr <- rnorm(n = ncells, mean, sd)
        }
        return(curr)
    })
    result <- do.call(cbind, result) %>% as_tibble()
    return(result)
}

DimTitration <- function(dims, to_compare = "cosine") {
    # A wrapper around the titration that I do in this markdown
    # Args:
    #   dims: a numeric vector of dimesnions you're going to test
    # Returns
    #   out: a tibble with columns for dimension tested and spearman correlation
    out <- lapply(dims, function(dim) {
        print(dim)
        dat <- MakeSynthData(dims = dim, ncells = 1000, mean = 5, sd = 2)
        d1 <- DistMatrix(dat, dist.measure = "euclidean")
        
        if(to_compare == "cosine") {
            d2 <- DistMatrix(dat, dist.measure = "cosine")
        } else if(to_compare == "manhattan") {
            d2 <- DistMatrix(dat, dist.measure = "manhattan")
        } else if (to_compare == "l3") {
            d2 <- DistMatrix(dat, dist.measure = "minkowski", p = 3)
        }
        
        result <- cor(c(d1), c(d2), method = "spearman")
  
        # Visualize
        print(qplot(c(d1), c(d2)) + ggtitle(paste(dim, "dimensions")))
        return(result)
    }) %>% unlist()

    names(out) <- dims
    out <- tibble(dims = as.numeric(names(out)), correlation = out)
    return(out)
}
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(lsa)
## Loading required package: SnowballC
dat <- MakeSynthData(dims = 30, ncells = 1000, mean = 5, sd = 2)
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
l2 <- DistMatrix(dat, dist.measure = "euclidean", p = 2)
cos <- DistMatrix(dat, dist.measure = "cosine", p = 2)
cor(c(l2), c(cos), method = "spearman")
## [1] 0.9171096
dims <- c(2, 5, 10, 20, 50, 100, 200, 500, 1000)

And now for the experiment

dim_cor <- DimTitration(dims, to_compare = "cosine")
## [1] 2
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.

## [1] 5

## [1] 10

## [1] 20

## [1] 50

## [1] 100

## [1] 200

## [1] 500

## [1] 1000

dim_cor
## # A tibble: 9 × 2
##    dims correlation
##   <dbl>       <dbl>
## 1     2       0.675
## 2     5       0.834
## 3    10       0.897
## 4    20       0.916
## 5    50       0.919
## 6   100       0.931
## 7   200       0.925
## 8   500       0.930
## 9  1000       0.923
ggplot(dim_cor, aes(x = dims, y = correlation)) + geom_point()

dim_cor <- DimTitration(dims, to_compare = "manhattan")
## [1] 2

## [1] 5

## [1] 10

## [1] 20

## [1] 50

## [1] 100

## [1] 200

## [1] 500

## [1] 1000

dim_cor
## # A tibble: 9 × 2
##    dims correlation
##   <dbl>       <dbl>
## 1     2       0.982
## 2     5       0.956
## 3    10       0.945
## 4    20       0.939
## 5    50       0.933
## 6   100       0.932
## 7   200       0.930
## 8   500       0.930
## 9  1000       0.929
ggplot(dim_cor, aes(x = dims, y = correlation)) + geom_point()

dim_cor <- DimTitration(dims, to_compare = "l3")
## [1] 2

## [1] 5

## [1] 10

## [1] 20

## [1] 50

## [1] 100

## [1] 200

## [1] 500

## [1] 1000

dim_cor
## # A tibble: 9 × 2
##    dims correlation
##   <dbl>       <dbl>
## 1     2       0.997
## 2     5       0.989
## 3    10       0.980
## 4    20       0.970
## 5    50       0.964
## 6   100       0.959
## 7   200       0.957
## 8   500       0.957
## 9  1000       0.956
ggplot(dim_cor, aes(x = dims, y = correlation)) + geom_point()