library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.8
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Introduction

I have two sequences that I claim are each 10 coin flips. In both, half of the flips are heads. The first sequence is 1111100000, and the second is 1011010011. We’ve already touched on testing for biased coins by looking at the means and standard deviations of sums of Bernoulli processes. However, the first sequence doesn’t look like coin flips at all; it could very well come from a program that outputs 1 for the first half of the sequence and 0 for the second half. How can we test whether these are likely coin flips and not something else entirely? To start to address this question, we’re going to look at sequences within sequences. Doing so will lead us to the concept of convolutions, which are central to image processing as well as to a class of neural networks for image classification and analysis.
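
To make these concrete, here is one way to write the two sequences as numeric vectors in R (flip_seq_1 and flip_seq_2 are illustrative names introduced here for this post's small sanity checks); their overall means are identical, so that summary statistic alone cannot tell them apart.

# The two claimed coin-flip sequences from above (illustrative variable names)
flip_seq_1 <- c(1, 1, 1, 1, 1, 0, 0, 0, 0, 0) # "1111100000"
flip_seq_2 <- c(1, 0, 1, 1, 0, 1, 0, 0, 1, 1) # "1011010011"

# Both are exactly half heads, so the mean cannot distinguish them
mean(flip_seq_1)
mean(flip_seq_2)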

2-mer sum frequencies

We will now do 100 coin flips, 100,000 times, as we have done before. Recall that the fraction of heads per sequence followed a binomial-based distribution with mean 0.5 and standard deviation 0.05 (for a fair coin, the standard deviation of the proportion of heads is sqrt(0.5 * 0.5 / 100) = 0.05). To review, we build the list of flip sequences and plot the distribution below.

# Binomial distribution
n_sessions <- 100000

seq(n_sessions)[1:10] # in case you haven't seen this command before
##  [1]  1  2  3  4  5  6  7  8  9 10
# Each list element is one session of 100 fair coin flips (0 = tails, 1 = heads)
flip_list <- lapply(seq(n_sessions), function(i) {
    result <- sample(x = c(0, 1), size = 100, replace = TRUE)
    return(result)
}) 

# Fraction of heads in each session
num_heads <- lapply(flip_list, function(i) {
    return(sum(i)/length(i))
}) %>% unlist()

# Example of output
num_heads[1:5]
## [1] 0.54 0.50 0.54 0.54 0.41
# Histogram
hist(num_heads, breaks = 50, xlim = c(0, 1))

# Basic statistics
mean(num_heads)
## [1] 0.4997848
sd(num_heads)
## [1] 0.05016408

We will now perform a basic convolution on each sequence: we slide a window of size 2 along the sequence, and at each step of the slide we compute the sum of the two elements in the window. The sum is 0 for 00, 1 for 01 or 10, and 2 for 11. We will then plot the frequency of each sum across sessions. In convolution terminology, the sliding window of weights is called a kernel; here the kernel is (1, 1).

#' @title Convolve
#' @description Perform a 1-D convolution on a given sequence, specifically with a kernel of (1, 1), which means that for each window of size 2 on the sequence, return the sum of the two elements in the window. 
#' @param sequence A numeric vector of interest
#' @return The tabulation of the vector of sums
Convolve <- function(sequence) {
    result <- lapply(seq(2, length(sequence)), function(i) {
        window_sum <- sum(sequence[i], sequence[i - 1])
        return(window_sum)
    }) %>% unlist() %>% table()
    return(result)
}
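
As a quick sanity check, we can apply Convolve() to the two 10-flip sequences from the introduction (the illustrative flip_seq_1 and flip_seq_2 vectors), and cross-check against base R's stats::filter(), which computes the same sliding sums when given the kernel c(1, 1) with sides = 1.

# Window sums of the two introductory sequences
Convolve(flip_seq_1) # "1111100000": mostly sums of 0 and 2, with a single 1 at the boundary
Convolve(flip_seq_2) # "1011010011": dominated by sums of 1, as expected when the flips alternate often

# Cross-check with base R: stats::filter() (the stats:: prefix matters, since dplyr masks
# filter()) with kernel c(1, 1) and sides = 1 returns x[i] + x[i - 1]; the first element
# is NA, so we drop it before tabulating
table(stats::filter(flip_seq_2, c(1, 1), sides = 1)[-1])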

conv_result <- lapply(flip_list, function(i) {
    Convolve(i)
}) %>% bind_rows()

# Plot a histogram of each sum's per-session frequency across all sessions
count <- 0
invisible(apply(conv_result, 2, function(i) {
    count <<- count + 1
    col_name <- names(conv_result)[count]
    hist(i/100, breaks = 50, main = paste("Frequency of sum", col_name))
}))

# Summarize each sum's normalized frequency across sessions
count <- 0
conv_summary <- apply(conv_result, 2, function(i) {
    count <<- count + 1
    col_name <- names(conv_result)[count]
    i <- i/100 # normalize
    result <- summary(i) %>% as.list()
    result$sd <- sd(i)
    result <- c(sums_to = col_name, result)
    return(result)
}) %>% bind_rows()

conv_summary
## # A tibble: 3 × 8
##   sums_to  Min. `1st Qu.` Median  Mean `3rd Qu.`  Max.     sd
##   <chr>   <dbl>     <dbl>  <dbl> <dbl>     <dbl> <dbl>  <dbl>
## 1 0        0.06      0.21   0.25 0.248      0.28  0.5  0.0557
## 2 1        0.3       0.46   0.5  0.495      0.53  0.73 0.0499
## 3 2        0.05      0.21   0.25 0.247      0.28  0.52 0.0557

The frequency of summing to 1 is approximately binomial, with a mean of 0.5 and a standard deviation of 0.05, matching the distribution of the fraction of heads in the coin tosses themselves. The frequencies of summing to 0 and to 2 each have a mean of about 0.25. Their standard deviations are both a little higher, at about 0.056; adjacent windows overlap by one flip, so neighboring sums of 0 (or of 2) are positively correlated, which inflates the spread slightly.
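
These means match what we would expect from enumerating the four equally likely 2-mers produced by a fair coin; here is a quick sketch of that check.

# The four equally likely windows of two fair flips, and the expected frequency of each sum
two_mers <- expand.grid(first = c(0, 1), second = c(0, 1))
table(two_mers$first + two_mers$second) / nrow(two_mers) # 0 -> 0.25, 1 -> 0.5, 2 -> 0.25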

Now, we’re going to generate a sequence of 0s and 1s of length 100 from a random walk, rather than from coin tosses. The walker starts at 0 and steps either up or down by 1 at each step. We then set every position where the walker is above zero to 1, and every other position (zero or below) to 0. This generates a sequence that flips between 0 and 1 far less often than coin tosses do. We will then convolve it and compare the frequency of each sum to the distributions we generated in the previous block of code.

set.seed(1)
rw_operation <- sample(c(-1, 1), 100, replace = TRUE) # step direction at each time point
walker <- c()
curr_pos <- 0
for(i in rw_operation) {
    curr_pos <- curr_pos + i
    walker <- c(walker, curr_pos)
}
plot(walker)

walker <- ifelse(walker > 0, 1, 0) # binarize: 1 where the walker is above zero, 0 otherwise
plot(walker)

Now we run the convolution on this sequence and see where its counts of sums of 0, 1, and 2 fall within the respective fair-coin distributions, in terms of standard deviations from the mean.

walker_conv <- Convolve(walker)
walker_conv
## .
##  0  1  2 
## 63  1 35
#' @title Get Member Z-score
#' @description Given a vector of numbers, and an element of the vector as input, returns the z score for the given element of the vector. If the element is not in the vector, it gets added to the vector prior to computation.
#' @param member The element of the vector to be queried
#' @param group Numeric vector of interest
#' @return A double corresponding to the Z-score of the member of the group
GetMemberZscore <- function(member, group) {
    if(!(member %in% group)) {
        group <- c(member, group)
    }
    group <- sort(group)
    member_index <- which(group %in% member)
    group <- scale(group) %>% as.numeric()
    result <- group[member_index][1]
    return(result)
}
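
As an aside, scale() is just the usual standardization, so the same z-score can be computed directly from the group's mean and standard deviation. Here is a minimal sketch of that equivalent computation (GetMemberZscoreDirect is a hypothetical name, not used elsewhere).

# Hypothetical direct version of GetMemberZscore, shown only for intuition
GetMemberZscoreDirect <- function(member, group) {
    if(!(member %in% group)) {
        group <- c(member, group)
    }
    # scale() computes (x - mean(x)) / sd(x), so this should agree with GetMemberZscore
    return((member - mean(group)) / sd(group))
}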

# Control
fair_coin_ctl <- sample(x = c(0, 1), size = 100, replace = TRUE) %>% Convolve()
GetMemberZscore(member = fair_coin_ctl[1], group = conv_result$`0`)
## [1] 0.5794376
GetMemberZscore(member = fair_coin_ctl[2], group = conv_result$`1`)
## [1] -0.09911119
GetMemberZscore(member = fair_coin_ctl[3], group = conv_result$`2`)
## [1] -0.4905525
# Exp
GetMemberZscore(member = walker_conv[1], group = conv_result$`0`)
## [1] 6.862749
GetMemberZscore(member = walker_conv[2], group = conv_result$`1`)
## [1] -9.704108
GetMemberZscore(member = walker_conv[3], group = conv_result$`2`)
## [1] 1.843924

Our control's z-scores are each well within one standard deviation of the mean, consistent with the control series being random. For the random walk series (not generated from coin tosses), the counts of sums of 0 and 1 land several standard deviations away from the means of our fair-coin distributions (roughly +6.9 and -9.7), while the count of sums of 2 is just under two standard deviations away. As a rule of thumb, if we assume the counts are approximately normally distributed, a p-value of 0.05 corresponds to a z-score of +/- 1.96, which we will take as the threshold for statistical significance. We only need one of the counts to be statistically significant to suspect that the series is not random, and here we have two. Note that we can use this same convolution strategy to test for coin fairness as well, which we do below.
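
As a side note, the 1.96 cutoff and the 0.05 p-value can be converted back and forth with the standard normal distribution functions; a quick sketch:

# Two-sided p-value for a z-score of 1.96, and the z cutoff for a p-value of 0.05
2 * pnorm(-abs(1.96)) # roughly 0.05
qnorm(1 - 0.05 / 2)   # roughly 1.96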

# Control
fair_coin_ctl <- sample(x = c(0, 1), size = 100, replace = TRUE) %>% Convolve()
GetMemberZscore(member = fair_coin_ctl[1], group = conv_result$`0`)
## [1] -0.6775548
GetMemberZscore(member = fair_coin_ctl[2], group = conv_result$`1`)
## [1] -0.6997123
GetMemberZscore(member = fair_coin_ctl[3], group = conv_result$`2`)
## [1] 1.305199
# Exp: unfair coin
unfair_coin <- sample(x = c(0, 1), size = 100, replace = TRUE, prob = c(0.4, 0.6)) %>% Convolve()
GetMemberZscore(member = unfair_coin[1], group = conv_result$`0`)
## [1] -2.293688
GetMemberZscore(member = unfair_coin[2], group = conv_result$`1`)
## [1] -0.09911119
GetMemberZscore(member = unfair_coin[3], group = conv_result$`2`)
## [1] 2.382649

For the biased coin, the count of sums of 1 is not statistically significant compared to fair coin tosses. However, the counts of sums of 0 and 2 both exceed the 1.96 threshold in absolute value (about -2.3 and +2.4), showing statistical significance. We can therefore conclude that this sequence is unlikely to have been drawn from fair coin tosses.

Conclusions

By looking at the details of a sequence, window by window, we can make real headway in determining whether it is random. We built the fair-coin reference distributions for the post-convolution frequencies of sums of 0, 1, and 2 using a 2-element sliding window that sums its contents (a convolution with kernel (1, 1)). We then ran the same procedure on our sequences of interest (a fair-coin control, a random-walk-generated sequence, and a biased coin) to show that this rather simple method is quite effective at determining whether a mystery sequence comes from a specified random process. Testing for randomness is a rich branch of statistics, and there are many ways to do it. Hopefully, this piece has given you some intuition for how to test for randomness, enough that you might find or even invent randomness tests of your own.