library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.1     ✓ dplyr   1.0.5
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Introduction

This markdown will perform an abstraction of the coin toss, where rather than sampling randomly from 0 and 1 (heads and tails), we will sample from more than two numbers. The purpose of this is to determine how the binomial distribution we discovered in previous markdowns changes as we add more possible outcomes per trial.

Binary sums

We will first make our binomial distribution with coin tosses, but we will adjust the sampling function so we draw not from 0 and 1, but from 1 and 2, or 2 and 3, etc. We will see if our distribution changes accordingly. This can give us some intuition around what will happen when we begin rolling 3 or more sided dice.

As a review, below is the code to make the biomial distribution from coin toss sums, where heads is 1 and tails is 0.

# Binomial distribution
n_sessions <- 100000

seq(n_sessions)[1:10] # in case you haven't seen this command before
##  [1]  1  2  3  4  5  6  7  8  9 10
num_heads <- lapply(seq(n_sessions), function(i) {
    result <- sample(x = c(0, 1), size = 100, replace = TRUE)
    result <- sum(result)/length(result)
    return(result)
}) %>% unlist()

# Example of output
num_heads[1:5]
## [1] 0.47 0.48 0.58 0.53 0.53
# Histogram
hist(num_heads, breaks = 50, xlim = c(0, 1))

# Basic statistics
mean(num_heads)
## [1] 0.4999534
sd(num_heads)
## [1] 0.04994634

We now perform the same procedure but with heads as 2 and tails as 1.

n_sessions <- 100000

seq(n_sessions)[1:10] # in case you haven't seen this command before
##  [1]  1  2  3  4  5  6  7  8  9 10
num_heads <- lapply(seq(n_sessions), function(i) {
    result <- sample(x = c(1, 2), size = 100, replace = TRUE)
    result <- sum(result)/length(result)
    return(result)
}) %>% unlist()

# Example of output
num_heads[1:5]
## [1] 1.44 1.44 1.59 1.51 1.47
# Histogram
hist(num_heads, breaks = 50, xlim = c(1, 2))

# Basic statistics
mean(num_heads)
## [1] 1.500009
sd(num_heads)
## [1] 0.04999888

We get another bimomial distribution with a mean of 1.5 and a standard deviation of 0.5. In other words, the same distribution was shifted up by one. Now, we’re going to change our coin toss into a three sided die. We will sample from 1, 2, and 3, and see how the distribution changes.

n_sessions <- 100000

seq(n_sessions)[1:10] # in case you haven't seen this command before
##  [1]  1  2  3  4  5  6  7  8  9 10
num_heads <- lapply(seq(n_sessions), function(i) {
    result <- sample(x = c(1, 2, 3), size = 100, replace = TRUE)
    result <- sum(result)/length(result)
    return(result)
}) %>% unlist()

# Example of output
num_heads[1:5]
## [1] 2.05 1.92 1.98 2.05 1.98
# Histogram
hist(num_heads, breaks = 50, xlim = c(1, 3))

# Basic statistics
mean(num_heads)
## [1] 1.999645
sd(num_heads)
## [1] 0.08175727

We get another binomial distribution with a mean of 2 (the middle value of the die) and a standard deviation a little bit higher than we previously had, at 0.08 instead of 0.05. Let’s take this a step further, and make our distributions for all dice between two (coin) and 10 sides. We can then plot the mean and standard deviation of the distributions and see if we can derive any relationship between them and the number of sides on the dice.

n_sessions <- 100000
sides <-  c(seq(2, 10), 50, 100)

dice_sums <- lapply(sides, function(s) {
    result_dist <- lapply(seq(n_sessions), function(i) {
        result <- sample(x = seq(1, s),
                         size = 100,
                         replace = TRUE)
        result <- sum(result) / length(result)
        return(result)
    }) %>% unlist()
    return(result_dist)
})
names(dice_sums) <- sides

dice_stats <- lapply(names(dice_sums), function(i) {
    curr <- dice_sums[[i]]
    result <- list(num_sides = as.numeric(i), mean = mean(curr), sd = sd(curr))
    return(result)
}) %>% bind_rows()

print(dice_stats, n = nrow(dice_stats))
## # A tibble: 11 x 3
##    num_sides  mean     sd
##        <dbl> <dbl>  <dbl>
##  1         2  1.50 0.0499
##  2         3  2.00 0.0815
##  3         4  2.50 0.112 
##  4         5  3.00 0.141 
##  5         6  3.50 0.171 
##  6         7  4.00 0.201 
##  7         8  4.50 0.229 
##  8         9  5.00 0.259 
##  9        10  5.50 0.287 
## 10        50 25.5  1.45  
## 11       100 50.5  2.88
# Plot
qplot(dice_stats$num_sides, dice_stats$mean)

qplot(dice_stats$num_sides, dice_stats$sd)

# Check for linearity: what's the slope at each point? 
slope <- lapply(seq(1, nrow(dice_stats)), function(i) {
    if(i == 1) {
        return(list(mean_slope = 0, sd_slope = 0))
    }
    num_sides_change <- dice_stats[i,]$num_sides - dice_stats[i - 1,]$num_sides
    mean_change <- dice_stats[i,]$mean - dice_stats[i - 1,]$mean
    sd_change<- dice_stats[i,]$sd - dice_stats[i - 1,]$sd
    
    result <- list(mean_slope = mean_change/num_sides_change, 
                   sd_slope = sd_change/num_sides_change)
    return(result)
}) %>% bind_rows()

dice_stats <- bind_cols(dice_stats, slope)
print(dice_stats, n = nrow(dice_stats))
## # A tibble: 11 x 5
##    num_sides  mean     sd mean_slope sd_slope
##        <dbl> <dbl>  <dbl>      <dbl>    <dbl>
##  1         2  1.50 0.0499      0       0     
##  2         3  2.00 0.0815      0.500   0.0316
##  3         4  2.50 0.112       0.501   0.0305
##  4         5  3.00 0.141       0.499   0.0295
##  5         6  3.50 0.171       0.500   0.0294
##  6         7  4.00 0.201       0.500   0.0297
##  7         8  4.50 0.229       0.500   0.0280
##  8         9  5.00 0.259       0.501   0.0302
##  9        10  5.50 0.287       0.498   0.0284
## 10        50 25.5  1.45        0.500   0.0290
## 11       100 50.5  2.88        0.500   0.0286

We can see that the mean increases linearly as the number of sides go up. The mean is simply the mean of the sequence. In other words, the mean of the 4 sided die is 2.5, which is also the mean of (1, 2, 3, 4). The standard deviation is almost linear, going up by a little less than 0.3 each time, though as we show, the slope decreases little by little as the number of sides go up. To get a better feel for the standard deviation of these distributions, let’s look at the standard deviation of the sides of the dice themselves and see how it compares to the standard deviation of the distritbutions related to the number of sides.

sd_sides <- lapply(sides, function(i) {
    curr <- seq(1, i)
    result <- sd(curr)
    return(result)
}) %>% unlist()

dice_stats$sd_sides <- sd_sides

sd_sides_slope <- lapply(seq(length(sd_sides)), function(i) {
    if(i == 1) {
        return(0)
    }
    sd_change <- sd_sides[i] - sd_sides[i - 1]
    num_sides_change <- dice_stats[i,]$num_sides - dice_stats[i - 1,]$num_sides
    return(sd_change/num_sides_change)
}) %>% unlist()

dice_stats$sd_sides_slope <- sd_sides_slope

print(dice_stats, n = nrow(dice_stats))
## # A tibble: 11 x 7
##    num_sides  mean     sd mean_slope sd_slope sd_sides sd_sides_slope
##        <dbl> <dbl>  <dbl>      <dbl>    <dbl>    <dbl>          <dbl>
##  1         2  1.50 0.0499      0       0         0.707          0    
##  2         3  2.00 0.0815      0.500   0.0316    1              0.293
##  3         4  2.50 0.112       0.501   0.0305    1.29           0.291
##  4         5  3.00 0.141       0.499   0.0295    1.58           0.290
##  5         6  3.50 0.171       0.500   0.0294    1.87           0.290
##  6         7  4.00 0.201       0.500   0.0297    2.16           0.289
##  7         8  4.50 0.229       0.500   0.0280    2.45           0.289
##  8         9  5.00 0.259       0.501   0.0302    2.74           0.289
##  9        10  5.50 0.287       0.498   0.0284    3.03           0.289
## 10        50 25.5  1.45        0.500   0.0290   14.6            0.289
## 11       100 50.5  2.88        0.500   0.0286   29.0            0.289
dice_stats$sd_sides/dice_stats$sd
##  [1] 14.17584 12.26673 11.52544 11.17518 10.94515 10.77011 10.71461 10.58334
##  [9] 10.54393 10.07574 10.07894

The relationship is almost linear, where if you take the much simpler operation of finding the standard deviation of the number of sides on the dice, and you divide by 10, and take away 0.02, you get very close to the standard deviation of the distribution. Notice also that the slope is almost linear but slowly decreasing, similar to the standard deviation of the distributions. I show by dividing the standard deviation of the sequence of sides by the standard deviation of the distributions that the relationship approaches 10 as the number of sides gets large.