library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.1 ✓ dplyr 1.0.5
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
This markdown will perform an abstraction of the coin toss, where rather than sampling randomly from 0 and 1 (heads and tails), we will sample from more than two numbers. The purpose of this is to determine how the binomial distribution we discovered in previous markdowns changes as we add more possible outcomes per trial.
We will first make our binomial distribution with coin tosses, but we will adjust the sampling function so we draw not from 0 and 1, but from 1 and 2, or 2 and 3, etc. We will see if our distribution changes accordingly. This can give us some intuition around what will happen when we begin rolling 3 or more sided dice.
As a review, below is the code to make the biomial distribution from coin toss sums, where heads is 1 and tails is 0.
# Binomial distribution
n_sessions <- 100000
seq(n_sessions)[1:10] # in case you haven't seen this command before
## [1] 1 2 3 4 5 6 7 8 9 10
num_heads <- lapply(seq(n_sessions), function(i) {
result <- sample(x = c(0, 1), size = 100, replace = TRUE)
result <- sum(result)/length(result)
return(result)
}) %>% unlist()
# Example of output
num_heads[1:5]
## [1] 0.47 0.48 0.58 0.53 0.53
# Histogram
hist(num_heads, breaks = 50, xlim = c(0, 1))
# Basic statistics
mean(num_heads)
## [1] 0.4999534
sd(num_heads)
## [1] 0.04994634
We now perform the same procedure but with heads as 2 and tails as 1.
n_sessions <- 100000
seq(n_sessions)[1:10] # in case you haven't seen this command before
## [1] 1 2 3 4 5 6 7 8 9 10
num_heads <- lapply(seq(n_sessions), function(i) {
result <- sample(x = c(1, 2), size = 100, replace = TRUE)
result <- sum(result)/length(result)
return(result)
}) %>% unlist()
# Example of output
num_heads[1:5]
## [1] 1.44 1.44 1.59 1.51 1.47
# Histogram
hist(num_heads, breaks = 50, xlim = c(1, 2))
# Basic statistics
mean(num_heads)
## [1] 1.500009
sd(num_heads)
## [1] 0.04999888
We get another bimomial distribution with a mean of 1.5 and a standard deviation of 0.5. In other words, the same distribution was shifted up by one. Now, we’re going to change our coin toss into a three sided die. We will sample from 1, 2, and 3, and see how the distribution changes.
n_sessions <- 100000
seq(n_sessions)[1:10] # in case you haven't seen this command before
## [1] 1 2 3 4 5 6 7 8 9 10
num_heads <- lapply(seq(n_sessions), function(i) {
result <- sample(x = c(1, 2, 3), size = 100, replace = TRUE)
result <- sum(result)/length(result)
return(result)
}) %>% unlist()
# Example of output
num_heads[1:5]
## [1] 2.05 1.92 1.98 2.05 1.98
# Histogram
hist(num_heads, breaks = 50, xlim = c(1, 3))
# Basic statistics
mean(num_heads)
## [1] 1.999645
sd(num_heads)
## [1] 0.08175727
We get another binomial distribution with a mean of 2 (the middle value of the die) and a standard deviation a little bit higher than we previously had, at 0.08 instead of 0.05. Let’s take this a step further, and make our distributions for all dice between two (coin) and 10 sides. We can then plot the mean and standard deviation of the distributions and see if we can derive any relationship between them and the number of sides on the dice.
n_sessions <- 100000
sides <- c(seq(2, 10), 50, 100)
dice_sums <- lapply(sides, function(s) {
result_dist <- lapply(seq(n_sessions), function(i) {
result <- sample(x = seq(1, s),
size = 100,
replace = TRUE)
result <- sum(result) / length(result)
return(result)
}) %>% unlist()
return(result_dist)
})
names(dice_sums) <- sides
dice_stats <- lapply(names(dice_sums), function(i) {
curr <- dice_sums[[i]]
result <- list(num_sides = as.numeric(i), mean = mean(curr), sd = sd(curr))
return(result)
}) %>% bind_rows()
print(dice_stats, n = nrow(dice_stats))
## # A tibble: 11 x 3
## num_sides mean sd
## <dbl> <dbl> <dbl>
## 1 2 1.50 0.0499
## 2 3 2.00 0.0815
## 3 4 2.50 0.112
## 4 5 3.00 0.141
## 5 6 3.50 0.171
## 6 7 4.00 0.201
## 7 8 4.50 0.229
## 8 9 5.00 0.259
## 9 10 5.50 0.287
## 10 50 25.5 1.45
## 11 100 50.5 2.88
# Plot
qplot(dice_stats$num_sides, dice_stats$mean)
qplot(dice_stats$num_sides, dice_stats$sd)
# Check for linearity: what's the slope at each point?
slope <- lapply(seq(1, nrow(dice_stats)), function(i) {
if(i == 1) {
return(list(mean_slope = 0, sd_slope = 0))
}
num_sides_change <- dice_stats[i,]$num_sides - dice_stats[i - 1,]$num_sides
mean_change <- dice_stats[i,]$mean - dice_stats[i - 1,]$mean
sd_change<- dice_stats[i,]$sd - dice_stats[i - 1,]$sd
result <- list(mean_slope = mean_change/num_sides_change,
sd_slope = sd_change/num_sides_change)
return(result)
}) %>% bind_rows()
dice_stats <- bind_cols(dice_stats, slope)
print(dice_stats, n = nrow(dice_stats))
## # A tibble: 11 x 5
## num_sides mean sd mean_slope sd_slope
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 1.50 0.0499 0 0
## 2 3 2.00 0.0815 0.500 0.0316
## 3 4 2.50 0.112 0.501 0.0305
## 4 5 3.00 0.141 0.499 0.0295
## 5 6 3.50 0.171 0.500 0.0294
## 6 7 4.00 0.201 0.500 0.0297
## 7 8 4.50 0.229 0.500 0.0280
## 8 9 5.00 0.259 0.501 0.0302
## 9 10 5.50 0.287 0.498 0.0284
## 10 50 25.5 1.45 0.500 0.0290
## 11 100 50.5 2.88 0.500 0.0286
We can see that the mean increases linearly as the number of sides go up. The mean is simply the mean of the sequence. In other words, the mean of the 4 sided die is 2.5, which is also the mean of (1, 2, 3, 4). The standard deviation is almost linear, going up by a little less than 0.3 each time, though as we show, the slope decreases little by little as the number of sides go up. To get a better feel for the standard deviation of these distributions, let’s look at the standard deviation of the sides of the dice themselves and see how it compares to the standard deviation of the distritbutions related to the number of sides.
sd_sides <- lapply(sides, function(i) {
curr <- seq(1, i)
result <- sd(curr)
return(result)
}) %>% unlist()
dice_stats$sd_sides <- sd_sides
sd_sides_slope <- lapply(seq(length(sd_sides)), function(i) {
if(i == 1) {
return(0)
}
sd_change <- sd_sides[i] - sd_sides[i - 1]
num_sides_change <- dice_stats[i,]$num_sides - dice_stats[i - 1,]$num_sides
return(sd_change/num_sides_change)
}) %>% unlist()
dice_stats$sd_sides_slope <- sd_sides_slope
print(dice_stats, n = nrow(dice_stats))
## # A tibble: 11 x 7
## num_sides mean sd mean_slope sd_slope sd_sides sd_sides_slope
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 1.50 0.0499 0 0 0.707 0
## 2 3 2.00 0.0815 0.500 0.0316 1 0.293
## 3 4 2.50 0.112 0.501 0.0305 1.29 0.291
## 4 5 3.00 0.141 0.499 0.0295 1.58 0.290
## 5 6 3.50 0.171 0.500 0.0294 1.87 0.290
## 6 7 4.00 0.201 0.500 0.0297 2.16 0.289
## 7 8 4.50 0.229 0.500 0.0280 2.45 0.289
## 8 9 5.00 0.259 0.501 0.0302 2.74 0.289
## 9 10 5.50 0.287 0.498 0.0284 3.03 0.289
## 10 50 25.5 1.45 0.500 0.0290 14.6 0.289
## 11 100 50.5 2.88 0.500 0.0286 29.0 0.289
dice_stats$sd_sides/dice_stats$sd
## [1] 14.17584 12.26673 11.52544 11.17518 10.94515 10.77011 10.71461 10.58334
## [9] 10.54393 10.07574 10.07894
The relationship is almost linear, where if you take the much simpler operation of finding the standard deviation of the number of sides on the dice, and you divide by 10, and take away 0.02, you get very close to the standard deviation of the distribution. Notice also that the slope is almost linear but slowly decreasing, similar to the standard deviation of the distributions. I show by dividing the standard deviation of the sequence of sides by the standard deviation of the distributions that the relationship approaches 10 as the number of sides gets large.