#----------------------------------------------------------------------
#             Answers to exercises of Chapter 2
#                      Probabilities
#----------------------------------------------------------------------

#----------------------------------------------------------------------
#                      Lab session 1
#----------------------------------------------------------------------

#----------------------------------------------------------------------
#                       Exercise 1.1
#----------------------------------------------------------------------

#----------------------- question 1 -----------------------------------
# Assuming that the outcomes (success or failure) of the 5 attempts
# are independent, the number of successes follows the binomial distribution
# with parameters
n <- 5; p <- 0.9
# The random variable X takes its values in the set {0,1,2,3,4,5}.
d <- dbinom(0:5,n,p); d                     # probabilities
sum(d)                                      # they add up to 1
#----------------------- question 2 -----------------------------------
n*p                                         # expectation
n*p*(1-p)                                   # variance
sqrt(n*p*(1-p))                             # standard-deviation
qbinom(0.5,n,p)                             # median
qbinom(0.25,n,p)                            # first quartile
qbinom(0.75,n,p)                            # third quartile
#----------------------- question 3 -----------------------------------
dbinom(5,n,p)                               # exactly 5 times
dbinom(3,n,p)                               # exactly 3 times
sum(dbinom(0:3,n,p))                        # at most 3 times
pbinom(3,n,p)                               # same result
sum(dbinom(3:5,n,p))                        # at least 3 times
pbinom(2,n,p,lower.tail=FALSE)              # same result
sum(dbinom(2:4,n,p))                        # from 2 to 4 times
pbinom(4,n,p)-pbinom(1,n,p)                 # same result
#----------------------- question 4 -----------------------------------
N <- 100                                    # sample size
x <- rbinom(N,n,p)                          # random sample
table(x)/N                                  # relative frequencies
#----------------------- question 5 -----------------------------------
N <- 1e4                                    # sample size
n <- 5; p <- 0.9                            # parameters
x <- rbinom(N,n,p)                          # random sample
plot(ecdf(x),col="blue")                    # ecdf
points(0:5,pbinom(0:5,n,p),col="red")       # cumulative probabilities
#----------------------- question 6 -----------------------------------
xm <- cumsum(x)/(1:N)                       # partial empirical means
plot(1:N,xm,pch=".",col="blue")             # plot as blue dots
abline(h=n*p,col="red")                     # true expectation
#----------------------- question 7 -----------------------------------
y <-  ifelse(x<=3,1,0)                      # binary variable
yf <- cumsum(y)/(1:N)                       # partial relative frequencies
plot(1:N,yf,pch=".",col="blue")             # plot as blue dots
abline(h=pbinom(3,n,p),col="red")           # true probability

#----------------------------------------------------------------------
#                        Exercise 1.2
#----------------------------------------------------------------------

#----------------------- question 1 -----------------------------------
dbinom(0,6,1/4)                             # none of the witnesses
dbinom(1,6,1/4)                             # exactly once
pbinom(1,6,1/4,lower.tail=FALSE)            # twice or more
#----------------------- question 2 -----------------------------------
# the probability to be identified by 2 or more witnesses is 0.466:
# yes, this might be due to chance.
#----------------------- question 3 -----------------------------------
pbinom(3,6,1/4,lower.tail=FALSE)            # 4 witnesses or more
# the result is 0.0376: this is a small probability, the judge has all
# reasons to be suspicious.

#----------------------------------------------------------------------
#                        Exercise 1.3
#----------------------------------------------------------------------
#----------------------- question 1 -----------------------------------
TA <- read.table("data/tauber.csv",header=TRUE,sep=";")
H <- TA[,"height"]
sum(H>110)
sum(H<=110)
#----------------------- question 2 -----------------------------------
n <- length(H)
plot(1:n,cumsum(H>110)/(1:n),pch=".",ylim=c(0,1))
#----------------------- question 3 -----------------------------------
rH <- sample(H)
points(1:n,cumsum(rH>110)/(1:n),pch=".",col="blue")
#----------------------- question 4 -----------------------------------
iH <- sort(H)
points(1:n,cumsum(iH>110)/(1:n),pch=".",col="green")
#----------------------- question 5 -----------------------------------
dH <- sort(H,decreasing=TRUE)
points(1:n,cumsum(dH>110)/(1:n),pch=".",col="red")

#----------------------------------------------------------------------
#                      Session 2
#----------------------------------------------------------------------

#----------------------------------------------------------------------
#                       Exercise 2.1
#----------------------------------------------------------------------

#----------------------- question 1 -----------------------------------
mc <- 2970; sdc <- 251                      # data for control population
mr <- 3350; sdr <- 223                      # data for runners
                       # control
pnorm(3000,mc,sdc)                          # less than 3000
pnorm(3000,mc,sdc,lower.tail=FALSE)         # more than 3000
#----------------------- question 2 -----------------------------------
pnorm(2600,mc,sdc)                          # less than 2600
pnorm(3400,mc,sdc,lower.tail=FALSE)         # more than 4000
pnorm(3400,mc,sdc)-pnorm(2600,mc,sds)       # between 2600 and 3400
#----------------------- question 3 -----------------------------------
qnorm(0.01,mc,sdc)                          # control population 1% smaller
qnorm(0.01,mr,sdr,lower.tail=FALSE)         # runner 1% taller

xlim <- c(2350,3900)                        # bounds for graphics
{curve(dnorm(x,mc,sdc),                     # density for control
       col="blue",xlim=xlim)}
{curve(dnorm(x,mr,sdr),                     # density for runers
     col="red",add=TRUE)}
abline(v=2600,col="green")                  # vertical line at 2600
abline(v=3400,col="green")                  # vertical line at 3400
#----------------------- question 4 -----------------------------------
qnorm(0.05,mc,sdc)                          # control 5% smaller
qnorm(0.01,mc,sdc,lower.tail=FALSE)         # control 5% taller
#----------------------- question 5 -----------------------------------
{curve(pnorm(x,mc,sdc),                     # cdf for control
       col="blue",xlim=xlim)}
{curve(pnorm(x,mr,sdr),                     # cdf for runners
     col="red",add=TRUE)}
abline(v=2600,col="green")                  # vertical line at 2600
abline(v=3400,col="green")                  # vertical line at 3400
#----------------------- question 6 -----------------------------------
                       # control
qnorm(0.4,mc,sdc)                           # 40% smaller
qnorm(0.8,mc,sdc)                           # 80% smaller
#----------------------- question 7 -----------------------------------
# The difference control minus runner has normal distribution with parameters
md <- mc-mr; md                             # mean difference
sdd <- sqrt(sdc^2+sdr^2); sdd               # standard deviation of difference
lb <- qnorm(0.01,md,sdd); lb                # lower bound
ub <- qnorm(0.01,md,sdd,lower.tail=FALSE);ub# upper bound
curve(dnorm(x,md,sdd),xlim=c(lb,ub))        # density of difference
abline(v=c(0,-400),col="green")             # add vertical lines
pnorm(0,md,sdd)                             # chances of a negative difference
pnorm(-400,md,sdd)                          # runner eat more by 400 Kcal

#----------------------------------------------------------------------
#                      Lab session 3
#----------------------------------------------------------------------

#----------------------------------------------------------------------
#                       Exercise 3.1
#----------------------------------------------------------------------
mb15 <- 175.2                               # mean 15 year old boys
sdb15 <- 7.9                                # standard deviation
#----------------------- question 1 -----------------------------------
normal.fluctuation(x=160,mean=mb15,sd=sdb15,alternative="greater")
# 97.3% of Dutch boys are taller than 160 cm
pnorm(160,mean=mb15,sd=sdb15,lower.tail=FALSE)
# same value of p
normal.fluctuation(x=c(160,185),mean=mb15,sd=sdb15)
# 86.5% of Dutch boys are between 160 and 185 cm tall
pnorm(185,mb,sdb)-pnorm(160,mb,sdb)
# same value of p
#----------------------- question 2 -----------------------------------
n <- 1e4                                    # sample size
S15 <- rnorm(n,mb,sdb)                      # random sample
plot(ecdf(S15),col="blue")                  # ecdf
curve(pnorm(x,mb,sdb),col="red",add=TRUE)   # theoretical cdf
abline(v=c(160,185),col="green")
#----------------------- question 3 -----------------------------------
M <- cumsum(S15>160)/(1:n)                  # relative frequencies
plot(1:n,M,pch=".",col="blue")
p <- pnorm(160,mean=mb15,sd=sdb15,lower.tail=FALSE)
abline(h=p,col="red")                       # theoretical probability
M <- cumsum((S15>160)&(S15<185))/(1:n)      # relative frequencies
plot(1:n,M,pch=".",col="blue")
p <- pnorm(185,mb,sdb)-pnorm(160,mb,sdb)
abline(h=p,col="red")                       # theoretical probability
#----------------------- question 4 -----------------------------------
normal.fluctuation(p=0.05,mean=mb15,sd=sdb15,alternative="greater")
# 5% of boys are taller than 188.2 cm
qnorm(0.05,mb15,sdb15,lower.tail=FALSE)
# same bound
normal.fluctuation(p=0.95,mean=mb15,sd=sdb15,alternative="two.sided")
# 95% of boys are between 159.7 and 190.7 cm tall
qnorm(c(0.025,0.975),mb15,sdb15)
# same bounds
#----------------------- question 5 -----------------------------------
x <- qnorm(0.05,mb15,sdb15,lower.tail=FALSE)
M <- cumsum(S15>x)/(1:n)                    # relative frequencies
plot(1:n,M,pch=".",col="blue")
abline(h=0.05,col="red")                    # theoretical probability
x <- qnorm(c(0.025,0.975),mb15,sdb15)
M <- cumsum((S15>x[1])&(S15<x[2]))/(1:n)    # relative frequencies
plot(1:n,M,pch=".",col="blue")
abline(h=0.95,col="red")                    # theoretical probability
#----------------------- question 6 -----------------------------------
normal.fluctuation(p=0.90,mean=mb15,sd=sdb15)

#----------------------------------------------------------------------
#                      Lab session 4
#----------------------------------------------------------------------

#----------------------------------------------------------------------
#                       Exercise 4.1
#----------------------------------------------------------------------

#----------------------- question 1 -----------------------------------
N <- 1e4                                    # sample size
mb <- 175.2; sdb <- 7.9                     # mean and standard deviation
X <- rnorm(N,mb,sdb); X[1:100]              # normal sample
Xs <- sort(X); Xs[1:100]                    # sorted values
plot(Xs,(1:N)/N)                            # ecdf
plot((1:N)/N,Xs)                            # empirical quantiles
plot(qnorm(((1:N)-0.5)/n),Xs)               # empirical quantiles against qnorm
abline(a=mb,b=sdb,col="red")                # straight line
qqnorm(X)                                   # qqplot
abline(a=mb,b=sdb,col="red")                # straight line
#----------------------- question 2 -----------------------------------
X <- rt(N,df=50)                            # Student sample
qqnorm(X)
abline(a=0,b=1,col="red")
#----------------------- question 3 -----------------------------------
df <- 500
X <- rchisq(N,df=df)                        # chi-squared sample
qqnorm(X)
abline(a=df,b=sqrt(2*df),col="red")
#----------------------- question 4 -----------------------------------
n <- 50; p <- 0.5
X <- rbinom(N,n,p)                          # binomial sample
qqnorm(X)
abline(a=n*p,b=sqrt(n*p*(1-p)),col="red")

#----------------------------------------------------------------------
#                        Exercise 4.2
#----------------------------------------------------------------------

#----------------------- question 1 -----------------------------------
# The exact model is the binomial distribution with parameters:
n <- 400; p <- 0.9
# It is approximated by the normal distribution with parameters
ms <- n*p; sds <- sqrt(n*p*(1-p))
#----------------------- question 2 -----------------------------------
pbinom(344,n,p,lower.tail=FALSE)            # exact model
pnorm(344,ms,sds,lower.tail=FALSE)          # approximate model
#----------------------- question 3 -----------------------------------
n-qbinom(0.01,n,p)                          # exact model
n-qnorm(0.01,ms,sds)                        # approximate model

#----------------------------------------------------------------------
#                        Exercise 4.3
#----------------------------------------------------------------------

#----------------------- question 1 -----------------------------------
# The exact model is the binomial distribution with parameters:
n <- 150000; p <- 0.4
# It is approximated by the normal distribution with parameters
ms <- n*p; sds <- sqrt(n*p*(1-p))
#----------------------- question 2 -----------------------------------
pbinom(60500,n,p,lower.tail=FALSE)          # exact model
pnorm(60500,ms,sds,lower.tail=FALSE)        # approximate model
#----------------------- question 3 -----------------------------------
qbinom(0.9,n,p)                             # exact model
qnorm(0.9,ms,sds)                           # approximate model

