#----------------------------------------------------------------------
#             Answers to exercises of Chapter 6
#                     Two sample tests
#----------------------------------------------------------------------

#----------------------------------------------------------------------
#                      Lab session 1
#----------------------------------------------------------------------

#----------------------------------------------------------------------
#                       Exercise 1.1
#----------------------------------------------------------------------
# Exercise 1.1: compare passenger ages across gender, class and survival
# using two-sample tests (T-test, Kolmogorov-Smirnov, Fisher variance test).
#
# stringsAsFactors=TRUE is needed since R 4.0.0, where the default became
# FALSE: the factor crossing G:S of question 8 is only defined for factors
# and fails on character vectors.
titanic <- read.table("data/titanic.csv",header=TRUE,sep=";",
                      stringsAsFactors=TRUE)
P <- titanic[,"pclass"]                     # passenger class (1, 2 or 3)
S <- titanic[,"survived"]                   # survival status
G <- titanic[,"gender"]                     # gender ("F" or "M")
A <- titanic[,"age"]                        # age
#----------------------- question 1 -----------------------------------
by(A,G,summary)                             # summaries of age per gender
boxplot(A~G)                                # box plot
oneway.test(A~G)                            # one-way anova
# p-value = 0.04072: means are significantly different
Aw <- A[G=="F"]                             # ages of women
Am <- A[G=="M"]                             # ages of men
t.test(Aw,Am,alternative="less")            # one-sided T-test
# p-value = 0.02036: women were younger than men on average
t.test(Aw,Am)                               # two-sided T-test
# same p-value as oneway.test: twice the p-value of one-sided test
#----------------------- question 2 -----------------------------------
plot(ecdf(Aw),col="red")                    # ages of women
lines(ecdf(Am),col="blue")                  # ages of men
ks.test(Aw,Am,alternative="greater")        # one-sided test
# p-value = 0.03029: the ecdf of women ages is significantly above
#----------------------- question 3 -----------------------------------
var.test(Aw,Am)
# p-value = 0.6442: the variances are not significantly different
#----------------------- question 5 -----------------------------------
boxplot(A~P)                                # box plot
oneway.test(A~P)                            # one-way anova
# p-value = 9.986857e-38: means are significantly different
A1 <- A[P==1]                               # ages of first class
A2 <- A[P==2]                               # ages of second class
A3 <- A[P==3]                               # ages of third class passengers
t.test(A1,c(A2,A3),alternative="greater")   # first against others
t.test(A1,A2,alternative="greater")         # first against second
t.test(A1,A3,alternative="greater")         # first against third
t.test(A2,c(A1,A3))                         # second against others
t.test(A2,A3,alternative="greater")         # second against third
#----------------------- question 6 -----------------------------------
plot(ecdf(A1),col="red")                    # ages of 1st class
lines(ecdf(A2),col="green")                 # ages of 2nd class
lines(ecdf(A3),col="blue")                  # ages of 3rd class
ks.test(A2,A3,alternative="less")
# p-value = 4.911e-06: ecdf of A2 significantly below
ks.test(A2,A1,alternative="greater")
# p-value = 9.434e-14: ecdf of A2 significantly above
#----------------------- question 7 -----------------------------------
var.test(A1,A2)
# p-value =  0.2898: no significant difference
var.test(A1,A3,alternative="greater")
# p-value = 7.496e-05: the variance is larger in first class
#----------------------- question 8 -----------------------------------
# G:S crosses the two factors; its levels are written "gender:survival",
# e.g. "F:no" and "M:yes" as used in the comparisons below.
G:S                                         # cross values
table(G:S)                                  # "flat" contingency table
boxplot(A~G:S)
oneway.test(A~G:S)
# p-value =  5.335e-05: there are significant differences
t.test(A[G:S=="F:no"],A[G:S!="F:no"],alternative="less")
# p-value =   0.0003225: non surviving women were younger
t.test(A[G:S=="M:yes"],A[G:S!="M:yes"],alternative="less")
# p-value =  0.009979: surviving men were younger

#----------------------------------------------------------------------
#                      Lab session 2
#----------------------------------------------------------------------

#----------------------------------------------------------------------
#                       Exercise 2.1
#----------------------------------------------------------------------
# Exercise 2.1: association between exposure and disease in a 2x2 study.
# Cell counts, entered by hand:
DE <- 23                                    # exposed,   diseased
HE <- 56                                    # exposed,   healthy
DU <- 67                                    # unexposed, diseased
HU <- 136                                   # unexposed, healthy
#----------------------- question 1 -----------------------------------
# Odds of disease in each group, then their ratio; wrapping an assignment
# in parentheses prints the value that was just stored.
(oddsE <- DE / HE)                          # odds among the exposed
(oddsU <- DU / HU)                          # odds among the unexposed
(OR <- oddsE / oddsU)                       # OR<1: exposure protects
#----------------------- question 2 -----------------------------------
# Contingency table: one row per exposure group, one column per outcome.
ta <- matrix(c(DE, HE,
               DU, HU),
             nrow = 2, byrow = TRUE,
             dimnames = list(c("exposed", "unexposed"),
                             c("diseased", "healthy")))
ta
barplot(prop.table(ta, 1), beside = TRUE)   # outcome frequencies per group
chisq.test(ta)
# p-value = 0.6261: no significant dependence between the two variables
prop.test(c(DE, DU), c(DE + HE, DU + HU), alternative = "less")
# p-value = 0.313: the proportion of diseased is not significantly
# smaller among the exposed
fisher.test(ta, alternative = "less")
# p-value = 0.3152: OR is not significantly less than 1

#----------------------------------------------------------------------
#                       Exercise 2.2
#----------------------------------------------------------------------
# Exercise 2.2: survival on the Titanic against gender, age group and class.
# The data are read again so that this exercise can be run on its own.
titanic <- read.table("data/titanic.csv", header = TRUE, sep = ";")
A <- titanic[, "age"]
G <- titanic[, "gender"]
S <- titanic[, "survived"]
P <- titanic[, "pclass"]
#----------------------- question 1 -----------------------------------
# Survival against gender.  Parentheses around the assignment print the table.
(ta <- table(S, G))
barplot(prop.table(ta, 2), beside = TRUE)   # conditional frequencies
chisq.test(ta)
# p-value =  2.567603e-67: the variables are dependent
x <- ta[2, ]                                # second row: survivors per gender
n <- colSums(ta)                            # passengers per gender
prop.test(x, n, alternative = "greater")
# p-value =  1.283802e-67: the proportion of survivors among women was greater
fisher.test(ta, alternative = "less")
fisher.test(ta[c(2, 1), ], alternative = "greater")
# rows swapped: the odds-ratio is inverted, hence the opposite alternative
fisher.test(ta[c(2, 1), ])$p.value
# p-value = 1.028653e-69
#----------------------- question 2 -----------------------------------
# Survival against being younger than 10.
(ta <- table(S, A < 10))
barplot(prop.table(ta, 2), beside = TRUE)   # conditional frequencies
chisq.test(ta)
# p-value =  0.0001763: the variables are dependent
x <- ta[2, ]                                # survivors per age class
n <- colSums(ta)                            # passengers per age class (not below, below)
prop.test(x, n, alternative = "less")
# p-value = 8.817e-05: the proportion of survivors among children was greater
fisher.test(ta, alternative = "greater")
# p-value = 0.0001016: the odds-ratio is larger than 1
#----------------------- question 3 -----------------------------------
# Survival against being younger than 21.
(ta <- table(S, A < 21))
barplot(prop.table(ta, 2), beside = TRUE)   # conditional frequencies
chisq.test(ta)
# p-value =  0.07999: no significant dependence (two-sided test)
x <- ta[2, ]                                # survivors per age class
n <- colSums(ta)                            # passengers per age class (not below, below)
prop.test(x, n, alternative = "less")
# p-value = 0.03999: the proportion of survivors among people below 21 was greater
fisher.test(ta, alternative = "greater")
# p-value = 0.04035: same conclusion
#----------------------- question 4 -----------------------------------
# Survival against passenger class.
(ta <- table(S, P))
barplot(prop.table(ta, 2), beside = TRUE)   # conditional frequencies
chisq.test(ta)
# p-value =  4.541475e-24: the variables are dependent
x <- ta[2, ]                                # survivors per class
n <- colSums(ta)                            # passengers per class
prop.test(x, n)$p.value
# p-value = 4.541475e-24: there are significant differences in proportion
# Pairwise comparisons of classes on the matching pairs of columns.
fisher.test(ta[, c(1, 2)], alternative = "less") # 1st class against 2nd class
fisher.test(ta[, c(1, 3)], alternative = "less") # 1st class against 3rd class
fisher.test(ta[, c(2, 3)], alternative = "less") # 2nd class against 3rd class

#----------------------------------------------------------------------
#                       Exercise 2.4
#----------------------------------------------------------------------
# Exercise 2.4: pairwise correlations between age, height and weight.
tauber <- read.table("data/tauber.csv", header = TRUE, sep = ";")
# Columns 2 to 4 together; presumably age, height and weight -- confirm
# against the column order of the file.
AHW <- tauber[, 2:4]
A <- tauber[, "age"]
H <- tauber[, "height"]
W <- tauber[, "weight"]
#----------------------- question 1 -----------------------------------
pairs(AHW)                                  # scatter plot of every pair
cor(AHW)                                    # correlation matrix
#----------------------- question 2 -----------------------------------
# One-sided tests: is each correlation significantly positive?
cor.test(A, H, alternative = "greater")     # age against height
cor.test(A, W, alternative = "greater")     # age against weight
cor.test(H, W, alternative = "greater")     # height against weight
