#----------------------------------------------------------------------
#             Answers to exercises of Chapter 1
#                     Data exploration
#----------------------------------------------------------------------

#----------------------------------------------------------------------
#                      Lab session 1
#----------------------------------------------------------------------

#----------------------------------------------------------------------
#                        Exercise 1.1
#----------------------------------------------------------------------

#----------------------- question 2 -----------------------------------
c(1, 2, 3, 4, 5)                            # build a vector and print it
X <- c(1, 2, 3, 4, 5)                       # store the same vector as X
X                                           # print the contents of X
#----------------------- question 3 -----------------------------------
Y <- c(1, 4, 9, 16, 25)                     # second vector, same length
#----------------------- question 4 -----------------------------------
length(X)                                   # number of elements in X
length(Y)                                   # number of elements in Y
length(X) == length(Y)                      # TRUE when both lengths agree
#----------------------- question 5 -----------------------------------
plot(X, Y)                                  # scatter plot, default symbol
plot(X, Y, pch = 2)                         # plotting symbol number 2
plot(X, Y, type = "b")                      # both points and lines
# same scatter plot with a main title and explicit axis labels
plot(X, Y, main = "Y against X", xlab = "X", ylab = "Y")
#----------------------- question 6 -----------------------------------
curve(x^2, add = TRUE)                      # overlay y = x^2 on that plot

#----------------------------------------------------------------------
#                        Exercise 1.2
#----------------------------------------------------------------------

#----------------------- question 1 -----------------------------------
X <- 0:7                                    # the integers 0, 1, ..., 7
#----------------------- question 2 -----------------------------------
X * 5                                       # every element times 5
X / 5                                       # every element divided by 5
X + 5                                       # 5 added to every element
#----------------------- question 3 -----------------------------------
sum(X)                                      # total of the elements of X
cumsum(X)                                   # running totals along X
#----------------------- question 4 -----------------------------------
sqrt(X)                                     # element-wise square root
X^3                                         # element-wise cube

#----------------------------------------------------------------------
#                        Exercise 1.3
#----------------------------------------------------------------------

#----------------------- question 1 -----------------------------------
X <- (0:4)^2                                # squares of 0, 1, ..., 4
X[c(3, 5)]                                  # third and fifth elements
X[X > 2]                                    # elements larger than 2
X[X > 2 & X < 10]                           # elements strictly in (2, 10)
#----------------------- question 2 -----------------------------------
Y <- rep(1, times = 5)                      # five ones
Z <- seq(from = 3, to = 11, by = 2)         # 3, 5, ..., 11
c(X, Y, Z)                                  # one long vector
rbind(X, Y, Z)                              # matrix with X, Y, Z as rows
XYZ <- cbind(X, Y, Z)                       # matrix with X, Y, Z as columns
#----------------------- question 3 -----------------------------------
rowSums(XYZ)                                # one total per row
colSums(XYZ)                                # one total per column
#----------------------- question 4 -----------------------------------
XYZ[4, ]                                    # fourth row
XYZ[, 3]                                    # third column
XYZ[c(3, 5), c(2, 3)]                       # rows 3 and 5, columns 2 and 3
XYZ[X > 2, ]                                # rows where X exceeds 2
XYZ[, c("Y", "Z")]                          # columns selected by name


#----------------------------------------------------------------------
#                      Lab session 2
#----------------------------------------------------------------------

#----------------------------------------------------------------------
#                       Exercise 2.1
#----------------------------------------------------------------------

#----------------------- question 1 -----------------------------------
# Read the semicolon-separated file and give each column a short name.
BO <- read.table(file = "data/bosson.csv", header = TRUE, sep = ";")
C <- BO[, 1]                                # discrete (has value "Vietnam")
G <- BO[, 2]                                # discrete (has value "M")
A <- BO[, 3]                                # continuous
B <- BO[, 4]                                # continuous (body mass index)
R <- BO[, 5]                                # discrete
#----------------------- question 2 -----------------------------------
head(BO)                                    # first 6 rows
BO[c(28, 34, 78), c(2, 4)]                  # selected rows and columns
BO[C == "Vietnam", ]                        # data for Vietnamese
B[G == "M"]                                 # body mass index of men
#----------------------- question 3 -----------------------------------
tC <- table(C)                              # absolute frequencies
tC
prop.table(tC)                              # relative frequencies
barplot(tC)                                 # barplot
#----------------------- question 4 -----------------------------------
barplot(table(C, G))                        # C per value of G
barplot(table(G, C))                        # G per value of C
tCG <- table(C, G)                          # absolute frequencies
tCG
prop.table(tCG)                             # global
prop.table(tCG, 1)                          # conditioned on C
prop.table(tCG, 2)                          # conditioned on G
#----------------------- question 5 -----------------------------------
barplot(table(C, R))                        # C per value of R
barplot(table(R, C))                        # R per value of C
# The table must be stored under the name that is used afterwards (tCR);
# storing it as tCG left tCR undefined and overwrote the question 4 table.
tCR <- table(C, R)                          # absolute frequencies
tCR
prop.table(tCR)                             # global
prop.table(tCR, 1)                          # conditioned on C
prop.table(tCR, 2)                          # conditioned on R
#----------------------- question 6 -----------------------------------
barplot(table(G, R))                        # G per value of R
barplot(table(R, G))                        # R per value of G
# Same naming rule as above: the G-by-R table is stored as tGR.
tGR <- table(G, R)                          # absolute frequencies
tGR
prop.table(tGR)                             # global
prop.table(tGR, 1)                          # conditioned on G
prop.table(tGR, 2)                          # conditioned on R
#----------------------- question 7 -----------------------------------
summary(A)                                  # five-number summary and mean
boxplot(A)                                  # boxplot
hist(A, probability = TRUE)                 # histogram on the density scale
plot(ecdf(A))                               # empirical distribution function
#----------------------- question 8 -----------------------------------
mean(A)                                     # mean
var(A)                                      # variance
sd(A)                                       # standard deviation
median(A)                                   # median
quantile(A, c(1/3, 2/3))                    # terciles
IQR(A)                                      # interquartile range
#----------------------- question 9 -----------------------------------
Acr <- (A - mean(A)) / sd(A)                # centered and scaled version of A
summary(A)
summary(Acr)
# Draw A and Acr side by side; par() returns the previous settings so the
# single-panel layout can be restored once the comparison plots are done.
op <- par(mfrow = c(1, 2))
boxplot(A); boxplot(Acr)
hist(A, probability = TRUE)
hist(Acr, probability = TRUE)
plot(ecdf(A))
plot(ecdf(Acr))
par(op)                                     # back to one plot per page
mean(Acr)
var(Acr)
sd(Acr)
median(Acr)
quantile(Acr, c(1/3, 2/3))
IQR(Acr)
#----------------------- question 10 -----------------------------------
sum(A < 40)                                 # how many values below 40
sum(A > 60)                                 # how many values above 60
n <- length(A)                              # sample size
sum(A < 40) / n                             # proportion below 40
sum(A > 60) / n                             # proportion above 60
quantile(A, 0.1)                            # first decile
quantile(A, 0.9)                            # last decile
#----------------------- question 11 -----------------------------------
# Boxplot with mean (red), median (blue) and quartiles (green) as
# horizontal lines, since the values of A are on the vertical axis.
boxplot(A)
abline(h = mean(A), col = "red")
abline(h = median(A), col = "blue")
abline(h = quantile(A, 0.25), col = "green")
abline(h = quantile(A, 0.75), col = "green")
#----------------------- question 12 -----------------------------------
# Histogram with the same statistics as vertical lines.
hist(A, probability = TRUE)
abline(v = mean(A), col = "red")
abline(v = median(A), col = "blue")
abline(v = quantile(A, 0.25), col = "green")
abline(v = quantile(A, 0.75), col = "green")
#----------------------- question 13 -----------------------------------
# Empirical distribution function: vertical lines at the statistics,
# horizontal lines at the matching probability levels.
plot(ecdf(A))
abline(v = mean(A), col = "red")
abline(v = median(A), col = "blue")
abline(v = quantile(A, 0.25), col = "green")
abline(v = quantile(A, 0.75), col = "green")
abline(h = 0.5, col = "blue")
abline(h = 0.25, col = "green")
abline(h = 0.75, col = "green")

#----------------------------------------------------------------------
#                        Exercise 2.2
#----------------------------------------------------------------------

#----------------------- question 1 -----------------------------------
# Load the semicolon-separated file, then pull out its four columns.
FE <- read.table("data/ferretti.csv", header = TRUE, sep = ";")
H <- FE[[1]]                                # continuous
DI <- FE[[2]]                               # continuous
DE <- FE[[3]]                               # discrete
I <- FE[[4]]                                # discrete
#----------------------- question 2 -----------------------------------
head(FE)                                    # top 6 rows of the data frame
FE[10:15, c(2, 4)]                          # rows 10 to 15, columns 2 and 4
FE[I == "yes", ]                            # rows for invasive tumors
H[DE == "positive"]                         # height for positive density
#----------------------- question 3 -----------------------------------
tD <- table(DE)                             # counts per value of DE
tD
prop.table(tD)                              # same counts as proportions
barplot(tD)                                 # bar chart of the counts
#----------------------- question 4 -----------------------------------
barplot(table(DE, I))                       # DE within each value of I
barplot(table(I, DE))                       # I within each value of DE
tDI <- table(DE, I)                         # two-way table of counts
tDI
prop.table(tDI)                             # proportions of the grand total
prop.table(tDI, 1)                          # proportions within each DE row
prop.table(tDI, 2)                          # proportions within each I column


#----------------------------------------------------------------------
#                       Exercise 2.3
#----------------------------------------------------------------------

#----------------------- question 1 -----------------------------------
# Load the semicolon-separated file, then pull out its four columns.
TA <- read.table("data/tauber.csv", header = TRUE, sep = ";")
G <- TA[[1]]                                # discrete
A <- TA[[2]]                                # continuous
H <- TA[[3]]                                # continuous
W <- TA[[4]]                                # continuous
#----------------------- question 2 -----------------------------------
summary(A)                                  # numerical summary
boxplot(A)                                  # boxplot
hist(A, probability = TRUE)                 # histogram on the density scale
plot(ecdf(A))                               # empirical distribution function
#----------------------- question 3 -----------------------------------
mean(A)                                     # mean
var(A)                                      # variance
sd(A)                                       # standard deviation
median(A)                                   # median
quantile(A, probs = c(1/3, 2/3))            # terciles
IQR(A)                                      # interquartile range
#----------------------- question 4 -----------------------------------
sum(A < 60)                                 # count of values below 60
sum(A > 72)                                 # count of values above 72
n <- length(A)                              # sample size
sum(A < 60) / n                             # proportion below 60
sum(A > 72) / n                             # proportion above 72
quantile(A, probs = 0.1)                    # first decile
quantile(A, probs = 0.9)                    # last decile
#----------------------- question 5 -----------------------------------
# Boxplot; mean in red, median in blue, quartiles in green.
boxplot(A)
abline(h = mean(A), col = "red")
abline(h = median(A), col = "blue")
abline(h = quantile(A, probs = 0.25), col = "green")
abline(h = quantile(A, probs = 0.75), col = "green")
#----------------------- question 6 -----------------------------------
# Histogram with the same statistics drawn as vertical lines.
hist(A, probability = TRUE)
abline(v = mean(A), col = "red")
abline(v = median(A), col = "blue")
abline(v = quantile(A, probs = 0.25), col = "green")
abline(v = quantile(A, probs = 0.75), col = "green")
#----------------------- question 7 -----------------------------------
# Empirical distribution function; vertical lines at the statistics,
# horizontal lines at the probability levels 0.25, 0.5 and 0.75.
plot(ecdf(A))
abline(v = mean(A), col = "red")
abline(v = median(A), col = "blue")
abline(v = quantile(A, probs = 0.25), col = "green")
abline(v = quantile(A, probs = 0.75), col = "green")
abline(h = 0.5, col = "blue")
abline(h = 0.25, col = "green")
abline(h = 0.75, col = "green")

#----------------------------------------------------------------------
#                      Lab session 4
#----------------------------------------------------------------------


#----------------------------------------------------------------------
#                       Exercise 4.1
#----------------------------------------------------------------------

#----------------------- question 1 -----------------------------------
# Read the semicolon-separated file and give each column a short name.
BO <- read.table(file = "data/bosson.csv", header = TRUE, sep = ";")
C <- BO[, 1]                                # discrete
G <- BO[, 2]                                # discrete
A <- BO[, 3]                                # continuous
B <- BO[, 4]                                # continuous
R <- BO[, 5]                                # discrete
R[R > 3] <- 3                               # merge all values above 3 into 3
#----------------------- question 2 -----------------------------------
# Empirical distribution function of A for each value of R, one color
# per group: 0 green, 1 blue, 2 red, 3 black.
A0 <- A[R == 0]
A1 <- A[R == 1]
A2 <- A[R == 2]
A3 <- A[R == 3]
plot(ecdf(A0), col = "green")
lines(ecdf(A1), col = "blue")
lines(ecdf(A2), col = "red")
lines(ecdf(A3), col = "black")              # group R == 3 (was A1 again)
#----------------------- question 3 -----------------------------------
# Same comparison between the two values of C.
AV <- A[C == "Vietnam"]
AF <- A[C == "France"]
plot(ecdf(AV), col = "green")
lines(ecdf(AF), col = "red")
#----------------------- question 4 -----------------------------------
# Same comparison between the two values of G.
# Note: AF is reused here and no longer holds the C == "France" values.
AM <- A[G == "M"]
AF <- A[G == "F"]
plot(ecdf(AM), col = "blue")
lines(ecdf(AF), col = "red")
#----------------------- question 5 -----------------------------------
boxplot(A ~ R, border = c("green", "blue", "red", "black"))  # A by R
#----------------------- question 6 -----------------------------------
boxplot(A ~ C, border = c("red", "green"))                   # A by C
#----------------------- question 7 -----------------------------------
boxplot(A ~ G, border = c("red", "blue"))                    # A by G
#----------------------- question 8 -----------------------------------
# Same three boxplots for B instead of A.
boxplot(B ~ R, border = c("green", "blue", "red", "black"))
boxplot(B ~ C, border = c("red", "green"))
boxplot(B ~ G, border = c("red", "blue"))
#----------------------- question 9 -----------------------------------
plot(B, A)                                  # scatter plot of A against B
cor(A, B)                                   # correlation coefficient
co <- ifelse(C == "Vietnam", "green", "red")  # one color per value of C
plot(B, A, pch = 19, col = co)
co <- ifelse(G == "M", "blue", "red")       # one color per value of G
plot(B, A, pch = 19, col = co)

#----------------------------------------------------------------------
#                       Exercise 4.2
#----------------------------------------------------------------------

#----------------------- question 1 -----------------------------------
# Load the semicolon-separated file, then pull out its six columns.
FI <- read.table("data/fires.csv", header = TRUE, sep = ";")
M <- FI[[1]]                                # discrete
D <- FI[[2]]                                # discrete
TE <- FI[[3]]                               # continuous
RH <- FI[[4]]                               # continuous
W <- FI[[5]]                                # continuous
A <- FI[[6]]                                # continuous
lA <- log(A)                                # natural logarithm of A
#----------------------- question 2 -----------------------------------
boxplot(A ~ M)                              # A for each value of M
boxplot(A ~ D)                              # A for each value of D
#----------------------- question 3 -----------------------------------
plot(TE, A)                                 # A against TE
plot(RH, A)                                 # A against RH
plot(W, A)                                  # A against W
#----------------------- question 4 -----------------------------------
Az <- (A - mean(A)) / sd(A)                 # area standardized
Wz <- (W - mean(W)) / sd(W)                 # wind standardized
plot(ecdf(Az))                              # distribution function of Az
lines(ecdf(Wz), col = "green")              # same for Wz, overlaid in green
#----------------------- question 5 -----------------------------------
cor(A, TE)                                  # correlation of A with TE
cor(A, RH)                                  # correlation of A with RH
cor(A, W)                                   # correlation of A with W

#----------------------------------------------------------------------
#                       Exercise 4.3
#----------------------------------------------------------------------

#----------------------- question 1 -----------------------------------
# Load the semicolon-separated file, then pull out its four columns.
TA <- read.table("data/tauber.csv", header = TRUE, sep = ";")
G <- TA[[1]]                                # discrete
A <- TA[[2]]                                # continuous
H <- TA[[3]]                                # continuous
W <- TA[[4]]                                # continuous
#----------------------- question 2 -----------------------------------
co <- ifelse(G == "M", "blue", "red")       # blue where G is "M", else red
plot(H ~ A, pch = 19, col = co)             # H against A
plot(W ~ A, pch = 19, col = co)             # W against A
plot(W ~ H, pch = 19, col = co)             # W against H
#----------------------- question 3 -----------------------------------
Hz <- (H - mean(H)) / sd(H)                 # height standardized
Wz <- (W - mean(W)) / sd(W)                 # weight standardized
plot(ecdf(Hz))                              # distribution function of Hz
lines(ecdf(Wz), col = "green")              # same for Wz, overlaid in green
#----------------------- question 4 -----------------------------------
cor(A, H)                                   # pairwise correlations
cor(A, W)
cor(H, W)
#----------------------- question 5 -----------------------------------
AHW <- TA[, 2:4]                            # columns 2 to 4 as a data frame
pairs(AHW, pch = 19, col = co)              # matrix of scatter plots
cor(AHW)                                    # correlation matrix
