#----------------------------------------------------------------------
#             Answers to exercises of Chapter 8
#                      Regression models
#----------------------------------------------------------------------

#----------------------------------------------------------------------
#                      Lab session 1
#----------------------------------------------------------------------
#----------------------------------------------------------------------
#                       Exercise 1.1
#----------------------------------------------------------------------
# Load the Bosson data and extract the two columns used in this exercise.
Bo <- read.table("data/bosson.csv", header = TRUE, sep = ";")
A <- Bo[["aneurysm"]]
B <- Bo[["bmi"]]
#----------------------- question 1 -----------------------------------
# Simple linear regression of A on B.
fit <- lm(A ~ B)
summary(fit)
# R-squared = 0.05237: B explains only about 5% of the variability of A
# p-value = 0.000503: the regression is nevertheless significant
plot(B, A)
# Overlay the fitted line (intercept and slope taken from the model).
abline(coef = coefficients(fit), col = "red")
#----------------------- question 2 -----------------------------------
# Graphical validation of the model.
hist(rstandard(fit))                   # the histogram is not symmetric
plot(B, residuals(fit))                # no specific structure
abline(h = 0)
plot(fitted(fit), residuals(fit))      # no specific structure
abline(h = 0)
plot(A, fitted(fit))                   # points do not follow the diagonal
abline(0, 1)
# conclusion: the model is not validated
#----------------------- question 3 -----------------------------------
# Confidence and prediction intervals for a new observation with B = 25.
new <- data.frame(B = 25)
predict(fit, new, interval = "confidence")
predict(fit, new, interval = "prediction")
#----------------------- question 4 -----------------------------------
# Same regression restricted to the rows where gender is 'F'.
G <- Bo[["gender"]]
AF <- A[G == "F"]
BF <- B[G == "F"]
summary(lm(AF ~ BF))

 
#----------------------------------------------------------------------
#                       Exercise 1.2
#----------------------------------------------------------------------
# Load the Tauber data and extract the columns used in this exercise.
tau <- read.table("data/tauber.csv", header = TRUE, sep = ";")
G <- tau[["gender"]]
A <- tau[["age"]]
H <- tau[["height"]]
W <- tau[["weight"]]
#----------------------- question 7 -----------------------------------
# Three nested models for the response W (weight).
lmA <- lm(W ~ A)                            # weight against age
lmH <- lm(W ~ H)                            # weight against height
lmAH <- lm(W ~ A + H)                       # weight against age and height
summary(lmA)                                # adjusted R-squared: 0.04902
summary(lmH)                                # adjusted R-squared: 0.5164
summary(lmAH)                               # adjusted R-squared: 0.5185
#----------------------- question 8 -----------------------------------
# Compare each single-predictor model with the two-predictor model.
anova(lmA, lmAH)
# p-value = 2.2e-16: height brings significant information on top of age
anova(lmH, lmAH)
# p-value = 0.0004193: age brings significant information on top of height
# conclusion: age and height together are the best predictors of weight.


#----------------------------------------------------------------------
#                      Lab session 2
#----------------------------------------------------------------------

#----------------------------------------------------------------------
#                       Exercise 2.1
#----------------------------------------------------------------------
#----------------------- question 1 -----------------------------------
XT <- readRDS("data/LenzT.rds")
Y <- readRDS("data/LenzI.rds")
names(Y)
# X is only created below, so the full matrix XT is the one to inspect and
# to sample gene names from (using X here fails in a fresh session).
dim(XT)
rg <- sample(colnames(XT), 500)                   # random genes
X <- XT[, rg]                                     # sub-matrix of XT
YF <- Y$follup
#----------------------- question 2 -----------------------------------
res <- lm(YF ~ X)                                 # standard linear regression model
# lm() reports the coefficients it cannot estimate as NA; summary() shows how
# many there are. (The original note quotes 587 coefficients "set to 0";
# that count depends on the number of genes drawn -- TODO confirm.)
summary(res$coefficients)
res$coefficients[400:420]
#----------------------- question 3 -----------------------------------
library(glmnet)                                   # library() errors if glmnet is missing
fit_lasso <- glmnet(X, YF)                        # lasso estimation
plot(fit_lasso, xvar = "lambda")                  # regularisation path
plot(fit_lasso, xvar = "lambda", label = TRUE)    # same path, with variable labels
#----------------------- question 4 -----------------------------------
cvfit <- cv.glmnet(X, YF)                         # cross-validation selection of lambda
coef(cvfit, s = "lambda.min")                     # sparse estimation of the coefficients
#run the following function
# Return the non-zero coefficients of a cross-validated glmnet fit.
#
# cvfit: an object for which coef(cvfit, s = s) returns a one-column
#        coefficient matrix with row names (e.g. the output of cv.glmnet()).
# s:     value forwarded to coef(); defaults to "lambda.min".
#
# Returns a data.frame with one row per non-zero coefficient and two columns:
# `feature` (the row name) and `coeficient` (the value). The misspelt column
# name is kept on purpose so that existing uses of the result keep working.
print_glmnet_coefs <- function(cvfit, s="lambda.min") {
  # Extract the coefficients once; they were previously recomputed three times.
  cf <- coef(cvfit, s = s)
  ind <- which(cf != 0)
  data.frame(
    feature = rownames(cf)[ind],
    coeficient = cf[ind]
  )
}
print_glmnet_coefs(cvfit)                          # table of the non-zero coefficients
#----------------------- question 5 -----------------------------------
# Same selection, now on every column of XT instead of the random subset.
X <- XT
cvfit <- cv.glmnet(X, YF)                          # lambda chosen by cross-validation
print_glmnet_coefs(cvfit)                          # table of the non-zero coefficients
# Cross-validation is a random procedure, so a second run gives another result:
cvfit <- cv.glmnet(X, YF)                          # lambda chosen by cross-validation
print_glmnet_coefs(cvfit)                          # table of the non-zero coefficients
#----------------------- question 6 -----------------------------------
# Cross-validation with 15 folds.
cvfit <- cv.glmnet(X, YF, nfolds = 15)
print_glmnet_coefs(cvfit)


#----------------------------------------------------------------------
#                       Exercise 2.2
#----------------------------------------------------------------------
#----------------------- question 1 -----------------------------------
XT <- readRDS("data/LenzT.rds")
Y <- readRDS("data/LenzI.rds")
Ye <- Y$ecog
keep <- !is.na(Ye)                                 # rows where ecog is observed
Yec <- Ye[keep]                                    # remove the missing data
# Sample gene names from XT itself rather than from whatever X a previous
# exercise left behind, so this exercise also runs on its own.
rg <- sample(colnames(XT), 500)                    # random genes
X <- XT[keep, rg]                                  # sub-matrix of XT, removing the missing data
#----------------------- question 2 -----------------------------------
res <- lm(Yec ~ X)                                 # standard linear regression model
# lm() reports the coefficients it cannot estimate as NA; summary() shows how
# many there are (the original note quotes 112 coefficients "set to 0").
summary(res$coefficients)
#----------------------- question 3 -----------------------------------
library(glmnet)                                    # library() errors if glmnet is missing
fit_lasso <- glmnet(X, Yec)                        # lasso estimation
plot(fit_lasso, xvar = "lambda")                   # regularisation path
#----------------------- question 4 -----------------------------------
cvfit <- cv.glmnet(X, Yec)                         # cross-validation selection of lambda
coef(cvfit, s = "lambda.min")                      # sparse estimation of the coefficients
print_glmnet_coefs(cvfit)                          # non-zero coefficients
#----------------------- question 5 -----------------------------------
X <- XT[keep, ]                                    # complete matrix, same rows as Yec
cvfit <- cv.glmnet(X, Yec)                         # cross-validation selection of lambda
print_glmnet_coefs(cvfit)                          # non-zero coefficients


#----------------------------------------------------------------------
#                      Lab session 4
#----------------------------------------------------------------------

#----------------------------------------------------------------------
#                       Exercise 4.1
#----------------------------------------------------------------------
#----------------------- question 1 -----------------------------------
# Load the Titanic data and extract the columns used in this exercise.
titanic <- read.table("data/titanic.csv", header = TRUE, sep = ";")
P <- titanic[["pclass"]]
S <- titanic[["survived"]]
G <- titanic[["gender"]]
G <- as.numeric(G == "F")                             # recode gender: 1 for "F", 0 otherwise
#----------------------- question 2 -----------------------------------
fitG <- glm(S ~ G, family = binomial(link = "logit")) # logistic model of S onto G
summary(fitG)
# p-value is <2e-16: the survival rate differs significantly between women and men
exp(coefficients(fitG)[2])
# odds-ratio is 11.78 (= exp(2.47)): the odds of survival for women are
# 11.78 times the odds for men
#----------------------- question 3 -----------------------------------
fitP <- glm(S ~ P, family = binomial(link = "logit")) # logistic model of S onto P
summary(fitP)
exp(coefficients(fitP)[2])
# odds of third class passengers are half the odds of second class passengers
#----------------------- question 4 -----------------------------------
fitGP <- glm(S ~ P + G, family = binomial(link = "logit")) # logistic model of S onto P and G
summary(fitGP)
exp(coefficients(fitGP)[2:3])
# the odds-ratio of gender increases once passenger class is included in the model
#----------------------- question 5 -----------------------------------
# Stepwise model selection starting from the model with both predictors.
step(fitGP)


#----------------------------------------------------------------------
#                       Exercise 4.2
#----------------------------------------------------------------------
#----------------------- question 1 -----------------------------------
Y <- readRDS("data/LenzI.rds")
S <- ifelse(Y$status == "dead", 1, 0)              # binary response: 1 when status is "dead"
G <- Y$gender
D <- Y$diagno
E <- Y$ecog
A <- Y$age
R <- Y$regim
St <- Y$stage
L <- Y$ldhrat
Ex <- Y$extnod                                     # was used below without ever being defined
#----------------------- question 2 -----------------------------------
fit <- glm(S ~ G, family = binomial(link = "logit"))
summary(fit)
#----------------------- question 3 -----------------------------------
# `family` is now spelled out in full; `famil=` only worked through partial
# argument matching.
fit <- glm(S ~ D, family = binomial(link = "logit"))
summary(fit)
#----------------------- question 4 -----------------------------------
fitE <- glm(S ~ E, family = binomial(link = "logit"))
summary(fitE)
fitED <- glm(S ~ E + D, family = binomial(link = "logit"))
summary(fitED)
fitEDi <- glm(S ~ E * D, family = binomial(link = "logit"))   # with interaction
summary(fitEDi)
step(fitEDi)
#----------------------- question 4 -----------------------------------
# Keep only the observations where extnod is observed, in every variable.
keep <- !is.na(Y$extnod)
Ex <- Ex[keep]
G <- G[keep]
S <- S[keep]
D <- D[keep]
A <- A[keep]
R <- R[keep]
E <- E[keep]
St <- St[keep]
L <- L[keep]
#----------------------- question 5 -----------------------------------
# NOTE(review): step() stops if the number of rows in use changes between
# steps; other covariates (e.g. E) may still contain NA -- confirm.
fitfull <- glm(S ~ E + D + A + R + St + L + Ex, family = binomial(link = "logit"), na.action = na.exclude)
step(fitfull)

#----------------------------------------------------------------------
#                       Exercise 4.3
#----------------------------------------------------------------------
#----------------------- question 1 -----------------------------------
library(glmnet)                                   # already loaded if the earlier exercises were run
XT <- readRDS("data/LenzT.rds")
Y <- readRDS("data/LenzI.rds")
names(Y)
# Inspect and sample from XT itself: X at this point is only a leftover of a
# previous exercise (or does not exist at all in a fresh session).
dim(XT)
rg <- sample(colnames(XT), 500)                   # random genes
X <- XT[, rg]                                     # sub-matrix of XT
Ys <- Y$status
#----------------------- question 2 -----------------------------------
fit_lasso <- glmnet(X, Ys, family = "binomial")   # lasso-penalised logistic regression
plot(fit_lasso, xvar = "lambda")                  # regularisation path
#----------------------- question 3 -----------------------------------
cvfit <- cv.glmnet(X, Ys, family = "binomial")    # cross-validation selection of lambda
coef(cvfit, s = "lambda.min")                     # sparse estimation of the coefficients
print_glmnet_coefs(cvfit)                         # non-zero coefficients
#----------------------- question 4 -----------------------------------
cvfit <- cv.glmnet(XT, Ys, family = "binomial")   # same selection on the complete matrix
print_glmnet_coefs(cvfit)                         # non-zero coefficients


#----------------------------------------------------------------------
#                       Exercise 4.4
#----------------------------------------------------------------------
#----------------------- question 1 -----------------------------------
XT <- readRDS("data/LenzT.rds")
Y <- readRDS("data/LenzI.rds")
Fo <- Y$follup
S <- ifelse(Y$status == "dead", 1, 0)              # event indicator: 1 when status is "dead"
G <- Y$gender
#----------------------- question 2 -----------------------------------
library(survival)
# Cox model of the follow-up time against gender: short print, then summary.
coxph(Surv(Fo, S) ~ G)
summary(coxph(Surv(Fo, S) ~ G))
#----------------------- question 3 -----------------------------------
rg <- sample(colnames(XT), 1000)                   # draw 1000 gene names at random
X <- XT[, rg]                                      # matching sub-matrix of XT
Fo2 <- replace(Fo, Fo == 0, 0.01)                  # null follow-up times become 0.01
fit_lasso <- glmnet(X, cbind(time = Fo2, status = S), family = "cox")
plot(fit_lasso, xvar = "lambda")                   # regularisation path
#----------------------- question 4 -----------------------------------
cvfit <- cv.glmnet(X, cbind(time = Fo2, status = S), family = "cox")
coef(cvfit, s = "lambda.min")                      # sparse coefficient estimates
print_glmnet_coefs(cvfit)                          # table of the non-zero coefficients
#----------------------- question 5 -----------------------------------
# Same selection on the complete matrix XT.
cvfit <- cv.glmnet(XT, cbind(time = Fo2, status = S), family = "cox")
print_glmnet_coefs(cvfit)                          # table of the non-zero coefficients
