This document presents an example analysis on the ABIDE data set, which first identifies important neuroimaging features (out of 2000) that may be predictive of autisum diagnosis, and then predicts the diagnosis for single subjects and evaluates the prediction performance.
rm(list=ls())
# install and load required packages
#install.packages("arm")
#install.packages("glmnet")
#install.packages("knitr")
#install.packages("crossval")
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-5
library(arm)
## Loading required package: MASS
## Loading required package: lme4
##
## arm (Version 1.9-3, built: 2016-11-21)
## Working directory is /Users/lzhao/My_Works/projects/PheWas/R_scripts
library(knitr)
require(crossval)
## Loading required package: crossval
# pick random seed
seed = 1234
data_phenotypes <- read.table('/Users/lzhao/My_Works/projects/PheWas/ABIDE/ABIDE-databag-males/FS_metrics_restructured_ABIDE_no_IQ.csv',sep=",", header=T)
data_genotypes <- read.table('/Users/lzhao/My_Works/projects/PheWas/ABIDE/ABIDE-databag-males/ABIDE_demog_agg_no_IQ.csv',sep=",", header=T)
age_subg<-(data_genotypes$age<=18) # discard adults
Y<-as.data.frame(data_genotypes$DX_group[age_subg]); table(Y); Y <- ifelse(Y==" Autism",1,0); Y <- as.factor(Y)
## Y
## Autism Control
## 312 305
X<-cbind(data_genotypes$age[age_subg], data_phenotypes[age_subg,2:ncol(data_phenotypes)])
X<-as.matrix(X)
sample_size <- floor(0.8 * length(Y))
set.seed(seed)
input.train.index <- sample(seq_len(length(Y)), size = sample_size)
input.train <- X[input.train.index, ]
output.train <- Y[input.train.index]
# TESTING DATA
input.test <- X[-input.train.index, ]
output.test <- Y[-input.train.index]
fitLASSO = glmnet(input.train, output.train, alpha = 1,family = "binomial")
plot(fitLASSO,xvar="lambda", label="TRUE")
mtext("Number of Nonzero (Active) Coefficients", side=3, line=2.5)
The plot of LASSO path shows how the LASSO regression shrinks the beta coefficients and selects active variables with the change of Lambda.
set.seed(seed) # set seed
cvLASSO = cv.glmnet(input.train, output.train, alpha = 1,family = "binomial" )
plot(cvLASSO)
mtext("Number of Nonzero (Active) Coefficients", side=3, line=2.5)
betaHatLASSO = as.double(coef(fitLASSO, s = cvLASSO$lambda.min)) # s is lambda
par(mar=c(2,20,1,1)) # extra large left margin
varNames_all <- colnames(data_phenotypes[ , 2:ncol(data_phenotypes)]); #varNames; length(varNames)
varNames_active <- varNames_all[betaHatLASSO!=0] ;
coefplot(rev(betaHatLASSO[betaHatLASSO!=0]), sd = rep(0,length(varNames_active)-1), cex.pts = 1, main = "Regression Coefficient Estimates", varnames = rev(varNames_active))
71 variables with nonzero coefficients were selected as important (out of 2000).
# 1. Define LASSO Prediction fundtion
predfun.lasso = function(train.x, train.y, test.x, test.y, negative)
{ lasso.fit = glmnet(train.x, train.y, alpha = 1, family = "binomial") # The LASSO
# ynew = predict(lasso.fit, s=0.01, test.x, type="response")
ynew = predict(fitLASSO, s=cvLASSO$lambda.min, type="response", newx=test.x)
ynew.bin <- ifelse(ynew>=0.5, 1, 0)
out = confusionMatrix(test.y, ynew.bin, negative=negative)
return( out )
}
neg <- "0" # control
# 2. evaluate prediction performance with 5-fold cross validation
cv.out <- crossval::crossval(predfun.lasso, X, Y, K = 5, B = 10, negative = neg)
## Number of folds: 5
## Total number of CV fits: 50
##
## Round # 1 of 10
## CV Fit # 1 of 50
## CV Fit # 2 of 50
## CV Fit # 3 of 50
## CV Fit # 4 of 50
## CV Fit # 5 of 50
##
## Round # 2 of 10
## CV Fit # 6 of 50
## CV Fit # 7 of 50
## CV Fit # 8 of 50
## CV Fit # 9 of 50
## CV Fit # 10 of 50
##
## Round # 3 of 10
## CV Fit # 11 of 50
## CV Fit # 12 of 50
## CV Fit # 13 of 50
## CV Fit # 14 of 50
## CV Fit # 15 of 50
##
## Round # 4 of 10
## CV Fit # 16 of 50
## CV Fit # 17 of 50
## CV Fit # 18 of 50
## CV Fit # 19 of 50
## CV Fit # 20 of 50
##
## Round # 5 of 10
## CV Fit # 21 of 50
## CV Fit # 22 of 50
## CV Fit # 23 of 50
## CV Fit # 24 of 50
## CV Fit # 25 of 50
##
## Round # 6 of 10
## CV Fit # 26 of 50
## CV Fit # 27 of 50
## CV Fit # 28 of 50
## CV Fit # 29 of 50
## CV Fit # 30 of 50
##
## Round # 7 of 10
## CV Fit # 31 of 50
## CV Fit # 32 of 50
## CV Fit # 33 of 50
## CV Fit # 34 of 50
## CV Fit # 35 of 50
##
## Round # 8 of 10
## CV Fit # 36 of 50
## CV Fit # 37 of 50
## CV Fit # 38 of 50
## CV Fit # 39 of 50
## CV Fit # 40 of 50
##
## Round # 9 of 10
## CV Fit # 41 of 50
## CV Fit # 42 of 50
## CV Fit # 43 of 50
## CV Fit # 44 of 50
## CV Fit # 45 of 50
##
## Round # 10 of 10
## CV Fit # 46 of 50
## CV Fit # 47 of 50
## CV Fit # 48 of 50
## CV Fit # 49 of 50
## CV Fit # 50 of 50
diagnosticErrors(cv.out$stat)
## acc sens spec ppv npv lor
## 0.7585089 0.7435897 0.7737705 0.7707641 0.7468354 2.2944360
The prediction accuracy was 75.85%, sensitivity was 74.36% and specificity was 77.38%.