This document presents an example analysis of the ABIDE data set, which first identifies important neuroimaging features (out of 2000) that may be predictive of an autism diagnosis, and then predicts the diagnosis for single subjects and evaluates the prediction performance.
# NOTE(review): removed `rm(list = ls())` -- wiping the user's workspace is a
# script side effect; rerun in a fresh R session instead.

# Install required packages once, interactively:
# install.packages(c("arm", "glmnet", "knitr", "crossval"))

library(glmnet)   # penalized (LASSO) regression
library(arm)      # coefplot()
library(knitr)
library(crossval) # CV harness; library() errors on failure, unlike require()

# Fixed seed so the train/test split and CV folds are reproducible.
seed <- 1234

# NOTE(review): absolute, machine-specific paths -- make relative/configurable.
data_phenotypes <- read.table(
  '/Users/lzhao/My_Works/projects/PheWas/ABIDE/ABIDE-databag-males/FS_metrics_restructured_ABIDE_no_IQ.csv',
  sep = ",", header = TRUE
)
data_genotypes <- read.table(
  '/Users/lzhao/My_Works/projects/PheWas/ABIDE/ABIDE-databag-males/ABIDE_demog_agg_no_IQ.csv',
  sep = ",", header = TRUE
)
# Keep only minors (age <= 18); adults are discarded.
age_subg <- (data_genotypes$age <= 18)

# Outcome: diagnosis group, recoded to 1 = Autism, 0 = Control.
# NOTE(review): the comparison string " Autism" carries a leading space --
# presumably the CSV values are padded; verify against the raw file.
Y <- as.data.frame(data_genotypes$DX_group[age_subg])
table(Y)  # observed counts: Autism 312, Control 305
Y <- ifelse(Y == " Autism", 1, 0)
Y <- as.factor(Y)

# Predictors: age plus all phenotype (imaging) features (first column of the
# phenotype table is skipped -- presumably a subject ID; confirm).
X <- cbind(age = data_genotypes$age[age_subg],
           data_phenotypes[age_subg, 2:ncol(data_phenotypes)])
X <- as.matrix(X)

# 80/20 train/test split, reproducible via the fixed seed.
sample_size <- floor(0.8 * length(Y))
set.seed(seed)
input.train.index <- sample(seq_along(Y), size = sample_size)

# TRAINING DATA
input.train  <- X[input.train.index, ]
output.train <- Y[input.train.index]

# TESTING DATA
input.test  <- X[-input.train.index, ]
output.test <- Y[-input.train.index]
# Fit the full LASSO path (alpha = 1 -> pure L1 penalty) for logistic regression.
fitLASSO <- glmnet(input.train, output.train, alpha = 1, family = "binomial")

# Coefficient paths vs log(lambda).
# FIX: label takes a logical -- pass TRUE, not the string "TRUE".
plot(fitLASSO, xvar = "lambda", label = TRUE)
mtext("Number of Nonzero (Active) Coefficients", side = 3, line = 2.5)
The LASSO path plot shows how the LASSO regression shrinks the beta coefficients and selects active variables as lambda changes.
# Choose lambda by cross-validation on the training data
# (same penalty and family as the path fit above).
set.seed(seed)
cvLASSO <- cv.glmnet(
  x      = input.train,
  y      = output.train,
  alpha  = 1,
  family = "binomial"
)

# CV deviance curve with the nonzero-coefficient counts along the top axis.
plot(cvLASSO)
mtext("Number of Nonzero (Active) Coefficients", side = 3, line = 2.5)
# Coefficients at the CV-optimal lambda. coef() returns, in order:
# (Intercept), then X's columns -- i.e. age, then the 2000 phenotype features.
betaHatLASSO <- as.double(coef(fitLASSO, s = cvLASSO$lambda.min)) # s is lambda

par(mar = c(2, 20, 1, 1)) # extra large left margin for long variable names

# BUG FIX: the name vector must line up with betaHatLASSO. Previously
# varNames_all held only the 2000 phenotype names while betaHatLASSO has 2002
# entries (intercept + age + features), so the selected names were shifted by
# two positions and the tail indexed past the vector (producing NAs). Prepend
# labels for the intercept and the age column to restore alignment.
varNames_all <- c("(Intercept)", "age",
                  colnames(data_phenotypes[ , 2:ncol(data_phenotypes)]))
varNames_active <- varNames_all[betaHatLASSO != 0]

# sd = 0 suppresses error bars; its length now matches the plotted coefficients.
coefplot(rev(betaHatLASSO[betaHatLASSO != 0]),
         sd = rep(0, length(varNames_active)),
         cex.pts = 1,
         main = "Regression Coefficient Estimates",
         varnames = rev(varNames_active))
71 variables (out of 2000) had nonzero coefficients and were selected as important.
# 1. Define the LASSO prediction function for crossval(): fit a LASSO logistic
#    model on the CV-training fold, predict on the held-out fold, and return
#    the confusion matrix (negative = label of the negative class).
predfun.lasso = function(train.x, train.y, test.x, test.y, negative)
{ # The LASSO (alpha = 1), refit on this fold's training data.
  lasso.fit = glmnet(train.x, train.y, alpha = 1, family = "binomial")

  # BUG FIX: predict with the model fitted on THIS fold (lasso.fit), not the
  # global fitLASSO trained outside the CV loop -- predicting with the global
  # model ignores the fold refit and biases the CV performance estimate.
  # NOTE(review): lambda is still taken from the global cvLASSO; ideally each
  # fold would run its own cv.glmnet, at extra compute cost.
  ynew = predict(lasso.fit, s=cvLASSO$lambda.min, type="response", newx=test.x)

  # Threshold the predicted probabilities at 0.5.
  ynew.bin <- ifelse(ynew>=0.5, 1, 0)
  out = confusionMatrix(test.y, ynew.bin, negative=negative)
  return( out )
}
# "0" (control) is the negative class for the confusion-matrix summaries.
neg <- "0" # control

# 2. Evaluate prediction performance: 5-fold cross-validation, repeated
#    B = 10 times, on the full data set.
cv.out <- crossval::crossval(
  predfun.lasso, X, Y,
  K = 5, B = 10,
  negative = neg
)
## Number of folds: 5
## Total number of CV fits: 50
##
## Round # 1 of 10
## CV Fit # 1 of 50
## CV Fit # 2 of 50
## CV Fit # 3 of 50
## CV Fit # 4 of 50
## CV Fit # 5 of 50
##
## Round # 2 of 10
## CV Fit # 6 of 50
## CV Fit # 7 of 50
## CV Fit # 8 of 50
## CV Fit # 9 of 50
## CV Fit # 10 of 50
##
## Round # 3 of 10
## CV Fit # 11 of 50
## CV Fit # 12 of 50
## CV Fit # 13 of 50
## CV Fit # 14 of 50
## CV Fit # 15 of 50
##
## Round # 4 of 10
## CV Fit # 16 of 50
## CV Fit # 17 of 50
## CV Fit # 18 of 50
## CV Fit # 19 of 50
## CV Fit # 20 of 50
##
## Round # 5 of 10
## CV Fit # 21 of 50
## CV Fit # 22 of 50
## CV Fit # 23 of 50
## CV Fit # 24 of 50
## CV Fit # 25 of 50
##
## Round # 6 of 10
## CV Fit # 26 of 50
## CV Fit # 27 of 50
## CV Fit # 28 of 50
## CV Fit # 29 of 50
## CV Fit # 30 of 50
##
## Round # 7 of 10
## CV Fit # 31 of 50
## CV Fit # 32 of 50
## CV Fit # 33 of 50
## CV Fit # 34 of 50
## CV Fit # 35 of 50
##
## Round # 8 of 10
## CV Fit # 36 of 50
## CV Fit # 37 of 50
## CV Fit # 38 of 50
## CV Fit # 39 of 50
## CV Fit # 40 of 50
##
## Round # 9 of 10
## CV Fit # 41 of 50
## CV Fit # 42 of 50
## CV Fit # 43 of 50
## CV Fit # 44 of 50
## CV Fit # 45 of 50
##
## Round # 10 of 10
## CV Fit # 46 of 50
## CV Fit # 47 of 50
## CV Fit # 48 of 50
## CV Fit # 49 of 50
## CV Fit # 50 of 50
# Summarize the aggregated CV confusion-matrix counts into accuracy,
# sensitivity, specificity, PPV, NPV and log odds ratio.
diagnosticErrors(cv.out$stat)
## acc sens spec ppv npv lor
## 0.7585089 0.7435897 0.7737705 0.7707641 0.7468354 2.2944360
The prediction accuracy was 75.85%, sensitivity was 74.36% and specificity was 77.38%.