Feature Selection and mAb Classification

Read the dataset and preprocess it.

Remove mAb 135 because it is missing Neut_micro.

df <- read.csv("./data/master_log.csv", na.strings=c("?", "nd"), strip.white=TRUE, row.names=c("Ab"))
df <- df[!rownames(df) %in% c("135"),]
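
As a quick check (a minimal sketch), the number of remaining rows missing Neut_micro should now be zero:

sum(is.na(df$Neut_micro))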

Convert Epitope Class to a factor.

df$Epitope_Class <- factor(df$Epitope_Class)

Drop columns with missing values.

df <- subset(df, select=-c(Protect_binary, Epitope_Class_ELISA, Endotoxin, Endotoxin.1, mW_Loss, aTTD, Epitope.Class..assigning.method.)) #Remove columns with empty values.
df[,"Escape..code"] <- as.factor(df[,"Escape..code"])
df[,"Makona.binding"] <- as.factor(df[,"Makona.binding"])

Transform unNeutFrac, Neut_dVP30, and Neut_VSV so that they correlate positively with Protection.

df[,"unNeutFrac"] <- 1- df[,"unNeutFrac"]
df[,"Neut_dVP30"] <- 1- df[,"Neut_dVP30"]
df[,"Neut_VSV"] <- (100 - df[,"Neut_VSV"])/100

Separate human IgG1 from mouse IgG1.

levels(df[,"Isotype"]) <- c(levels(df[,"Isotype"]), "HumanIgG1")
df[df[,"Isotype"]=="IgG1" & df[,"Species"]=="human", "Isotype"] <- "HumanIgG1"
levels(df[,"Isotype"]) <- c(levels(df[,"Isotype"]), "MouseIgG1")
df[df[,"Isotype"]=="IgG1" & df[,"Species"]=="mouse", "Isotype"] <- "MouseIgG1"
df[,"Isotype"] <- droplevels(df[,"Isotype"])

Convert Polyfunctionality to a numeric type.

df[,"Polyfunctionality"] <- as.numeric(df[,"Polyfunctionality"])

Add an Epitope Tier column.

df[df$Epitope_Class %in% c("Cap", "GP1/Head", "Mucin"), "Epitope_Tier"] <- "Tier1"
df[df$Epitope_Class %in% c("Base", "GP1/Core", "Fusion"), "Epitope_Tier"] <- "Tier2"
df[df$Epitope_Class %in% c("GP1/2", "HR2"), "Epitope_Tier"] <- "Tier3"
df[df$Epitope_Class %in% c("Unknown"), "Epitope_Tier"] <- "TierUnknown"

Drop identical features and features with missing experimental data.

t <- subset(df, select=-c(Round, Makona.binding, Escape..code, Epitope_Tier, Total_SA,Cross.reactivity))

Remove features with near-zero variance, then center and scale the remaining features.

library(caret)
library(ggplot2)
s <- dummyVars(~., data = t, levelsOnly=TRUE)
q <- predict(s, t)
h <- nearZeroVar(q, saveMetrics=TRUE)

t.var <- apply(q, 2, var)
t.var <- as.data.frame(t.var)
t.var[,"Var"] <- rownames(t.var)
t.var[,"nzv"] <- h$nzv
ggplot(t.var, aes(x=reorder(Var, t.var), y=t.var)) + geom_bar(stat="identity", aes(fill=nzv)) + scale_y_continuous(trans='log10') + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + scale_fill_manual(values = c("steelblue", "indianred"),name= "Near Zero Variance") + ggtitle("Features with near zero variance that will be removed.") +xlab("Feature") + ylab("log10(Variance)")

q <- q[,(colnames(q) %in% rownames(h[h[,"nzv"]==FALSE | h[,"zeroVar"]==TRUE,]))]
q <- subset(q, select =-c(Unknown, sGP.binderFALSE))
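
For reference, the diagnostic metrics behind the near-zero-variance flags can be inspected (a sketch):

h[h$nzv == TRUE, c("freqRatio", "percentUnique", "zeroVar")]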

norm <- preProcess(q, method=c("center", "scale"))
q <- predict(norm, q)
q <- as.data.frame(q)
head(q)
##   Protection      Base        Cap     GP1/2   GP1/Core      Mucin
## 1  0.4289280 -0.301588  1.6930603 -0.301588 -0.4854046 -0.3677495
## 2 -1.0652518 -0.301588 -0.5871096 -0.301588 -0.4854046  2.7029591
## 3 -0.7664158 -0.301588  1.6930603 -0.301588 -0.4854046 -0.3677495
## 4 -0.4675799 -0.301588 -0.5871096 -0.301588 -0.4854046  2.7029591
## 5 -0.1687439 -0.301588  1.6930603 -0.301588 -0.4854046 -0.3677495
## 6  0.1300921 -0.301588 -0.5871096 -0.301588 -0.4854046  2.7029591
##     Neut_VSV unNeutFrac Neut_micro Neut_dVP30    GP_EC50   sGP_EC50
## 1 -0.7135912 -0.7075685 -1.0074874 -0.6029719 -0.3779862 -1.0684007
## 2 -0.7135912 -0.7075685 -1.0074874 -0.6029719 -0.3799966  0.9542301
## 3 -0.7135912 -0.7075685 -0.5342092 -0.6029719  0.3680830 -1.0247257
## 4 -0.7135912 -0.7075685 -1.0074874 -0.6029719 -0.3809804  0.9542301
## 5  1.4151100  0.8370870  0.4596750 -0.6029719 -0.1946556 -1.0545575
## 6 -0.7135912 -0.7075685 -1.0074874 -0.6029719 -0.3654533  0.9542301
##        human      mouse      IgG2a  HumanIgG1  MouseIgG1 Polyfunctionality
## 1  0.8059481 -0.8059481 -0.5318939  0.8160809 -0.3677495         1.5775151
## 2  0.8059481 -0.8059481 -0.5318939  0.8160809 -0.3677495         1.5775151
## 3 -1.2333449  1.2333449  1.8688163 -1.2180312 -0.3677495        -0.5730565
## 4  0.8059481 -0.8059481 -0.5318939  0.8160809 -0.3677495         1.5775151
## 5 -1.2333449  1.2333449  1.8688163 -1.2180312 -0.3677495        -0.5730565
## 6 -1.2333449  1.2333449  1.8688163 -1.2180312 -0.3677495        -0.0354136
##       huADCP      mADCP     CD107a       IFNg     MIP.1b     huADNP
## 1  0.9420515  1.4928146  1.4855860  1.4721098  1.6246636  4.3450612
## 2 -0.6288721  1.0345560  2.0526654  1.9820216  2.5693466  3.3395864
## 3  0.4718370 -0.2366025 -0.4504150 -0.4327751 -0.5714592 -0.6443498
## 4  0.6436291  0.8137532  2.4809491  2.1313530  3.1345643  2.4476388
## 5  1.7027185 -0.2927284 -0.4067935 -0.4021804 -0.5029962 -0.5420307
## 6 -0.1046406  0.5491598 -0.2783084 -0.4262191 -0.3862376  0.2830299
##         mADNP sGP_huADCP sGP_huADNP Total_G0   Total_G1   Total_G2
## 1 -0.12824504  1.0847236  1.0050389 2.337267 -2.5251621 -0.8281199
## 2 -0.61622083 -0.3945998  0.4790355 1.619398 -1.6106900 -0.8294177
## 3 -0.28376405  1.0175055 -0.3613869 0.701013 -0.5869325 -0.5607759
## 4 -0.94029636 -0.3672924 -0.1666382 2.546092 -2.6400880 -1.1045484
## 5 -0.87790251  0.3006875 -0.1921194 1.325271 -1.5218515 -0.3038141
## 6 -0.04536366 -1.1169318 -0.4437456 1.033914 -1.0565072 -0.4777175
##   Total_Fucose Total_Bisecting Total_mono_SA         G0        G0F
## 1   -5.3719604      -0.1683970    -0.1427142  5.5195898 -2.1311307
## 2   -2.4561018      -0.7369070     0.3107384  2.5342760 -0.3934907
## 3    0.3274304      -0.9737862     0.6070378 -0.2220388  0.9525519
## 4   -5.3980247      -0.4810775    -0.4144584  5.5077929 -1.9230581
## 5    0.2532474      -0.3863258     1.2618430 -0.2539599  1.6387239
## 6    0.3341135      -0.9927366     0.6905255 -0.2504902  1.3295374
##         G0FB         G1         G1.       G1F        G1F.       G1S1F
## 1 -1.1673856 -0.7037159  0.22572059 -1.797929 -2.00926166  0.43313807
## 2 -0.7431267 -0.2825927 -0.25374768 -1.285002 -1.26239409  0.65351661
## 3 -0.6017070 -0.4184389 -0.10914614 -1.299307 -0.07266768 -0.02840944
## 4 -0.6788450 -0.6357928  0.26377363 -1.771363 -1.99464586 -0.35274011
## 5 -0.6017070 -0.2554234  0.08111905 -1.509791 -1.15058320  0.85726280
## 6 -0.6402760 -0.3641004 -0.37551740 -1.381048 -0.48848729 -0.09909689
##         G1FB        G2F         G2FB       G2S1       G2S1F       G2S1B
## 1 -0.4367530 -0.6487676 -0.329686262  0.1525347 -0.70223990  1.64704157
## 2 -0.3973272 -0.9934518 -0.351766713  0.4920949 -0.01897163 -0.07530043
## 3 -0.5944562 -1.2077149 -0.351766713 -0.5077212  1.13312379 -0.96759809
## 4 -0.6601659 -1.0291623 -0.219284004  0.7373328 -0.64968080  0.69249058
## 5 -0.5681724 -1.2387675 -0.009519716 -0.3945344  1.29921054  1.14901497
## 6 -0.6995917 -1.2092675 -0.384887390 -0.2750596  1.17937580 -0.42806927
##       G2S1FB sGP.binderTRUE
## 1  0.5254709      1.0522613
## 2 -0.4418074     -0.9446437
## 3 -0.4686763      1.0522613
## 4 -0.1328157     -0.9446437
## 5 -0.6164549      1.0522613
## 6 -0.6836270     -0.9446437
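
As a quick sanity check (sketch), the preprocessed features should now have mean roughly 0 and standard deviation roughly 1:

range(colMeans(q))
range(apply(q, 2, sd))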

Let’s build a classifier that labels mAbs as “high” or “low” protection. We set the threshold between “high” and “low” at 0.6.

threshold <- 0.6
t[t[,"Protection"]<threshold,"label"] <- "Low"
t[(t[,"Protection"]>=threshold),"label"] <- "High"
q[,"label"] <- t[,"label"]

t <- q
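
Before training, it is worth checking the class balance induced by this threshold (a minimal sketch):

table(t$label)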

Before we start training our models, let’s enable R to use multiple cores.

library(doMC)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
registerDoMC(cores = 16)
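
The core count is hard-coded to 16 here; as a sketch, other machines could detect the available cores instead (leaving one core free):

library(parallel)
registerDoMC(cores = max(1, detectCores() - 1))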

Let’s first train a Random Forest model and get its predictions.

repeats <- 1000
set.seed(112358)
rfControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats= repeats,
  verboseIter = FALSE,
  returnData = FALSE,
  allowParallel = TRUE,
  summaryFunction = twoClassSummary,
  classProbs = TRUE,
  savePredictions = T,
  search="grid"
)

rfTrain <- train(
  x = as.matrix(subset(t, select=-c(label, Protection))),
  y = as.character(t[,"label"]),
  trControl = rfControl,
  method = "rf",
  ntree = 1000,
  tuneLength = 10
)
## Warning in train.default(x = as.matrix(subset(t, select = -c(label,
## Protection))), : The metric "Accuracy" was not in the result set. ROC will
## be used instead.
rfTrain
## Random Forest 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1000 times) 
## Summary of sample sizes: 150, 150, 150, 150, 151, 150, ... 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens      Spec     
##    2    0.9525084  0.728150  0.9604356
##    7    0.9522726  0.806900  0.9484303
##   12    0.9487039  0.817065  0.9479485
##   17    0.9449837  0.813575  0.9463879
##   22    0.9416123  0.812030  0.9432561
##   27    0.9385880  0.809420  0.9404083
##   32    0.9362907  0.807640  0.9352356
##   37    0.9345193  0.807210  0.9305008
##   42    0.9332669  0.807080  0.9279485
##   48    0.9316669  0.805435  0.9250939
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
rfTrain$finalModel
## 
## Call:
##  randomForest(x = x, y = y, ntree = 1000, mtry = param$mtry) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 8.98%
## Confusion matrix:
##      High Low class.error
## High   36  12  0.25000000
## Low     3 116  0.02521008
predRf <- rfTrain$pred
predRf <- predRf[predRf[,"mtry"]==rfTrain$bestTune$mtry,]

Let’s plot feature importance for the Random Forest model.

impRf <- varImp(rfTrain$finalModel, scaled=FALSE)
impRf[,"Variable"] <- rownames(impRf)
impRf <- impRf[with(impRf, order(-Overall)),]
impRf[,"Variable"] <- factor(impRf[,"Variable"], levels = impRf[,"Variable"])
pp <- ggplot(impRf, aes(Variable, Overall)) + geom_bar(stat="identity", fill="#000000")
pp + theme_bw()+ theme(text = element_text(size=14), axis.text = element_text(size=14), panel.border = element_blank(), panel.grid.major.x = element_blank(), panel.grid.major.y = element_line( size=.3, color="#f5f5f5"), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), axis.text.x = element_text(angle = 90, hjust = 1)) + xlab("Feature") + ylab("Mean Decrease in Gini Index")

Let’s now train a logistic regression model with elastic net regularization.

require(methods)
## Loading required package: methods
lrControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats= repeats,
  verboseIter = FALSE,
  returnData = FALSE,
  allowParallel = TRUE,
  summaryFunction = twoClassSummary,
  classProbs = TRUE,
  savePredictions = T
)

set.seed(112358)
lrTrain <- train(
  x = as.matrix(subset(t, select=-c(label, Protection))),
  y = as.character(t[,"label"]),
  trControl = lrControl,
  method = "glmnet",
  tuneLength = 10
)
## Warning in train.default(x = as.matrix(subset(t, select = -c(label,
## Protection))), : The metric "Accuracy" was not in the result set. ROC will
## be used instead.
lrTrain
## glmnet 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1000 times) 
## Summary of sample sizes: 150, 150, 150, 150, 151, 150, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda        ROC        Sens      Spec     
##   0.1    0.0001439009  0.9022667  0.773270  0.8887076
##   0.1    0.0003324298  0.9028229  0.773545  0.8894235
##   0.1    0.0007679560  0.9124555  0.777705  0.8980848
##   0.1    0.0017740779  0.9237042  0.781010  0.9040197
##   0.1    0.0040983502  0.9353654  0.786125  0.9085947
##   0.1    0.0094677206  0.9450206  0.783035  0.9123780
##   0.1    0.0218716632  0.9508816  0.771535  0.9232629
##   0.1    0.0505263800  0.9543302  0.763675  0.9457023
##   0.1    0.1167224941  0.9546458  0.744200  0.9539394
##   0.1    0.2696441069  0.9539600  0.735500  0.9538417
##   0.2    0.0001439009  0.8960588  0.770740  0.8818212
##   0.2    0.0003324298  0.9021359  0.773695  0.8887205
##   0.2    0.0007679560  0.9119925  0.778115  0.8977833
##   0.2    0.0017740779  0.9238889  0.782355  0.9043750
##   0.2    0.0040983502  0.9362814  0.788130  0.9095091
##   0.2    0.0094677206  0.9461351  0.783980  0.9132114
##   0.2    0.0218716632  0.9515568  0.773055  0.9240023
##   0.2    0.0505263800  0.9552831  0.761840  0.9467803
##   0.2    0.1167224941  0.9566325  0.743340  0.9509720
##   0.2    0.2696441069  0.9506997  0.711135  0.9587962
##   0.3    0.0001439009  0.8948953  0.771030  0.8806598
##   0.3    0.0003324298  0.9014411  0.773985  0.8879409
##   0.3    0.0007679560  0.9113319  0.778485  0.8974909
##   0.3    0.0017740779  0.9238642  0.783500  0.9044727
##   0.3    0.0040983502  0.9370390  0.789580  0.9101000
##   0.3    0.0094677206  0.9470046  0.784790  0.9141023
##   0.3    0.0218716632  0.9526017  0.777685  0.9253705
##   0.3    0.0505263800  0.9577725  0.748915  0.9466773
##   0.3    0.1167224941  0.9571508  0.739710  0.9523303
##   0.3    0.2696441069  0.9470105  0.714255  0.9645409
##   0.4    0.0001439009  0.8942170  0.771190  0.8801091
##   0.4    0.0003324298  0.9007486  0.774145  0.8868932
##   0.4    0.0007679560  0.9105733  0.778610  0.8967242
##   0.4    0.0017740779  0.9236388  0.784035  0.9043811
##   0.4    0.0040983502  0.9376111  0.790515  0.9104030
##   0.4    0.0094677206  0.9475278  0.786135  0.9152083
##   0.4    0.0218716632  0.9544834  0.784175  0.9278083
##   0.4    0.0505263800  0.9597870  0.739580  0.9465652
##   0.4    0.1167224941  0.9542294  0.743350  0.9491280
##   0.4    0.2696441069  0.9356844  0.726895  0.9601697
##   0.5    0.0001439009  0.8934652  0.771235  0.8794121
##   0.5    0.0003324298  0.8999180  0.774325  0.8857538
##   0.5    0.0007679560  0.9097052  0.778815  0.8959167
##   0.5    0.0017740779  0.9234156  0.785015  0.9042955
##   0.5    0.0040983502  0.9377031  0.791075  0.9109258
##   0.5    0.0094677206  0.9481332  0.788420  0.9169970
##   0.5    0.0218716632  0.9561970  0.788005  0.9307538
##   0.5    0.0505263800  0.9604094  0.737255  0.9462311
##   0.5    0.1167224941  0.9509791  0.745440  0.9466432
##   0.5    0.2696441069  0.9210561  0.681955  0.9703205
##   0.6    0.0001439009  0.8925075  0.771080  0.8786742
##   0.6    0.0003324298  0.8988819  0.773995  0.8846485
##   0.6    0.0007679560  0.9087216  0.778215  0.8948856
##   0.6    0.0017740779  0.9229032  0.786100  0.9041545
##   0.6    0.0040983502  0.9374145  0.790790  0.9107750
##   0.6    0.0094677206  0.9487077  0.791910  0.9187144
##   0.6    0.0218716632  0.9575510  0.787665  0.9338318
##   0.6    0.0505263800  0.9602879  0.739295  0.9451212
##   0.6    0.1167224941  0.9485951  0.749475  0.9468174
##   0.6    0.2696441069  0.9199951  0.352010  0.9771939
##   0.7    0.0001439009  0.8913529  0.771355  0.8778864
##   0.7    0.0003324298  0.8976728  0.774165  0.8833121
##   0.7    0.0007679560  0.9073753  0.778175  0.8939227
##   0.7    0.0017740779  0.9221378  0.786650  0.9040871
##   0.7    0.0040983502  0.9368032  0.790830  0.9110856
##   0.7    0.0094677206  0.9493961  0.794605  0.9198061
##   0.7    0.0218716632  0.9586158  0.787065  0.9363432
##   0.7    0.0505263800  0.9592765  0.744465  0.9427826
##   0.7    0.1167224941  0.9437099  0.749260  0.9443235
##   0.7    0.2696441069  0.9194806  0.011830  0.9978621
##   0.8    0.0001439009  0.8901186  0.772415  0.8769636
##   0.8    0.0003324298  0.8961788  0.775050  0.8821182
##   0.8    0.0007679560  0.9059348  0.778705  0.8919606
##   0.8    0.0017740779  0.9207007  0.787120  0.9035015
##   0.8    0.0040983502  0.9359925  0.790735  0.9112545
##   0.8    0.0094677206  0.9500041  0.796835  0.9209712
##   0.8    0.0218716632  0.9594174  0.787885  0.9380242
##   0.8    0.0505263800  0.9578768  0.751910  0.9414924
##   0.8    0.1167224941  0.9374282  0.748690  0.9430750
##   0.8    0.2696441069  0.9177588  0.000040  0.9999833
##   0.9    0.0001439009  0.8883450  0.773995  0.8759061
##   0.9    0.0003324298  0.8938113  0.776450  0.8801152
##   0.9    0.0007679560  0.9037259  0.779200  0.8898545
##   0.9    0.0017740779  0.9184661  0.786955  0.9030311
##   0.9    0.0040983502  0.9348438  0.791265  0.9112333
##   0.9    0.0094677206  0.9505600  0.799165  0.9218083
##   0.9    0.0218716632  0.9600598  0.789330  0.9378977
##   0.9    0.0505263800  0.9563553  0.757495  0.9403576
##   0.9    0.1167224941  0.9302110  0.748335  0.9421068
##   0.9    0.2696441069  0.9158173  0.000000  1.0000000
##   1.0    0.0001439009  0.8862564  0.776300  0.8733462
##   1.0    0.0003324298  0.8917341  0.778115  0.8775591
##   1.0    0.0007679560  0.9012294  0.780095  0.8869500
##   1.0    0.0017740779  0.9166701  0.787225  0.9014591
##   1.0    0.0040983502  0.9350004  0.791665  0.9106955
##   1.0    0.0094677206  0.9515805  0.802095  0.9226273
##   1.0    0.0218716632  0.9606717  0.794515  0.9364333
##   1.0    0.0505263800  0.9548985  0.764665  0.9383273
##   1.0    0.1167224941  0.9223452  0.748605  0.9418333
##   1.0    0.2696441069  0.9053560  0.000000  1.0000000
## 
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 1 and lambda = 0.02187166.
predlr <- lrTrain$pred
predlr <- predlr[predlr[,"alpha"]==lrTrain$bestTune$alpha & predlr[,"lambda"]==lrTrain$bestTune$lambda, ]

For the logistic regression, notice how alpha = 1 (lasso) performs best based on ROC. Let’s now train a support vector machine.

set.seed(112358)
svmControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats= repeats,
  classProbs = T,
  savePredictions = T
)

svmTrainLinear <- train(label~., data = subset(t, select=-c(Protection)), trControl = svmControl, method = "svmLinear", tuneLength = 10)
svmTrainRadial <- train(label~., data = subset(t, select=-c(Protection)), trControl = svmControl, method = "svmRadial", tuneLength = 10)

svmTrainLinear
## Support Vector Machines with Linear Kernel 
## 
## 167 samples
##  48 predictor
##   2 classes: 'High', 'Low' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1000 times) 
## Summary of sample sizes: 150, 150, 150, 150, 151, 150, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8623524  0.6488982
## 
## Tuning parameter 'C' was held constant at a value of 1
svmTrainRadial
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 167 samples
##  48 predictor
##   2 classes: 'High', 'Low' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1000 times) 
## Summary of sample sizes: 151, 150, 151, 150, 151, 150, ... 
## Resampling results across tuning parameters:
## 
##   C       Accuracy   Kappa    
##     0.25  0.8729162  0.6904658
##     0.50  0.8719364  0.6647638
##     1.00  0.8681088  0.6531761
##     2.00  0.8660776  0.6479920
##     4.00  0.8705183  0.6633729
##     8.00  0.8660380  0.6517849
##    16.00  0.8613192  0.6385256
##    32.00  0.8552618  0.6254806
##    64.00  0.8532770  0.6207187
##   128.00  0.8529305  0.6197895
## 
## Tuning parameter 'sigma' was held constant at a value of 0.01357846
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.01357846 and C = 0.25.
predSVMLinear <- svmTrainLinear$pred
predSVMRadial <- svmTrainRadial$pred

Based on resampled accuracy, the linear kernel performs comparably to the radial basis function kernel; we proceed with the linear SVM.

predSVM <- predSVMLinear

Let’s now train a k-nearest neighbours model.

## KNN
set.seed(112358)
knnControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats= repeats,
  savePredictions = T,
  classProbs= T
)
knnTrain <- train(label~., data = subset(t, select=-c(Protection)), trControl = knnControl, method = "knn", tuneLength = 10)
predKnn <- knnTrain$pred
predKnn <- predKnn[predKnn[,"k"]==knnTrain$bestTune$k,]

knnTrain
## k-Nearest Neighbors 
## 
## 167 samples
##  48 predictor
##   2 classes: 'High', 'Low' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1000 times) 
## Summary of sample sizes: 150, 150, 150, 150, 151, 150, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.8850371  0.7125980
##    7  0.8829172  0.7067125
##    9  0.8776160  0.6920031
##   11  0.8711967  0.6763921
##   13  0.8747048  0.6892204
##   15  0.8806433  0.7059315
##   17  0.8817743  0.7084903
##   19  0.8738948  0.6884186
##   21  0.8781741  0.6983748
##   23  0.8842415  0.7097732
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.

Let’s now compute the ROC curves for all four classifiers. The function getRocCV collects the predictions and observed labels for each iteration (we used 1000 iterations per classifier), and getAvgRoc computes the average of all 1000 ROC curves.

library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
getRocCV <- function(pred){
    # Collect the predicted "Low" class probabilities and the observed labels
    # for each of the `repeats` cross-validation repeats, as parallel lists.
    temp.obs <- c()
    temp.pred <- c()
    for(i in seq(1, repeats)){
        temp <- paste("Rep", sprintf("%04d",i), sep="")
        temp.p <- pred[grepl(temp, pred[,"Resample"]), "Low"]
        temp.o <- pred[grepl(temp, pred[,"Resample"]), "obs"]
        temp.pred[[length(temp.pred)+1]] <- temp.p
        temp.obs[[length(temp.obs)+1]] <- temp.o
    }
    return(list(temp.pred, temp.obs));
}

getAvgRoc <- function(roc){
    # Average the per-repeat ROC curves: pad each curve's x/y vectors with NA
    # to a common length, then take row-wise means while ignoring the padding.
    m <- max(sapply(roc@x.values, length))
    resx <- sapply(roc@x.values, function(x){
        x <- c(x, rep(NA, m-length(x)));
    });
    resy <- sapply(roc@y.values, function(x){
        x <- c(x, rep(NA, m-length(x)));
    });
    roc.df <- data.frame(rowMeans(as.data.frame(resx), na.rm=T), rowMeans(as.data.frame(resy), na.rm=T))
    colnames(roc.df) <- c("meanx", "meany")
    return(roc.df)
}

predRf <- getRocCV(predRf)
p <- prediction(predRf[1][[1]], predRf[2][[1]])
rocRf <- performance(p, "tpr", "fpr")
rocRf.avg <- getAvgRoc(rocRf)

predlr <- getRocCV(predlr)
p <- prediction(predlr[1][[1]], predlr[2][[1]])
roclr <- performance(p, "tpr", "fpr")
roclr.avg <- getAvgRoc(roclr)

predSVM <- getRocCV(predSVM)
p <- prediction(predSVM[1][[1]], predSVM[2][[1]])
rocSvm <- performance(p, "tpr", "fpr")
rocSvm.avg <- getAvgRoc(rocSvm)

predKnn <- getRocCV(predKnn)
p <- prediction(predKnn[1][[1]], predKnn[2][[1]])
rocKnn <- performance(p, "tpr", "fpr")
rocKnn.avg <- getAvgRoc(rocKnn)

Let’s compute the average accuracy and AUC across all 1000 iterations for each classifier and plot the ROC curves.

l <- c("Random Forest", "Logistic Regression", "SVM", "KNN")
## Get AUC
r <- list(predRf, predlr, predSVM, predKnn)
auc <- c()
acc <- c()
for(i in seq(1, length(r))){
    p <- prediction(r[i][[1]][[1]], r[i][[1]][[2]])
    a <- performance(p, "auc")
    a <- mean(unlist(a@y.values))
    auc <- c(auc,a)
    a <- performance(p, "acc")
    a <- mean(unlist(a@y.values))
    acc <- c(acc,a)
}

acc
## [1] 0.6985934 0.6864788 0.6614572 0.7449007
auc
## [1] 0.9493283 0.9579699 0.8965198 0.9211666
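# Note (sketch): performance(p, "acc") returns accuracy as a function of the
# probability cutoff, so the means above average over all cutoffs rather than
# using a fixed threshold. A fixed-cutoff accuracy could instead be read
# directly from caret's prediction frame, e.g. for the random forest at its
# best mtry:
mean(rfTrain$pred[rfTrain$pred$mtry == rfTrain$bestTune$mtry, "pred"] ==
     rfTrain$pred[rfTrain$pred$mtry == rfTrain$bestTune$mtry, "obs"])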
r <- list(rocRf, roclr, rocSvm, rocKnn)
r.avg <- list(rocRf.avg, roclr.avg, rocSvm.avg, rocKnn.avg)
rocdf <- as.data.frame(matrix(ncol = 4, nrow = 0))
rocdf.avg <- as.data.frame(matrix(ncol = 3, nrow = 0))
for(i in seq(1, length(r))){
    r.t <- r[i]
    for(j in seq(1, repeats)){
        temp <- data.frame(r.t[[1]]@x.values[[j]], r.t[[1]]@y.values[[j]], rep(l[i], length(r.t[[1]]@y.values[[j]])), rep(paste(l[i], j, sep="."), length(r.t[[1]]@y.values[[j]])))
        colnames(temp) <- c("x", "y", "classifier", "rep")
        rocdf <- rbind(rocdf, temp)
    }
    r.t <- r.avg[[i]]
    temp <- data.frame(r.t$meanx, r.t$meany, rep(paste(l[i], "Average", sep=" "), length(r.t$meanx)))
    colnames(temp) <- c("meanx", "meany", "classifieravg")
    rocdf.avg <- rbind(rocdf.avg, temp)
}
colnames(rocdf) <- c("x", "y", "classifier", "rep")

library(RColorBrewer)

classifierColor <- brewer.pal(9,"Dark2")
## Warning in brewer.pal(9, "Dark2"): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors
cc <- c()
ll <- c()
for(i in seq(1, length(l))){
    cc <- c(cc, rep(classifierColor[i], 2))
    ll <- c(ll, l[i], paste(l[i], "Average", sep=" "))
}
names(cc) <- ll

pp <- ggplot(rocdf[rocdf[,"classifier"]=="Random Forest",]) + geom_line(aes(x = meanx, y = meany), data=rocdf.avg[rocdf.avg[,"classifieravg"]=="Random Forest Average",], alpha=1, size=0.5) + geom_abline(color="#707070") + geom_boxplot(alpha=0.05, aes(x=x, y=y,group = cut_width(x, 0.02))) + scale_color_manual(values=cc) +xlab("FPR") +ylab("TPR") + ggtitle(paste("ROC Curves for Random Forest model with 10-fold CV repeated", repeats, "times"))
pp

pp <- ggplot(rocdf[rocdf[,"classifier"]=="Logistic Regression",]) + geom_line(aes(x = meanx, y = meany), data=rocdf.avg[rocdf.avg[,"classifieravg"]=="Logistic Regression Average",], alpha=1, size=0.5) + geom_abline(color="#707070") + geom_boxplot(alpha=0.05, aes(x=x, y=y,group = cut_width(x, 0.02))) + scale_color_manual(values=cc) +xlab("FPR") +ylab("TPR") + ggtitle(paste("ROC Curves for Logistic Regression model with 10-fold CV repeated", repeats, "times"))
pp

pp <- ggplot(rocdf[rocdf[,"classifier"]=="SVM",]) + geom_line(aes(x = meanx, y = meany), data=rocdf.avg[rocdf.avg[,"classifieravg"]=="SVM Average",], alpha=1, size=0.5) + geom_abline(color="#707070") + geom_boxplot(alpha=0.05, aes(x=x, y=y,group = cut_width(x, 0.02))) + scale_color_manual(values=cc) +xlab("FPR") +ylab("TPR") + ggtitle(paste("ROC Curves for SVM with 10-fold CV repeated", repeats, "times"))
pp

pp <- ggplot(rocdf[rocdf[,"classifier"]=="KNN",]) + geom_line(aes(x = meanx, y = meany), data=rocdf.avg[rocdf.avg[,"classifieravg"]=="KNN Average",], alpha=1, size=0.5) + geom_abline(color="#707070") + geom_boxplot(alpha=0.05, aes(x=x, y=y,group = cut_width(x, 0.02))) + scale_color_manual(values=cc) +xlab("FPR") +ylab("TPR") + ggtitle(paste("ROC Curves for KNN model with 10-fold CV repeated", repeats, "times"))
pp

pp <- ggplot() + geom_line(aes(meanx, meany, color=classifieravg), data=rocdf.avg, alpha=1, size=0.5) + geom_abline(color="#707070") + scale_color_manual(values=cc) +xlab("FPR") +ylab("TPR") + ggtitle(paste("Average ROC Curves for 10-fold CV repeated", repeats, "times"))
for(i in seq(1, length(auc))){
    print(i)
    pp <- pp + annotate("text", label = paste(l[i], "AUC:", round(auc[i], 3), "ACC:", round(acc[i], 3), sep=" "), x = 1, y = 0.25-(i*0.05), size = 4, colour = cc[l[i]][[1]], hjust=1)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
pp <- pp + theme_bw()+ theme(text = element_text(size=14), axis.text = element_text(size=14), panel.border = element_blank(), panel.grid.major.x = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + ggtitle("ROC curves across 1000 iterations")
pp

Let’s take a look at the features selected by the logistic regression model.

bestcoef <- coef(lrTrain$finalModel, s = lrTrain$bestTune$lambda)
bestcoef <- as.data.frame(as.matrix(bestcoef))
temp <- bestcoef[bestcoef[,"1"]!=0 & rownames(bestcoef)!="(Intercept)",]
temp.name <- rownames(bestcoef)[bestcoef[,"1"]!=0 & rownames(bestcoef)!="(Intercept)"]
temp <- as.data.frame(temp)
colnames(temp) <- c("val")
temp$name <- temp.name
temp[,"val"] <- temp["val"]*-1
col <- as.vector(unlist(lapply(temp[,"val"], function(x){if(x>0) "Positive" else "Negative"})))
ggplot(temp, aes(y=val, x=reorder(name, abs(val)))) + geom_bar(stat="identity", aes(fill=col)) + coord_flip() + xlab("Feature") + ylab("Coefficient") + scale_fill_manual(guide=FALSE, values=c("Red", "Blue")) + theme(panel.background = element_rect(fill = '#FFFFFF', colour = '#000000'))

We see here that Neut_micro and Neut_VSV are the only neutralization readouts selected. Let’s examine the correlations between the different neutralization readouts.

library(reshape2)
c <- cor(df[,c("Protection", "Neut_micro", "Neut_dVP30", "Neut_VSV", "unNeutFrac")])
c
##            Protection Neut_micro Neut_dVP30  Neut_VSV unNeutFrac
## Protection  1.0000000  0.7264262  0.6954513 0.6874017  0.7251843
## Neut_micro  0.7264262  1.0000000  0.7432890 0.8452369  0.8492636
## Neut_dVP30  0.6954513  0.7432890  1.0000000 0.8315803  0.8916419
## Neut_VSV    0.6874017  0.8452369  0.8315803 1.0000000  0.9705272
## unNeutFrac  0.7251843  0.8492636  0.8916419 0.9705272  1.0000000
c <- melt(c)
ggplot(c, aes(x=Var1, y=Var2)) + geom_tile(aes(fill=value)) + scale_fill_gradient(low="white", high="steelblue")

Let’s look at how each of the four neutralization readouts relates to Protection.

library(patchwork)

plts <- list()
for(i in c("Neut_micro", "Neut_dVP30", "Neut_VSV", "unNeutFrac")){
    p <- ggplot(df, aes_string(x=i,y="Protection")) + geom_point() + geom_smooth(method='lm') + theme_bw()
    plts[[length(plts)+1]] <- p
}

p <- plts[[1]]
for(x in plts[-1]){
    p <- p+ x
}

p + plot_layout(ncol = 2)

We see that the four neutralization readouts are highly correlated. Let’s try removing Neut_micro and Neut_dVP30 to see how the classifier does with only unNeutFrac and Neut_VSV.

t <- subset(t, select=-c(Neut_micro, Neut_dVP30))
lrControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats= repeats,
  verboseIter = FALSE,
  returnData = FALSE,
  allowParallel = TRUE,
  summaryFunction = twoClassSummary,
  classProbs = TRUE,
  savePredictions = T
)

set.seed(112358)
lrTrain <- train(
  x = as.matrix(subset(t, select=-c(label, Protection))),
  y = as.character(t[,"label"]),
  trControl = lrControl,
  method = "glmnet",
  tuneLength = 10
)
## Warning in train.default(x = as.matrix(subset(t, select = -c(label,
## Protection))), : The metric "Accuracy" was not in the result set. ROC will
## be used instead.
lrTrain
## glmnet 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1000 times) 
## Summary of sample sizes: 150, 150, 150, 150, 151, 150, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda        ROC        Sens      Spec     
##   0.1    0.0001407182  0.8716917  0.708545  0.8707682
##   0.1    0.0003250773  0.8726078  0.709085  0.8716583
##   0.1    0.0007509707  0.8850941  0.712015  0.8847492
##   0.1    0.0017348398  0.8975295  0.705670  0.8956606
##   0.1    0.0040077050  0.9090007  0.701920  0.9010129
##   0.1    0.0092583185  0.9191009  0.709030  0.9072992
##   0.1    0.0213879171  0.9274150  0.703090  0.9178015
##   0.1    0.0494088638  0.9339700  0.711125  0.9356818
##   0.1    0.1141408866  0.9365498  0.720145  0.9487015
##   0.1    0.2636802588  0.9337961  0.707345  0.9699167
##   0.2    0.0001407182  0.8604415  0.694380  0.8608947
##   0.2    0.0003250773  0.8719087  0.708780  0.8709652
##   0.2    0.0007509707  0.8849660  0.712210  0.8842962
##   0.2    0.0017348398  0.8981791  0.706705  0.8961818
##   0.2    0.0040077050  0.9104803  0.701815  0.9023136
##   0.2    0.0092583185  0.9211828  0.707240  0.9094795
##   0.2    0.0213879171  0.9296668  0.706240  0.9222652
##   0.2    0.0494088638  0.9371056  0.715165  0.9387470
##   0.2    0.1141408866  0.9375664  0.730145  0.9557758
##   0.2    0.2636802588  0.9310188  0.658375  0.9617576
##   0.3    0.0001407182  0.8583485  0.690935  0.8588530
##   0.3    0.0003250773  0.8710855  0.707985  0.8702750
##   0.3    0.0007509707  0.8845583  0.712455  0.8835659
##   0.3    0.0017348398  0.8988328  0.707640  0.8966424
##   0.3    0.0040077050  0.9118799  0.701710  0.9036447
##   0.3    0.0092583185  0.9231815  0.704730  0.9116045
##   0.3    0.0213879171  0.9319316  0.708700  0.9252538
##   0.3    0.0494088638  0.9386165  0.723500  0.9424485
##   0.3    0.1141408866  0.9367500  0.743730  0.9523977
##   0.3    0.2636802588  0.9278429  0.585910  0.9529621
##   0.4    0.0001407182  0.8571024  0.689275  0.8576235
##   0.4    0.0003250773  0.8701175  0.707105  0.8692083
##   0.4    0.0007509707  0.8840902  0.712790  0.8826333
##   0.4    0.0017348398  0.8992454  0.708640  0.8971492
##   0.4    0.0040077050  0.9131922  0.702595  0.9049197
##   0.4    0.0092583185  0.9249430  0.703540  0.9140326
##   0.4    0.0213879171  0.9341662  0.709130  0.9278992
##   0.4    0.0494088638  0.9398105  0.732710  0.9443288
##   0.4    0.1141408866  0.9358463  0.743265  0.9435598
##   0.4    0.2636802588  0.9231022  0.403625  0.9667697
##   0.5    0.0001407182  0.8556533  0.687020  0.8564053
##   0.5    0.0003250773  0.8688303  0.705675  0.8682879
##   0.5    0.0007509707  0.8835236  0.713085  0.8813008
##   0.5    0.0017348398  0.8996328  0.709470  0.8972106
##   0.5    0.0040077050  0.9144863  0.704090  0.9063273
##   0.5    0.0092583185  0.9265201  0.702055  0.9164970
##   0.5    0.0213879171  0.9360828  0.709185  0.9283811
##   0.5    0.0494088638  0.9408444  0.743195  0.9465780
##   0.5    0.1141408866  0.9345570  0.729725  0.9366114
##   0.5    0.2636802588  0.8973614  0.012780  0.9965136
##   0.6    0.0001407182  0.8540244  0.684285  0.8548689
##   0.6    0.0003250773  0.8675458  0.704025  0.8668045
##   0.6    0.0007509707  0.8825884  0.713015  0.8800856
##   0.6    0.0017348398  0.8997948  0.709645  0.8970545
##   0.6    0.0040077050  0.9155266  0.705430  0.9076477
##   0.6    0.0092583185  0.9282102  0.702440  0.9196371
##   0.6    0.0213879171  0.9375682  0.711915  0.9277705
##   0.6    0.0494088638  0.9415923  0.751220  0.9458098
##   0.6    0.1141408866  0.9321814  0.721720  0.9293038
##   0.6    0.2636802588  0.8667154  0.000085  0.9999583
##   0.7    0.0001407182  0.8519575  0.681820  0.8529447
##   0.7    0.0003250773  0.8659119  0.701655  0.8651182
##   0.7    0.0007509707  0.8814914  0.712635  0.8789561
##   0.7    0.0017348398  0.8997292  0.710325  0.8960727
##   0.7    0.0040077050  0.9164052  0.707125  0.9086985
##   0.7    0.0092583185  0.9297317  0.703870  0.9227977
##   0.7    0.0213879171  0.9388396  0.716710  0.9284326
##   0.7    0.0494088638  0.9419743  0.751940  0.9413152
##   0.7    0.1141408866  0.9308437  0.722670  0.9250250
##   0.7    0.2636802588  0.8675297  0.000000  1.0000000
##   0.8    0.0001407182  0.8495185  0.678460  0.8506083
##   0.8    0.0003250773  0.8638317  0.698460  0.8627409
##   0.8    0.0007509707  0.8801207  0.712975  0.8771598
##   0.8    0.0017348398  0.8991881  0.710715  0.8945561
##   0.8    0.0040077050  0.9170448  0.708450  0.9092712
##   0.8    0.0092583185  0.9310097  0.705920  0.9245977
##   0.8    0.0213879171  0.9400690  0.721660  0.9302288
##   0.8    0.0494088638  0.9419780  0.748975  0.9358068
##   0.8    0.1141408866  0.9273848  0.737350  0.9196295
##   0.8    0.2636802588  0.8684132  0.000000  1.0000000
##   0.9    0.0001407182  0.8461051  0.675720  0.8475758
##   0.9    0.0003250773  0.8610645  0.694740  0.8592553
##   0.9    0.0007509707  0.8780555  0.711885  0.8745826
##   0.9    0.0017348398  0.8985008  0.711180  0.8928485
##   0.9    0.0040077050  0.9176233  0.710035  0.9093985
##   0.9    0.0092583185  0.9318462  0.706620  0.9243265
##   0.9    0.0213879171  0.9413319  0.727950  0.9323992
##   0.9    0.0494088638  0.9415059  0.747355  0.9328561
##   0.9    0.1141408866  0.9213497  0.765630  0.9124886
##   0.9    0.2636802588  0.8711438  0.000000  1.0000000
##   1.0    0.0001407182  0.8414896  0.673195  0.8409727
##   1.0    0.0003250773  0.8573410  0.689490  0.8534902
##   1.0    0.0007509707  0.8763516  0.712485  0.8701447
##   1.0    0.0017348398  0.8983501  0.715020  0.8907182
##   1.0    0.0040077050  0.9187792  0.716665  0.9098008
##   1.0    0.0092583185  0.9330482  0.713940  0.9236742
##   1.0    0.0213879171  0.9422862  0.739255  0.9318583
##   1.0    0.0494088638  0.9403839  0.749090  0.9287652
##   1.0    0.1141408866  0.9066605  0.783610  0.9078826
##   1.0    0.2636802588  0.8711618  0.000000  1.0000000
## 
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 1 and lambda = 0.02138792.
predlr <- lrTrain$pred
predlr <- predlr[predlr[,"alpha"]==lrTrain$bestTune$alpha & predlr[,"lambda"]==lrTrain$bestTune$lambda, ]

predlr <- getRocCV(predlr)
p <- prediction(predlr[1][[1]], predlr[2][[1]])
a <- performance(p, "auc")
a <- mean(unlist(a@y.values))
auc <- a
a <- performance(p, "acc")
a <- mean(unlist(a@y.values))
acc <-a

auc
## [1] 0.9372778
acc
## [1] 0.6780533
bestcoef <- coef(lrTrain$finalModel, s = lrTrain$bestTune$lambda)
bestcoef <- as.data.frame(as.matrix(bestcoef))
temp <- bestcoef[bestcoef[,"1"]!=0 & rownames(bestcoef)!="(Intercept)",]
temp.name <- rownames(bestcoef)[bestcoef[,"1"]!=0 & rownames(bestcoef)!="(Intercept)"]
temp <- as.data.frame(temp)
colnames(temp) <- c("val")
temp$name <- temp.name
temp[,"val"] <- temp["val"]*-1
col <- as.vector(unlist(lapply(temp[,"val"], function(x){if(x>0) "Positive" else "Negative"})))
ggplot(temp, aes(y=val, x=reorder(name, abs(val)))) + geom_bar(stat="identity", aes(fill=col)) + coord_flip() + xlab("Feature") + ylab("Coefficient") + scale_fill_manual(guide=FALSE, values=c("Red", "Blue")) + theme(panel.background = element_rect(fill = '#FFFFFF', colour = '#000000'), panel.grid.major.x = element_blank(), panel.grid.major.y = element_blank())