Feature Selection and mAb Classification

Read dataset and preprocess.

Remove mAb 135 because it is missing Neut_micro.

df <- read.csv("./data/master_log.csv", na.strings=c("?", "nd"), strip.white=TRUE, row.names=c("Ab"))
df <- df[!rownames(df) %in% c("135"),]

Convert Epitope Class to a factor.

df$Epitope_Class <- factor(df$Epitope_Class)

Drop columns with missing values.

df <- subset(df, select=-c(Protect_binary, Epitope_Class_ELISA, Endotoxin, Endotoxin.1, mW_Loss, aTTD, Epitope.Class..assigning.method.)) #Remove columns with empty values.
df[,"Escape..code"] <- as.factor(df[,"Escape..code"])
df[,"Makona.binding"] <- as.factor(df[,"Makona.binding"])

Transform into IIP.

df[,"unNeutFrac"] <- -log10(1/df[,"unNeutFrac"])
df[,"Neut_dVP30"] <- 1- df[,"Neut_dVP30"]
df[,"Neut_VSV"] <- log10(100/df[,"Neut_VSV"])

Let’s examine the correlations between the different neutralization readouts.

library(ggplot2)
theme_set(theme_bw(base_size = 14))
library(reshape2)
c <- cor(df[,c("Protection", "Neut_micro", "Neut_dVP30", "Neut_VSV", "unNeutFrac")])
c
##            Protection Neut_micro Neut_dVP30   Neut_VSV unNeutFrac
## Protection  1.0000000  0.7264262  0.6954513  0.7210592 -0.7437038
## Neut_micro  0.7264262  1.0000000  0.7432890  0.7995016 -0.7810941
## Neut_dVP30  0.6954513  0.7432890  1.0000000  0.8395953 -0.9443173
## Neut_VSV    0.7210592  0.7995016  0.8395953  1.0000000 -0.9178748
## unNeutFrac -0.7437038 -0.7810941 -0.9443173 -0.9178748  1.0000000
c <- melt(c)
ggplot(c, aes(x=Var1, y=Var2)) + geom_tile(aes(fill=value)) + scale_fill_gradient(low="white", high="steelblue")

Let’s look at the distribution of the four neutrlization readouts vs Protection.

library(patchwork)

plts <- list()
for(i in c("Neut_micro", "Neut_dVP30", "Neut_VSV", "unNeutFrac")){
    p <- ggplot(df, aes_string(x=i,y="Protection")) + geom_point() + geom_smooth(method='lm') + theme_bw()
    plts[[length(plts)+1]] <- p
}

p <- plts[[1]]
for(x in plts[-1]){
    p <- p+ x
}

p + plot_layout(ncol = 2)

Separate Human IgG1 and Mouse IgG1

levels(df[,"Isotype"]) <- c(levels(df[,"Isotype"]), "HumanIgG1")
df[df[,"Isotype"]=="IgG1" & df[,"Species"]=="human", "Isotype"] <- "HumanIgG1"
levels(df[,"Isotype"]) <- c(levels(df[,"Isotype"]), "MouseIgG1")
df[df[,"Isotype"]=="IgG1" & df[,"Species"]=="mouse", "Isotype"] <- "MouseIgG1"
df[,"Isotype"] <- droplevels(df[,"Isotype"])

Convert polyfunctionality to numeric type.

df[,"Polyfunctionality"] <- as.numeric(df[,"Polyfunctionality"])

Add Epitope Tier columns

df[df$Epitope_Class %in% c("Cap", "GP1/Head", "Mucin"), "Epitope_Tier"] <- "Tier1"
df[df$Epitope_Class %in% c("Base", "GP1/Core", "Fusion"), "Epitope_Tier"] <- "Tier2"
df[df$Epitope_Class %in% c("GP1/2", "HR2"), "Epitope_Tier"] <- "Tier3"
df[df$Epitope_Class %in% c("Unknown"), "Epitope_Tier"] <- "TierUnknown"

Dropidentical features and features with missing experimental data.

t <- subset(df, select=-c(Round, Makona.binding, Escape..code, Epitope_Tier, Total_SA,Cross.reactivity))
t <- subset(t, select=-c(Isotype))

Remove features with near zero variance. Features are centered and scaled.

library(caret)
s <- dummyVars(~., data <- t, levelsOnly=TRUE)
q <- predict(s, t)
h <- nearZeroVar(q, saveMetrics=TRUE)

t.var <- apply(q, 2, var)
t.var <- as.data.frame(t.var)
t.var[,"Var"] <- rownames(t.var)
t.var[,"nzv"] <- h$nzv
ggplot(t.var, aes(x=reorder(Var, t.var), y=t.var)) + geom_bar(stat="identity", aes(fill=nzv)) + scale_y_continuous(trans='log10') + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + scale_fill_manual(values = c("steelblue", "indianred"),name= "Near Zero Variance") + ggtitle("Features with near zero variance that will be removed.") +xlab("Feature") + ylab("log10(Variance)")

q <- q[,(colnames(q) %in% rownames(h[h[,"nzv"]==FALSE ,]))]
q <- subset(q, select =-c(Unknown, sGP.binderFALSE))

norm <- preProcess(q, method=c("center", "scale"))
q <- predict(norm, q)
q <- as.data.frame(q)
head(q)
##   Protection      Base        Cap     GP1/2   GP1/Core      Mucin
## 1  0.4289280 -0.301588  1.6930603 -0.301588 -0.4854046 -0.3677495
## 2 -1.0652518 -0.301588 -0.5871096 -0.301588 -0.4854046  2.7029591
## 3 -0.7664158 -0.301588  1.6930603 -0.301588 -0.4854046 -0.3677495
## 4 -0.4675799 -0.301588 -0.5871096 -0.301588 -0.4854046  2.7029591
## 5 -0.1687439 -0.301588  1.6930603 -0.301588 -0.4854046 -0.3677495
## 6  0.1300921 -0.301588 -0.5871096 -0.301588 -0.4854046  2.7029591
##     Neut_VSV unNeutFrac Neut_micro Neut_dVP30    GP_EC50   sGP_EC50
## 1 -0.6613522  0.6363919 -1.0074874 -0.6029719 -0.3779862 -1.0684007
## 2 -0.6613522  0.6363919 -1.0074874 -0.6029719 -0.3799966  0.9542301
## 3 -0.6613522  0.6363919 -0.5342092 -0.6029719  0.3680830 -1.0247257
## 4 -0.6613522  0.6363919 -1.0074874 -0.6029719 -0.3809804  0.9542301
## 5  1.0107626  0.1670253  0.4596750 -0.6029719 -0.1946556 -1.0545575
## 6 -0.6613522  0.6363919 -1.0074874 -0.6029719 -0.3654533  0.9542301
##        human      mouse Polyfunctionality     huADCP      mADCP     CD107a
## 1  0.8059481 -0.8059481         1.5775151  0.9420515  1.4928146  1.4855860
## 2  0.8059481 -0.8059481         1.5775151 -0.6288721  1.0345560  2.0526654
## 3 -1.2333449  1.2333449        -0.5730565  0.4718370 -0.2366025 -0.4504150
## 4  0.8059481 -0.8059481         1.5775151  0.6436291  0.8137532  2.4809491
## 5 -1.2333449  1.2333449        -0.5730565  1.7027185 -0.2927284 -0.4067935
## 6 -1.2333449  1.2333449        -0.0354136 -0.1046406  0.5491598 -0.2783084
##         IFNg     MIP.1b     huADNP       mADNP sGP_huADCP sGP_huADNP
## 1  1.4721098  1.6246636  4.3450612 -0.12824504  1.0847236  1.0050389
## 2  1.9820216  2.5693466  3.3395864 -0.61622083 -0.3945998  0.4790355
## 3 -0.4327751 -0.5714592 -0.6443498 -0.28376405  1.0175055 -0.3613869
## 4  2.1313530  3.1345643  2.4476388 -0.94029636 -0.3672924 -0.1666382
## 5 -0.4021804 -0.5029962 -0.5420307 -0.87790251  0.3006875 -0.1921194
## 6 -0.4262191 -0.3862376  0.2830299 -0.04536366 -1.1169318 -0.4437456
##   Total_G0   Total_G1   Total_G2 Total_Fucose Total_Bisecting
## 1 2.337267 -2.5251621 -0.8281199   -5.3719604      -0.1683970
## 2 1.619398 -1.6106900 -0.8294177   -2.4561018      -0.7369070
## 3 0.701013 -0.5869325 -0.5607759    0.3274304      -0.9737862
## 4 2.546092 -2.6400880 -1.1045484   -5.3980247      -0.4810775
## 5 1.325271 -1.5218515 -0.3038141    0.2532474      -0.3863258
## 6 1.033914 -1.0565072 -0.4777175    0.3341135      -0.9927366
##   Total_mono_SA         G0        G0F       G0FB         G1         G1.
## 1    -0.1427142  5.5195898 -2.1311307 -1.1673856 -0.7037159  0.22572059
## 2     0.3107384  2.5342760 -0.3934907 -0.7431267 -0.2825927 -0.25374768
## 3     0.6070378 -0.2220388  0.9525519 -0.6017070 -0.4184389 -0.10914614
## 4    -0.4144584  5.5077929 -1.9230581 -0.6788450 -0.6357928  0.26377363
## 5     1.2618430 -0.2539599  1.6387239 -0.6017070 -0.2554234  0.08111905
## 6     0.6905255 -0.2504902  1.3295374 -0.6402760 -0.3641004 -0.37551740
##         G1F        G1F.       G1S1F       G1FB        G2F         G2FB
## 1 -1.797929 -2.00926166  0.43313807 -0.4367530 -0.6487676 -0.329686262
## 2 -1.285002 -1.26239409  0.65351661 -0.3973272 -0.9934518 -0.351766713
## 3 -1.299307 -0.07266768 -0.02840944 -0.5944562 -1.2077149 -0.351766713
## 4 -1.771363 -1.99464586 -0.35274011 -0.6601659 -1.0291623 -0.219284004
## 5 -1.509791 -1.15058320  0.85726280 -0.5681724 -1.2387675 -0.009519716
## 6 -1.381048 -0.48848729 -0.09909689 -0.6995917 -1.2092675 -0.384887390
##         G2S1       G2S1F       G2S1B     G2S1FB sGP.binderTRUE
## 1  0.1525347 -0.70223990  1.64704157  0.5254709      1.0522613
## 2  0.4920949 -0.01897163 -0.07530043 -0.4418074     -0.9446437
## 3 -0.5077212  1.13312379 -0.96759809 -0.4686763      1.0522613
## 4  0.7373328 -0.64968080  0.69249058 -0.1328157     -0.9446437
## 5 -0.3945344  1.29921054  1.14901497 -0.6164549      1.0522613
## 6 -0.2750596  1.17937580 -0.42806927 -0.6836270     -0.9446437

Let’s build a classifier to classify mAbs into “high” and “low” protection. We set the threshold between “high” and “low” as 0.6.

threshold <- 0.6
t[t[,"Protection"]<threshold,"label"] <- "Low"
t[(t[,"Protection"]>=threshold),"label"] <- "High"
q[,"label"] <- t[,"label"]

t <- q

Before we start training our models. Let’s enable R to use multiple cores.

library(doMC)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
registerDoMC(cores = 16)

Let’s first train a Random Forest Model and get the predictions.

repeats <- 1000
set.seed(112358)
rfControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats= repeats,
  verboseIter = FALSE,
  returnData = FALSE,
  allowParallel = TRUE,
  summaryFunction = twoClassSummary,
  classProbs = TRUE,
  savePredictions = T,
  search="grid"
)

rfTrain <- train(
  x = as.matrix(subset(t, select=-c(label, Protection))),
  y = as.character(t[,"label"]),
  trControl = rfControl,
  method = "rf",
  ntree = 1000,
  tuneLength = 10
)
## Warning in train.default(x = as.matrix(subset(t, select = -c(label,
## Protection))), : The metric "Accuracy" was not in the result set. ROC will
## be used instead.
rfTrain
## Random Forest 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1000 times) 
## Summary of sample sizes: 150, 150, 150, 150, 151, 150, ... 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens      Spec     
##    2    0.9532592  0.739830  0.9609015
##    6    0.9528026  0.799540  0.9483038
##   11    0.9491424  0.814320  0.9477629
##   16    0.9450272  0.812080  0.9462788
##   21    0.9413282  0.809870  0.9432970
##   25    0.9387529  0.808605  0.9404811
##   30    0.9365608  0.807020  0.9356091
##   35    0.9344291  0.805315  0.9308591
##   40    0.9329533  0.804490  0.9273265
##   45    0.9316744  0.804155  0.9252356
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
rfTrain$finalModel
## 
## Call:
##  randomForest(x = x, y = y, ntree = 1000, mtry = param$mtry) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 9.58%
## Confusion matrix:
##      High Low class.error
## High   36  12  0.25000000
## Low     4 115  0.03361345
predRf <- rfTrain$pred
predRf <- predRf[predRf[,"mtry"]==rfTrain$bestTune$mtry,]

Let’s plot feature importance for the Random Forest model.

impRf <- varImp(rfTrain$finalModel, scaled=FALSE)
impRf[,"Variable"] <- rownames(impRf)
impRf <- impRf[with(impRf, order(-Overall)),]
impRf[,"Variable"] <- factor(impRf[,"Variable"], levels = impRf[,"Variable"])
pp <- ggplot(impRf, aes(Variable, Overall)) + geom_bar(stat="identity", fill="#000000")
pp + theme_bw()+ theme(text = element_text(size=14), axis.text = element_text(size=14), panel.border = element_blank(), panel.grid.major.x = element_blank(), panel.grid.major.y = element_line( size=.3, color="#f5f5f5"), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), axis.text.x = element_text(angle = 90, hjust = 1)) + xlab("Feature") + ylab("Mean Decrease in Gini Index")

Let’s now train a logistic regression model with elastic net regularization.

require(methods)
## Loading required package: methods
lrControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats= repeats,
  verboseIter = FALSE,
  returnData = FALSE,
  allowParallel = TRUE,
  summaryFunction = twoClassSummary,
  classProbs = TRUE,
  savePredictions = T
)

set.seed(112358)
lrTrain <- train(
  x = as.matrix(subset(t, select=-c(label, Protection))),
  y = as.character(t[,"label"]),
  trControl = lrControl,
  method = "glmnet",
  family="binomial",
  tuneLength = 10
)
## Warning in train.default(x = as.matrix(subset(t, select = -c(label,
## Protection))), : The metric "Accuracy" was not in the result set. ROC will
## be used instead.
lrTrain
## glmnet 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1000 times) 
## Summary of sample sizes: 150, 150, 150, 150, 151, 150, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda        ROC        Sens      Spec     
##   0.1    0.0001502019  0.9137703  0.778490  0.8895182
##   0.1    0.0003469858  0.9144522  0.779090  0.8903341
##   0.1    0.0008015822  0.9228753  0.781465  0.9011644
##   0.1    0.0018517590  0.9308202  0.786980  0.9049947
##   0.1    0.0042778035  0.9393052  0.792255  0.9071447
##   0.1    0.0098822812  0.9474335  0.800670  0.9161008
##   0.1    0.0228293518  0.9529352  0.790285  0.9291644
##   0.1    0.0527387649  0.9578061  0.764705  0.9516432
##   0.1    0.1218333901  0.9602731  0.747650  0.9571068
##   0.1    0.2814509401  0.9590345  0.744310  0.9593371
##   0.2    0.0001502019  0.9070355  0.773150  0.8827727
##   0.2    0.0003469858  0.9138259  0.778545  0.8896500
##   0.2    0.0008015822  0.9225662  0.781875  0.9008288
##   0.2    0.0018517590  0.9307521  0.786940  0.9050212
##   0.2    0.0042778035  0.9399869  0.791165  0.9073091
##   0.2    0.0098822812  0.9485003  0.798805  0.9168326
##   0.2    0.0228293518  0.9538168  0.787910  0.9305538
##   0.2    0.0527387649  0.9600655  0.761680  0.9546295
##   0.2    0.1218333901  0.9605824  0.762860  0.9569659
##   0.2    0.2814509401  0.9555530  0.766325  0.9643174
##   0.3    0.0001502019  0.9059250  0.772040  0.8816530
##   0.3    0.0003469858  0.9133042  0.777945  0.8891439
##   0.3    0.0008015822  0.9222647  0.781710  0.9003152
##   0.3    0.0018517590  0.9307360  0.786900  0.9050348
##   0.3    0.0042778035  0.9407258  0.790775  0.9074417
##   0.3    0.0098822812  0.9490719  0.795680  0.9177098
##   0.3    0.0228293518  0.9554276  0.788250  0.9326606
##   0.3    0.0527387649  0.9622674  0.762535  0.9552386
##   0.3    0.1218333901  0.9594614  0.785740  0.9555659
##   0.3    0.2814509401  0.9501256  0.752835  0.9661598
##   0.4    0.0001502019  0.9052845  0.771580  0.8810106
##   0.4    0.0003469858  0.9125261  0.777600  0.8885606
##   0.4    0.0008015822  0.9218761  0.781455  0.8997462
##   0.4    0.0018517590  0.9309344  0.786495  0.9048614
##   0.4    0.0042778035  0.9413537  0.790610  0.9076470
##   0.4    0.0098822812  0.9492646  0.794455  0.9186523
##   0.4    0.0228293518  0.9574250  0.790110  0.9353705
##   0.4    0.0527387649  0.9630433  0.770965  0.9564152
##   0.4    0.1218333901  0.9568769  0.787470  0.9508174
##   0.4    0.2814509401  0.9356430  0.698795  0.9661576
##   0.5    0.0001502019  0.9045698  0.771440  0.8801636
##   0.5    0.0003469858  0.9117711  0.777130  0.8877598
##   0.5    0.0008015822  0.9213198  0.781155  0.8990553
##   0.5    0.0018517590  0.9308780  0.786730  0.9044644
##   0.5    0.0042778035  0.9418714  0.790325  0.9081606
##   0.5    0.0098822812  0.9495269  0.794745  0.9198038
##   0.5    0.0228293518  0.9589687  0.791635  0.9380121
##   0.5    0.0527387649  0.9625539  0.777590  0.9569841
##   0.5    0.1218333901  0.9545387  0.776445  0.9469250
##   0.5    0.2814509401  0.9207545  0.588955  0.9663417
##   0.6    0.0001502019  0.9037667  0.771195  0.8796515
##   0.6    0.0003469858  0.9108517  0.776235  0.8866606
##   0.6    0.0008015822  0.9206239  0.781000  0.8978841
##   0.6    0.0018517590  0.9306203  0.786685  0.9043432
##   0.6    0.0042778035  0.9421677  0.790265  0.9086864
##   0.6    0.0098822812  0.9498382  0.795870  0.9205500
##   0.6    0.0228293518  0.9600617  0.791330  0.9406644
##   0.6    0.0527387649  0.9614406  0.782285  0.9563621
##   0.6    0.1218333901  0.9509284  0.770990  0.9474871
##   0.6    0.2814509401  0.9195468  0.428230  0.9784114
##   0.7    0.0001502019  0.9027185  0.770685  0.8790030
##   0.7    0.0003469858  0.9096162  0.775025  0.8855053
##   0.7    0.0008015822  0.9196314  0.780345  0.8966083
##   0.7    0.0018517590  0.9302576  0.786230  0.9037038
##   0.7    0.0042778035  0.9422897  0.790840  0.9094326
##   0.7    0.0098822812  0.9502333  0.796985  0.9210076
##   0.7    0.0228293518  0.9605094  0.789845  0.9424038
##   0.7    0.0527387649  0.9598780  0.786075  0.9532159
##   0.7    0.1218333901  0.9455998  0.760925  0.9465735
##   0.7    0.2814509401  0.9190737  0.149095  0.9980167
##   0.8    0.0001502019  0.9015157  0.770545  0.8782970
##   0.8    0.0003469858  0.9078433  0.773885  0.8839583
##   0.8    0.0008015822  0.9181157  0.779925  0.8953356
##   0.8    0.0018517590  0.9294290  0.785650  0.9028886
##   0.8    0.0042778035  0.9422117  0.790850  0.9094621
##   0.8    0.0098822812  0.9507573  0.798245  0.9220000
##   0.8    0.0228293518  0.9607676  0.789810  0.9433598
##   0.8    0.0527387649  0.9578892  0.787930  0.9491864
##   0.8    0.1218333901  0.9389147  0.740330  0.9444992
##   0.8    0.2814509401  0.9183226  0.002760  0.9998326
##   0.9    0.0001502019  0.8995496  0.771000  0.8771098
##   0.9    0.0003469858  0.9051700  0.773695  0.8816205
##   0.9    0.0008015822  0.9155809  0.779095  0.8925356
##   0.9    0.0018517590  0.9282226  0.784945  0.9018144
##   0.9    0.0042778035  0.9417558  0.791145  0.9097091
##   0.9    0.0098822812  0.9509883  0.800415  0.9229803
##   0.9    0.0228293518  0.9608764  0.791695  0.9436803
##   0.9    0.0527387649  0.9557799  0.788810  0.9457932
##   0.9    0.1218333901  0.9303100  0.732465  0.9444523
##   0.9    0.2814509401  0.9125319  0.000000  1.0000000
##   1.0    0.0001502019  0.8938665  0.770750  0.8739576
##   1.0    0.0003469858  0.8995557  0.772585  0.8775076
##   1.0    0.0008015822  0.9107408  0.777545  0.8882273
##   1.0    0.0018517590  0.9258452  0.784200  0.9001091
##   1.0    0.0042778035  0.9410308  0.791830  0.9098000
##   1.0    0.0098822812  0.9511013  0.802415  0.9242205
##   1.0    0.0228293518  0.9603100  0.793415  0.9424515
##   1.0    0.0527387649  0.9534488  0.787690  0.9405576
##   1.0    0.1218333901  0.9229691  0.725065  0.9504515
##   1.0    0.2814509401  0.8753034  0.000000  1.0000000
## 
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 0.4 and lambda
##  = 0.05273876.
predlr <- lrTrain$pred
predlr <- predlr[predlr[,"alpha"]==lrTrain$bestTune$alpha & predlr[,"lambda"]==lrTrain$bestTune$lambda, ]

In logistic regression notice how alpha = 1(lasso) performs best based on ROC. Let’s now train a support vector machine.

set.seed(112358)
svmControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats= repeats,
  classProbs = T,
  savePredictions = T
)

svmTrainLinear <- train(label~., data = subset(t, select=-c(Protection)),trControl = svmControl,method = "svmLinear", tuneLength = 10)
svmTrainRadial <- train(label~., data = subset(t, select=-c(Protection)),trControl = svmControl,method = "svmRadial", tuneLength = 10)

svmTrainLinear
## Support Vector Machines with Linear Kernel 
## 
## 167 samples
##  45 predictor
##   2 classes: 'High', 'Low' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1000 times) 
## Summary of sample sizes: 150, 150, 150, 150, 151, 150, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8656652  0.6560753
## 
## Tuning parameter 'C' was held constant at a value of 1
svmTrainRadial
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 167 samples
##  45 predictor
##   2 classes: 'High', 'Low' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1000 times) 
## Summary of sample sizes: 151, 150, 151, 150, 151, 150, ... 
## Resampling results across tuning parameters:
## 
##   C       Accuracy   Kappa    
##     0.25  0.8752351  0.6960810
##     0.50  0.8819904  0.6908564
##     1.00  0.8740590  0.6698671
##     2.00  0.8705411  0.6618988
##     4.00  0.8716944  0.6667776
##     8.00  0.8684993  0.6584494
##    16.00  0.8614388  0.6401486
##    32.00  0.8468164  0.6032123
##    64.00  0.8441723  0.5973946
##   128.00  0.8444904  0.5983386
## 
## Tuning parameter 'sigma' was held constant at a value of 0.01430736
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.01430736 and C = 0.5.
predSVMLinear <- svmTrainLinear$pred
predSVMRadial <- svmTrainRadial$pred

We see that a linear function does better than the radial basis function.

predSVM <- predSVMLinear

Let’s now train a K nearest neighbour model

## KNN
set.seed(112358)
knnControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats= repeats,
  savePredictions = T,
  classProbs= T
)
knnTrain <- train(label~., data = subset(t, select=-c(Protection)),trControl = knnControl,method = "knn", tuneLength = 10)
predKnn <- knnTrain$pred
predKnn <- predKnn[predKnn[,"k"]==knnTrain$bestTune$k,]

knnTrain
## k-Nearest Neighbors 
## 
## 167 samples
##  45 predictor
##   2 classes: 'High', 'Low' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1000 times) 
## Summary of sample sizes: 150, 150, 150, 150, 151, 150, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.8816973  0.6991489
##    7  0.8753678  0.6909522
##    9  0.8914540  0.7311594
##   11  0.8978546  0.7455154
##   13  0.8905556  0.7275586
##   15  0.8910849  0.7315575
##   17  0.8970209  0.7448255
##   19  0.8944119  0.7367887
##   21  0.8944554  0.7333482
##   23  0.8944807  0.7302065
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 11.

Let’s now compute the ROC curves for all four classifiers. The function getRocCV computes the ROC curves for each iteration(We used a 1000 iterations for each classifier). getAvgRoc plots the average of all 1000 ROC curves.

library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
getRocCV <- function(pred){
    temp.obs <- c()
    temp.pred <- c()
    for(i in seq(1, repeats)){
        temp <- paste("Rep", sprintf("%04d",i), sep="")
        temp.p <- pred[grepl(temp, pred[,"Resample"]), "Low"]
        temp.o <- pred[grepl(temp, pred[,"Resample"]), "obs"]
        temp.pred[[length(temp.pred)+1]] <- temp.p
        temp.obs[[length(temp.obs)+1]] <- temp.o
    }
    return(list(temp.pred, temp.obs));
}

getAvgRoc <- function(roc){
    m <- max(sapply(roc@x.values, length))
    resx <- sapply(roc@x.values, function(x){
        x <- c(x, rep(NA, m-length(x)));
    });
    resy <- sapply(roc@y.values, function(x){
        x <- c(x, rep(NA, m-length(x)));
    });
    roc.df <- data.frame(rowMeans(as.data.frame(resx), na.rm=T), rowMeans(as.data.frame(resy), na.rm=T))
    colnames(roc.df) <- c("meanx", "meany")
    return(roc.df)
}

predRf <- getRocCV(predRf)
p <- prediction(predRf[1][[1]], predRf[2][[1]])
rocRf <- performance(p, "tpr", "fpr")
rocRf.avg <- getAvgRoc(rocRf)

predlr <- getRocCV(predlr)
p <- prediction(predlr[1][[1]], predlr[2][[1]])
roclr <- performance(p, "tpr", "fpr")
roclr.avg <- getAvgRoc(roclr)

predSVM <- getRocCV(predSVM)
p <- prediction(predSVM[1][[1]], predSVM[2][[1]])
rocSvm <- performance(p, "tpr", "fpr")
rocSvm.avg <- getAvgRoc(rocSvm)

predKnn <- getRocCV(predKnn)
p <- prediction(predKnn[1][[1]], predKnn[2][[1]])
rocKnn <- performance(p, "tpr", "fpr")
rocKnn.avg <- getAvgRoc(rocKnn)

Let’s compute the average accuracy and AUC across all 1000 iterations for each classifier and plot the ROC curves.

l <- c("Random Forest", "Logistic Regression", "SVM", "KNN")
## Get AUC
r <- list(predRf, predlr, predSVM, predKnn)
auc <- c()
acc <- c()
for(i in seq(1, length(r))){
    p <- prediction(r[i][[1]][[1]], r[i][[1]][[2]])
    a <- performance(p, "auc")
    a <- mean(unlist(a@y.values))
    auc <- c(auc,a)
    a <- performance(p, "acc")
    a <- mean(unlist(a@y.values))
    acc <- c(acc,a)
}

acc
## [1] 0.6998238 0.6872538 0.6668959 0.7694027
auc
## [1] 0.9501534 0.9598734 0.9098766 0.9176502
r <- list(rocRf, roclr, rocSvm, rocKnn)
r.avg <- list(rocRf.avg, roclr.avg, rocSvm.avg, rocKnn.avg)
rocdf <- as.data.frame(matrix(ncol = 4, nrow = 0))
rocdf.avg <- as.data.frame(matrix(ncol = 3, nrow = 0))
for(i in seq(1, length(r))){
    r.t <- r[i]
    for(j in seq(1, repeats)){
        temp <- data.frame(r.t[[1]]@x.values[[j]], r.t[[1]]@y.values[[j]], rep(l[i], length(r.t[[1]]@y.values[[j]])), rep(paste(l[i], j, sep="."), length(r.t[[1]]@y.values[[j]])))
        colnames(temp) <- c("x", "y", "classifier", "rep")
        rocdf <- rbind(rocdf, temp)
    }
    r.t <- r.avg[[i]]
    temp <- data.frame(r.t$meanx, r.t$meany, rep(paste(l[i], "Average", sep=" "), length(r.t$meanx)))
    colnames(temp) <- c("meanx", "meany", "classifieravg")
    rocdf.avg <- rbind(rocdf.avg, temp)
}
colnames(rocdf) <- c("x", "y", "classifier", "rep")

library(RColorBrewer)

classifierColor <- brewer.pal(9,"Dark2")
## Warning in brewer.pal(9, "Dark2"): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors
cc <- c()
ll <- c()
for(i in seq(1, length(l))){
    cc <- c(cc, rep(classifierColor[i], 2))
    ll <- c(ll, l[i], paste(l[i], "Average", sep=" "))
}
names(cc) <- ll

pp <- ggplot(rocdf[rocdf[,"classifier"]=="Random Forest",]) + geom_line(aes(x = meanx, y = meany), data=rocdf.avg[rocdf.avg[,"classifieravg"]=="Random Forest Average",], alpha=1, size=0.5) + geom_abline(color="#707070") + geom_boxplot(alpha=0.05, aes(x=x, y=y,group = cut_width(x, 0.02))) + scale_color_manual(values=cc) +xlab("FPR") +ylab("TPR") + ggtitle(paste("ROC Curves for Random Forest model with 10-fold CV repeated", repeats, "times"))
pp

pp <- ggplot(rocdf[rocdf[,"classifier"]=="Logistic Regression",]) + geom_line(aes(x = meanx, y = meany), data=rocdf.avg[rocdf.avg[,"classifieravg"]=="Logistic Regression Average",], alpha=1, size=0.5) + geom_abline(color="#707070") + geom_boxplot(alpha=0.05, aes(x=x, y=y,group = cut_width(x, 0.02))) + scale_color_manual(values=cc) +xlab("FPR") +ylab("TPR") + ggtitle(paste("ROC Curves for Logistic Regression model with 10-fold CV repeated", repeats, "times"))
pp

pp <- ggplot(rocdf[rocdf[,"classifier"]=="SVM",]) + geom_line(aes(x = meanx, y = meany), data=rocdf.avg[rocdf.avg[,"classifieravg"]=="SVM Average",], alpha=1, size=0.5) + geom_abline(color="#707070") + geom_boxplot(alpha=0.05, aes(x=x, y=y,group = cut_width(x, 0.02))) + scale_color_manual(values=cc) +xlab("FPR") +ylab("TPR") + ggtitle(paste("ROC Curves for SVM with 10-fold CV repeated", repeats, "times"))
pp

pp <- ggplot(rocdf[rocdf[,"classifier"]=="KNN",]) + geom_line(aes(x = meanx, y = meany), data=rocdf.avg[rocdf.avg[,"classifieravg"]=="KNN Average",], alpha=1, size=0.5) + geom_abline(color="#707070") + geom_boxplot(alpha=0.05, aes(x=x, y=y,group = cut_width(x, 0.02))) + scale_color_manual(values=cc) +xlab("FPR") +ylab("TPR") + ggtitle(paste("ROC Curves for KNN model with 10-fold CV repeated", repeats, "times"))
pp

pp <- ggplot() + geom_line(aes(meanx, meany, color=classifieravg), data=rocdf.avg, alpha=1, size=0.5) + geom_abline(color="#707070") + scale_color_manual(values=cc) +xlab("FPR") +ylab("TPR") + ggtitle(paste("Average ROC Curves for 10-fold CV repeated", repeats, "times"))
for(i in seq(1, length(auc))){
    pp <- pp + annotate("text", label = paste("AUC:", round(auc[i], 3), "ACC:", round(acc[i], 3), sep=" "), x = 1, y = 0.25-(i*0.05), size = 4, colour = cc[l[i]][[1]], hjust=1)
}
pp <- pp + theme_bw()+ theme(text = element_text(size=14), axis.text = element_text(size=14), panel.border = element_blank(), panel.grid.major.x = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + ggtitle("ROC curves across 1000 iterations")
pp

pp <- ggplot() + geom_line(aes(meanx, meany, color=classifieravg), data=rocdf.avg, alpha=1, size=0.5) + geom_abline(color="#707070") + scale_color_manual(values=cc) +xlab("FPR") +ylab("TPR") + ggtitle(paste("Average ROC Curves for 10-fold CV repeated", repeats, "times"))
## for(i in seq(1, length(auc))){
##     print(i)
##     pp <- pp + annotate("text", label = paste(l[i], "AUC:", round(auc[i], 3), "ACC:", round(acc[i], 3), sep=" "), x = 1, y = 0.25-(i*0.05), size = 4, colour = cc[l[i]][[1]], hjust=1)
## }
pp <- pp + theme_bw()+ theme(text = element_text(size=14), axis.text = element_text(size=14), panel.border = element_blank(), panel.grid.major.x = element_blank(), panel.grid.major.y = element_line( size=.3, color="#f5f5f5"), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + ggtitle("ROC curves across 1000 iterations")

Let’s take a look at the features selected by the logistic regression

bestcoef <- coef(lrTrain$finalModel, s = lrTrain$bestTune$lambda)
bestcoef <- as.data.frame(as.matrix(bestcoef))
temp <- bestcoef[bestcoef[,"1"]!=0 & rownames(bestcoef)!="(Intercept)",]
temp.name <- rownames(bestcoef)[bestcoef[,"1"]!=0 & rownames(bestcoef)!="(Intercept)"]
temp <- as.data.frame(temp)
colnames(temp) <- c("val")
temp$name <- temp.name
temp[,"val"] <- temp["val"]*-1
col <- as.vector(unlist(lapply(temp[,"val"], function(x){if(x>0) "Positive" else "Negative"})))
ggplot(temp, aes(y=val, x=reorder(name, abs(val)))) + geom_bar(stat="identity", aes(fill=col)) + coord_flip()+xlab("Feature") +ylab("Coeffecient") + scale_fill_manual(guide=FALSE, values=c("Red", "Blue")) + theme(panel.background = element_rect(fill = '#FFFFFF', colour = '#000000'), panel.grid.major.x = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor = element_blank())

Let’s now take a look at the antibodies for which the LR model failed. Let’s plot true protection vs the probability with which the LR model decided to classify as High. We shall also annotate the false positives(bottom right corner) and false negatives(top left) with ids.

lrProb <- predict(lrTrain, q, type="prob")
lrProb[,"predicted"] <- predict(lrTrain, q)
lrProb[,"truth"] <- q[,"label"]
lrProb[,"Protection"] <- df[,"Protection"]
lrProb[,"id"] <- rownames(df)
lrProb[lrProb$predicted==lrProb$truth,"id"] <- ""
ggplot(lrProb, aes(x=High, y=Protection, color=predicted, shape=truth, group=interaction(predicted, truth))) + geom_point(size = 6) + xlab("Predicted Probability of High Protection") + ylab("True Protection") + geom_hline(yintercept=0.6, linetype="dashed", alpha=0.6) + geom_vline(xintercept=0.5, linetype="dashed", alpha=0.6) + geom_text(aes(label=id), color="black", alpha=1, nudge_y = 0.05)