Remove mAb 135 because it is missing Neut_micro.
df <- read.csv("./data/master_log.csv", na.strings=c("?", "nd"), strip.white=TRUE, row.names=c("Ab"))
df <- df[!rownames(df) %in% c("135"),]
Convert Epitope Class to a factor.
df$Epitope_Class <- factor(df$Epitope_Class)
Drop columns with missing values.
df <- subset(df, select=-c(Protect_binary, Epitope_Class_ELISA, Endotoxin, Endotoxin.1, mW_Loss, aTTD, Epitope.Class..assigning.method.)) #Remove columns with empty values.
df[,"Escape..code"] <- as.factor(df[,"Escape..code"])
df[,"Makona.binding"] <- as.factor(df[,"Makona.binding"])
Make unNeutFrac, Neut_dVP30 and Neut_VSV correlate positively with Protection.
df[,"unNeutFrac"] <- 1- df[,"unNeutFrac"]
df[,"Neut_dVP30"] <- 1- df[,"Neut_dVP30"]
df[,"Neut_VSV"] <- (100 - df[,"Neut_VSV"])/100
Separate Human IgG1 and Mouse IgG1
levels(df[,"Isotype"]) <- c(levels(df[,"Isotype"]), "HumanIgG1")
df[df[,"Isotype"]=="IgG1" & df[,"Species"]=="human", "Isotype"] <- "HumanIgG1"
levels(df[,"Isotype"]) <- c(levels(df[,"Isotype"]), "MouseIgG1")
df[df[,"Isotype"]=="IgG1" & df[,"Species"]=="mouse", "Isotype"] <- "MouseIgG1"
df[,"Isotype"] <- droplevels(df[,"Isotype"])
Convert polyfunctionality to numeric type.
df[,"Polyfunctionality"] <- as.numeric(df[,"Polyfunctionality"])
Add Epitope Tier columns
df[df$Epitope_Class %in% c("Cap", "GP1/Head", "Mucin"), "Epitope_Tier"] <- "Tier1"
df[df$Epitope_Class %in% c("Base", "GP1/Core", "Fusion"), "Epitope_Tier"] <- "Tier2"
df[df$Epitope_Class %in% c("GP1/2", "HR2"), "Epitope_Tier"] <- "Tier3"
df[df$Epitope_Class %in% c("Unknown"), "Epitope_Tier"] <- "TierUnknown"
Dropidentical features and features with missing experimental data.
t <- subset(df, select=-c(Round, Makona.binding, Escape..code, Epitope_Tier, Total_SA,Cross.reactivity))
Remove features with near zero variance. Features are centered and scaled.
library(caret)
library(ggplot2)
s <- dummyVars(~., data <- t, levelsOnly=TRUE)
q <- predict(s, t)
h <- nearZeroVar(q, saveMetrics=TRUE)
t.var <- apply(q, 2, var)
t.var <- as.data.frame(t.var)
t.var[,"Var"] <- rownames(t.var)
t.var[,"nzv"] <- h$nzv
ggplot(t.var, aes(x=reorder(Var, t.var), y=t.var)) + geom_bar(stat="identity", aes(fill=nzv)) + scale_y_continuous(trans='log10') + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + scale_fill_manual(values = c("steelblue", "indianred"),name= "Near Zero Variance") + ggtitle("Features with near zero variance that will be removed.") +xlab("Feature") + ylab("log10(Variance)")
q <- q[,(colnames(q) %in% rownames(h[h[,"nzv"]==FALSE | h[,"zeroVar"]==TRUE,]))]
q <- subset(q, select =-c(Unknown, sGP.binderFALSE))
norm <- preProcess(q, method=c("center", "scale"))
q <- predict(norm, q)
q <- as.data.frame(q)
head(q)
## Protection Base Cap GP1/2 GP1/Core Mucin
## 1 0.4289280 -0.301588 1.6930603 -0.301588 -0.4854046 -0.3677495
## 2 -1.0652518 -0.301588 -0.5871096 -0.301588 -0.4854046 2.7029591
## 3 -0.7664158 -0.301588 1.6930603 -0.301588 -0.4854046 -0.3677495
## 4 -0.4675799 -0.301588 -0.5871096 -0.301588 -0.4854046 2.7029591
## 5 -0.1687439 -0.301588 1.6930603 -0.301588 -0.4854046 -0.3677495
## 6 0.1300921 -0.301588 -0.5871096 -0.301588 -0.4854046 2.7029591
## Neut_VSV unNeutFrac Neut_micro Neut_dVP30 GP_EC50 sGP_EC50
## 1 -0.7135912 -0.7075685 -1.0074874 -0.6029719 -0.3779862 -1.0684007
## 2 -0.7135912 -0.7075685 -1.0074874 -0.6029719 -0.3799966 0.9542301
## 3 -0.7135912 -0.7075685 -0.5342092 -0.6029719 0.3680830 -1.0247257
## 4 -0.7135912 -0.7075685 -1.0074874 -0.6029719 -0.3809804 0.9542301
## 5 1.4151100 0.8370870 0.4596750 -0.6029719 -0.1946556 -1.0545575
## 6 -0.7135912 -0.7075685 -1.0074874 -0.6029719 -0.3654533 0.9542301
## human mouse IgG2a HumanIgG1 MouseIgG1 Polyfunctionality
## 1 0.8059481 -0.8059481 -0.5318939 0.8160809 -0.3677495 1.5775151
## 2 0.8059481 -0.8059481 -0.5318939 0.8160809 -0.3677495 1.5775151
## 3 -1.2333449 1.2333449 1.8688163 -1.2180312 -0.3677495 -0.5730565
## 4 0.8059481 -0.8059481 -0.5318939 0.8160809 -0.3677495 1.5775151
## 5 -1.2333449 1.2333449 1.8688163 -1.2180312 -0.3677495 -0.5730565
## 6 -1.2333449 1.2333449 1.8688163 -1.2180312 -0.3677495 -0.0354136
## huADCP mADCP CD107a IFNg MIP.1b huADNP
## 1 0.9420515 1.4928146 1.4855860 1.4721098 1.6246636 4.3450612
## 2 -0.6288721 1.0345560 2.0526654 1.9820216 2.5693466 3.3395864
## 3 0.4718370 -0.2366025 -0.4504150 -0.4327751 -0.5714592 -0.6443498
## 4 0.6436291 0.8137532 2.4809491 2.1313530 3.1345643 2.4476388
## 5 1.7027185 -0.2927284 -0.4067935 -0.4021804 -0.5029962 -0.5420307
## 6 -0.1046406 0.5491598 -0.2783084 -0.4262191 -0.3862376 0.2830299
## mADNP sGP_huADCP sGP_huADNP Total_G0 Total_G1 Total_G2
## 1 -0.12824504 1.0847236 1.0050389 2.337267 -2.5251621 -0.8281199
## 2 -0.61622083 -0.3945998 0.4790355 1.619398 -1.6106900 -0.8294177
## 3 -0.28376405 1.0175055 -0.3613869 0.701013 -0.5869325 -0.5607759
## 4 -0.94029636 -0.3672924 -0.1666382 2.546092 -2.6400880 -1.1045484
## 5 -0.87790251 0.3006875 -0.1921194 1.325271 -1.5218515 -0.3038141
## 6 -0.04536366 -1.1169318 -0.4437456 1.033914 -1.0565072 -0.4777175
## Total_Fucose Total_Bisecting Total_mono_SA G0 G0F
## 1 -5.3719604 -0.1683970 -0.1427142 5.5195898 -2.1311307
## 2 -2.4561018 -0.7369070 0.3107384 2.5342760 -0.3934907
## 3 0.3274304 -0.9737862 0.6070378 -0.2220388 0.9525519
## 4 -5.3980247 -0.4810775 -0.4144584 5.5077929 -1.9230581
## 5 0.2532474 -0.3863258 1.2618430 -0.2539599 1.6387239
## 6 0.3341135 -0.9927366 0.6905255 -0.2504902 1.3295374
## G0FB G1 G1. G1F G1F. G1S1F
## 1 -1.1673856 -0.7037159 0.22572059 -1.797929 -2.00926166 0.43313807
## 2 -0.7431267 -0.2825927 -0.25374768 -1.285002 -1.26239409 0.65351661
## 3 -0.6017070 -0.4184389 -0.10914614 -1.299307 -0.07266768 -0.02840944
## 4 -0.6788450 -0.6357928 0.26377363 -1.771363 -1.99464586 -0.35274011
## 5 -0.6017070 -0.2554234 0.08111905 -1.509791 -1.15058320 0.85726280
## 6 -0.6402760 -0.3641004 -0.37551740 -1.381048 -0.48848729 -0.09909689
## G1FB G2F G2FB G2S1 G2S1F G2S1B
## 1 -0.4367530 -0.6487676 -0.329686262 0.1525347 -0.70223990 1.64704157
## 2 -0.3973272 -0.9934518 -0.351766713 0.4920949 -0.01897163 -0.07530043
## 3 -0.5944562 -1.2077149 -0.351766713 -0.5077212 1.13312379 -0.96759809
## 4 -0.6601659 -1.0291623 -0.219284004 0.7373328 -0.64968080 0.69249058
## 5 -0.5681724 -1.2387675 -0.009519716 -0.3945344 1.29921054 1.14901497
## 6 -0.6995917 -1.2092675 -0.384887390 -0.2750596 1.17937580 -0.42806927
## G2S1FB sGP.binderTRUE
## 1 0.5254709 1.0522613
## 2 -0.4418074 -0.9446437
## 3 -0.4686763 1.0522613
## 4 -0.1328157 -0.9446437
## 5 -0.6164549 1.0522613
## 6 -0.6836270 -0.9446437
Let’s build a classifier to classify mAbs into “high” and “low” protection. We set the threshold between “high” and “low” as 0.6.
threshold <- 0.6
t[t[,"Protection"]<threshold,"label"] <- "Low"
t[(t[,"Protection"]>=threshold),"label"] <- "High"
q[,"label"] <- t[,"label"]
t <- q
Before we start training our models. Let’s enable R to use multiple cores.
library(doMC)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
registerDoMC(cores = 16)
Let’s first train a Random Forest Model and get the predictions.
repeats <- 1000
set.seed(112358)
rfControl <- trainControl(
method = "repeatedcv",
number = 10,
repeats= repeats,
verboseIter = FALSE,
returnData = FALSE,
allowParallel = TRUE,
summaryFunction = twoClassSummary,
classProbs = TRUE,
savePredictions = T,
search="grid"
)
rfTrain <- train(
x = as.matrix(subset(t, select=-c(label, Protection))),
y = as.character(t[,"label"]),
trControl = rfControl,
method = "rf",
ntree = 1000,
tuneLength = 10
)
## Warning in train.default(x = as.matrix(subset(t, select = -c(label,
## Protection))), : The metric "Accuracy" was not in the result set. ROC will
## be used instead.
rfTrain
## Random Forest
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1000 times)
## Summary of sample sizes: 150, 150, 150, 150, 151, 150, ...
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.9525084 0.728150 0.9604356
## 7 0.9522726 0.806900 0.9484303
## 12 0.9487039 0.817065 0.9479485
## 17 0.9449837 0.813575 0.9463879
## 22 0.9416123 0.812030 0.9432561
## 27 0.9385880 0.809420 0.9404083
## 32 0.9362907 0.807640 0.9352356
## 37 0.9345193 0.807210 0.9305008
## 42 0.9332669 0.807080 0.9279485
## 48 0.9316669 0.805435 0.9250939
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
rfTrain$finalModel
##
## Call:
## randomForest(x = x, y = y, ntree = 1000, mtry = param$mtry)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 8.98%
## Confusion matrix:
## High Low class.error
## High 36 12 0.25000000
## Low 3 116 0.02521008
predRf <- rfTrain$pred
predRf <- predRf[predRf[,"mtry"]==rfTrain$bestTune$mtry,]
Let’s plot feature importance for the Random Forest model.
impRf <- varImp(rfTrain$finalModel, scaled=FALSE)
impRf[,"Variable"] <- rownames(impRf)
impRf <- impRf[with(impRf, order(-Overall)),]
impRf[,"Variable"] <- factor(impRf[,"Variable"], levels = impRf[,"Variable"])
pp <- ggplot(impRf, aes(Variable, Overall)) + geom_bar(stat="identity", fill="#000000")
pp + theme_bw()+ theme(text = element_text(size=14), axis.text = element_text(size=14), panel.border = element_blank(), panel.grid.major.x = element_blank(), panel.grid.major.y = element_line( size=.3, color="#f5f5f5"), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), axis.text.x = element_text(angle = 90, hjust = 1)) + xlab("Feature") + ylab("Mean Decrease in Gini Index")
Let’s now train a logistic regression model with elastic net regularization.
require(methods)
## Loading required package: methods
lrControl <- trainControl(
method = "repeatedcv",
number = 10,
repeats= repeats,
verboseIter = FALSE,
returnData = FALSE,
allowParallel = TRUE,
summaryFunction = twoClassSummary,
classProbs = TRUE,
savePredictions = T
)
set.seed(112358)
lrTrain <- train(
x = as.matrix(subset(t, select=-c(label, Protection))),
y = as.character(t[,"label"]),
trControl = lrControl,
method = "glmnet",
tuneLength = 10
)
## Warning in train.default(x = as.matrix(subset(t, select = -c(label,
## Protection))), : The metric "Accuracy" was not in the result set. ROC will
## be used instead.
lrTrain
## glmnet
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1000 times)
## Summary of sample sizes: 150, 150, 150, 150, 151, 150, ...
## Resampling results across tuning parameters:
##
## alpha lambda ROC Sens Spec
## 0.1 0.0001439009 0.9022667 0.773270 0.8887076
## 0.1 0.0003324298 0.9028229 0.773545 0.8894235
## 0.1 0.0007679560 0.9124555 0.777705 0.8980848
## 0.1 0.0017740779 0.9237042 0.781010 0.9040197
## 0.1 0.0040983502 0.9353654 0.786125 0.9085947
## 0.1 0.0094677206 0.9450206 0.783035 0.9123780
## 0.1 0.0218716632 0.9508816 0.771535 0.9232629
## 0.1 0.0505263800 0.9543302 0.763675 0.9457023
## 0.1 0.1167224941 0.9546458 0.744200 0.9539394
## 0.1 0.2696441069 0.9539600 0.735500 0.9538417
## 0.2 0.0001439009 0.8960588 0.770740 0.8818212
## 0.2 0.0003324298 0.9021359 0.773695 0.8887205
## 0.2 0.0007679560 0.9119925 0.778115 0.8977833
## 0.2 0.0017740779 0.9238889 0.782355 0.9043750
## 0.2 0.0040983502 0.9362814 0.788130 0.9095091
## 0.2 0.0094677206 0.9461351 0.783980 0.9132114
## 0.2 0.0218716632 0.9515568 0.773055 0.9240023
## 0.2 0.0505263800 0.9552831 0.761840 0.9467803
## 0.2 0.1167224941 0.9566325 0.743340 0.9509720
## 0.2 0.2696441069 0.9506997 0.711135 0.9587962
## 0.3 0.0001439009 0.8948953 0.771030 0.8806598
## 0.3 0.0003324298 0.9014411 0.773985 0.8879409
## 0.3 0.0007679560 0.9113319 0.778485 0.8974909
## 0.3 0.0017740779 0.9238642 0.783500 0.9044727
## 0.3 0.0040983502 0.9370390 0.789580 0.9101000
## 0.3 0.0094677206 0.9470046 0.784790 0.9141023
## 0.3 0.0218716632 0.9526017 0.777685 0.9253705
## 0.3 0.0505263800 0.9577725 0.748915 0.9466773
## 0.3 0.1167224941 0.9571508 0.739710 0.9523303
## 0.3 0.2696441069 0.9470105 0.714255 0.9645409
## 0.4 0.0001439009 0.8942170 0.771190 0.8801091
## 0.4 0.0003324298 0.9007486 0.774145 0.8868932
## 0.4 0.0007679560 0.9105733 0.778610 0.8967242
## 0.4 0.0017740779 0.9236388 0.784035 0.9043811
## 0.4 0.0040983502 0.9376111 0.790515 0.9104030
## 0.4 0.0094677206 0.9475278 0.786135 0.9152083
## 0.4 0.0218716632 0.9544834 0.784175 0.9278083
## 0.4 0.0505263800 0.9597870 0.739580 0.9465652
## 0.4 0.1167224941 0.9542294 0.743350 0.9491280
## 0.4 0.2696441069 0.9356844 0.726895 0.9601697
## 0.5 0.0001439009 0.8934652 0.771235 0.8794121
## 0.5 0.0003324298 0.8999180 0.774325 0.8857538
## 0.5 0.0007679560 0.9097052 0.778815 0.8959167
## 0.5 0.0017740779 0.9234156 0.785015 0.9042955
## 0.5 0.0040983502 0.9377031 0.791075 0.9109258
## 0.5 0.0094677206 0.9481332 0.788420 0.9169970
## 0.5 0.0218716632 0.9561970 0.788005 0.9307538
## 0.5 0.0505263800 0.9604094 0.737255 0.9462311
## 0.5 0.1167224941 0.9509791 0.745440 0.9466432
## 0.5 0.2696441069 0.9210561 0.681955 0.9703205
## 0.6 0.0001439009 0.8925075 0.771080 0.8786742
## 0.6 0.0003324298 0.8988819 0.773995 0.8846485
## 0.6 0.0007679560 0.9087216 0.778215 0.8948856
## 0.6 0.0017740779 0.9229032 0.786100 0.9041545
## 0.6 0.0040983502 0.9374145 0.790790 0.9107750
## 0.6 0.0094677206 0.9487077 0.791910 0.9187144
## 0.6 0.0218716632 0.9575510 0.787665 0.9338318
## 0.6 0.0505263800 0.9602879 0.739295 0.9451212
## 0.6 0.1167224941 0.9485951 0.749475 0.9468174
## 0.6 0.2696441069 0.9199951 0.352010 0.9771939
## 0.7 0.0001439009 0.8913529 0.771355 0.8778864
## 0.7 0.0003324298 0.8976728 0.774165 0.8833121
## 0.7 0.0007679560 0.9073753 0.778175 0.8939227
## 0.7 0.0017740779 0.9221378 0.786650 0.9040871
## 0.7 0.0040983502 0.9368032 0.790830 0.9110856
## 0.7 0.0094677206 0.9493961 0.794605 0.9198061
## 0.7 0.0218716632 0.9586158 0.787065 0.9363432
## 0.7 0.0505263800 0.9592765 0.744465 0.9427826
## 0.7 0.1167224941 0.9437099 0.749260 0.9443235
## 0.7 0.2696441069 0.9194806 0.011830 0.9978621
## 0.8 0.0001439009 0.8901186 0.772415 0.8769636
## 0.8 0.0003324298 0.8961788 0.775050 0.8821182
## 0.8 0.0007679560 0.9059348 0.778705 0.8919606
## 0.8 0.0017740779 0.9207007 0.787120 0.9035015
## 0.8 0.0040983502 0.9359925 0.790735 0.9112545
## 0.8 0.0094677206 0.9500041 0.796835 0.9209712
## 0.8 0.0218716632 0.9594174 0.787885 0.9380242
## 0.8 0.0505263800 0.9578768 0.751910 0.9414924
## 0.8 0.1167224941 0.9374282 0.748690 0.9430750
## 0.8 0.2696441069 0.9177588 0.000040 0.9999833
## 0.9 0.0001439009 0.8883450 0.773995 0.8759061
## 0.9 0.0003324298 0.8938113 0.776450 0.8801152
## 0.9 0.0007679560 0.9037259 0.779200 0.8898545
## 0.9 0.0017740779 0.9184661 0.786955 0.9030311
## 0.9 0.0040983502 0.9348438 0.791265 0.9112333
## 0.9 0.0094677206 0.9505600 0.799165 0.9218083
## 0.9 0.0218716632 0.9600598 0.789330 0.9378977
## 0.9 0.0505263800 0.9563553 0.757495 0.9403576
## 0.9 0.1167224941 0.9302110 0.748335 0.9421068
## 0.9 0.2696441069 0.9158173 0.000000 1.0000000
## 1.0 0.0001439009 0.8862564 0.776300 0.8733462
## 1.0 0.0003324298 0.8917341 0.778115 0.8775591
## 1.0 0.0007679560 0.9012294 0.780095 0.8869500
## 1.0 0.0017740779 0.9166701 0.787225 0.9014591
## 1.0 0.0040983502 0.9350004 0.791665 0.9106955
## 1.0 0.0094677206 0.9515805 0.802095 0.9226273
## 1.0 0.0218716632 0.9606717 0.794515 0.9364333
## 1.0 0.0505263800 0.9548985 0.764665 0.9383273
## 1.0 0.1167224941 0.9223452 0.748605 0.9418333
## 1.0 0.2696441069 0.9053560 0.000000 1.0000000
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 1 and lambda = 0.02187166.
predlr <- lrTrain$pred
predlr <- predlr[predlr[,"alpha"]==lrTrain$bestTune$alpha & predlr[,"lambda"]==lrTrain$bestTune$lambda, ]
In logistic regression notice how alpha = 1(lasso) performs best based on ROC. Let’s now train a support vector machine.
set.seed(112358)
svmControl <- trainControl(
method = "repeatedcv",
number = 10,
repeats= repeats,
classProbs = T,
savePredictions = T
)
svmTrainLinear <- train(label~., data = subset(t, select=-c(Protection)),trControl = svmControl,method = "svmLinear", tuneLength = 10)
svmTrainRadial <- train(label~., data = subset(t, select=-c(Protection)),trControl = svmControl,method = "svmRadial", tuneLength = 10)
svmTrainLinear
## Support Vector Machines with Linear Kernel
##
## 167 samples
## 48 predictor
## 2 classes: 'High', 'Low'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1000 times)
## Summary of sample sizes: 150, 150, 150, 150, 151, 150, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8623524 0.6488982
##
## Tuning parameter 'C' was held constant at a value of 1
svmTrainRadial
## Support Vector Machines with Radial Basis Function Kernel
##
## 167 samples
## 48 predictor
## 2 classes: 'High', 'Low'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1000 times)
## Summary of sample sizes: 151, 150, 151, 150, 151, 150, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.8729162 0.6904658
## 0.50 0.8719364 0.6647638
## 1.00 0.8681088 0.6531761
## 2.00 0.8660776 0.6479920
## 4.00 0.8705183 0.6633729
## 8.00 0.8660380 0.6517849
## 16.00 0.8613192 0.6385256
## 32.00 0.8552618 0.6254806
## 64.00 0.8532770 0.6207187
## 128.00 0.8529305 0.6197895
##
## Tuning parameter 'sigma' was held constant at a value of 0.01357846
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.01357846 and C = 0.25.
predSVMLinear <- svmTrainLinear$pred
predSVMRadial <- svmTrainRadial$pred
We see that a linear function does better than the radial basis function.
predSVM <- predSVMLinear
Let’s now train a K nearest neighbour model
## KNN
set.seed(112358)
knnControl <- trainControl(
method = "repeatedcv",
number = 10,
repeats= repeats,
savePredictions = T,
classProbs= T
)
knnTrain <- train(label~., data = subset(t, select=-c(Protection)),trControl = knnControl,method = "knn", tuneLength = 10)
predKnn <- knnTrain$pred
predKnn <- predKnn[predKnn[,"k"]==knnTrain$bestTune$k,]
knnTrain
## k-Nearest Neighbors
##
## 167 samples
## 48 predictor
## 2 classes: 'High', 'Low'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1000 times)
## Summary of sample sizes: 150, 150, 150, 150, 151, 150, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.8850371 0.7125980
## 7 0.8829172 0.7067125
## 9 0.8776160 0.6920031
## 11 0.8711967 0.6763921
## 13 0.8747048 0.6892204
## 15 0.8806433 0.7059315
## 17 0.8817743 0.7084903
## 19 0.8738948 0.6884186
## 21 0.8781741 0.6983748
## 23 0.8842415 0.7097732
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
Let’s now compute the ROC curves for all four classifiers. The function getRocCV computes the ROC curves for each iteration(We used a 1000 iterations for each classifier). getAvgRoc plots the average of all 1000 ROC curves.
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
getRocCV <- function(pred){
temp.obs <- c()
temp.pred <- c()
for(i in seq(1, repeats)){
temp <- paste("Rep", sprintf("%04d",i), sep="")
temp.p <- pred[grepl(temp, pred[,"Resample"]), "Low"]
temp.o <- pred[grepl(temp, pred[,"Resample"]), "obs"]
temp.pred[[length(temp.pred)+1]] <- temp.p
temp.obs[[length(temp.obs)+1]] <- temp.o
}
return(list(temp.pred, temp.obs));
}
getAvgRoc <- function(roc){
m <- max(sapply(roc@x.values, length))
resx <- sapply(roc@x.values, function(x){
x <- c(x, rep(NA, m-length(x)));
});
resy <- sapply(roc@y.values, function(x){
x <- c(x, rep(NA, m-length(x)));
});
roc.df <- data.frame(rowMeans(as.data.frame(resx), na.rm=T), rowMeans(as.data.frame(resy), na.rm=T))
colnames(roc.df) <- c("meanx", "meany")
return(roc.df)
}
predRf <- getRocCV(predRf)
p <- prediction(predRf[1][[1]], predRf[2][[1]])
rocRf <- performance(p, "tpr", "fpr")
rocRf.avg <- getAvgRoc(rocRf)
predlr <- getRocCV(predlr)
p <- prediction(predlr[1][[1]], predlr[2][[1]])
roclr <- performance(p, "tpr", "fpr")
roclr.avg <- getAvgRoc(roclr)
predSVM <- getRocCV(predSVM)
p <- prediction(predSVM[1][[1]], predSVM[2][[1]])
rocSvm <- performance(p, "tpr", "fpr")
rocSvm.avg <- getAvgRoc(rocSvm)
predKnn <- getRocCV(predKnn)
p <- prediction(predKnn[1][[1]], predKnn[2][[1]])
rocKnn <- performance(p, "tpr", "fpr")
rocKnn.avg <- getAvgRoc(rocKnn)
Let’s compute the average accuracy and AUC across all 1000 iterations for each classifier and plot the ROC curves.
l <- c("Random Forest", "Logistic Regression", "SVM", "KNN")
## Get AUC
r <- list(predRf, predlr, predSVM, predKnn)
auc <- c()
acc <- c()
for(i in seq(1, length(r))){
p <- prediction(r[i][[1]][[1]], r[i][[1]][[2]])
a <- performance(p, "auc")
a <- mean(unlist(a@y.values))
auc <- c(auc,a)
a <- performance(p, "acc")
a <- mean(unlist(a@y.values))
acc <- c(acc,a)
}
acc
## [1] 0.6985934 0.6864788 0.6614572 0.7449007
auc
## [1] 0.9493283 0.9579699 0.8965198 0.9211666
r <- list(rocRf, roclr, rocSvm, rocKnn)
r.avg <- list(rocRf.avg, roclr.avg, rocSvm.avg, rocKnn.avg)
rocdf <- as.data.frame(matrix(ncol = 4, nrow = 0))
rocdf.avg <- as.data.frame(matrix(ncol = 3, nrow = 0))
for(i in seq(1, length(r))){
r.t <- r[i]
for(j in seq(1, repeats)){
temp <- data.frame(r.t[[1]]@x.values[[j]], r.t[[1]]@y.values[[j]], rep(l[i], length(r.t[[1]]@y.values[[j]])), rep(paste(l[i], j, sep="."), length(r.t[[1]]@y.values[[j]])))
colnames(temp) <- c("x", "y", "classifier", "rep")
rocdf <- rbind(rocdf, temp)
}
r.t <- r.avg[[i]]
temp <- data.frame(r.t$meanx, r.t$meany, rep(paste(l[i], "Average", sep=" "), length(r.t$meanx)))
colnames(temp) <- c("meanx", "meany", "classifieravg")
rocdf.avg <- rbind(rocdf.avg, temp)
}
colnames(rocdf) <- c("x", "y", "classifier", "rep")
library(RColorBrewer)
classifierColor <- brewer.pal(9,"Dark2")
## Warning in brewer.pal(9, "Dark2"): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors
cc <- c()
ll <- c()
for(i in seq(1, length(l))){
cc <- c(cc, rep(classifierColor[i], 2))
ll <- c(ll, l[i], paste(l[i], "Average", sep=" "))
}
names(cc) <- ll
pp <- ggplot(rocdf[rocdf[,"classifier"]=="Random Forest",]) + geom_line(aes(x = meanx, y = meany), data=rocdf.avg[rocdf.avg[,"classifieravg"]=="Random Forest Average",], alpha=1, size=0.5) + geom_abline(color="#707070") + geom_boxplot(alpha=0.05, aes(x=x, y=y,group = cut_width(x, 0.02))) + scale_color_manual(values=cc) +xlab("FPR") +ylab("TPR") + ggtitle(paste("ROC Curves for Random Forest model with 10-fold CV repeated", repeats, "times"))
pp
pp <- ggplot(rocdf[rocdf[,"classifier"]=="Logistic Regression",]) + geom_line(aes(x = meanx, y = meany), data=rocdf.avg[rocdf.avg[,"classifieravg"]=="Logistic Regression Average",], alpha=1, size=0.5) + geom_abline(color="#707070") + geom_boxplot(alpha=0.05, aes(x=x, y=y,group = cut_width(x, 0.02))) + scale_color_manual(values=cc) +xlab("FPR") +ylab("TPR") + ggtitle(paste("ROC Curves for Logistic Regression model with 10-fold CV repeated", repeats, "times"))
pp
pp <- ggplot(rocdf[rocdf[,"classifier"]=="SVM",]) + geom_line(aes(x = meanx, y = meany), data=rocdf.avg[rocdf.avg[,"classifieravg"]=="SVM Average",], alpha=1, size=0.5) + geom_abline(color="#707070") + geom_boxplot(alpha=0.05, aes(x=x, y=y,group = cut_width(x, 0.02))) + scale_color_manual(values=cc) +xlab("FPR") +ylab("TPR") + ggtitle(paste("ROC Curves for SVM with 10-fold CV repeated", repeats, "times"))
pp
pp <- ggplot(rocdf[rocdf[,"classifier"]=="KNN",]) + geom_line(aes(x = meanx, y = meany), data=rocdf.avg[rocdf.avg[,"classifieravg"]=="KNN Average",], alpha=1, size=0.5) + geom_abline(color="#707070") + geom_boxplot(alpha=0.05, aes(x=x, y=y,group = cut_width(x, 0.02))) + scale_color_manual(values=cc) +xlab("FPR") +ylab("TPR") + ggtitle(paste("ROC Curves for KNN model with 10-fold CV repeated", repeats, "times"))
pp
pp <- ggplot() + geom_line(aes(meanx, meany, color=classifieravg), data=rocdf.avg, alpha=1, size=0.5) + geom_abline(color="#707070") + scale_color_manual(values=cc) +xlab("FPR") +ylab("TPR") + ggtitle(paste("Average ROC Curves for 10-fold CV repeated", repeats, "times"))
for(i in seq(1, length(auc))){
print(i)
pp <- pp + annotate("text", label = paste(l[i], "AUC:", round(auc[i], 3), "ACC:", round(acc[i], 3), sep=" "), x = 1, y = 0.25-(i*0.05), size = 4, colour = cc[l[i]][[1]], hjust=1)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
pp <- pp + theme_bw()+ theme(text = element_text(size=14), axis.text = element_text(size=14), panel.border = element_blank(), panel.grid.major.x = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + ggtitle("ROC curves across 1000 iterations")
pp
Let’s take a look at the features selected by the logistic regression
bestcoef <- coef(lrTrain$finalModel, s = lrTrain$bestTune$lambda)
bestcoef <- as.data.frame(as.matrix(bestcoef))
temp <- bestcoef[bestcoef[,"1"]!=0 & rownames(bestcoef)!="(Intercept)",]
temp.name <- rownames(bestcoef)[bestcoef[,"1"]!=0 & rownames(bestcoef)!="(Intercept)"]
temp <- as.data.frame(temp)
colnames(temp) <- c("val")
temp$name <- temp.name
temp[,"val"] <- temp["val"]*-1
col <- as.vector(unlist(lapply(temp[,"val"], function(x){if(x>0) "Positive" else "Negative"})))
ggplot(temp, aes(y=val, x=reorder(name, abs(val)))) + geom_bar(stat="identity", aes(fill=col)) + coord_flip()+xlab("Feature") +ylab("Coeffecient") + scale_fill_manual(guide=FALSE, values=c("Red", "Blue")) + theme(panel.background = element_rect(fill = '#FFFFFF', colour = '#000000'))
We see here taht Neut_micro and Neut_VSV are the only neutralization readouts selected. Let’s examine the correlations between the different correlation readouts.
library(reshape2)
c <- cor(df[,c("Protection", "Neut_micro", "Neut_dVP30", "Neut_VSV", "unNeutFrac")])
c
## Protection Neut_micro Neut_dVP30 Neut_VSV unNeutFrac
## Protection 1.0000000 0.7264262 0.6954513 0.6874017 0.7251843
## Neut_micro 0.7264262 1.0000000 0.7432890 0.8452369 0.8492636
## Neut_dVP30 0.6954513 0.7432890 1.0000000 0.8315803 0.8916419
## Neut_VSV 0.6874017 0.8452369 0.8315803 1.0000000 0.9705272
## unNeutFrac 0.7251843 0.8492636 0.8916419 0.9705272 1.0000000
c <- melt(c)
ggplot(c, aes(x=Var1, y=Var2)) + geom_tile(aes(fill=value)) + scale_fill_gradient(low="white", high="steelblue")
Let’s look at the distribution of the four neutrlization readouts vs Protection.
library(patchwork)
plts <- list()
for(i in c("Neut_micro", "Neut_dVP30", "Neut_VSV", "unNeutFrac")){
p <- ggplot(df, aes_string(x=i,y="Protection")) + geom_point() + geom_smooth(method='lm') + theme_bw()
plts[[length(plts)+1]] <- p
}
p <- plts[[1]]
for(x in plts[-1]){
p <- p+ x
}
p + plot_layout(ncol = 2)
We see that the four neutralization readouts are highyl correlated. Let’s try removing Neut_microand Neut_dVP30 to see how the classifier does with unNeutFrac and Neut_VSV.
t <- subset(t, select=-c(Neut_micro, Neut_dVP30))
lrControl <- trainControl(
method = "repeatedcv",
number = 10,
repeats= repeats,
verboseIter = FALSE,
returnData = FALSE,
allowParallel = TRUE,
summaryFunction = twoClassSummary,
classProbs = TRUE,
savePredictions = T
)
set.seed(112358)
lrTrain <- train(
x = as.matrix(subset(t, select=-c(label, Protection))),
y = as.character(t[,"label"]),
trControl = lrControl,
method = "glmnet",
tuneLength = 10
)
## Warning in train.default(x = as.matrix(subset(t, select = -c(label,
## Protection))), : The metric "Accuracy" was not in the result set. ROC will
## be used instead.
lrTrain
## glmnet
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1000 times)
## Summary of sample sizes: 150, 150, 150, 150, 151, 150, ...
## Resampling results across tuning parameters:
##
## alpha lambda ROC Sens Spec
## 0.1 0.0001407182 0.8716917 0.708545 0.8707682
## 0.1 0.0003250773 0.8726078 0.709085 0.8716583
## 0.1 0.0007509707 0.8850941 0.712015 0.8847492
## 0.1 0.0017348398 0.8975295 0.705670 0.8956606
## 0.1 0.0040077050 0.9090007 0.701920 0.9010129
## 0.1 0.0092583185 0.9191009 0.709030 0.9072992
## 0.1 0.0213879171 0.9274150 0.703090 0.9178015
## 0.1 0.0494088638 0.9339700 0.711125 0.9356818
## 0.1 0.1141408866 0.9365498 0.720145 0.9487015
## 0.1 0.2636802588 0.9337961 0.707345 0.9699167
## 0.2 0.0001407182 0.8604415 0.694380 0.8608947
## 0.2 0.0003250773 0.8719087 0.708780 0.8709652
## 0.2 0.0007509707 0.8849660 0.712210 0.8842962
## 0.2 0.0017348398 0.8981791 0.706705 0.8961818
## 0.2 0.0040077050 0.9104803 0.701815 0.9023136
## 0.2 0.0092583185 0.9211828 0.707240 0.9094795
## 0.2 0.0213879171 0.9296668 0.706240 0.9222652
## 0.2 0.0494088638 0.9371056 0.715165 0.9387470
## 0.2 0.1141408866 0.9375664 0.730145 0.9557758
## 0.2 0.2636802588 0.9310188 0.658375 0.9617576
## 0.3 0.0001407182 0.8583485 0.690935 0.8588530
## 0.3 0.0003250773 0.8710855 0.707985 0.8702750
## 0.3 0.0007509707 0.8845583 0.712455 0.8835659
## 0.3 0.0017348398 0.8988328 0.707640 0.8966424
## 0.3 0.0040077050 0.9118799 0.701710 0.9036447
## 0.3 0.0092583185 0.9231815 0.704730 0.9116045
## 0.3 0.0213879171 0.9319316 0.708700 0.9252538
## 0.3 0.0494088638 0.9386165 0.723500 0.9424485
## 0.3 0.1141408866 0.9367500 0.743730 0.9523977
## 0.3 0.2636802588 0.9278429 0.585910 0.9529621
## 0.4 0.0001407182 0.8571024 0.689275 0.8576235
## 0.4 0.0003250773 0.8701175 0.707105 0.8692083
## 0.4 0.0007509707 0.8840902 0.712790 0.8826333
## 0.4 0.0017348398 0.8992454 0.708640 0.8971492
## 0.4 0.0040077050 0.9131922 0.702595 0.9049197
## 0.4 0.0092583185 0.9249430 0.703540 0.9140326
## 0.4 0.0213879171 0.9341662 0.709130 0.9278992
## 0.4 0.0494088638 0.9398105 0.732710 0.9443288
## 0.4 0.1141408866 0.9358463 0.743265 0.9435598
## 0.4 0.2636802588 0.9231022 0.403625 0.9667697
## 0.5 0.0001407182 0.8556533 0.687020 0.8564053
## 0.5 0.0003250773 0.8688303 0.705675 0.8682879
## 0.5 0.0007509707 0.8835236 0.713085 0.8813008
## 0.5 0.0017348398 0.8996328 0.709470 0.8972106
## 0.5 0.0040077050 0.9144863 0.704090 0.9063273
## 0.5 0.0092583185 0.9265201 0.702055 0.9164970
## 0.5 0.0213879171 0.9360828 0.709185 0.9283811
## 0.5 0.0494088638 0.9408444 0.743195 0.9465780
## 0.5 0.1141408866 0.9345570 0.729725 0.9366114
## 0.5 0.2636802588 0.8973614 0.012780 0.9965136
## 0.6 0.0001407182 0.8540244 0.684285 0.8548689
## 0.6 0.0003250773 0.8675458 0.704025 0.8668045
## 0.6 0.0007509707 0.8825884 0.713015 0.8800856
## 0.6 0.0017348398 0.8997948 0.709645 0.8970545
## 0.6 0.0040077050 0.9155266 0.705430 0.9076477
## 0.6 0.0092583185 0.9282102 0.702440 0.9196371
## 0.6 0.0213879171 0.9375682 0.711915 0.9277705
## 0.6 0.0494088638 0.9415923 0.751220 0.9458098
## 0.6 0.1141408866 0.9321814 0.721720 0.9293038
## 0.6 0.2636802588 0.8667154 0.000085 0.9999583
## 0.7 0.0001407182 0.8519575 0.681820 0.8529447
## 0.7 0.0003250773 0.8659119 0.701655 0.8651182
## 0.7 0.0007509707 0.8814914 0.712635 0.8789561
## 0.7 0.0017348398 0.8997292 0.710325 0.8960727
## 0.7 0.0040077050 0.9164052 0.707125 0.9086985
## 0.7 0.0092583185 0.9297317 0.703870 0.9227977
## 0.7 0.0213879171 0.9388396 0.716710 0.9284326
## 0.7 0.0494088638 0.9419743 0.751940 0.9413152
## 0.7 0.1141408866 0.9308437 0.722670 0.9250250
## 0.7 0.2636802588 0.8675297 0.000000 1.0000000
## 0.8 0.0001407182 0.8495185 0.678460 0.8506083
## 0.8 0.0003250773 0.8638317 0.698460 0.8627409
## 0.8 0.0007509707 0.8801207 0.712975 0.8771598
## 0.8 0.0017348398 0.8991881 0.710715 0.8945561
## 0.8 0.0040077050 0.9170448 0.708450 0.9092712
## 0.8 0.0092583185 0.9310097 0.705920 0.9245977
## 0.8 0.0213879171 0.9400690 0.721660 0.9302288
## 0.8 0.0494088638 0.9419780 0.748975 0.9358068
## 0.8 0.1141408866 0.9273848 0.737350 0.9196295
## 0.8 0.2636802588 0.8684132 0.000000 1.0000000
## 0.9 0.0001407182 0.8461051 0.675720 0.8475758
## 0.9 0.0003250773 0.8610645 0.694740 0.8592553
## 0.9 0.0007509707 0.8780555 0.711885 0.8745826
## 0.9 0.0017348398 0.8985008 0.711180 0.8928485
## 0.9 0.0040077050 0.9176233 0.710035 0.9093985
## 0.9 0.0092583185 0.9318462 0.706620 0.9243265
## 0.9 0.0213879171 0.9413319 0.727950 0.9323992
## 0.9 0.0494088638 0.9415059 0.747355 0.9328561
## 0.9 0.1141408866 0.9213497 0.765630 0.9124886
## 0.9 0.2636802588 0.8711438 0.000000 1.0000000
## 1.0 0.0001407182 0.8414896 0.673195 0.8409727
## 1.0 0.0003250773 0.8573410 0.689490 0.8534902
## 1.0 0.0007509707 0.8763516 0.712485 0.8701447
## 1.0 0.0017348398 0.8983501 0.715020 0.8907182
## 1.0 0.0040077050 0.9187792 0.716665 0.9098008
## 1.0 0.0092583185 0.9330482 0.713940 0.9236742
## 1.0 0.0213879171 0.9422862 0.739255 0.9318583
## 1.0 0.0494088638 0.9403839 0.749090 0.9287652
## 1.0 0.1141408866 0.9066605 0.783610 0.9078826
## 1.0 0.2636802588 0.8711618 0.000000 1.0000000
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 1 and lambda = 0.02138792.
predlr <- lrTrain$pred
predlr <- predlr[predlr[,"alpha"]==lrTrain$bestTune$alpha & predlr[,"lambda"]==lrTrain$bestTune$lambda, ]
predlr <- getRocCV(predlr)
p <- prediction(predlr[1][[1]], predlr[2][[1]])
a <- performance(p, "auc")
a <- mean(unlist(a@y.values))
auc <- a
a <- performance(p, "acc")
a <- mean(unlist(a@y.values))
acc <-a
auc
## [1] 0.9372778
acc
## [1] 0.6780533
bestcoef <- coef(lrTrain$finalModel, s = lrTrain$bestTune$lambda)
bestcoef <- as.data.frame(as.matrix(bestcoef))
temp <- bestcoef[bestcoef[,"1"]!=0 & rownames(bestcoef)!="(Intercept)",]
temp.name <- rownames(bestcoef)[bestcoef[,"1"]!=0 & rownames(bestcoef)!="(Intercept)"]
temp <- as.data.frame(temp)
colnames(temp) <- c("val")
temp$name <- temp.name
temp[,"val"] <- temp["val"]*-1
col <- as.vector(unlist(lapply(temp[,"val"], function(x){if(x>0) "Positive" else "Negative"})))
ggplot(temp, aes(y=val, x=reorder(name, abs(val)))) + geom_bar(stat="identity", aes(fill=col)) + coord_flip()+xlab("Feature") +ylab("Coeffecient") + scale_fill_manual(guide=FALSE, values=c("Red", "Blue")) + theme(panel.background = element_rect(fill = '#FFFFFF', colour = '#000000'), panel.grid.major.x = element_blank(), panel.grid.major.y = element_blank())