Remove mAb 135 because it is missing Neut_micro.
df <- read.csv("./data/master_log.csv", na.strings=c("?", "nd"), strip.white=TRUE, row.names=c("Ab"))
df <- df[!rownames(df) %in% c("135"),]
Convert Epitope Class to a factor.
df$Epitope_Class <- factor(df$Epitope_Class)
Drop columns with missing values.
df <- subset(df, select=-c(Protect_binary, Epitope_Class_ELISA, Endotoxin, Endotoxin.1, mW_Loss, aTTD, Epitope.Class..assigning.method.)) #Remove columns with empty values.
df[,"Escape..code"] <- as.factor(df[,"Escape..code"])
df[,"Makona.binding"] <- as.factor(df[,"Makona.binding"])
Transform into IIP.
df[,"unNeutFrac"] <- -log10(1/df[,"unNeutFrac"])
df[,"Neut_dVP30"] <- 1- df[,"Neut_dVP30"]
df[,"Neut_VSV"] <- log10(100/df[,"Neut_VSV"])
Let’s examine the correlations between the different neutralization readouts.
library(ggplot2)
theme_set(theme_bw(base_size = 14))
library(reshape2)
c <- cor(df[,c("Protection", "Neut_micro", "Neut_dVP30", "Neut_VSV", "unNeutFrac")])
c
## Protection Neut_micro Neut_dVP30 Neut_VSV unNeutFrac
## Protection 1.0000000 0.7264262 0.6954513 0.7210592 -0.7437038
## Neut_micro 0.7264262 1.0000000 0.7432890 0.7995016 -0.7810941
## Neut_dVP30 0.6954513 0.7432890 1.0000000 0.8395953 -0.9443173
## Neut_VSV 0.7210592 0.7995016 0.8395953 1.0000000 -0.9178748
## unNeutFrac -0.7437038 -0.7810941 -0.9443173 -0.9178748 1.0000000
c <- melt(c)
ggplot(c, aes(x=Var1, y=Var2)) + geom_tile(aes(fill=value)) + scale_fill_gradient(low="white", high="steelblue")
Let’s look at the distribution of the four neutrlization readouts vs Protection.
library(patchwork)
plts <- list()
for(i in c("Neut_micro", "Neut_dVP30", "Neut_VSV", "unNeutFrac")){
p <- ggplot(df, aes_string(x=i,y="Protection")) + geom_point() + geom_smooth(method='lm') + theme_bw()
plts[[length(plts)+1]] <- p
}
p <- plts[[1]]
for(x in plts[-1]){
p <- p+ x
}
p + plot_layout(ncol = 2)
Separate Human IgG1 and Mouse IgG1
levels(df[,"Isotype"]) <- c(levels(df[,"Isotype"]), "HumanIgG1")
df[df[,"Isotype"]=="IgG1" & df[,"Species"]=="human", "Isotype"] <- "HumanIgG1"
levels(df[,"Isotype"]) <- c(levels(df[,"Isotype"]), "MouseIgG1")
df[df[,"Isotype"]=="IgG1" & df[,"Species"]=="mouse", "Isotype"] <- "MouseIgG1"
df[,"Isotype"] <- droplevels(df[,"Isotype"])
Convert polyfunctionality to numeric type.
df[,"Polyfunctionality"] <- as.numeric(df[,"Polyfunctionality"])
Add Epitope Tier columns
df[df$Epitope_Class %in% c("Cap", "GP1/Head", "Mucin"), "Epitope_Tier"] <- "Tier1"
df[df$Epitope_Class %in% c("Base", "GP1/Core", "Fusion"), "Epitope_Tier"] <- "Tier2"
df[df$Epitope_Class %in% c("GP1/2", "HR2"), "Epitope_Tier"] <- "Tier3"
df[df$Epitope_Class %in% c("Unknown"), "Epitope_Tier"] <- "TierUnknown"
Dropidentical features and features with missing experimental data.
t <- subset(df, select=-c(Round, Makona.binding, Escape..code, Epitope_Tier, Total_SA,Cross.reactivity))
t <- subset(t, select=-c(Isotype))
Remove features with near zero variance. Features are centered and scaled.
library(caret)
s <- dummyVars(~., data <- t, levelsOnly=TRUE)
q <- predict(s, t)
h <- nearZeroVar(q, saveMetrics=TRUE)
t.var <- apply(q, 2, var)
t.var <- as.data.frame(t.var)
t.var[,"Var"] <- rownames(t.var)
t.var[,"nzv"] <- h$nzv
ggplot(t.var, aes(x=reorder(Var, t.var), y=t.var)) + geom_bar(stat="identity", aes(fill=nzv)) + scale_y_continuous(trans='log10') + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + scale_fill_manual(values = c("steelblue", "indianred"),name= "Near Zero Variance") + ggtitle("Features with near zero variance that will be removed.") +xlab("Feature") + ylab("log10(Variance)")
q <- q[,(colnames(q) %in% rownames(h[h[,"nzv"]==FALSE ,]))]
q <- subset(q, select =-c(Unknown, sGP.binderFALSE))
norm <- preProcess(q, method=c("center", "scale"))
q <- predict(norm, q)
q <- as.data.frame(q)
head(q)
## Protection Base Cap GP1/2 GP1/Core Mucin
## 1 0.4289280 -0.301588 1.6930603 -0.301588 -0.4854046 -0.3677495
## 2 -1.0652518 -0.301588 -0.5871096 -0.301588 -0.4854046 2.7029591
## 3 -0.7664158 -0.301588 1.6930603 -0.301588 -0.4854046 -0.3677495
## 4 -0.4675799 -0.301588 -0.5871096 -0.301588 -0.4854046 2.7029591
## 5 -0.1687439 -0.301588 1.6930603 -0.301588 -0.4854046 -0.3677495
## 6 0.1300921 -0.301588 -0.5871096 -0.301588 -0.4854046 2.7029591
## Neut_VSV unNeutFrac Neut_micro Neut_dVP30 GP_EC50 sGP_EC50
## 1 -0.6613522 0.6363919 -1.0074874 -0.6029719 -0.3779862 -1.0684007
## 2 -0.6613522 0.6363919 -1.0074874 -0.6029719 -0.3799966 0.9542301
## 3 -0.6613522 0.6363919 -0.5342092 -0.6029719 0.3680830 -1.0247257
## 4 -0.6613522 0.6363919 -1.0074874 -0.6029719 -0.3809804 0.9542301
## 5 1.0107626 0.1670253 0.4596750 -0.6029719 -0.1946556 -1.0545575
## 6 -0.6613522 0.6363919 -1.0074874 -0.6029719 -0.3654533 0.9542301
## human mouse Polyfunctionality huADCP mADCP CD107a
## 1 0.8059481 -0.8059481 1.5775151 0.9420515 1.4928146 1.4855860
## 2 0.8059481 -0.8059481 1.5775151 -0.6288721 1.0345560 2.0526654
## 3 -1.2333449 1.2333449 -0.5730565 0.4718370 -0.2366025 -0.4504150
## 4 0.8059481 -0.8059481 1.5775151 0.6436291 0.8137532 2.4809491
## 5 -1.2333449 1.2333449 -0.5730565 1.7027185 -0.2927284 -0.4067935
## 6 -1.2333449 1.2333449 -0.0354136 -0.1046406 0.5491598 -0.2783084
## IFNg MIP.1b huADNP mADNP sGP_huADCP sGP_huADNP
## 1 1.4721098 1.6246636 4.3450612 -0.12824504 1.0847236 1.0050389
## 2 1.9820216 2.5693466 3.3395864 -0.61622083 -0.3945998 0.4790355
## 3 -0.4327751 -0.5714592 -0.6443498 -0.28376405 1.0175055 -0.3613869
## 4 2.1313530 3.1345643 2.4476388 -0.94029636 -0.3672924 -0.1666382
## 5 -0.4021804 -0.5029962 -0.5420307 -0.87790251 0.3006875 -0.1921194
## 6 -0.4262191 -0.3862376 0.2830299 -0.04536366 -1.1169318 -0.4437456
## Total_G0 Total_G1 Total_G2 Total_Fucose Total_Bisecting
## 1 2.337267 -2.5251621 -0.8281199 -5.3719604 -0.1683970
## 2 1.619398 -1.6106900 -0.8294177 -2.4561018 -0.7369070
## 3 0.701013 -0.5869325 -0.5607759 0.3274304 -0.9737862
## 4 2.546092 -2.6400880 -1.1045484 -5.3980247 -0.4810775
## 5 1.325271 -1.5218515 -0.3038141 0.2532474 -0.3863258
## 6 1.033914 -1.0565072 -0.4777175 0.3341135 -0.9927366
## Total_mono_SA G0 G0F G0FB G1 G1.
## 1 -0.1427142 5.5195898 -2.1311307 -1.1673856 -0.7037159 0.22572059
## 2 0.3107384 2.5342760 -0.3934907 -0.7431267 -0.2825927 -0.25374768
## 3 0.6070378 -0.2220388 0.9525519 -0.6017070 -0.4184389 -0.10914614
## 4 -0.4144584 5.5077929 -1.9230581 -0.6788450 -0.6357928 0.26377363
## 5 1.2618430 -0.2539599 1.6387239 -0.6017070 -0.2554234 0.08111905
## 6 0.6905255 -0.2504902 1.3295374 -0.6402760 -0.3641004 -0.37551740
## G1F G1F. G1S1F G1FB G2F G2FB
## 1 -1.797929 -2.00926166 0.43313807 -0.4367530 -0.6487676 -0.329686262
## 2 -1.285002 -1.26239409 0.65351661 -0.3973272 -0.9934518 -0.351766713
## 3 -1.299307 -0.07266768 -0.02840944 -0.5944562 -1.2077149 -0.351766713
## 4 -1.771363 -1.99464586 -0.35274011 -0.6601659 -1.0291623 -0.219284004
## 5 -1.509791 -1.15058320 0.85726280 -0.5681724 -1.2387675 -0.009519716
## 6 -1.381048 -0.48848729 -0.09909689 -0.6995917 -1.2092675 -0.384887390
## G2S1 G2S1F G2S1B G2S1FB sGP.binderTRUE
## 1 0.1525347 -0.70223990 1.64704157 0.5254709 1.0522613
## 2 0.4920949 -0.01897163 -0.07530043 -0.4418074 -0.9446437
## 3 -0.5077212 1.13312379 -0.96759809 -0.4686763 1.0522613
## 4 0.7373328 -0.64968080 0.69249058 -0.1328157 -0.9446437
## 5 -0.3945344 1.29921054 1.14901497 -0.6164549 1.0522613
## 6 -0.2750596 1.17937580 -0.42806927 -0.6836270 -0.9446437
Let’s build a classifier to classify mAbs into “high” and “low” protection. We set the threshold between “high” and “low” as 0.6.
threshold <- 0.6
t[t[,"Protection"]<threshold,"label"] <- "Low"
t[(t[,"Protection"]>=threshold),"label"] <- "High"
q[,"label"] <- t[,"label"]
t <- q
Before we start training our models. Let’s enable R to use multiple cores.
library(doMC)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
registerDoMC(cores = 16)
Let’s first train a Random Forest Model and get the predictions.
repeats <- 1000
set.seed(112358)
rfControl <- trainControl(
method = "repeatedcv",
number = 10,
repeats= repeats,
verboseIter = FALSE,
returnData = FALSE,
allowParallel = TRUE,
summaryFunction = twoClassSummary,
classProbs = TRUE,
savePredictions = T,
search="grid"
)
rfTrain <- train(
x = as.matrix(subset(t, select=-c(label, Protection))),
y = as.character(t[,"label"]),
trControl = rfControl,
method = "rf",
ntree = 1000,
tuneLength = 10
)
## Warning in train.default(x = as.matrix(subset(t, select = -c(label,
## Protection))), : The metric "Accuracy" was not in the result set. ROC will
## be used instead.
rfTrain
## Random Forest
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1000 times)
## Summary of sample sizes: 150, 150, 150, 150, 151, 150, ...
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.9532592 0.739830 0.9609015
## 6 0.9528026 0.799540 0.9483038
## 11 0.9491424 0.814320 0.9477629
## 16 0.9450272 0.812080 0.9462788
## 21 0.9413282 0.809870 0.9432970
## 25 0.9387529 0.808605 0.9404811
## 30 0.9365608 0.807020 0.9356091
## 35 0.9344291 0.805315 0.9308591
## 40 0.9329533 0.804490 0.9273265
## 45 0.9316744 0.804155 0.9252356
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
rfTrain$finalModel
##
## Call:
## randomForest(x = x, y = y, ntree = 1000, mtry = param$mtry)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 9.58%
## Confusion matrix:
## High Low class.error
## High 36 12 0.25000000
## Low 4 115 0.03361345
predRf <- rfTrain$pred
predRf <- predRf[predRf[,"mtry"]==rfTrain$bestTune$mtry,]
Let’s plot feature importance for the Random Forest model.
impRf <- varImp(rfTrain$finalModel, scaled=FALSE)
impRf[,"Variable"] <- rownames(impRf)
impRf <- impRf[with(impRf, order(-Overall)),]
impRf[,"Variable"] <- factor(impRf[,"Variable"], levels = impRf[,"Variable"])
pp <- ggplot(impRf, aes(Variable, Overall)) + geom_bar(stat="identity", fill="#000000")
pp + theme_bw()+ theme(text = element_text(size=14), axis.text = element_text(size=14), panel.border = element_blank(), panel.grid.major.x = element_blank(), panel.grid.major.y = element_line( size=.3, color="#f5f5f5"), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), axis.text.x = element_text(angle = 90, hjust = 1)) + xlab("Feature") + ylab("Mean Decrease in Gini Index")
Let’s now train a logistic regression model with elastic net regularization.
require(methods)
## Loading required package: methods
lrControl <- trainControl(
method = "repeatedcv",
number = 10,
repeats= repeats,
verboseIter = FALSE,
returnData = FALSE,
allowParallel = TRUE,
summaryFunction = twoClassSummary,
classProbs = TRUE,
savePredictions = T
)
set.seed(112358)
lrTrain <- train(
x = as.matrix(subset(t, select=-c(label, Protection))),
y = as.character(t[,"label"]),
trControl = lrControl,
method = "glmnet",
family="binomial",
tuneLength = 10
)
## Warning in train.default(x = as.matrix(subset(t, select = -c(label,
## Protection))), : The metric "Accuracy" was not in the result set. ROC will
## be used instead.
lrTrain
## glmnet
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1000 times)
## Summary of sample sizes: 150, 150, 150, 150, 151, 150, ...
## Resampling results across tuning parameters:
##
## alpha lambda ROC Sens Spec
## 0.1 0.0001502019 0.9137703 0.778490 0.8895182
## 0.1 0.0003469858 0.9144522 0.779090 0.8903341
## 0.1 0.0008015822 0.9228753 0.781465 0.9011644
## 0.1 0.0018517590 0.9308202 0.786980 0.9049947
## 0.1 0.0042778035 0.9393052 0.792255 0.9071447
## 0.1 0.0098822812 0.9474335 0.800670 0.9161008
## 0.1 0.0228293518 0.9529352 0.790285 0.9291644
## 0.1 0.0527387649 0.9578061 0.764705 0.9516432
## 0.1 0.1218333901 0.9602731 0.747650 0.9571068
## 0.1 0.2814509401 0.9590345 0.744310 0.9593371
## 0.2 0.0001502019 0.9070355 0.773150 0.8827727
## 0.2 0.0003469858 0.9138259 0.778545 0.8896500
## 0.2 0.0008015822 0.9225662 0.781875 0.9008288
## 0.2 0.0018517590 0.9307521 0.786940 0.9050212
## 0.2 0.0042778035 0.9399869 0.791165 0.9073091
## 0.2 0.0098822812 0.9485003 0.798805 0.9168326
## 0.2 0.0228293518 0.9538168 0.787910 0.9305538
## 0.2 0.0527387649 0.9600655 0.761680 0.9546295
## 0.2 0.1218333901 0.9605824 0.762860 0.9569659
## 0.2 0.2814509401 0.9555530 0.766325 0.9643174
## 0.3 0.0001502019 0.9059250 0.772040 0.8816530
## 0.3 0.0003469858 0.9133042 0.777945 0.8891439
## 0.3 0.0008015822 0.9222647 0.781710 0.9003152
## 0.3 0.0018517590 0.9307360 0.786900 0.9050348
## 0.3 0.0042778035 0.9407258 0.790775 0.9074417
## 0.3 0.0098822812 0.9490719 0.795680 0.9177098
## 0.3 0.0228293518 0.9554276 0.788250 0.9326606
## 0.3 0.0527387649 0.9622674 0.762535 0.9552386
## 0.3 0.1218333901 0.9594614 0.785740 0.9555659
## 0.3 0.2814509401 0.9501256 0.752835 0.9661598
## 0.4 0.0001502019 0.9052845 0.771580 0.8810106
## 0.4 0.0003469858 0.9125261 0.777600 0.8885606
## 0.4 0.0008015822 0.9218761 0.781455 0.8997462
## 0.4 0.0018517590 0.9309344 0.786495 0.9048614
## 0.4 0.0042778035 0.9413537 0.790610 0.9076470
## 0.4 0.0098822812 0.9492646 0.794455 0.9186523
## 0.4 0.0228293518 0.9574250 0.790110 0.9353705
## 0.4 0.0527387649 0.9630433 0.770965 0.9564152
## 0.4 0.1218333901 0.9568769 0.787470 0.9508174
## 0.4 0.2814509401 0.9356430 0.698795 0.9661576
## 0.5 0.0001502019 0.9045698 0.771440 0.8801636
## 0.5 0.0003469858 0.9117711 0.777130 0.8877598
## 0.5 0.0008015822 0.9213198 0.781155 0.8990553
## 0.5 0.0018517590 0.9308780 0.786730 0.9044644
## 0.5 0.0042778035 0.9418714 0.790325 0.9081606
## 0.5 0.0098822812 0.9495269 0.794745 0.9198038
## 0.5 0.0228293518 0.9589687 0.791635 0.9380121
## 0.5 0.0527387649 0.9625539 0.777590 0.9569841
## 0.5 0.1218333901 0.9545387 0.776445 0.9469250
## 0.5 0.2814509401 0.9207545 0.588955 0.9663417
## 0.6 0.0001502019 0.9037667 0.771195 0.8796515
## 0.6 0.0003469858 0.9108517 0.776235 0.8866606
## 0.6 0.0008015822 0.9206239 0.781000 0.8978841
## 0.6 0.0018517590 0.9306203 0.786685 0.9043432
## 0.6 0.0042778035 0.9421677 0.790265 0.9086864
## 0.6 0.0098822812 0.9498382 0.795870 0.9205500
## 0.6 0.0228293518 0.9600617 0.791330 0.9406644
## 0.6 0.0527387649 0.9614406 0.782285 0.9563621
## 0.6 0.1218333901 0.9509284 0.770990 0.9474871
## 0.6 0.2814509401 0.9195468 0.428230 0.9784114
## 0.7 0.0001502019 0.9027185 0.770685 0.8790030
## 0.7 0.0003469858 0.9096162 0.775025 0.8855053
## 0.7 0.0008015822 0.9196314 0.780345 0.8966083
## 0.7 0.0018517590 0.9302576 0.786230 0.9037038
## 0.7 0.0042778035 0.9422897 0.790840 0.9094326
## 0.7 0.0098822812 0.9502333 0.796985 0.9210076
## 0.7 0.0228293518 0.9605094 0.789845 0.9424038
## 0.7 0.0527387649 0.9598780 0.786075 0.9532159
## 0.7 0.1218333901 0.9455998 0.760925 0.9465735
## 0.7 0.2814509401 0.9190737 0.149095 0.9980167
## 0.8 0.0001502019 0.9015157 0.770545 0.8782970
## 0.8 0.0003469858 0.9078433 0.773885 0.8839583
## 0.8 0.0008015822 0.9181157 0.779925 0.8953356
## 0.8 0.0018517590 0.9294290 0.785650 0.9028886
## 0.8 0.0042778035 0.9422117 0.790850 0.9094621
## 0.8 0.0098822812 0.9507573 0.798245 0.9220000
## 0.8 0.0228293518 0.9607676 0.789810 0.9433598
## 0.8 0.0527387649 0.9578892 0.787930 0.9491864
## 0.8 0.1218333901 0.9389147 0.740330 0.9444992
## 0.8 0.2814509401 0.9183226 0.002760 0.9998326
## 0.9 0.0001502019 0.8995496 0.771000 0.8771098
## 0.9 0.0003469858 0.9051700 0.773695 0.8816205
## 0.9 0.0008015822 0.9155809 0.779095 0.8925356
## 0.9 0.0018517590 0.9282226 0.784945 0.9018144
## 0.9 0.0042778035 0.9417558 0.791145 0.9097091
## 0.9 0.0098822812 0.9509883 0.800415 0.9229803
## 0.9 0.0228293518 0.9608764 0.791695 0.9436803
## 0.9 0.0527387649 0.9557799 0.788810 0.9457932
## 0.9 0.1218333901 0.9303100 0.732465 0.9444523
## 0.9 0.2814509401 0.9125319 0.000000 1.0000000
## 1.0 0.0001502019 0.8938665 0.770750 0.8739576
## 1.0 0.0003469858 0.8995557 0.772585 0.8775076
## 1.0 0.0008015822 0.9107408 0.777545 0.8882273
## 1.0 0.0018517590 0.9258452 0.784200 0.9001091
## 1.0 0.0042778035 0.9410308 0.791830 0.9098000
## 1.0 0.0098822812 0.9511013 0.802415 0.9242205
## 1.0 0.0228293518 0.9603100 0.793415 0.9424515
## 1.0 0.0527387649 0.9534488 0.787690 0.9405576
## 1.0 0.1218333901 0.9229691 0.725065 0.9504515
## 1.0 0.2814509401 0.8753034 0.000000 1.0000000
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 0.4 and lambda
## = 0.05273876.
predlr <- lrTrain$pred
predlr <- predlr[predlr[,"alpha"]==lrTrain$bestTune$alpha & predlr[,"lambda"]==lrTrain$bestTune$lambda, ]
In logistic regression notice how alpha = 1(lasso) performs best based on ROC. Let’s now train a support vector machine.
set.seed(112358)
svmControl <- trainControl(
method = "repeatedcv",
number = 10,
repeats= repeats,
classProbs = T,
savePredictions = T
)
svmTrainLinear <- train(label~., data = subset(t, select=-c(Protection)),trControl = svmControl,method = "svmLinear", tuneLength = 10)
svmTrainRadial <- train(label~., data = subset(t, select=-c(Protection)),trControl = svmControl,method = "svmRadial", tuneLength = 10)
svmTrainLinear
## Support Vector Machines with Linear Kernel
##
## 167 samples
## 45 predictor
## 2 classes: 'High', 'Low'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1000 times)
## Summary of sample sizes: 150, 150, 150, 150, 151, 150, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8656652 0.6560753
##
## Tuning parameter 'C' was held constant at a value of 1
svmTrainRadial
## Support Vector Machines with Radial Basis Function Kernel
##
## 167 samples
## 45 predictor
## 2 classes: 'High', 'Low'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1000 times)
## Summary of sample sizes: 151, 150, 151, 150, 151, 150, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.8752351 0.6960810
## 0.50 0.8819904 0.6908564
## 1.00 0.8740590 0.6698671
## 2.00 0.8705411 0.6618988
## 4.00 0.8716944 0.6667776
## 8.00 0.8684993 0.6584494
## 16.00 0.8614388 0.6401486
## 32.00 0.8468164 0.6032123
## 64.00 0.8441723 0.5973946
## 128.00 0.8444904 0.5983386
##
## Tuning parameter 'sigma' was held constant at a value of 0.01430736
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.01430736 and C = 0.5.
predSVMLinear <- svmTrainLinear$pred
predSVMRadial <- svmTrainRadial$pred
We see that a linear function does better than the radial basis function.
predSVM <- predSVMLinear
Let’s now train a K nearest neighbour model
## KNN
set.seed(112358)
knnControl <- trainControl(
method = "repeatedcv",
number = 10,
repeats= repeats,
savePredictions = T,
classProbs= T
)
knnTrain <- train(label~., data = subset(t, select=-c(Protection)),trControl = knnControl,method = "knn", tuneLength = 10)
predKnn <- knnTrain$pred
predKnn <- predKnn[predKnn[,"k"]==knnTrain$bestTune$k,]
knnTrain
## k-Nearest Neighbors
##
## 167 samples
## 45 predictor
## 2 classes: 'High', 'Low'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1000 times)
## Summary of sample sizes: 150, 150, 150, 150, 151, 150, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.8816973 0.6991489
## 7 0.8753678 0.6909522
## 9 0.8914540 0.7311594
## 11 0.8978546 0.7455154
## 13 0.8905556 0.7275586
## 15 0.8910849 0.7315575
## 17 0.8970209 0.7448255
## 19 0.8944119 0.7367887
## 21 0.8944554 0.7333482
## 23 0.8944807 0.7302065
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 11.
Let’s now compute the ROC curves for all four classifiers. The function getRocCV computes the ROC curves for each iteration(We used a 1000 iterations for each classifier). getAvgRoc plots the average of all 1000 ROC curves.
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
getRocCV <- function(pred){
temp.obs <- c()
temp.pred <- c()
for(i in seq(1, repeats)){
temp <- paste("Rep", sprintf("%04d",i), sep="")
temp.p <- pred[grepl(temp, pred[,"Resample"]), "Low"]
temp.o <- pred[grepl(temp, pred[,"Resample"]), "obs"]
temp.pred[[length(temp.pred)+1]] <- temp.p
temp.obs[[length(temp.obs)+1]] <- temp.o
}
return(list(temp.pred, temp.obs));
}
getAvgRoc <- function(roc){
m <- max(sapply(roc@x.values, length))
resx <- sapply(roc@x.values, function(x){
x <- c(x, rep(NA, m-length(x)));
});
resy <- sapply(roc@y.values, function(x){
x <- c(x, rep(NA, m-length(x)));
});
roc.df <- data.frame(rowMeans(as.data.frame(resx), na.rm=T), rowMeans(as.data.frame(resy), na.rm=T))
colnames(roc.df) <- c("meanx", "meany")
return(roc.df)
}
predRf <- getRocCV(predRf)
p <- prediction(predRf[1][[1]], predRf[2][[1]])
rocRf <- performance(p, "tpr", "fpr")
rocRf.avg <- getAvgRoc(rocRf)
predlr <- getRocCV(predlr)
p <- prediction(predlr[1][[1]], predlr[2][[1]])
roclr <- performance(p, "tpr", "fpr")
roclr.avg <- getAvgRoc(roclr)
predSVM <- getRocCV(predSVM)
p <- prediction(predSVM[1][[1]], predSVM[2][[1]])
rocSvm <- performance(p, "tpr", "fpr")
rocSvm.avg <- getAvgRoc(rocSvm)
predKnn <- getRocCV(predKnn)
p <- prediction(predKnn[1][[1]], predKnn[2][[1]])
rocKnn <- performance(p, "tpr", "fpr")
rocKnn.avg <- getAvgRoc(rocKnn)
Let’s compute the average accuracy and AUC across all 1000 iterations for each classifier and plot the ROC curves.
l <- c("Random Forest", "Logistic Regression", "SVM", "KNN")
## Get AUC
r <- list(predRf, predlr, predSVM, predKnn)
auc <- c()
acc <- c()
for(i in seq(1, length(r))){
p <- prediction(r[i][[1]][[1]], r[i][[1]][[2]])
a <- performance(p, "auc")
a <- mean(unlist(a@y.values))
auc <- c(auc,a)
a <- performance(p, "acc")
a <- mean(unlist(a@y.values))
acc <- c(acc,a)
}
acc
## [1] 0.6998238 0.6872538 0.6668959 0.7694027
auc
## [1] 0.9501534 0.9598734 0.9098766 0.9176502
r <- list(rocRf, roclr, rocSvm, rocKnn)
r.avg <- list(rocRf.avg, roclr.avg, rocSvm.avg, rocKnn.avg)
rocdf <- as.data.frame(matrix(ncol = 4, nrow = 0))
rocdf.avg <- as.data.frame(matrix(ncol = 3, nrow = 0))
for(i in seq(1, length(r))){
r.t <- r[i]
for(j in seq(1, repeats)){
temp <- data.frame(r.t[[1]]@x.values[[j]], r.t[[1]]@y.values[[j]], rep(l[i], length(r.t[[1]]@y.values[[j]])), rep(paste(l[i], j, sep="."), length(r.t[[1]]@y.values[[j]])))
colnames(temp) <- c("x", "y", "classifier", "rep")
rocdf <- rbind(rocdf, temp)
}
r.t <- r.avg[[i]]
temp <- data.frame(r.t$meanx, r.t$meany, rep(paste(l[i], "Average", sep=" "), length(r.t$meanx)))
colnames(temp) <- c("meanx", "meany", "classifieravg")
rocdf.avg <- rbind(rocdf.avg, temp)
}
colnames(rocdf) <- c("x", "y", "classifier", "rep")
library(RColorBrewer)
classifierColor <- brewer.pal(9,"Dark2")
## Warning in brewer.pal(9, "Dark2"): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors
cc <- c()
ll <- c()
for(i in seq(1, length(l))){
cc <- c(cc, rep(classifierColor[i], 2))
ll <- c(ll, l[i], paste(l[i], "Average", sep=" "))
}
names(cc) <- ll
pp <- ggplot(rocdf[rocdf[,"classifier"]=="Random Forest",]) + geom_line(aes(x = meanx, y = meany), data=rocdf.avg[rocdf.avg[,"classifieravg"]=="Random Forest Average",], alpha=1, size=0.5) + geom_abline(color="#707070") + geom_boxplot(alpha=0.05, aes(x=x, y=y,group = cut_width(x, 0.02))) + scale_color_manual(values=cc) +xlab("FPR") +ylab("TPR") + ggtitle(paste("ROC Curves for Random Forest model with 10-fold CV repeated", repeats, "times"))
pp
pp <- ggplot(rocdf[rocdf[,"classifier"]=="Logistic Regression",]) + geom_line(aes(x = meanx, y = meany), data=rocdf.avg[rocdf.avg[,"classifieravg"]=="Logistic Regression Average",], alpha=1, size=0.5) + geom_abline(color="#707070") + geom_boxplot(alpha=0.05, aes(x=x, y=y,group = cut_width(x, 0.02))) + scale_color_manual(values=cc) +xlab("FPR") +ylab("TPR") + ggtitle(paste("ROC Curves for Logistic Regression model with 10-fold CV repeated", repeats, "times"))
pp
pp <- ggplot(rocdf[rocdf[,"classifier"]=="SVM",]) + geom_line(aes(x = meanx, y = meany), data=rocdf.avg[rocdf.avg[,"classifieravg"]=="SVM Average",], alpha=1, size=0.5) + geom_abline(color="#707070") + geom_boxplot(alpha=0.05, aes(x=x, y=y,group = cut_width(x, 0.02))) + scale_color_manual(values=cc) +xlab("FPR") +ylab("TPR") + ggtitle(paste("ROC Curves for SVM with 10-fold CV repeated", repeats, "times"))
pp
pp <- ggplot(rocdf[rocdf[,"classifier"]=="KNN",]) + geom_line(aes(x = meanx, y = meany), data=rocdf.avg[rocdf.avg[,"classifieravg"]=="KNN Average",], alpha=1, size=0.5) + geom_abline(color="#707070") + geom_boxplot(alpha=0.05, aes(x=x, y=y,group = cut_width(x, 0.02))) + scale_color_manual(values=cc) +xlab("FPR") +ylab("TPR") + ggtitle(paste("ROC Curves for KNN model with 10-fold CV repeated", repeats, "times"))
pp
pp <- ggplot() + geom_line(aes(meanx, meany, color=classifieravg), data=rocdf.avg, alpha=1, size=0.5) + geom_abline(color="#707070") + scale_color_manual(values=cc) +xlab("FPR") +ylab("TPR") + ggtitle(paste("Average ROC Curves for 10-fold CV repeated", repeats, "times"))
for(i in seq(1, length(auc))){
pp <- pp + annotate("text", label = paste("AUC:", round(auc[i], 3), "ACC:", round(acc[i], 3), sep=" "), x = 1, y = 0.25-(i*0.05), size = 4, colour = cc[l[i]][[1]], hjust=1)
}
pp <- pp + theme_bw()+ theme(text = element_text(size=14), axis.text = element_text(size=14), panel.border = element_blank(), panel.grid.major.x = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + ggtitle("ROC curves across 1000 iterations")
pp
pp <- ggplot() + geom_line(aes(meanx, meany, color=classifieravg), data=rocdf.avg, alpha=1, size=0.5) + geom_abline(color="#707070") + scale_color_manual(values=cc) +xlab("FPR") +ylab("TPR") + ggtitle(paste("Average ROC Curves for 10-fold CV repeated", repeats, "times"))
## for(i in seq(1, length(auc))){
## print(i)
## pp <- pp + annotate("text", label = paste(l[i], "AUC:", round(auc[i], 3), "ACC:", round(acc[i], 3), sep=" "), x = 1, y = 0.25-(i*0.05), size = 4, colour = cc[l[i]][[1]], hjust=1)
## }
pp <- pp + theme_bw()+ theme(text = element_text(size=14), axis.text = element_text(size=14), panel.border = element_blank(), panel.grid.major.x = element_blank(), panel.grid.major.y = element_line( size=.3, color="#f5f5f5"), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + ggtitle("ROC curves across 1000 iterations")
Let’s take a look at the features selected by the logistic regression
bestcoef <- coef(lrTrain$finalModel, s = lrTrain$bestTune$lambda)
bestcoef <- as.data.frame(as.matrix(bestcoef))
temp <- bestcoef[bestcoef[,"1"]!=0 & rownames(bestcoef)!="(Intercept)",]
temp.name <- rownames(bestcoef)[bestcoef[,"1"]!=0 & rownames(bestcoef)!="(Intercept)"]
temp <- as.data.frame(temp)
colnames(temp) <- c("val")
temp$name <- temp.name
temp[,"val"] <- temp["val"]*-1
col <- as.vector(unlist(lapply(temp[,"val"], function(x){if(x>0) "Positive" else "Negative"})))
ggplot(temp, aes(y=val, x=reorder(name, abs(val)))) + geom_bar(stat="identity", aes(fill=col)) + coord_flip()+xlab("Feature") +ylab("Coeffecient") + scale_fill_manual(guide=FALSE, values=c("Red", "Blue")) + theme(panel.background = element_rect(fill = '#FFFFFF', colour = '#000000'), panel.grid.major.x = element_blank(), panel.grid.major.y = element_blank(), panel.grid.minor = element_blank())
Let’s now take a look at the antibodies for which the LR model failed. Let’s plot true protection vs the probability with which the LR model decided to classify as High. We shall also annotate the false positives(bottom right corner) and false negatives(top left) with ids.
lrProb <- predict(lrTrain, q, type="prob")
lrProb[,"predicted"] <- predict(lrTrain, q)
lrProb[,"truth"] <- q[,"label"]
lrProb[,"Protection"] <- df[,"Protection"]
lrProb[,"id"] <- rownames(df)
lrProb[lrProb$predicted==lrProb$truth,"id"] <- ""
ggplot(lrProb, aes(x=High, y=Protection, color=predicted, shape=truth, group=interaction(predicted, truth))) + geom_point(size = 6) + xlab("Predicted Probability of High Protection") + ylab("True Protection") + geom_hline(yintercept=0.6, linetype="dashed", alpha=0.6) + geom_vline(xintercept=0.5, linetype="dashed", alpha=0.6) + geom_text(aes(label=id), color="black", alpha=1, nudge_y = 0.05)