We first assess the number of true positives and false positives from spiked in sample with Zika virus RNA from isolates PRVABC59 (Puerto Rico 2015, Genbank KX087101, ‘virus #1’) and FSS13025 (Cambodia 2010, Genbank KU955593, ‘virus #2’). Virus #2 was spiked in at 10%. The sites with true iSNVs(true positives) and sites that were expected to be constant were determined using gold standard metagenomic sequencing. These positions are documented in ZIKV-intrahost_True-False-Remove.csv.

iSNVs were called for each of three technical replicates PZ34(A), PZ35(B) and PZ36(C) separately and then filtered by looking at common variants using the command “ivar filtervariants”.

library(ggplot2)
## Warning in .doLoadActions(where, attach): trying to execute load actions
## without 'methods' package
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(reshape2)

## Generated in a specific order for figure. If order isn't important just use list.files("masked_variants/mix10", full.names=TRUE).
prefixes <- c("PZ34", "PZ35", "PZ36", "PZ34_35", "PZ35_36", "PZ34_36", "PZ34_35_36", "PZ34_34", "PZ35_35", "PZ36_36", "PZ34_34_34", "PZ35_35_35", "PZ36_36_36")

labels <- c("A", "B", "C", "A+B", "B+C", "A+C", "A+B+C", "A+A", "B+B", "C+C", "A+A+A", "B+B+B", "C+C+C")

true_variants <- read.csv("./ZIKV-intrahost_True-False-Remove.csv", header=TRUE)

for(i in c(1:length(prefixes))){
    r <- read.table(paste("./masked_variants/mix10/", prefixes[i], ".tsv", sep=""), header=TRUE, sep = "\t")

    alt_freq <- colnames(r)[grepl("ALT_FREQ", colnames(r))]
    if(length(alt_freq)>1){
        r[,"ALT_FREQ"] <- rowMeans(r[,alt_freq]) #In case of filtered iSNV reports, take average frequency since no filtering is done.
    } else {
        r[,"ALT_FREQ"] <- r[,alt_freq]
    }
    alt_freq <- "ALT_FREQ"
    
    r[,"REPLICATE"] <- labels[i]
    r <- r[(r[,alt_freq] >= 0.001) & (r$REF != "N"), ]
    r <- r[order(r$POS, -abs(r[,alt_freq]) ), ]
    r <- r[ !duplicated(r$POS), ]
    
    tp <- r[r[,"POS"] %in% true_variants[true_variants[,"True.False.Remove"]==TRUE,"Position"],]
    tp[,"CLASS"] <- "TP"

    fp <- r[r[,"POS"] %in% true_variants[true_variants[,"True.False.Remove"]==FALSE,"Position"],]
    fp[,"CLASS"] <- "FP"
    
    if(i==1){
        res <- rbind(tp[,c("REPLICATE", "CLASS", alt_freq)], fp[,c("REPLICATE", "CLASS", alt_freq)])
    } else {
        res <- rbind(res, tp[,c("REPLICATE", "CLASS", alt_freq)], fp[,c("REPLICATE", "CLASS", alt_freq)])
    }
}

res$REPLICATE <- factor(res$REPLICATE, levels = labels)

p <- ggplot(res, aes(x=CLASS, y=ALT_FREQ))  +
    stat_summary(fun.y = mean,
               fun.ymin = function(x) mean(x) - sd(x), 
               fun.ymax = function(x) mean(x) + sd(x), 
               geom = "pointrange") +
    geom_violin() +
    geom_hline(yintercept = c(0.01,0.03), linetype = "dashed") + scale_y_log10(breaks = c(0.001, 0.01, 0.03, 0.1, 1)) +ylab("iSNV frequency") + facet_wrap(~REPLICATE, ncol = 3) + theme_bw(base_size = 18)

p

In order to see the effect of using pseudo replicates i.e., the same replicate multiple times instead of technical replicates, let’s look at the correlation between the frequencies found.

x <- c("A", "A+A", "B", "B+B", "C", "C+C")
y <- c("A+A", "A+A+A", "B+B", "B+B+B", "C+C", "C+C+C")
corr <- sapply(c(1:6), function(i){
    cor.test(res[res$REPLICATE==x[i], "ALT_FREQ"], res[res$REPLICATE==y[i],"ALT_FREQ"])$estimate
})

corr.df <- data.frame(
    x <- x,
    y <- y,
    corr <- corr
)
colnames(corr.df) <- c("x", "y", "corr")

ggplot(corr.df, aes(x=x, y=y, fill=corr)) + geom_tile() + xlab("iSNVs 1") + ylab("iSNVs 2")

Calling iSNVs from invitro, invivo and field samples(Zika and West Nile virus).

Variants called after removing reads from primers with mismtaches are at masked_variants/. These variants were called using pipelines created with iVar, SAMtools, bwa and BEDtools. The pipeline were created using snakemake and are housed in the pipeline/ folder in the iVar repository. The iSNVs were called at a minimum frequency of 3% and a minimum quality of 20.

Below are two functions that will extract the positions and frequencies of alternate alleles from the variants report generated by iVar.

library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
get_pos_freq_list_replicate_a <- function(filtered_variants, metadata){
    pos_freq.list <- lapply(filtered_variants, function(x){
        field_samples <- read.table(x, sep="\t", header=TRUE)
        field_samples <- field_samples[!grepl("\\+|\\-", field_samples[,"ALT"]),] #Remove insertions and deletions
        rep.a <- cbind(field_samples[, 1:4], field_samples[, grepl("_a", colnames(field_samples))])
        colnames(rep.a) <- sapply(colnames(rep.a), function(x){
            strsplit(x, "_\\.")[[1]][1]
        })
        rep.a <- rep.a[rep.a[,"TOTAL_DP"]>=400,]
        if(nrow(rep.a)>0){
            rep.a[,"REPLICATE"] ="a"
            rep.a[,"FILE"] = strsplit(x, "/")[[1]][7]
            rep.a[,"SPECIES"] <- metadata[metadata$file==strsplit(x, "/")[[1]][4], "species"][1]
        } else {
            rep.a[,"REPLICATE"] = as.character()
            rep.a[,"FILE"] = as.character()
            rep.a[,"SPECIES"] <- as.character()
        }    
        return (rep.a[,c("POS", "ALT_FREQ", "REPLICATE", "SPECIES")]);
    });
    return(pos_freq.list);
}

get_pos_freq_list_replicate_b <- function(filtered_variants, metadata){
    pos_freq.list <- lapply(filtered_variants, function(x){
        field_samples <- read.table(x, sep="\t", header=TRUE)
        field_samples <- field_samples[!grepl("\\+|\\-", field_samples[,"ALT"]),] #Remove insertions and deletions
        rep.b <- cbind(field_samples[, 1:4], field_samples[, grepl("_b", colnames(field_samples))])
        colnames(rep.b) <- sapply(colnames(rep.b), function(x){
            strsplit(x, "_\\.")[[1]][1]
        })
        rep.b <- rep.b[rep.b[,"TOTAL_DP"]>=400,]
        if(nrow(rep.b)>0){
            rep.b[,"REPLICATE"] ="b"
            rep.b[,"FILE"] = strsplit(x, "/")[[1]][7]
            rep.b[,"SPECIES"] <- metadata[metadata$file==strsplit(x, "/")[[1]][4], "species"][1]
        } else {
            rep.b[,"REPLICATE"] = as.character()
            rep.b[,"FILE"] = as.character()
            rep.b[,"SPECIES"] <- as.character()
        }
        return (rep.b[,c("POS", "ALT_FREQ", "REPLICATE", "SPECIES")]);
    })
    return(pos_freq.list);
}

Let’s now plot the iSNVs found in the intersection of both replicates.

Field Zika samples

field_zika_path <- "./masked_variants/field_samples_zkv"

filtered_variants <- list.files(field_zika_path, pattern="*.filtered.tsv", full.names=TRUE)

metadata <- data.frame(
    files <- c("ZI-28", "ZI-32", "ZI-33", "ZI-34", "ZI-35", "ZI-47", "ZI-26", "ZI-27"),
    species <- c("Human", "Human", "Aedes", "Aedes", "Aedes", "Aedes", "Human", "Human")
)
colnames(metadata) <- c("file", "species")
metadata$file
## [1] ZI-28 ZI-32 ZI-33 ZI-34 ZI-35 ZI-47 ZI-26 ZI-27
## Levels: ZI-26 ZI-27 ZI-28 ZI-32 ZI-33 ZI-34 ZI-35 ZI-47
metadata$file <- sapply(as.character(metadata$file), function(x){
    paste(x,".filtered.tsv", sep="")
})


## Let's look at the correlation between the iSNVs called form both replicates for all samples
freq.corr <- sapply(filtered_variants, function(x){
    sample <- read.table(x, sep="\t", header=TRUE)
    freq <- cbind(sample[,"POS"], sample[, grepl("ALT_FREQ", colnames(sample))])
    freq.test <- cor.test(freq[,2], freq[,3], method="pearson")
    freq.test$estimate
})

print(freq.corr)
## ./masked_variants/field_samples_zkv/ZI-26.filtered.tsv.cor 
##                                                  0.9877711 
## ./masked_variants/field_samples_zkv/ZI-27.filtered.tsv.cor 
##                                                  0.8799016 
## ./masked_variants/field_samples_zkv/ZI-28.filtered.tsv.cor 
##                                                  0.9858001 
## ./masked_variants/field_samples_zkv/ZI-32.filtered.tsv.cor 
##                                                 -0.0401659 
## ./masked_variants/field_samples_zkv/ZI-33.filtered.tsv.cor 
##                                                  0.9983822 
## ./masked_variants/field_samples_zkv/ZI-34.filtered.tsv.cor 
##                                                  0.2437190 
## ./masked_variants/field_samples_zkv/ZI-35.filtered.tsv.cor 
##                                                  0.9216131 
## ./masked_variants/field_samples_zkv/ZI-47.filtered.tsv.cor 
##                                                  0.9826250
pos_freq.list.a <- get_pos_freq_list_replicate_a(filtered_variants, metadata)
pos_freq.list.b <- get_pos_freq_list_replicate_b(filtered_variants, metadata)


pos_freq <- ldply(pos_freq.list.a)
pos_freq <- rbind(pos_freq, ldply(pos_freq.list.b))

ggplot(pos_freq, aes(x=POS, y=ALT_FREQ, color=SPECIES, shape=REPLICATE)) + geom_point() + scale_x_continuous(breaks = c(0, 2000, 4000, 6000, 8000, 10000)) + scale_y_continuous(limits = c(0, 1), breaks = c(0, 0.2,0.4,0.6,0.8,1))

Field West Nile virus samples

field_wnv_path <- "./masked_variants/field_samples_wnv"

filtered_variants <- list.files(field_wnv_path, pattern="*.filtered.tsv", full.names=TRUE)

metadata <- data.frame(
    files <- c("ZI-55", "ZI-56", "ZI-57", "ZI-58", "ZI-59", "ZI-60", "ZI-61", "ZI-62"),
    species <- c("Culex", "Culex", "Culex", "Culex", "Crow", "Crow", "Crow", "Crow")
)

colnames(metadata) <- c("file", "species")
metadata$file
## [1] ZI-55 ZI-56 ZI-57 ZI-58 ZI-59 ZI-60 ZI-61 ZI-62
## Levels: ZI-55 ZI-56 ZI-57 ZI-58 ZI-59 ZI-60 ZI-61 ZI-62
metadata$file <- sapply(as.character(metadata$file), function(x){
    paste(x,".filtered.tsv", sep="")
})


## Let's look at the correlation between the iSNVs called form both replicates for all samples
freq.corr <- sapply(filtered_variants, function(x){
    sample <- read.table(x, sep="\t", header=TRUE)
    freq <- cbind(sample[,"POS"], sample[, grepl("ALT_FREQ", colnames(sample))])
    freq.test <- cor.test(freq[,2], freq[,3], method="pearson")
    freq.test$estimate
})

print(freq.corr)
## ./masked_variants/field_samples_wnv/ZI-55.filtered.tsv.cor 
##                                                  0.8255764 
## ./masked_variants/field_samples_wnv/ZI-56.filtered.tsv.cor 
##                                                  0.9771972 
## ./masked_variants/field_samples_wnv/ZI-57.filtered.tsv.cor 
##                                                  0.9884330 
## ./masked_variants/field_samples_wnv/ZI-58.filtered.tsv.cor 
##                                                  0.3526524 
## ./masked_variants/field_samples_wnv/ZI-59.filtered.tsv.cor 
##                                                  0.9978326 
## ./masked_variants/field_samples_wnv/ZI-60.filtered.tsv.cor 
##                                                  0.9984815 
## ./masked_variants/field_samples_wnv/ZI-61.filtered.tsv.cor 
##                                                  0.9733419 
## ./masked_variants/field_samples_wnv/ZI-62.filtered.tsv.cor 
##                                                  0.9991160
pos_freq.list.a <- get_pos_freq_list_replicate_a(filtered_variants, metadata)
pos_freq.list.b <- get_pos_freq_list_replicate_b(filtered_variants, metadata)

pos_freq <- ldply(pos_freq.list.a)
pos_freq <- rbind(pos_freq, ldply(pos_freq.list.b))

ggplot(pos_freq, aes(x=POS, y=ALT_FREQ, color=SPECIES, shape=REPLICATE)) + geom_point() + scale_x_continuous(breaks = c(0, 2000, 4000, 6000, 8000, 10000)) + scale_y_continuous(limits = c(0, 1), breaks = c(0, 0.2,0.4,0.6,0.8,1))

Invivo samples

invivo_path <- "./masked_variants/invivo"

filtered_variants <- list.files(invivo_path, pattern="*.filtered.tsv", full.names=TRUE)

metadata <- data.frame(
    files <- c("ZI-06", "ZI-07", "ZI-09", "ZI-10", "ZI-11", "ZI-12", "ZI-13", "ZI-15", "ZI-16", "ZI-37", "ZI-48"),
    species <- c("Aedes", "Aedes", "Aedes", "Aedes", "NHP", "NHP", "NHP", "NHP", "NHP", "NHP", "Aedes")
)
colnames(metadata) <- c("file", "species")
metadata$file
##  [1] ZI-06 ZI-07 ZI-09 ZI-10 ZI-11 ZI-12 ZI-13 ZI-15 ZI-16 ZI-37 ZI-48
## 11 Levels: ZI-06 ZI-07 ZI-09 ZI-10 ZI-11 ZI-12 ZI-13 ZI-15 ZI-16 ... ZI-48
metadata$file <- sapply(as.character(metadata$file), function(x){
    paste(x,".masked.filtered.tsv", sep="")
})


## Let's look at the correlation between the iSNVs called form both replicates for all samples
freq.corr <- sapply(filtered_variants, function(x){
    sample <- read.table(x, sep="\t", header=TRUE)
    freq <- cbind(sample[,"POS"], sample[, grepl("ALT_FREQ", colnames(sample))])
    freq.test <- cor.test(freq[,2], freq[,3], method="pearson")
    freq.test$estimate
})

print(freq.corr)
## ./masked_variants/invivo/ZI-06.masked.filtered.tsv.cor 
##                                              0.9922843 
## ./masked_variants/invivo/ZI-07.masked.filtered.tsv.cor 
##                                              0.9941981 
## ./masked_variants/invivo/ZI-09.masked.filtered.tsv.cor 
##                                              0.9992304 
## ./masked_variants/invivo/ZI-10.masked.filtered.tsv.cor 
##                                              0.9919301 
## ./masked_variants/invivo/ZI-11.masked.filtered.tsv.cor 
##                                              0.9704092 
## ./masked_variants/invivo/ZI-12.masked.filtered.tsv.cor 
##                                              0.7792300 
## ./masked_variants/invivo/ZI-13.masked.filtered.tsv.cor 
##                                              0.8208284 
## ./masked_variants/invivo/ZI-15.masked.filtered.tsv.cor 
##                                              0.9750105 
## ./masked_variants/invivo/ZI-16.masked.filtered.tsv.cor 
##                                              0.8437387 
## ./masked_variants/invivo/ZI-37.masked.filtered.tsv.cor 
##                                              0.9887140 
## ./masked_variants/invivo/ZI-48.masked.filtered.tsv.cor 
##                                              0.9575096
pos_freq.list.a <- get_pos_freq_list_replicate_a(filtered_variants, metadata)
pos_freq.list.b <- get_pos_freq_list_replicate_b(filtered_variants, metadata)

pos_freq <- ldply(pos_freq.list.a)
pos_freq <- rbind(pos_freq, ldply(pos_freq.list.b))

ggplot(pos_freq, aes(x=POS, y=ALT_FREQ, color=SPECIES, shape=REPLICATE)) + geom_point() + scale_x_continuous(breaks = c(0, 2000, 4000, 6000, 8000, 10000)) + scale_y_continuous(limits = c(0, 1), breaks = c(0, 0.2,0.4,0.6,0.8,1))

Invitro samples

invivo_path <- "./masked_variants/invitro"

filtered_variants <- list.files(invivo_path, pattern="*.filtered.tsv", full.names=TRUE)

metadata <- data.frame(
    files <- c("ZI-40", "ZI-41", "ZI-42", "ZI-43", "ZI-44", "ZI-45", "ZI-49", "ZI-50", "ZI-51", "ZI-52", "ZI-53"),
    species <- c("Input", "Aag2", "Aag2", "Aag2", "Aag2", "Aag2", "Hela", "Hela", "Hela", "Hela", "Hela")
)
colnames(metadata) <- c("file", "species")
metadata$file
##  [1] ZI-40 ZI-41 ZI-42 ZI-43 ZI-44 ZI-45 ZI-49 ZI-50 ZI-51 ZI-52 ZI-53
## 11 Levels: ZI-40 ZI-41 ZI-42 ZI-43 ZI-44 ZI-45 ZI-49 ZI-50 ZI-51 ... ZI-53
metadata$file <- sapply(as.character(metadata$file), function(x){
    paste(x,".masked.filtered.tsv", sep="")
})


## Let's look at the correlation between the iSNVs called form both replicates for all samples
freq.corr <- sapply(filtered_variants, function(x){
    sample <- read.table(x, sep="\t", header=TRUE)
    freq <- cbind(sample[,"POS"], sample[, grepl("ALT_FREQ", colnames(sample))])
    freq.test <- cor.test(freq[,2], freq[,3], method="pearson")
    freq.test$estimate
})

print(freq.corr)
## ./masked_variants/invitro/ZI-40.masked.filtered.tsv.cor 
##                                               0.9987326 
## ./masked_variants/invitro/ZI-41.masked.filtered.tsv.cor 
##                                               0.9916750 
## ./masked_variants/invitro/ZI-42.masked.filtered.tsv.cor 
##                                               0.9127039 
## ./masked_variants/invitro/ZI-43.masked.filtered.tsv.cor 
##                                               0.9340806 
## ./masked_variants/invitro/ZI-44.masked.filtered.tsv.cor 
##                                               0.4962088 
## ./masked_variants/invitro/ZI-45.masked.filtered.tsv.cor 
##                                               0.3234183 
## ./masked_variants/invitro/ZI-49.masked.filtered.tsv.cor 
##                                               0.8949094 
## ./masked_variants/invitro/ZI-50.masked.filtered.tsv.cor 
##                                               0.9779420 
## ./masked_variants/invitro/ZI-51.masked.filtered.tsv.cor 
##                                               0.9816088 
## ./masked_variants/invitro/ZI-52.masked.filtered.tsv.cor 
##                                                     NaN 
## ./masked_variants/invitro/ZI-53.masked.filtered.tsv.cor 
##                                               0.9828178
pos_freq.list.a <- get_pos_freq_list_replicate_a(filtered_variants, metadata)
pos_freq.list.b <- get_pos_freq_list_replicate_b(filtered_variants, metadata)

pos_freq <- ldply(pos_freq.list.a)
pos_freq <- rbind(pos_freq, ldply(pos_freq.list.b))

ggplot(pos_freq, aes(x=POS, y=ALT_FREQ, color=SPECIES, shape=REPLICATE)) + geom_point() + scale_x_continuous(breaks = c(0, 2000, 4000, 6000, 8000, 10000)) + scale_y_continuous(limits = c(0, 1), breaks = c(0, 0.2,0.4,0.6,0.8,1))