Use the commands below to load the required packages for following along in this workshop
library(dada2)
library(DECIPHER)
library(phangorn)
library(ggplot2)
library(phyloseq)
First we Specify the directory where FASTQ files are located
path <- "~/DADA2_Tutorial"
Print out all the files that exist within the dir
list.files(path)
Next we sort the file names to ensure that the file names for the corresponding forward/reverse reads are in same order and store them in two variables
fnFs
for files with the forward readsfnRs
for files with the reverse readsfnFs <- sort(list.files(path, pattern="_R1_001.fastq"))
fnRs <- sort(list.files(path, pattern="_R2_001.fastq"))
Now we extract sample names from the filenames, assuming filenames have the format: SAMPLENAME_XXX.fastq
sample.names <- sapply(strsplit(fnFs, "_"), `[`, 1)
Finally we make sure that the file names specify the full path to the fnFs and fnRs by prefixing the path to the directory the files reside in
fnFs <- file.path(path, fnFs)
fnRs <- file.path(path, fnRs)
Plot average quality scores for forward reads
plotQualityProfile(fnFs[1:2])
Plot average quality scores of reverse reads
plotQualityProfile(fnRs[1:2])
Create a new file path to store filtered and trimmed reads. Here we place filtered files in a subdirectory called filtered/
filt_path <- file.path(path, "filtered")
Next rename filtered files
filtFs <- file.path(filt_path, paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(filt_path, paste0(sample.names, "_R_filt.fastq.gz"))
Finally we use a dada2
function to filter and trim reads
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(240,160),
maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE,
compress=TRUE, multithread=TRUE) # On Windows set multithread=FALSE
Print the summary after filtering
print(out)
Estimate the error model for DADA2 algorithm using forward reads
errF <- learnErrors(filtFs, multithread=TRUE)
Estimate the error model for DADA2 algorithim using reverse reads
errR <- learnErrors(filtRs, multithread=TRUE)
Plot the error rates for all possible base transitions
plotErrors(errF, nominalQ=TRUE)
Dereplicate FASTQ files for both forward and reverse reads to speed up computation
derepFs <- derepFastq(filtFs, verbose=TRUE)
derepRs <- derepFastq(filtRs, verbose=TRUE)
Name the derep-class objects by the sample names
names(derepFs) <- sample.names
names(derepRs) <- sample.names
Apply core sequence-variant inference algorithim to forward reads
dadaFs <- dada(derepFs, err=errF, multithread=TRUE)
Apply core sequence-variant inference algorithim to reverse reads
dadaRs <- dada(derepRs, err=errR, multithread=TRUE)
Merge denoised reads
mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE)
Inspect the merger data.frame from the first sample
head(mergers[[1]])
Organize ribosomal sequence variants (RSVs) into a sequence table (analogous to an OTU table)
seqtab <- makeSequenceTable(mergers)
dim(seqtab)
View the length of all total RSVs
table(nchar(getSequences(seqtab)))
Perform de novo chimera sequence detection and removal
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)
dim(seqtab.nochim)
Calculate the proportion of non-chimeric RSVs
sum(seqtab.nochim)/sum(seqtab)
Assign taxonomy using RDP database (Greengenes and SILVA also available)
This is performed in two steps:
1) this first one assigns phylum to genus
taxa <- assignTaxonomy(seqtab.nochim, "~/DADA2_Tutorial/Taxonomy/rdp_train_set_16.fa.gz", multithread=TRUE)
unname(head(taxa))
2) Assign species (when possible)
taxa.plus <- addSpecies(taxa, "~/DADA2_Tutorial/Taxonomy/rdp_species_assignment_16.fa.gz", verbose=TRUE)
colnames(taxa.plus) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus","Species")
unname(taxa.plus)
Extract sequences from DADA2 output
sequences<-getSequences(seqtab.nochim)
names(sequences)<-sequences
Run Sequence Alignment (MSA) using DECIPHER
alignment <- AlignSeqs(DNAStringSet(sequences), anchor=NA)
Change sequence alignment output into a phyDat structure
phang.align <- phyDat(as(alignment, "matrix"), type="DNA")
Create distance matrix
dm <- dist.ml(phang.align)
Perform Neighbor joining
treeNJ <- NJ(dm) # Note, tip order != sequence order
Internal maximum likelihood
fit = pml(treeNJ, data=phang.align)
negative edges length changed to 0!
fitGTR <- update(fit, k=4, inv=0.2)
fitGTR <- optim.pml(fitGTR, model="GTR", optInv=TRUE, optGamma=TRUE,
rearrangement = "stochastic", control = pml.control(trace = 0))
map <- import_qiime_sample_data("~/DADA2_Tutorial/Tutorial_Map.txt")
ps <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE),
tax_table(taxa.plus),phy_tree(fitGTR$tree))
Merge PhyloSeq object with map
ps <- merge_phyloseq(ps,map)
ps
Currently, the phylogenetic tree is not rooted Though it is not necessary here, you will need to root the tree if you want to calculate any phylogeny based diversity metrics (like Unifrac)
set.seed(711)
phy_tree(ps) <- root(phy_tree(ps), sample(taxa_names(ps), 1), resolve.root = TRUE)
is.rooted(phy_tree(ps))
plot_richness(ps, x="day", measures=c("Shannon", "Simpson"), color="when") + theme_bw()
ord.nmds.bray <- ordinate(ps, method="NMDS", distance="bray")
plot_ordination(ps, ord.nmds.bray, color="when", title="Bray NMDS")