Load packages needed for tutorial¶

Use the commands below to load the required packages for following along in this workshop

library(dada2)
library(DECIPHER)
library(phangorn)
library(ggplot2)
library(phyloseq)

Load fastq files into the session¶

First we Specify the directory where FASTQ files are located

path <- "~/DADA2_Tutorial"

Print out all the files that exist within the dir

list.files(path)

Next we sort the file names to ensure that the file names for the corresponding forward/reverse reads are in same order and store them in two variables

fnFs for files with the forward reads
fnRs for files with the reverse reads

fnFs <- sort(list.files(path, pattern="_R1_001.fastq"))
fnRs <- sort(list.files(path, pattern="_R2_001.fastq"))

Now we extract sample names from the filenames, assuming filenames have the format: SAMPLENAME_XXX.fastq

sample.names <- sapply(strsplit(fnFs, "_"), `[`, 1)

Finally we make sure that the file names specify the full path to the fnFs and fnRs by prefixing the path to the directory the files reside in

fnFs <- file.path(path, fnFs)
fnRs <- file.path(path, fnRs)

Inspect and Trim Reads¶

Plot average quality scores for forward reads

plotQualityProfile(fnFs[1:2])

Plot average quality scores of reverse reads

plotQualityProfile(fnRs[1:2])

Create a new file path to store filtered and trimmed reads. Here we place filtered files in a subdirectory called filtered/

filt_path <- file.path(path, "filtered")

Next rename filtered files

filtFs <- file.path(filt_path, paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(filt_path, paste0(sample.names, "_R_filt.fastq.gz"))

Finally we use a dada2 function to filter and trim reads

out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(240,160),
              maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE,
              compress=TRUE, multithread=TRUE) # On Windows set multithread=FALSE

Print the summary after filtering

print(out)

Estimate Error Rates from Data¶

Estimate the error model for DADA2 algorithm using forward reads

errF <- learnErrors(filtFs, multithread=TRUE)

Estimate the error model for DADA2 algorithim using reverse reads

errR <- learnErrors(filtRs, multithread=TRUE)

Plot the error rates for all possible base transitions

plotErrors(errF, nominalQ=TRUE)

Dereplicate and Denoise Reads using DADA2 Algorithm¶

Dereplicate FASTQ files for both forward and reverse reads to speed up computation

derepFs <- derepFastq(filtFs, verbose=TRUE)
derepRs <- derepFastq(filtRs, verbose=TRUE)

Name the derep-class objects by the sample names

names(derepFs) <- sample.names
names(derepRs) <- sample.names

Apply core sequence-variant inference algorithim to forward reads

dadaFs <- dada(derepFs, err=errF, multithread=TRUE)

Apply core sequence-variant inference algorithim to reverse reads

dadaRs <- dada(derepRs, err=errR, multithread=TRUE)

Merge Paired Reads and Create Sequence Table¶

Merge denoised reads

mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE)

Inspect the merger data.frame from the first sample

head(mergers[[1]])

Organize ribosomal sequence variants (RSVs) into a sequence table (analogous to an OTU table)

seqtab <- makeSequenceTable(mergers)
dim(seqtab)

View the length of all total RSVs

table(nchar(getSequences(seqtab)))

Identify and Remove Chimeric Sequences¶

Perform de novo chimera sequence detection and removal

seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)
dim(seqtab.nochim)

Calculate the proportion of non-chimeric RSVs

sum(seqtab.nochim)/sum(seqtab)

Assign Taxonomy¶

Assign taxonomy using RDP database (Greengenes and SILVA also available)

This is performed in two steps:

 1) this first one assigns phylum to genus

taxa <- assignTaxonomy(seqtab.nochim, "~/DADA2_Tutorial/Taxonomy/rdp_train_set_16.fa.gz", multithread=TRUE)
unname(head(taxa))

2) Assign species (when possible)

taxa.plus <- addSpecies(taxa, "~/DADA2_Tutorial/Taxonomy/rdp_species_assignment_16.fa.gz", verbose=TRUE)
colnames(taxa.plus) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus","Species")
unname(taxa.plus)

Construct Phylogenetic Tree¶

Extract sequences from DADA2 output

sequences<-getSequences(seqtab.nochim)
names(sequences)<-sequences

Run Sequence Alignment (MSA) using DECIPHER

alignment <- AlignSeqs(DNAStringSet(sequences), anchor=NA)

Change sequence alignment output into a phyDat structure

phang.align <- phyDat(as(alignment, "matrix"), type="DNA")

Create distance matrix

dm <- dist.ml(phang.align)

Perform Neighbor joining

treeNJ <- NJ(dm) # Note, tip order != sequence order

Internal maximum likelihood

fit = pml(treeNJ, data=phang.align)

negative edges length changed to 0!

fitGTR <- update(fit, k=4, inv=0.2)
fitGTR <- optim.pml(fitGTR, model="GTR", optInv=TRUE, optGamma=TRUE,
                    rearrangement = "stochastic", control = pml.control(trace = 0))

Import into phyloseq¶

map <- import_qiime_sample_data("~/DADA2_Tutorial/Tutorial_Map.txt")

ps <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE), 
               tax_table(taxa.plus),phy_tree(fitGTR$tree))

Merge PhyloSeq object with map

ps <- merge_phyloseq(ps,map)
ps

Currently, the phylogenetic tree is not rooted Though it is not necessary here, you will need to root the tree if you want to calculate any phylogeny based diversity metrics (like Unifrac)

set.seed(711)
phy_tree(ps) <- root(phy_tree(ps), sample(taxa_names(ps), 1), resolve.root = TRUE)
is.rooted(phy_tree(ps))

Alpha Diversity¶

plot_richness(ps, x="day", measures=c("Shannon", "Simpson"), color="when") + theme_bw()

Beta Diversity¶

ord.nmds.bray <- ordinate(ps, method="NMDS", distance="bray")

plot_ordination(ps, ord.nmds.bray, color="when", title="Bray NMDS")