This document gets progressively fancier towards the end!
At some point in this document, you will surely wonder which ordination method you should use. For an in depth discussion of the mathematical and statistical differences between these methods, start with this lecture. Choice of ordination method will depend on what your hypotheses are, what kind of transformation (of abundance) you want to do, and what your goal is with the analysis.
library(compositions)
library(zCompositions)
library(ape)
library(plyr)
library(dplyr)
library(ggplot2)
library(gplots)
library(lme4)
library(tidyr)
library(vegan)
library(scales)
library(grid)
library(reshape2)
library(phyloseq)
library(ALDEx2)
library(gridExtra)
library(stringr)
m.g.colors <- c( "#999999", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#CBD588", "#5F7FC7", "orange","#DA5724", "#508578", "#CD9BCD",
"#AD6F3B", "#673770","#D14285", "#652926", "#C84248",
"#8569D5", "#5E738F","#D1A33D", "#8A7C64", "#599861" )
more.colors <- c("indianred1", "steelblue3", "skyblue1", "mediumorchid","royalblue4", "olivedrab3",
"pink", "#FFED6F", "mediumorchid3", "ivory2", "tan1", "aquamarine3", "#C0C0C0",
"mediumvioletred", "#999933", "#666699", "#CC9933", "#006666", "#3399FF",
"#993300", "#CCCC99", "#666666", "#FFCC66", "#6699CC", "#663366", "#9999CC", "#CCCCCC",
"#669999", "#CCCC66", "#CC6600", "#9999FF", "#0066CC", "#99CCCC", "#999999", "#FFCC00",
"#009999", "#FF9900", "#999966", "#66CCCC", "#339966", "#CCCC33", "#EDEDED"
)
theme_set(theme_classic(base_size=12, base_family="Avenir"))
taxfile="ACS.cons.taxonomy"
sharedfile="ACS.shared"
mdf <- read.csv("fakenames.mdf.csv", row.names=1, header=TRUE)
mdf
Standard import of mothur data
mothur_data <- import_mothur(mothur_constaxonomy_file = taxfile, mothur_shared_file = sharedfile)
moth_merge <- merge_phyloseq(sample_data(mdf), tax_table(mothur_data), otu_table(mothur_data))
colnames(tax_table(moth_merge)) <- c("Kingdom", "Phylum", "Class",
"Order", "Family", "Genus")
waterdat <- moth_merge %>%
subset_taxa(
Kingdom %in% c("Bacteria", "Archaea") &
Family != "mitochondria" &
Class != "Chloroplast"
)
waterdat
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 47725 taxa and 358 samples ]
## sample_data() Sample Data: [ 358 samples by 9 sample variables ]
## tax_table() Taxonomy Table: [ 47725 taxa by 6 taxonomic ranks ]
Clean data by removing singletons and pruning samples with a depth < 10000.
otu.table <- otu_table(waterdat)
#remove singleton reads
otu.table[otu.table==1] <- 0
acsdat.clean <- merge_phyloseq(otu_table(otu.table, taxa_are_rows=TRUE), tax_table(waterdat), sample_data(waterdat)) %>% prune_samples(sample_sums(.) > 10000,.) %>% prune_taxa(taxa_sums(.) > 0,.)
acsdat.clean
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 18583 taxa and 344 samples ]
## sample_data() Sample Data: [ 344 samples by 9 sample variables ]
## tax_table() Taxonomy Table: [ 18583 taxa by 6 taxonomic ranks ]
otu.table <- data.frame(otu_table(acsdat.clean))
tax.table <- data.frame(tax_table(acsdat.clean))
mdf <- data.frame(sample_data(acsdat.clean))
transform.clr <- function(otu.df){
d.1 <- data.frame(otu.df[which(rowSums(otu.df) > 10),],
check.names=F)
d.czm <- t(cmultRepl(t(d.1), label=0, method="CZM"))
d.clr <- apply(d.czm, 2, function(x){log(x) - mean(log(x))})
return(d.clr)
}
My favorite ordinations use a CLR transform of the data. We’ll also do some that rely on samples having an even depth later in the document.
otu.clr.table <- transform.clr(otu.table)
## No. corrected values: 10092
The prcomp
implementation of PCA expects your samples to be rows and your columns to be OTUs. It returns several things of interest… the variable rotation
is the matrix of OTU loadings. The variable x
contains your sample compositions multiplied by the rotational matrix.
The data is already scaled as much as we want it to be, but you should center around zero. This feels counter-intuitive because we just applied a centered log ratio transform, but centering the data now will make it easier to interpret the loading variables of the PCA at the end.
d.pcx <- prcomp(t(otu.clr.table), scale=FALSE, center=TRUE)
d.mvar <- sum(d.pcx$sdev^2)
PC1.label <- paste("PC1: ", percent(sum(d.pcx$sdev[1]^2)/d.mvar, accuracy=0.1))
PC2.label <- paste("PC2: ", percent(sum(d.pcx$sdev[2]^2)/d.mvar, accuracy=0.1))
PC3.label <- paste("PC3: ", percent(sum(d.pcx$sdev[3]^2)/d.mvar, accuracy=0.1))
print(c(PC1.label, PC2.label, PC3.label))
## [1] "PC1: 14.8%" "PC2: 11.8%" "PC3: 5.8%"
The total variance accounted for in the first three axes is 32.4% percent. That’s pretty good.
pcx.importance <- summary(d.pcx)$importance
#extract the percent variance like so:
percent.pcx <- pcx.importance[2,]
barplot(percent.pcx[1:20])
This scree plot shows us the drop off point for variables contributing to the PCA. Here it looks like there is only meaningful gain out to PC3. Most information is in PC1 and PC2.
To visualize, extract the rotated data (x
) and make it a dataframe. We can then pull variables to use as symbology from your metadata file.
pca.points <- data.frame(d.pcx$x)
pca.points$site <- mdf[as.vector(row.names(pca.points)),"site"]
pca.points$stream <- mdf[as.vector(row.names(pca.points)),"stream"]
pca.points$community <- mdf[as.vector(row.names(pca.points)),"community"]
pca.points$site.type <- mdf[as.vector(row.names(pca.points)),"type"]
pca.points$date <- mdf[as.vector(row.names(pca.points)),"date"]
#make a month variable
pca.points$month <- as.factor(sapply(pca.points$date, function(x) str_split(x, pattern="/")[[1]][1]))
ggplot(pca.points, aes(x=PC1, y=PC2, col=site.type, shape=community)) + geom_point(size=3, alpha=0.7, stroke=1) + scale_color_manual(values=m.g.colors) + ylab(PC2.label) + xlab(PC1.label) + scale_shape_manual(values=c(1,2,7,8))
## Warning: Removed 1 rows containing missing values (geom_point).
I like to add quadrant lines to make interpretation a little easier.
ggplot(pca.points, aes(x=PC1, y=PC2, col=site.type, shape=community)) + geom_point(size=3, alpha=0.7, stroke=1) + scale_color_manual(values=m.g.colors) + ylab(PC2.label) + xlab(PC1.label) + scale_shape_manual(values=c(1,2,7,8)) + geom_hline(yintercept=0, linetype=2, color="gray", alpha=0.7)+ geom_vline(xintercept=0, linetype=2, color="gray", alpha=0.7)
## Warning: Removed 1 rows containing missing values (geom_point).
In this dataset, it’s not necessary to display variation on the third PC axis, but if you wanted to do it, you would just switch out the PC1 and PC3 variables like so.
ggplot(pca.points, aes(x=PC2, y=PC3, col=site.type, shape=community)) + geom_point(size=3, alpha=0.7, stroke=1) + scale_color_manual(values=m.g.colors) + ylab(PC3.label) + xlab(PC2.label) + scale_shape_manual(values=c(1,2,7,8)) + geom_hline(yintercept=0, linetype=2, color="gray", alpha=0.7)+ geom_vline(xintercept=0, linetype=2, color="gray", alpha=0.7)
## Warning: Removed 1 rows containing missing values (geom_point).
ggplot doesn’t have direct support for plotting in 3D, but plotly
and gg3d
do. Here’s how to install and use those.
gg3D must be downloaded directly from github. Devtools will display a message in your console asking you to update packages. I don’t usually update packages through this tool unless installing the package I want doesn’t work the first time.
devtools::install_github("AckerDWM/gg3d")
library(gg3D)
ggplot(pca.points, aes(x=PC1, y=PC2, z=PC3, col=site.type, shape=community)) + theme_void() + axes_3D() + stat_3D() + scale_color_manual(values=m.g.colors) + ylab(PC3.label) + xlab(PC2.label) + scale_shape_manual(values=c(1,2,7,8))
## Warning: Removed 1 rows containing missing values (geom_point).
This is somewhat nice because it’s easy to see that the clusters are well separated, but unfortunately there are no axis labels!
Here’s how to do it in plotly
. You can install plotly
the regular way using install.packages()
.
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
plot_ly(x=pca.points$PC1, y=pca.points$PC2, z=pca.points$PC3, type="scatter3d", mode="markers", color=pca.points$site.type, symbol=pca.points$community)
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: The following are not valid symbol codes:
## 'NA'
## Valid symbols include:
## '0', 'circle', '100', 'circle-open', '200', 'circle-dot', '300', 'circle-open-dot', '1', 'square', '101', 'square-open', '201', 'square-dot', '301', 'square-open-dot', '2', 'diamond', '102', 'diamond-open', '202', 'diamond-dot', '302', 'diamond-open-dot', '3', 'cross', '103', 'cross-open', '203', 'cross-dot', '303', 'cross-open-dot', '4', 'x', '104', 'x-open', '204', 'x-dot', '304', 'x-open-dot', '5', 'triangle-up', '105', 'triangle-up-open', '205', 'triangle-up-dot', '305', 'triangle-up-open-dot', '6', 'triangle-down', '106', 'triangle-down-open', '206', 'triangle-down-dot', '306', 'triangle-down-open-dot', '7', 'triangle-left', '107', 'triangle-left-open', '207', 'triangle-left-dot', '307', 'triangle-left-open-dot', '8', 'triangle-right', '108', 'triangle-right-open', '208', 'triangle-right-dot', '308', 'triangle-right-open-dot', '9', 'triangle-ne', '109', 'triangle-ne-open', '209', 'triangle-ne-dot', '309', 'triangle-ne-open-dot', '10', 'triangle-se', '110', 'triangle-se-open', '210', 'triangle-se-dot', '310', 'triangle-se-open-dot', '11', 'triangle-sw', '111', 'triangle-sw-open', '211', 'triangle-sw-dot', '311', 'triangle-sw-open-dot', '12', 'triangle-nw', '112', 'triangle-nw-open', '212', 'triangle-nw-dot', '312', 'triangle-nw-open-dot', '13', 'pentagon', '113', 'pentagon-open', '213', 'pentagon-dot', '313', 'pentagon-open-dot', '14', 'hexagon', '114', 'hexagon-open', '214', 'hexagon-dot', '314', 'hexagon-open-dot', '15', 'hexagon2', '115', 'hexagon2-open', '215', 'hexagon2-dot', '315', 'hexagon2-open-dot', '16', 'octagon', '116', 'octagon-open', '216', 'octagon-dot', '316', 'octagon-open-dot', '17', 'star', '117', 'star-open', '217', 'star-dot', '317', 'star-open-dot', '18', 'hexagram', '118', 'hexagram-open', '218', 'hexagram-dot', '318', 'hexagram-open-dot', '19', 'star-triangle-up', '119', 'star-triangle-up-open', '219', 'star-triangle-up-dot', '319', 'star-triangle-up-open-dot', '20', 'star-triangle-down', '120', 'star-triangle-down-open', '220', 'star-triangle-down-dot', '320', 'star-triangle-down-open-dot', '21', 'star-square', '121', 'star-square-open', '221', 'star-square-dot', '321', 'star-square-open-dot', '22', 'star-diamond', '122', 'star-diamond-open', '222', 'star-diamond-dot', '322', 'star-diamond-open-dot', '23', 'diamond-tall', '123', 'diamond-tall-open', '223', 'diamond-tall-dot', '323', 'diamond-tall-open-dot', '24', 'diamond-wide', '124', 'diamond-wide-open', '224', 'diamond-wide-dot', '324', 'diamond-wide-open-dot', '25', 'hourglass', '125', 'hourglass-open', '26', 'bowtie', '126', 'bowtie-open', '27', 'circle-cross', '127', 'circle-cross-open', '28', 'circle-x', '128', 'circle-x-open', '29', 'square-cross', '129', 'square-cross-open', '30', 'square-x', '130', 'square-x-open', '31', 'diamond-cross', '131', 'diamond-cross-open', '32', 'diamond-x', '132', 'diamond-x-open', '33', 'cross-thin', '133', 'cross-thin-open', '34', 'x-thin', '134', 'x-thin-open', '35', 'asterisk', '135', 'asterisk-open', '36', 'hash', '136', 'hash-open', '236', 'hash-dot', '336', 'hash-open-dot', '37', 'y-up', '137', 'y-up-open', '38', 'y-down', '138', 'y-down-open', '39', 'y-left', '139', 'y-left-open', '40', 'y-right', '140', 'y-right-open', '41', 'line-ew', '141', 'line-ew-open', '42', 'line-ns', '142', 'line-ns-open', '43', 'line-ne', '143', 'line-ne-open', '44', 'line-nw', '144', 'line-nw-open
Plotly is really a world unto itself and the data formatting is in a very different style from ggplot.
The interactiveness of the widget is great, but as you might imagine, not ideal for making figures for publications. Dive into formatting in plotly if you want to, but you will probably not need to.
Biplots display your loadings and rotated data at the same time. They can be a good way to visually explore the contribution of different OTUs to variation between your samples.
PCAs made from many many columns (many many OTUs) don’t usually make readable biplots without a lot of filtering. I’ll show you how to do that, but the best use of biplots is definitely for PCAs made at lower taxonomic resolution (Order level or above for bacteria) or with a subset of columns.
Here’s a biplot with everything:
biplot(x=d.pcx$x, y=d.pcx$rotation, var.axes=TRUE, col=c("black", "#56B4E9"))
How gross! We can improve this.
#change sample labels to site labels
sample.labels <- mdf[row.names(d.pcx$x),"site"]
#shrink text by changing cex. Text size is multiplied by this factor, so < 1 will shrink it.
biplot(x=d.pcx$x, y=d.pcx$rotation, var.axes=TRUE, col=c("black", "#56B4E9"), xlabs=sample.labels, cex=0.5)
Make a subset of the rotational matrix to remove OTUs that don’t contirbute strongly to either axis. This threshold is arbitrary. Choose based on the distribution of values.
summary(d.pcx$rotation[,"PC1"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.1068653 -0.0039681 -0.0018742 0.0000000 0.0008445 0.0673561
summary(d.pcx$rotation[,"PC2"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.108679 -0.002995 -0.001034 0.000000 0.001104 0.138112
pc1q <- quantile(d.pcx$rotation[,"PC1"], probs=c(0.01, 0.03, 0.05, 0.1, 0.9, 0.95, 0.97, 0.99))
pc2q <- quantile(d.pcx$rotation[,"PC2"], probs=c(0.01, 0.03, 0.05, 0.1, 0.9, 0.95, 0.97, 0.99))
pc1q
## 1% 3% 5% 10% 90% 95%
## -0.028171852 -0.015966457 -0.011441280 -0.007374515 0.010818191 0.022211952
## 97% 99%
## 0.032403087 0.047582507
pc2q
## 1% 3% 5% 10% 90% 95%
## -0.030723456 -0.019042811 -0.013598755 -0.007774894 0.007995538 0.018018425
## 97% 99%
## 0.025846880 0.046757499
Let’s do the top and bottom 1% for both axes.
rotation.sub <- data.frame(d.pcx$rotation)[,c("PC1", "PC2")]
dim(rotation.sub)
## [1] 7103 2
rotation.sub <- rotation.sub[which(rotation.sub$PC1 <= pc1q["1%"] | rotation.sub$PC1 >= pc1q["99%"] | rotation.sub$PC2 <= pc2q["1%"] | rotation.sub$PC2 >= pc2q["99%"]),]
dim(rotation.sub)
## [1] 285 2
285 taxa is really still a lot! It will slightly improve the chart.
biplot(x=d.pcx$x, y=rotation.sub, var.axes=TRUE, col=c("black", "#56B4E9"), xlabs=sample.labels, cex=0.5)
Not so helpful.
Let’s try one at a higher taxonomic level. Let’s look at phylum contributions.
Agglomerate by taxa before you use a CLR transform.
acs.phylum <- acsdat.clean %>% tax_glom(taxrank="Phylum")
phylum.otu.table <- data.frame(otu_table(acs.phylum))
phylum.clr.table <- transform.clr(phylum.otu.table)
## No. corrected values: 39
dim(phylum.clr.table)
## [1] 30 344
As you can see it is a much smaller table.
d.pcx <- prcomp(t(phylum.clr.table), scale=FALSE, center=TRUE)
d.mvar <- sum(d.pcx$sdev^2)
PC1.label <- paste("PC1: ", percent(sum(d.pcx$sdev[1]^2)/d.mvar, accuracy=0.1))
PC2.label <- paste("PC2: ", percent(sum(d.pcx$sdev[2]^2)/d.mvar, accuracy=0.1))
PC3.label <- paste("PC3: ", percent(sum(d.pcx$sdev[3]^2)/d.mvar, accuracy=0.1))
print(c(PC1.label, PC2.label, PC3.label))
## [1] "PC1: 25.2%" "PC2: 15.2%" "PC3: 12.3%"
The total variance accounted for in the first three axes is 52.7% percent. The higher value isn’t so surprising, since we went from >7000 columns to only 30!
pcx.importance <- summary(d.pcx)$importance
#extract the percent variance like so:
percent.pcx <- pcx.importance[2,]
barplot(percent.pcx[1:20])
This scree plot shows that there is measurable gain in variance out to PC4.
The same basic visualization again, just to compare to the OTU based plot.
pca.points <- data.frame(d.pcx$x)
pca.points$site <- mdf[as.vector(row.names(pca.points)),"site"]
pca.points$stream <- mdf[as.vector(row.names(pca.points)),"stream"]
pca.points$community <- mdf[as.vector(row.names(pca.points)),"community"]
pca.points$site.type <- mdf[as.vector(row.names(pca.points)),"type"]
pca.points$date <- mdf[as.vector(row.names(pca.points)),"date"]
#make a month variable
pca.points$month <- as.factor(sapply(pca.points$date, function(x) str_split(x, pattern="/")[[1]][1]))
ggplot(pca.points, aes(x=PC1, y=PC2, col=site.type, shape=community)) + geom_point(size=3, alpha=0.7, stroke=1) + scale_color_manual(values=m.g.colors) + ylab(PC2.label) + xlab(PC1.label) + scale_shape_manual(values=c(1,2,7,8))
## Warning: Removed 1 rows containing missing values (geom_point).
Interesting, yeah? Even though this PCA accounted for much more variance than the one based on OTU composition, hospital and residential types are inseparable in this chart. So we can conclude that they have very different OTU composition, but at the phylum level they are similar. In contrast, the industry and municipal Phandolin sites are different at the phylum level.
Here’s a biplot:
#change sample labels to site labels
sample.labels <- mdf[row.names(d.pcx$x),"site"]
#shrink text by changing cex. Text size is multiplied by this factor, so < 1 will shrink it.
biplot(x=d.pcx$x, y=d.pcx$rotation, var.axes=TRUE, col=c("black", "#56B4E9"), xlabs=sample.labels, cex=0.5)
It’s so far much more readable. We can make it a little easier still be replacing the OTU labels with the names of the phyla.
tax.labels <- tax.table[row.names(d.pcx$rotation),"Phylum"]
# Zoom in on the chart a little, change colors.
biplot(x=d.pcx$x, y=d.pcx$rotation, var.axes=TRUE, col=c("#333333", "#009E73"), xlabs=sample.labels, cex=c(0.5,0.6), ylabs=tax.labels, ylim=c(-10,7), xlim=c(-10, 12))
We can remove some loadings we don’t care about, like extremely rare phyla. My selection is somewhat arbitrary. You could base this selection off of major phyla that you identified in a different section of your analysis, or perhaps if you studying pathogens, you could limit it to pathogens.
phyla.tax <- tax.table[row.names(d.pcx$rotation),]
major.phyla <- c("Proteobacteria", "Firmicutes", "Bacteroidetes", "Fusobacteria", "Actinobacteria", "Verrucomicrobia", "Deinococcus-Thermus", "Spirochaetes", "Synergistetes", "Chloroflexi", "Acidobacteria", "Chlamydiae", "Planctomycetes.")
major.phyla.otus <- row.names(phyla.tax[which(phyla.tax$Phylum %in% major.phyla),])
major.phyla.otus
## [1] "Otu00001" "Otu00004" "Otu00005" "Otu00007" "Otu00021" "Otu00030"
## [7] "Otu00102" "Otu00214" "Otu00339" "Otu00410" "Otu00966" "Otu01206"
#expand graph a little bit.
biplot(x=d.pcx$x, y=d.pcx$rotation[major.phyla.otus,], var.axes=TRUE, col=c("#333333", "#009E73"), xlabs=sample.labels, cex=c(0.5,0.6), ylabs=tax.table[major.phyla.otus,"Phylum"], ylim=c(-10,7), xlim=c(-10, 12), expand=1.3)
This will be easier to see if you save it as a higher resolution graphic.
Of course you probably care about the OTU differences in the first PCA since there were some very clear clusters of residential vs. hospital type points. While it’s difficult to explore those in a visual way, we can look directly at the loadings for the PCA.
Here’s that object again.
d.pcx <- prcomp(t(otu.clr.table), scale=FALSE, center=TRUE)
d.mvar <- sum(d.pcx$sdev^2)
PC1.label <- paste("PC1: ", percent(sum(d.pcx$sdev[1]^2)/d.mvar, accuracy=0.1))
PC2.label <- paste("PC2: ", percent(sum(d.pcx$sdev[2]^2)/d.mvar, accuracy=0.1))
PC3.label <- paste("PC3: ", percent(sum(d.pcx$sdev[3]^2)/d.mvar, accuracy=0.1))
pca.points <- data.frame(d.pcx$x)
pca.points$site <- mdf[as.vector(row.names(pca.points)),"site"]
pca.points$stream <- mdf[as.vector(row.names(pca.points)),"stream"]
pca.points$community <- mdf[as.vector(row.names(pca.points)),"community"]
pca.points$site.type <- mdf[as.vector(row.names(pca.points)),"type"]
pca.points$date <- mdf[as.vector(row.names(pca.points)),"date"]
pca.points$month <- as.factor(sapply(pca.points$date, function(x) str_split(x, pattern="/")[[1]][1]))
ggplot(pca.points, aes(x=PC1, y=PC2, col=site.type, shape=community)) + geom_point(size=3, alpha=0.7, stroke=1) + scale_color_manual(values=m.g.colors) + ylab(PC2.label) + xlab(PC1.label) + scale_shape_manual(values=c(1,2,7,8))
## Warning: Removed 1 rows containing missing values (geom_point).
Looking at the graph, if we want to separate hospital from residential, we should look at which OTUs cause a sign change across the Y-axis (PC2).
d.loadings <- data.frame(d.pcx$rotation)[,c("PC1", "PC2")]
pc2.otus <- d.loadings$PC2
names(pc2.otus) <- row.names(d.loadings)
top10.negative <- sort(pc2.otus)[1:10]
top10.positive <- sort(pc2.otus, decreasing = TRUE)[1:10]
hospital.associated.taxa <- tax.table[names(top10.negative),]
residential.associated.taxa <- tax.table[names(top10.positive),]
hospital.associated.taxa$PC2.Weight <- top10.negative
residential.associated.taxa$PC2.Weight <- top10.positive
hospital.associated.taxa
residential.associated.taxa
We could now visualize these as a biplot if we wanted.
PC2.selection <- c(names(top10.positive), names(top10.negative))
tax.labels <- tax.table[PC2.selection,"Genus"]
biplot(x=d.pcx$x, y=d.pcx$rotation[PC2.selection,], var.axes=TRUE, col=c("#333333", "#009E73"), xlabs=sample.labels, cex=c(0.5,0.6), ylabs=tax.labels, expand = 1.5, ylim = c(-50, 70), xlim = c(-40, 50))
Again, this will be easier to look at if you save it as a file.
You can also display the relative contribution of each OTU in something like a barchart, but this is pretty misleading. The reader will probably be tempted to interpret the proportions as having anything to do with abundance, which they don’t. The taxa identified using ordination are usually not the most abundant taxa in a sample type, but are differentially abundant with other sample types.
For sparse datasets like MiSeq data, PCAs often accounts for a really disappointing amount of variance. The standard PCA also doesn’t fare well for binary data, where abundance is not interpretable (say for a prey composition dataset.). To address the first case, I suggest the package mixOmics
. Installing mixOmics
requires Bioconductor
, a bioinformatics software manager for R. If you don’t have Bioconductor, install it by running this command:
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install(version = "3.12")
To install mixOmics
, run this command:
BiocManager::install("mixOmics")
library(mixOmics)
##
## Loaded mixOmics 6.12.2
## Thank you for using mixOmics!
## Tutorials: http://mixomics.org
## Bookdown vignette: https://mixomicsteam.github.io/Bookdown
## Questions, issues: Follow the prompts at http://mixomics.org/contact-us
## Cite us: citation('mixOmics')
mixOmics has a suite of tools to help you prepare your data, incuding a CLR transform method, but you can also use the CLR transform object you already have.
In this implementation, you must choose the number of variables to keep for each of your principle components. This choice is really up to you since this is mostly an exploratory analysis. Let’s start with 10 OTUs on three axes.
mixOmics uses the same matrix orientation as prcomp; samples=rows, OTUs=columns
#ncomp = number of PCA components
#keepX = number of variables to keep in each axis
d.spca <- spca(t(otu.clr.table), ncomp=3, center=TRUE, scale=FALSE, keepX=c(10,10,10))
d.spca
##
## Call:
## spca(X = t(otu.clr.table), ncomp = 3, center = TRUE, scale = FALSE, keepX = c(10, 10, 10))
##
## sparse pCA with 3 principal components.
## You entered data X of dimensions: 344 7103
## Selection of 10 10 10 variables on each of the principal components on the X data set.
## Main numerical outputs:
## --------------------
## loading vectors: see object$rotation
## principal components: see object$x
## cumulative explained variance: see object$varX
## variable names: see object$names
##
## Other functions:
## --------------------
## selectVar, tune
mixOmics has some native graphing capabilities, which are ok.
group.labels <- mdf[row.names(d.spca$x),]$site
plotIndiv(d.spca, group=group.labels, ind.names = TRUE, legend=TRUE)
We can get a little fancier:
group.labels <- paste(mdf[row.names(d.spca$x),]$community,mdf[row.names(d.spca$x),]$type)
plotIndiv(d.spca, comp=1:2, group=group.labels, ind.names=TRUE, ellipse=TRUE, legend=TRUE)
You can also extract the relevant data and plot it with ggplot.
PC1.label <- paste("PC1:",percent(d.spca$explained_variance[1], accuracy=0.1))
PC2.label <- paste("PC2:",percent(d.spca$explained_variance[2], accuracy=0.1))
PC3.label <- paste("PC3:",percent(d.spca$explained_variance[3], accuracy=0.1))
pca.points <- data.frame(d.spca$x)
pca.points$site <- mdf[as.vector(row.names(pca.points)),"site"]
pca.points$stream <- mdf[as.vector(row.names(pca.points)),"stream"]
pca.points$community <- mdf[as.vector(row.names(pca.points)),"community"]
pca.points$site.type <- mdf[as.vector(row.names(pca.points)),"type"]
pca.points$date <- mdf[as.vector(row.names(pca.points)),"date"]
#make a month variable
pca.points$month <- as.factor(sapply(pca.points$date, function(x) str_split(x, pattern="/")[[1]][1]))
ggplot(pca.points, aes(x=PC1, y=PC2, col=site.type, shape=community)) + geom_point(size=3, alpha=0.7, stroke=1) + scale_color_manual(values=m.g.colors) + ylab(PC2.label) + xlab(PC1.label) + scale_shape_manual(values=c(1,2,7,8))
## Warning: Removed 1 rows containing missing values (geom_point).
I think it’s now more obvious that while the amount of variance accounted for is similar in the first two axes, the separation is more clear between samples.
ggplot(pca.points, aes(x=PC2, y=PC3, col=site.type, shape=community)) + geom_point(size=3, alpha=0.7, stroke=1) + scale_color_manual(values=m.g.colors) + ylab(PC3.label) + xlab(PC2.label) + scale_shape_manual(values=c(1,2,7,8))
## Warning: Removed 1 rows containing missing values (geom_point).
The third axis in this version also has a new quality where Phandolin and Neverwinter residential samples are cleanly separated. Neat.
We can also add stats ellipses to our plots. Read up on those before you use them and make sure you’re using a statistic that makes sense.
ggplot(pca.points, aes(x=PC1, y=PC2, col=site.type, shape=community)) + geom_point(size=3, alpha=0.7, stroke=1) + scale_color_manual(values=m.g.colors) + ylab(PC2.label) + xlab(PC1.label) + scale_shape_manual(values=c(1,2,7,8)) + stat_ellipse(type="norm")
## Too few points to calculate an ellipse
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
Because we’ve already restricted our input matrix to 10 OTUs on each axis, analyzing OTU contribution is a little easier.
Let’s look at what OTUs our SPCA kept.
spca.rotation <- data.frame(d.spca$rotation)
spca.rotation <- spca.rotation[which(spca.rotation$PC1 != 0 | spca.rotation$PC2 != 0 | spca.rotation$PC3 != 0),]
spca.rotation
tax.table[row.names(spca.rotation),]
Unfortunately, by nature, sparse PCAs make horrible biplots. But they are a good way to explore which OTUs contribute most to variation between groups.
#Give readable names to OTUs
sample.labels <- mdf[row.names(d.spca$x),"site"]
tax.labels = tax.table[row.names(spca.rotation),"Genus"]
biplot(x=d.spca$x, y=as.matrix(spca.rotation), var.axes=TRUE, col=c("black", "#56B4E9"), xlabs=sample.labels, ylabs=tax.labels, cex=0.5)
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length =
## arrow.len): zero-length arrow is of indeterminate angle and so skipped
mixOmics has some neat alternative ways of visualizing loadings.
plotLoadings(d.spca, comp=1, title='Loadings on Component 1', contrib='max', method='mean')
plotLoadings(d.spca, comp=2, title='Loadings on Component 2', contrib='max', method='mean')
plotLoadings(d.spca, comp=3, title='Loadings on Component 3', contrib='max', method='mean')
Redundancy analysis is a great way to connect two large datasets. The idea is to do some kind of dimension reduction twice, using the eigenvectors of the first PCA (or PCoA or NMDS or LDA) to create the second PCA. The function rda
does all of this for you.
If you’ve made technical replicates of your sequences, then more than likely your datasets are not the same size. You will need to either combine your technical replicates together, select one, or make averages. Read counts are dependent on total read depth, as we’ve discussed many many times, so if you want to combine or average, you should do that with the CLR transformed tables. The easiest route is to select one, otherwise I recommend taking the average proportional abundance value.
Here’s how to do that:
Load some chemistry data.
chemdata <- read.csv("adelle/mfqpcr_normalized.csv", header=TRUE)
chemdata <- subset(chemdata, select=c("index", "date", "bottle", "community", "site", "source_type", "air_temp", "temperature", "DO", "conductivity", "pH"))
chemdata <- chemdata[!duplicated(chemdata),]
chemdata
This dataset has only 25 records, not the full dataset. The values are duplicated for samples sequenced in replicates. We really just want unique data points, so we will drop some of these. Let’s average together the relevant sequencing samples. First make a subset of the phyloseq object with the right samples.
phylo.sub <- prune_samples(sample_names(acsdat.clean) %in% chemdata$index, acsdat.clean)
phylo.sub
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 18583 taxa and 25 samples ]
## sample_data() Sample Data: [ 25 samples by 9 sample variables ]
## tax_table() Taxonomy Table: [ 18583 taxa by 6 taxonomic ranks ]
otu.table <- data.frame(otu_table(phylo.sub))
otu.clr.table <- data.frame(transform.clr(otu.table))
mdf.sub <- data.frame(sample_data(phylo.sub))
Double check that you’re not missing any metadata information that would inadvertantly lump non replicates together.
mdf.sub %>% group_by(date, community, type, stream, site, bottle) %>% tally()
Next we’ll use dplyr
to group our samples together and take the mean value for each otu.
mdf.sub$index <- row.names(mdf.sub)
OTU.averages <- mdf.sub %>% group_by(date, community, type, stream, site, bottle) %>% group_map(~ rowMeans(data.frame(otu.clr.table[,.x$index])))
OTU.mat <- unlist(OTU.averages)
condensed.mdf <- mdf.sub %>% group_by(date, community, type, stream, site, bottle) %>% group_modify(~ {paste(collapse = ".", unique(.x$index)) %>% tibble::enframe(name="meaningless", value="sampleIndices") })
condensed.mdf
condensed.otu.table <- data.frame(OTU.averages)
colnames(condensed.otu.table) <- condensed.mdf$sampleIndices
condensed.mdf <- data.frame(condensed.mdf)
row.names(condensed.mdf) <- condensed.mdf$sampleIndices
condensed.mdf
condensed.otu.table
We now have the mean CLR for each set of sample replicates. The original sample data is in the object condensed.mdf
.
Now we need to group the chemdata by the same levels exactly. (date, community, type, stream, site, bottle)
chemdata$mdf.date <- mdf[chemdata$index, "date"]
chemdata$mdf.community <- mdf[chemdata$index, "community"]
chemdata$mdf.type <- mdf[chemdata$index, "type"]
chemdata$mdf.stream <- mdf[chemdata$index, "stream"]
chemdata$mdf.site <- mdf[chemdata$index, "site"]
chemdata$mdf.bottle <- mdf[chemdata$index, "bottle"]
condensed.chem <- chemdata %>% group_by(mdf.date, mdf.community, mdf.type, mdf.stream, mdf.site, mdf.bottle, air_temp, temperature, DO, conductivity, pH) %>%group_modify(~ {paste(collapse = ".", unique(.x$index)) %>% tibble::enframe(name="meaningless", value="sampleIndices") })
condensed.chem <- data.frame(condensed.chem)
row.names(condensed.chem) <- condensed.chem$sampleIndices
condensed.chem
Now make a version of the chemistry dataframe with only the variables you want to include in the RDA.
chem.rda.dat <- subset(condensed.chem, select=c("air_temp", "temperature", "DO", "conductivity", "pH"))
chem.rda.dat
In this case we’re just going to give RDA our transformed OTU table and it will run both PCAs. Here’s what a PCA based only on the averaged OTUs would look like:
d.pcx <- prcomp(t(condensed.otu.table), scale=FALSE, center=TRUE)
d.mvar <- sum(d.pcx$sdev^2)
PC1.label <- paste("PC1: ", percent(sum(d.pcx$sdev[1]^2)/d.mvar, accuracy=0.1))
PC2.label <- paste("PC2: ", percent(sum(d.pcx$sdev[2]^2)/d.mvar, accuracy=0.1))
PC3.label <- paste("PC3: ", percent(sum(d.pcx$sdev[3]^2)/d.mvar, accuracy=0.1))
pca.points <- data.frame(d.pcx$x)
pca.points$site <- condensed.mdf[as.vector(row.names(pca.points)),"site"]
pca.points$stream <- condensed.mdf[as.vector(row.names(pca.points)),"stream"]
pca.points$community <- condensed.mdf[as.vector(row.names(pca.points)),"community"]
pca.points$site.type <- condensed.mdf[as.vector(row.names(pca.points)),"type"]
pca.points$date <- condensed.mdf[as.vector(row.names(pca.points)),"date"]
pca.points$month <- as.factor(sapply(pca.points$date, function(x) str_split(x, pattern="/")[[1]][1]))
ggplot(pca.points, aes(x=PC1, y=PC2, col=site.type, shape=community)) + geom_point(size=3, alpha=0.7, stroke=1) + scale_color_manual(values=m.g.colors) + ylab(PC2.label) + xlab(PC1.label) + scale_shape_manual(values=c(1,2,7,8))
acs.rda <- rda(t(condensed.otu.table) ~., data=chem.rda.dat, scale=T)
acs.rda
## Call: rda(formula = t(condensed.otu.table) ~ air_temp + temperature +
## DO + conductivity + pH, data = chem.rda.dat, scale = T)
##
## Inertia Proportion Rank
## Total 1696.0000 1.0000
## Constrained 727.2641 0.4288 5
## Unconstrained 968.7359 0.5712 8
## Inertia is correlations
##
## Eigenvalues for constrained axes:
## RDA1 RDA2 RDA3 RDA4 RDA5
## 308.79 171.35 104.57 74.13 68.42
##
## Eigenvalues for unconstrained axes:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## 273.69 178.26 121.66 106.17 85.79 83.76 62.47 56.93
You can also run rda using an explicit formula.
acs.rda <- rda(t(condensed.otu.table) ~ air_temp + temperature+ DO+ conductivity + pH, data=chem.rda.dat, scale=T)
acs.rda
## Call: rda(formula = t(condensed.otu.table) ~ air_temp + temperature +
## DO + conductivity + pH, data = chem.rda.dat, scale = T)
##
## Inertia Proportion Rank
## Total 1696.0000 1.0000
## Constrained 727.2641 0.4288 5
## Unconstrained 968.7359 0.5712 8
## Inertia is correlations
##
## Eigenvalues for constrained axes:
## RDA1 RDA2 RDA3 RDA4 RDA5
## 308.79 171.35 104.57 74.13 68.42
##
## Eigenvalues for unconstrained axes:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## 273.69 178.26 121.66 106.17 85.79 83.76 62.47 56.93
The constrained axes account for 43% of variance, which is pretty excellent. The unconstrained axis do OK in the first 3 axes, accountign for an additional 33%. The first constrained axis does nost of the work, accounting for 16%.
plot(acs.rda, type='n', scaling=0)
orditorp(acs.rda, display='sp', cex=0.75, scaling=0, col='blue')
text(acs.rda, display='cn', col='red', scaling=0)
plot(acs.rda, type='n', scaling=3)
orditorp(acs.rda, display='sites', cex=0.75, scaling=3, col='blue')
text(acs.rda, display='cn', col='red', scaling=3)
The first plot is arguably the most useful since it displays OTUs alongside the chemistry data, though the scaling can make it hard to tell what those OTUs are. If you re-run this analysis with a higher taxonomic grouping level, each taxon alone will account for more of the rotational matrix and will be more visible. You can also scale the data to look for correlations.
You can approach deeper analysis of what OTUs contribute most to principle components using the same methods we used for PCAs.
These are the constrained axis sample transformations.
data.frame(acs.rda$CCA$u)
This is the constrained axis rotational matrix for OTUs.
data.frame(acs.rda$CCA$v)
And here are the relevant contribution of each chemistry variable to each constrained axis.
data.frame(acs.rda$CCA$biplot)
If you’re interested in the PCA that’s created after redundancy analysis, meaning the variation in your dataset that can not be explained by variation in chemistry data, you can extract that like this:
sample.projections <- data.frame(acs.rda$CA$u)
otu.rotations <- data.frame(acs.rda$CA$v)
eigenvectors <- acs.rda$CA$eig
PC1.label <- percent(eigenvectors["PC1"]/sum(eigenvectors), accuracy=0.1)
PC2.label <- percent(eigenvectors["PC2"]/sum(eigenvectors), accuracy=0.1)
pca.points <- sample.projections
pca.points$site <- condensed.mdf[as.vector(row.names(pca.points)),"site"]
pca.points$stream <- condensed.mdf[as.vector(row.names(pca.points)),"stream"]
pca.points$community <- condensed.mdf[as.vector(row.names(pca.points)),"community"]
pca.points$site.type <- condensed.mdf[as.vector(row.names(pca.points)),"type"]
pca.points$date <- condensed.mdf[as.vector(row.names(pca.points)),"date"]
#make a month variable
pca.points$month <- as.factor(sapply(pca.points$date, function(x) str_split(x, pattern="/")[[1]][1]))
ggplot(pca.points, aes(x=PC1, y=PC2, col=site.type, shape=community)) + geom_point(size=3, alpha=0.7, stroke=1) + scale_color_manual(values=m.g.colors) + ylab(PC2.label) + xlab(PC1.label) + scale_shape_manual(values=c(1,2,7,8))
Let’s see if we can improve our inferences by looking at the order level microbe IDs.
acs.ord.subset <- tax_glom(phylo.sub, taxrank="Order")
otu.table <- data.frame(otu_table(acs.ord.subset))
otu.clr.table <- data.frame(transform.clr(otu.table))
OTU.averages <- mdf.sub %>% group_by(date, community, type, stream, site, bottle) %>% group_map(~ rowMeans(data.frame(otu.clr.table[,.x$index])))
condensed.otu.table <- data.frame(OTU.averages)
colnames(condensed.otu.table) <- condensed.mdf$sampleIndices
condensed.otu.table
acs.rda <- rda(t(condensed.otu.table) ~., data=chem.rda.dat, scale=T)
acs.rda
## Call: rda(formula = t(condensed.otu.table) ~ air_temp + temperature +
## DO + conductivity + pH, data = chem.rda.dat, scale = T)
##
## Inertia Proportion Rank
## Total 86.0000 1.0000
## Constrained 40.6318 0.4725 5
## Unconstrained 45.3682 0.5275 8
## Inertia is correlations
##
## Eigenvalues for constrained axes:
## RDA1 RDA2 RDA3 RDA4 RDA5
## 22.374 7.078 5.212 3.866 2.102
##
## Eigenvalues for unconstrained axes:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## 16.010 7.143 5.868 5.679 3.811 3.015 2.153 1.688
We account for a similar proportion of variance, though as you can see there is less variance overall. That’s because we went from 1000s of taxa (columns) to about 80 taxa.
plot(acs.rda, type='n', scaling=0)
orditorp(acs.rda, display='sp', cex=0.75, scaling=0, col='blue')
text(acs.rda, display='cn', col='red', scaling=0)
plot(acs.rda, type='n', scaling=3)
orditorp(acs.rda, display='sites', cex=0.75, scaling=3, col='blue')
text(acs.rda, display='cn', col='red', scaling=3)
It should now be a little easier to see the OTUs on the first chart. Let’s make the chart more attractive and change the OTU labels to the names of the taxonomic orders.
order.labels <- tax.table[row.names(condensed.otu.table),"Order"]
names(order.labels) <- row.names(condensed.otu.table)
plot(acs.rda, type='n', scaling=1, family='serif', xlab="RDA1 (26%)", ylab="RDA2 (8%)", font.lab=2, cex.lab=1, cex.axis=0.75)
orditorp(acs.rda, display='sp', cex=0.65, scaling=1, col='#666666', font=4, family='serif', pch=18, labels=order.labels, air = 1.5)
text(acs.rda, display='cn', col='black', scaling=1, family='serif', cex=1, pch=16)
Setting scaling=1 will make it easier to see the OTUs, but harder to see samples.
You can display the samples on the same chart if you want, but it will look pretty cluttered.
site.labels <- factor(paste(condensed.mdf[colnames(condensed.otu.table),"type"], condensed.mdf[colnames(condensed.otu.table),"community"], sep="."))
levels(site.labels)
## [1] "hospital.Neverwinter" "hospital.Phandolin"
## [3] "industry.Phandolin" "residential.Neverwinter"
## [5] "residential.Phandolin"
levels(site.labels) <- c("HN", "HP", "IP", "RN", "RP")
site.labels <- as.vector(site.labels)
names(site.labels) <- colnames(condensed.otu.table)
plot(acs.rda, type='n', scaling=1, family='serif', xlab="RDA1 (26%)", ylab="RDA2 (8%)", font.lab=2, cex.lab=1, cex.axis=0.75)
orditorp(acs.rda, display='sp', cex=0.65, scaling=1, col='#666666', font=4, family='serif', pch=18, labels=order.labels, air = 1.5)
orditorp(acs.rda, display='sites', cex=1, scaling=1, col='#56B4E9', labels=site.labels)
text(acs.rda, display='cn', col='black', scaling=1, family='serif', cex=1, pch=16)
Principle Coordinates Analysis (PCoA) is a really common dimension reduction tool for data is that is not close to a normal distribution. PCAs rely on that assumption, but PCoA does not. This technique can be used with count values, rather than the CLR-transformed values. We will need to make some version of an even depth table, and we will also need to create a distance matrix.
phyloseq
and vegan
have many distance functions. I’ve described the options we will use below. For mathematical definitions and a full list of available distance metrics, see the documentation of vegdist and phyloseq::distance
.
If you’re interested in using the Jaccard distance, you can calculate the Bray-Curtis dissimilarity, and then use this function:
jaccardIndex <- function(distval){
return((2*distvalt)/(1+distval))
}
Bray-Curtis and Manhattan both take an argument called binary
. If binary= TRUE
, abundance values of OTUs are not used in calculating the distance metric and data is treated as presence/absence only. The default setting is binary=FALSE
, where the species are weighted by their counts. Both are worth exploring, since the presence/absence (unweighted) reveals more about rarely occurring OTUs, and the weighted version tells you more about very common OTUs.
You can also use weighted or unweighted unifrac to incorporate phylogenetic distance into your study, but you should be cautious about doing that with a simple neighbor-joining tree like mothur
creates. That would be a 16S rrRNA tree calculated only from a small subset (250-300bp) of the full length sequence needed to resolve the whole tree. While V3-V4 and V4 are great for identifying microbes to the genera level, higher level taxonomic relationships will not be preserved in a tree inferred from only this region.
If you are dead set on using a tree, I at the very least recommend making a PhyML tree with representative sequences only for your OTUs. You can then run Unifrac in R.
For more detail on best practices for even sampling, see the tutorial on the front page. Here we are going to take a shortcut and simple scale to lowest depth.
minimum.depth <- min(sample_sums(acsdat.clean))
minimum.depth
## [1] 11932
acsdat.evendepth <- acsdat.clean %>% transform_sample_counts(function(x) round((x*minimum.depth)/sum(x))) %>% prune_taxa(taxa_sums(.) > 0,.)
acsdat.evendepth
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 13865 taxa and 344 samples ]
## sample_data() Sample Data: [ 344 samples by 9 sample variables ]
## tax_table() Taxonomy Table: [ 13865 taxa by 6 taxonomic ranks ]
Let’s use bray-curtis, both weighted and unweighted versions.
bc.unweighted <- phyloseq::distance(acsdat.evendepth, method="bray", type="samples", binary=TRUE)
bc.weighted <- phyloseq::distance(acsdat.evendepth, method="bray", type="samples", binary=FALSE)
#verify that these distances are at least slightly different.
bc.unweighted[1:4]
## [1] 0.5970696 0.8470067 0.8579545 0.8478478
bc.weighted[1:4]
## [1] 0.5102058 0.8672559 0.8511313 0.8675023
The phyloseq
wrapper is convenient, but if your data is not in phyloseq, you can do the same thing using vegan
.
even.otu.table <- data.frame(otu_table(acsdat.evendepth))
#important: vegan uses samples=rows, otus=columns.
bc.unweighted.2 <- vegan::vegdist(t(even.otu.table), method="bray", binary=TRUE)
bc.unweighted.2[1:4]
## [1] 0.5970696 0.8470067 0.8579545 0.8478478
The ‘lingoes’ correction shifts the eigenvalues to all be non-negative. That makes it possible to calculate the relative eigenvalues.
acs.pcoa.unweighted <- pcoa(bc.unweighted, correction='lingoes', sample_names(acsdat.evendepth))
relative.eigenvalues <- acs.pcoa.unweighted$values$Rel_corr_eig
PC1.label <- paste("Axis 1:", percent(relative.eigenvalues[1], accuracy=0.1))
PC2.label <- paste("Axis 2:", percent(relative.eigenvalues[2], accuracy=0.1))
# save the transformed vectors
d.pcoa <- data.frame(acs.pcoa.unweighted$vectors.cor)
d.pcoa$site.type <- mdf[row.names(d.pcoa), "type"]
d.pcoa$community <- mdf[row.names(d.pcoa), "community"]
ggplot(d.pcoa, aes(x=Axis.1, y=Axis.2, color=site.type, shape=community)) + geom_point(size=3, alpha=0.7, stroke=1) + scale_color_manual(values=m.g.colors) + ylab(PC2.label) + xlab(PC1.label) + scale_shape_manual(values=c(1,2,7,8)) + geom_vline(xintercept=0, linetype=2, color="gray") + geom_hline(yintercept=0, linetype=2, color="gray")
## Warning: Removed 1 rows containing missing values (geom_point).
We can add ellipses
ggplot(d.pcoa, aes(x=Axis.1, y=Axis.2, color=site.type, shape=community)) + geom_point(size=3, alpha=0.7, stroke=1) + scale_color_manual(values=m.g.colors) + ylab(PC2.label) + xlab(PC1.label) + scale_shape_manual(values=c(1,2,7,8)) + geom_vline(xintercept=0, linetype=2, color="gray") + geom_hline(yintercept=0, linetype=2, color="gray") + stat_ellipse()
## Too few points to calculate an ellipse
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
Which notably perform less well in the PCoA than the PCA.
Let’s see how things change when we use the weighted version.
acs.pcoa.weighted <- pcoa(bc.weighted, correction='lingoes', sample_names(acsdat.evendepth))
relative.eigenvalues <- acs.pcoa.weighted$values$Rel_corr_eig
PC1.label <- paste("Axis 1:", percent(relative.eigenvalues[1], accuracy=0.1))
PC2.label <- paste("Axis 2:", percent(relative.eigenvalues[2], accuracy=0.1))
# save the transformed vectors
d.pcoa <- data.frame(acs.pcoa.weighted$vectors.cor)
d.pcoa$site.type <- mdf[row.names(d.pcoa), "type"]
d.pcoa$community <- mdf[row.names(d.pcoa), "community"]
ggplot(d.pcoa, aes(x=Axis.1, y=Axis.2, color=site.type, shape=community)) + geom_point(size=3, alpha=0.7, stroke=1) + scale_color_manual(values=m.g.colors) + ylab(PC2.label) + xlab(PC1.label) + scale_shape_manual(values=c(1,2,7,8)) + geom_vline(xintercept=0, linetype=2, color="gray") + geom_hline(yintercept=0, linetype=2, color="gray") + stat_ellipse()
## Too few points to calculate an ellipse
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
The orientation of the chart changed, which doesn’t really matter. The industry samples however are no longer really separated from the municipal influent and phandolin samples though.
Unlike a PCA, the PCoA ordination does not produce a rotational matrix. We collapsed our OTU abundances when we made the distance matrix. It is possible to ordinate them again onto the graph, but this is based on the projections of samples, not the other way around.
Let’s first take an example with taxonomy.
#simplify the dataframe a bit
d.pcoa <- subset(d.pcoa, select=c("Axis.1", "Axis.2", "site.type", "community"))
Say we want to project the major phyla abundances onto this chart. Let’s calculate those.
#agglomerate by phylum
acsdat.phy <- acsdat.clean %>% tax_glom(taxrank="Phylum")
#transform the otu table to proportional abundance
phy.otu.clr.table <- transform.clr(otu_table(acsdat.phy))
## No. corrected values: 39
sample.phylum.abundance <- data.frame(t(phy.otu.clr.table))
#change the otu names to phylum names
colnames(sample.phylum.abundance) <- tax.table[colnames(sample.phylum.abundance),"Phylum"]
#all phyla
sample.phylum.abundance
30 arrows will be a lot to visualize. Let’s just take the 10 most abundant.
abundant.phyla <- sort(colMeans(sample.phylum.abundance), decreasing=TRUE)[1:10]
abundant.phyla
## Proteobacteria Firmicutes Bacteroidetes
## 8.0521911 7.1540243 6.6817712
## Actinobacteria Bacteria_unclassified Fusobacteria
## 5.0764497 4.3437989 4.2295055
## Verrucomicrobia TM7 Synergistetes
## 3.7495008 2.1860013 1.2210424
## Acidobacteria
## 0.5715673
#extract those columns from dataframe
abundant.sample.phyla <- sample.phylum.abundance[,names(abundant.phyla)]
We can now make a biplot.
x.labs <- mdf[row.names(d.pcoa), "site"]
biplot(acs.pcoa.weighted, Y=abundant.sample.phyla, var.axes=TRUE, col=c("#333333", "#009E73"), cex=c(0.5, 0.6), main="", rn=x.labs, correction="lingoes")
This may look terrible in your R window, but if you zoom in on the plot, it does look better.
You could use a similar strategy with any relevant sample data, like environmental chemistry data or qPCR data. Sometimes this is easier than performing redundancy analysis, but it will yield less information. It is really more a visual exploratory technique.
(See the previous section on redundancy analysis to collapse sample replicates to match up with chemistry data. The following section uses an OTU table of order level abundance)
Note that our condensed OTU table is in the form of mean proportional abundance. This is not really flexible, since we had to combine samples of different depths.
condensed.otu.table[1:10,1:4]
That’s OK! We should just pick a distance metric that makes sense. We could reuse our PCA from earlier and use the Aitchison distance of primary components:
#rerun that PCA
d.pcx <- prcomp(t(condensed.otu.table), scale=FALSE, center=TRUE)
#Extract the components that matter... let's use 1&2.
condensed.pca.components <- data.frame(d.pcx$x)[,c("PC1", "PC2")]
condensed.pca.components
#calculate distance using vegan::vegdist
aitchison.distance <- vegdist(condensed.pca.components, method="euclidian")
aitchison.distance[1:4]
## [1] 4.603993 10.320002 23.482425 20.526625
You could also use a different distance metric from the vegdist
list directly on your proportional abundance table. Let’s try the manhattan distance.
manhattan.distance <- vegdist(t(condensed.otu.table), method="manhattan", binary=FALSE)
manhattan.distance[1:4]
## [1] 72.21653 93.44243 158.93292 149.25027
Now run a PCoA using that distance matrix. Let’s try the Aitchison version first.
pcoa.aitchison <- pcoa(aitchison.distance, correction='lingoes', row.names(condensed.mdf))
relative.eigenvalues <- pcoa.aitchison$values$Rel_corr_eig
PC1.label <- paste("Axis 1:", percent(relative.eigenvalues[1], accuracy=0.1))
PC2.label <- paste("Axis 2:", percent(relative.eigenvalues[2], accuracy=0.1))
# save the transformed vectors
# no eigenvalues were negative, so pull from vectors, not corrected vectors
d.pcoa <- data.frame(pcoa.aitchison$vectors)
Here’s what that looks like in nice colors:
d.pcoa$site.type <- condensed.mdf[row.names(d.pcoa), "type"]
d.pcoa$community <- condensed.mdf[row.names(d.pcoa), "community"]
ggplot(d.pcoa, aes(x=Axis.1, y=Axis.2, color=site.type, shape=community)) + geom_point(size=3, alpha=0.7, stroke=1) + scale_color_manual(values=m.g.colors) + ylab(PC2.label) + xlab(PC1.label) + scale_shape_manual(values=c(1,2,7,8)) + geom_vline(xintercept=0, linetype=2, color="gray") + geom_hline(yintercept=0, linetype=2, color="gray") + stat_ellipse()
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Warning: Removed 3 row(s) containing missing values (geom_path).
Let’s pull just some of the chemistry data.n #edit: the real chemistry data was a much smaller set! Using the whole set.
chem.subset <- chem.rda.dat
We can now make a biplot
x.labs <- condensed.mdf[row.names(d.pcoa), "site"]
biplot(pcoa.aitchison, Y=chem.subset, var.axes=TRUE, col=c("#333333", "#009E73"), cex=c(0.5, 0.6), main="", rn=x.labs, xlim=c(-10,10), ylim=c(-10,10))
NMDS plots are alternatives to PCoAs. They have very similar results and both use distance matrices as inputs, but have different mathematical premises. The NMDS function requires a set number of axes to return – these will maximize dissimilarity between your samples. A good choice is 2-3, since you’re not going to easily represent more than that graphically.
One major difference of NMDS and PCoA or PCA is that sample scores are compared using rank, not means. This is what makes the NMDS useful for distributions of any shape. However you should remember that rank operations still expect the distribution shape of each variable to be the same; i.e., it doesn’t address a case where one column has a normal distribution and another has a binomial distribution. This isn’t a problem for ordinating MiSeq data, but it might be for ordinating your environmental data.
Begin with choosing a distance metric. See the above section on PCoAs for more detail on that. We’ll reuse our distance metrics here.
There are several implementations of NMDS in R. vegan
has one called metaMDS that can start with your community data and apply the distance transformation for you. I prefer to use the monoMDS
function that vegan
calls directly, since typically I am a micromanager and want to make the distance matrices myself.
Fair warning, any implementation of NMDS can be slow, since it is a convergent iterative algorithm.
#NMDS approximates a solution, and results can be somewhat unstable since they depend on your starting point. Set a seed for replicability.
set.seed(1234)
#There are several model choices. Read more about them in the documentation.
nmds.unweighted <- monoMDS(bc.unweighted, k=3, model="global")
# save the transformed points
d.nmds <- data.frame(nmds.unweighted$points)
There’s not an NMDS equivalent of percent variance accounted for. The closest we can get is the stress. Here’s a more extensive NMDS tutorial that goes into detail on interpreting stress. TL;DR stress < 0.05 is ideal. <0.1 is okay, > 0.25 is not good.
nmds.unweighted$stress
## [1] 0.06506595
We did ok!
Graph NMDS plots just like all of the other ordination families, or create biplots following the steps we used for PCoA biplots.
d.nmds$site.type <- mdf[row.names(d.nmds), "type"]
d.nmds$community <- mdf[row.names(d.nmds), "community"]
d.nmds
ggplot(d.nmds, aes(x=MDS1, y=MDS2, color=site.type, shape=community)) + geom_point(size=3, alpha=0.7, stroke=1) + scale_color_manual(values=m.g.colors) + scale_shape_manual(values=c(1,2,7,8)) + geom_vline(xintercept=0, linetype=2, color="gray") + geom_hline(yintercept=0, linetype=2, color="gray") + stat_ellipse()
## Too few points to calculate an ellipse
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
Ordinations are by no means the only way to represent pairwise distances! Both NMDS and PCoA inherently reduce the dimensions of the data to make it all visible in 2-3 dimensions. One family of alternatives: trees.
A Minimum Spanning Tree (MST) connects all vertices together with the minimum possible edge weight. It is the simplest possible tree that connects all your samples together based on a distance matrix that you supply. I’ve found these useful when ordination techniques have revealed clusters but not separated them, i.e. in datasets where community differences are smaller.
We’ll use the implementation in ape
.
First let’s try one using the unweighted Bray-Curtis dissimilarity.
bc.mst <- ape::mst(bc.unweighted)
ape
can graph this, but it will not look nice.
plot(bc.mst, graph='circle')
We can use the libraries ggnetwork
and igraph
to do a better job (available through CRAN).
library(ggnetwork)
library(igraph)
##
## Attaching package: 'igraph'
## The following object is masked from 'package:plotly':
##
## groups
## The following object is masked from 'package:vegan':
##
## diversity
## The following object is masked from 'package:permute':
##
## permute
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:ape':
##
## edges, mst, ring
## The following object is masked from 'package:compositions':
##
## normalize
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
bc.mst.network <- ggnetwork(bc.mst)
#Add metadata using the vertex.names
bc.mst.network$site.type <- mdf[bc.mst.network$vertex.names, "type"]
bc.mst.network$community <- mdf[bc.mst.network$vertex.names, "community"]
ggplot(bc.mst.network, aes(x=x, y=y, xend=xend, yend=yend)) + geom_edges() + geom_nodes(aes(color=site.type, shape=community)) + theme_blank()
## Warning: Removed 1 rows containing missing values (geom_point).
This tree layout is cool, but not easy to read. Try this one:
bc.mst.igraph <- graph_from_adjacency_matrix(bc.mst, mode=c("undirected"))
bc.mst.network.tree <- ggnetwork(bc.mst.igraph, layout=layout_as_tree(bc.mst.igraph))
#Add metadata using the vertex.names
bc.mst.network.tree$site.type <- mdf[bc.mst.network.tree$name, "type"]
bc.mst.network.tree$community <- mdf[bc.mst.network.tree$name, "community"]
ggplot(bc.mst.network.tree, aes(x=x, y=y, xend=xend, yend=yend)) + geom_edges() + geom_nodes(aes(color=site.type, shape=community)) + theme_blank()
## Warning: Removed 1 rows containing missing values (geom_point).
You can customize the appearance of the tree a little. Cheat sheet for ggplot symbol codes
ggplot(bc.mst.network.tree, aes(x=x, y=y, xend=xend, yend=yend)) + geom_edges() + geom_nodes(aes(color=site.type, shape=community, stroke=1.1)) + theme_blank() + scale_color_manual(values=c("indianred1", "skyblue1", "mediumorchid","royalblue4", "olivedrab3","pink")) + scale_shape_manual(values=c(1,3,5,7))
## Warning: Removed 1 rows containing missing values (geom_point).
This looks cool, but doesn’t tell us much. We can use a permutational graph test to quanitfy what we observe – there is a non random pattern of categories in the tree. Subtrees exist of like type.
I’m using a modified version of the phyloseqGraphTest
package, available here. The only difference is that in the modified version you supply your own distance matrix instead of the package calculating distance for you.
source("modifiedGraphTest.R")
Supply graph_perm_test
with your phyloseq object and a matching distance matrix. The function randomly shuffles the categorical labels and calculates the number of pure edges. Pure edges connect nodes with the same categorical types. By shuffling the labels, the function calculates a probability distribution based on the structure of the minimum spanning tree and your actual proportions of categorical labels. It then compares your real count of pure edges to that probability distribution to calculate a p-value.
This one is based on unweighted Bray-Curtis distance.
gt <- graph_perm_test(acsdat.evendepth, "type", distance=bc.unweighted, type="mst", nperm=1000)
gt$pval
## [1] 0.000999001
We can plot the test tree to visualize the pure/mixed edges.
gnplot <- ggnetwork(gt$net, layout=igraph::layout_as_tree(gt$net))
ggplot(gnplot, aes(x = x, y = y, xend = xend, yend = yend))+
geom_edges(aes(linetype=edgetype)) +
geom_nodes(aes(color = sampletype, shape = sampletype, stroke = 1.1, alpha =1)) + theme_blank() + scale_color_manual(values=m.g.colors) + scale_shape_manual(values=c(1,3,5,7, 9, 11)) + scale_linetype_manual(values=c(3,1))
You can also use networks to identify clusters of like samples in your data blind to categorical variables.
We can use the package mstknnclust
. It’s available through CRAN.
This function uses both a minimum spanning tree considers each sample’s nearest neighbors to identify clusters. For detail, see the vignettes of the package.
bc.unweight.clust <- mstknnclust::mst.knn(as.matrix(bc.unweighted))
We can visualize the results like this:
bc.cluster.network <- bc.unweight.clust$network
V(bc.cluster.network)$label.cex <- seq(0.6, 0.6, length.out=2)
bc.cluster.network
## IGRAPH b259930 UN-- 344 308 --
## + attr: name (v/c), label.cex (v/n)
## + edges from b259930 (vertex names):
## [1] 24 --344 2 --344 1 --344 1 --38 1 --35 1 --13 26 --27 26 --25
## [9] 22 --23 22 --21 18 --20 18 --19 26 --16 21 --15 18 --15 15 --17
## [17] 16 --15 15 --14 14 --12 14 --11 211--212 210--212 208--209 207--209
## [25] 135--136 134--136 132--133 132--131 129--130 129--128 126--139 126--136
## [33] 126--133 128--126 126--127 123--139 123--124 118--212 118--209 126--118
## [41] 117--119 118--117 311--314 311--313 249--250 248--314 249--201 200--202
## [49] 201--200 311--199 199--198 311--197 197--250 308--309 308--307 307--306
## [57] 308--305 303--304 308--302 299--300 308--298 296--310 296--304 296--300
## + ... omitted several edges
plot(bc.cluster.network, vertex.color=igraph::clusters(bc.cluster.network)$membership, layout=igraph::layout.fruchterman.reingold(bc.cluster.network, niter=1000), vertex.size=6)
This will look better if you save it. You can adjust this graph by changing the number of neighbors considered in the beginning.
Let’s look in closer detail at which samples are in these clusters.
bc.clusters <- data.frame(bc.unweight.clust$partition)
bc.clusters %>% group_by(cluster) %>% tally()
The largest clusters of similar samples are clusters 6 and 7.
cluster7.samples <- bc.clusters[which(bc.clusters$cluster==7),"object"]
cluster6.samples <- bc.clusters[which(bc.clusters$cluster==6),"object"]
We can pull those samples up in our metadata file.
mdf[cluster7.samples,] %>% group_by(community, site, type, stream) %>% tally()
All of the cluster 7 samples are Phandolin residential samples except for one that is a hospital sample. Most of them are from sites PR1 and PR3, not PR2.
mdf[cluster6.samples,] %>% group_by(community, site, type, stream) %>% tally()
Cluster 6 samples are all from Neverwinter and are mostly from site NH2R, but have some NW downstream sites and 2 NM sites.
You could then for example compare the community composition at these subsets of samples. Let’s look at the class composition of cluster 7.
c7.acs <- acsdat.clean %>% prune_samples(sample_names(acsdat.clean) %in% cluster7.samples,.) %>% prune_taxa(taxa_sums(.) > 0,.)
c7.acs
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 5780 taxa and 86 samples ]
## sample_data() Sample Data: [ 86 samples by 9 sample variables ]
## tax_table() Taxonomy Table: [ 5780 taxa by 6 taxonomic ranks ]
c7.class <- tax_glom(c7.acs, taxrank="Class")
c7.clr.table <- transform.clr(data.frame(otu_table(c7.class)))
c7.clr.acs <- merge_phyloseq(otu_table(c7.clr.table, taxa_are_rows=TRUE), tax_table(c7.class), sample_data(c7.class))
c7.clr.acs
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 50 taxa and 86 samples ]
## sample_data() Sample Data: [ 86 samples by 9 sample variables ]
## tax_table() Taxonomy Table: [ 50 taxa by 6 taxonomic ranks ]
c7.clr.melt<- psmelt(c7.clr.acs)
c7.clr.melt <- data.frame(c7.clr.melt)
ggplot(c7.clr.melt, aes(x=Sample, y=Class, fill=Abundance)) + geom_tile() + scale_fill_gradient2(high=muted("red"), low=muted("blue")) + theme(axis.text.x = element_blank())
We could instead make a subset of both clusters to directly compare them.
clusters.acs <- acsdat.clean %>% prune_samples(sample_names(acsdat.clean) %in% c(cluster6.samples, cluster7.samples),.) %>% prune_taxa(taxa_sums(.) > 0,.)
clusters.acs
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 9253 taxa and 134 samples ]
## sample_data() Sample Data: [ 134 samples by 9 sample variables ]
## tax_table() Taxonomy Table: [ 9253 taxa by 6 taxonomic ranks ]
clusters.class <- tax_glom(clusters.acs, taxrank="Class")
clusters.clr.table <- transform.clr(data.frame(otu_table(clusters.class)))
## No. corrected values: 16
clusters.clr.acs <- merge_phyloseq(otu_table(clusters.clr.table, taxa_are_rows=TRUE), tax_table(clusters.class), sample_data(clusters.class))
clusters.clr.acs
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 57 taxa and 134 samples ]
## sample_data() Sample Data: [ 134 samples by 9 sample variables ]
## tax_table() Taxonomy Table: [ 57 taxa by 6 taxonomic ranks ]
clusters.clr.melt<- psmelt(clusters.clr.acs)
clusters.clr.melt <- data.frame(clusters.clr.melt)
Add variable to identify cluster membership
cluster.membership <- c(rep("Cluster 6", length(cluster6.samples)), rep("Cluster 7", length(cluster7.samples)))
names(cluster.membership) <- c(cluster6.samples, cluster7.samples)
cluster.membership[1:10]
## ACS_03_6 ACS_03_7 ACS_03_8 ACS_04 ACS_04_43 ACS_04_44
## "Cluster 6" "Cluster 6" "Cluster 6" "Cluster 6" "Cluster 6" "Cluster 6"
## ACS_06 ACS_06_52 ACS_06_57 ACS_07_90
## "Cluster 6" "Cluster 6" "Cluster 6" "Cluster 6"
clusters.clr.melt$Cluster <- cluster.membership[clusters.clr.melt$Sample]
ggplot(clusters.clr.melt, aes(x=Sample, y=Class, fill=Abundance)) + geom_tile() + scale_fill_gradient2(high=muted("red"), low=muted("blue")) + theme(axis.text.x = element_blank()) + facet_wrap(~Cluster, scales="free_x")
We could also compute averages.
cluster.averages <- clusters.clr.melt %>% group_by(Cluster, Class) %>% summarise(Abundance=mean(Abundance))
## `summarise()` regrouping output by 'Cluster' (override with `.groups` argument)
ggplot(clusters.clr.melt, aes(x=Cluster, y=Class, fill=Abundance)) + geom_tile() + scale_fill_gradient2(high=muted("red"), low=muted("blue")) + theme(axis.text.x = element_blank())
And then only plot the more common classes.
common.classes <- clusters.clr.melt %>% group_by(Class) %>% summarise(MeanAbd = mean(Abundance)) %>% filter(MeanAbd > 0)
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(clusters.clr.melt[which(clusters.clr.melt$Class %in% common.classes$Class),], aes(x=Cluster, y=Class, fill=Abundance)) + geom_tile() + scale_fill_gradient2(high=muted("red"), low=muted("blue")) + theme(axis.text.x = element_blank())
Unlike the ordination techniques we’ve explored so far, Linear Discriminant Analysis (and related analyses) are supervised techniques. An LDA maximizes the distance between samples based on a variable that you supply.
We’ll use the implementation(s) in mixOmics
.
library(mixOmics)