Murine species trees
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(ggplot2)
library(cowplot)
library(ggbeeswarm)
library(dplyr)
library(kableExtra)
library(tidyr)
library(ggtree)
library(phytools)
library(phangorn)
library(reshape2)
library(ggExtra)
library(ggrepel)
library(vroom)
library(ggdist)
library(stringr)
library(ggsignif)
library(phytools)
library(castor)
library(MCMCtreeR)
library(here)
library(stringr)
library(BAMMtools)
library(nlme)
library(RPANDA)
source(here("docs", "scripts", "lib", "design.r"))
source(here("docs", "scripts", "lib", "get_tree_info.r"))
#source("C:/bin/core/r/design.r")
#source("C:/bin/core/r/get_tree_info.r")
#source("C:/Users/grt814/bin/core/corelib/design.r")
#source("C:/Users/grt814/bin/core/get_tree_info.r")
#htmltools::includeHTML("../html-chunks/rmd_nav.html")1 Full coding species tree
- 188 species targeted capture to mouse exons
- Assembled with SPADES
- 11,795 coding loci aligned with exonerate+MAFFT
- Gene trees inferred with IQ-Tree
- Species tree inferred with ASTRAL
- Branch lengths inferred with IQ-Tree using the 866 loci with a normalized RF distance to the species tree of less than 0.25.
# This chunk handles all of the main inputs and reads the tree
# cat("188 species targeted capture to mouse exons assembled with SPADES.\n")
# cat("11,775 coding loci aligned with exonerate+mafft\n")
# cat("Gene trees inferred with IQtree.\n")
# cat("Species tree inferred with ASTRAL.\n")
# cat("Branch lengths estimated by ASTRAL.\n")
# Data summary
###############
#tree_file = here("docs", "data", "trees", "astral", "concord-rooted", "full_coding_iqtree_astral.cf.labeled.tree")
tree_file = here("docs", "data", "trees", "astral", "concord-rooted-bl", "full_coding_iqtree_astral_rooted_bl.cf.labeled.tree")
# Newick tree file, with tp labels
rodent_tree = read.tree(tree_file)
tree_to_df_list = treeToDF(rodent_tree)
tree_info = tree_to_df_list[["info"]]
# Read the tree and parse with treetoDF
tree_info = tree_info %>% separate(label, c("tp", "support"), sep=">", remove=F)
tree_info$tp = paste(tree_info$tp, ">", sep="")
tree_info = tree_info %>% separate(support, c("astral", "gcf", "scf"), sep="/", remove=T)
# Split the label by /. tp is my treeParse label.
tree_info$astral[tree_info$node.type=="tip"] = NA
# Fill in ASTRAL support as NA for the tips
tree_info$astral = as.numeric(tree_info$astral)
tree_info$gcf = as.numeric(tree_info$gcf)
tree_info$scf = as.numeric(tree_info$scf)
# Convert all supports to numeric
###############
#iq_tree_labels = here("docs", "data", "trees", "astral", "concord-rooted", "full_coding_iqtree_astral.cf.branch")
iq_tree_labels = here("docs", "data", "trees", "astral", "concord-rooted-bl", "full_coding_iqtree_astral_rooted_bl.cf.branch")
# Newick tree file, with iqtree labels
###############
#cf_stat_file = here("docs", "data", "trees", "astral", "concord-rooted", "full_coding_iqtree_astral.cf.stat")
cf_stat_file = here("docs", "data", "trees", "astral", "concord-rooted-bl", "full_coding_iqtree_astral_rooted_bl.cf.stat")
cf_stats = read.table(cf_stat_file, header=T)
# Concordance factor file
###############
#cf_rep_dir = here("docs", "data", "trees", "astral", "concord-rooted", "cf-reps")
cf_rep_dir = here("docs", "data", "trees", "astral", "concord-rooted-bl", "cf-reps")
#delta_outfile = here("docs", "data", "trees", "astral", "concord-rooted", "delta.tab")
delta_outfile = here("docs", "data", "trees", "astral", "concord-rooted-bl", "delta.tab")
# CF reps for delta and delta outfile
###############
col_file = here("docs", "data", "trees", "astral", "astral-colonization-branches.txt")
exclude_branches = c("Lophiomys_imhausi_UM5152", "Lophuromys_woosnami_LSUMZ37793", "<1>", "<187>", "<186>")
# Colonization branches and branches around the root to exclude from some things
###############
gt_file = here("docs", "data", "trees", "gene-trees", "loci-labeled.treefile")
# The file containing the gene trees
###############
#xmax = 31
xmax = 0.175
# The max of the x-axis for tree figures# The node/branch labels in R and IQtree differ. IQtree uses a nice, logical post-ordering
# of internal nodes while R does something random and assigns labels to tips as well. This
# chunk matches those up for the delta analysis later
iq_tree = read.tree(iq_tree_labels)
iqtree_to_df_list = treeToDF(iq_tree)
iqtree_info = iqtree_to_df_list[["info"]]
# Read the IQ tree tree with branch labels in and parse with get_tree_info
#node_convert = matchNodes(tree_to_df_list[["labeled.tree"]], iqtree_to_df_list[["labeled.tree"]], method="descendants")
tree_info$iqtree.node = NA
# Add a column to the main tree table about IQ tree labels
for(i in 1:nrow(tree_info)){
cur_node = tree_info[i,]$node
iqtree_row = subset(iqtree_info, node==cur_node)
iqtree_label = iqtree_row$label
tree_info[i,]$iqtree.node = iqtree_label
}
# For every row in the main tree table, add in the IQ tree node label given that
# we've read the same tree in R and can use the node.labels1.1 Branches colored by gCF
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors
gcf_tree = ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$gcf)) +
scale_color_continuous(name='gCF', low=l, high=h, limits=c(0,100)) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9))
#gcf_tree = gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
# Option to add branch labels to the figure
print(gcf_tree)###############
fig_outfile = here("docs", "figs", "full-coding-astral-rooted-gcf.png")
ggsave(fig_outfile, gcf_tree, width=14, height=14, unit="in")
# Save the tree image
## gCF tree1.2 Branches colored by sCF
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors
scf_tree = ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$scf)) +
scale_color_continuous(name='sCF', low=l, high=h, limits=c(0,100)) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9))
#gcf_tree = gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
# Option to add branch labels to the figure
print(gcf_tree)###############
fig_outfile = here("docs", "figs", "full-coding-astral-rooted-scf.png")
ggsave(fig_outfile, scf_tree, width=14, height=14, unit="in")
# Save the tree image
## gCF tree1.3 Gene vs site concordance factors colored by branch support
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors
p = ggplot(tree_info, aes(x=gcf, y=scf, color=astral)) +
geom_point(size=2, alpha=0.5) +
scale_color_continuous(name='Astral support', low=l, high=h, limits=c(0.8,1)) +
bartheme() +
theme(legend.title=element_text(size=12))
print(p)###############
fig_outfile = here("docs", "figs", "full-coding-astral-cf.png")
ggsave(fig_outfile, p, width=6, height=3, unit="in")
# Save the figure1.4 Concordance factors vs. branch lengths
bl_gcf_p = ggplot(tree_info, aes(x=branch.length, y=gcf)) +
geom_point(size=3, alpha=0.5) +
#geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
#geom_smooth(method="lm", se=F, ) +
scale_x_continuous(limits=c(0,0.03)) +
xlab("Branch length") +
ylab("gCF per branch") +
bartheme()
bl_gcf_p = ggExtra::ggMarginal(bl_gcf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666")
bl_scf_p = ggplot(tree_info, aes(x=branch.length, y=scf)) +
geom_point(size=3, alpha=0.5) +
#geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
#geom_smooth(method="lm", se=F, ) +
scale_x_continuous(limits=c(0,0.03)) +
xlab("Branch length") +
ylab("sCF per branch") +
bartheme()
bl_scf_p = ggExtra::ggMarginal(bl_scf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=5), color="#666666")
p = plot_grid(bl_gcf_p, bl_scf_p, ncol=2)
print(p)###############
fig_outfile = here("docs", "figs", "full-coding-astral-cf-bl.png")
ggsave(fig_outfile, p, width=6, height=3, unit="in")
# Save the figure2 Species tree comparisons
2.1 ASTRAL tree vs. concatenated tree
concat_file = here("docs", "data", "trees", "concat", "full-coding-concat-r-labels.tre")
concat_tree = read.tree(concat_file)
# Read the concatenated tree
astral_concat_comp = cophylo(rodent_tree, concat_tree)## Rotating nodes to optimize matching...
## Done.
# Generate the cophylo object
plot(astral_concat_comp, link.type="curved", link.lwd=4, link.lty="solid", link.col=make.transparent(corecol(numcol=1), 0.5), fsize=c(0.5,0.5))# Plot the trees
#comparePhylo(rodent_tree, concat_tree, plot=T, force.rooted=T)2.2 ASTRAL tree vs. mitochondrial tree
mito_file = here("docs", "data", "trees", "mitochondrial", "concat-concord", "full-mitochondrial-concat.cf.rooted.tree")
mito_tree = read.tree(mito_file)
# Read the mitochondrial tree
astral_mito_comp = cophylo(rodent_tree, mito_tree)## Rotating nodes to optimize matching...
## Done.
# Generate the cophylo object
plot(astral_mito_comp, link.type="curved", link.lwd=4, link.lty="solid", link.col=make.transparent(corecol(numcol=1, offset=1), 0.5), fsize=c(0.5,0.5))# Plot the trees3 Gene trees
The 11,774 gene trees contain varying numbers of taxa due to filtering on the alignment and filtering by IQtree for identical sequences.
3.1 Taxa per gene tree
gene_trees = read.table(gt_file, sep="\t")
names(gene_trees) = c("gene", "tree")
# Read the geen tree file
gene_trees$gene = word(gene_trees$gene, 1, sep="-")
gene_trees$gene = word(gene_trees$gene, 2, sep="/")
# Parse out the protein ID from the filename for each gene
###############
gt_data = data.frame("gene"=c(), "rf"=c(), "num.tips"=c(), "rf.zeros"=c(), "num.tips.zeros"=c())
# A df to track info for each gene tree
for(i in 1:nrow(gene_trees)){
gt = read.tree(text=gene_trees[i,]$tree)
# Read the gene tree as a phylo object
cur_tips = gt$tip.label
pruned_tree = drop.tip(rodent_tree, rodent_tree$tip.label[-match(cur_tips, rodent_tree$tip.label)])
# Get a list of the tips and prune the species tree to contain only those tips
cur_rf = RF.dist(pruned_tree, gt, normalize=T)
# Calculate RF between the pruned species tree and the gene tree
if(cur_rf==0){
gt_data = rbind(gt_data, data.frame("gene"=gene_trees[i,]$gene, "rf"=cur_rf, "num.tips"=length(cur_tips), "rf.zeros"=cur_rf, "num.tips.zeros"=length(cur_tips)))
}else{
gt_data = rbind(gt_data, data.frame("gene"=gene_trees[i,]$gene, "rf"=cur_rf, "num.tips"=length(cur_tips), "rf.zeros"=NA, "num.tips.zeros"=NA))
}
# Add the gene tree info to the df, with a special case for when the RF is 0
}
# Read all the gene trees
p = ggplot(gt_data, aes(x=num.tips)) +
geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1, offset=6), color="#666666") +
#geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
#geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
scale_y_continuous(expand=c(0,0)) +
xlab("# of taxa") +
ylab("# of gene trees") +
bartheme()
print(p)###############
fig_outfile = here("docs", "figs", "full-coding-gt-taxa-dist.png")
ggsave(fig_outfile, p, width=6, height=3, unit="in")
# Save the figure
#rf_file = here("docs", "data", "trees", "astral", "concord-rooted", "loci-nrf-below-0.25.txt")
#write.table(subset(gt_data, rf<=0.25), file=rf_file, row.names=F, quote=F, sep="\t")
# Write info for genes with low RF3.2 Tree distance (RF) between gene trees and species tree
Since RF cannot handle missing taxa, the species tree is pruned for each gene tree to calculate Robinson-Foulds distance. We use the normalized metric since there are varying numbers of tips per gene tree.
p = ggplot(gt_data, aes(x=rf)) +
geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666") +
#geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
#geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
scale_y_continuous(expand=c(0,0)) +
xlab("Normalized RF") +
ylab("# of gene trees") +
bartheme()
print(p)###############
fig_outfile = here("docs", "figs", "full-coding-gt-rf-dist.png")
ggsave(fig_outfile, p, width=6, height=3, unit="in")
# Save the figure4 Delta
4.1 Introduction
For each lineage in the species tree with gCF < 95% we calculated the \(\Delta\) statistic (Huson et al. 2005). This statistic follows the same logic as the ABBA-BABA site patterns used to calculate D-statistics, but uses gene tree topologies instead of alignment sites. Briefly, a given branch in an unrooted tree is defined by a quartet of species groupings with two possible discordant topologies, \(D_1\) and \(D_2\) (see Figure 1 from Minh et al. 2020). Under assumptions that discordance is caused by ILS, both discordant topologies should be present in equal proportions. However, if introgression has occurred one discordant topology will appear more frequently than the other. \(\Delta\) is calculated for a branch as follows, using the frequency of each discordant topology (Vanderpool et al. 2020):
\[\Delta = \frac{D_1 - D_2}{D_1 + D_2}\]
This normalized \(\Delta\) calculation ensures that all values are scaled between 0 and 1, with larger values indicating a larger skew towards one topology, and a higher chance that introgression has occurred.
To test whether the observed \(\Delta\) values are skewed significantly from 0 to imply introgression, we performed concordance factor analysis on 1,000 bootstrap replicates of our inferred gene trees to generate a null distribution of \(\Delta\) values. We then calculated Z-scores and p-values and assessed significance for each branch at a threshold of 0.01.
4.2 Null and actual distributions
Nodes with p < 0.01:
tree_info$delta.sig = F
# Add a column to the tree info table about significant delta values
###############
fig_outfile = here("docs", "figs", "full-coding-delta-dists.png")
# The image file to either generate or insert
if(!file.exists(delta_outfile)){
cf_stats$delta = (abs(cf_stats$gDF1_N - cf_stats$gDF2_N)) / (cf_stats$gDF1_N + cf_stats$gDF2_N)
# Calculate the delta values on the actual data
iqtree_delta = select(cf_stats, ID, delta)
names(iqtree_delta) = c("iqtree.node", "delta")
tree_info = merge(x = tree_info, y = iqtree_delta, by = "iqtree.node", all.x=TRUE)
# Adding the delta values to the main tree info table
tree_info = tree_info[order(tree_info$node), ]
# Re-sort the data frame by R node order after the merge so the trees still work
low_cf_nodes = subset(cf_stats, gCF < 95)
# Get the low concordance factor nodes from the data to test with delta
delta_null = c()
for(i in 0:999){
cur_rep_str = as.character(i)
while(nchar(cur_rep_str) < 4){
cur_rep_str = paste("0", cur_rep_str, sep="")
}
# Handling the string of the rep
cur_cf_file = paste(cf_rep_dir, "/rep", cur_rep_str, ".cf.stat", sep="")
cf_rep = read.table(cur_cf_file, header=T, fill=T)
# Read the current reps cf file
cf_rep$delta = (abs(cf_rep$gDF1_N - cf_rep$gDF2_N)) / (cf_rep$gDF1_N + cf_rep$gDF2_N)
delta_null = c(delta_null, cf_rep$delta)
# Calculate delta for this rep and save values in vector
}
# Read concordance factors from bootstrap samples of gene trees and calculate delta to generate
# null distribution
###############
delta_null_df = data.frame("delta"=delta_null, y="duh")
delta_null_df = subset(delta_null_df, !is.nan(delta))
# Convert the delta values to a data frame for ggplot
delta_mu = mean(delta_null_df$delta, na.rm=T)
delta_sd = sd(delta_null_df$delta, na.rm=T)
# Calculate the mean and sd of the null distribution to get z-scores and p-values
###############
delta_out = data.frame("node"=c(), "delta"=c(), "z-score"=c(), "p-value"=c())
# Initialize output data frame
for(i in 1:nrow(low_cf_nodes)){
row = low_cf_nodes[i,]
z = (row$delta - delta_mu) / delta_sd
p = pnorm(-abs(z))
if(p < 0.01){
print(paste(row$ID, row$delta, z, p, sep=" "))
tree_info$delta.sig[tree_info$iqtree.node==row$ID] = T
# Set the delta significant to TRUE in the main tree info table
}
delta_out = rbind(delta_out, data.frame("node"=row$ID, "delta"=row$delta, "z-score"=z, "p-value"=p))
}
# Calculate z and p for each low gCF node in the species tree and save to output data frame
write.table(file=delta_outfile, delta_out, sep="\t", row.names=F)
# Write the delta values per node to the output file
###############
delta_null_p = ggplot(delta_null_df, aes(x=delta)) +
geom_histogram(color="#ececec", bins=50) +
scale_x_continuous(limits=c(0,1)) +
scale_y_continuous(expand=c(0,0)) +
xlab("Delta") +
ylab("# nodes") +
bartheme()
delta_actual_p = ggplot(low_cf_nodes, aes(x=delta)) +
geom_histogram(fill="#920000", color="#ececec") +
scale_x_continuous(limits=c(0,1)) +
scale_y_continuous(expand=c(0,0)) +
xlab("Delta") +
ylab("# nodes") +
bartheme()
delta_p = plot_grid(delta_null_p, delta_actual_p, ncol=2, labels=c("Bootstrap distribution", "Actual distribution"), label_size=12)
print(delta_p)
###############
ggsave(fig_outfile, delta_p, width=10, height=4, unit="in")
# Save the figure
}else{
tree_info$delta = NA
delta_out = read.table(delta_outfile, header=T)
for(i in 1:nrow(delta_out)){
row = delta_out[i,]
tree_info$delta[tree_info$iqtree.node==row$node] = row$delta
if(row$p.value < 0.01){
print(paste(row$node, row$delta, row$z.score, row$p.value, sep=" "))
tree_info$delta.sig[tree_info$iqtree.node==row$node] = T
# Set the delta significant to TRUE in the main tree info table
}
}
knitr::include_graphics(fig_outfile)
}## [1] "219 0.406732117812062 2.44516394160885 0.00723931554647087"
## [1] "222 0.407151095732411 2.44879368056455 0.00716677631309369"
## [1] "232 0.622047244094488 4.31050734912284 8.14402076760321e-06"
## [1] "260 0.416606498194946 2.53070883884166 0.00569161487327277"
## [1] "272 0.531951640759931 3.52998048773703 0.000207795157849876"
## [1] "273 0.464877663772691 2.9488972960955 0.00159454968995379"
## [1] "317 0.510204081632653 3.34157446746339 0.000416523334864317"
## [1] "324 0.407725321888412 2.45376838435342 0.00706840016380011"
4.3 Branches by delta
Branches with significant delta are labeled
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
delta_tree = ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$delta)) +
scale_color_continuous(name="Delta", high=h, low=l) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9)) +
geom_label(aes(x=branch, label=ifelse(tree_info$delta.sig,as.character(tree_info$tp),'')), label.size=NA, fill="transparent", show.legend=F, vjust=-0.1)
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(delta_tree)###############
fig_outfile = here("docs", "figs", "full-coding-astral-delta.png")
ggsave(fig_outfile, delta_tree, width=14, height=14, unit="in")
# Save the tree image4.4 Branches with significant delta
h = corecol(numcol=1, pal="wilke", offset=1)
l = corecol(numcol=1, offset=1)
intro_tree = ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$delta.sig)) +
scale_color_manual(name="Significant Delta", labels=c("False", "True"), values=corecol(pal="trek", numcol=2)) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9)) +
geom_label(aes(x=branch, label=ifelse(tree_info$delta.sig,as.character(tree_info$tp),'')), label.size=NA, fill="transparent", show.legend=F, vjust=-0.1)
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(intro_tree)4.5 Concordance factors and delta
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors
p = ggplot(tree_info, aes(x=gcf, y=scf, color=delta)) +
geom_point(size=2, alpha=0.5) +
geom_text_repel(aes(label=ifelse(delta.sig,as.character(tp),'')), show_guide=F, min.segment.length=0) +
scale_color_continuous(name='Delta', low=l, high=h) +
xlab("gCF") +
ylab("sCF") +
bartheme() +
theme(legend.title=element_text(size=12))
print(p)###############
fig_outfile = here("docs", "figs", "full-coding-delta-cf.png")
ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure4.6 Branch length and delta
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors
p = ggplot(tree_info, aes(x=branch.length, y=delta, color=delta)) +
geom_point(size=2, alpha=0.5) +
geom_text_repel(aes(label=ifelse(delta.sig,as.character(tp),'')), show_guide=F, min.segment.length=0) +
scale_x_continuous(limits=c(0,0.03)) +
#scale_color_continuous(name='Delta', low=l, high=h) +
xlab("Branch length") +
ylab("Delta") +
bartheme() +
theme(legend.title=element_text(size=12))
print(p)###############
fig_outfile = here("docs", "figs", "full-coding-delta-bl.png")
ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure4.7 Species tree without tips
#no_tip_file = here("docs", "data", "trees", "astral", "concord-rooted", "full_coding_iqtree_astral.cf.labeled.notips.tree")
#no_tip_file = here("docs", "data", "trees", "astral", "concord-rooted-bl", "full_coding_iqtree_astral_rooted_bl.cf.labeled.notips.tree")
no_tip_file = here("docs", "data", "trees", "astral", "iqtree-dating", "full-coding-astral-iqtree-dating-anil-mmus-12.1.timetree.reordered.labeled.notips.tre")
tree2 = read.tree(no_tip_file)
tree_to_df_list = treeToDF(tree2)
tree2_info = tree_to_df_list[["info"]]
names(tree2_info)[5] = "tp"
# Read the tree and parse with treetoDF
tree_info_notip = subset(tree_info, tp %in% tree2_info$tp)
tree_info_notip = tree_info_notip %>% select(tp, delta, delta.sig)
tree2_info = left_join(tree2_info, tree_info_notip, by="tp")
# Add the delta values for each node to the no tip tree
tree2_tip_info = subset(tree2_info, node.type=="tip")
## No tip tree
###############
tree3 = drop.tip(tree2, "<1>")
tree_to_df_list = treeToDF(tree3)
tree3_info = tree_to_df_list[["info"]]
names(tree3_info)[5] = "tp"
# Read the tree and parse with treetoDF
tree_info_notip = subset(tree_info, tp %in% tree2_info$tp)
tree_info_notip = tree_info_notip %>% select(tp, delta, delta.sig)
tree3_info = left_join(tree3_info, tree_info_notip, by="tp")
# Add the delta values for each node to the no tip tree
tree3_tip_info = subset(tree3_info, node.type=="tip")
## No tip tree, no outgroup
###############
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
no_tip_max = 25
#no_tip_max = 0.13
delta_tree = ggtree(tree2, size=0.8, ladderize=F, aes(color=tree2_info$delta)) +
scale_color_continuous(name="Delta", high=h, low=l) +
xlim(0, no_tip_max) +
geom_tiplab(color="#333333", fontface='italic', size=5) +
theme(legend.position=c(0.05,0.9))
#geom_label(aes(x=branch, label=ifelse(tree_info$delta.sig,as.character(tree_info$tp),'')), label.size=NA, fill="transparent", show.legend=F, vjust=-0.1)
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(delta_tree)###############
fig_outfile = here("docs", "figs", "full-coding-astral-delta-notips.png")
ggsave(fig_outfile, delta_tree, width=12, height=12, unit="in")
# Save the tree image4.8 Phylogenetic signal of delta (Pagel’s lambda, Bloomberg K)
# Generally helpful: https://lukejharmon.github.io/pcm/chapter6_beyondbm/
phylosig(tree2, tree2_tip_info$delta, test=T)## [1] "x has no names; assuming x is in the same order as tree$tip.label"
## [1] "some data in x given as 'NA', dropping corresponding species from tree"
##
## Phylogenetic signal K : 0.222633
## P-value (based on 1000 randomizations) : 0.211
# Bloomberg K
phylosig(tree2, tree2_tip_info$delta, method="lambda", test=T)## [1] "x has no names; assuming x is in the same order as tree$tip.label"
## [1] "some data in x given as 'NA', dropping corresponding species from tree"
##
## Phylogenetic signal lambda : 6.6107e-05
## logL(lambda) : 52.31
## LR(lambda=0) : -0.00215541
## P-value (based on LR test) : 1
#Pagel's lambda
###############
#new_tree = drop.tip(no_tip_tree, "<1>")
#new_info = subset(new_tip_info, tp != "<1>")
#a = data.frame(x=new_info$branch.length, y=new_info$delta, names=new_info$tp, row.names=new_info$tp)
#picModel <- pic.pgls(formula = y ~ x, phy = new_tree, y = a, lambda = "ML", return.intercept.stat = FALSE)
# PGLS.. not sure how to interpret
# https://cran.r-project.org/web/packages/motmot/vignettes/motmot.html#fast-estimation-of-phylogenetic-generalised-least-squares
# Or maybe this is better: https://lukejharmon.github.io/ilhabela/instruction/2015/07/03/PGLS/
###############4.9 Phylogenetic signal of delta (Pagel’s lambda, Bloomberg K) for clades with high speciation rates (see below)
4.9.1 Rattini
# Generally helpful: https://lukejharmon.github.io/pcm/chapter6_beyondbm/
tree2_rattini = tree2
tree2_rattini$tip.label = paste(tree2_rattini$tip.label, "/", subset(tree2_info, node.type == "tip")$delta, sep="")
tree2_rattini$node.label = paste(tree2_rattini$node.label, "/", subset(tree2_info, node.type != "tip")$delta, sep="")
# Add delta values to the labels of the tree with no tips
tree2_rattini = extract.clade(tree2_rattini, tree2_info$node[tree2_info$tp=="<169>"])
tree_to_df_list = treeToDF(tree2_rattini)
tree2_rattini_info = tree_to_df_list[["info"]]
# Get the subtree and parse to df
tree2_rattini_info = tree2_rattini_info %>% separate(label, c("tp", "delta"), sep="/", remove=F)
tree2_rattini_info$delta = as.numeric(tree2_rattini_info$delta)
# Split the label by / to get the delta value
tree2_rattini_tip_info = subset(tree2_rattini_info, node.type=="tip")
# Get only the tip info for the tests
phylosig(tree2_rattini, tree2_rattini_tip_info$delta, test=T)## [1] "x has no names; assuming x is in the same order as tree$tip.label"
##
## Phylogenetic signal K : 0.634626
## P-value (based on 1000 randomizations) : 0.129
# Bloomberg K
phylosig(tree2_rattini, tree2_rattini_tip_info$delta, method="lambda", test=T)## [1] "x has no names; assuming x is in the same order as tree$tip.label"
##
## Phylogenetic signal lambda : 6.6107e-05
## logL(lambda) : 12.2344
## LR(lambda=0) : -0.00023808
## P-value (based on LR test) : 1
#Pagel's lambda4.9.2 Hydromyini
# Generally helpful: https://lukejharmon.github.io/pcm/chapter6_beyondbm/
tree2_rattini = tree2
tree2_rattini$tip.label = paste(tree2_rattini$tip.label, "/", subset(tree2_info, node.type == "tip")$delta, sep="")
tree2_rattini$node.label = paste(tree2_rattini$node.label, "/", subset(tree2_info, node.type != "tip")$delta, sep="")
# Add delta values to the labels of the tree with no tips
tree2_rattini = extract.clade(tree2_rattini, tree2_info$node[tree2_info$tp=="<177>"])
tree_to_df_list = treeToDF(tree2_rattini)
tree2_rattini_info = tree_to_df_list[["info"]]
# Get the subtree and parse to df
tree2_rattini_info = tree2_rattini_info %>% separate(label, c("tp", "delta"), sep="/", remove=F)
tree2_rattini_info$delta = as.numeric(tree2_rattini_info$delta)
# Split the label by / to get the delta value
tree2_rattini_tip_info = subset(tree2_rattini_info, node.type=="tip")
# Get only the tip info for the tests
phylosig(tree2_rattini, tree2_rattini_tip_info$delta, test=T)## [1] "x has no names; assuming x is in the same order as tree$tip.label"
##
## Phylogenetic signal K : 0.505996
## P-value (based on 1000 randomizations) : 0.455
# Bloomberg K
phylosig(tree2_rattini, tree2_rattini_tip_info$delta, method="lambda", test=T)## [1] "x has no names; assuming x is in the same order as tree$tip.label"
##
## Phylogenetic signal lambda : 6.6107e-05
## logL(lambda) : 15.8495
## LR(lambda=0) : -0.000279124
## P-value (based on LR test) : 1
#Pagel's lambda4.10 Phylogenetic independent contrasts of delta
PIC are basically the expected differences in trait values between pairs of branches descending from a node in a tree corrected for expected variance, which is just the sum of the branch lengths. For example, for a given node \(N\) in a tree, the PIC will be:
\[PIC(N) = \frac{V_1 - V_2}{L_1 + L_2}\]
With a post-order traversal of the tree, PICs can be calculated for each node in the tree. Generally, for internal nodes, a trait value \(V\) is assigned after its PIC has been calculated so it can be used in calculation of the PIC of its ancestor.
\[V = \frac{(1/L_1)V_1+(1/L_2)V_2}{1/L_1+1/L_2}\]
The branch is also lengthened to account for possible error in \(V\).
Finally, the rate of trait evolution \(R\) is calculated as
\[R = \frac{\sum ^{n}{PIC_n}}{n}\]
where \(n\) is the number of internal nodes in the tree and \(PIC_n\) is the PIC calculated for that node.
However, we have no need to estimate values \(V\) at internal nodes in our tree since we have delta values for each branch. Here I compare both the contrasts themselves and the rate estimated from them when using the inferred ancestral states or the observed states in our tree.
# Helpful:
# https://bio.libretexts.org/Bookshelves/Evolutionary_Developmental_Biology/Phylogenetic_Comparative_Methods_(Harmon)/04%3A_Fitting_Brownian_Motion/4.02%3A_Estimating_Rates_using_Independent_Contrasts
# https://lukejharmon.github.io/ilhabela/instruction/2015/07/02/phylogenetic-independent-contrasts/
inferred_pic = pic(tree3_tip_info$delta, tree3, var.contrasts=T, rescaled.tree=T)
# Compute the contrasts based on the 'tip' values from the tree
inferred_rate = sum(inferred_pic$contr[,1]^2)/nrow(inferred_pic$contr)
# Compute the rate from the inferred constrasts
###############
tree2_info$pic = NA
tree2_info$inferred.pic = NA
# Columns to add to the tree info
num_contrasts = 0
# Keep track of the number of contrasts done to calculate rate later
for(i in 1:nrow(tree2_info)){
row = tree2_info[i,]
if(row$node.type == "tip" || row$node.type=="ROOT"){
next
}
# Skip tips and the root since we can't calculate on them
tree2_info$inferred.pic[tree2_info$tp==row$tp] = inferred_pic[["contr"]][,1][[row$tp]]
# Add the inferred pic to the tree info df here
desc = getDescendants(tree2, row$node)
desc1 = tree2_info[tree2_info$node==desc[1],]
desc2 = tree2_info[tree2_info$node==desc[2],]
# Get the descendants of the current node
pic = (desc1$delta - desc2$delta) / (desc1$branch.length + desc2$branch.length)
# Calculate the contrast based on the descendant deltas and branch lenghts
tree2_info$pic[tree2_info$node==row$node] = pic
# Add the contrast to the tree info df
num_contrasts = num_contrasts + 1
# Increment number of contrasts
}
observed_rate = sum(tree2_info$pic^2, na.rm=t)/num_contrasts
# Compute the rate from the observed contrasts
## Compute contrasts from observed delta
###############
plot_info = subset(tree2_info, node.type != "tip" & !is.na(delta))
#a = subset(no_tip_info, node.type != "tip")
a = select(plot_info, tp, pic, delta.sig)
a$label = "Observed"
b = select(plot_info, tp, inferred.pic, delta.sig)
names(b)[2] = "pic"
b$label = "Inferred"
z = rbind(a,b)
x_comps = list(c("Observed", "Inferred"))
p = ggplot(data=z, aes(x=label, y=pic, group=label, color=delta.sig)) +
geom_quasirandom(size=2, width=0.25, alpha=0.30) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1, test="ks.test") +
#scale_y_continuous(limits=c(0,0.7)) +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=6), corecol(numcol=1, pal="wilke", offset=4))) +
xlab("") +
ylab("PIC") +
bartheme() +
theme(axis.text.x=element_text(angle=15, hjust=1),
plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm")) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(p)cat("KS test. Note that distributions are not significantly different with Wilcox test\n")## KS test. Note that distributions are not significantly different with Wilcox test
###############
fig_outfile = here("docs", "figs", "full-coding-delta-pic.png")
ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure
###############
rate_table = data.frame("Rate type"=c("Inferred", "Observed"), "Rate"=c(inferred_rate, observed_rate))
rate_table %>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F)| Rate.type | Rate |
|---|---|
| Inferred | 0.0047570 |
| Observed | 0.0095405 |
# Table to display rates4.11 Ancestral reconstruction of delta values
#new_tree = drop.tip(no_tip_tree, "<1>")
# tree_to_df_list = treeToDF(tree2_no_out)
# tree2_no_out_ = tree_to_df_list[["info"]]
# names(new_tree_info)[5] = "tp"
# Drop the outgroup and re-read the tree
#new_info = subset(new_tip_info, tp != "<1>")
###############
a = data.frame(delta=tree3_tip_info$delta, row.names=tree3_tip_info$tp)
b = as.matrix(a)[,1]
fit = fastAnc(tree3, b, vars=T,CI=T)
# Transform delta values from tips and infer ancestral states
#obj<-contMap(new_tree,b,plot=FALSE)
#plot(obj,type="fan",legend=0.7*max(nodeHeights(new_tree)),
# fsize=c(0.7,0.9))
# See http://www.phytools.org/eqg2015/asr.html
###############
tree2_info$delta.recon = NA
tree2_info$delta.lower = NA
tree2_info$delta.upper = NA
for(i in 1:nrow(tree2_info)){
row = tree2_info[i,]
if(row$node.type == "tip" || row$node.type=="ROOT"){
next
}
tp_node = row$tp
node = row$node
new_node = tree3_info$node[tree3_info$tp==tp_node]
tree2_info$delta.recon[tree2_info$tp==row$tp] = fit[["ace"]][[as.character(new_node)]]
tree2_info$delta.lower[tree2_info$tp==row$tp] = fit[["CI95"]][as.character(new_node),1]
tree2_info$delta.upper[tree2_info$tp==row$tp] = fit[["CI95"]][as.character(new_node),2]
}
# Add the inferred delta values to the main data frame for the no tip tree
###############
# delta_tree = ggtree(fit$rescaled.tree, size=0.8, ladderize=F) +
# #scale_color_continuous(name="Delta", high=h, low=l) +
# xlim(0, 0.2) +
# geom_tiplab(color="#333333", fontface='italic', size=5) +
# theme(legend.position=c(0.05,0.9))
# #geom_label(aes(x=branch, label=ifelse(tree_info$delta.sig,as.character(tree_info$tp),'')), label.size=NA, fill="transparent", show.legend=F, vjust=-0.1)
# #geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
# #geom_nodepoint(color="#666666", alpha=0.85, size=4)
# print(delta_tree)
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
no_tip_max = 25
#no_tip_max = 0.13
delta_tree = ggtree(tree2, size=0.8, ladderize=F, aes(color=tree2_info$delta.recon)) +
scale_color_continuous(name="Delta", high=h, low=l) +
xlim(0, no_tip_max) +
geom_tiplab(color="#333333", fontface='italic', size=5) +
theme(legend.position=c(0.05,0.9))
#geom_label(aes(x=branch, label=ifelse(tree_info$delta.sig,as.character(tree_info$tp),'')), label.size=NA, fill="transparent", show.legend=F, vjust=-0.1)
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(delta_tree)# Tree with ancestral reconstructions of delta
###############
fig_outfile = here("docs", "figs", "full-coding-astral-delta-recon-notips.png")
ggsave(fig_outfile, delta_tree, width=12, height=12, unit="in")
# Save the tree image# w = subset(tree2_info, node.type=="tip")
# w = select(w, tp, delta)
# w$label = "delta.recon"
x = subset(tree2_info, node.type != "tip")
x = select(x, tp, delta, delta.sig)
x$delta.sig = ifelse(x$delta.sig, "Yes", "No")
x$label = "Observed"
y = select(tree2_info, tp, delta.recon, delta.sig)
names(y)[2] = "delta"
y$delta.sig = ifelse(y$delta.sig, "Yes", "No")
y$label = "Simulated"
z = rbind(x,y)
# Convert to long with two categories
###############
x_comps = list(c("Observed", "Simulated"))
p = ggplot(z, aes(x=label, y=delta, group=label, color=delta.sig)) +
geom_quasirandom(size=3, width=0.25, alpha=0.5) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
ylab("Delta") +
xlab("") +
scale_color_manual("Significant observed delta?", values=c(corecol(numcol=1, pal="wilke", offset=3), corecol(numcol=1, pal="wilke", offset=4))) +
bartheme() +
theme(plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm"))
print(p)###############
fig_outfile = here("docs", "figs", "full-coding-delta-recon.png")
ggsave(fig_outfile, p, width=6, height=5, unit="in")
# Save the figure
###############
p = ggplot(tree2_info, aes(x=delta.recon, delta)) +
geom_point(size=2, alpha=0.5) +
geom_smooth(method="lm") +
bartheme()
print(p)fig_outfile = here("docs", "figs", "full-coding-delta-recon-corr.png")
ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure
###############
tree2_internal_info = subset(tree2_info, node.type=="internal" & !tp %in% exclude_branches)
p = ggplot(tree2_internal_info, aes(x=delta.recon, y=node)) +
geom_point(aes(color="Simulated (95% CI)"), size=2, alpha=0.7) +
geom_segment(aes(x=delta.lower, xend=delta.upper, y=node, yend=node)) +
geom_point(aes(x=delta, y=node, color="Observed"), size=2, alpha=0.7) +
scale_color_manual(values=c("Simulated (95% CI)"="black", "Observed"="red")) +
xlab("Delta") +
ylab("Node") +
bartheme() +
theme(legend.position="bottom")
print(p)tree2_internal_info$within.bounds = 0
for(i in 1:nrow(tree2_internal_info)){
if(tree2_internal_info[i,]$delta >= tree2_internal_info[i,]$delta.lower && tree2_internal_info[i,]$delta <= tree2_internal_info[i,]$delta.upper){
tree2_internal_info[i,]$within.bounds = 1
}
}
fig_outfile = here("docs", "figs", "full-coding-delta-recon-ci.png")
ggsave(fig_outfile, p, width=4, height=6, unit="in")
# Save the figure4.12 Delta differences to ancestors (observed)
My own attempt at some sort of phylogenetic independent comparison.
For each branch, compute the difference in the value delta between it and its ancestor and the difference between it and a random branch in the tree. Repeat random sampling as needed.
delta_info = subset(tree2_info, !is.na(delta))
# Get info only from branches with delta values
#delta_diffs = data.frame("ref.node"=c(), "ref.delta"=c(), "query.node"=c(), "query.delta"=c(), "query.type"=c(), "delta.diff"=c())
delta_anc_diffs = data.frame("ref.node"=c(), "ref.delta"=c(), "sig.ref.delta"=c(), "query.node"=c(), "query.delta"=c(), "query.type"=c(), "delta.diff"=c())
delta_random_diffs = data.frame("ref.node"=c(), "ref.delta"=c(), "sig.ref.delta"=c(), "query.node"=c(), "query.delta"=c(), "query.type"=c(), "delta.diff"=c(), "replicate"=c())
# Data frames to save delta diffs
for(i in 1:nrow(delta_info)){
# Loop over every branch with delta
node = delta_info[i,]$node
delta = delta_info[i,]$delta
sig_delta = delta_info[i,]$delta.sig
sig_label = "N"
if(sig_delta){
sig_label = "Y"
}
# Get the delta value and other info from the branch
anc = Ancestors(tree2, node, type="parent")
if(tree2_info[tree2_info$node==node,]$tp %in% exclude_branches || tree2_info[tree2_info$node==anc,]$tp %in% exclude_branches){
next
}
# Get the ancestral node, but move on if it is an excluded branch
anc_delta = delta_info[delta_info$node==anc,]$delta
if(delta_info[delta_info$node==anc,]$delta.sig){
sig_label = "Ancestor Y"
}
# Get the delta info from the ancestor
#delta_info[delta_info$node==node,]$delta.diff = abs(delta - anc_delta)
delta_anc_diffs = rbind(delta_anc_diffs, data.frame("ref.node"=node, "ref.delta"=delta, "sig.ref.delta"=sig_label, "query.node"=anc, "query.delta"=anc_delta, "query.type"="Ancestor", "delta.diff"=abs(delta-anc_delta)))
# Add the info of the comparison of this node to its ancestor to the anc delta diff data frame
for(j in 1:100){
# Select random branches
random_anc = anc
while(random_anc == anc || random_anc == node){
random_row = delta_info[sample(nrow(delta_info),1),]
random_anc = random_row$node
}
# Select a random branch in the tree as long as it is not this branch or its ancestor
delta_random_diffs = rbind(delta_random_diffs, data.frame("ref.node"=node, "ref.delta"=delta, "sig.ref.delta"=sig_label, "query.node"=random_anc, "query.delta"=random_row$delta, "query.type"="Random branch", "delta.diff"=abs(delta-random_row$delta), "replicate"=j))
# Add the info of the comparison of this node to the random branch to the random delta diff data frame
}
#delta_info[delta_info$node==node,]$random.diff = abs(delta - random_row$delta)
}
###############
x_comps = list(c("Ancestor", "Random branch"))
# List of comparisons to make
delta_random_single = subset(delta_random_diffs, replicate==1)
delta_random_single = select(delta_random_single, -replicate)
delta_random_single$query.type = "Random branch"
delta_diffs_single = rbind(delta_anc_diffs, delta_random_single)
# Get the first replicate, add label, and combine with anc diffs
delta_diff_single_p = ggplot(data=delta_diffs_single, aes(x=query.type, y=delta.diff, group=query.type, color=sig.ref.delta)) +
geom_quasirandom(size=2, width=0.25, alpha=0.30) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
scale_y_continuous(limits=c(0,0.7)) +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=4), corecol(numcol=1, pal="wilke", offset=3), "#920000")) +
ggtitle("Single random sample") +
xlab("") +
ylab("Delta difference") +
bartheme() +
theme(axis.text.x=element_text(angle=15, hjust=1),
plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm")) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(delta_diff_single_p)# Distribution of delta diffs to ancestor and one random branch
###############
x_comps = list(c("Ancestor", "Random branch"))
# List of comparisons to make
delta_random_meds = delta_random_diffs %>% group_by(ref.node, query.type, sig.ref.delta) %>% summarize(delta.diff=median(delta.diff))
delta_diffs_meds = rbind(select(delta_anc_diffs, ref.node, query.type, sig.ref.delta, delta.diff), delta_random_meds)
# Calculate median of all random branch diffs and combine with anc diffs
delta_diff_med_p = ggplot(data=delta_diffs_meds, aes(x=query.type, y=delta.diff, group=query.type, color=sig.ref.delta)) +
geom_quasirandom(size=2, width=0.25, alpha=0.30) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
scale_y_continuous(limits=c(0,0.7)) +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=4), corecol(numcol=1, pal="wilke", offset=3), "#920000")) +
#ggtitle("Difference between delta and ancestral delta / average after 100 replicates of random sampling") +
ggtitle("Median after 100 random samples") +
xlab("") +
ylab("Delta difference") +
bartheme() +
theme(axis.text.x=element_text(angle=15, hjust=1),
plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm")) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(delta_diff_med_p)# Distribution of delta diffs to ancestor and median over many random branches
###############
x_comps = list(c("Ancestor", "Random branch"))
# List of comparisons to make
delta_random_avgs = delta_random_diffs %>% group_by(ref.node, query.type, sig.ref.delta) %>% summarize(delta.diff=mean(delta.diff))
delta_random_avgs$query.type = "Random branch"
delta_diffs_avgs = rbind(select(delta_anc_diffs, ref.node, query.type, sig.ref.delta, delta.diff), delta_random_avgs)
# Calculate average of all random branch diffs and combine with anc diffs
delta_diff_avg_p = ggplot(data=delta_diffs_avgs, aes(x=query.type, y=delta.diff, group=query.type, color=sig.ref.delta)) +
geom_quasirandom(size=2, width=0.25, alpha=0.30) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
scale_y_continuous(limits=c(0,0.7)) +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=4), corecol(numcol=1, pal="wilke", offset=3), "#920000")) +
#ggtitle("Difference between delta and ancestral delta / average after 100 replicates of random sampling") +
ggtitle("Mean after 100 random samples") +
xlab("") +
ylab("Delta difference") +
bartheme() +
theme(axis.text.x=element_text(angle=15, hjust=1),
plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm")) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(delta_diff_avg_p)# Distribution of delta diffs to ancestor and average over many random branches
###############
p_legend = get_legend(delta_diff_single_p)
p_grid = plot_grid(delta_diff_single_p + theme(legend.position="none"),
delta_diff_med_p + theme(legend.position="none"),
delta_diff_avg_p + theme(legend.position="none"),
ncol=3)
p = plot_grid(p_grid, p_legend, nrow=2, rel_heights=c(1,0.1), align='vh')
fig_outfile = here("docs", "figs", "full-coding-delta-anc-diffs.png")
ggsave(fig_outfile, p, width=12, height=4, unit="in")
# Save the figure4.13 Delta differences to ancestors (reconstructed)
The same comparisons, but using values of delta reconstructed via a Brownian motion model
delta_info = subset(tree2_info, !is.na(delta.recon))
# Get info only from branches with delta values
#delta_diffs = data.frame("ref.node"=c(), "ref.delta"=c(), "query.node"=c(), "query.delta"=c(), "query.type"=c(), "delta.diff"=c())
delta_anc_diffs = data.frame("ref.node"=c(), "ref.delta"=c(), "sig.ref.delta"=c(), "query.node"=c(), "query.delta"=c(), "query.type"=c(), "delta.diff"=c())
delta_random_diffs = data.frame("ref.node"=c(), "ref.delta"=c(), "sig.ref.delta"=c(), "query.node"=c(), "query.delta"=c(), "query.type"=c(), "delta.diff"=c(), "replicate"=c())
# Data frames to save delta diffs
for(i in 1:nrow(delta_info)){
# Loop over every branch with delta
node = delta_info[i,]$node
delta = delta_info[i,]$delta.recon
sig_delta = delta_info[i,]$delta.sig
sig_label = "N"
if(sig_delta){
sig_label = "Y"
}
# Get the delta value and other info from the branch
anc = Ancestors(tree2, node, type="parent")
if(tree2_info[tree2_info$node==node,]$tp %in% exclude_branches || tree2_info[tree2_info$node==anc,]$tp %in% exclude_branches){
next
}
# Get the ancestral node, but move on if it is an excluded branch
anc_delta = delta_info[delta_info$node==anc,]$delta.recon
if(delta_info[delta_info$node==anc,]$delta.sig){
sig_label = "Ancestor Y"
}
# Get the delta info from the ancestor
#delta_info[delta_info$node==node,]$delta.diff = abs(delta - anc_delta)
delta_anc_diffs = rbind(delta_anc_diffs, data.frame("ref.node"=node, "ref.delta"=delta, "sig.ref.delta"=sig_label, "query.node"=anc, "query.delta"=anc_delta, "query.type"="Ancestor", "delta.diff"=abs(delta-anc_delta)))
# Add the info of the comparison of this node to its ancestor to the anc delta diff data frame
for(j in 1:100){
# Select random branches
random_anc = anc
while(random_anc == anc || random_anc == node){
random_row = delta_info[sample(nrow(delta_info),1),]
random_anc = random_row$node
}
# Select a random branch in the tree as long as it is not this branch or its ancestor
delta_random_diffs = rbind(delta_random_diffs, data.frame("ref.node"=node, "ref.delta"=delta, "sig.ref.delta"=sig_label, "query.node"=random_anc, "query.delta"=random_row$delta, "query.type"="Random branch", "delta.diff"=abs(delta-random_row$delta), "replicate"=j))
# Add the info of the comparison of this node to the random branch to the random delta diff data frame
}
#delta_info[delta_info$node==node,]$random.diff = abs(delta - random_row$delta)
}
###############
x_comps = list(c("Ancestor", "Random branch"))
# List of comparisons to make
delta_random_single = subset(delta_random_diffs, replicate==1)
delta_random_single = select(delta_random_single, -replicate)
delta_random_single$query.type = "Random branch"
delta_diffs_single = rbind(delta_anc_diffs, delta_random_single)
# Get the first replicate, add label, and combine with anc diffs
delta_diff_single_p = ggplot(data=delta_diffs_single, aes(x=query.type, y=delta.diff, group=query.type, color=sig.ref.delta)) +
geom_quasirandom(size=2, width=0.25, alpha=0.30) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
scale_y_continuous(limits=c(0,0.7)) +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=4), corecol(numcol=1, pal="wilke", offset=3), "#920000")) +
ggtitle("Single random sample") +
xlab("") +
ylab("Delta difference") +
bartheme() +
theme(axis.text.x=element_text(angle=15, hjust=1),
plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm")) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(delta_diff_single_p)# Distribution of delta diffs to ancestor and one random branch
###############
x_comps = list(c("Ancestor", "Random branch"))
# List of comparisons to make
delta_random_meds = delta_random_diffs %>% group_by(ref.node, query.type, sig.ref.delta) %>% summarize(delta.diff=median(delta.diff))
delta_diffs_meds = rbind(select(delta_anc_diffs, ref.node, query.type, sig.ref.delta, delta.diff), delta_random_meds)
# Calculate median of all random branch diffs and combine with anc diffs
delta_diff_med_p = ggplot(data=delta_diffs_meds, aes(x=query.type, y=delta.diff, group=query.type, color=sig.ref.delta)) +
geom_quasirandom(size=2, width=0.25, alpha=0.30) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
scale_y_continuous(limits=c(0,0.7)) +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=4), corecol(numcol=1, pal="wilke", offset=3), "#920000")) +
#ggtitle("Difference between delta and ancestral delta / average after 100 replicates of random sampling") +
ggtitle("Median after 100 random samples") +
xlab("") +
ylab("Delta difference") +
bartheme() +
theme(axis.text.x=element_text(angle=15, hjust=1),
plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm")) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(delta_diff_med_p)# Distribution of delta diffs to ancestor and median over many random branches
###############
x_comps = list(c("Ancestor", "Random branch"))
# List of comparisons to make
delta_random_avgs = delta_random_diffs %>% group_by(ref.node, query.type, sig.ref.delta) %>% summarize(delta.diff=mean(delta.diff))
delta_random_avgs$query.type = "Random branch"
delta_diffs_avgs = rbind(select(delta_anc_diffs, ref.node, query.type, sig.ref.delta, delta.diff), delta_random_avgs)
# Calculate average of all random branch diffs and combine with anc diffs
delta_diff_avg_p = ggplot(data=delta_diffs_avgs, aes(x=query.type, y=delta.diff, group=query.type, color=sig.ref.delta)) +
geom_quasirandom(size=2, width=0.25, alpha=0.30) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
scale_y_continuous(limits=c(0,0.7)) +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=4), corecol(numcol=1, pal="wilke", offset=3), "#920000")) +
#ggtitle("Difference between delta and ancestral delta / average after 100 replicates of random sampling") +
ggtitle("Mean after 100 random samples") +
xlab("") +
ylab("Delta difference") +
bartheme() +
theme(axis.text.x=element_text(angle=15, hjust=1),
plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm")) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(delta_diff_avg_p)# Distribution of delta diffs to ancestor and average over many random branches
###############
p_legend = get_legend(delta_diff_single_p)
p_grid = plot_grid(delta_diff_single_p + theme(legend.position="none"),
delta_diff_med_p + theme(legend.position="none"),
delta_diff_avg_p + theme(legend.position="none"),
ncol=3)
p = plot_grid(p_grid, p_legend, nrow=2, rel_heights=c(1,0.1), align='vh')
fig_outfile = here("docs", "figs", "full-coding-delta-anc-diffs-recon.png")
ggsave(fig_outfile, p, width=12, height=4, unit="in")
# Save the figure5 Delta spatial patterns
Here, we look for evidence of introgression along the genome by calculating the average size of blocks of genes that share the same topology. For all genes that form a block of topologies on a given branch (e.g. all genes in a row that share the same topology), we subtract the starting position of the final gene from the starting position of the first gene in the block.
We then look at the sizes of blocks around each branch for each topology: 1. the concordant topology, 2. the major discordant topology (the one that is most frequent), and 3. the minor discordant topology.
Specifically, we take the difference in the average block size of the major and minor topologies to measure if the skew in gene tree topologies is observable in different areas of the genome.
topo_file = here("docs", "data", "trees", "astral", "concord-rooted-bl", "topo-counts.tab")
topo_counts = read.table(topo_file, header=T)
topo_counts = subset(topo_counts, !tp.node %in% exclude_branches)
topo_counts = subset(topo_counts, mchr != "X" & mchr != "Y")
# Read the topo counts for each gene for each branch
###############
block_outfile = here("docs", "data", "trees", "astral", "concord-rooted-bl", "blocks.tab")
# A file to save the block sizes to
###############
tree_info$depth = node.depth(rodent_tree)
#tree_info$depth.bl = get_all_node_depths(rodent_tree)
tree_info$depth.edge = NA
tree_info$depth.bl = NA
tree_info$depth.edge[189:nrow(tree_info)] = get_all_node_depths(rodent_tree, as_edge_count=T)
tree_info$depth.bl[189:nrow(tree_info)] = get_all_node_depths(rodent_tree)
# Depths of branches in the tree relative to tips
###############
topo_tree_info = subset(tree_info, node.type!="tip" & !tp %in% exclude_branches)
topo_cf_stats = cf_stats
names(topo_cf_stats)[1] = "node"
topo_cf_stats = merge(topo_cf_stats, select(topo_tree_info, node, node.type, branch.length, depth, depth.bl, depth.edge, tp, delta, delta.sig), by="node")
# Add the tree info to the cf table
topo_cf_stats$gD_prev = ifelse(topo_cf_stats$gDF1_N > topo_cf_stats$gDF2_N, "d1", "d2")
# Identify the prevalant discorant topology for each branchif(!file.exists(block_outfile)){
block_data = data.frame("tp"=c(), "num.genes.c"=c(), "num_genes_d1"=c(), "num_genes_d2"=c(), "num.genes.m"=c(),
"c.avg.block.size"=c(), "c.avg.block.count"=c(), "c.num.blocks"=c(),
"d1.avg.block.size"=c(), "d1.avg.block.count"=c(), "d1.num.blocks"=c(),
"d2.avg.block.size"=c(), "d2.avg.block.count"=c(), "d2.num.blocks"=c(),
"m.avg.block.size"=c(), "m.avg.block.count"=c(), "m.num.blocks"=c())
# Block sizes for each topology around each branch
for(node in levels(as.factor(topo_counts$tp.node))){
# Loop over every node in the tree
c_blocks = 0
c_block_sizes = c()
c_block_counts = c()
d1_blocks = 0
d1_block_sizes = c()
d1_block_counts = c()
d2_blocks = 0
d2_block_sizes = c()
d2_block_counts = c()
m_blocks = 0
m_block_sizes = c()
m_block_counts = c()
# Running tallies and lists of block sizes for each topology to be averaged
# over all chromosomes
num_genes_c = 0
num_genes_d1 = 0
num_genes_d2 = 0
num_genes_m = 0
for(chr in levels(as.factor(topo_counts$mchr))){
# Loop over every mouse chromosome
#print(paste(node, chr))
chrdata = subset(topo_counts, tp.node==node & mchr==chr)
# Subset topo counts from this chromosome and node
chrdata = chrdata[order(chrdata$mstart), ]
# Order the genes by their starting position on the chrome
chrdata$row.num = seq_along(chrdata[,1])
# Add the row number as a column
first = T
cur_topo = NA
num_genes = 0
# Initialize some things for this chromosomes
for(i in 1:nrow(chrdata)){
# Loop over every gene on this chromosomes
cur_gene = chrdata[i,]
# Get the current gene
if(first){
cur_topo = cur_gene$topo
num_genes = 0
block_start = cur_gene$mstart
first = F
next
}
# If this is the first gene of this chromosome/node combo, start the first block
if(cur_gene$topo == "m"){
num_genes_m = num_genes_m + 1
}else if(cur_gene$topo == "c"){
num_genes_c = num_genes_c + 1
}else if(cur_gene$topo == "d1"){
num_genes_d1 = num_genes_d1 + 1
}else if(cur_gene$topo == "d2"){
num_genes_d2 = num_genes_d2 + 1
}
# Count the topology of the current gene... I probably don't need to do this...
if(!cur_gene$topo %in% cur_topo || i == nrow(chrdata)){
block_size = cur_gene$mstart - block_start
if(cur_topo == "m"){
m_blocks = m_blocks + 1
m_block_sizes = c(m_block_sizes, block_size)
m_block_counts = c(m_block_counts, num_genes)
}else if(cur_topo == "c"){
c_blocks = c_blocks + 1
c_block_sizes = c(c_block_sizes, block_size)
c_block_counts = c(c_block_counts, num_genes)
}else if(cur_topo == "d1"){
d1_blocks = d1_blocks + 1
d1_block_sizes = c(d1_block_sizes, block_size)
d1_block_counts = c(d1_block_counts, num_genes)
}else if(cur_topo == "d2"){
d2_blocks = d2_blocks + 1
d2_block_sizes = c(d2_block_sizes, block_size)
d2_block_counts = c(d2_block_counts, num_genes)
}
# Get the size of the current block and add it to the correct list
cur_topo = cur_gene$topo
num_genes = 0
block_start = cur_gene$mstart
# Reset the block counts and positions
}else{
num_genes = num_genes + 1
}
# If there is a decisive topology for this gene/node combo AND the topology doesn't match the
# previous topology .. OR if this is the last gene of the chromosome, then we need to count the
# size of the block and start the next block
# Otherwise, just increment the number of genes
}
## End gene loop
}
## End chromosome loop
block_data = rbind(block_data, data.frame("tp"=node,
"num.genes.c"=num_genes_c, "num_genes_d1"=num_genes_d1, "num_genes_d2"=num_genes_d2, "num.genes.m"=num_genes_m,
"c.avg.block.size"=mean(c_block_sizes, na.rm=T), "c.avg.block.count"=mean(c_block_counts, na.rm=T), "c.num.blocks"=c_blocks,
"d1.avg.block.size"=mean(d1_block_sizes, na.rm=T), "d1.avg.block.count"=mean(d1_block_counts, na.rm=T), "d1.num.blocks"=d1_blocks,
"d2.avg.block.size"=mean(d2_block_sizes, na.rm=T), "d2.avg.block.count"=mean(d2_block_counts, na.rm=T), "d2.num.blocks"=d2_blocks,
"m.avg.block.size"=mean(m_block_sizes, na.rm=T), "m.avg.block.count"=mean(m_block_counts, na.rm=T), "m.num.blocks"=m_blocks))
# Calculate average block sizes for this node over all chromosomes and add to the main df
}
## End node loop
write.table(file=block_outfile, block_data, sep="\t", row.names=F)
# Write the block sizes per node to a file
}else{
block_data = read.table(block_outfile, header=T)
}
# If the block sizes have already been calculated, just read them in
block_data_merge = merge(block_data, topo_cf_stats, by="tp")
# Merge the block counts into the main cf df
###############
block_data_merge$anc.delta.sig = "N"
block_data_merge$desc.delta.sig = "N"
for(i in 1:nrow(block_data_merge)){
# Loop over every branch with delta
node = block_data_merge[i,]$node
# Get the node of the current row
anc = Ancestors(rodent_tree, node, type="parent")
if(!anc %in% topo_tree_info$node){
next
}
# Get the ancestral node, but move on if it is an excluded branch
anc_delta = block_data_merge[block_data_merge$node==anc,]$delta
if(block_data_merge[block_data_merge$node==anc,]$delta.sig == "Y"){
block_data_merge[i,]$anc.delta.sig = "Y"
}
# Get the delta info from the ancestor
for(desc in getDescendants(rodent_tree, node)){
if(!desc %in% topo_tree_info$node){
next
}
desc_delta = block_data_merge[block_data_merge$node==desc,]$delta
if(block_data_merge[block_data_merge$node==desc,]$delta.sig == "Y"){
block_data_merge[i,]$desc.delta.sig = "Y"
}
# Get the delta info from the ancestor
}
}
## Determine whether ancestor or either descendant has a significant delta for each branch
###############
block_data_merge$major.block.size = ifelse(block_data_merge$gD_prev=="d1", block_data_merge$d1.avg.block.size, block_data_merge$d2.avg.block.size)
block_data_merge$minor.block.size = ifelse(block_data_merge$gD_prev=="d1", block_data_merge$d2.avg.block.size, block_data_merge$d1.avg.block.size)
block_data_merge$block.diff = block_data_merge$major.block.size - block_data_merge$minor.block.size
block_data_merge$block.ratio = block_data_merge$major.block.size / block_data_merge$minor.block.size
# Finally, determine the block sizes for the major and minor topologies, and calculate their differences5.1 Distribution of block sizes for each topology around each branch
m = select(block_data_merge, tp, m.avg.block.size, delta.sig)
names(m)[2] = "block.size"
m$label = "Missing"
c = select(block_data_merge, tp, c.avg.block.size, delta.sig)
names(c)[2] = "block.size"
c$label = "Concordant"
d1 = select(block_data_merge, tp, d1.avg.block.size, delta.sig)
names(d1)[2] = "block.size"
d1$label = "D1"
d2 = select(block_data_merge, tp, d2.avg.block.size, delta.sig)
names(d2)[2] = "block.size"
d2$label = "D2"
a = rbind(c, d1, d2, m)
#x_comps = list(c("Observed", "Inferred"))
p = ggplot(a, aes(x=label, y=block.size, group=label, color=delta.sig)) +
geom_quasirandom(size=2, width=0.25, alpha=0.5) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
#geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
ylab("Avg. block size") +
xlab("") +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=3), corecol(numcol=1, pal="wilke", offset=4))) +
bartheme() +
theme(plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm"))
print(p)###############
fig_outfile = here("docs", "figs", "full-coding-delta-block-dists.png")
ggsave(fig_outfile, p, width=4, height=4, unit="in")
# Save the figure5.2 Distribution of the difference between major and minor block sizes
Note that a negative value indicates that, on average, blocks consisting of the minor topology are larger than those of the major topology.
###############
p = ggplot(block_data_merge, aes(x=1, y=block.diff, color=delta.sig)) +
geom_quasirandom(size=2, width=0.25, alpha=0.5) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
#geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
scale_x_continuous(limits=c(0,2)) +
ylab("Block size difference") +
xlab("") +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=3), corecol(numcol=1, pal="wilke", offset=4))) +
bartheme() +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm"))
print(p)###############
fig_outfile = here("docs", "figs", "full-coding-delta-block-diff-dist.png")
ggsave(fig_outfile, p, width=4, height=4, unit="in")
# Save the figure5.3 Block size difference and delta
# p = ggplot(block_data_merge, aes(x=block.diff, y=num.genes.na, color=delta.sig)) +
# geom_point(size=3, alpha=0.5) +
# geom_smooth(method="lm", se=F) +
# geom_text_repel(aes(label=ifelse(delta.sig=="Y",as.character(tp),'')), show_guide=F, min.segment.length=0) +
# bartheme()
# print(p)
p = ggplot(block_data_merge, aes(x=delta, y=block.diff, color=delta.sig)) +
geom_point(size=2, alpha=0.5) +
geom_smooth(method="lm", se=F, linetype="dashed", color="#333333") +
xlab("Delta") +
ylab("Block size difference") +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=3), corecol(numcol=1, pal="wilke", offset=4))) +
bartheme() +
theme(legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm"))
print(p)###############
fig_outfile = here("docs", "figs", "full-coding-delta-block-diff-delta.png")
ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure5.4 Block size difference and tree depth
Maximum number of branches from a tip
# p = ggplot(block_data_merge, aes(x=block.diff, y=num.genes.na, color=delta.sig)) +
# geom_point(size=3, alpha=0.5) +
# geom_smooth(method="lm", se=F) +
# geom_text_repel(aes(label=ifelse(delta.sig=="Y",as.character(tp),'')), show_guide=F, min.segment.length=0) +
# bartheme()
# print(p)
block_data_merge = block_data_merge[order(block_data_merge$node), ]
# Re-sort the data frame by R node order after the merge so the trees still work
p = ggplot(block_data_merge, aes(x=depth.bl, y=block.diff, color=delta.sig)) +
geom_point(size=2, alpha=0.5) +
geom_smooth(method="lm", se=F, linetype="dashed", color="#333333") +
xlab("Tree depth") +
ylab("Block size difference") +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=3), corecol(numcol=1, pal="wilke", offset=4))) +
bartheme() +
theme(legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm"))
print(p)###############
fig_outfile = here("docs", "figs", "full-coding-delta-block-diff-depth.png")
ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure6 Time tree
Tree was dated using IQ-tree’s implementation of least squares dating with the following fossil calibrations:
calibration_file = here("docs", "data", "trees", "astral", "iqtree-dating", "fossil-calibrations-iqtree.txt")
cal_dates = read.table(calibration_file, header=F)
names(cal_dates) = c("MRCA of", "dates")
# Read the fossil calibration dates
cal_dates = cal_dates %>% separate(dates, c("Min.age", "Max.age"), sep=":", remove=T)
cal_dates$Max.age = abs(as.numeric(cal_dates$Max.age))
cal_dates$Min.age = abs(as.numeric(cal_dates$Min.age))
# Parse the dates better
cal_dates %>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F)| MRCA of | Min.age | Max.age |
|---|---|---|
| mm10,Arvicanthus_niloticus_MNHN1999201 | 11.1 | 12.3 |
| Pithecheir_melanurus_LSUMZ38198,Arvicanthus_niloticus_MNHN1999201 | 8.7 | 10.1 |
| mm10,Mus_pahari_10460X1 | 7.3 | 8.3 |
| Leggadina_forresti_WAMM62323,Pseudomys_shortridgei_Z25113 | 4.0 | 4.5 |
# Display calibration points as a table
###############
#dated_tree_file = here("docs", "data", "trees", "astral", "iqtree-dating", "full-coding-astral-iqtree-dating-anil-mmus-12.1.timetree.nwk")
#dated_tree = read.tree(dated_tree_file)
#tree_to_df_list = treeToDF(rodent_tree)
#dated_info = tree_to_df_list[["info"]]
# Read the tree and parse with treetoDF
#plotTreeTime(dated_tree, runif(Ntip(dated_tree)))
dated_tree_file = here("docs", "data", "trees", "astral", "iqtree-dating", "full-coding-astral-iqtree-dating-anil-mmus-12.1.timetree.nex.mod")
dated_tree_text = readLines(con=dated_tree_file, n = -1)
dated_tree = readMCMCtree(dated_tree_text, from.file = FALSE)
# Read tree as nexus file# From: https://cran.r-project.org/web/packages/MCMCtreeR/vignettes/MCMCtree_plot.html
MCMC.tree.plot(dated_tree, cex.tips = 0.3, time.correction = 1, scale.res = c("Eon",
"Period", "Epoch", "Age"), plot.type = "phylogram", cex.age = 0.6, cex.labels = 0.6,
relative.height = 0.1, col.tree = "grey40", label.offset = 0.05,
node.method = "bar", no.margin = TRUE, label.timescale.names=TRUE, add.time.scale=TRUE, add.abs.time=TRUE,
lwd.bar=2)###############
fig_outfile = here("docs", "figs", "full-coding-timetree.png")
png(fig_outfile, width=12, height=12, units="in", res=320)
MCMC.tree.plot(dated_tree, cex.tips = 0.3, time.correction = 1, scale.res = c("Eon",
"Period", "Epoch", "Age"), plot.type = "phylogram", cex.age = 0.6, cex.labels = 0.6,
relative.height = 0.1, col.tree = "grey40", label.offset = 0.05,
node.method = "bar", no.margin = TRUE, label.timescale.names=TRUE, add.time.scale=TRUE, add.abs.time=TRUE,
lwd.bar=2)
dev.off()## png
## 2
7 Speciation rates with BAMM
taxo_file = here("docs", "data", "full-murinae-taxonomy.csv")
taxo_data = read.csv(taxo_file, header=T, quote="\"")
# Read the taxonomy data
###############
div_comp_sums = taxo_data %>% count(Division)
names(div_comp_sums)[2] = "Full"
div_samp_sums = subset(taxo_data, Exome==1) %>% count(Division)
names(div_samp_sums)[2] = "Sampled"
div_props = merge(div_comp_sums, div_samp_sums, by="Division")
div_props$div.prop.sampled = div_props$Sampled / div_props$Full
taxo_data = merge(taxo_data, select(div_props, Division, div.prop.sampled), by="Division")
# Get Division counts and sampling rates
div_sampling_file = here("docs", "data", "bamm", "div-sampling-rates.tab")
write.table(div_props, file=div_sampling_file, row.names=F, quote=F)
# Write the counts to a file
###############
tribe_comp_sums = taxo_data %>% count(Tribe)
names(tribe_comp_sums)[2] = "Full"
tribe_samp_sums = subset(taxo_data, Exome==1) %>% count(Tribe)
names(tribe_samp_sums)[2] = "Sampled"
tribe_props = merge(tribe_comp_sums, tribe_samp_sums, by="Tribe")
tribe_props$tribe.prop.sampled = tribe_props$Sampled / tribe_props$Full
taxo_data = merge(taxo_data, select(tribe_props, Tribe, tribe.prop.sampled), by="Tribe")
# Get Tribe counts and sampling rates
tribe_sampling_file = here("docs", "data", "bamm", "tribe-sampling-rates.tab")
write.table(tribe_props, file=tribe_sampling_file, row.names=F, quote=F)
# Write the counts to a file
###############
sample_taxo_data = subset(taxo_data, Exome==1)
# Get the sampled species
###############
missing_samples = c("Lophuromys_woosnami_LSUMZ37793", "Lophiomys_imhausi_UM5152", "Mus_musculus-reference_10460X5", "Mus_musculus-domesticus_10460X15", "Mus_musculus-castaneus_10460X7", "Hydromys_sp-nov_YS391", "Genus_sp-nov_NMVZ21816", "Paruromys_dominator_JAE4870", "Mus_musculus-musculus_10460X14")
# A list of samples missing from the taxonomy file
tribe_prob_full = c("Pithecheir_melanurus_LSUMZ38198", "Vandeleuria_oleracea_ABTC116404")
tribe_prob = c("Pithecheir melanurus", "Vandeleuria oleracea")
div_prob_full = c("Coccymys_ruemmleri_ABTC49489", "Vandeleuria_oleracea_ABTC116404", "Anonymomys_mindorensis_FMNH2222010")
div_prob = c("Coccymys ruemmleri", "Vandeleuria oleracea", "Anonymomys mindorensis")
# Lists of samples with unclear taxonomy
###############tribe_tree_file = here("docs", "data", "bamm", "full-coding-astral-timetree-tribe.tre")
time_tree = read.nexus(dated_tree_file)
tribe_tree = drop.tip(time_tree, c(missing_samples, tribe_prob_full))
write.tree(tribe_tree, file=tribe_tree_file)
# Write the tree for Tribe sampling
# NOTE: manually convert 0 branch lengths to 0.000001 (there are 5 of these for some reason)
tree_to_df_list = treeToDF(tribe_tree)
tribe_tree_info = tree_to_df_list[["info"]]
tribe_tree_tips = subset(tribe_tree_info, node.type=="tip")
# Read the tree and parse with treetoDF
tribe_tree_tips = tribe_tree_tips %>% separate(label, c("genus", "species", "sample"), sep="_", remove=F)
tribe_tree_tips$Genus.species = paste(tribe_tree_tips$genus, tribe_tree_tips$species)
tribe_tree_tips[tribe_tree_tips$label=="mm10",]$Genus.species = "Mus musculus"
tribe_tree_tips[tribe_tree_tips$label=="rnor6",]$Genus.species = "Rattus norvegicus"
# Parse the genus and species names of the remaining tips
###############
tribe_sample_file = here("docs", "data", "bamm", "tribe-sampling-rates.tab")
tribe_data = subset(sample_taxo_data, !Genus.species %in% tribe_prob)
tribe_data = merge(tribe_data, tribe_tree_tips, by="Genus.species")
tribe_data = select(tribe_data, label, Tribe, tribe.prop.sampled)
write.table(tribe_data, file=tribe_sample_file, sep="\t", row.names=F, col.names=F, quote=F)
# Write the Tribe sampling rates to a file
# NOTE: manually add 1.0 to the first line
# Format from: http://bamm-project.org/advanced.html#accounting-for-non-random-incomplete-taxon-sampling-in-diversification-studies
###############
# From: http://bamm-project.org/quickstart.html#speciation-extinction-analyses
taxo_data_nonextinct = subset(taxo_data, !grepl("EXTINCT",Extinct.TAXONOMY))
# Remove extinct species from complete list of species
prior_file = here("docs", "data", "bamm", "tribe-bamm-priors.txt")
setBAMMpriors(tribe_tree, total.taxa=nrow(taxo_data_nonextinct), outfile=prior_file)
# Generate BAMM priors based on the tree
## TRIBE
##########################################################################################
## DIVISION
div_tree_file = here("docs", "data", "bamm", "full-coding-astral-timetree-div.tre")
time_tree = read.nexus(dated_tree_file)
div_tree = drop.tip(time_tree, c(missing_samples, div_prob_full))
write.tree(div_tree, file=div_tree_file)
# Write the tree for Tribe sampling
# NOTE: manually convert 0 branch lengths to 0.000001 (there are 5 of these for some reason)
tree_to_df_list = treeToDF(div_tree)
div_tree_info = tree_to_df_list[["info"]]
div_tree_tips = subset(div_tree_info, node.type=="tip")
# Read the tree and parse with treetoDF
div_tree_tips = div_tree_tips %>% separate(label, c("genus", "species", "sample"), sep="_", remove=F)
div_tree_tips$Genus.species = paste(div_tree_tips$genus, div_tree_tips$species)
div_tree_tips[div_tree_tips$label=="mm10",]$Genus.species = "Mus musculus"
div_tree_tips[div_tree_tips$label=="rnor6",]$Genus.species = "Rattus norvegicus"
# Parse the genus and species names of the remaining tips
###############
div_sample_file = here("docs", "data", "bamm", "div-sampling-rates.tab")
div_data = subset(sample_taxo_data, !Genus.species %in% div_prob)
div_data = merge(div_data, div_tree_tips, by="Genus.species")
div_data = select(div_data, label, Division, div.prop.sampled)
write.table(div_data, file=div_sample_file, sep="\t", row.names=F, col.names=F, quote=F)
# Write the Tribe sampling rates to a file
# NOTE: manually add 1.0 to the first line
# Format from: http://bamm-project.org/advanced.html#accounting-for-non-random-incomplete-taxon-sampling-in-diversification-studies
###############
# From: http://bamm-project.org/quickstart.html#speciation-extinction-analyses
taxo_data_nonextinct = subset(taxo_data, !grepl("EXTINCT",Extinct.TAXONOMY))
# Remove extinct species from complete list of species
prior_file = here("docs", "data", "bamm", "div-bamm-priors.txt")
setBAMMpriors(div_tree, total.taxa=nrow(taxo_data_nonextinct), outfile=prior_file)
# Generate BAMM priors based on the tree
###############7.1 Sampling rates based on Tribe
tribe_props %>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F)| Tribe | Full | Sampled | tribe.prop.sampled |
|---|---|---|---|
| Apodemini | 26 | 1 | 0.0384615 |
| Arvicanthini | 90 | 22 | 0.2444444 |
| Hydromyini | 211 | 70 | 0.3317536 |
| incertae sedis | 11 | 2 | 0.1818182 |
| Malacomyini | 3 | 1 | 0.3333333 |
| Murini | 43 | 11 | 0.2558140 |
| Otomyini | 31 | 2 | 0.0645161 |
| Phloeomyini | 18 | 6 | 0.3333333 |
| Praomyini | 55 | 9 | 0.1636364 |
| Rattini | 193 | 55 | 0.2849741 |
7.2 Speciation rates based on Tribe sampling (BAMM)
# From: https://lukejharmon.github.io/ilhabela/instruction/2015/07/02/diversification-analysis-bamm-rpanda/
bamm_tree_file = here("docs", "data", "bamm", "full-coding-astral-timetree-tribe.tre")
bamm_tree = read.tree(bamm_tree_file)
# Read the tree from the BAMM run
event_data_file = here("docs", "data", "bamm", "tribe_event_data.txt")
events = read.csv(event_data_file)
# Read the event file from the BAMM run
###############
ed = getEventData(bamm_tree, events, burnin=0.25)## Processing event data from data.frame
##
## Discarded as burnin: GENERATIONS < 249000
## Analyzing 751 samples from posterior
##
## Setting recursive sequence on tree...
##
## Done with recursive sequence
best = getBestShiftConfiguration(ed, expectedNumberOfShifts=1, threshold=5)## Processing event data from data.frame
##
## Discarded as burnin: GENERATIONS < 0
## Analyzing 1 samples from posterior
##
## Setting recursive sequence on tree...
##
## Done with recursive sequence
# Get the best shift possibility (MAP)
# See: http://bamm-project.org/rateshifts.html#overall-best-shift-configuration
bamm_tree_p = plot.bammdata(best, lwd=2, labels = T, cex = 0.3)
addBAMMshifts(best, cex=2)
addBAMMlegend(bamm_tree_p, nTicks = 4, side = 4, las = 1)# Plot the rates and shifts on the tree
###############
fig_outfile = here("docs", "figs", "full-coding-bamm-tribe.png")
png(fig_outfile, width=12, height=12, units="in", res=320)
bamm_tree_p = plot.bammdata(best, lwd=2, labels = T, cex = 0.3)
addBAMMshifts(best, cex=2)
addBAMMlegend(bamm_tree_p, nTicks = 4, side = 4, las = 1)
dev.off()## png
## 2
plotRateThroughTime(ed, intervalCol=corecol(numcol=1), avgCol=corecol(numcol=1), ylim=c(0,1), cex.axis=1, cex.lab=1.5)###############
fig_outfile = here("docs", "figs", "full-coding-bamm-tribe-time.png")
png(fig_outfile, width=6, height=4, units="in", res=320)
plotRateThroughTime(ed, intervalCol=corecol(numcol=1), avgCol=corecol(numcol=1), ylim=c(0,1), cex.axis=1, cex=1.5)
dev.off()## png
## 2
7.3 Speciation rates based on Tribe sampling (BAMM) and delta
labeled_timetree_file = here("docs", "data", "trees", "astral", "iqtree-dating", "full-coding-astral-iqtree-dating-anil-mmus-12.1.timetree.nex")
labeled_timetree = read.nexus(labeled_timetree_file)
tree_to_df_list = treeToDF(labeled_timetree)
labeled_timetree_info = tree_to_df_list[["info"]]
# Read the labeled time tree and parse with treetoDF
tree_info_clades = select(tree_info, node, clade, tp, delta)
names(tree_info_clades)[1] = "orig.node"
# Select relevant columns from original tree df
labeled_timetree_info = merge(labeled_timetree_info, tree_info_clades, by="clade")
# Merge the original tree info with the time tree info to get the delta values
labeled_timetree_info = labeled_timetree_info[order(labeled_timetree_info$node), ]
# Re-sort the data frame by R node order after the merge so the trees still work
###############
# Since the speciation rates are missing some species, we need to prune the tree info
# to remove those species. We do this by removing those species from the tree info df
# and from any clade in any other node. At the end, for clades that are identical
# (because their descendants have been pruned), we keep the branch at the shallowest
# depth
labeled_timetree_info$depth = node.depth(labeled_timetree)
# Get the depth of each node in the tree for pruning
tips_to_drop = c(missing_samples, tribe_prob_full)
# The species we need to remove
for(tip in tips_to_drop){
labeled_timetree_info = subset(labeled_timetree_info, label != tip)
labeled_timetree_info$clade = gsub(paste(";", tip, sep=""), "", labeled_timetree_info$clade)
labeled_timetree_info$clade = gsub(paste(tip, ";", sep=""), "", labeled_timetree_info$clade)
labeled_timetree_info$clade = gsub(tip, "", labeled_timetree_info$clade)
}
# Loop over each species to remove and remove that row from the df and that species from any
# clade in the df
labeled_timetree_info = subset(labeled_timetree_info, clade != "")
# For some branches with all descendant species removed, they will now have empty clades
# Remove them here
labeled_timetree$node.label = paste(labeled_timetree_info$label, "/", labeled_timetree_info$delta, sep="")
# Add delta to the label of the time tree
labeled_timetree_info = labeled_timetree_info[order(labeled_timetree_info[,'clade'],labeled_timetree_info[,'depth']),]
labeled_timetree_info = labeled_timetree_info[!duplicated(labeled_timetree_info$clade),]
# Sort nodes in the tree df by clade and then by depth
# Then, for duplicate clades, keep the one at the shallowest depth
labeled_timetree_info = labeled_timetree_info[order(labeled_timetree_info$node), ]
# Re-sort the data frame by R node order after the merge so the trees still work
###############
labeled_tribetree = drop.tip(labeled_timetree, c(missing_samples, tribe_prob_full))
tree_to_df_list = treeToDF(labeled_tribetree)
labeled_tribetree_info = tree_to_df_list[["info"]]
# Now, prune the tree structure and re-read the pruned tree and parse with treetoDF
labeled_tribetree_info = merge(select(labeled_timetree_info, clade, tp, delta), labeled_tribetree_info, by="clade")
# Add the info from the full tree to the pruned tribe tree by merging by clade
labeled_tribetree_info = labeled_tribetree_info[order(labeled_tribetree_info$node), ]
# Re-sort the data frame by R node order after the merge so the trees still work
###############
speciation_tree = getMeanBranchLengthTree(best)$phy
tree_to_df_list = treeToDF(speciation_tree)
speciation_tree_info = tree_to_df_list[["info"]]
# Get the tree with speciation rates as branch lengths from BAMM and parse to df
write.csv(speciation_tree_info, file=here("docs", "data", "bamm", "full-coding-astral-bamm-as-bl.csv"), row.names=F)
speciation_tree_info = merge(select(labeled_tribetree_info, clade, tp, delta), speciation_tree_info, by="clade")
# Add the info from the tribe tree to the speciation tree
speciation_tree_info = labeled_timetree_info[order(labeled_timetree_info$node), ]
# Re-sort the data frame by R node order after the merge so the trees still work
###############
p = ggplot(speciation_tree_info, aes(x=delta, y=branch.length)) +
geom_point() +
geom_smooth(method="lm") +
bartheme()
#print(p)
# Simple correlation
###############
speciation_tree_internal_info = subset(speciation_tree_info, node.type != "tip")
speciation_tree$node.label = paste(speciation_tree_internal_info$label, "/", speciation_tree_internal_info$delta, sep="")
# Add delta values to the internal node labels of the speciation tree
###############
# At this point, to do PGLS without delta values on the tips, we need to again
# remove tip branches. To do this, you'd have to write.tree() and then use
# bonsai to remove the tips
#write.tree(speciation_tree)
speciation_tree2_file = here("docs", "data", "trees", "astral", "concord-rooted-bl", "full_coding_iqtree_astral_rooted_speciation.bl.cf.labeled.notips.tree")
speciation_tree2 = read.tree(file=speciation_tree2_file)
tree_to_df_list = treeToDF(speciation_tree2)
speciation_tree2_info = tree_to_df_list[["info"]]
# Read the speciation tree without tips and parse to df
speciation_tree2_info = speciation_tree2_info %>% separate(label, c("tp", "support"), sep=">", remove=F)
speciation_tree2_info$tp = paste(speciation_tree2_info$tp, ">", sep="")
speciation_tree2_info = speciation_tree2_info %>% separate(support, c("astral", "gcf", "scf", "delta"), sep="/", remove=T)
speciation_tree2_info$delta = as.numeric(speciation_tree2_info$delta)
# Split the label by /. tp is my treeParse label.
speciation_tree2_tip_info = subset(speciation_tree2_info, node.type=="tip")
# Get only the tip info from the new tree
###############
# From: https://lukejharmon.github.io/ilhabela/instruction/2015/07/03/PGLS/
delta = speciation_tree2_tip_info$delta
speciation.rate = speciation_tree2_tip_info$branch.length
# Get the delta values and branch length values for plotting later
pglsModel = gls(delta ~ branch.length, correlation = corBrownian(phy = speciation_tree2),
data = speciation_tree2_tip_info, method = "ML")
summary(pglsModel)## Generalized least squares fit by maximum likelihood
## Model: delta ~ branch.length
## Data: speciation_tree2_tip_info
## AIC BIC logLik
## -70.74475 -64.6687 38.37238
##
## Correlation Structure: corBrownian
## Formula: ~1
## Parameter estimate(s):
## numeric(0)
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 0.11288103 0.08639655 1.3065455 0.1969
## branch.length -0.00390095 0.02904279 -0.1343172 0.8937
##
## Correlation:
## (Intr)
## branch.length -0.48
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -0.5606684 -0.4261186 -0.2044627 0.2750020 1.6042793
##
## Residual standard error: 0.1861634
## Degrees of freedom: 56 total; 54 residual
coef(pglsModel)## (Intercept) branch.length
## 0.112881032 -0.003900946
# Run the PGLS
plot(speciation.rate,delta)
abline(a = coef(pglsModel)[1], b = coef(pglsModel)[2])# Plot the PGLS
###############
fig_outfile = here("docs", "figs", "full-coding-bamm-tribe-spec-delta.png")
png(fig_outfile, width=6, height=4, units="in", res=320)
plot(speciation.rate,delta)
abline(a = coef(pglsModel)[1], b = coef(pglsModel)[2])
dev.off()## png
## 2
# Save the figure7.4 Sampling rates based on Division
div_props %>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F)| Division | Full | Sampled | div.prop.sampled |
|---|---|---|---|
| Aethomys | 11 | 3 | 0.2727273 |
| Apodemus | 26 | 1 | 0.0384615 |
| Arvicanthis | 30 | 7 | 0.2333333 |
| Berylmys | 4 | 2 | 0.5000000 |
| Bunomys | 31 | 15 | 0.4838710 |
| Chiropodomys | 6 | 1 | 0.1666667 |
| Chrotomys | 39 | 5 | 0.1282051 |
| Colomys | 4 | 2 | 0.5000000 |
| Conilurus | 7 | 6 | 0.8571429 |
| Dacnomys | 41 | 7 | 0.1707317 |
| Dasymys | 14 | 2 | 0.1428571 |
| Echiothrix | 10 | 9 | 0.9000000 |
| Haeromys | 3 | 1 | 0.3333333 |
| Hybomys | 11 | 5 | 0.4545455 |
| Hydromys | 32 | 9 | 0.2812500 |
| incertae sedis | 13 | 3 | 0.2307692 |
| Malacomys | 3 | 1 | 0.3333333 |
| Mallomys | 11 | 5 | 0.4545455 |
| Maxomys | 22 | 4 | 0.1818182 |
| Mus | 43 | 11 | 0.2558140 |
| Oenomys | 24 | 5 | 0.2083333 |
| Otomys | 31 | 2 | 0.0645161 |
| Phloeomys | 18 | 6 | 0.3333333 |
| Pithecheir | 3 | 1 | 0.3333333 |
| Pogonomys | 16 | 6 | 0.3750000 |
| Pseudomys | 41 | 32 | 0.7804878 |
| Rattus | 84 | 17 | 0.2023810 |
| Stenocephalemys | 51 | 7 | 0.1372549 |
| Uromys | 52 | 4 | 0.0769231 |
7.5 Speciation rates based on Division sampling (BAMM)
# From: https://lukejharmon.github.io/ilhabela/instruction/2015/07/02/diversification-analysis-bamm-rpanda/
bamm_tree_file = here("docs", "data", "bamm", "full-coding-astral-timetree-div.tre")
bamm_tree = read.tree(bamm_tree_file)
# Read the tree from the BAMM run
event_data_file = here("docs", "data", "bamm", "div_event_data.txt")
events = read.csv(event_data_file)
# Read the event file from the BAMM run
###############
ed = getEventData(bamm_tree, events, burnin=0.25)## Processing event data from data.frame
##
## Discarded as burnin: GENERATIONS < 249000
## Analyzing 751 samples from posterior
##
## Setting recursive sequence on tree...
##
## Done with recursive sequence
best = getBestShiftConfiguration(ed, expectedNumberOfShifts=1, threshold=5)## Processing event data from data.frame
##
## Discarded as burnin: GENERATIONS < 0
## Analyzing 1 samples from posterior
##
## Setting recursive sequence on tree...
##
## Done with recursive sequence
# Get the best shift possibility (MAP)
# See: http://bamm-project.org/rateshifts.html#overall-best-shift-configuration
bamm_tree_p = plot.bammdata(best, lwd=2, labels = T, cex = 0.3)
addBAMMshifts(best, cex=2)
addBAMMlegend(bamm_tree_p, nTicks = 4, side = 4, las = 1)# Plot the rates and shifts on the tree
###############
fig_outfile = here("docs", "figs", "full-coding-bamm-div.png")
png(fig_outfile, width=12, height=12, units="in", res=320)
bamm_tree_p = plot.bammdata(best, lwd=2, labels = T, cex = 0.3)
addBAMMshifts(best, cex=2)
addBAMMlegend(bamm_tree_p, nTicks = 4, side = 4, las = 1)
dev.off()## png
## 2
plotRateThroughTime(ed, intervalCol=corecol(numcol=1), avgCol=corecol(numcol=1), ylim=c(0,1), cex.axis=1, cex.lab=1.5)###############
fig_outfile = here("docs", "figs", "full-coding-bamm-div-time.png")
png(fig_outfile, width=6, height=4, units="in", res=320)
plotRateThroughTime(ed, intervalCol=corecol(numcol=1), avgCol=corecol(numcol=1), ylim=c(0,1), cex.axis=1, cex=1.5)
dev.off()## png
## 2
8 Speciation rates with RPanda
8.1 Species tree colored by tribe
Following http://www.phytools.org/***SanJuan2016/ex/14/Complex-diversification-models.html
## The following rpanda sections follow http://www.phytools.org/***SanJuan2016/ex/14/Complex-diversification-models.html
labeled_timetree_file = here("docs", "data", "trees", "astral", "iqtree-dating", "full-coding-astral-iqtree-dating-anil-mmus-12.1.timetree.nex")
labeled_timetree = read.nexus(labeled_timetree_file)
labeled_timetree = drop.tip(labeled_timetree, c(missing_samples, tribe_prob_full))
tree_to_df_list = treeToDF(labeled_timetree)
labeled_timetree_info = tree_to_df_list[["info"]]
# Read the labeled time tree and parse with treetoDF
labeled_timetree_info = labeled_timetree_info[order(labeled_timetree_info$node), ]
# Re-sort the data frame by R node order after the merge so the trees still work
rpanda_tree_tips = subset(labeled_timetree_info, node.type=="tip")
rpanda_tree_tips = rpanda_tree_tips %>% separate(clade, c("genus", "species", "sample"), sep="_", remove=F)
rpanda_tree_tips$Genus.species = paste(rpanda_tree_tips$genus, rpanda_tree_tips$species)
rpanda_tree_tips[rpanda_tree_tips$label=="mm10",]$Genus.species = "Mus musculus"
rpanda_tree_tips[rpanda_tree_tips$label=="rnor6",]$Genus.species = "Rattus norvegicus"
# Parse the genus and species names of the remaining tips
labeled_timetree$tip.label = rpanda_tree_tips$Genus.species
# Replace the tip labels in the time tree with the genus and species names
labeled_timetree_info$Genus.species = NA
labeled_timetree_info[labeled_timetree_info$clade == rpanda_tree_tips$clade,]$Genus.species = rpanda_tree_tips$Genus.species
# Add the genus and species labels to the info df
labeled_timetree_info$tribe = NA
# Initialize a tribe column in the tree info
###############
rpanda_taxo_data = subset(sample_taxo_data, !Genus.species %in% tribe_prob)
# Remove the problematic tribe samples for rpanda
rpanda_taxo_data$Tribe = as.factor(rpanda_taxo_data$Tribe)
# Factorize the tribe column
tribes = c()
tribe_subtrees = list()
# Initialize a vector of tribes for all tribes with > 1 species and
# a list of tribe subtrees
for(tribe in levels(rpanda_taxo_data$Tribe)){
#print(tribe)
tribe_samples = subset(rpanda_taxo_data, Tribe==tribe)$Genus.species
# Get all the samples from the current tribe
if(length(tribe_samples) < 2){
labeled_timetree_info[labeled_timetree_info$Genus.species %in% tribe_samples,]$tribe = tribe
next
}
# If the tribe has only one species, add the tribe label to the tree info column for that species,
# but then skip the rest
tribes = c(tribes, tribe)
# Add this tribe to the vector of tribes that have > 2 species
n = getMRCA(labeled_timetree, tribe_samples)
#print(n)
# Get the mrca of all the tribe species
tribe_nodes = getDescendants(labeled_timetree, n)
# Get all the descendant nodes from the mrca
labeled_timetree_info[labeled_timetree_info$node == n,]$tribe = tribe
labeled_timetree_info[labeled_timetree_info$node %in% tribe_nodes,]$tribe = tribe
# Add tribe info for the tribe nodes to the tree info df
tribe_subtrees[[tribe]] = extract.clade(labeled_timetree, n)
# Extract the subtree for this tribe
}
tribe_cols = c(corecol(numcol=8), corecol(numcol=2, offset=10))
xmax = 16
tribe_tree = ggtree(labeled_timetree, size=0.8, aes(color=labeled_timetree_info$tribe)) +
xlim(0, xmax) +
geom_tiplab(fontface='italic', size=2, show.legend=F) +
scale_color_manual(name="Tribe", values=tribe_cols) +
#theme(legend.position="bottom")
theme(legend.position=c(0.05,0.9))
print(tribe_tree)# A tribe tree figure
num_tribes = length(tribes)
# Variable to keep track of the number of tribes
##############################
lambda.cst <- function(x,y){y}
lambda.var <- function(x,y){y[1]*exp(y[2]*x)}
mu.cst <- function(x,y){y}
mu.var <- function(x,y){y[1]*exp(y[2]*x)}
# The different models to test
fit.multi.rpanda <- function(tree, par, sample_rate) {
bcstdcst = fit_bd(tree, max(branching.times(tree)), f.lamb=lambda.cst, f.mu=mu.cst, lamb_par=par[[1]][1], mu_par=par[[1]][2], cst.lamb=TRUE, cst.mu=TRUE, cond="crown", f=sample_rate, dt=1e-3)
bvardcst = fit_bd(tree, max(branching.times(tree)), f.lamb=lambda.var, f.mu=mu.cst, lamb_par=par[[2]][c(1,2)], mu_par=par[[2]][3], expo.lamb=TRUE, cst.mu=TRUE, cond="crown", f=sample_rate, dt=1e-3)
bcstdvar = fit_bd(tree, max(branching.times(tree)), f.lamb=lambda.cst, f.mu=mu.var, lamb_par=par[[3]][1], mu_par=par[[3]][c(2,3)], cst.lamb=TRUE, expo.mu=TRUE, cond="crown", f=sample_rate, dt=1e-3)
bvardvar = fit_bd(tree, max(branching.times(tree)), f.lamb=lambda.var, f.mu=mu.var, lamb_par=par[[4]][c(1,2)], mu_par=par[[4]][c(3,4)], expo.lamb=TRUE, expo.mu=TRUE, cond="crown", f=sample_rate, dt=1e-3)
return(list("bcstdcst"=bcstdcst,"bvardcst"=bvardcst,"bcstdvar"=bcstdvar,"bvardvar"=bvardvar))
}
# A function that uses rpanda to fit various models, given a tree and some initial params
###############
init_params = list(c(0.4,0),c(0.4,-0.05,0),c(0.4,0.1,0.05),c(0.4,-0.05,0.1,0.05))
# We need to supply starting parameter values for optimization to RPANDA
rpanda_results = list()
# Initialize a list of results by tribe
for(tribe in tribes){
#print(tribe)
prop_sampled = tribe_props[tribe_props$Tribe==tribe,]$tribe.prop.sampled
#print(prop_sampled)
rpanda_results[[tribe]] = fit.multi.rpanda(tribe_subtrees[[tribe]], init_params, 1)
}
# Run the model fitting function on each tribe
###############8.2 AICc results for 4 models
See: http://www.phytools.org/***SanJuan2016/ex/14/Complex-diversification-models.html
aic.table = matrix(nrow=4,ncol=num_tribes,NA)
# Initialize a matrix for the AIC table
# One column for each tribe, one row for each model
colnames(aic.table) = tribes
rownames(aic.table) = c("bcstdcst","bvardcst","bcstdvar","bvardvar")
# Rename the columns and the rows based on the tribes and the models
rpanda_best_fit = list()
# Initialize an empty list with the best fitting (lowest aicc) model for each tribe
col_i = 0
# A column counter
for(tribe in tribes) {
col_i = col_i + 1
# Increment the column counter to match the current tribe's column
min_aic = 999999999
min_aic_model = ""
# Initialize the min aic and model to pick the best model
row_j = 0
# A row counter
for(model in rownames(aic.table)) {
row_j = row_j + 1
# Increment the row counter to match the current model's row
cur_aic = rpanda_results[[tribe]][[model]]$aicc
# Get the aic for the current model
if(cur_aic < min_aic){
min_aic = cur_aic
min_aic_model = model
}
# Check if the aic is lower than the lowest so far and replace if so
aic.table[row_j,col_i] = cur_aic
# Add the aic to the table
}
rpanda_best_fit[[tribe]] = min_aic_model
# After all models have been checked for this tribe, store the best fitting one on the list
}
# Retrieve the AIC values from the rpanda results
aic.table %>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F)| Arvicanthini | Hydromyini | Murini | Otomyini | Phloeomyini | Praomyini | Rattini | |
|---|---|---|---|---|---|---|---|
| bcstdcst | 110.63650 | 289.5277 | 50.78988 | -8.000000 | 33.29397 | 42.77105 | 209.6955 |
| bvardcst | 97.15104 | 288.0626 | 54.60092 | -6.000000 | 43.11915 | 41.75441 | 209.6553 |
| bcstdvar | 99.16857 | 291.7122 | 54.71845 | -6.000000 | 43.12934 | 47.57092 | 211.9353 |
| bvardvar | 100.18723 | 290.3143 | 59.83902 | -5.333333 | 73.11915 | 48.95441 | 211.9847 |
###############8.3 Parameter estimates for the best fitting model for each tribe
RPanda estimates lambda (speciation rate) and mu (extinction rate)
param_table = data.frame("dummy"=c(0,0,0,0))
# Initialize the parameters table with a dummy column with one row per model tested
for(tribe in tribes){
best_model = rpanda_best_fit[[tribe]]
# Get the best model for the current tribe
param_table[,tribe] = c(rpanda_results[[tribe]][[best_model]]$lamb_par[1:2],rpanda_results[[tribe]][[best_model]]$mu_par[1:2])
# Add the estimated params for the best model for each tribe to the table
}
param_table = select(param_table, -dummy)
# Remove the dummy column
rownames(param_table) = c("Lambda1", "Lambda2", "Mu1", "Mu2")
# Add row names to the table for each parameter
param_table %>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F)| Arvicanthini | Hydromyini | Murini | Otomyini | Phloeomyini | Praomyini | Rattini | |
|---|---|---|---|---|---|---|---|
| Lambda1 | 0.0942485 | 0.2544161 | 0.2195913 | 0.000000 | 0.1151515 | 0.0329268 | 0.3090729 |
| Lambda2 | 0.4567976 | 0.1095986 | NA | NA | NA | 0.6308453 | 0.1138635 |
| Mu1 | -0.5176374 | 0.0000001 | -0.0000001 | 1.082918 | -0.0000002 | 0.0000000 | 0.0000002 |
| Mu2 | NA | NA | NA | NA | NA | NA | NA |
###############8.4 Estimating total number of species over time
rpanda_plot_df = data.frame(tribe=c(), t=c(), N=c())
# A df to store results for each tribe
for(tribe in tribes){
best_model = rpanda_results[[tribe]][[rpanda_best_fit[[tribe]]]]
# Get the best fitting model for the current tribe
max_div_time = max(branching.times(tribe_subtrees[[tribe]]))
if (!inherits(best_model, "fit.bd")) {
stop("object \"fit.bd\" is not of class \"fit.bd\"")
}
# If the best fitting model isn't an rpanda bd then error out
t = seq(0, max_div_time, 0.01)
# Intervals of time to estimate N
if ("f.mu" %in% attributes(best_model)$names) {
r <- function(t) { -best_model$f.lamb(t) + best_model$f.mu(t) }
} else {
r <- function(t) { -best_model$f.lamb(t) }
}
R <- function(s) { RPANDA:::.Integrate(Vectorize(r), 0, s) }
# Depending on the best model, functions to sample parameters at a given time
num_tips = Ntip(tribe_subtrees[[tribe]])
N = num_tips * exp(Vectorize(R)(t))
# Estimate the number of tips at a given time
rpanda_plot_df = rbind(rpanda_plot_df, data.frame(tribe=tribe, t=-t, N=N))
# Add the data for the current tribe to the results df
}
###############
matched_tribe_cols = c(tribe_cols[2:3], tribe_cols[5:9])
rpanda_p = ggplot(rpanda_plot_df, aes(x=t, y=N, color=tribe)) +
geom_line(size=1) +
xlab("Time (mya)") +
ylab("# of species") +
scale_color_manual(values=matched_tribe_cols) +
bartheme() +
theme(legend.position="bottom")
print(rpanda_p)8.5 Spectral density analysis of murine phylogeny
https://besjournals.onlinelibrary.wiley.com/doi/10.1111/2041-210X.12526
res1 = spectR(labeled_timetree)
plot_spectR(res1)res2 = BICompare(labeled_timetree, res1$eigengap)
plot_BICompare(labeled_timetree, res2)9 Tip introgression with RND
From https://onlinelibrary.wiley.com/doi/10.1111/mec.13610
RND is a measure of sequence divergence between a pair of species corrected for divergence from an outgroup:
\[RND = \frac{d_{XY}}{d_{out}}\]
Where: \(d_{XY}\) is the average distance (\(d_{x,y}\)) between all sequences between the two species \(X\) and \(Y\), and:
\[d_{out} = \frac{d_{XO} + d_{YO}}{2}\]
Since we have two outgroups, \(O_1\) and \(O_2\), each of \(d_{XO}\) and \(d_{XY}\) is also an average of the distances between the two outgroups, e.g.:
\[d_{XO} = \frac{d_{XO_1} + d_{XO_2}}{2}\] Low values of \(d_{XY}\) and/or \(RND\) are indicative of recent introgression.
9.1 Summary of sister pairs
#source("C:/bin/Ks_plots/fitMixEM.R")
#source("C:/bin/Ks_plots/bicWGD.R")
#source("C:/bin/Ks_plots/plotComponentExpectations.R")
# Files for mixture model testing code
rnd_dir = here("docs", "data", "trees", "tip-introgression-rnd")
# The input directory for the dxy files
rnd_sis_dir = paste0(rnd_dir, "/sister/")
sis_files = list.files(rnd_sis_dir)
sis_files = paste0(rnd_sis_dir, sis_files)
# Read the sister tip pairs
rnd_non_sis_dir = paste0(rnd_dir, "/non-sister/")
non_sis_files = list.files(rnd_non_sis_dir)
non_sis_files = paste0(rnd_non_sis_dir, non_sis_files)
# Read the non-sister tip pairs
#rnd_files = c(sis_files, non_sis_files)
#rnd_files = sis_files
#rnd_files = sample(non_sis_files, 10)
rnd_summary_sis = data.frame("pair"=c(), "mean.rnd"=c(), "median.rnd"=c())
# A df for the sister pairs
for(file in sis_files){
cur_data = read.csv(file, header=F)
names(cur_data) = c("gene", "cds.len", "pair.diffs", "x.out1", "x.out2", "y.out1", "y.out2")
# Read the file for the current pair and add headers
comp = tools::file_path_sans_ext(basename(file))
cur_data$pair = comp
# Get the pair name
cur_data$d.xy = cur_data$pair.diffs / cur_data$cds.len
# Calculate dxy between the pair
cur_data$x.out = rowMeans(cur_data[,c('x.out1', 'x.out2')], na.rm=T)
cur_data$y.out = rowMeans(cur_data[,c('y.out1', 'y.out2')], na.rm=T)
# Calculate the average number of differences between each in the pair and both outgroups
cur_data$d.out = rowMeans(cur_data[,c('x.out', 'y.out')], na.rm=T)
# Calculate average dxy to the outgroups
cur_data$rnd = cur_data$d.xy / cur_data$d.out
# Calculate RND as dxy / d_out
#bic.test.wgd(cur_data$rnd, startK = 1, maxK = 2, model = 1, nstarts = 100, outPrefix = comp)
#x = modetest(cur_data$rnd, method="HH")
# Mixture model and multimode testing
p = ggplot(cur_data, aes(x=rnd)) +
geom_histogram(bins=100) +
ggtitle(basename(comp)) +
xlab("RND") +
ylab("# of genes") +
scale_y_continuous(expand=c(0,0)) +
bartheme() +
theme(plot.title=element_text(size=12))
#print(p)
# Histogram of rnd for the current pair
rnd_summary_sis = rbind(rnd_summary_sis, data.frame(pair=comp, mean.rnd=mean(cur_data$rnd, na.rm=T), median.rnd=median(cur_data$rnd, na.rm=T)))
# Summarize the RND distribution for the current pair
}
print(p)rnd_summary_sis$pair <- reorder(rnd_summary_sis$pair, -rnd_summary_sis$median.rnd)
p = ggplot(rnd_summary_sis, aes(x=pair, y=mean.rnd, color="Mean")) +
geom_point() +
xlab("") +
ylab("RND") +
geom_point(aes(y=median.rnd, color="Median")) +
scale_color_manual(values=c("Mean"="#333333", "Median"=corecol(numcol=1))) +
theme(legend.position="bottom") +
coord_flip()
print(p)9.2 Summary of non-sister pairs
Distributions of median RND values for each pair
rnd_non_sis_dir = paste0(rnd_dir, "/non-sister/")
non_sis_files = list.files(rnd_non_sis_dir)
non_sis_files = paste0(rnd_non_sis_dir, non_sis_files)
# Read the non-sister tip pairs
#non_sis_files = sample(non_sis_files, 50)
rnd_summary_non_sis = data.frame("pair"=c(), "mean.rnd"=c(), "median.rnd"=c())
# A df for the sister pairs
for(file in non_sis_files){
cur_data = read.csv(file, header=F)
names(cur_data) = c("gene", "cds.len", "pair.diffs", "x.out1", "x.out2", "y.out1", "y.out2")
# Read the file for the current pair and add headers
comp = tools::file_path_sans_ext(basename(file))
cur_data$pair = comp
# Get the pair name
cur_data$d.xy = cur_data$pair.diffs / cur_data$cds.len
# Calculate dxy between the pair
cur_data$x.out = rowMeans(cur_data[,c('x.out1', 'x.out2')], na.rm=T)
cur_data$y.out = rowMeans(cur_data[,c('y.out1', 'y.out2')], na.rm=T)
# Calculate the average number of differences between each in the pair and both outgroups
cur_data$d.out = rowMeans(cur_data[,c('x.out', 'y.out')], na.rm=T)
# Calculate average dxy to the outgroups
cur_data$rnd = cur_data$d.xy / cur_data$d.out
# Calculate RND as dxy / d_out
rnd_summary_non_sis = rbind(rnd_summary_non_sis, data.frame(pair=comp, mean.rnd=mean(cur_data$rnd, na.rm=T), median.rnd=median(cur_data$rnd, na.rm=T)))
# Summarize the RND distribution for the current pair
}
rnd_summary_sis$type = "Sister pairs"
rnd_summary_non_sis$type = "Non-sister pairs"
rnd_summary = rbind(rnd_summary_sis, rnd_summary_non_sis)
p = ggplot(rnd_summary, aes(x=type, y=median.rnd, group=type, color=type)) +
geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
xlab("") +
ylab("Median RND") +
scale_color_manual(values=corecol(numcol=2, pal="wilke")) +
#geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
#scale_x_continuous(limits=c(0,2)) +
bartheme() +
theme(legend.position="none")
print(p)