Murine species trees
::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
knitr
library(ggplot2)
library(cowplot)
library(ggbeeswarm)
library(dplyr)
library(kableExtra)
library(tidyr)
library(ggtree)
library(phytools)
library(phangorn)
library(reshape2)
library(ggExtra)
library(ggrepel)
library(vroom)
library(ggdist)
library(stringr)
library(ggsignif)
library(phytools)
library(castor)
library(MCMCtreeR)
library(here)
library(stringr)
library(BAMMtools)
library(nlme)
library(RPANDA)
source(here("docs", "scripts", "lib", "design.r"))
source(here("docs", "scripts", "lib", "get_tree_info.r"))
#source("C:/bin/core/r/design.r")
#source("C:/bin/core/r/get_tree_info.r")
#source("C:/Users/grt814/bin/core/corelib/design.r")
#source("C:/Users/grt814/bin/core/get_tree_info.r")
#htmltools::includeHTML("../html-chunks/rmd_nav.html")
1 Full coding species tree
- 188 species targeted capture to mouse exons
- Assembled with SPADES
- 11,795 coding loci aligned with exonerate+MAFFT
- Gene trees inferred with IQ-Tree
- Species tree inferred with ASTRAL
- Branch lengths inferred with IQ-Tree using the 866 loci with a normalized RF distance to the species tree of less than 0.25.
# This chunk handles all of the main inputs and reads the tree
# cat("188 species targeted capture to mouse exons assembled with SPADES.\n")
# cat("11,775 coding loci aligned with exonerate+mafft\n")
# cat("Gene trees inferred with IQtree.\n")
# cat("Species tree inferred with ASTRAL.\n")
# cat("Branch lengths estimated by ASTRAL.\n")
# Data summary
###############
#tree_file = here("docs", "data", "trees", "astral", "concord-rooted", "full_coding_iqtree_astral.cf.labeled.tree")
here("docs", "data", "trees", "astral", "concord-rooted-bl", "full_coding_iqtree_astral_rooted_bl.cf.labeled.tree")
tree_file =# Newick tree file, with tp labels
read.tree(tree_file)
rodent_tree = treeToDF(rodent_tree)
tree_to_df_list = tree_to_df_list[["info"]]
tree_info =# Read the tree and parse with treetoDF
tree_info %>% separate(label, c("tp", "support"), sep=">", remove=F)
tree_info =$tp = paste(tree_info$tp, ">", sep="")
tree_info tree_info %>% separate(support, c("astral", "gcf", "scf"), sep="/", remove=T)
tree_info =# Split the label by /. tp is my treeParse label.
$astral[tree_info$node.type=="tip"] = NA
tree_info# Fill in ASTRAL support as NA for the tips
$astral = as.numeric(tree_info$astral)
tree_info$gcf = as.numeric(tree_info$gcf)
tree_info$scf = as.numeric(tree_info$scf)
tree_info# Convert all supports to numeric
###############
#iq_tree_labels = here("docs", "data", "trees", "astral", "concord-rooted", "full_coding_iqtree_astral.cf.branch")
here("docs", "data", "trees", "astral", "concord-rooted-bl", "full_coding_iqtree_astral_rooted_bl.cf.branch")
iq_tree_labels =# Newick tree file, with iqtree labels
###############
#cf_stat_file = here("docs", "data", "trees", "astral", "concord-rooted", "full_coding_iqtree_astral.cf.stat")
here("docs", "data", "trees", "astral", "concord-rooted-bl", "full_coding_iqtree_astral_rooted_bl.cf.stat")
cf_stat_file = read.table(cf_stat_file, header=T)
cf_stats =# Concordance factor file
###############
#cf_rep_dir = here("docs", "data", "trees", "astral", "concord-rooted", "cf-reps")
here("docs", "data", "trees", "astral", "concord-rooted-bl", "cf-reps")
cf_rep_dir =#delta_outfile = here("docs", "data", "trees", "astral", "concord-rooted", "delta.tab")
here("docs", "data", "trees", "astral", "concord-rooted-bl", "delta.tab")
delta_outfile =# CF reps for delta and delta outfile
###############
here("docs", "data", "trees", "astral", "astral-colonization-branches.txt")
col_file = c("Lophiomys_imhausi_UM5152", "Lophuromys_woosnami_LSUMZ37793", "<1>", "<187>", "<186>")
exclude_branches =# Colonization branches and branches around the root to exclude from some things
###############
here("docs", "data", "trees", "gene-trees", "loci-labeled.treefile")
gt_file =# The file containing the gene trees
###############
#xmax = 31
0.175
xmax =# The max of the x-axis for tree figures
# The node/branch labels in R and IQtree differ. IQtree uses a nice, logical post-ordering
# of internal nodes while R does something random and assigns labels to tips as well. This
# chunk matches those up for the delta analysis later
read.tree(iq_tree_labels)
iq_tree = treeToDF(iq_tree)
iqtree_to_df_list = iqtree_to_df_list[["info"]]
iqtree_info =# Read the IQ tree tree with branch labels in and parse with get_tree_info
#node_convert = matchNodes(tree_to_df_list[["labeled.tree"]], iqtree_to_df_list[["labeled.tree"]], method="descendants")
$iqtree.node = NA
tree_info# Add a column to the main tree table about IQ tree labels
for(i in 1:nrow(tree_info)){
tree_info[i,]$node
cur_node = subset(iqtree_info, node==cur_node)
iqtree_row = iqtree_row$label
iqtree_label =$iqtree.node = iqtree_label
tree_info[i,]
}# For every row in the main tree table, add in the IQ tree node label given that
# we've read the same tree in R and can use the node.labels
1.1 Branches colored by gCF
corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l =# Colors
ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$gcf)) +
gcf_tree = scale_color_continuous(name='gCF', low=l, high=h, limits=c(0,100)) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9))
#gcf_tree = gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
# Option to add branch labels to the figure
print(gcf_tree)
###############
here("docs", "figs", "full-coding-astral-rooted-gcf.png")
fig_outfile =ggsave(fig_outfile, gcf_tree, width=14, height=14, unit="in")
# Save the tree image
## gCF tree
1.2 Branches colored by sCF
corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l =# Colors
ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$scf)) +
scf_tree = scale_color_continuous(name='sCF', low=l, high=h, limits=c(0,100)) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9))
#gcf_tree = gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
# Option to add branch labels to the figure
print(gcf_tree)
###############
here("docs", "figs", "full-coding-astral-rooted-scf.png")
fig_outfile =ggsave(fig_outfile, scf_tree, width=14, height=14, unit="in")
# Save the tree image
## gCF tree
1.3 Gene vs site concordance factors colored by branch support
corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l =# Colors
ggplot(tree_info, aes(x=gcf, y=scf, color=astral)) +
p = geom_point(size=2, alpha=0.5) +
scale_color_continuous(name='Astral support', low=l, high=h, limits=c(0.8,1)) +
bartheme() +
theme(legend.title=element_text(size=12))
print(p)
###############
here("docs", "figs", "full-coding-astral-cf.png")
fig_outfile =ggsave(fig_outfile, p, width=6, height=3, unit="in")
# Save the figure
1.4 Concordance factors vs. branch lengths
ggplot(tree_info, aes(x=branch.length, y=gcf)) +
bl_gcf_p = geom_point(size=3, alpha=0.5) +
#geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
#geom_smooth(method="lm", se=F, ) +
scale_x_continuous(limits=c(0,0.03)) +
xlab("Branch length") +
ylab("gCF per branch") +
bartheme()
ggExtra::ggMarginal(bl_gcf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666")
bl_gcf_p =
ggplot(tree_info, aes(x=branch.length, y=scf)) +
bl_scf_p = geom_point(size=3, alpha=0.5) +
#geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
#geom_smooth(method="lm", se=F, ) +
scale_x_continuous(limits=c(0,0.03)) +
xlab("Branch length") +
ylab("sCF per branch") +
bartheme()
ggExtra::ggMarginal(bl_scf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=5), color="#666666")
bl_scf_p =
plot_grid(bl_gcf_p, bl_scf_p, ncol=2)
p =print(p)
###############
here("docs", "figs", "full-coding-astral-cf-bl.png")
fig_outfile =ggsave(fig_outfile, p, width=6, height=3, unit="in")
# Save the figure
2 Species tree comparisons
2.1 ASTRAL tree vs. concatenated tree
here("docs", "data", "trees", "concat", "full-coding-concat-r-labels.tre")
concat_file = read.tree(concat_file)
concat_tree =# Read the concatenated tree
cophylo(rodent_tree, concat_tree) astral_concat_comp =
## Rotating nodes to optimize matching...
## Done.
# Generate the cophylo object
plot(astral_concat_comp, link.type="curved", link.lwd=4, link.lty="solid", link.col=make.transparent(corecol(numcol=1), 0.5), fsize=c(0.5,0.5))
# Plot the trees
#comparePhylo(rodent_tree, concat_tree, plot=T, force.rooted=T)
2.2 ASTRAL tree vs. mitochondrial tree
here("docs", "data", "trees", "mitochondrial", "concat-concord", "full-mitochondrial-concat.cf.rooted.tree")
mito_file = read.tree(mito_file)
mito_tree =# Read the mitochondrial tree
cophylo(rodent_tree, mito_tree) astral_mito_comp =
## Rotating nodes to optimize matching...
## Done.
# Generate the cophylo object
plot(astral_mito_comp, link.type="curved", link.lwd=4, link.lty="solid", link.col=make.transparent(corecol(numcol=1, offset=1), 0.5), fsize=c(0.5,0.5))
# Plot the trees
3 Gene trees
The 11,774 gene trees contain varying numbers of taxa due to filtering on the alignment and filtering by IQtree for identical sequences.
3.1 Taxa per gene tree
read.table(gt_file, sep="\t")
gene_trees =names(gene_trees) = c("gene", "tree")
# Read the geen tree file
$gene = word(gene_trees$gene, 1, sep="-")
gene_trees$gene = word(gene_trees$gene, 2, sep="/")
gene_trees# Parse out the protein ID from the filename for each gene
###############
data.frame("gene"=c(), "rf"=c(), "num.tips"=c(), "rf.zeros"=c(), "num.tips.zeros"=c())
gt_data =# A df to track info for each gene tree
for(i in 1:nrow(gene_trees)){
read.tree(text=gene_trees[i,]$tree)
gt =# Read the gene tree as a phylo object
gt$tip.label
cur_tips = drop.tip(rodent_tree, rodent_tree$tip.label[-match(cur_tips, rodent_tree$tip.label)])
pruned_tree =# Get a list of the tips and prune the species tree to contain only those tips
RF.dist(pruned_tree, gt, normalize=T)
cur_rf =# Calculate RF between the pruned species tree and the gene tree
if(cur_rf==0){
rbind(gt_data, data.frame("gene"=gene_trees[i,]$gene, "rf"=cur_rf, "num.tips"=length(cur_tips), "rf.zeros"=cur_rf, "num.tips.zeros"=length(cur_tips)))
gt_data =else{
} rbind(gt_data, data.frame("gene"=gene_trees[i,]$gene, "rf"=cur_rf, "num.tips"=length(cur_tips), "rf.zeros"=NA, "num.tips.zeros"=NA))
gt_data =
}# Add the gene tree info to the df, with a special case for when the RF is 0
}# Read all the gene trees
ggplot(gt_data, aes(x=num.tips)) +
p = geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1, offset=6), color="#666666") +
#geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
#geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
scale_y_continuous(expand=c(0,0)) +
xlab("# of taxa") +
ylab("# of gene trees") +
bartheme()
print(p)
###############
here("docs", "figs", "full-coding-gt-taxa-dist.png")
fig_outfile =ggsave(fig_outfile, p, width=6, height=3, unit="in")
# Save the figure
#rf_file = here("docs", "data", "trees", "astral", "concord-rooted", "loci-nrf-below-0.25.txt")
#write.table(subset(gt_data, rf<=0.25), file=rf_file, row.names=F, quote=F, sep="\t")
# Write info for genes with low RF
3.2 Tree distance (RF) between gene trees and species tree
Since RF cannot handle missing taxa, the species tree is pruned for each gene tree to calculate Robinson-Foulds distance. We use the normalized metric since there are varying numbers of tips per gene tree.
ggplot(gt_data, aes(x=rf)) +
p = geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666") +
#geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
#geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
scale_y_continuous(expand=c(0,0)) +
xlab("Normalized RF") +
ylab("# of gene trees") +
bartheme()
print(p)
###############
here("docs", "figs", "full-coding-gt-rf-dist.png")
fig_outfile =ggsave(fig_outfile, p, width=6, height=3, unit="in")
# Save the figure
4 Delta
4.1 Introduction
For each lineage in the species tree with gCF < 95% we calculated the \(\Delta\) statistic (Huson et al. 2005). This statistic follows the same logic as the ABBA-BABA site patterns used to calculate D-statistics, but uses gene tree topologies instead of alignment sites. Briefly, a given branch in an unrooted tree is defined by a quartet of species groupings with two possible discordant topologies, \(D_1\) and \(D_2\) (see Figure 1 from Minh et al. 2020). Under assumptions that discordance is caused by ILS, both discordant topologies should be present in equal proportions. However, if introgression has occurred one discordant topology will appear more frequently than the other. \(\Delta\) is calculated for a branch as follows, using the frequency of each discordant topology (Vanderpool et al. 2020):
\[\Delta = \frac{D_1 - D_2}{D_1 + D_2}\]
This normalized \(\Delta\) calculation ensures that all values are scaled between 0 and 1, with larger values indicating a larger skew towards one topology, and a higher chance that introgression has occurred.
To test whether the observed \(\Delta\) values are skewed significantly from 0 to imply introgression, we performed concordance factor analysis on 1,000 bootstrap replicates of our inferred gene trees to generate a null distribution of \(\Delta\) values. We then calculated Z-scores and p-values and assessed significance for each branch at a threshold of 0.01.
4.2 Null and actual distributions
Nodes with p < 0.01:
$delta.sig = F
tree_info# Add a column to the tree info table about significant delta values
###############
here("docs", "figs", "full-coding-delta-dists.png")
fig_outfile =# The image file to either generate or insert
if(!file.exists(delta_outfile)){
$delta = (abs(cf_stats$gDF1_N - cf_stats$gDF2_N)) / (cf_stats$gDF1_N + cf_stats$gDF2_N)
cf_stats# Calculate the delta values on the actual data
select(cf_stats, ID, delta)
iqtree_delta =names(iqtree_delta) = c("iqtree.node", "delta")
merge(x = tree_info, y = iqtree_delta, by = "iqtree.node", all.x=TRUE)
tree_info =# Adding the delta values to the main tree info table
tree_info[order(tree_info$node), ]
tree_info =# Re-sort the data frame by R node order after the merge so the trees still work
subset(cf_stats, gCF < 95)
low_cf_nodes =# Get the low concordance factor nodes from the data to test with delta
c()
delta_null =for(i in 0:999){
as.character(i)
cur_rep_str =while(nchar(cur_rep_str) < 4){
paste("0", cur_rep_str, sep="")
cur_rep_str =
}# Handling the string of the rep
paste(cf_rep_dir, "/rep", cur_rep_str, ".cf.stat", sep="")
cur_cf_file = read.table(cur_cf_file, header=T, fill=T)
cf_rep =# Read the current reps cf file
$delta = (abs(cf_rep$gDF1_N - cf_rep$gDF2_N)) / (cf_rep$gDF1_N + cf_rep$gDF2_N)
cf_rep c(delta_null, cf_rep$delta)
delta_null =# Calculate delta for this rep and save values in vector
}# Read concordance factors from bootstrap samples of gene trees and calculate delta to generate
# null distribution
###############
data.frame("delta"=delta_null, y="duh")
delta_null_df = subset(delta_null_df, !is.nan(delta))
delta_null_df =# Convert the delta values to a data frame for ggplot
mean(delta_null_df$delta, na.rm=T)
delta_mu = sd(delta_null_df$delta, na.rm=T)
delta_sd =# Calculate the mean and sd of the null distribution to get z-scores and p-values
###############
data.frame("node"=c(), "delta"=c(), "z-score"=c(), "p-value"=c())
delta_out =# Initialize output data frame
for(i in 1:nrow(low_cf_nodes)){
low_cf_nodes[i,]
row = (row$delta - delta_mu) / delta_sd
z = pnorm(-abs(z))
p =if(p < 0.01){
print(paste(row$ID, row$delta, z, p, sep=" "))
$delta.sig[tree_info$iqtree.node==row$ID] = T
tree_info# Set the delta significant to TRUE in the main tree info table
} rbind(delta_out, data.frame("node"=row$ID, "delta"=row$delta, "z-score"=z, "p-value"=p))
delta_out =
}# Calculate z and p for each low gCF node in the species tree and save to output data frame
write.table(file=delta_outfile, delta_out, sep="\t", row.names=F)
# Write the delta values per node to the output file
###############
ggplot(delta_null_df, aes(x=delta)) +
delta_null_p = geom_histogram(color="#ececec", bins=50) +
scale_x_continuous(limits=c(0,1)) +
scale_y_continuous(expand=c(0,0)) +
xlab("Delta") +
ylab("# nodes") +
bartheme()
ggplot(low_cf_nodes, aes(x=delta)) +
delta_actual_p = geom_histogram(fill="#920000", color="#ececec") +
scale_x_continuous(limits=c(0,1)) +
scale_y_continuous(expand=c(0,0)) +
xlab("Delta") +
ylab("# nodes") +
bartheme()
plot_grid(delta_null_p, delta_actual_p, ncol=2, labels=c("Bootstrap distribution", "Actual distribution"), label_size=12)
delta_p =print(delta_p)
###############
ggsave(fig_outfile, delta_p, width=10, height=4, unit="in")
# Save the figure
else{
}$delta = NA
tree_info
read.table(delta_outfile, header=T)
delta_out =for(i in 1:nrow(delta_out)){
delta_out[i,]
row =$delta[tree_info$iqtree.node==row$node] = row$delta
tree_infoif(row$p.value < 0.01){
print(paste(row$node, row$delta, row$z.score, row$p.value, sep=" "))
$delta.sig[tree_info$iqtree.node==row$node] = T
tree_info# Set the delta significant to TRUE in the main tree info table
}
}
::include_graphics(fig_outfile)
knitr
}
## [1] "219 0.406732117812062 2.44516394160885 0.00723931554647087"
## [1] "222 0.407151095732411 2.44879368056455 0.00716677631309369"
## [1] "232 0.622047244094488 4.31050734912284 8.14402076760321e-06"
## [1] "260 0.416606498194946 2.53070883884166 0.00569161487327277"
## [1] "272 0.531951640759931 3.52998048773703 0.000207795157849876"
## [1] "273 0.464877663772691 2.9488972960955 0.00159454968995379"
## [1] "317 0.510204081632653 3.34157446746339 0.000416523334864317"
## [1] "324 0.407725321888412 2.45376838435342 0.00706840016380011"
4.3 Branches by delta
Branches with significant delta are labeled
corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l =
ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$delta)) +
delta_tree = scale_color_continuous(name="Delta", high=h, low=l) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9)) +
geom_label(aes(x=branch, label=ifelse(tree_info$delta.sig,as.character(tree_info$tp),'')), label.size=NA, fill="transparent", show.legend=F, vjust=-0.1)
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(delta_tree)
###############
here("docs", "figs", "full-coding-astral-delta.png")
fig_outfile =ggsave(fig_outfile, delta_tree, width=14, height=14, unit="in")
# Save the tree image
4.4 Branches with significant delta
corecol(numcol=1, pal="wilke", offset=1)
h = corecol(numcol=1, offset=1)
l =
ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$delta.sig)) +
intro_tree = scale_color_manual(name="Significant Delta", labels=c("False", "True"), values=corecol(pal="trek", numcol=2)) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9)) +
geom_label(aes(x=branch, label=ifelse(tree_info$delta.sig,as.character(tree_info$tp),'')), label.size=NA, fill="transparent", show.legend=F, vjust=-0.1)
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(intro_tree)
4.5 Concordance factors and delta
corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l =# Colors
ggplot(tree_info, aes(x=gcf, y=scf, color=delta)) +
p = geom_point(size=2, alpha=0.5) +
geom_text_repel(aes(label=ifelse(delta.sig,as.character(tp),'')), show_guide=F, min.segment.length=0) +
scale_color_continuous(name='Delta', low=l, high=h) +
xlab("gCF") +
ylab("sCF") +
bartheme() +
theme(legend.title=element_text(size=12))
print(p)
###############
here("docs", "figs", "full-coding-delta-cf.png")
fig_outfile =ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure
4.6 Branch length and delta
corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l =# Colors
ggplot(tree_info, aes(x=branch.length, y=delta, color=delta)) +
p = geom_point(size=2, alpha=0.5) +
geom_text_repel(aes(label=ifelse(delta.sig,as.character(tp),'')), show_guide=F, min.segment.length=0) +
scale_x_continuous(limits=c(0,0.03)) +
#scale_color_continuous(name='Delta', low=l, high=h) +
xlab("Branch length") +
ylab("Delta") +
bartheme() +
theme(legend.title=element_text(size=12))
print(p)
###############
here("docs", "figs", "full-coding-delta-bl.png")
fig_outfile =ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure
4.7 Species tree without tips
#no_tip_file = here("docs", "data", "trees", "astral", "concord-rooted", "full_coding_iqtree_astral.cf.labeled.notips.tree")
#no_tip_file = here("docs", "data", "trees", "astral", "concord-rooted-bl", "full_coding_iqtree_astral_rooted_bl.cf.labeled.notips.tree")
here("docs", "data", "trees", "astral", "iqtree-dating", "full-coding-astral-iqtree-dating-anil-mmus-12.1.timetree.reordered.labeled.notips.tre")
no_tip_file =
read.tree(no_tip_file)
tree2 = treeToDF(tree2)
tree_to_df_list = tree_to_df_list[["info"]]
tree2_info =names(tree2_info)[5] = "tp"
# Read the tree and parse with treetoDF
subset(tree_info, tp %in% tree2_info$tp)
tree_info_notip = tree_info_notip %>% select(tp, delta, delta.sig)
tree_info_notip = left_join(tree2_info, tree_info_notip, by="tp")
tree2_info =# Add the delta values for each node to the no tip tree
subset(tree2_info, node.type=="tip")
tree2_tip_info =## No tip tree
###############
drop.tip(tree2, "<1>")
tree3 = treeToDF(tree3)
tree_to_df_list = tree_to_df_list[["info"]]
tree3_info =names(tree3_info)[5] = "tp"
# Read the tree and parse with treetoDF
subset(tree_info, tp %in% tree2_info$tp)
tree_info_notip = tree_info_notip %>% select(tp, delta, delta.sig)
tree_info_notip = left_join(tree3_info, tree_info_notip, by="tp")
tree3_info =# Add the delta values for each node to the no tip tree
subset(tree3_info, node.type=="tip")
tree3_tip_info =## No tip tree, no outgroup
###############
corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l =
25
no_tip_max =#no_tip_max = 0.13
ggtree(tree2, size=0.8, ladderize=F, aes(color=tree2_info$delta)) +
delta_tree = scale_color_continuous(name="Delta", high=h, low=l) +
xlim(0, no_tip_max) +
geom_tiplab(color="#333333", fontface='italic', size=5) +
theme(legend.position=c(0.05,0.9))
#geom_label(aes(x=branch, label=ifelse(tree_info$delta.sig,as.character(tree_info$tp),'')), label.size=NA, fill="transparent", show.legend=F, vjust=-0.1)
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(delta_tree)
###############
here("docs", "figs", "full-coding-astral-delta-notips.png")
fig_outfile =ggsave(fig_outfile, delta_tree, width=12, height=12, unit="in")
# Save the tree image
4.8 Phylogenetic signal of delta (Pagel’s lambda, Bloomberg K)
# Generally helpful: https://lukejharmon.github.io/pcm/chapter6_beyondbm/
phylosig(tree2, tree2_tip_info$delta, test=T)
## [1] "x has no names; assuming x is in the same order as tree$tip.label"
## [1] "some data in x given as 'NA', dropping corresponding species from tree"
##
## Phylogenetic signal K : 0.222633
## P-value (based on 1000 randomizations) : 0.211
# Bloomberg K
phylosig(tree2, tree2_tip_info$delta, method="lambda", test=T)
## [1] "x has no names; assuming x is in the same order as tree$tip.label"
## [1] "some data in x given as 'NA', dropping corresponding species from tree"
##
## Phylogenetic signal lambda : 6.6107e-05
## logL(lambda) : 52.31
## LR(lambda=0) : -0.00215541
## P-value (based on LR test) : 1
#Pagel's lambda
###############
#new_tree = drop.tip(no_tip_tree, "<1>")
#new_info = subset(new_tip_info, tp != "<1>")
#a = data.frame(x=new_info$branch.length, y=new_info$delta, names=new_info$tp, row.names=new_info$tp)
#picModel <- pic.pgls(formula = y ~ x, phy = new_tree, y = a, lambda = "ML", return.intercept.stat = FALSE)
# PGLS.. not sure how to interpret
# https://cran.r-project.org/web/packages/motmot/vignettes/motmot.html#fast-estimation-of-phylogenetic-generalised-least-squares
# Or maybe this is better: https://lukejharmon.github.io/ilhabela/instruction/2015/07/03/PGLS/
###############
4.9 Phylogenetic signal of delta (Pagel’s lambda, Bloomberg K) for clades with high speciation rates (see below)
4.9.1 Rattini
# Generally helpful: https://lukejharmon.github.io/pcm/chapter6_beyondbm/
tree2
tree2_rattini =$tip.label = paste(tree2_rattini$tip.label, "/", subset(tree2_info, node.type == "tip")$delta, sep="")
tree2_rattini$node.label = paste(tree2_rattini$node.label, "/", subset(tree2_info, node.type != "tip")$delta, sep="")
tree2_rattini# Add delta values to the labels of the tree with no tips
extract.clade(tree2_rattini, tree2_info$node[tree2_info$tp=="<169>"])
tree2_rattini = treeToDF(tree2_rattini)
tree_to_df_list = tree_to_df_list[["info"]]
tree2_rattini_info =# Get the subtree and parse to df
tree2_rattini_info %>% separate(label, c("tp", "delta"), sep="/", remove=F)
tree2_rattini_info =$delta = as.numeric(tree2_rattini_info$delta)
tree2_rattini_info# Split the label by / to get the delta value
subset(tree2_rattini_info, node.type=="tip")
tree2_rattini_tip_info =# Get only the tip info for the tests
phylosig(tree2_rattini, tree2_rattini_tip_info$delta, test=T)
## [1] "x has no names; assuming x is in the same order as tree$tip.label"
##
## Phylogenetic signal K : 0.634626
## P-value (based on 1000 randomizations) : 0.129
# Bloomberg K
phylosig(tree2_rattini, tree2_rattini_tip_info$delta, method="lambda", test=T)
## [1] "x has no names; assuming x is in the same order as tree$tip.label"
##
## Phylogenetic signal lambda : 6.6107e-05
## logL(lambda) : 12.2344
## LR(lambda=0) : -0.00023808
## P-value (based on LR test) : 1
#Pagel's lambda
4.9.2 Hydromyini
# Generally helpful: https://lukejharmon.github.io/pcm/chapter6_beyondbm/
tree2
tree2_rattini =$tip.label = paste(tree2_rattini$tip.label, "/", subset(tree2_info, node.type == "tip")$delta, sep="")
tree2_rattini$node.label = paste(tree2_rattini$node.label, "/", subset(tree2_info, node.type != "tip")$delta, sep="")
tree2_rattini# Add delta values to the labels of the tree with no tips
extract.clade(tree2_rattini, tree2_info$node[tree2_info$tp=="<177>"])
tree2_rattini = treeToDF(tree2_rattini)
tree_to_df_list = tree_to_df_list[["info"]]
tree2_rattini_info =# Get the subtree and parse to df
tree2_rattini_info %>% separate(label, c("tp", "delta"), sep="/", remove=F)
tree2_rattini_info =$delta = as.numeric(tree2_rattini_info$delta)
tree2_rattini_info# Split the label by / to get the delta value
subset(tree2_rattini_info, node.type=="tip")
tree2_rattini_tip_info =# Get only the tip info for the tests
phylosig(tree2_rattini, tree2_rattini_tip_info$delta, test=T)
## [1] "x has no names; assuming x is in the same order as tree$tip.label"
##
## Phylogenetic signal K : 0.505996
## P-value (based on 1000 randomizations) : 0.455
# Bloomberg K
phylosig(tree2_rattini, tree2_rattini_tip_info$delta, method="lambda", test=T)
## [1] "x has no names; assuming x is in the same order as tree$tip.label"
##
## Phylogenetic signal lambda : 6.6107e-05
## logL(lambda) : 15.8495
## LR(lambda=0) : -0.000279124
## P-value (based on LR test) : 1
#Pagel's lambda
4.10 Phylogenetic independent contrasts of delta
PIC are basically the expected differences in trait values between pairs of branches descending from a node in a tree corrected for expected variance, which is just the sum of the branch lengths. For example, for a given node \(N\) in a tree, the PIC will be:
\[PIC(N) = \frac{V_1 - V_2}{L_1 + L_2}\]
With a post-order traversal of the tree, PICs can be calculated for each node in the tree. Generally, for internal nodes, a trait value \(V\) is assigned after its PIC has been calculated so it can be used in calculation of the PIC of its ancestor.
\[V = \frac{(1/L_1)V_1+(1/L_2)V_2}{1/L_1+1/L_2}\]
The branch is also lengthened to account for possible error in \(V\).
Finally, the rate of trait evolution \(R\) is calculated as
\[R = \frac{\sum ^{n}{PIC_n}}{n}\]
where \(n\) is the number of internal nodes in the tree and \(PIC_n\) is the PIC calculated for that node.
However, we have no need to estimate values \(V\) at internal nodes in our tree since we have delta values for each branch. Here I compare both the contrasts themselves and the rate estimated from them when using the inferred ancestral states or the observed states in our tree.
# Helpful:
# https://bio.libretexts.org/Bookshelves/Evolutionary_Developmental_Biology/Phylogenetic_Comparative_Methods_(Harmon)/04%3A_Fitting_Brownian_Motion/4.02%3A_Estimating_Rates_using_Independent_Contrasts
# https://lukejharmon.github.io/ilhabela/instruction/2015/07/02/phylogenetic-independent-contrasts/
pic(tree3_tip_info$delta, tree3, var.contrasts=T, rescaled.tree=T)
inferred_pic =# Compute the contrasts based on the 'tip' values from the tree
sum(inferred_pic$contr[,1]^2)/nrow(inferred_pic$contr)
inferred_rate =# Compute the rate from the inferred constrasts
###############
$pic = NA
tree2_info$inferred.pic = NA
tree2_info# Columns to add to the tree info
0
num_contrasts =# Keep track of the number of contrasts done to calculate rate later
for(i in 1:nrow(tree2_info)){
tree2_info[i,]
row =if(row$node.type == "tip" || row$node.type=="ROOT"){
next
}# Skip tips and the root since we can't calculate on them
$inferred.pic[tree2_info$tp==row$tp] = inferred_pic[["contr"]][,1][[row$tp]]
tree2_info# Add the inferred pic to the tree info df here
getDescendants(tree2, row$node)
desc = tree2_info[tree2_info$node==desc[1],]
desc1 = tree2_info[tree2_info$node==desc[2],]
desc2 =# Get the descendants of the current node
(desc1$delta - desc2$delta) / (desc1$branch.length + desc2$branch.length)
pic =# Calculate the contrast based on the descendant deltas and branch lenghts
$pic[tree2_info$node==row$node] = pic
tree2_info# Add the contrast to the tree info df
num_contrasts + 1
num_contrasts =# Increment number of contrasts
}
sum(tree2_info$pic^2, na.rm=t)/num_contrasts
observed_rate =# Compute the rate from the observed contrasts
## Compute contrasts from observed delta
###############
subset(tree2_info, node.type != "tip" & !is.na(delta))
plot_info =
#a = subset(no_tip_info, node.type != "tip")
select(plot_info, tp, pic, delta.sig)
a =$label = "Observed"
a
select(plot_info, tp, inferred.pic, delta.sig)
b =names(b)[2] = "pic"
$label = "Inferred"
b
rbind(a,b)
z =
list(c("Observed", "Inferred"))
x_comps =
ggplot(data=z, aes(x=label, y=pic, group=label, color=delta.sig)) +
p = geom_quasirandom(size=2, width=0.25, alpha=0.30) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1, test="ks.test") +
#scale_y_continuous(limits=c(0,0.7)) +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=6), corecol(numcol=1, pal="wilke", offset=4))) +
xlab("") +
ylab("PIC") +
bartheme() +
theme(axis.text.x=element_text(angle=15, hjust=1),
plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm")) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(p)
cat("KS test. Note that distributions are not significantly different with Wilcox test\n")
## KS test. Note that distributions are not significantly different with Wilcox test
###############
here("docs", "figs", "full-coding-delta-pic.png")
fig_outfile =ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure
###############
data.frame("Rate type"=c("Inferred", "Observed"), "Rate"=c(inferred_rate, observed_rate))
rate_table =%>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F) rate_table
Rate.type | Rate |
---|---|
Inferred | 0.0047570 |
Observed | 0.0095405 |
# Table to display rates
4.11 Ancestral reconstruction of delta values
#new_tree = drop.tip(no_tip_tree, "<1>")
# tree_to_df_list = treeToDF(tree2_no_out)
# tree2_no_out_ = tree_to_df_list[["info"]]
# names(new_tree_info)[5] = "tp"
# Drop the outgroup and re-read the tree
#new_info = subset(new_tip_info, tp != "<1>")
###############
data.frame(delta=tree3_tip_info$delta, row.names=tree3_tip_info$tp)
a = as.matrix(a)[,1]
b = fastAnc(tree3, b, vars=T,CI=T)
fit =# Transform delta values from tips and infer ancestral states
#obj<-contMap(new_tree,b,plot=FALSE)
#plot(obj,type="fan",legend=0.7*max(nodeHeights(new_tree)),
# fsize=c(0.7,0.9))
# See http://www.phytools.org/eqg2015/asr.html
###############
$delta.recon = NA
tree2_info$delta.lower = NA
tree2_info$delta.upper = NA
tree2_infofor(i in 1:nrow(tree2_info)){
tree2_info[i,]
row =if(row$node.type == "tip" || row$node.type=="ROOT"){
next
}
row$tp
tp_node = row$node
node = tree3_info$node[tree3_info$tp==tp_node]
new_node =
$delta.recon[tree2_info$tp==row$tp] = fit[["ace"]][[as.character(new_node)]]
tree2_info$delta.lower[tree2_info$tp==row$tp] = fit[["CI95"]][as.character(new_node),1]
tree2_info$delta.upper[tree2_info$tp==row$tp] = fit[["CI95"]][as.character(new_node),2]
tree2_info
}# Add the inferred delta values to the main data frame for the no tip tree
###############
# delta_tree = ggtree(fit$rescaled.tree, size=0.8, ladderize=F) +
# #scale_color_continuous(name="Delta", high=h, low=l) +
# xlim(0, 0.2) +
# geom_tiplab(color="#333333", fontface='italic', size=5) +
# theme(legend.position=c(0.05,0.9))
# #geom_label(aes(x=branch, label=ifelse(tree_info$delta.sig,as.character(tree_info$tp),'')), label.size=NA, fill="transparent", show.legend=F, vjust=-0.1)
# #geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
# #geom_nodepoint(color="#666666", alpha=0.85, size=4)
# print(delta_tree)
corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l =
25
no_tip_max =#no_tip_max = 0.13
ggtree(tree2, size=0.8, ladderize=F, aes(color=tree2_info$delta.recon)) +
delta_tree = scale_color_continuous(name="Delta", high=h, low=l) +
xlim(0, no_tip_max) +
geom_tiplab(color="#333333", fontface='italic', size=5) +
theme(legend.position=c(0.05,0.9))
#geom_label(aes(x=branch, label=ifelse(tree_info$delta.sig,as.character(tree_info$tp),'')), label.size=NA, fill="transparent", show.legend=F, vjust=-0.1)
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(delta_tree)
# Tree with ancestral reconstructions of delta
###############
here("docs", "figs", "full-coding-astral-delta-recon-notips.png")
fig_outfile =ggsave(fig_outfile, delta_tree, width=12, height=12, unit="in")
# Save the tree image
# w = subset(tree2_info, node.type=="tip")
# w = select(w, tp, delta)
# w$label = "delta.recon"
subset(tree2_info, node.type != "tip")
x = select(x, tp, delta, delta.sig)
x =$delta.sig = ifelse(x$delta.sig, "Yes", "No")
x$label = "Observed"
x
select(tree2_info, tp, delta.recon, delta.sig)
y =names(y)[2] = "delta"
$delta.sig = ifelse(y$delta.sig, "Yes", "No")
y$label = "Simulated"
y
rbind(x,y)
z =# Convert to long with two categories
###############
list(c("Observed", "Simulated"))
x_comps =
ggplot(z, aes(x=label, y=delta, group=label, color=delta.sig)) +
p = geom_quasirandom(size=3, width=0.25, alpha=0.5) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
ylab("Delta") +
xlab("") +
scale_color_manual("Significant observed delta?", values=c(corecol(numcol=1, pal="wilke", offset=3), corecol(numcol=1, pal="wilke", offset=4))) +
bartheme() +
theme(plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm"))
print(p)
###############
here("docs", "figs", "full-coding-delta-recon.png")
fig_outfile =ggsave(fig_outfile, p, width=6, height=5, unit="in")
# Save the figure
###############
ggplot(tree2_info, aes(x=delta.recon, delta)) +
p = geom_point(size=2, alpha=0.5) +
geom_smooth(method="lm") +
bartheme()
print(p)
here("docs", "figs", "full-coding-delta-recon-corr.png")
fig_outfile =ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure
###############
subset(tree2_info, node.type=="internal" & !tp %in% exclude_branches)
tree2_internal_info = ggplot(tree2_internal_info, aes(x=delta.recon, y=node)) +
p = geom_point(aes(color="Simulated (95% CI)"), size=2, alpha=0.7) +
geom_segment(aes(x=delta.lower, xend=delta.upper, y=node, yend=node)) +
geom_point(aes(x=delta, y=node, color="Observed"), size=2, alpha=0.7) +
scale_color_manual(values=c("Simulated (95% CI)"="black", "Observed"="red")) +
xlab("Delta") +
ylab("Node") +
bartheme() +
theme(legend.position="bottom")
print(p)
$within.bounds = 0
tree2_internal_infofor(i in 1:nrow(tree2_internal_info)){
if(tree2_internal_info[i,]$delta >= tree2_internal_info[i,]$delta.lower && tree2_internal_info[i,]$delta <= tree2_internal_info[i,]$delta.upper){
$within.bounds = 1
tree2_internal_info[i,]
}
}
here("docs", "figs", "full-coding-delta-recon-ci.png")
fig_outfile =ggsave(fig_outfile, p, width=4, height=6, unit="in")
# Save the figure
4.12 Delta differences to ancestors (observed)
My own attempt at some sort of phylogenetic independent comparison.
For each branch, compute the difference in the value delta between it and its ancestor and the difference between it and a random branch in the tree. Repeat random sampling as needed.
subset(tree2_info, !is.na(delta))
delta_info =# Get info only from branches with delta values
#delta_diffs = data.frame("ref.node"=c(), "ref.delta"=c(), "query.node"=c(), "query.delta"=c(), "query.type"=c(), "delta.diff"=c())
data.frame("ref.node"=c(), "ref.delta"=c(), "sig.ref.delta"=c(), "query.node"=c(), "query.delta"=c(), "query.type"=c(), "delta.diff"=c())
delta_anc_diffs = data.frame("ref.node"=c(), "ref.delta"=c(), "sig.ref.delta"=c(), "query.node"=c(), "query.delta"=c(), "query.type"=c(), "delta.diff"=c(), "replicate"=c())
delta_random_diffs =# Data frames to save delta diffs
for(i in 1:nrow(delta_info)){
# Loop over every branch with delta
delta_info[i,]$node
node = delta_info[i,]$delta
delta = delta_info[i,]$delta.sig
sig_delta = "N"
sig_label =
if(sig_delta){
"Y"
sig_label =
}# Get the delta value and other info from the branch
Ancestors(tree2, node, type="parent")
anc =if(tree2_info[tree2_info$node==node,]$tp %in% exclude_branches || tree2_info[tree2_info$node==anc,]$tp %in% exclude_branches){
next
}# Get the ancestral node, but move on if it is an excluded branch
delta_info[delta_info$node==anc,]$delta
anc_delta =if(delta_info[delta_info$node==anc,]$delta.sig){
"Ancestor Y"
sig_label =
}# Get the delta info from the ancestor
#delta_info[delta_info$node==node,]$delta.diff = abs(delta - anc_delta)
rbind(delta_anc_diffs, data.frame("ref.node"=node, "ref.delta"=delta, "sig.ref.delta"=sig_label, "query.node"=anc, "query.delta"=anc_delta, "query.type"="Ancestor", "delta.diff"=abs(delta-anc_delta)))
delta_anc_diffs =# Add the info of the comparison of this node to its ancestor to the anc delta diff data frame
for(j in 1:100){
# Select random branches
anc
random_anc =while(random_anc == anc || random_anc == node){
delta_info[sample(nrow(delta_info),1),]
random_row = random_row$node
random_anc =
}# Select a random branch in the tree as long as it is not this branch or its ancestor
rbind(delta_random_diffs, data.frame("ref.node"=node, "ref.delta"=delta, "sig.ref.delta"=sig_label, "query.node"=random_anc, "query.delta"=random_row$delta, "query.type"="Random branch", "delta.diff"=abs(delta-random_row$delta), "replicate"=j))
delta_random_diffs =# Add the info of the comparison of this node to the random branch to the random delta diff data frame
}
#delta_info[delta_info$node==node,]$random.diff = abs(delta - random_row$delta)
}
###############
list(c("Ancestor", "Random branch"))
x_comps =# List of comparisons to make
subset(delta_random_diffs, replicate==1)
delta_random_single = select(delta_random_single, -replicate)
delta_random_single =$query.type = "Random branch"
delta_random_single rbind(delta_anc_diffs, delta_random_single)
delta_diffs_single =# Get the first replicate, add label, and combine with anc diffs
ggplot(data=delta_diffs_single, aes(x=query.type, y=delta.diff, group=query.type, color=sig.ref.delta)) +
delta_diff_single_p = geom_quasirandom(size=2, width=0.25, alpha=0.30) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
scale_y_continuous(limits=c(0,0.7)) +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=4), corecol(numcol=1, pal="wilke", offset=3), "#920000")) +
ggtitle("Single random sample") +
xlab("") +
ylab("Delta difference") +
bartheme() +
theme(axis.text.x=element_text(angle=15, hjust=1),
plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm")) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(delta_diff_single_p)
# Distribution of delta diffs to ancestor and one random branch
###############
list(c("Ancestor", "Random branch"))
x_comps =# List of comparisons to make
delta_random_diffs %>% group_by(ref.node, query.type, sig.ref.delta) %>% summarize(delta.diff=median(delta.diff))
delta_random_meds = rbind(select(delta_anc_diffs, ref.node, query.type, sig.ref.delta, delta.diff), delta_random_meds)
delta_diffs_meds =# Calculate median of all random branch diffs and combine with anc diffs
ggplot(data=delta_diffs_meds, aes(x=query.type, y=delta.diff, group=query.type, color=sig.ref.delta)) +
delta_diff_med_p = geom_quasirandom(size=2, width=0.25, alpha=0.30) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
scale_y_continuous(limits=c(0,0.7)) +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=4), corecol(numcol=1, pal="wilke", offset=3), "#920000")) +
#ggtitle("Difference between delta and ancestral delta / average after 100 replicates of random sampling") +
ggtitle("Median after 100 random samples") +
xlab("") +
ylab("Delta difference") +
bartheme() +
theme(axis.text.x=element_text(angle=15, hjust=1),
plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm")) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(delta_diff_med_p)
# Distribution of delta diffs to ancestor and median over many random branches
###############
list(c("Ancestor", "Random branch"))
x_comps =# List of comparisons to make
delta_random_diffs %>% group_by(ref.node, query.type, sig.ref.delta) %>% summarize(delta.diff=mean(delta.diff))
delta_random_avgs =$query.type = "Random branch"
delta_random_avgs rbind(select(delta_anc_diffs, ref.node, query.type, sig.ref.delta, delta.diff), delta_random_avgs)
delta_diffs_avgs =# Calculate average of all random branch diffs and combine with anc diffs
ggplot(data=delta_diffs_avgs, aes(x=query.type, y=delta.diff, group=query.type, color=sig.ref.delta)) +
delta_diff_avg_p = geom_quasirandom(size=2, width=0.25, alpha=0.30) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
scale_y_continuous(limits=c(0,0.7)) +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=4), corecol(numcol=1, pal="wilke", offset=3), "#920000")) +
#ggtitle("Difference between delta and ancestral delta / average after 100 replicates of random sampling") +
ggtitle("Mean after 100 random samples") +
xlab("") +
ylab("Delta difference") +
bartheme() +
theme(axis.text.x=element_text(angle=15, hjust=1),
plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm")) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(delta_diff_avg_p)
# Distribution of delta diffs to ancestor and average over many random branches
###############
get_legend(delta_diff_single_p)
p_legend = plot_grid(delta_diff_single_p + theme(legend.position="none"),
p_grid =+ theme(legend.position="none"),
delta_diff_med_p + theme(legend.position="none"),
delta_diff_avg_p ncol=3)
plot_grid(p_grid, p_legend, nrow=2, rel_heights=c(1,0.1), align='vh')
p =
here("docs", "figs", "full-coding-delta-anc-diffs.png")
fig_outfile =ggsave(fig_outfile, p, width=12, height=4, unit="in")
# Save the figure
4.13 Delta differences to ancestors (reconstructed)
The same comparisons, but using values of delta reconstructed via a Brownian motion model
subset(tree2_info, !is.na(delta.recon))
delta_info =# Get info only from branches with delta values
#delta_diffs = data.frame("ref.node"=c(), "ref.delta"=c(), "query.node"=c(), "query.delta"=c(), "query.type"=c(), "delta.diff"=c())
data.frame("ref.node"=c(), "ref.delta"=c(), "sig.ref.delta"=c(), "query.node"=c(), "query.delta"=c(), "query.type"=c(), "delta.diff"=c())
delta_anc_diffs = data.frame("ref.node"=c(), "ref.delta"=c(), "sig.ref.delta"=c(), "query.node"=c(), "query.delta"=c(), "query.type"=c(), "delta.diff"=c(), "replicate"=c())
delta_random_diffs =# Data frames to save delta diffs
for(i in 1:nrow(delta_info)){
# Loop over every branch with delta
delta_info[i,]$node
node = delta_info[i,]$delta.recon
delta = delta_info[i,]$delta.sig
sig_delta = "N"
sig_label =
if(sig_delta){
"Y"
sig_label =
}# Get the delta value and other info from the branch
Ancestors(tree2, node, type="parent")
anc =if(tree2_info[tree2_info$node==node,]$tp %in% exclude_branches || tree2_info[tree2_info$node==anc,]$tp %in% exclude_branches){
next
}# Get the ancestral node, but move on if it is an excluded branch
delta_info[delta_info$node==anc,]$delta.recon
anc_delta =if(delta_info[delta_info$node==anc,]$delta.sig){
"Ancestor Y"
sig_label =
}# Get the delta info from the ancestor
#delta_info[delta_info$node==node,]$delta.diff = abs(delta - anc_delta)
rbind(delta_anc_diffs, data.frame("ref.node"=node, "ref.delta"=delta, "sig.ref.delta"=sig_label, "query.node"=anc, "query.delta"=anc_delta, "query.type"="Ancestor", "delta.diff"=abs(delta-anc_delta)))
delta_anc_diffs =# Add the info of the comparison of this node to its ancestor to the anc delta diff data frame
for(j in 1:100){
# Select random branches
anc
random_anc =while(random_anc == anc || random_anc == node){
delta_info[sample(nrow(delta_info),1),]
random_row = random_row$node
random_anc =
}# Select a random branch in the tree as long as it is not this branch or its ancestor
rbind(delta_random_diffs, data.frame("ref.node"=node, "ref.delta"=delta, "sig.ref.delta"=sig_label, "query.node"=random_anc, "query.delta"=random_row$delta, "query.type"="Random branch", "delta.diff"=abs(delta-random_row$delta), "replicate"=j))
delta_random_diffs =# Add the info of the comparison of this node to the random branch to the random delta diff data frame
}
#delta_info[delta_info$node==node,]$random.diff = abs(delta - random_row$delta)
}
###############
list(c("Ancestor", "Random branch"))
x_comps =# List of comparisons to make
subset(delta_random_diffs, replicate==1)
delta_random_single = select(delta_random_single, -replicate)
delta_random_single =$query.type = "Random branch"
delta_random_single rbind(delta_anc_diffs, delta_random_single)
delta_diffs_single =# Get the first replicate, add label, and combine with anc diffs
ggplot(data=delta_diffs_single, aes(x=query.type, y=delta.diff, group=query.type, color=sig.ref.delta)) +
delta_diff_single_p = geom_quasirandom(size=2, width=0.25, alpha=0.30) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
scale_y_continuous(limits=c(0,0.7)) +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=4), corecol(numcol=1, pal="wilke", offset=3), "#920000")) +
ggtitle("Single random sample") +
xlab("") +
ylab("Delta difference") +
bartheme() +
theme(axis.text.x=element_text(angle=15, hjust=1),
plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm")) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(delta_diff_single_p)
# Distribution of delta diffs to ancestor and one random branch
###############
list(c("Ancestor", "Random branch"))
x_comps =# List of comparisons to make
delta_random_diffs %>% group_by(ref.node, query.type, sig.ref.delta) %>% summarize(delta.diff=median(delta.diff))
delta_random_meds = rbind(select(delta_anc_diffs, ref.node, query.type, sig.ref.delta, delta.diff), delta_random_meds)
delta_diffs_meds =# Calculate median of all random branch diffs and combine with anc diffs
ggplot(data=delta_diffs_meds, aes(x=query.type, y=delta.diff, group=query.type, color=sig.ref.delta)) +
delta_diff_med_p = geom_quasirandom(size=2, width=0.25, alpha=0.30) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
scale_y_continuous(limits=c(0,0.7)) +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=4), corecol(numcol=1, pal="wilke", offset=3), "#920000")) +
#ggtitle("Difference between delta and ancestral delta / average after 100 replicates of random sampling") +
ggtitle("Median after 100 random samples") +
xlab("") +
ylab("Delta difference") +
bartheme() +
theme(axis.text.x=element_text(angle=15, hjust=1),
plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm")) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(delta_diff_med_p)
# Distribution of delta diffs to ancestor and median over many random branches
###############
list(c("Ancestor", "Random branch"))
x_comps =# List of comparisons to make
delta_random_diffs %>% group_by(ref.node, query.type, sig.ref.delta) %>% summarize(delta.diff=mean(delta.diff))
delta_random_avgs =$query.type = "Random branch"
delta_random_avgs rbind(select(delta_anc_diffs, ref.node, query.type, sig.ref.delta, delta.diff), delta_random_avgs)
delta_diffs_avgs =# Calculate average of all random branch diffs and combine with anc diffs
ggplot(data=delta_diffs_avgs, aes(x=query.type, y=delta.diff, group=query.type, color=sig.ref.delta)) +
delta_diff_avg_p = geom_quasirandom(size=2, width=0.25, alpha=0.30) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
scale_y_continuous(limits=c(0,0.7)) +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=4), corecol(numcol=1, pal="wilke", offset=3), "#920000")) +
#ggtitle("Difference between delta and ancestral delta / average after 100 replicates of random sampling") +
ggtitle("Mean after 100 random samples") +
xlab("") +
ylab("Delta difference") +
bartheme() +
theme(axis.text.x=element_text(angle=15, hjust=1),
plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm")) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(delta_diff_avg_p)
# Distribution of delta diffs to ancestor and average over many random branches
###############
get_legend(delta_diff_single_p)
p_legend = plot_grid(delta_diff_single_p + theme(legend.position="none"),
p_grid =+ theme(legend.position="none"),
delta_diff_med_p + theme(legend.position="none"),
delta_diff_avg_p ncol=3)
plot_grid(p_grid, p_legend, nrow=2, rel_heights=c(1,0.1), align='vh')
p =
here("docs", "figs", "full-coding-delta-anc-diffs-recon.png")
fig_outfile =ggsave(fig_outfile, p, width=12, height=4, unit="in")
# Save the figure
5 Delta spatial patterns
Here, we look for evidence of introgression along the genome by calculating the average size of blocks of genes that share the same topology. For all genes that form a block of topologies on a given branch (e.g. all genes in a row that share the same topology), we subtract the starting position of the final gene from the starting position of the first gene in the block.
We then look at the sizes of blocks around each branch for each topology: 1. the concordant topology, 2. the major discordant topology (the one that is most frequent), and 3. the minor discordant topology.
Specifically, we take the difference in the average block size of the major and minor topologies to measure if the skew in gene tree topologies is observable in different areas of the genome.
here("docs", "data", "trees", "astral", "concord-rooted-bl", "topo-counts.tab")
topo_file = read.table(topo_file, header=T)
topo_counts = subset(topo_counts, !tp.node %in% exclude_branches)
topo_counts = subset(topo_counts, mchr != "X" & mchr != "Y")
topo_counts =# Read the topo counts for each gene for each branch
###############
here("docs", "data", "trees", "astral", "concord-rooted-bl", "blocks.tab")
block_outfile =# A file to save the block sizes to
###############
$depth = node.depth(rodent_tree)
tree_info#tree_info$depth.bl = get_all_node_depths(rodent_tree)
$depth.edge = NA
tree_info$depth.bl = NA
tree_info
$depth.edge[189:nrow(tree_info)] = get_all_node_depths(rodent_tree, as_edge_count=T)
tree_info$depth.bl[189:nrow(tree_info)] = get_all_node_depths(rodent_tree)
tree_info# Depths of branches in the tree relative to tips
###############
subset(tree_info, node.type!="tip" & !tp %in% exclude_branches)
topo_tree_info = cf_stats
topo_cf_stats =names(topo_cf_stats)[1] = "node"
merge(topo_cf_stats, select(topo_tree_info, node, node.type, branch.length, depth, depth.bl, depth.edge, tp, delta, delta.sig), by="node")
topo_cf_stats =# Add the tree info to the cf table
$gD_prev = ifelse(topo_cf_stats$gDF1_N > topo_cf_stats$gDF2_N, "d1", "d2")
topo_cf_stats# Identify the prevalant discorant topology for each branch
if(!file.exists(block_outfile)){
data.frame("tp"=c(), "num.genes.c"=c(), "num_genes_d1"=c(), "num_genes_d2"=c(), "num.genes.m"=c(),
block_data ="c.avg.block.size"=c(), "c.avg.block.count"=c(), "c.num.blocks"=c(),
"d1.avg.block.size"=c(), "d1.avg.block.count"=c(), "d1.num.blocks"=c(),
"d2.avg.block.size"=c(), "d2.avg.block.count"=c(), "d2.num.blocks"=c(),
"m.avg.block.size"=c(), "m.avg.block.count"=c(), "m.num.blocks"=c())
# Block sizes for each topology around each branch
for(node in levels(as.factor(topo_counts$tp.node))){
# Loop over every node in the tree
0
c_blocks = c()
c_block_sizes = c()
c_block_counts = 0
d1_blocks = c()
d1_block_sizes = c()
d1_block_counts = 0
d2_blocks = c()
d2_block_sizes = c()
d2_block_counts = 0
m_blocks = c()
m_block_sizes = c()
m_block_counts =# Running tallies and lists of block sizes for each topology to be averaged
# over all chromosomes
0
num_genes_c = 0
num_genes_d1 = 0
num_genes_d2 = 0
num_genes_m =
for(chr in levels(as.factor(topo_counts$mchr))){
# Loop over every mouse chromosome
#print(paste(node, chr))
subset(topo_counts, tp.node==node & mchr==chr)
chrdata =# Subset topo counts from this chromosome and node
chrdata[order(chrdata$mstart), ]
chrdata =# Order the genes by their starting position on the chrome
$row.num = seq_along(chrdata[,1])
chrdata# Add the row number as a column
T
first = NA
cur_topo = 0
num_genes =# Initialize some things for this chromosomes
for(i in 1:nrow(chrdata)){
# Loop over every gene on this chromosomes
chrdata[i,]
cur_gene =# Get the current gene
if(first){
cur_gene$topo
cur_topo = 0
num_genes = cur_gene$mstart
block_start =
F
first =
next
}# If this is the first gene of this chromosome/node combo, start the first block
if(cur_gene$topo == "m"){
num_genes_m + 1
num_genes_m =else if(cur_gene$topo == "c"){
} num_genes_c + 1
num_genes_c =else if(cur_gene$topo == "d1"){
} num_genes_d1 + 1
num_genes_d1 =else if(cur_gene$topo == "d2"){
} num_genes_d2 + 1
num_genes_d2 =
}# Count the topology of the current gene... I probably don't need to do this...
if(!cur_gene$topo %in% cur_topo || i == nrow(chrdata)){
cur_gene$mstart - block_start
block_size =if(cur_topo == "m"){
m_blocks + 1
m_blocks = c(m_block_sizes, block_size)
m_block_sizes = c(m_block_counts, num_genes)
m_block_counts =else if(cur_topo == "c"){
} c_blocks + 1
c_blocks = c(c_block_sizes, block_size)
c_block_sizes = c(c_block_counts, num_genes)
c_block_counts =else if(cur_topo == "d1"){
} d1_blocks + 1
d1_blocks = c(d1_block_sizes, block_size)
d1_block_sizes = c(d1_block_counts, num_genes)
d1_block_counts =else if(cur_topo == "d2"){
} d2_blocks + 1
d2_blocks = c(d2_block_sizes, block_size)
d2_block_sizes = c(d2_block_counts, num_genes)
d2_block_counts =
}# Get the size of the current block and add it to the correct list
cur_gene$topo
cur_topo = 0
num_genes = cur_gene$mstart
block_start =# Reset the block counts and positions
else{
} num_genes + 1
num_genes =
}# If there is a decisive topology for this gene/node combo AND the topology doesn't match the
# previous topology .. OR if this is the last gene of the chromosome, then we need to count the
# size of the block and start the next block
# Otherwise, just increment the number of genes
}## End gene loop
}## End chromosome loop
rbind(block_data, data.frame("tp"=node,
block_data ="num.genes.c"=num_genes_c, "num_genes_d1"=num_genes_d1, "num_genes_d2"=num_genes_d2, "num.genes.m"=num_genes_m,
"c.avg.block.size"=mean(c_block_sizes, na.rm=T), "c.avg.block.count"=mean(c_block_counts, na.rm=T), "c.num.blocks"=c_blocks,
"d1.avg.block.size"=mean(d1_block_sizes, na.rm=T), "d1.avg.block.count"=mean(d1_block_counts, na.rm=T), "d1.num.blocks"=d1_blocks,
"d2.avg.block.size"=mean(d2_block_sizes, na.rm=T), "d2.avg.block.count"=mean(d2_block_counts, na.rm=T), "d2.num.blocks"=d2_blocks,
"m.avg.block.size"=mean(m_block_sizes, na.rm=T), "m.avg.block.count"=mean(m_block_counts, na.rm=T), "m.num.blocks"=m_blocks))
# Calculate average block sizes for this node over all chromosomes and add to the main df
}## End node loop
write.table(file=block_outfile, block_data, sep="\t", row.names=F)
# Write the block sizes per node to a file
else{
} read.table(block_outfile, header=T)
block_data =
}# If the block sizes have already been calculated, just read them in
merge(block_data, topo_cf_stats, by="tp")
block_data_merge =# Merge the block counts into the main cf df
###############
$anc.delta.sig = "N"
block_data_merge$desc.delta.sig = "N"
block_data_mergefor(i in 1:nrow(block_data_merge)){
# Loop over every branch with delta
block_data_merge[i,]$node
node =# Get the node of the current row
Ancestors(rodent_tree, node, type="parent")
anc =if(!anc %in% topo_tree_info$node){
next
}# Get the ancestral node, but move on if it is an excluded branch
block_data_merge[block_data_merge$node==anc,]$delta
anc_delta =if(block_data_merge[block_data_merge$node==anc,]$delta.sig == "Y"){
$anc.delta.sig = "Y"
block_data_merge[i,]
}# Get the delta info from the ancestor
for(desc in getDescendants(rodent_tree, node)){
if(!desc %in% topo_tree_info$node){
next
}
block_data_merge[block_data_merge$node==desc,]$delta
desc_delta =if(block_data_merge[block_data_merge$node==desc,]$delta.sig == "Y"){
$desc.delta.sig = "Y"
block_data_merge[i,]
}# Get the delta info from the ancestor
}
}## Determine whether ancestor or either descendant has a significant delta for each branch
###############
$major.block.size = ifelse(block_data_merge$gD_prev=="d1", block_data_merge$d1.avg.block.size, block_data_merge$d2.avg.block.size)
block_data_merge$minor.block.size = ifelse(block_data_merge$gD_prev=="d1", block_data_merge$d2.avg.block.size, block_data_merge$d1.avg.block.size)
block_data_merge$block.diff = block_data_merge$major.block.size - block_data_merge$minor.block.size
block_data_merge$block.ratio = block_data_merge$major.block.size / block_data_merge$minor.block.size
block_data_merge# Finally, determine the block sizes for the major and minor topologies, and calculate their differences
5.1 Distribution of block sizes for each topology around each branch
select(block_data_merge, tp, m.avg.block.size, delta.sig)
m =names(m)[2] = "block.size"
$label = "Missing"
m
select(block_data_merge, tp, c.avg.block.size, delta.sig)
c =names(c)[2] = "block.size"
$label = "Concordant"
c
select(block_data_merge, tp, d1.avg.block.size, delta.sig)
d1 =names(d1)[2] = "block.size"
$label = "D1"
d1
select(block_data_merge, tp, d2.avg.block.size, delta.sig)
d2 =names(d2)[2] = "block.size"
$label = "D2"
d2
rbind(c, d1, d2, m)
a =
#x_comps = list(c("Observed", "Inferred"))
ggplot(a, aes(x=label, y=block.size, group=label, color=delta.sig)) +
p = geom_quasirandom(size=2, width=0.25, alpha=0.5) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
#geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
ylab("Avg. block size") +
xlab("") +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=3), corecol(numcol=1, pal="wilke", offset=4))) +
bartheme() +
theme(plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm"))
print(p)
###############
here("docs", "figs", "full-coding-delta-block-dists.png")
fig_outfile =ggsave(fig_outfile, p, width=4, height=4, unit="in")
# Save the figure
5.2 Distribution of the difference between major and minor block sizes
Note that a negative value indicates that, on average, blocks consisting of the minor topology are larger than those of the major topology.
###############
ggplot(block_data_merge, aes(x=1, y=block.diff, color=delta.sig)) +
p = geom_quasirandom(size=2, width=0.25, alpha=0.5) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
#geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
scale_x_continuous(limits=c(0,2)) +
ylab("Block size difference") +
xlab("") +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=3), corecol(numcol=1, pal="wilke", offset=4))) +
bartheme() +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm"))
print(p)
###############
here("docs", "figs", "full-coding-delta-block-diff-dist.png")
fig_outfile =ggsave(fig_outfile, p, width=4, height=4, unit="in")
# Save the figure
5.3 Block size difference and delta
# p = ggplot(block_data_merge, aes(x=block.diff, y=num.genes.na, color=delta.sig)) +
# geom_point(size=3, alpha=0.5) +
# geom_smooth(method="lm", se=F) +
# geom_text_repel(aes(label=ifelse(delta.sig=="Y",as.character(tp),'')), show_guide=F, min.segment.length=0) +
# bartheme()
# print(p)
ggplot(block_data_merge, aes(x=delta, y=block.diff, color=delta.sig)) +
p = geom_point(size=2, alpha=0.5) +
geom_smooth(method="lm", se=F, linetype="dashed", color="#333333") +
xlab("Delta") +
ylab("Block size difference") +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=3), corecol(numcol=1, pal="wilke", offset=4))) +
bartheme() +
theme(legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm"))
print(p)
###############
here("docs", "figs", "full-coding-delta-block-diff-delta.png")
fig_outfile =ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure
5.4 Block size difference and tree depth
Maximum number of branches from a tip
# p = ggplot(block_data_merge, aes(x=block.diff, y=num.genes.na, color=delta.sig)) +
# geom_point(size=3, alpha=0.5) +
# geom_smooth(method="lm", se=F) +
# geom_text_repel(aes(label=ifelse(delta.sig=="Y",as.character(tp),'')), show_guide=F, min.segment.length=0) +
# bartheme()
# print(p)
block_data_merge[order(block_data_merge$node), ]
block_data_merge =# Re-sort the data frame by R node order after the merge so the trees still work
ggplot(block_data_merge, aes(x=depth.bl, y=block.diff, color=delta.sig)) +
p = geom_point(size=2, alpha=0.5) +
geom_smooth(method="lm", se=F, linetype="dashed", color="#333333") +
xlab("Tree depth") +
ylab("Block size difference") +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=3), corecol(numcol=1, pal="wilke", offset=4))) +
bartheme() +
theme(legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm"))
print(p)
###############
here("docs", "figs", "full-coding-delta-block-diff-depth.png")
fig_outfile =ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure
6 Time tree
Tree was dated using IQ-tree’s implementation of least squares dating with the following fossil calibrations:
here("docs", "data", "trees", "astral", "iqtree-dating", "fossil-calibrations-iqtree.txt")
calibration_file = read.table(calibration_file, header=F)
cal_dates =names(cal_dates) = c("MRCA of", "dates")
# Read the fossil calibration dates
cal_dates %>% separate(dates, c("Min.age", "Max.age"), sep=":", remove=T)
cal_dates =$Max.age = abs(as.numeric(cal_dates$Max.age))
cal_dates$Min.age = abs(as.numeric(cal_dates$Min.age))
cal_dates# Parse the dates better
%>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F) cal_dates
MRCA of | Min.age | Max.age |
---|---|---|
mm10,Arvicanthus_niloticus_MNHN1999201 | 11.1 | 12.3 |
Pithecheir_melanurus_LSUMZ38198,Arvicanthus_niloticus_MNHN1999201 | 8.7 | 10.1 |
mm10,Mus_pahari_10460X1 | 7.3 | 8.3 |
Leggadina_forresti_WAMM62323,Pseudomys_shortridgei_Z25113 | 4.0 | 4.5 |
# Display calibration points as a table
###############
#dated_tree_file = here("docs", "data", "trees", "astral", "iqtree-dating", "full-coding-astral-iqtree-dating-anil-mmus-12.1.timetree.nwk")
#dated_tree = read.tree(dated_tree_file)
#tree_to_df_list = treeToDF(rodent_tree)
#dated_info = tree_to_df_list[["info"]]
# Read the tree and parse with treetoDF
#plotTreeTime(dated_tree, runif(Ntip(dated_tree)))
here("docs", "data", "trees", "astral", "iqtree-dating", "full-coding-astral-iqtree-dating-anil-mmus-12.1.timetree.nex.mod")
dated_tree_file = readLines(con=dated_tree_file, n = -1)
dated_tree_text = readMCMCtree(dated_tree_text, from.file = FALSE)
dated_tree =# Read tree as nexus file
# From: https://cran.r-project.org/web/packages/MCMCtreeR/vignettes/MCMCtree_plot.html
MCMC.tree.plot(dated_tree, cex.tips = 0.3, time.correction = 1, scale.res = c("Eon",
"Period", "Epoch", "Age"), plot.type = "phylogram", cex.age = 0.6, cex.labels = 0.6,
relative.height = 0.1, col.tree = "grey40", label.offset = 0.05,
node.method = "bar", no.margin = TRUE, label.timescale.names=TRUE, add.time.scale=TRUE, add.abs.time=TRUE,
lwd.bar=2)
###############
here("docs", "figs", "full-coding-timetree.png")
fig_outfile =png(fig_outfile, width=12, height=12, units="in", res=320)
MCMC.tree.plot(dated_tree, cex.tips = 0.3, time.correction = 1, scale.res = c("Eon",
"Period", "Epoch", "Age"), plot.type = "phylogram", cex.age = 0.6, cex.labels = 0.6,
relative.height = 0.1, col.tree = "grey40", label.offset = 0.05,
node.method = "bar", no.margin = TRUE, label.timescale.names=TRUE, add.time.scale=TRUE, add.abs.time=TRUE,
lwd.bar=2)
dev.off()
## png
## 2
7 Speciation rates with BAMM
here("docs", "data", "full-murinae-taxonomy.csv")
taxo_file = read.csv(taxo_file, header=T, quote="\"")
taxo_data =# Read the taxonomy data
###############
taxo_data %>% count(Division)
div_comp_sums =names(div_comp_sums)[2] = "Full"
subset(taxo_data, Exome==1) %>% count(Division)
div_samp_sums =names(div_samp_sums)[2] = "Sampled"
merge(div_comp_sums, div_samp_sums, by="Division")
div_props =$div.prop.sampled = div_props$Sampled / div_props$Full
div_props merge(taxo_data, select(div_props, Division, div.prop.sampled), by="Division")
taxo_data =# Get Division counts and sampling rates
here("docs", "data", "bamm", "div-sampling-rates.tab")
div_sampling_file =write.table(div_props, file=div_sampling_file, row.names=F, quote=F)
# Write the counts to a file
###############
taxo_data %>% count(Tribe)
tribe_comp_sums =names(tribe_comp_sums)[2] = "Full"
subset(taxo_data, Exome==1) %>% count(Tribe)
tribe_samp_sums =names(tribe_samp_sums)[2] = "Sampled"
merge(tribe_comp_sums, tribe_samp_sums, by="Tribe")
tribe_props =$tribe.prop.sampled = tribe_props$Sampled / tribe_props$Full
tribe_props merge(taxo_data, select(tribe_props, Tribe, tribe.prop.sampled), by="Tribe")
taxo_data =# Get Tribe counts and sampling rates
here("docs", "data", "bamm", "tribe-sampling-rates.tab")
tribe_sampling_file =write.table(tribe_props, file=tribe_sampling_file, row.names=F, quote=F)
# Write the counts to a file
###############
subset(taxo_data, Exome==1)
sample_taxo_data =# Get the sampled species
###############
c("Lophuromys_woosnami_LSUMZ37793", "Lophiomys_imhausi_UM5152", "Mus_musculus-reference_10460X5", "Mus_musculus-domesticus_10460X15", "Mus_musculus-castaneus_10460X7", "Hydromys_sp-nov_YS391", "Genus_sp-nov_NMVZ21816", "Paruromys_dominator_JAE4870", "Mus_musculus-musculus_10460X14")
missing_samples =# A list of samples missing from the taxonomy file
c("Pithecheir_melanurus_LSUMZ38198", "Vandeleuria_oleracea_ABTC116404")
tribe_prob_full = c("Pithecheir melanurus", "Vandeleuria oleracea")
tribe_prob = c("Coccymys_ruemmleri_ABTC49489", "Vandeleuria_oleracea_ABTC116404", "Anonymomys_mindorensis_FMNH2222010")
div_prob_full = c("Coccymys ruemmleri", "Vandeleuria oleracea", "Anonymomys mindorensis")
div_prob =# Lists of samples with unclear taxonomy
###############
here("docs", "data", "bamm", "full-coding-astral-timetree-tribe.tre")
tribe_tree_file = read.nexus(dated_tree_file)
time_tree = drop.tip(time_tree, c(missing_samples, tribe_prob_full))
tribe_tree =write.tree(tribe_tree, file=tribe_tree_file)
# Write the tree for Tribe sampling
# NOTE: manually convert 0 branch lengths to 0.000001 (there are 5 of these for some reason)
treeToDF(tribe_tree)
tree_to_df_list = tree_to_df_list[["info"]]
tribe_tree_info = subset(tribe_tree_info, node.type=="tip")
tribe_tree_tips =# Read the tree and parse with treetoDF
tribe_tree_tips %>% separate(label, c("genus", "species", "sample"), sep="_", remove=F)
tribe_tree_tips =$Genus.species = paste(tribe_tree_tips$genus, tribe_tree_tips$species)
tribe_tree_tips$label=="mm10",]$Genus.species = "Mus musculus"
tribe_tree_tips[tribe_tree_tips$label=="rnor6",]$Genus.species = "Rattus norvegicus"
tribe_tree_tips[tribe_tree_tips# Parse the genus and species names of the remaining tips
###############
here("docs", "data", "bamm", "tribe-sampling-rates.tab")
tribe_sample_file = subset(sample_taxo_data, !Genus.species %in% tribe_prob)
tribe_data = merge(tribe_data, tribe_tree_tips, by="Genus.species")
tribe_data = select(tribe_data, label, Tribe, tribe.prop.sampled)
tribe_data =write.table(tribe_data, file=tribe_sample_file, sep="\t", row.names=F, col.names=F, quote=F)
# Write the Tribe sampling rates to a file
# NOTE: manually add 1.0 to the first line
# Format from: http://bamm-project.org/advanced.html#accounting-for-non-random-incomplete-taxon-sampling-in-diversification-studies
###############
# From: http://bamm-project.org/quickstart.html#speciation-extinction-analyses
subset(taxo_data, !grepl("EXTINCT",Extinct.TAXONOMY))
taxo_data_nonextinct =# Remove extinct species from complete list of species
here("docs", "data", "bamm", "tribe-bamm-priors.txt")
prior_file =setBAMMpriors(tribe_tree, total.taxa=nrow(taxo_data_nonextinct), outfile=prior_file)
# Generate BAMM priors based on the tree
## TRIBE
##########################################################################################
## DIVISION
here("docs", "data", "bamm", "full-coding-astral-timetree-div.tre")
div_tree_file = read.nexus(dated_tree_file)
time_tree = drop.tip(time_tree, c(missing_samples, div_prob_full))
div_tree =write.tree(div_tree, file=div_tree_file)
# Write the tree for Tribe sampling
# NOTE: manually convert 0 branch lengths to 0.000001 (there are 5 of these for some reason)
treeToDF(div_tree)
tree_to_df_list = tree_to_df_list[["info"]]
div_tree_info = subset(div_tree_info, node.type=="tip")
div_tree_tips =# Read the tree and parse with treetoDF
div_tree_tips %>% separate(label, c("genus", "species", "sample"), sep="_", remove=F)
div_tree_tips =$Genus.species = paste(div_tree_tips$genus, div_tree_tips$species)
div_tree_tips$label=="mm10",]$Genus.species = "Mus musculus"
div_tree_tips[div_tree_tips$label=="rnor6",]$Genus.species = "Rattus norvegicus"
div_tree_tips[div_tree_tips# Parse the genus and species names of the remaining tips
###############
here("docs", "data", "bamm", "div-sampling-rates.tab")
div_sample_file = subset(sample_taxo_data, !Genus.species %in% div_prob)
div_data = merge(div_data, div_tree_tips, by="Genus.species")
div_data = select(div_data, label, Division, div.prop.sampled)
div_data =write.table(div_data, file=div_sample_file, sep="\t", row.names=F, col.names=F, quote=F)
# Write the Tribe sampling rates to a file
# NOTE: manually add 1.0 to the first line
# Format from: http://bamm-project.org/advanced.html#accounting-for-non-random-incomplete-taxon-sampling-in-diversification-studies
###############
# From: http://bamm-project.org/quickstart.html#speciation-extinction-analyses
subset(taxo_data, !grepl("EXTINCT",Extinct.TAXONOMY))
taxo_data_nonextinct =# Remove extinct species from complete list of species
here("docs", "data", "bamm", "div-bamm-priors.txt")
prior_file =setBAMMpriors(div_tree, total.taxa=nrow(taxo_data_nonextinct), outfile=prior_file)
# Generate BAMM priors based on the tree
###############
7.1 Sampling rates based on Tribe
%>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F) tribe_props
Tribe | Full | Sampled | tribe.prop.sampled |
---|---|---|---|
Apodemini | 26 | 1 | 0.0384615 |
Arvicanthini | 90 | 22 | 0.2444444 |
Hydromyini | 211 | 70 | 0.3317536 |
incertae sedis | 11 | 2 | 0.1818182 |
Malacomyini | 3 | 1 | 0.3333333 |
Murini | 43 | 11 | 0.2558140 |
Otomyini | 31 | 2 | 0.0645161 |
Phloeomyini | 18 | 6 | 0.3333333 |
Praomyini | 55 | 9 | 0.1636364 |
Rattini | 193 | 55 | 0.2849741 |
7.2 Speciation rates based on Tribe sampling (BAMM)
# From: https://lukejharmon.github.io/ilhabela/instruction/2015/07/02/diversification-analysis-bamm-rpanda/
here("docs", "data", "bamm", "full-coding-astral-timetree-tribe.tre")
bamm_tree_file = read.tree(bamm_tree_file)
bamm_tree =# Read the tree from the BAMM run
here("docs", "data", "bamm", "tribe_event_data.txt")
event_data_file = read.csv(event_data_file)
events =# Read the event file from the BAMM run
###############
getEventData(bamm_tree, events, burnin=0.25) ed =
## Processing event data from data.frame
##
## Discarded as burnin: GENERATIONS < 249000
## Analyzing 751 samples from posterior
##
## Setting recursive sequence on tree...
##
## Done with recursive sequence
getBestShiftConfiguration(ed, expectedNumberOfShifts=1, threshold=5) best =
## Processing event data from data.frame
##
## Discarded as burnin: GENERATIONS < 0
## Analyzing 1 samples from posterior
##
## Setting recursive sequence on tree...
##
## Done with recursive sequence
# Get the best shift possibility (MAP)
# See: http://bamm-project.org/rateshifts.html#overall-best-shift-configuration
plot.bammdata(best, lwd=2, labels = T, cex = 0.3)
bamm_tree_p =addBAMMshifts(best, cex=2)
addBAMMlegend(bamm_tree_p, nTicks = 4, side = 4, las = 1)
# Plot the rates and shifts on the tree
###############
here("docs", "figs", "full-coding-bamm-tribe.png")
fig_outfile =png(fig_outfile, width=12, height=12, units="in", res=320)
plot.bammdata(best, lwd=2, labels = T, cex = 0.3)
bamm_tree_p =addBAMMshifts(best, cex=2)
addBAMMlegend(bamm_tree_p, nTicks = 4, side = 4, las = 1)
dev.off()
## png
## 2
plotRateThroughTime(ed, intervalCol=corecol(numcol=1), avgCol=corecol(numcol=1), ylim=c(0,1), cex.axis=1, cex.lab=1.5)
###############
here("docs", "figs", "full-coding-bamm-tribe-time.png")
fig_outfile =png(fig_outfile, width=6, height=4, units="in", res=320)
plotRateThroughTime(ed, intervalCol=corecol(numcol=1), avgCol=corecol(numcol=1), ylim=c(0,1), cex.axis=1, cex=1.5)
dev.off()
## png
## 2
7.3 Speciation rates based on Tribe sampling (BAMM) and delta
here("docs", "data", "trees", "astral", "iqtree-dating", "full-coding-astral-iqtree-dating-anil-mmus-12.1.timetree.nex")
labeled_timetree_file = read.nexus(labeled_timetree_file)
labeled_timetree = treeToDF(labeled_timetree)
tree_to_df_list = tree_to_df_list[["info"]]
labeled_timetree_info =# Read the labeled time tree and parse with treetoDF
select(tree_info, node, clade, tp, delta)
tree_info_clades =names(tree_info_clades)[1] = "orig.node"
# Select relevant columns from original tree df
merge(labeled_timetree_info, tree_info_clades, by="clade")
labeled_timetree_info =# Merge the original tree info with the time tree info to get the delta values
labeled_timetree_info[order(labeled_timetree_info$node), ]
labeled_timetree_info =# Re-sort the data frame by R node order after the merge so the trees still work
###############
# Since the speciation rates are missing some species, we need to prune the tree info
# to remove those species. We do this by removing those species from the tree info df
# and from any clade in any other node. At the end, for clades that are identical
# (because their descendants have been pruned), we keep the branch at the shallowest
# depth
$depth = node.depth(labeled_timetree)
labeled_timetree_info# Get the depth of each node in the tree for pruning
c(missing_samples, tribe_prob_full)
tips_to_drop =# The species we need to remove
for(tip in tips_to_drop){
subset(labeled_timetree_info, label != tip)
labeled_timetree_info =$clade = gsub(paste(";", tip, sep=""), "", labeled_timetree_info$clade)
labeled_timetree_info$clade = gsub(paste(tip, ";", sep=""), "", labeled_timetree_info$clade)
labeled_timetree_info$clade = gsub(tip, "", labeled_timetree_info$clade)
labeled_timetree_info
}# Loop over each species to remove and remove that row from the df and that species from any
# clade in the df
subset(labeled_timetree_info, clade != "")
labeled_timetree_info =# For some branches with all descendant species removed, they will now have empty clades
# Remove them here
$node.label = paste(labeled_timetree_info$label, "/", labeled_timetree_info$delta, sep="")
labeled_timetree# Add delta to the label of the time tree
labeled_timetree_info[order(labeled_timetree_info[,'clade'],labeled_timetree_info[,'depth']),]
labeled_timetree_info = labeled_timetree_info[!duplicated(labeled_timetree_info$clade),]
labeled_timetree_info =# Sort nodes in the tree df by clade and then by depth
# Then, for duplicate clades, keep the one at the shallowest depth
labeled_timetree_info[order(labeled_timetree_info$node), ]
labeled_timetree_info =# Re-sort the data frame by R node order after the merge so the trees still work
###############
drop.tip(labeled_timetree, c(missing_samples, tribe_prob_full))
labeled_tribetree = treeToDF(labeled_tribetree)
tree_to_df_list = tree_to_df_list[["info"]]
labeled_tribetree_info =# Now, prune the tree structure and re-read the pruned tree and parse with treetoDF
merge(select(labeled_timetree_info, clade, tp, delta), labeled_tribetree_info, by="clade")
labeled_tribetree_info =# Add the info from the full tree to the pruned tribe tree by merging by clade
labeled_tribetree_info[order(labeled_tribetree_info$node), ]
labeled_tribetree_info =# Re-sort the data frame by R node order after the merge so the trees still work
###############
getMeanBranchLengthTree(best)$phy
speciation_tree = treeToDF(speciation_tree)
tree_to_df_list = tree_to_df_list[["info"]]
speciation_tree_info =# Get the tree with speciation rates as branch lengths from BAMM and parse to df
write.csv(speciation_tree_info, file=here("docs", "data", "bamm", "full-coding-astral-bamm-as-bl.csv"), row.names=F)
merge(select(labeled_tribetree_info, clade, tp, delta), speciation_tree_info, by="clade")
speciation_tree_info =# Add the info from the tribe tree to the speciation tree
labeled_timetree_info[order(labeled_timetree_info$node), ]
speciation_tree_info =# Re-sort the data frame by R node order after the merge so the trees still work
###############
ggplot(speciation_tree_info, aes(x=delta, y=branch.length)) +
p = geom_point() +
geom_smooth(method="lm") +
bartheme()
#print(p)
# Simple correlation
###############
subset(speciation_tree_info, node.type != "tip")
speciation_tree_internal_info =$node.label = paste(speciation_tree_internal_info$label, "/", speciation_tree_internal_info$delta, sep="")
speciation_tree# Add delta values to the internal node labels of the speciation tree
###############
# At this point, to do PGLS without delta values on the tips, we need to again
# remove tip branches. To do this, you'd have to write.tree() and then use
# bonsai to remove the tips
#write.tree(speciation_tree)
here("docs", "data", "trees", "astral", "concord-rooted-bl", "full_coding_iqtree_astral_rooted_speciation.bl.cf.labeled.notips.tree")
speciation_tree2_file = read.tree(file=speciation_tree2_file)
speciation_tree2 = treeToDF(speciation_tree2)
tree_to_df_list = tree_to_df_list[["info"]]
speciation_tree2_info =# Read the speciation tree without tips and parse to df
speciation_tree2_info %>% separate(label, c("tp", "support"), sep=">", remove=F)
speciation_tree2_info =$tp = paste(speciation_tree2_info$tp, ">", sep="")
speciation_tree2_info speciation_tree2_info %>% separate(support, c("astral", "gcf", "scf", "delta"), sep="/", remove=T)
speciation_tree2_info =$delta = as.numeric(speciation_tree2_info$delta)
speciation_tree2_info# Split the label by /. tp is my treeParse label.
subset(speciation_tree2_info, node.type=="tip")
speciation_tree2_tip_info =# Get only the tip info from the new tree
###############
# From: https://lukejharmon.github.io/ilhabela/instruction/2015/07/03/PGLS/
speciation_tree2_tip_info$delta
delta = speciation_tree2_tip_info$branch.length
speciation.rate =# Get the delta values and branch length values for plotting later
gls(delta ~ branch.length, correlation = corBrownian(phy = speciation_tree2),
pglsModel =data = speciation_tree2_tip_info, method = "ML")
summary(pglsModel)
## Generalized least squares fit by maximum likelihood
## Model: delta ~ branch.length
## Data: speciation_tree2_tip_info
## AIC BIC logLik
## -70.74475 -64.6687 38.37238
##
## Correlation Structure: corBrownian
## Formula: ~1
## Parameter estimate(s):
## numeric(0)
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 0.11288103 0.08639655 1.3065455 0.1969
## branch.length -0.00390095 0.02904279 -0.1343172 0.8937
##
## Correlation:
## (Intr)
## branch.length -0.48
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -0.5606684 -0.4261186 -0.2044627 0.2750020 1.6042793
##
## Residual standard error: 0.1861634
## Degrees of freedom: 56 total; 54 residual
coef(pglsModel)
## (Intercept) branch.length
## 0.112881032 -0.003900946
# Run the PGLS
plot(speciation.rate,delta)
abline(a = coef(pglsModel)[1], b = coef(pglsModel)[2])
# Plot the PGLS
###############
here("docs", "figs", "full-coding-bamm-tribe-spec-delta.png")
fig_outfile =png(fig_outfile, width=6, height=4, units="in", res=320)
plot(speciation.rate,delta)
abline(a = coef(pglsModel)[1], b = coef(pglsModel)[2])
dev.off()
## png
## 2
# Save the figure
7.4 Sampling rates based on Division
%>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F) div_props
Division | Full | Sampled | div.prop.sampled |
---|---|---|---|
Aethomys | 11 | 3 | 0.2727273 |
Apodemus | 26 | 1 | 0.0384615 |
Arvicanthis | 30 | 7 | 0.2333333 |
Berylmys | 4 | 2 | 0.5000000 |
Bunomys | 31 | 15 | 0.4838710 |
Chiropodomys | 6 | 1 | 0.1666667 |
Chrotomys | 39 | 5 | 0.1282051 |
Colomys | 4 | 2 | 0.5000000 |
Conilurus | 7 | 6 | 0.8571429 |
Dacnomys | 41 | 7 | 0.1707317 |
Dasymys | 14 | 2 | 0.1428571 |
Echiothrix | 10 | 9 | 0.9000000 |
Haeromys | 3 | 1 | 0.3333333 |
Hybomys | 11 | 5 | 0.4545455 |
Hydromys | 32 | 9 | 0.2812500 |
incertae sedis | 13 | 3 | 0.2307692 |
Malacomys | 3 | 1 | 0.3333333 |
Mallomys | 11 | 5 | 0.4545455 |
Maxomys | 22 | 4 | 0.1818182 |
Mus | 43 | 11 | 0.2558140 |
Oenomys | 24 | 5 | 0.2083333 |
Otomys | 31 | 2 | 0.0645161 |
Phloeomys | 18 | 6 | 0.3333333 |
Pithecheir | 3 | 1 | 0.3333333 |
Pogonomys | 16 | 6 | 0.3750000 |
Pseudomys | 41 | 32 | 0.7804878 |
Rattus | 84 | 17 | 0.2023810 |
Stenocephalemys | 51 | 7 | 0.1372549 |
Uromys | 52 | 4 | 0.0769231 |
7.5 Speciation rates based on Division sampling (BAMM)
# From: https://lukejharmon.github.io/ilhabela/instruction/2015/07/02/diversification-analysis-bamm-rpanda/
here("docs", "data", "bamm", "full-coding-astral-timetree-div.tre")
bamm_tree_file = read.tree(bamm_tree_file)
bamm_tree =# Read the tree from the BAMM run
here("docs", "data", "bamm", "div_event_data.txt")
event_data_file = read.csv(event_data_file)
events =# Read the event file from the BAMM run
###############
getEventData(bamm_tree, events, burnin=0.25) ed =
## Processing event data from data.frame
##
## Discarded as burnin: GENERATIONS < 249000
## Analyzing 751 samples from posterior
##
## Setting recursive sequence on tree...
##
## Done with recursive sequence
getBestShiftConfiguration(ed, expectedNumberOfShifts=1, threshold=5) best =
## Processing event data from data.frame
##
## Discarded as burnin: GENERATIONS < 0
## Analyzing 1 samples from posterior
##
## Setting recursive sequence on tree...
##
## Done with recursive sequence
# Get the best shift possibility (MAP)
# See: http://bamm-project.org/rateshifts.html#overall-best-shift-configuration
plot.bammdata(best, lwd=2, labels = T, cex = 0.3)
bamm_tree_p =addBAMMshifts(best, cex=2)
addBAMMlegend(bamm_tree_p, nTicks = 4, side = 4, las = 1)
# Plot the rates and shifts on the tree
###############
here("docs", "figs", "full-coding-bamm-div.png")
fig_outfile =png(fig_outfile, width=12, height=12, units="in", res=320)
plot.bammdata(best, lwd=2, labels = T, cex = 0.3)
bamm_tree_p =addBAMMshifts(best, cex=2)
addBAMMlegend(bamm_tree_p, nTicks = 4, side = 4, las = 1)
dev.off()
## png
## 2
plotRateThroughTime(ed, intervalCol=corecol(numcol=1), avgCol=corecol(numcol=1), ylim=c(0,1), cex.axis=1, cex.lab=1.5)
###############
here("docs", "figs", "full-coding-bamm-div-time.png")
fig_outfile =png(fig_outfile, width=6, height=4, units="in", res=320)
plotRateThroughTime(ed, intervalCol=corecol(numcol=1), avgCol=corecol(numcol=1), ylim=c(0,1), cex.axis=1, cex=1.5)
dev.off()
## png
## 2
8 Speciation rates with RPanda
8.1 Species tree colored by tribe
Following http://www.phytools.org/***SanJuan2016/ex/14/Complex-diversification-models.html
## The following rpanda sections follow http://www.phytools.org/***SanJuan2016/ex/14/Complex-diversification-models.html
here("docs", "data", "trees", "astral", "iqtree-dating", "full-coding-astral-iqtree-dating-anil-mmus-12.1.timetree.nex")
labeled_timetree_file = read.nexus(labeled_timetree_file)
labeled_timetree = drop.tip(labeled_timetree, c(missing_samples, tribe_prob_full))
labeled_timetree = treeToDF(labeled_timetree)
tree_to_df_list = tree_to_df_list[["info"]]
labeled_timetree_info =# Read the labeled time tree and parse with treetoDF
labeled_timetree_info[order(labeled_timetree_info$node), ]
labeled_timetree_info =# Re-sort the data frame by R node order after the merge so the trees still work
subset(labeled_timetree_info, node.type=="tip")
rpanda_tree_tips = rpanda_tree_tips %>% separate(clade, c("genus", "species", "sample"), sep="_", remove=F)
rpanda_tree_tips =$Genus.species = paste(rpanda_tree_tips$genus, rpanda_tree_tips$species)
rpanda_tree_tips$label=="mm10",]$Genus.species = "Mus musculus"
rpanda_tree_tips[rpanda_tree_tips$label=="rnor6",]$Genus.species = "Rattus norvegicus"
rpanda_tree_tips[rpanda_tree_tips# Parse the genus and species names of the remaining tips
$tip.label = rpanda_tree_tips$Genus.species
labeled_timetree# Replace the tip labels in the time tree with the genus and species names
$Genus.species = NA
labeled_timetree_info$clade == rpanda_tree_tips$clade,]$Genus.species = rpanda_tree_tips$Genus.species
labeled_timetree_info[labeled_timetree_info# Add the genus and species labels to the info df
$tribe = NA
labeled_timetree_info# Initialize a tribe column in the tree info
###############
subset(sample_taxo_data, !Genus.species %in% tribe_prob)
rpanda_taxo_data =# Remove the problematic tribe samples for rpanda
$Tribe = as.factor(rpanda_taxo_data$Tribe)
rpanda_taxo_data# Factorize the tribe column
c()
tribes = list()
tribe_subtrees =# Initialize a vector of tribes for all tribes with > 1 species and
# a list of tribe subtrees
for(tribe in levels(rpanda_taxo_data$Tribe)){
#print(tribe)
subset(rpanda_taxo_data, Tribe==tribe)$Genus.species
tribe_samples =# Get all the samples from the current tribe
if(length(tribe_samples) < 2){
$Genus.species %in% tribe_samples,]$tribe = tribe
labeled_timetree_info[labeled_timetree_infonext
}# If the tribe has only one species, add the tribe label to the tree info column for that species,
# but then skip the rest
c(tribes, tribe)
tribes =# Add this tribe to the vector of tribes that have > 2 species
getMRCA(labeled_timetree, tribe_samples)
n =#print(n)
# Get the mrca of all the tribe species
getDescendants(labeled_timetree, n)
tribe_nodes =# Get all the descendant nodes from the mrca
$node == n,]$tribe = tribe
labeled_timetree_info[labeled_timetree_info$node %in% tribe_nodes,]$tribe = tribe
labeled_timetree_info[labeled_timetree_info# Add tribe info for the tribe nodes to the tree info df
extract.clade(labeled_timetree, n)
tribe_subtrees[[tribe]] =# Extract the subtree for this tribe
}
c(corecol(numcol=8), corecol(numcol=2, offset=10))
tribe_cols =
16
xmax = ggtree(labeled_timetree, size=0.8, aes(color=labeled_timetree_info$tribe)) +
tribe_tree = xlim(0, xmax) +
geom_tiplab(fontface='italic', size=2, show.legend=F) +
scale_color_manual(name="Tribe", values=tribe_cols) +
#theme(legend.position="bottom")
theme(legend.position=c(0.05,0.9))
print(tribe_tree)
# A tribe tree figure
length(tribes)
num_tribes =# Variable to keep track of the number of tribes
###############
###############
function(x,y){y}
lambda.cst <- function(x,y){y[1]*exp(y[2]*x)}
lambda.var <- function(x,y){y}
mu.cst <- function(x,y){y[1]*exp(y[2]*x)}
mu.var <-# The different models to test
function(tree, par, sample_rate) {
fit.multi.rpanda <- fit_bd(tree, max(branching.times(tree)), f.lamb=lambda.cst, f.mu=mu.cst, lamb_par=par[[1]][1], mu_par=par[[1]][2], cst.lamb=TRUE, cst.mu=TRUE, cond="crown", f=sample_rate, dt=1e-3)
bcstdcst = fit_bd(tree, max(branching.times(tree)), f.lamb=lambda.var, f.mu=mu.cst, lamb_par=par[[2]][c(1,2)], mu_par=par[[2]][3], expo.lamb=TRUE, cst.mu=TRUE, cond="crown", f=sample_rate, dt=1e-3)
bvardcst = fit_bd(tree, max(branching.times(tree)), f.lamb=lambda.cst, f.mu=mu.var, lamb_par=par[[3]][1], mu_par=par[[3]][c(2,3)], cst.lamb=TRUE, expo.mu=TRUE, cond="crown", f=sample_rate, dt=1e-3)
bcstdvar = fit_bd(tree, max(branching.times(tree)), f.lamb=lambda.var, f.mu=mu.var, lamb_par=par[[4]][c(1,2)], mu_par=par[[4]][c(3,4)], expo.lamb=TRUE, expo.mu=TRUE, cond="crown", f=sample_rate, dt=1e-3)
bvardvar =return(list("bcstdcst"=bcstdcst,"bvardcst"=bvardcst,"bcstdvar"=bcstdvar,"bvardvar"=bvardvar))
}# A function that uses rpanda to fit various models, given a tree and some initial params
###############
list(c(0.4,0),c(0.4,-0.05,0),c(0.4,0.1,0.05),c(0.4,-0.05,0.1,0.05))
init_params =# We need to supply starting parameter values for optimization to RPANDA
list()
rpanda_results =# Initialize a list of results by tribe
for(tribe in tribes){
#print(tribe)
tribe_props[tribe_props$Tribe==tribe,]$tribe.prop.sampled
prop_sampled =#print(prop_sampled)
fit.multi.rpanda(tribe_subtrees[[tribe]], init_params, 1)
rpanda_results[[tribe]] =
}# Run the model fitting function on each tribe
###############
8.2 AICc results for 4 models
See: http://www.phytools.org/***SanJuan2016/ex/14/Complex-diversification-models.html
matrix(nrow=4,ncol=num_tribes,NA)
aic.table =# Initialize a matrix for the AIC table
# One column for each tribe, one row for each model
colnames(aic.table) = tribes
rownames(aic.table) = c("bcstdcst","bvardcst","bcstdvar","bvardvar")
# Rename the columns and the rows based on the tribes and the models
list()
rpanda_best_fit =# Initialize an empty list with the best fitting (lowest aicc) model for each tribe
0
col_i =# A column counter
for(tribe in tribes) {
col_i + 1
col_i =# Increment the column counter to match the current tribe's column
999999999
min_aic = ""
min_aic_model =# Initialize the min aic and model to pick the best model
0
row_j =# A row counter
for(model in rownames(aic.table)) {
row_j + 1
row_j =# Increment the row counter to match the current model's row
rpanda_results[[tribe]][[model]]$aicc
cur_aic =# Get the aic for the current model
if(cur_aic < min_aic){
cur_aic
min_aic = model
min_aic_model =
}# Check if the aic is lower than the lowest so far and replace if so
cur_aic
aic.table[row_j,col_i] =# Add the aic to the table
}
min_aic_model
rpanda_best_fit[[tribe]] =# After all models have been checked for this tribe, store the best fitting one on the list
}# Retrieve the AIC values from the rpanda results
%>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F) aic.table
Arvicanthini | Hydromyini | Murini | Otomyini | Phloeomyini | Praomyini | Rattini | |
---|---|---|---|---|---|---|---|
bcstdcst | 110.63650 | 289.5277 | 50.78988 | -8.000000 | 33.29397 | 42.77105 | 209.6955 |
bvardcst | 97.15104 | 288.0626 | 54.60092 | -6.000000 | 43.11915 | 41.75441 | 209.6553 |
bcstdvar | 99.16857 | 291.7122 | 54.71845 | -6.000000 | 43.12934 | 47.57092 | 211.9353 |
bvardvar | 100.18723 | 290.3143 | 59.83902 | -5.333333 | 73.11915 | 48.95441 | 211.9847 |
###############
8.3 Parameter estimates for the best fitting model for each tribe
RPanda estimates lambda (speciation rate) and mu (extinction rate)
data.frame("dummy"=c(0,0,0,0))
param_table =# Initialize the parameters table with a dummy column with one row per model tested
for(tribe in tribes){
rpanda_best_fit[[tribe]]
best_model =# Get the best model for the current tribe
c(rpanda_results[[tribe]][[best_model]]$lamb_par[1:2],rpanda_results[[tribe]][[best_model]]$mu_par[1:2])
param_table[,tribe] =# Add the estimated params for the best model for each tribe to the table
}
select(param_table, -dummy)
param_table =# Remove the dummy column
rownames(param_table) = c("Lambda1", "Lambda2", "Mu1", "Mu2")
# Add row names to the table for each parameter
%>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F) param_table
Arvicanthini | Hydromyini | Murini | Otomyini | Phloeomyini | Praomyini | Rattini | |
---|---|---|---|---|---|---|---|
Lambda1 | 0.0942485 | 0.2544161 | 0.2195913 | 0.000000 | 0.1151515 | 0.0329268 | 0.3090729 |
Lambda2 | 0.4567976 | 0.1095986 | NA | NA | NA | 0.6308453 | 0.1138635 |
Mu1 | -0.5176374 | 0.0000001 | -0.0000001 | 1.082918 | -0.0000002 | 0.0000000 | 0.0000002 |
Mu2 | NA | NA | NA | NA | NA | NA | NA |
###############
8.4 Estimating total number of species over time
data.frame(tribe=c(), t=c(), N=c())
rpanda_plot_df =# A df to store results for each tribe
for(tribe in tribes){
rpanda_results[[tribe]][[rpanda_best_fit[[tribe]]]]
best_model =# Get the best fitting model for the current tribe
max(branching.times(tribe_subtrees[[tribe]]))
max_div_time =
if (!inherits(best_model, "fit.bd")) {
stop("object \"fit.bd\" is not of class \"fit.bd\"")
}# If the best fitting model isn't an rpanda bd then error out
seq(0, max_div_time, 0.01)
t =# Intervals of time to estimate N
if ("f.mu" %in% attributes(best_model)$names) {
function(t) { -best_model$f.lamb(t) + best_model$f.mu(t) }
r <-else {
} function(t) { -best_model$f.lamb(t) }
r <-
} function(s) { RPANDA:::.Integrate(Vectorize(r), 0, s) }
R <-# Depending on the best model, functions to sample parameters at a given time
Ntip(tribe_subtrees[[tribe]])
num_tips = num_tips * exp(Vectorize(R)(t))
N =# Estimate the number of tips at a given time
rbind(rpanda_plot_df, data.frame(tribe=tribe, t=-t, N=N))
rpanda_plot_df =# Add the data for the current tribe to the results df
}
###############
c(tribe_cols[2:3], tribe_cols[5:9])
matched_tribe_cols =
ggplot(rpanda_plot_df, aes(x=t, y=N, color=tribe)) +
rpanda_p = geom_line(size=1) +
xlab("Time (mya)") +
ylab("# of species") +
scale_color_manual(values=matched_tribe_cols) +
bartheme() +
theme(legend.position="bottom")
print(rpanda_p)
8.5 Spectral density analysis of murine phylogeny
https://besjournals.onlinelibrary.wiley.com/doi/10.1111/2041-210X.12526
spectR(labeled_timetree)
res1 =plot_spectR(res1)
BICompare(labeled_timetree, res1$eigengap)
res2 =plot_BICompare(labeled_timetree, res2)
9 Tip introgression with RND
From https://onlinelibrary.wiley.com/doi/10.1111/mec.13610
RND is a measure of sequence divergence between a pair of species corrected for divergence from an outgroup:
\[RND = \frac{d_{XY}}{d_{out}}\]
Where: \(d_{XY}\) is the average distance (\(d_{x,y}\)) between all sequences between the two species \(X\) and \(Y\), and:
\[d_{out} = \frac{d_{XO} + d_{YO}}{2}\]
Since we have two outgroups, \(O_1\) and \(O_2\), each of \(d_{XO}\) and \(d_{XY}\) is also an average of the distances between the two outgroups, e.g.:
\[d_{XO} = \frac{d_{XO_1} + d_{XO_2}}{2}\] Low values of \(d_{XY}\) and/or \(RND\) are indicative of recent introgression.
9.1 Summary of sister pairs
#source("C:/bin/Ks_plots/fitMixEM.R")
#source("C:/bin/Ks_plots/bicWGD.R")
#source("C:/bin/Ks_plots/plotComponentExpectations.R")
# Files for mixture model testing code
here("docs", "data", "trees", "tip-introgression-rnd")
rnd_dir =# The input directory for the dxy files
paste0(rnd_dir, "/sister/")
rnd_sis_dir = list.files(rnd_sis_dir)
sis_files = paste0(rnd_sis_dir, sis_files)
sis_files =# Read the sister tip pairs
paste0(rnd_dir, "/non-sister/")
rnd_non_sis_dir = list.files(rnd_non_sis_dir)
non_sis_files = paste0(rnd_non_sis_dir, non_sis_files)
non_sis_files =# Read the non-sister tip pairs
#rnd_files = c(sis_files, non_sis_files)
#rnd_files = sis_files
#rnd_files = sample(non_sis_files, 10)
data.frame("pair"=c(), "mean.rnd"=c(), "median.rnd"=c())
rnd_summary_sis =# A df for the sister pairs
for(file in sis_files){
read.csv(file, header=F)
cur_data =names(cur_data) = c("gene", "cds.len", "pair.diffs", "x.out1", "x.out2", "y.out1", "y.out2")
# Read the file for the current pair and add headers
tools::file_path_sans_ext(basename(file))
comp =$pair = comp
cur_data# Get the pair name
$d.xy = cur_data$pair.diffs / cur_data$cds.len
cur_data# Calculate dxy between the pair
$x.out = rowMeans(cur_data[,c('x.out1', 'x.out2')], na.rm=T)
cur_data$y.out = rowMeans(cur_data[,c('y.out1', 'y.out2')], na.rm=T)
cur_data# Calculate the average number of differences between each in the pair and both outgroups
$d.out = rowMeans(cur_data[,c('x.out', 'y.out')], na.rm=T)
cur_data# Calculate average dxy to the outgroups
$rnd = cur_data$d.xy / cur_data$d.out
cur_data# Calculate RND as dxy / d_out
#bic.test.wgd(cur_data$rnd, startK = 1, maxK = 2, model = 1, nstarts = 100, outPrefix = comp)
#x = modetest(cur_data$rnd, method="HH")
# Mixture model and multimode testing
ggplot(cur_data, aes(x=rnd)) +
p = geom_histogram(bins=100) +
ggtitle(basename(comp)) +
xlab("RND") +
ylab("# of genes") +
scale_y_continuous(expand=c(0,0)) +
bartheme() +
theme(plot.title=element_text(size=12))
#print(p)
# Histogram of rnd for the current pair
rbind(rnd_summary_sis, data.frame(pair=comp, mean.rnd=mean(cur_data$rnd, na.rm=T), median.rnd=median(cur_data$rnd, na.rm=T)))
rnd_summary_sis =# Summarize the RND distribution for the current pair
}
print(p)
$pair <- reorder(rnd_summary_sis$pair, -rnd_summary_sis$median.rnd)
rnd_summary_sis
ggplot(rnd_summary_sis, aes(x=pair, y=mean.rnd, color="Mean")) +
p = geom_point() +
xlab("") +
ylab("RND") +
geom_point(aes(y=median.rnd, color="Median")) +
scale_color_manual(values=c("Mean"="#333333", "Median"=corecol(numcol=1))) +
theme(legend.position="bottom") +
coord_flip()
print(p)
9.2 Summary of non-sister pairs
Distributions of median RND values for each pair
paste0(rnd_dir, "/non-sister/")
rnd_non_sis_dir = list.files(rnd_non_sis_dir)
non_sis_files = paste0(rnd_non_sis_dir, non_sis_files)
non_sis_files =# Read the non-sister tip pairs
#non_sis_files = sample(non_sis_files, 50)
data.frame("pair"=c(), "mean.rnd"=c(), "median.rnd"=c())
rnd_summary_non_sis =# A df for the sister pairs
for(file in non_sis_files){
read.csv(file, header=F)
cur_data =names(cur_data) = c("gene", "cds.len", "pair.diffs", "x.out1", "x.out2", "y.out1", "y.out2")
# Read the file for the current pair and add headers
tools::file_path_sans_ext(basename(file))
comp =$pair = comp
cur_data# Get the pair name
$d.xy = cur_data$pair.diffs / cur_data$cds.len
cur_data# Calculate dxy between the pair
$x.out = rowMeans(cur_data[,c('x.out1', 'x.out2')], na.rm=T)
cur_data$y.out = rowMeans(cur_data[,c('y.out1', 'y.out2')], na.rm=T)
cur_data# Calculate the average number of differences between each in the pair and both outgroups
$d.out = rowMeans(cur_data[,c('x.out', 'y.out')], na.rm=T)
cur_data# Calculate average dxy to the outgroups
$rnd = cur_data$d.xy / cur_data$d.out
cur_data# Calculate RND as dxy / d_out
rbind(rnd_summary_non_sis, data.frame(pair=comp, mean.rnd=mean(cur_data$rnd, na.rm=T), median.rnd=median(cur_data$rnd, na.rm=T)))
rnd_summary_non_sis =# Summarize the RND distribution for the current pair
}
$type = "Sister pairs"
rnd_summary_sis$type = "Non-sister pairs"
rnd_summary_non_sis
rbind(rnd_summary_sis, rnd_summary_non_sis)
rnd_summary =
ggplot(rnd_summary, aes(x=type, y=median.rnd, group=type, color=type)) +
p = geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
xlab("") +
ylab("Median RND") +
scale_color_manual(values=corecol(numcol=2, pal="wilke")) +
#geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
#scale_x_continuous(limits=c(0,2)) +
bartheme() +
theme(legend.position="none")
print(p)