Murine species trees
::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
knitr
library(ggplot2)
library(cowplot)
library(ggbeeswarm)
library(dplyr)
library(kableExtra)
library(tidyr)
library(ggtree)
library(phytools)
library(phangorn)
library(reshape2)
library(ggExtra)
library(ggrepel)
library(vroom)
library(ggdist)
library(stringr)
library(ggsignif)
library(phytools)
library(castor)
library(MCMCtreeR)
library(here)
library(stringr)
library(BAMMtools)
library(nlme)
source(here("docs", "scripts", "lib", "design.r"))
source(here("docs", "scripts", "lib", "get_tree_info.r"))
#source("C:/bin/core/r/design.r")
#source("C:/bin/core/r/get_tree_info.r")
#source("C:/Users/grt814/bin/core/corelib/design.r")
#source("C:/Users/grt814/bin/core/get_tree_info.r")
#htmltools::includeHTML("../html-chunks/rmd_nav.html")
1 Full coding species tree
- 188 species targeted capture to mouse exons
- Assembled with SPADES
- 11,795 coding loci aligned with exonerate+MAFFT
- Gene trees inferred with IQ-Tree
- Species tree inferred with ASTRAL
- Branch lengths inferred with IQ-Tree using the 866 loci with a normalized RF distance to the species tree of less than 0.25.
# This chunk handles all of the main inputs and reads the tree
# cat("188 species targeted capture to mouse exons assembled with SPADES.\n")
# cat("11,775 coding loci aligned with exonerate+mafft\n")
# cat("Gene trees inferred with IQtree.\n")
# cat("Species tree inferred with ASTRAL.\n")
# cat("Branch lengths estimated by ASTRAL.\n")
# Data summary
###############
#tree_file = here("docs", "data", "trees", "astral", "concord-rooted", "full_coding_iqtree_astral.cf.labeled.tree")
= here("docs", "data", "trees", "astral", "morphometrics-pruned", "morpho-coding-iqtree-astral-rooted-bl-cf.tre")
tree_file # Newick tree file, with tp labels
= read.tree(tree_file)
rodent_tree = treeToDF(rodent_tree)
tree_to_df_list = tree_to_df_list[["info"]]
tree_info # Read the tree and parse with treetoDF
= tree_info %>% separate(label, c("tp", "support"), sep=">", remove=F)
tree_info $tp = paste(tree_info$tp, ">", sep="")
tree_info= tree_info %>% separate(support, c("gcf", "scf"), sep="/", remove=T)
tree_info # Split the label by /. tp is my treeParse label.
$gcf = as.numeric(tree_info$gcf)
tree_info$scf = as.numeric(tree_info$scf)
tree_info# Convert all supports to numeric
###############
#iq_tree_labels = here("docs", "data", "trees", "astral", "concord-rooted", "full_coding_iqtree_astral.cf.branch")
#iq_tree_labels = here("docs", "data", "trees", "astral", "concord-rooted-bl", "full_coding_iqtree_astral_rooted_bl.cf.branch")
# Newick tree file, with iqtree labels
###############
#cf_stat_file = here("docs", "data", "trees", "astral", "concord-rooted", "full_coding_iqtree_astral.cf.stat")
= here("docs", "data", "trees", "astral", "morphometrics-pruned", "morpho-coding-iqtree-astral-rooted-bl-cf-stats.tab")
cf_stat_file = read.table(cf_stat_file, header=T)
cf_stats # Concordance factor file
###############
#cf_rep_dir = here("docs", "data", "trees", "astral", "concord-rooted", "cf-reps")
#cf_rep_dir = here("docs", "data", "trees", "astral", "concord-rooted-bl", "cf-reps")
#delta_outfile = here("docs", "data", "trees", "astral", "concord-rooted", "delta.tab")
#delta_outfile = here("docs", "data", "trees", "astral", "concord-rooted-bl", "delta.tab")
# CF reps for delta and delta outfile
###############
#col_file = here("docs", "data", "trees", "astral", "astral-colonization-branches.txt")
#exclude_branches = c("Lophiomys_imhausi_UM5152", "Lophuromys_woosnami_LSUMZ37793", "<1>", "<187>", "<186>")
# Colonization branches and branches around the root to exclude from some things
###############
= here("docs", "data", "trees", "astral", "morphometrics-pruned", "morpho-coding-gene-trees-rooted-labeled.tre")
gt_file # The file containing the gene trees
###############
#xmax = 31
= 0.175
xmax # The max of the x-axis for tree figures
1.1 Branches colored by gCF
= corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l # Colors
= ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$gcf)) +
gcf_tree scale_color_continuous(name='gCF', low=l, high=h, limits=c(0,1)) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9))
#gcf_tree = gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
# Option to add branch labels to the figure
print(gcf_tree)
###############
= here("docs", "figs", "morpho-coding-astral-rooted-gcf.png")
fig_outfile ggsave(fig_outfile, gcf_tree, width=14, height=14, unit="in")
# Save the tree image
## gCF tree
1.2 Branches colored by sCF
= corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l # Colors
= ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$scf)) +
scf_tree scale_color_continuous(name='sCF', low=l, high=h, limits=c(0,1)) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9))
#gcf_tree = gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
# Option to add branch labels to the figure
print(gcf_tree)
###############
= here("docs", "figs", "morpho-coding-astral-rooted-scf.png")
fig_outfile ggsave(fig_outfile, scf_tree, width=14, height=14, unit="in")
# Save the tree image
## gCF tree
1.3 Gene vs site concordance factors colored by branch support
= corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l # Colors
= ggplot(tree_info, aes(x=gcf, y=scf)) +
p geom_point(size=2, alpha=0.5) +
bartheme() +
theme(legend.title=element_text(size=12))
print(p)
###############
= here("docs", "figs", "full-coding-astral-cf.png")
fig_outfile ggsave(fig_outfile, p, width=6, height=3, unit="in")
# Save the figure
1.4 Concordance factors vs. branch lengths
= ggplot(tree_info, aes(x=branch.length, y=gcf)) +
bl_gcf_p geom_point(size=3, alpha=0.5) +
#geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
#geom_smooth(method="lm", se=F, ) +
scale_x_continuous(limits=c(0,0.03)) +
xlab("Branch length") +
ylab("gCF per branch") +
bartheme()
= ggExtra::ggMarginal(bl_gcf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666")
bl_gcf_p
= ggplot(tree_info, aes(x=branch.length, y=scf)) +
bl_scf_p geom_point(size=3, alpha=0.5) +
#geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
#geom_smooth(method="lm", se=F, ) +
scale_x_continuous(limits=c(0,0.03)) +
xlab("Branch length") +
ylab("sCF per branch") +
bartheme()
= ggExtra::ggMarginal(bl_scf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=5), color="#666666")
bl_scf_p
= plot_grid(bl_gcf_p, bl_scf_p, ncol=2)
p print(p)
###############
= here("docs", "figs", "morpho-coding-astral-cf-bl.png")
fig_outfile ggsave(fig_outfile, p, width=6, height=3, unit="in")
# Save the figure
2 Gene trees
The 11,774 gene trees contain varying numbers of taxa due to filtering on the alignment and filtering by IQtree for identical sequences.
2.1 Taxa per gene tree
= read.table(gt_file, sep="\t")
gene_trees names(gene_trees) = c("gene", "tree")
# Read the geen tree file
$gene = word(gene_trees$gene, 1, sep="-")
gene_trees#gene_trees$gene = word(gene_trees$gene, 2, sep="/")
# Parse out the protein ID from the filename for each gene
###############
= data.frame("gene"=c(), "rf"=c(), "num.tips"=c(), "rf.zeros"=c(), "num.tips.zeros"=c())
gt_data # A df to track info for each gene tree
for(i in 1:nrow(gene_trees)){
= read.tree(text=gene_trees[i,]$tree)
gt # Read the gene tree as a phylo object
= gt$tip.label
cur_tips = drop.tip(rodent_tree, rodent_tree$tip.label[-match(cur_tips, rodent_tree$tip.label)])
pruned_tree # Get a list of the tips and prune the species tree to contain only those tips
= RF.dist(pruned_tree, gt, normalize=T)
cur_rf # Calculate RF between the pruned species tree and the gene tree
if(cur_rf==0){
= rbind(gt_data, data.frame("gene"=gene_trees[i,]$gene, "rf"=cur_rf, "num.tips"=length(cur_tips), "rf.zeros"=cur_rf, "num.tips.zeros"=length(cur_tips)))
gt_data else{
}= rbind(gt_data, data.frame("gene"=gene_trees[i,]$gene, "rf"=cur_rf, "num.tips"=length(cur_tips), "rf.zeros"=NA, "num.tips.zeros"=NA))
gt_data
}# Add the gene tree info to the df, with a special case for when the RF is 0
}# Read all the gene trees
= ggplot(gt_data, aes(x=num.tips)) +
p geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1, offset=6), color="#666666") +
#geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
#geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
scale_y_continuous(expand=c(0,0)) +
xlab("# of taxa") +
ylab("# of gene trees") +
bartheme()
print(p)
###############
= here("docs", "figs", "morpho-coding-gt-taxa-dist.png")
fig_outfile ggsave(fig_outfile, p, width=6, height=3, unit="in")
# Save the figure
#rf_file = here("docs", "data", "trees", "astral", "concord-rooted", "loci-nrf-below-0.25.txt")
#write.table(subset(gt_data, rf<=0.25), file=rf_file, row.names=F, quote=F, sep="\t")
# Write info for genes with low RF
2.2 Tree distance (RF) between gene trees and species tree
Since RF cannot handle missing taxa, the species tree is pruned for each gene tree to calculate Robinson-Foulds distance. We use the normalized metric since there are varying numbers of tips per gene tree.
= ggplot(gt_data, aes(x=rf)) +
p geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666") +
#geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
#geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
scale_y_continuous(expand=c(0,0)) +
xlab("Normalized RF") +
ylab("# of gene trees") +
bartheme()
print(p)
###############
= here("docs", "figs", "morpho-coding-gt-rf-dist.png")
fig_outfile ggsave(fig_outfile, p, width=6, height=3, unit="in")
# Save the figure