Murine species trees

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

library(ggplot2)
library(cowplot)
library(ggbeeswarm)
library(dplyr)
library(kableExtra)
library(tidyr)
library(ggtree)
library(phytools)
library(phangorn)
library(reshape2)
library(ggExtra)
library(ggrepel)
library(vroom)
library(ggdist)
library(stringr)
library(ggsignif)
library(phytools)
library(castor)
library(MCMCtreeR)
library(here)
library(stringr)
library(BAMMtools)
library(nlme)

source(here("docs", "scripts", "lib", "design.r"))
source(here("docs", "scripts", "lib", "get_tree_info.r"))

#source("C:/bin/core/r/design.r")
#source("C:/bin/core/r/get_tree_info.r")

#source("C:/Users/grt814/bin/core/corelib/design.r")
#source("C:/Users/grt814/bin/core/get_tree_info.r")

#htmltools::includeHTML("../html-chunks/rmd_nav.html")

< Back to summary

1 Full coding species tree

  1. 188 species targeted capture to mouse exons
  2. Assembled with SPADES
  3. 11,795 coding loci aligned with exonerate+MAFFT
  4. Gene trees inferred with IQ-Tree
  5. Species tree inferred with ASTRAL
  6. Branch lengths inferred with IQ-Tree using the 866 loci with a normalized RF distance to the species tree of less than 0.25.
# This chunk handles all of the main inputs and reads the tree

# cat("188 species targeted capture to mouse exons assembled with SPADES.\n")
# cat("11,775 coding loci aligned with exonerate+mafft\n")
# cat("Gene trees inferred with IQtree.\n")
# cat("Species tree inferred with ASTRAL.\n")
# cat("Branch lengths estimated by ASTRAL.\n")
# Data summary

###############

#tree_file = here("docs", "data", "trees", "astral", "concord-rooted", "full_coding_iqtree_astral.cf.labeled.tree")
tree_file = here("docs", "data", "trees", "astral", "morphometrics-pruned", "morpho-coding-iqtree-astral-rooted-bl-cf.tre")
# Newick tree file, with tp labels

rodent_tree = read.tree(tree_file)
tree_to_df_list = treeToDF(rodent_tree)
tree_info = tree_to_df_list[["info"]]
# Read the tree and parse with treetoDF

tree_info = tree_info %>% separate(label, c("tp", "support"), sep=">", remove=F)
tree_info$tp = paste(tree_info$tp, ">", sep="")
tree_info = tree_info %>% separate(support, c("gcf", "scf"), sep="/", remove=T)
# Split the label by /. tp is my treeParse label.

tree_info$gcf = as.numeric(tree_info$gcf)
tree_info$scf = as.numeric(tree_info$scf)
# Convert all supports to numeric

###############

#iq_tree_labels = here("docs", "data", "trees", "astral", "concord-rooted", "full_coding_iqtree_astral.cf.branch")
#iq_tree_labels = here("docs", "data", "trees", "astral", "concord-rooted-bl", "full_coding_iqtree_astral_rooted_bl.cf.branch")
# Newick tree file, with iqtree labels

###############

#cf_stat_file = here("docs", "data", "trees", "astral", "concord-rooted", "full_coding_iqtree_astral.cf.stat")
cf_stat_file = here("docs", "data", "trees", "astral", "morphometrics-pruned", "morpho-coding-iqtree-astral-rooted-bl-cf-stats.tab")
cf_stats = read.table(cf_stat_file, header=T)
# Concordance factor file

###############

#cf_rep_dir = here("docs", "data", "trees", "astral", "concord-rooted", "cf-reps")
#cf_rep_dir = here("docs", "data", "trees", "astral", "concord-rooted-bl", "cf-reps")
#delta_outfile = here("docs", "data", "trees", "astral", "concord-rooted", "delta.tab")
#delta_outfile = here("docs", "data", "trees", "astral", "concord-rooted-bl", "delta.tab")
# CF reps for delta and delta outfile

###############

#col_file = here("docs", "data", "trees", "astral", "astral-colonization-branches.txt")
#exclude_branches = c("Lophiomys_imhausi_UM5152", "Lophuromys_woosnami_LSUMZ37793", "<1>", "<187>", "<186>")
# Colonization branches and branches around the root to exclude from some things

###############

gt_file = here("docs", "data", "trees", "astral", "morphometrics-pruned", "morpho-coding-gene-trees-rooted-labeled.tre")
# The file containing the gene trees

###############

#xmax = 31
xmax = 0.175
# The max of the x-axis for tree figures

1.1 Branches colored by gCF

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors

gcf_tree = ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$gcf)) +
  scale_color_continuous(name='gCF', low=l, high=h, limits=c(0,1)) +
  xlim(0, xmax) +
  geom_tiplab(color="#333333", fontface='italic', size=2) +
  theme(legend.position=c(0.05,0.9))
#gcf_tree = gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
# Option to add branch labels to the figure
print(gcf_tree)

###############

fig_outfile = here("docs", "figs", "morpho-coding-astral-rooted-gcf.png")
ggsave(fig_outfile, gcf_tree, width=14, height=14, unit="in")
# Save the tree image

## gCF tree

1.2 Branches colored by sCF

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors

scf_tree = ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$scf)) +
  scale_color_continuous(name='sCF', low=l, high=h, limits=c(0,1)) +
  xlim(0, xmax) +
  geom_tiplab(color="#333333", fontface='italic', size=2) +
  theme(legend.position=c(0.05,0.9))
#gcf_tree = gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
# Option to add branch labels to the figure
print(gcf_tree)

###############

fig_outfile = here("docs", "figs", "morpho-coding-astral-rooted-scf.png")
ggsave(fig_outfile, scf_tree, width=14, height=14, unit="in")
# Save the tree image

## gCF tree

1.3 Gene vs site concordance factors colored by branch support

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors

p = ggplot(tree_info, aes(x=gcf, y=scf)) + 
  geom_point(size=2, alpha=0.5) +
  bartheme() +
  theme(legend.title=element_text(size=12))
print(p)

###############

fig_outfile = here("docs", "figs", "full-coding-astral-cf.png")
ggsave(fig_outfile, p, width=6, height=3, unit="in")
# Save the figure

1.4 Concordance factors vs. branch lengths

bl_gcf_p = ggplot(tree_info, aes(x=branch.length, y=gcf)) +
  geom_point(size=3, alpha=0.5) +
  #geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
  #geom_smooth(method="lm", se=F, ) +
  scale_x_continuous(limits=c(0,0.03)) +
  xlab("Branch length") + 
  ylab("gCF per branch") + 
  bartheme()
bl_gcf_p = ggExtra::ggMarginal(bl_gcf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666")

bl_scf_p = ggplot(tree_info, aes(x=branch.length, y=scf)) +
  geom_point(size=3, alpha=0.5) +
  #geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
  #geom_smooth(method="lm", se=F, ) +
  scale_x_continuous(limits=c(0,0.03)) +
  xlab("Branch length") + 
  ylab("sCF per branch") + 
  bartheme()
bl_scf_p = ggExtra::ggMarginal(bl_scf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=5), color="#666666")

p = plot_grid(bl_gcf_p, bl_scf_p, ncol=2)
print(p)

###############

fig_outfile = here("docs", "figs", "morpho-coding-astral-cf-bl.png")
ggsave(fig_outfile, p, width=6, height=3, unit="in")
# Save the figure

2 Gene trees

The 11,774 gene trees contain varying numbers of taxa due to filtering on the alignment and filtering by IQtree for identical sequences.

2.1 Taxa per gene tree

gene_trees = read.table(gt_file, sep="\t")
names(gene_trees) = c("gene", "tree")
# Read the geen tree file

gene_trees$gene = word(gene_trees$gene, 1, sep="-")
#gene_trees$gene = word(gene_trees$gene, 2, sep="/")
# Parse out the protein ID from the filename for each gene

###############

gt_data = data.frame("gene"=c(), "rf"=c(), "num.tips"=c(), "rf.zeros"=c(), "num.tips.zeros"=c())
# A df to track info for each gene tree

for(i in 1:nrow(gene_trees)){
  gt = read.tree(text=gene_trees[i,]$tree)
  # Read the gene tree as a phylo object
  
  cur_tips = gt$tip.label
  pruned_tree = drop.tip(rodent_tree, rodent_tree$tip.label[-match(cur_tips, rodent_tree$tip.label)])
  # Get a list of the tips and prune the species tree to contain only those tips
  
  cur_rf = RF.dist(pruned_tree, gt, normalize=T)
  # Calculate RF between the pruned species tree and the gene tree
  
  if(cur_rf==0){
    gt_data = rbind(gt_data, data.frame("gene"=gene_trees[i,]$gene, "rf"=cur_rf, "num.tips"=length(cur_tips), "rf.zeros"=cur_rf, "num.tips.zeros"=length(cur_tips)))
  }else{
    gt_data = rbind(gt_data, data.frame("gene"=gene_trees[i,]$gene, "rf"=cur_rf, "num.tips"=length(cur_tips), "rf.zeros"=NA, "num.tips.zeros"=NA))
  }
  # Add the gene tree info to the df, with a special case for when the RF is 0
}
# Read all the gene trees

p = ggplot(gt_data, aes(x=num.tips)) +
  geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1, offset=6), color="#666666") +
  #geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
  #geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
  scale_y_continuous(expand=c(0,0)) +
  xlab("# of taxa") +
  ylab("# of gene trees") +
  bartheme()
print(p)

###############

fig_outfile = here("docs", "figs", "morpho-coding-gt-taxa-dist.png")
ggsave(fig_outfile, p, width=6, height=3, unit="in")
# Save the figure

#rf_file = here("docs", "data", "trees", "astral", "concord-rooted", "loci-nrf-below-0.25.txt")
#write.table(subset(gt_data, rf<=0.25), file=rf_file, row.names=F, quote=F, sep="\t")
# Write info for genes with low RF

2.2 Tree distance (RF) between gene trees and species tree

Since RF cannot handle missing taxa, the species tree is pruned for each gene tree to calculate Robinson-Foulds distance. We use the normalized metric since there are varying numbers of tips per gene tree.

p = ggplot(gt_data, aes(x=rf)) +
  geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666") +
  #geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
  #geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
  scale_y_continuous(expand=c(0,0)) +
  xlab("Normalized RF") +
  ylab("# of gene trees") +
  bartheme()
print(p)

###############

fig_outfile = here("docs", "figs", "morpho-coding-gt-rf-dist.png")
ggsave(fig_outfile, p, width=6, height=3, unit="in")
# Save the figure

< Back to summary