Murine multi-nucleotide substitutions
::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
knitr
library(ggplot2)
library(cowplot)
library(ggbeeswarm)
library(dplyr)
#library(kableExtra)
library(tidyr)
library(ggtree)
library(phytools)
library(phangorn)
library(reshape2)
library(ggExtra)
library(ggrepel)
#library(vroom)
library(ggdist)
#library(stringr)
source("../lib/design.r")
source("../lib/get_tree_info.r")
1 Multi-nucleotide substitutions
Here defined as two or more substitutions inferred along a branch in the same codon
#tree_type = "astral"
= F
save_tree_fig
cat(tree_type, " tree\n")
## astral tree
cat("188 species targeted capture to mouse exons assembled with SPADES.\n")
## 188 species targeted capture to mouse exons assembled with SPADES.
cat("11,775 coding loci aligned with exonerate+mafft\n")
## 11,775 coding loci aligned with exonerate+mafft
cat("Gene trees inferred with IQtree.\n")
## Gene trees inferred with IQtree.
if(tree_type == "astral"){
cat("Species tree inferred with ASTRAL.\n")
cat("Branch lengths estimated using 865 loci with normalized RF distance <= 0.25.\n")
= "../../data/trees/full_coding_iqtree_astral.cf.bl.nrf25.rooted.treefile"
tree_file = "../../data/rates/full-coding-astral-mg94-local-branch-mns.csv"
anc_file = c("Lophiomys_imhausi_UM5152", "Lophuromys_woosnami_LSUMZ37793", "<1>", "<187>", "<186>")
exclude_branches = 0.125
xmax else if(tree_type == "concat"){
}cat("Species tree inferred by concatenation of all loci with IQtree.\n")
= "../../data/trees/full_coding_iqtree_concat.cf.rooted.tree"
tree_file = "../../data/rates/full-coding-concat-mg94-local-branch-mns.csv"
anc_file = c("Lophiomys_imhausi_UM5152", "Lophuromys_woosnami_LSUMZ37793", "<1>", "<187>", "<186>")
exclude_branches = 0.125
xmax }
## Species tree inferred with ASTRAL.
## Branch lengths estimated using 865 loci with normalized RF distance <= 0.25.
cat("Ancestral sequences inferred with HyPhy\n")
## Ancestral sequences inferred with HyPhy
= read.tree(tree_file)
tree = treeToDF(tree)
tree_to_df_list = tree_to_df_list[["info"]]
tree_info
if(tree_type == "astral"){
= tree_info %>% separate(label, c("astral", "gcf", "scf"), sep="/", remove=F)
tree_info $astral[tree_info$node.type=="tip"] = NA
tree_info$astral = as.numeric(tree_info$astral)
tree_info$gcf = as.numeric(tree_info$gcf)
tree_info$scf = as.numeric(tree_info$scf)
tree_infoelse if(tree_type == "concat"){
}= tree_info %>% separate(label, c("bootstrap", "gcf", "scf"), sep="/", remove=F)
tree_info $bootstrap[tree_info$node.type=="tip"] = NA
tree_info$bootstrap = as.numeric(tree_info$bootstrap)
tree_info$gcf = as.numeric(tree_info$gcf)
tree_info$scf = as.numeric(tree_info$scf)
tree_info }
= read.csv(anc_file, header=T)
anc_info_w_root
= c("total.subs", "total.mns", "S", "N", "A", "mS", "mN", "mA")
cols_to_na for(col in cols_to_na){
$node.label %in% exclude_branches,][[col]] = NA
anc_info_w_root[anc_info_w_root
}
= names(tree_info)[!(names(tree_info) %in% names(anc_info_w_root))]
uniq_info_cols # Get non common names
= c(uniq_info_cols, "clade") # appending key parameter
uniq_info_cols # Get a list of columns from the tree_info df to join to the tree rates df
= anc_info_w_root %>% left_join((tree_info %>% select(uniq_info_cols)), by="clade")
anc_info_w_root # Select the columns from tree_info and join to tree_rates, merging by clade
# https://stackoverflow.com/a/61628157
= anc_info_w_root[order(anc_info_w_root$node), ]
anc_info_w_root # Re-order the rows by the R node
$total.mns = anc_info_w_root$mS + anc_info_w_root$mN + anc_info_w_root$mA
anc_info_w_root$mn.ms = anc_info_w_root$mN / anc_info_w_root$mS anc_info_w_root
2 Species tree branch presence/absence per gene
Following the same procedure as before, we count multi-nucleotdie substitutions per species tree branch only in genes that contain that branch (full clade or partial clade):
= subset(anc_info_w_root, node.type != "root")
anc_info
= select(anc_info, clade, node.type, num.genes.full)
full_clade $label = "Full clade"
full_cladenames(full_clade)[3] = "num.genes"
= select(anc_info, clade, node.type, num.genes.partial)
partial_clade $label = "Partial clade"
partial_cladenames(partial_clade)[3] = "num.genes"
= select(anc_info, clade, node.type, num.genes.descendant.counted)
descendant_counted $label = "Descendant counted"
descendant_countednames(descendant_counted)[3] = "num.genes"
= select(anc_info, clade, node.type, num.genes.discordant)
discordant_clade $label = "Discordant clade"
discordant_cladenames(discordant_clade)[3] = "num.genes"
= select(anc_info, clade, node.type, num.genes.missing)
no_clade $label = "Missing clade"
no_cladenames(no_clade)[3] = "num.genes"
= rbind(full_clade, partial_clade, descendant_counted, discordant_clade, no_clade)
clade_counts # Convert branch categories to long format
$label = factor(clade_counts$label, levels=c("Full clade", "Partial clade", "Descendant counted", "Discordant clade", "Missing clade"))
clade_counts
= ggplot(clade_counts, aes(x=label, y=num.genes, group=label, color=node.type)) +
branch_counts geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
ylab("# of genes") +
xlab("Species tree\nbranch classification") +
bartheme() +
theme(axis.text.x = element_text(angle=25, hjust=1)) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
#print(branch_counts)
3 Synonymous and non-synonymous multi-nucleotide substitutions
This results in the following distributions for number of convergent substitutions across branches (green lines indicating 95th percentile of each rate):
= quantile(anc_info$mS, 0.95, na.rm=T)
ms_95_perc = quantile(anc_info$mN, 0.95, na.rm=T)
mn_95_perc
= ggplot(anc_info, aes(x=mS, y=mN, color=node.type)) +
p geom_point(size=2, alpha=0.2) +
geom_text_repel(aes(label=ifelse(mS>ms_95_perc | mN>mn_95_perc, as.character(node), '')), show_guide=F) +
#(aes(label=ifelse(avg.dN>dn_95_perc, as.character(node), '')), show_guide=F) +
geom_vline(xintercept=ms_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
geom_hline(yintercept=mn_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
#geom_smooth(method="lm", se=F, ) +
xlab("mS per branch") +
ylab("mN per branch") +
bartheme() +
theme(legend.position="bottom") +
guides(colour = guide_legend(override.aes = list(alpha = 1)))
= ggExtra::ggMarginal(p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=1), color="#666666")
p print(p)
$ms.outlier = ifelse(anc_info$mS>ms_95_perc,anc_info$node,'')
anc_info$mn.outlier = ifelse(anc_info$mN>mn_95_perc,anc_info$node,'') anc_info
4 Species tree with branches colored # of multi-nucleotide substitutions
= corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l # Colors
= ggtree(tree, size=0.8, ladderize=F, aes(color=anc_info_w_root$total.mns)) +
mns_tree scale_color_continuous(name='Multi-nucleotide subs', low=l, high=h) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.15,0.9))
print(mns_tree)
5 Ratio of non-synyonymous to synonymous MNS (mN/mS)
= ggplot(anc_info, aes(x=mn.ms)) +
mnms_p geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1), color="#666666") +
scale_y_continuous(expand=c(0,0)) +
xlab("mN/mS") +
ylab("# of branches") +
bartheme()
print(mnms_p)
# Distribution of dN/dS when using gene trees
= quantile(anc_info$mn.ms, 0.95, na.rm=T)
mnms_95_perc $mnms.outlier = ifelse(anc_info$mn.ms>mnms_95_perc,anc_info$node,'') anc_info
6 Species tree with branches colored by mN/mS
= corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l # Colors
= ggtree(tree, size=0.8, ladderize=F, aes(color=anc_info_w_root$mn.ms)) +
mnms_tree scale_color_continuous(name='mN/mS', low=l, high=h) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9))
print(mnms_tree)