Murine multi-nucleotide substitutions

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

library(ggplot2)
library(cowplot)
library(ggbeeswarm)
library(dplyr)
#library(kableExtra)
library(tidyr)
library(ggtree)
library(phytools)
library(phangorn)
library(reshape2)
library(ggExtra)
library(ggrepel)
#library(vroom)
library(ggdist)
#library(stringr)
source("../lib/design.r")
source("../lib/get_tree_info.r")

< Back to summary

1 Multi-nucleotide substitutions

Here defined as two or more substitutions inferred along a branch in the same codon

#tree_type = "astral"
save_tree_fig = F

cat(tree_type, " tree\n")
## astral  tree
cat("188 species targeted capture to mouse exons assembled with SPADES.\n")
## 188 species targeted capture to mouse exons assembled with SPADES.
cat("11,775 coding loci aligned with exonerate+mafft\n")
## 11,775 coding loci aligned with exonerate+mafft
cat("Gene trees inferred with IQtree.\n")
## Gene trees inferred with IQtree.
if(tree_type == "astral"){
  cat("Species tree inferred with ASTRAL.\n")
  cat("Branch lengths estimated using 865 loci with normalized RF distance <= 0.25.\n")
  tree_file = "../../data/trees/full_coding_iqtree_astral.cf.bl.nrf25.rooted.treefile"
  anc_file = "../../data/rates/full-coding-astral-mg94-local-branch-mns.csv"
  exclude_branches = c("Lophiomys_imhausi_UM5152", "Lophuromys_woosnami_LSUMZ37793", "<1>", "<187>", "<186>")
  xmax = 0.125
}else if(tree_type == "concat"){
  cat("Species tree inferred by concatenation of all loci with IQtree.\n")
  tree_file = "../../data/trees/full_coding_iqtree_concat.cf.rooted.tree"
  anc_file = "../../data/rates/full-coding-concat-mg94-local-branch-mns.csv"
  exclude_branches = c("Lophiomys_imhausi_UM5152", "Lophuromys_woosnami_LSUMZ37793", "<1>", "<187>", "<186>")
  xmax = 0.125
}
## Species tree inferred with ASTRAL.
## Branch lengths estimated using 865 loci with normalized RF distance <= 0.25.
cat("Ancestral sequences inferred with HyPhy\n")
## Ancestral sequences inferred with HyPhy
tree = read.tree(tree_file)
tree_to_df_list = treeToDF(tree)
tree_info = tree_to_df_list[["info"]]

if(tree_type == "astral"){
  tree_info = tree_info %>% separate(label, c("astral", "gcf", "scf"), sep="/", remove=F)
  tree_info$astral[tree_info$node.type=="tip"] = NA
  tree_info$astral = as.numeric(tree_info$astral)
  tree_info$gcf = as.numeric(tree_info$gcf)
  tree_info$scf = as.numeric(tree_info$scf)
}else if(tree_type == "concat"){
  tree_info = tree_info %>% separate(label, c("bootstrap", "gcf", "scf"), sep="/", remove=F)
  tree_info$bootstrap[tree_info$node.type=="tip"] = NA
  tree_info$bootstrap = as.numeric(tree_info$bootstrap)
  tree_info$gcf = as.numeric(tree_info$gcf)
  tree_info$scf = as.numeric(tree_info$scf)
}
anc_info_w_root = read.csv(anc_file, header=T)

cols_to_na = c("total.subs", "total.mns", "S", "N", "A", "mS", "mN", "mA")
for(col in cols_to_na){
  anc_info_w_root[anc_info_w_root$node.label %in% exclude_branches,][[col]] = NA
}

uniq_info_cols = names(tree_info)[!(names(tree_info) %in% names(anc_info_w_root))] 
# Get non common names

uniq_info_cols = c(uniq_info_cols, "clade") # appending key parameter
# Get a list of columns from the tree_info df to join to the tree rates df

anc_info_w_root = anc_info_w_root %>% left_join((tree_info %>% select(uniq_info_cols)), by="clade")
# Select the columns from tree_info and join to tree_rates, merging by clade
# https://stackoverflow.com/a/61628157

anc_info_w_root = anc_info_w_root[order(anc_info_w_root$node), ]
# Re-order the rows by the R node

anc_info_w_root$total.mns = anc_info_w_root$mS + anc_info_w_root$mN + anc_info_w_root$mA
anc_info_w_root$mn.ms = anc_info_w_root$mN / anc_info_w_root$mS

2 Species tree branch presence/absence per gene

Following the same procedure as before, we count multi-nucleotdie substitutions per species tree branch only in genes that contain that branch (full clade or partial clade):

anc_info = subset(anc_info_w_root, node.type != "root")

full_clade = select(anc_info, clade, node.type, num.genes.full)
full_clade$label = "Full clade"
names(full_clade)[3] = "num.genes"

partial_clade = select(anc_info, clade, node.type, num.genes.partial)
partial_clade$label = "Partial clade"
names(partial_clade)[3] = "num.genes"

descendant_counted = select(anc_info, clade, node.type, num.genes.descendant.counted)
descendant_counted$label = "Descendant counted"
names(descendant_counted)[3] = "num.genes"

discordant_clade = select(anc_info, clade, node.type, num.genes.discordant)
discordant_clade$label = "Discordant clade"
names(discordant_clade)[3] = "num.genes"

no_clade = select(anc_info, clade, node.type, num.genes.missing)
no_clade$label = "Missing clade"
names(no_clade)[3] = "num.genes"

clade_counts = rbind(full_clade, partial_clade, descendant_counted, discordant_clade, no_clade)
# Convert branch categories to long format

clade_counts$label = factor(clade_counts$label, levels=c("Full clade", "Partial clade", "Descendant counted", "Discordant clade", "Missing clade"))

branch_counts = ggplot(clade_counts, aes(x=label, y=num.genes, group=label, color=node.type)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.25) +
  geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
  ylab("# of genes") +
  xlab("Species tree\nbranch classification") +
  bartheme() +
  theme(axis.text.x = element_text(angle=25, hjust=1)) +
  guides(colour=guide_legend(override.aes=list(alpha=1)))
#print(branch_counts)

3 Synonymous and non-synonymous multi-nucleotide substitutions

This results in the following distributions for number of convergent substitutions across branches (green lines indicating 95th percentile of each rate):

ms_95_perc = quantile(anc_info$mS, 0.95, na.rm=T)
mn_95_perc = quantile(anc_info$mN, 0.95, na.rm=T)

p = ggplot(anc_info, aes(x=mS, y=mN, color=node.type)) +
  geom_point(size=2, alpha=0.2) +
  geom_text_repel(aes(label=ifelse(mS>ms_95_perc | mN>mn_95_perc, as.character(node), '')), show_guide=F) +
  #(aes(label=ifelse(avg.dN>dn_95_perc, as.character(node), '')), show_guide=F) +
  geom_vline(xintercept=ms_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
  geom_hline(yintercept=mn_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
  #geom_smooth(method="lm", se=F, ) +
  xlab("mS per branch") + 
  ylab("mN per branch") + 
  bartheme() +
  theme(legend.position="bottom") +
  guides(colour = guide_legend(override.aes = list(alpha = 1)))
p = ggExtra::ggMarginal(p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=1), color="#666666")
print(p)

anc_info$ms.outlier = ifelse(anc_info$mS>ms_95_perc,anc_info$node,'')
anc_info$mn.outlier = ifelse(anc_info$mN>mn_95_perc,anc_info$node,'')

4 Species tree with branches colored # of multi-nucleotide substitutions

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors

mns_tree = ggtree(tree, size=0.8, ladderize=F, aes(color=anc_info_w_root$total.mns)) +
  scale_color_continuous(name='Multi-nucleotide subs', low=l, high=h) +
  xlim(0, xmax) +
  geom_tiplab(color="#333333", fontface='italic', size=2) +
  theme(legend.position=c(0.15,0.9))
print(mns_tree)

5 Ratio of non-synyonymous to synonymous MNS (mN/mS)

mnms_p = ggplot(anc_info, aes(x=mn.ms)) +
  geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1), color="#666666") +
  scale_y_continuous(expand=c(0,0)) +
  xlab("mN/mS") +
  ylab("# of branches") +
  bartheme()
print(mnms_p)

# Distribution of dN/dS when using gene trees

mnms_95_perc = quantile(anc_info$mn.ms, 0.95, na.rm=T)
anc_info$mnms.outlier = ifelse(anc_info$mn.ms>mnms_95_perc,anc_info$node,'')

6 Species tree with branches colored by mN/mS

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors

mnms_tree = ggtree(tree, size=0.8, ladderize=F, aes(color=anc_info_w_root$mn.ms)) +
  scale_color_continuous(name='mN/mS', low=l, high=h) +
  xlim(0, xmax) +
  geom_tiplab(color="#333333", fontface='italic', size=2) +
  theme(legend.position=c(0.05,0.9))
print(mnms_tree)

< Back to summary