Murine convergent substitutions
::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
knitr
library(ggplot2)
library(cowplot)
library(ggbeeswarm)
library(dplyr)
#library(kableExtra)
library(tidyr)
library(ggtree)
library(phytools)
library(phangorn)
library(reshape2)
library(ggExtra)
library(ggrepel)
#library(vroom)
library(ggdist)
#library(stringr)
source("../lib/design.r")
source("../lib/get_tree_info.r")
= "astral"
tree_type = F
save_tree_fig
cat(tree_type, " tree\n")
## astral tree
cat("188 species targeted capture to mouse exons assembled with SPADES.\n")
## 188 species targeted capture to mouse exons assembled with SPADES.
cat("11,775 coding loci aligned with exonerate+mafft\n")
## 11,775 coding loci aligned with exonerate+mafft
cat("Gene trees inferred with IQtree.\n")
## Gene trees inferred with IQtree.
if(tree_type == "astral"){
cat("Species tree inferred with ASTRAL.\n")
cat("Branch lengths estimated using 865 loci with normalized RF distance to species tree <= 0.25.\n")
= "../../data/trees/full_coding_iqtree_astral.cf.bl.nrf25.rooted.labeled.treefile"
tree_file = "../../data/rates/full-coding-astral-pairwise-convergence.csv"
anc_file = "../../data/rates/full-coding-astral-mg94-local-branch-mns.csv"
branch_count_file = c("Lophiomys_imhausi_UM5152", "Lophuromys_woosnami_LSUMZ37793", "<1>", "<187>", "<186>")
exclude_branches = 0.125
xmax else if(tree_type == "concat"){
}cat("Species tree inferred by concatenation of all loci with IQtree.\n")
= "../../data/trees/full_coding_iqtree_concat.cf.rooted.tree"
tree_file = "../../data/rates/full-coding-concat-pairwise-convergence.csv"
anc_file = ""
branch_count_file = c("Lophiomys_imhausi_UM5152", "Lophuromys_woosnami_LSUMZ37793", "<1>", "<187>", "<186>")
exclude_branches = 0.125
xmax }
## Species tree inferred with ASTRAL.
## Branch lengths estimated using 865 loci with normalized RF distance to species tree <= 0.25.
cat("Ancestral sequences inferred with HyPhy\n")
## Ancestral sequences inferred with HyPhy
= read.tree(tree_file)
tree = treeToDF(tree)
tree_to_df_list = tree_to_df_list[["info"]]
tree_info
if(tree_type == "astral"){
= tree_info %>% separate(label, c("tp", "astral", "gcf", "scf"), sep="/", remove=F)
tree_info $astral[tree_info$node.type=="tip"] = NA
tree_info$astral = as.numeric(tree_info$astral)
tree_info$gcf = as.numeric(tree_info$gcf)
tree_info$scf = as.numeric(tree_info$scf)
tree_infoelse if(tree_type == "concat"){
}= tree_info %>% separate(label, c("bootstrap", "gcf", "scf"), sep="/", remove=F)
tree_info $bootstrap[tree_info$node.type=="tip"] = NA
tree_info$bootstrap = as.numeric(tree_info$bootstrap)
tree_info$gcf = as.numeric(tree_info$gcf)
tree_info$scf = as.numeric(tree_info$scf)
tree_info }
1 Definitions
- Divergent substitution: A substitution at the same site on two branches of the tree to a different derived state regardless of ancestral state (e.g. A,A -> C,G or A,T -> C,G).
- Convergent substitution: A substitution at the same site on two branches of the tree that differ in their ancestral state to an identical derived state (e.g. A,G -> C,C).
- Parllel substitution: A substitution at the same site on two branches of the tree that share the same ancestral state to an identical derived state (e.g. A,A -> C,C).
For all pairwise analyses, comparisons between branches where one is the direct descendant of the other are excluded.
2 Species tree branch presence/absence per gene
Following the same procedure as before, we count convergent substitutions per species tree branch only in genes that contain that branch (full clade or partial clade).
3 Pairwise amino acid convergence
Convergent and divergent amino acid substitutions were counted on each pair of branches in the species tree in every gene where both branches were present, excluding pairs where one branch is the direct descendant of the other.
We find a strong correlation between divergent and convergent amino acid substitutions – with a secondary correlation (orange points).
= read.csv(anc_file, header=T, comment.char="#")
anc_info_w_root # Read the pairwise counts
= names(anc_info_w_root)[3:80]
cols_to_na for(col in cols_to_na){
$node1 %in% exclude_branches,][[col]] = NA
anc_info_w_root[anc_info_w_root$node2 %in% exclude_branches,][[col]] = NA
anc_info_w_root[anc_info_w_root
}# This loop changes all the counts for the branches descending from the root and the outgroup to NA to exclude them
= anc_info_w_root %>% mutate(aa.div = rowSums(select(., ends_with(".div")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(aa.par = rowSums(select(., ends_with(".par")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(aa.con = rowSums(select(., ends_with(".con")), na.rm=T))
anc_info_w_root
= subset(anc_info_w_root, aa.con > 200)
second_cor #second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 3000 & aa.con > 200))
= rbind(second_cor, subset(anc_info_w_root, aa.div < 1000 & aa.con > 100))
second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 400 & aa.con > 50))
second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 200 & aa.con > 30))
second_cor
= ggplot(anc_info_w_root, aes(x=aa.div, y=aa.con)) +
aa_p geom_point(size=2, alpha=0.3, color="#333333") +
geom_point(data=second_cor, aes(x=aa.div, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
xlab("Divergent AA substitutions") +
ylab("Convergent AA substitutions") +
bartheme()
print(aa_p)
3.1 Divergent vs parallel substitutions
No secondary correlation between divergent and parallel (same ancestral state) substitutions, though those pairs seem to cluster on the lower end of parallel substitutions.
= subset(anc_info_w_root, aa.con > 200)
second_cor_par #second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 3000 & aa.con > 200))
= rbind(second_cor_par, subset(anc_info_w_root, aa.div < 1000 & aa.con > 100))
second_cor_par = rbind(second_cor_par, subset(anc_info_w_root, aa.div < 400 & aa.con > 50))
second_cor_par = rbind(second_cor_par, subset(anc_info_w_root, aa.div < 200 & aa.con > 30))
second_cor_par
= ggplot(anc_info_w_root, aes(x=aa.div, y=aa.par)) +
aa_p geom_point(size=2, alpha=0.3, color="#333333") +
geom_point(data=second_cor_par, aes(x=aa.div, y=aa.par), size=2, alpha=0.3, color=corecol(numcol=1)) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
#geom_smooth(data=second_cor_par, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
xlab("Divergent AA substitutions") +
ylab("Parallel AA substitutions") +
bartheme()
print(aa_p)
= select(second_cor, node1, node2)
second_cor_br = data.frame(table(unlist(second_cor_br)))
second_cor_br_counts names(second_cor_br_counts) = c("tp", "conv.pair.count")
#second_cor_p = ggplot(second_cor_br_counts, aes(x=tp, y=conv.pair.count)) +
# geom_bar(stat="identity") +
# bartheme()
#print(second_cor_p)
= names(second_cor_br_counts)[!(names(second_cor_br_counts) %in% names(tree_info))]
uniq_info_cols # # Get non common names
= c(uniq_info_cols, "tp") # appending key parameter
uniq_info_cols # Get a list of columns from the tree_info df to join to the tree rates df
= tree_info %>% left_join((second_cor_br_counts %>% select(uniq_info_cols)), by="tp")
tree_info # Select the columns from tree_info and join to tree_rates, merging by clade
# https://stackoverflow.com/a/61628157
= tree_info[order(tree_info$node), ]
tree_info # Re-order the rows by the R node
$conv.pair.count[is.na(tree_info$conv.pair.count)] = 0 tree_info
3.2 Branches involved in secondary correlation
Remember that each point above is a count between a PAIR of branches. This tree displays the number of times each branch in the tree is involved in one of the pairings in the secondary correlation (orange points).
= corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l # Colors
= ggtree(tree, size=0.8, ladderize=F, aes(color=tree_info$conv.pair.count)) +
conv_tree scale_color_continuous(name='Times in secondary pair', low=l, high=h) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
geom_label(aes(x=branch, label=ifelse(tree_info$conv.pair.count>0,as.character(tree_info$conv.pair.count),'')), label.size=NA, fill="transparent") +
theme(legend.position=c(0.15,0.9))
print(conv_tree)
if(save_tree_fig){
= gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
gcf_tree = paste("../../data/trees/", tree_type, "-gcf-tree-PARED-gcf50.pdf", sep="")
tree_outfile ggsave(tree_outfile, gcf_tree, width=8, height=16, unit="in")
}# conv tree
3.3 Secondary correlation and branch length
No correlation between branch length and the number of times a branch is on this secondary correlation.
Note: ALL branches plotted, but still no correlation if only those with at least 1 pairing in the secondary correlation are used
= subset(tree_info, conv.pair.count > 0)
secondary_tree_info
= ggplot(tree_info, aes(x=branch.length, y=conv.pair.count)) +
conv_bl_p geom_point(size=2, alpha=0.3, color=corecol(numcol=1)) +
xlab("Branch length") +
ylab("# of times in second correlation") +
bartheme()
print(conv_bl_p)
3.4 Secondary correlation and number of loci branch present in
No correlation between branch presence and the number of times a branch is on this secondary correlation.
Note: ALL branches plotted, but still no correlation if only those with at least 1 pairing in the secondary correlation are used
= read.csv(branch_count_file, header=T)
branch_presence names(branch_presence)[1] = "tp"
= names(branch_presence)[!(names(branch_presence) %in% names(tree_info))]
uniq_info_cols # # Get non common names
= c(uniq_info_cols, "tp") # appending key parameter
uniq_info_cols # Get a list of columns from the tree_info df to join to the tree rates df
= tree_info %>% left_join((branch_presence %>% select(uniq_info_cols)), by="tp")
tree_info # Select the columns from tree_info and join to tree_rates, merging by clade
# https://stackoverflow.com/a/61628157
= tree_info[order(tree_info$node), ]
tree_info # Re-order the rows by the R node
= subset(tree_info, conv.pair.count > 0)
secondary_tree_info
= ggplot(tree_info, aes(x=num.genes.full+num.genes.partial, y=conv.pair.count)) +
conv_bp_p geom_point(size=2, alpha=0.3, color=corecol(numcol=1)) +
xlab("# of times branch in gene") +
ylab("# of times in second correlation") +
bartheme()
print(conv_bp_p)
3.5 Convergent substitutions and number of time branch pairs are present in gene trees
#secondary_tree_info = subset(tree_info, conv.pair.count > 0)
= ggplot(anc_info_w_root, aes(x=count, y=aa.con)) +
conv_pairs_p geom_point(size=2, alpha=0.3, color="#333333") +
geom_point(data=second_cor, aes(x=count, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
#geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
#geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
xlab("# of times branches appear together") +
ylab("Convergent AA substitutions") +
bartheme()
print(conv_pairs_p)
3.6 Excluding convergent AA substitutions that are divergent at the codon level
This pattern, and actually most convergent amino acid substitutions, are actually driven by divergent substitutions at the codon level:
= read.csv(anc_file, header=T, comment.char="#")
anc_info_w_root # Read the pairwise counts
= names(anc_info_w_root)[3:80]
cols_to_na for(col in cols_to_na){
$node1 %in% exclude_branches,][[col]] = NA
anc_info_w_root[anc_info_w_root$node2 %in% exclude_branches,][[col]] = NA
anc_info_w_root[anc_info_w_root
}# This loop changes all the counts for the branches descending from the root and the outgroup to NA to exclude them
= anc_info_w_root %>% mutate(aa.div = rowSums(select(., ends_with(".div.div")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(aa.par = rowSums(select(., ends_with(".par.par")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(aa.con = rowSums(select(., ends_with(".con.con")), na.rm=T))
anc_info_w_root
#second_cor = subset(anc_info_w_root, aa.con > 200)
= ggplot(anc_info_w_root, aes(x=aa.div, y=aa.con)) +
aa_p geom_point(size=2, alpha=0.3, color="#333333") +
#geom_point(data=second_cor, aes(x=aa.div, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
#geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
xlab("Divergent AA substitutions") +
ylab("Convergent AA substitutions") +
bartheme()
print(aa_p)
3.7 Including only convergent amino acid substitutions that are divergent at the codon level
= read.csv(anc_file, header=T, comment.char="#")
anc_info_w_root # Read the pairwise counts
= names(anc_info_w_root)[3:80]
cols_to_na for(col in cols_to_na){
$node1 %in% exclude_branches,][[col]] = NA
anc_info_w_root[anc_info_w_root$node2 %in% exclude_branches,][[col]] = NA
anc_info_w_root[anc_info_w_root
}# This loop changes all the counts for the branches descending from the root and the outgroup to NA to exclude them
= anc_info_w_root %>% mutate(aa.div = rowSums(select(., ends_with(".div.div")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(aa.con = rowSums(select(., ends_with(".div.con")), na.rm=T))
anc_info_w_root
#second_cor = subset(anc_info_w_root, aa.con > 200)
= ggplot(anc_info_w_root, aes(x=aa.div, y=aa.con)) +
aa_p geom_point(size=2, alpha=0.3, color="#333333") +
#geom_point(data=second_cor, aes(x=aa.div, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
#geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
xlab("Divergent AA substitutions") +
ylab("Convergent AA substitutions") +
bartheme()
print(aa_p)
4 Pairwise amino acid convergence without descendant branches
Excluding descendant branches removes the secondary correlation
= "../../data/rates/full-coding-astral-pairwise-convergence-nodesc.csv"
anc_file = read.csv(anc_file, header=T, comment.char="#")
anc_info_w_root # Read the pairwise counts
= names(anc_info_w_root)[3:80]
cols_to_na for(col in cols_to_na){
$node1 %in% exclude_branches,][[col]] = NA
anc_info_w_root[anc_info_w_root$node2 %in% exclude_branches,][[col]] = NA
anc_info_w_root[anc_info_w_root
}# This loop changes all the counts for the branches descending from the root and the outgroup to NA to exclude them
= anc_info_w_root %>% mutate(aa.div = rowSums(select(., ends_with(".div")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(aa.par = rowSums(select(., ends_with(".par")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(aa.con = rowSums(select(., ends_with(".con")), na.rm=T))
anc_info_w_root
= subset(anc_info_w_root, aa.con > 200)
second_cor #second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 3000 & aa.con > 200))
= rbind(second_cor, subset(anc_info_w_root, aa.div < 1000 & aa.con > 100))
second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 400 & aa.con > 50))
second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 200 & aa.con > 30))
second_cor
= ggplot(anc_info_w_root, aes(x=aa.div, y=aa.con)) +
aa_p geom_point(size=2, alpha=0.3, color="#333333") +
geom_point(data=second_cor, aes(x=aa.div, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
xlab("Divergent AA substitutions") +
ylab("Convergent AA substitutions") +
bartheme()
print(aa_p)
4.1 Divergent vs parallel substitutions
= subset(anc_info_w_root, aa.con > 200)
second_cor_par #second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 3000 & aa.con > 200))
= rbind(second_cor_par, subset(anc_info_w_root, aa.div < 1000 & aa.con > 100))
second_cor_par = rbind(second_cor_par, subset(anc_info_w_root, aa.div < 400 & aa.con > 50))
second_cor_par = rbind(second_cor_par, subset(anc_info_w_root, aa.div < 200 & aa.con > 30))
second_cor_par
= ggplot(anc_info_w_root, aes(x=aa.div, y=aa.par)) +
aa_p geom_point(size=2, alpha=0.3, color="#333333") +
geom_point(data=second_cor_par, aes(x=aa.div, y=aa.par), size=2, alpha=0.3, color=corecol(numcol=1)) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
#geom_smooth(data=second_cor_par, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
xlab("Divergent AA substitutions") +
ylab("Parallel AA substitutions") +
bartheme()
print(aa_p)
4.2 Amino acid convergence on the phylogeny
= data.frame("tp"=tree_info$tp)
conv_counts $aa.con.sum = 0
conv_counts$aa.div.sum = 0
conv_counts
for(i in 1:nrow(anc_info_w_root)){
= anc_info_w_root[i,]
row = row$node1
n1 = row$node2
n2 $tp==n1,]$aa.con.sum = conv_counts[conv_counts$tp==n1,]$aa.con.sum + row$aa.con
conv_counts[conv_counts$tp==n1,]$aa.div.sum = conv_counts[conv_counts$tp==n1,]$aa.div.sum + row$aa.div
conv_counts[conv_counts
$tp==n2,]$aa.con.sum = conv_counts[conv_counts$tp==n2,]$aa.con.sum + row$aa.con
conv_counts[conv_counts$tp==n2,]$aa.div.sum = conv_counts[conv_counts$tp==n2,]$aa.div.sum + row$aa.div
conv_counts[conv_counts
}
= merge(tree_info, conv_counts, by="tp")
tree_info
= tree_info[order(tree_info$node), ]
tree_info # Re-order the rows by the R node
= corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l # Colors
$cd.ratio = tree_info$aa.con.sum / tree_info$aa.div.sum
tree_infois.nan(tree_info$cd.ratio),]$cd.ratio = NA
tree_info[
= ggtree(tree, size=0.8, ladderize=F, aes(color=tree_info$cd.ratio)) +
conv_tree scale_color_continuous(name='# convergent subs / # divergent subs', low=l, high=h) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
#geom_label(aes(x=branch, label=ifelse(tree_info$conv.pair.count>0,as.character(tree_info$conv.pair.count),'')), label.size=NA, fill="transparent") +
theme(legend.position=c(0.15,0.9))
print(conv_tree)
if(save_tree_fig){
= gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
gcf_tree = paste("../../data/trees/", tree_type, "-gcf-tree-PARED-gcf50.pdf", sep="")
tree_outfile ggsave(tree_outfile, gcf_tree, width=8, height=16, unit="in")
}# conv tree
5 Codon convergence
5.1 Non-synonymous codon convergence
= anc_info_w_root %>% mutate(syn.con = rowSums(select(., starts_with("s.s.con")), na.rm=T))
anc_info_w_root #anc_info_w_root = anc_info_w_root %>% mutate(non.con = rowSums(select(., starts_with("n.n.con") | starts_with("s.n.con")), na.rm=T))
#anc_info_w_root = anc_info_w_root %>% mutate(non.div = rowSums(select(., starts_with("n.n.div") | starts_with("s.n.div")), na.rm=T))
= anc_info_w_root %>% mutate(non.con = rowSums(select(., starts_with("n.n.con")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(non.div = rowSums(select(., starts_with("n.n.div")), na.rm=T))
anc_info_w_root
= ggplot(anc_info_w_root, aes(x=non.div, y=non.con)) +
ns_codon_p geom_point(size=2, alpha=0.3, color="#333333") +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
xlab("Divergent non-synonymous codon substitutions") +
ylab("Convergent non-synonymous codon substitutions") +
bartheme()
print(ns_codon_p)
5.2 Non-synonymous codon convergence / synonymous codon convergence
For each pair, counts where both convergent substitutions are non-synonymous vs. both are synonymous
= subset(anc_info_w_root, syn.con > 4)
cncs_set
$cncs = cncs_set$non.con / cncs_set$syn.con
cncs_set#cncs_set$cncs[is.na(cncs_set$cncs)] = NA
#cncs_set$cncs[is.infinite(cncs_set$cncs)] = NA
= ggplot(cncs_set, aes(x=cncs)) +
cncs_p geom_histogram(fill=corecol(numcol=1, offset=1), color="#ececec") +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
xlab("cN/cS") +
ylab("Branch pairs") +
bartheme()
print(cncs_p)
cat("Avg. cN/cS: ", mean(cncs_set$cncs, na.rm=T), "\n")
## Avg. cN/cS: 0.2915383
cat("Median cN/cS: ", median(cncs_set$cncs, na.rm=T), "\n")
## Median cN/cS: 0.2391304
= select(cncs_set, node1, node2, non.con, syn.con, cncs)
cncs_subset = cncs_subset[order(-cncs_subset$cncs), ]
cncs_subset
head(cncs_subset, n=20)
= quantile(cncs_set$cncs, 0.95, na.rm=T)
cncs_95_perc
cat("cN/cS 95th percentile: ", cncs_95_perc, "\n")
## cN/cS 95th percentile: 0.7777778
5.3 Synonymous vs. non-synonymous codon convergence
Y = above 95th percentile of cN/cS
N = below 95th percentile of cN/cS
$cncs.95 = ifelse(cncs_subset$cncs > cncs_95_perc, "Y", "N")
cncs_subset
= ggplot(cncs_subset, aes(x=syn.con, y=non.con, color=cncs.95)) +
s_ns_codon_p geom_point(size=2, alpha=0.3) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
xlab("Convergent synonymous codon substitutions") +
ylab("Convergent non-synonymous\ncodon substitutions") +
bartheme()
print(s_ns_codon_p)
= quantile(anc_info$cS, 0.95, na.rm=T)
cs_95_perc = quantile(anc_info$cN, 0.95, na.rm=T)
cn_95_perc
= ggplot(anc_info, aes(x=cS, y=cN, color=node.type)) +
p geom_point(size=2, alpha=0.2) +
geom_text_repel(aes(label=ifelse(cS>cs_95_perc | cN>cn_95_perc, as.character(node), '')), show_guide=F) +
#(aes(label=ifelse(avg.dN>dn_95_perc, as.character(node), '')), show_guide=F) +
geom_vline(xintercept=cs_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
geom_hline(yintercept=cn_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
#geom_smooth(method="lm", se=F, ) +
xlab("cS per branch") +
ylab("cN per branch") +
bartheme() +
theme(legend.position="bottom") +
guides(colour = guide_legend(override.aes = list(alpha = 1)))
= ggExtra::ggMarginal(p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=1), color="#666666")
p print(p)
$cs.outlier = ifelse(anc_info$cS>cs_95_perc,anc_info$node,'')
anc_info$cn.outlier = ifelse(anc_info$cN>cn_95_perc,anc_info$node,'') anc_info
# Synonymous and non-synonymous convergent substitutions
#This results in the following distributions for number of convergent substitutions across branches (green lines indicating 95th percentile of each rate):
= quantile(anc_info$cS, 0.95, na.rm=T)
cs_95_perc = quantile(anc_info$cN, 0.95, na.rm=T)
cn_95_perc
= ggplot(anc_info, aes(x=cS, y=cN, color=node.type)) +
p geom_point(size=2, alpha=0.2) +
geom_text_repel(aes(label=ifelse(cS>cs_95_perc | cN>cn_95_perc, as.character(node), '')), show_guide=F) +
#(aes(label=ifelse(avg.dN>dn_95_perc, as.character(node), '')), show_guide=F) +
geom_vline(xintercept=cs_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
geom_hline(yintercept=cn_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
#geom_smooth(method="lm", se=F, ) +
xlab("cS per branch") +
ylab("cN per branch") +
bartheme() +
theme(legend.position="bottom") +
guides(colour = guide_legend(override.aes = list(alpha = 1)))
= ggExtra::ggMarginal(p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=1), color="#666666")
p print(p)
$cs.outlier = ifelse(anc_info$cS>cs_95_perc,anc_info$node,'')
anc_info$cn.outlier = ifelse(anc_info$cN>cn_95_perc,anc_info$node,'') anc_info
# Species tree with branches colored # of convergent substitutions to any other branch
= corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l # Colors
= ggtree(tree, size=0.8, ladderize=F, aes(color=anc_info_w_root$total.c)) +
c_tree scale_color_continuous(name='Convergent subs', low=l, high=h) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9))
print(c_tree)
# Ratio of non-synyonymous to synonymous convergent substitutions (cN/cS)
= ggplot(anc_info, aes(x=cn.cs)) +
cncs_p geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1), color="#666666") +
scale_y_continuous(expand=c(0,0)) +
xlab("cN/cS") +
ylab("# of branches") +
bartheme()
print(cncs_p)
# Distribution of dN/dS when using gene trees
= quantile(anc_info$cn.cs, 0.95, na.rm=T)
cncs_95_perc $cncs.outlier = ifelse(anc_info$cn.cs>cncs_95_perc,anc_info$node,'') anc_info
# Species tree with branches colored by cN/cS
= corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l # Colors
= ggtree(tree, size=0.8, ladderize=F, aes(color=anc_info_w_root$cn.cs)) +
cncs_tree scale_color_continuous(name='cN/cS', low=l, high=h) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9))
print(cncs_tree)
# Pairwise convergent substitutions
# Convergent and divergent substitutions between every pair of branches in the tree
= read.csv("../../data/rates/")
pairwise_data
$convergent = pairwise_data$con.syn + pairwise_data$con.one.each + pairwise_data$con.non.syn
pairwise_data$divergent = pairwise_data$div.syn + pairwise_data$div.one.each + pairwise_data$div.non.syn
pairwise_data
= ggplot(pairwise_data, aes(x=divergent, y=convergent)) +
pairwise_p geom_point(size=2, alpha=0.3, color="#920000") +
geom_smooth(method="lm", se=F, size=2, linetype="dashed", color="#333333") +
bartheme()
print(pairwise_p)