Murine convergent substitutions
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(ggplot2)
library(cowplot)
library(ggbeeswarm)
library(dplyr)
#library(kableExtra)
library(tidyr)
library(ggtree)
library(phytools)
library(phangorn)
library(reshape2)
library(ggExtra)
library(ggrepel)
#library(vroom)
library(ggdist)
#library(stringr)
source("../lib/design.r")
source("../lib/get_tree_info.r")tree_type = "astral"
save_tree_fig = F
cat(tree_type, " tree\n")## astral tree
cat("188 species targeted capture to mouse exons assembled with SPADES.\n")## 188 species targeted capture to mouse exons assembled with SPADES.
cat("11,775 coding loci aligned with exonerate+mafft\n")## 11,775 coding loci aligned with exonerate+mafft
cat("Gene trees inferred with IQtree.\n")## Gene trees inferred with IQtree.
if(tree_type == "astral"){
cat("Species tree inferred with ASTRAL.\n")
cat("Branch lengths estimated using 865 loci with normalized RF distance to species tree <= 0.25.\n")
tree_file = "../../data/trees/full_coding_iqtree_astral.cf.bl.nrf25.rooted.labeled.treefile"
anc_file = "../../data/rates/full-coding-astral-pairwise-convergence.csv"
branch_count_file = "../../data/rates/full-coding-astral-mg94-local-branch-mns.csv"
exclude_branches = c("Lophiomys_imhausi_UM5152", "Lophuromys_woosnami_LSUMZ37793", "<1>", "<187>", "<186>")
xmax = 0.125
}else if(tree_type == "concat"){
cat("Species tree inferred by concatenation of all loci with IQtree.\n")
tree_file = "../../data/trees/full_coding_iqtree_concat.cf.rooted.tree"
anc_file = "../../data/rates/full-coding-concat-pairwise-convergence.csv"
branch_count_file = ""
exclude_branches = c("Lophiomys_imhausi_UM5152", "Lophuromys_woosnami_LSUMZ37793", "<1>", "<187>", "<186>")
xmax = 0.125
}## Species tree inferred with ASTRAL.
## Branch lengths estimated using 865 loci with normalized RF distance to species tree <= 0.25.
cat("Ancestral sequences inferred with HyPhy\n")## Ancestral sequences inferred with HyPhy
tree = read.tree(tree_file)
tree_to_df_list = treeToDF(tree)
tree_info = tree_to_df_list[["info"]]
if(tree_type == "astral"){
tree_info = tree_info %>% separate(label, c("tp", "astral", "gcf", "scf"), sep="/", remove=F)
tree_info$astral[tree_info$node.type=="tip"] = NA
tree_info$astral = as.numeric(tree_info$astral)
tree_info$gcf = as.numeric(tree_info$gcf)
tree_info$scf = as.numeric(tree_info$scf)
}else if(tree_type == "concat"){
tree_info = tree_info %>% separate(label, c("bootstrap", "gcf", "scf"), sep="/", remove=F)
tree_info$bootstrap[tree_info$node.type=="tip"] = NA
tree_info$bootstrap = as.numeric(tree_info$bootstrap)
tree_info$gcf = as.numeric(tree_info$gcf)
tree_info$scf = as.numeric(tree_info$scf)
}1 Definitions
- Divergent substitution: A substitution at the same site on two branches of the tree to a different derived state regardless of ancestral state (e.g. A,A -> C,G or A,T -> C,G).
- Convergent substitution: A substitution at the same site on two branches of the tree that differ in their ancestral state to an identical derived state (e.g. A,G -> C,C).
- Parllel substitution: A substitution at the same site on two branches of the tree that share the same ancestral state to an identical derived state (e.g. A,A -> C,C).
For all pairwise analyses, comparisons between branches where one is the direct descendant of the other are excluded.
2 Species tree branch presence/absence per gene
Following the same procedure as before, we count convergent substitutions per species tree branch only in genes that contain that branch (full clade or partial clade).
3 Pairwise amino acid convergence
Convergent and divergent amino acid substitutions were counted on each pair of branches in the species tree in every gene where both branches were present, excluding pairs where one branch is the direct descendant of the other.
We find a strong correlation between divergent and convergent amino acid substitutions – with a secondary correlation (orange points).
anc_info_w_root = read.csv(anc_file, header=T, comment.char="#")
# Read the pairwise counts
cols_to_na = names(anc_info_w_root)[3:80]
for(col in cols_to_na){
anc_info_w_root[anc_info_w_root$node1 %in% exclude_branches,][[col]] = NA
anc_info_w_root[anc_info_w_root$node2 %in% exclude_branches,][[col]] = NA
}
# This loop changes all the counts for the branches descending from the root and the outgroup to NA to exclude them
anc_info_w_root = anc_info_w_root %>% mutate(aa.div = rowSums(select(., ends_with(".div")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(aa.par = rowSums(select(., ends_with(".par")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(aa.con = rowSums(select(., ends_with(".con")), na.rm=T))
second_cor = subset(anc_info_w_root, aa.con > 200)
#second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 3000 & aa.con > 200))
second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 1000 & aa.con > 100))
second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 400 & aa.con > 50))
second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 200 & aa.con > 30))
aa_p = ggplot(anc_info_w_root, aes(x=aa.div, y=aa.con)) +
geom_point(size=2, alpha=0.3, color="#333333") +
geom_point(data=second_cor, aes(x=aa.div, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
xlab("Divergent AA substitutions") +
ylab("Convergent AA substitutions") +
bartheme()
print(aa_p)3.1 Divergent vs parallel substitutions
No secondary correlation between divergent and parallel (same ancestral state) substitutions, though those pairs seem to cluster on the lower end of parallel substitutions.
second_cor_par = subset(anc_info_w_root, aa.con > 200)
#second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 3000 & aa.con > 200))
second_cor_par = rbind(second_cor_par, subset(anc_info_w_root, aa.div < 1000 & aa.con > 100))
second_cor_par = rbind(second_cor_par, subset(anc_info_w_root, aa.div < 400 & aa.con > 50))
second_cor_par = rbind(second_cor_par, subset(anc_info_w_root, aa.div < 200 & aa.con > 30))
aa_p = ggplot(anc_info_w_root, aes(x=aa.div, y=aa.par)) +
geom_point(size=2, alpha=0.3, color="#333333") +
geom_point(data=second_cor_par, aes(x=aa.div, y=aa.par), size=2, alpha=0.3, color=corecol(numcol=1)) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
#geom_smooth(data=second_cor_par, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
xlab("Divergent AA substitutions") +
ylab("Parallel AA substitutions") +
bartheme()
print(aa_p)second_cor_br = select(second_cor, node1, node2)
second_cor_br_counts = data.frame(table(unlist(second_cor_br)))
names(second_cor_br_counts) = c("tp", "conv.pair.count")
#second_cor_p = ggplot(second_cor_br_counts, aes(x=tp, y=conv.pair.count)) +
# geom_bar(stat="identity") +
# bartheme()
#print(second_cor_p)
uniq_info_cols = names(second_cor_br_counts)[!(names(second_cor_br_counts) %in% names(tree_info))]
# # Get non common names
uniq_info_cols = c(uniq_info_cols, "tp") # appending key parameter
# Get a list of columns from the tree_info df to join to the tree rates df
tree_info = tree_info %>% left_join((second_cor_br_counts %>% select(uniq_info_cols)), by="tp")
# Select the columns from tree_info and join to tree_rates, merging by clade
# https://stackoverflow.com/a/61628157
tree_info = tree_info[order(tree_info$node), ]
# Re-order the rows by the R node
tree_info$conv.pair.count[is.na(tree_info$conv.pair.count)] = 03.2 Branches involved in secondary correlation
Remember that each point above is a count between a PAIR of branches. This tree displays the number of times each branch in the tree is involved in one of the pairings in the secondary correlation (orange points).
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors
conv_tree = ggtree(tree, size=0.8, ladderize=F, aes(color=tree_info$conv.pair.count)) +
scale_color_continuous(name='Times in secondary pair', low=l, high=h) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
geom_label(aes(x=branch, label=ifelse(tree_info$conv.pair.count>0,as.character(tree_info$conv.pair.count),'')), label.size=NA, fill="transparent") +
theme(legend.position=c(0.15,0.9))
print(conv_tree)if(save_tree_fig){
gcf_tree = gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
tree_outfile = paste("../../data/trees/", tree_type, "-gcf-tree-PARED-gcf50.pdf", sep="")
ggsave(tree_outfile, gcf_tree, width=8, height=16, unit="in")
}
# conv tree3.3 Secondary correlation and branch length
No correlation between branch length and the number of times a branch is on this secondary correlation.
Note: ALL branches plotted, but still no correlation if only those with at least 1 pairing in the secondary correlation are used
secondary_tree_info = subset(tree_info, conv.pair.count > 0)
conv_bl_p = ggplot(tree_info, aes(x=branch.length, y=conv.pair.count)) +
geom_point(size=2, alpha=0.3, color=corecol(numcol=1)) +
xlab("Branch length") +
ylab("# of times in second correlation") +
bartheme()
print(conv_bl_p)3.4 Secondary correlation and number of loci branch present in
No correlation between branch presence and the number of times a branch is on this secondary correlation.
Note: ALL branches plotted, but still no correlation if only those with at least 1 pairing in the secondary correlation are used
branch_presence = read.csv(branch_count_file, header=T)
names(branch_presence)[1] = "tp"
uniq_info_cols = names(branch_presence)[!(names(branch_presence) %in% names(tree_info))]
# # Get non common names
uniq_info_cols = c(uniq_info_cols, "tp") # appending key parameter
# Get a list of columns from the tree_info df to join to the tree rates df
tree_info = tree_info %>% left_join((branch_presence %>% select(uniq_info_cols)), by="tp")
# Select the columns from tree_info and join to tree_rates, merging by clade
# https://stackoverflow.com/a/61628157
tree_info = tree_info[order(tree_info$node), ]
# Re-order the rows by the R node
secondary_tree_info = subset(tree_info, conv.pair.count > 0)
conv_bp_p = ggplot(tree_info, aes(x=num.genes.full+num.genes.partial, y=conv.pair.count)) +
geom_point(size=2, alpha=0.3, color=corecol(numcol=1)) +
xlab("# of times branch in gene") +
ylab("# of times in second correlation") +
bartheme()
print(conv_bp_p)3.5 Convergent substitutions and number of time branch pairs are present in gene trees
#secondary_tree_info = subset(tree_info, conv.pair.count > 0)
conv_pairs_p = ggplot(anc_info_w_root, aes(x=count, y=aa.con)) +
geom_point(size=2, alpha=0.3, color="#333333") +
geom_point(data=second_cor, aes(x=count, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
#geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
#geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
xlab("# of times branches appear together") +
ylab("Convergent AA substitutions") +
bartheme()
print(conv_pairs_p)3.6 Excluding convergent AA substitutions that are divergent at the codon level
This pattern, and actually most convergent amino acid substitutions, are actually driven by divergent substitutions at the codon level:
anc_info_w_root = read.csv(anc_file, header=T, comment.char="#")
# Read the pairwise counts
cols_to_na = names(anc_info_w_root)[3:80]
for(col in cols_to_na){
anc_info_w_root[anc_info_w_root$node1 %in% exclude_branches,][[col]] = NA
anc_info_w_root[anc_info_w_root$node2 %in% exclude_branches,][[col]] = NA
}
# This loop changes all the counts for the branches descending from the root and the outgroup to NA to exclude them
anc_info_w_root = anc_info_w_root %>% mutate(aa.div = rowSums(select(., ends_with(".div.div")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(aa.par = rowSums(select(., ends_with(".par.par")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(aa.con = rowSums(select(., ends_with(".con.con")), na.rm=T))
#second_cor = subset(anc_info_w_root, aa.con > 200)
aa_p = ggplot(anc_info_w_root, aes(x=aa.div, y=aa.con)) +
geom_point(size=2, alpha=0.3, color="#333333") +
#geom_point(data=second_cor, aes(x=aa.div, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
#geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
xlab("Divergent AA substitutions") +
ylab("Convergent AA substitutions") +
bartheme()
print(aa_p)3.7 Including only convergent amino acid substitutions that are divergent at the codon level
anc_info_w_root = read.csv(anc_file, header=T, comment.char="#")
# Read the pairwise counts
cols_to_na = names(anc_info_w_root)[3:80]
for(col in cols_to_na){
anc_info_w_root[anc_info_w_root$node1 %in% exclude_branches,][[col]] = NA
anc_info_w_root[anc_info_w_root$node2 %in% exclude_branches,][[col]] = NA
}
# This loop changes all the counts for the branches descending from the root and the outgroup to NA to exclude them
anc_info_w_root = anc_info_w_root %>% mutate(aa.div = rowSums(select(., ends_with(".div.div")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(aa.con = rowSums(select(., ends_with(".div.con")), na.rm=T))
#second_cor = subset(anc_info_w_root, aa.con > 200)
aa_p = ggplot(anc_info_w_root, aes(x=aa.div, y=aa.con)) +
geom_point(size=2, alpha=0.3, color="#333333") +
#geom_point(data=second_cor, aes(x=aa.div, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
#geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
xlab("Divergent AA substitutions") +
ylab("Convergent AA substitutions") +
bartheme()
print(aa_p)4 Pairwise amino acid convergence without descendant branches
Excluding descendant branches removes the secondary correlation
anc_file = "../../data/rates/full-coding-astral-pairwise-convergence-nodesc.csv"
anc_info_w_root = read.csv(anc_file, header=T, comment.char="#")
# Read the pairwise counts
cols_to_na = names(anc_info_w_root)[3:80]
for(col in cols_to_na){
anc_info_w_root[anc_info_w_root$node1 %in% exclude_branches,][[col]] = NA
anc_info_w_root[anc_info_w_root$node2 %in% exclude_branches,][[col]] = NA
}
# This loop changes all the counts for the branches descending from the root and the outgroup to NA to exclude them
anc_info_w_root = anc_info_w_root %>% mutate(aa.div = rowSums(select(., ends_with(".div")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(aa.par = rowSums(select(., ends_with(".par")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(aa.con = rowSums(select(., ends_with(".con")), na.rm=T))
second_cor = subset(anc_info_w_root, aa.con > 200)
#second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 3000 & aa.con > 200))
second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 1000 & aa.con > 100))
second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 400 & aa.con > 50))
second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 200 & aa.con > 30))
aa_p = ggplot(anc_info_w_root, aes(x=aa.div, y=aa.con)) +
geom_point(size=2, alpha=0.3, color="#333333") +
geom_point(data=second_cor, aes(x=aa.div, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
xlab("Divergent AA substitutions") +
ylab("Convergent AA substitutions") +
bartheme()
print(aa_p)4.1 Divergent vs parallel substitutions
second_cor_par = subset(anc_info_w_root, aa.con > 200)
#second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 3000 & aa.con > 200))
second_cor_par = rbind(second_cor_par, subset(anc_info_w_root, aa.div < 1000 & aa.con > 100))
second_cor_par = rbind(second_cor_par, subset(anc_info_w_root, aa.div < 400 & aa.con > 50))
second_cor_par = rbind(second_cor_par, subset(anc_info_w_root, aa.div < 200 & aa.con > 30))
aa_p = ggplot(anc_info_w_root, aes(x=aa.div, y=aa.par)) +
geom_point(size=2, alpha=0.3, color="#333333") +
geom_point(data=second_cor_par, aes(x=aa.div, y=aa.par), size=2, alpha=0.3, color=corecol(numcol=1)) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
#geom_smooth(data=second_cor_par, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
xlab("Divergent AA substitutions") +
ylab("Parallel AA substitutions") +
bartheme()
print(aa_p)4.2 Amino acid convergence on the phylogeny
conv_counts = data.frame("tp"=tree_info$tp)
conv_counts$aa.con.sum = 0
conv_counts$aa.div.sum = 0
for(i in 1:nrow(anc_info_w_root)){
row = anc_info_w_root[i,]
n1 = row$node1
n2 = row$node2
conv_counts[conv_counts$tp==n1,]$aa.con.sum = conv_counts[conv_counts$tp==n1,]$aa.con.sum + row$aa.con
conv_counts[conv_counts$tp==n1,]$aa.div.sum = conv_counts[conv_counts$tp==n1,]$aa.div.sum + row$aa.div
conv_counts[conv_counts$tp==n2,]$aa.con.sum = conv_counts[conv_counts$tp==n2,]$aa.con.sum + row$aa.con
conv_counts[conv_counts$tp==n2,]$aa.div.sum = conv_counts[conv_counts$tp==n2,]$aa.div.sum + row$aa.div
}
tree_info = merge(tree_info, conv_counts, by="tp")
tree_info = tree_info[order(tree_info$node), ]
# Re-order the rows by the R nodeh = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors
tree_info$cd.ratio = tree_info$aa.con.sum / tree_info$aa.div.sum
tree_info[is.nan(tree_info$cd.ratio),]$cd.ratio = NA
conv_tree = ggtree(tree, size=0.8, ladderize=F, aes(color=tree_info$cd.ratio)) +
scale_color_continuous(name='# convergent subs / # divergent subs', low=l, high=h) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
#geom_label(aes(x=branch, label=ifelse(tree_info$conv.pair.count>0,as.character(tree_info$conv.pair.count),'')), label.size=NA, fill="transparent") +
theme(legend.position=c(0.15,0.9))
print(conv_tree)if(save_tree_fig){
gcf_tree = gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
tree_outfile = paste("../../data/trees/", tree_type, "-gcf-tree-PARED-gcf50.pdf", sep="")
ggsave(tree_outfile, gcf_tree, width=8, height=16, unit="in")
}
# conv tree5 Codon convergence
5.1 Non-synonymous codon convergence
anc_info_w_root = anc_info_w_root %>% mutate(syn.con = rowSums(select(., starts_with("s.s.con")), na.rm=T))
#anc_info_w_root = anc_info_w_root %>% mutate(non.con = rowSums(select(., starts_with("n.n.con") | starts_with("s.n.con")), na.rm=T))
#anc_info_w_root = anc_info_w_root %>% mutate(non.div = rowSums(select(., starts_with("n.n.div") | starts_with("s.n.div")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(non.con = rowSums(select(., starts_with("n.n.con")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(non.div = rowSums(select(., starts_with("n.n.div")), na.rm=T))
ns_codon_p = ggplot(anc_info_w_root, aes(x=non.div, y=non.con)) +
geom_point(size=2, alpha=0.3, color="#333333") +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
xlab("Divergent non-synonymous codon substitutions") +
ylab("Convergent non-synonymous codon substitutions") +
bartheme()
print(ns_codon_p)5.2 Non-synonymous codon convergence / synonymous codon convergence
For each pair, counts where both convergent substitutions are non-synonymous vs. both are synonymous
cncs_set = subset(anc_info_w_root, syn.con > 4)
cncs_set$cncs = cncs_set$non.con / cncs_set$syn.con
#cncs_set$cncs[is.na(cncs_set$cncs)] = NA
#cncs_set$cncs[is.infinite(cncs_set$cncs)] = NA
cncs_p = ggplot(cncs_set, aes(x=cncs)) +
geom_histogram(fill=corecol(numcol=1, offset=1), color="#ececec") +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
xlab("cN/cS") +
ylab("Branch pairs") +
bartheme()
print(cncs_p)cat("Avg. cN/cS: ", mean(cncs_set$cncs, na.rm=T), "\n")## Avg. cN/cS: 0.2915383
cat("Median cN/cS: ", median(cncs_set$cncs, na.rm=T), "\n")## Median cN/cS: 0.2391304
cncs_subset = select(cncs_set, node1, node2, non.con, syn.con, cncs)
cncs_subset = cncs_subset[order(-cncs_subset$cncs), ]
head(cncs_subset, n=20)cncs_95_perc = quantile(cncs_set$cncs, 0.95, na.rm=T)
cat("cN/cS 95th percentile: ", cncs_95_perc, "\n")## cN/cS 95th percentile: 0.7777778
5.3 Synonymous vs. non-synonymous codon convergence
Y = above 95th percentile of cN/cS
N = below 95th percentile of cN/cS
cncs_subset$cncs.95 = ifelse(cncs_subset$cncs > cncs_95_perc, "Y", "N")
s_ns_codon_p = ggplot(cncs_subset, aes(x=syn.con, y=non.con, color=cncs.95)) +
geom_point(size=2, alpha=0.3) +
geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
xlab("Convergent synonymous codon substitutions") +
ylab("Convergent non-synonymous\ncodon substitutions") +
bartheme()
print(s_ns_codon_p)cs_95_perc = quantile(anc_info$cS, 0.95, na.rm=T)
cn_95_perc = quantile(anc_info$cN, 0.95, na.rm=T)
p = ggplot(anc_info, aes(x=cS, y=cN, color=node.type)) +
geom_point(size=2, alpha=0.2) +
geom_text_repel(aes(label=ifelse(cS>cs_95_perc | cN>cn_95_perc, as.character(node), '')), show_guide=F) +
#(aes(label=ifelse(avg.dN>dn_95_perc, as.character(node), '')), show_guide=F) +
geom_vline(xintercept=cs_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
geom_hline(yintercept=cn_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
#geom_smooth(method="lm", se=F, ) +
xlab("cS per branch") +
ylab("cN per branch") +
bartheme() +
theme(legend.position="bottom") +
guides(colour = guide_legend(override.aes = list(alpha = 1)))
p = ggExtra::ggMarginal(p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=1), color="#666666")
print(p)
anc_info$cs.outlier = ifelse(anc_info$cS>cs_95_perc,anc_info$node,'')
anc_info$cn.outlier = ifelse(anc_info$cN>cn_95_perc,anc_info$node,'')# Synonymous and non-synonymous convergent substitutions
#This results in the following distributions for number of convergent substitutions across branches (green lines indicating 95th percentile of each rate):
cs_95_perc = quantile(anc_info$cS, 0.95, na.rm=T)
cn_95_perc = quantile(anc_info$cN, 0.95, na.rm=T)
p = ggplot(anc_info, aes(x=cS, y=cN, color=node.type)) +
geom_point(size=2, alpha=0.2) +
geom_text_repel(aes(label=ifelse(cS>cs_95_perc | cN>cn_95_perc, as.character(node), '')), show_guide=F) +
#(aes(label=ifelse(avg.dN>dn_95_perc, as.character(node), '')), show_guide=F) +
geom_vline(xintercept=cs_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
geom_hline(yintercept=cn_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
#geom_smooth(method="lm", se=F, ) +
xlab("cS per branch") +
ylab("cN per branch") +
bartheme() +
theme(legend.position="bottom") +
guides(colour = guide_legend(override.aes = list(alpha = 1)))
p = ggExtra::ggMarginal(p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=1), color="#666666")
print(p)
anc_info$cs.outlier = ifelse(anc_info$cS>cs_95_perc,anc_info$node,'')
anc_info$cn.outlier = ifelse(anc_info$cN>cn_95_perc,anc_info$node,'')# Species tree with branches colored # of convergent substitutions to any other branch
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors
c_tree = ggtree(tree, size=0.8, ladderize=F, aes(color=anc_info_w_root$total.c)) +
scale_color_continuous(name='Convergent subs', low=l, high=h) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9))
print(c_tree)# Ratio of non-synyonymous to synonymous convergent substitutions (cN/cS)
cncs_p = ggplot(anc_info, aes(x=cn.cs)) +
geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1), color="#666666") +
scale_y_continuous(expand=c(0,0)) +
xlab("cN/cS") +
ylab("# of branches") +
bartheme()
print(cncs_p)
# Distribution of dN/dS when using gene trees
cncs_95_perc = quantile(anc_info$cn.cs, 0.95, na.rm=T)
anc_info$cncs.outlier = ifelse(anc_info$cn.cs>cncs_95_perc,anc_info$node,'')# Species tree with branches colored by cN/cS
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors
cncs_tree = ggtree(tree, size=0.8, ladderize=F, aes(color=anc_info_w_root$cn.cs)) +
scale_color_continuous(name='cN/cS', low=l, high=h) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9))
print(cncs_tree)# Pairwise convergent substitutions
# Convergent and divergent substitutions between every pair of branches in the tree
pairwise_data = read.csv("../../data/rates/")
pairwise_data$convergent = pairwise_data$con.syn + pairwise_data$con.one.each + pairwise_data$con.non.syn
pairwise_data$divergent = pairwise_data$div.syn + pairwise_data$div.one.each + pairwise_data$div.non.syn
pairwise_p = ggplot(pairwise_data, aes(x=divergent, y=convergent)) +
geom_point(size=2, alpha=0.3, color="#920000") +
geom_smooth(method="lm", se=F, size=2, linetype="dashed", color="#333333") +
bartheme()
print(pairwise_p)