Murine convergent substitutions

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

library(ggplot2)
library(cowplot)
library(ggbeeswarm)
library(dplyr)
#library(kableExtra)
library(tidyr)
library(ggtree)
library(phytools)
library(phangorn)
library(reshape2)
library(ggExtra)
library(ggrepel)
#library(vroom)
library(ggdist)
#library(stringr)
source("../lib/design.r")
source("../lib/get_tree_info.r")

< Back to summary

tree_type = "astral"
save_tree_fig = F

cat(tree_type, " tree\n")

## astral  tree

cat("188 species targeted capture to mouse exons assembled with SPADES.\n")

## 188 species targeted capture to mouse exons assembled with SPADES.

cat("11,775 coding loci aligned with exonerate+mafft\n")

## 11,775 coding loci aligned with exonerate+mafft

cat("Gene trees inferred with IQtree.\n")

## Gene trees inferred with IQtree.

if(tree_type == "astral"){
  cat("Species tree inferred with ASTRAL.\n")
  cat("Branch lengths estimated using 865 loci with normalized RF distance to species tree <= 0.25.\n")
  tree_file = "../../data/trees/full_coding_iqtree_astral.cf.bl.nrf25.rooted.labeled.treefile"
  anc_file = "../../data/rates/full-coding-astral-pairwise-convergence.csv"
  branch_count_file = "../../data/rates/full-coding-astral-mg94-local-branch-mns.csv"
  exclude_branches = c("Lophiomys_imhausi_UM5152", "Lophuromys_woosnami_LSUMZ37793", "<1>", "<187>", "<186>")
  xmax = 0.125
}else if(tree_type == "concat"){
  cat("Species tree inferred by concatenation of all loci with IQtree.\n")
  tree_file = "../../data/trees/full_coding_iqtree_concat.cf.rooted.tree"
  anc_file = "../../data/rates/full-coding-concat-pairwise-convergence.csv"
  branch_count_file = ""
  exclude_branches = c("Lophiomys_imhausi_UM5152", "Lophuromys_woosnami_LSUMZ37793", "<1>", "<187>", "<186>")
  xmax = 0.125
}

## Species tree inferred with ASTRAL.
## Branch lengths estimated using 865 loci with normalized RF distance to species tree <= 0.25.

cat("Ancestral sequences inferred with HyPhy\n")

## Ancestral sequences inferred with HyPhy

tree = read.tree(tree_file)
tree_to_df_list = treeToDF(tree)
tree_info = tree_to_df_list[["info"]]

if(tree_type == "astral"){
  tree_info = tree_info %>% separate(label, c("tp", "astral", "gcf", "scf"), sep="/", remove=F)
  tree_info$astral[tree_info$node.type=="tip"] = NA
  tree_info$astral = as.numeric(tree_info$astral)
  tree_info$gcf = as.numeric(tree_info$gcf)
  tree_info$scf = as.numeric(tree_info$scf)
}else if(tree_type == "concat"){
  tree_info = tree_info %>% separate(label, c("bootstrap", "gcf", "scf"), sep="/", remove=F)
  tree_info$bootstrap[tree_info$node.type=="tip"] = NA
  tree_info$bootstrap = as.numeric(tree_info$bootstrap)
  tree_info$gcf = as.numeric(tree_info$gcf)
  tree_info$scf = as.numeric(tree_info$scf)
}

1 Definitions

Divergent substitution: A substitution at the same site on two branches of the tree to a different derived state regardless of ancestral state (e.g. A,A -> C,G or A,T -> C,G).
Convergent substitution: A substitution at the same site on two branches of the tree that differ in their ancestral state to an identical derived state (e.g. A,G -> C,C).
Parllel substitution: A substitution at the same site on two branches of the tree that share the same ancestral state to an identical derived state (e.g. A,A -> C,C).

For all pairwise analyses, comparisons between branches where one is the direct descendant of the other are excluded.

2 Species tree branch presence/absence per gene

Following the same procedure as before, we count convergent substitutions per species tree branch only in genes that contain that branch (full clade or partial clade).

3 Pairwise amino acid convergence

Convergent and divergent amino acid substitutions were counted on each pair of branches in the species tree in every gene where both branches were present, excluding pairs where one branch is the direct descendant of the other.

We find a strong correlation between divergent and convergent amino acid substitutions – with a secondary correlation (orange points).

anc_info_w_root = read.csv(anc_file, header=T, comment.char="#")
# Read the pairwise counts

cols_to_na = names(anc_info_w_root)[3:80]
for(col in cols_to_na){
  anc_info_w_root[anc_info_w_root$node1 %in% exclude_branches,][[col]] = NA
  anc_info_w_root[anc_info_w_root$node2 %in% exclude_branches,][[col]] = NA
}
# This loop changes all the counts for the branches descending from the root and the outgroup to NA to exclude them

anc_info_w_root = anc_info_w_root %>% mutate(aa.div = rowSums(select(., ends_with(".div")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(aa.par = rowSums(select(., ends_with(".par")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(aa.con = rowSums(select(., ends_with(".con")), na.rm=T))

second_cor = subset(anc_info_w_root, aa.con > 200)
#second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 3000 & aa.con > 200))
second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 1000 & aa.con > 100))
second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 400 & aa.con > 50))
second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 200 & aa.con > 30))

aa_p = ggplot(anc_info_w_root, aes(x=aa.div, y=aa.con)) +
  geom_point(size=2, alpha=0.3, color="#333333") +
  geom_point(data=second_cor, aes(x=aa.div, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
  geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
  geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
  xlab("Divergent AA substitutions") +
  ylab("Convergent AA substitutions") +
  bartheme()

print(aa_p)

3.1 Divergent vs parallel substitutions

No secondary correlation between divergent and parallel (same ancestral state) substitutions, though those pairs seem to cluster on the lower end of parallel substitutions.

second_cor_par = subset(anc_info_w_root, aa.con > 200)
#second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 3000 & aa.con > 200))
second_cor_par = rbind(second_cor_par, subset(anc_info_w_root, aa.div < 1000 & aa.con > 100))
second_cor_par = rbind(second_cor_par, subset(anc_info_w_root, aa.div < 400 & aa.con > 50))
second_cor_par = rbind(second_cor_par, subset(anc_info_w_root, aa.div < 200 & aa.con > 30))

aa_p = ggplot(anc_info_w_root, aes(x=aa.div, y=aa.par)) +
  geom_point(size=2, alpha=0.3, color="#333333") +
  geom_point(data=second_cor_par, aes(x=aa.div, y=aa.par), size=2, alpha=0.3, color=corecol(numcol=1)) +
  geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
  #geom_smooth(data=second_cor_par, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
  xlab("Divergent AA substitutions") +
  ylab("Parallel AA substitutions") +
  bartheme()

print(aa_p)

second_cor_br = select(second_cor, node1, node2)
second_cor_br_counts = data.frame(table(unlist(second_cor_br)))
names(second_cor_br_counts) = c("tp", "conv.pair.count")

#second_cor_p = ggplot(second_cor_br_counts, aes(x=tp, y=conv.pair.count)) + 
#  geom_bar(stat="identity") +
#  bartheme()
#print(second_cor_p)

uniq_info_cols = names(second_cor_br_counts)[!(names(second_cor_br_counts) %in% names(tree_info))] 
# # Get non common names
 
uniq_info_cols = c(uniq_info_cols, "tp") # appending key parameter
# Get a list of columns from the tree_info df to join to the tree rates df
 
tree_info = tree_info %>% left_join((second_cor_br_counts %>% select(uniq_info_cols)), by="tp")
# Select the columns from tree_info and join to tree_rates, merging by clade
# https://stackoverflow.com/a/61628157

tree_info = tree_info[order(tree_info$node), ]
# Re-order the rows by the R node

tree_info$conv.pair.count[is.na(tree_info$conv.pair.count)] = 0

3.2 Branches involved in secondary correlation

Remember that each point above is a count between a PAIR of branches. This tree displays the number of times each branch in the tree is involved in one of the pairings in the secondary correlation (orange points).

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors

conv_tree = ggtree(tree, size=0.8, ladderize=F, aes(color=tree_info$conv.pair.count)) +
  scale_color_continuous(name='Times in secondary pair', low=l, high=h) +
  xlim(0, xmax) +
  geom_tiplab(color="#333333", fontface='italic', size=2) +
  geom_label(aes(x=branch, label=ifelse(tree_info$conv.pair.count>0,as.character(tree_info$conv.pair.count),'')), label.size=NA, fill="transparent") +
  theme(legend.position=c(0.15,0.9))
print(conv_tree)

if(save_tree_fig){
  gcf_tree = gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
  tree_outfile = paste("../../data/trees/", tree_type, "-gcf-tree-PARED-gcf50.pdf", sep="")
  ggsave(tree_outfile, gcf_tree, width=8, height=16, unit="in")
}
# conv tree

3.3 Secondary correlation and branch length

No correlation between branch length and the number of times a branch is on this secondary correlation.

Note: ALL branches plotted, but still no correlation if only those with at least 1 pairing in the secondary correlation are used

secondary_tree_info = subset(tree_info, conv.pair.count > 0)

conv_bl_p = ggplot(tree_info, aes(x=branch.length, y=conv.pair.count)) +
  geom_point(size=2, alpha=0.3, color=corecol(numcol=1)) +
  xlab("Branch length") +
  ylab("# of times in second correlation") +
  bartheme()
print(conv_bl_p)

3.4 Secondary correlation and number of loci branch present in

No correlation between branch presence and the number of times a branch is on this secondary correlation.

Note: ALL branches plotted, but still no correlation if only those with at least 1 pairing in the secondary correlation are used

branch_presence = read.csv(branch_count_file, header=T)
names(branch_presence)[1] = "tp"

uniq_info_cols = names(branch_presence)[!(names(branch_presence) %in% names(tree_info))] 
# # Get non common names
 
uniq_info_cols = c(uniq_info_cols, "tp") # appending key parameter
# Get a list of columns from the tree_info df to join to the tree rates df
 
tree_info = tree_info %>% left_join((branch_presence %>% select(uniq_info_cols)), by="tp")
# Select the columns from tree_info and join to tree_rates, merging by clade
# https://stackoverflow.com/a/61628157

tree_info = tree_info[order(tree_info$node), ]
# Re-order the rows by the R node


secondary_tree_info = subset(tree_info, conv.pair.count > 0)

conv_bp_p = ggplot(tree_info, aes(x=num.genes.full+num.genes.partial, y=conv.pair.count)) +
  geom_point(size=2, alpha=0.3, color=corecol(numcol=1)) +
  xlab("# of times branch in gene") +
  ylab("# of times in second correlation") +
  bartheme()
print(conv_bp_p)

3.5 Convergent substitutions and number of time branch pairs are present in gene trees

#secondary_tree_info = subset(tree_info, conv.pair.count > 0)

conv_pairs_p = ggplot(anc_info_w_root, aes(x=count, y=aa.con)) +
  geom_point(size=2, alpha=0.3, color="#333333") +
  geom_point(data=second_cor, aes(x=count, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
  #geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
  #geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
  xlab("# of times branches appear together") +
  ylab("Convergent AA substitutions") +
  bartheme()

print(conv_pairs_p)

3.6 Excluding convergent AA substitutions that are divergent at the codon level

This pattern, and actually most convergent amino acid substitutions, are actually driven by divergent substitutions at the codon level:

anc_info_w_root = read.csv(anc_file, header=T, comment.char="#")
# Read the pairwise counts

cols_to_na = names(anc_info_w_root)[3:80]
for(col in cols_to_na){
  anc_info_w_root[anc_info_w_root$node1 %in% exclude_branches,][[col]] = NA
  anc_info_w_root[anc_info_w_root$node2 %in% exclude_branches,][[col]] = NA
}
# This loop changes all the counts for the branches descending from the root and the outgroup to NA to exclude them

anc_info_w_root = anc_info_w_root %>% mutate(aa.div = rowSums(select(., ends_with(".div.div")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(aa.par = rowSums(select(., ends_with(".par.par")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(aa.con = rowSums(select(., ends_with(".con.con")), na.rm=T))

#second_cor = subset(anc_info_w_root, aa.con > 200)

aa_p = ggplot(anc_info_w_root, aes(x=aa.div, y=aa.con)) +
  geom_point(size=2, alpha=0.3, color="#333333") +
  #geom_point(data=second_cor, aes(x=aa.div, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
  geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
  #geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
  xlab("Divergent AA substitutions") +
  ylab("Convergent AA substitutions") +
  bartheme()

print(aa_p)

3.7 Including only convergent amino acid substitutions that are divergent at the codon level

anc_info_w_root = read.csv(anc_file, header=T, comment.char="#")
# Read the pairwise counts

cols_to_na = names(anc_info_w_root)[3:80]
for(col in cols_to_na){
  anc_info_w_root[anc_info_w_root$node1 %in% exclude_branches,][[col]] = NA
  anc_info_w_root[anc_info_w_root$node2 %in% exclude_branches,][[col]] = NA
}
# This loop changes all the counts for the branches descending from the root and the outgroup to NA to exclude them

anc_info_w_root = anc_info_w_root %>% mutate(aa.div = rowSums(select(., ends_with(".div.div")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(aa.con = rowSums(select(., ends_with(".div.con")), na.rm=T))

#second_cor = subset(anc_info_w_root, aa.con > 200)

aa_p = ggplot(anc_info_w_root, aes(x=aa.div, y=aa.con)) +
  geom_point(size=2, alpha=0.3, color="#333333") +
  #geom_point(data=second_cor, aes(x=aa.div, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
  geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
  #geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
  xlab("Divergent AA substitutions") +
  ylab("Convergent AA substitutions") +
  bartheme()

print(aa_p)

4 Pairwise amino acid convergence without descendant branches

Excluding descendant branches removes the secondary correlation

anc_file = "../../data/rates/full-coding-astral-pairwise-convergence-nodesc.csv"
anc_info_w_root = read.csv(anc_file, header=T, comment.char="#")
# Read the pairwise counts

cols_to_na = names(anc_info_w_root)[3:80]
for(col in cols_to_na){
  anc_info_w_root[anc_info_w_root$node1 %in% exclude_branches,][[col]] = NA
  anc_info_w_root[anc_info_w_root$node2 %in% exclude_branches,][[col]] = NA
}
# This loop changes all the counts for the branches descending from the root and the outgroup to NA to exclude them

anc_info_w_root = anc_info_w_root %>% mutate(aa.div = rowSums(select(., ends_with(".div")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(aa.par = rowSums(select(., ends_with(".par")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(aa.con = rowSums(select(., ends_with(".con")), na.rm=T))

second_cor = subset(anc_info_w_root, aa.con > 200)
#second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 3000 & aa.con > 200))
second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 1000 & aa.con > 100))
second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 400 & aa.con > 50))
second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 200 & aa.con > 30))

aa_p = ggplot(anc_info_w_root, aes(x=aa.div, y=aa.con)) +
  geom_point(size=2, alpha=0.3, color="#333333") +
  geom_point(data=second_cor, aes(x=aa.div, y=aa.con), size=2, alpha=0.3, color=corecol(numcol=1)) +
  geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
  geom_smooth(data=second_cor, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
  xlab("Divergent AA substitutions") +
  ylab("Convergent AA substitutions") +
  bartheme()

print(aa_p)

4.1 Divergent vs parallel substitutions

second_cor_par = subset(anc_info_w_root, aa.con > 200)
#second_cor = rbind(second_cor, subset(anc_info_w_root, aa.div < 3000 & aa.con > 200))
second_cor_par = rbind(second_cor_par, subset(anc_info_w_root, aa.div < 1000 & aa.con > 100))
second_cor_par = rbind(second_cor_par, subset(anc_info_w_root, aa.div < 400 & aa.con > 50))
second_cor_par = rbind(second_cor_par, subset(anc_info_w_root, aa.div < 200 & aa.con > 30))

aa_p = ggplot(anc_info_w_root, aes(x=aa.div, y=aa.par)) +
  geom_point(size=2, alpha=0.3, color="#333333") +
  geom_point(data=second_cor_par, aes(x=aa.div, y=aa.par), size=2, alpha=0.3, color=corecol(numcol=1)) +
  geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
  #geom_smooth(data=second_cor_par, aes(x=aa.div, y=aa.con), method="lm", size=2, linetype="dashed", color="#920000") +
  xlab("Divergent AA substitutions") +
  ylab("Parallel AA substitutions") +
  bartheme()

print(aa_p)

4.2 Amino acid convergence on the phylogeny

conv_counts = data.frame("tp"=tree_info$tp)
conv_counts$aa.con.sum = 0
conv_counts$aa.div.sum = 0

for(i in 1:nrow(anc_info_w_root)){
  row = anc_info_w_root[i,]
  n1 = row$node1
  n2 = row$node2
  conv_counts[conv_counts$tp==n1,]$aa.con.sum = conv_counts[conv_counts$tp==n1,]$aa.con.sum + row$aa.con
  conv_counts[conv_counts$tp==n1,]$aa.div.sum = conv_counts[conv_counts$tp==n1,]$aa.div.sum + row$aa.div
  
  conv_counts[conv_counts$tp==n2,]$aa.con.sum = conv_counts[conv_counts$tp==n2,]$aa.con.sum + row$aa.con
  conv_counts[conv_counts$tp==n2,]$aa.div.sum = conv_counts[conv_counts$tp==n2,]$aa.div.sum + row$aa.div
}

tree_info = merge(tree_info, conv_counts, by="tp")

tree_info = tree_info[order(tree_info$node), ]
# Re-order the rows by the R node

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors

tree_info$cd.ratio = tree_info$aa.con.sum / tree_info$aa.div.sum
tree_info[is.nan(tree_info$cd.ratio),]$cd.ratio = NA

conv_tree = ggtree(tree, size=0.8, ladderize=F, aes(color=tree_info$cd.ratio)) +
  scale_color_continuous(name='# convergent subs / # divergent subs', low=l, high=h) +
  xlim(0, xmax) +
  geom_tiplab(color="#333333", fontface='italic', size=2) +
  #geom_label(aes(x=branch, label=ifelse(tree_info$conv.pair.count>0,as.character(tree_info$conv.pair.count),'')), label.size=NA, fill="transparent") +
  theme(legend.position=c(0.15,0.9))
print(conv_tree)

if(save_tree_fig){
  gcf_tree = gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
  tree_outfile = paste("../../data/trees/", tree_type, "-gcf-tree-PARED-gcf50.pdf", sep="")
  ggsave(tree_outfile, gcf_tree, width=8, height=16, unit="in")
}
# conv tree

5 Codon convergence

5.1 Non-synonymous codon convergence

anc_info_w_root = anc_info_w_root %>% mutate(syn.con = rowSums(select(., starts_with("s.s.con")), na.rm=T))
#anc_info_w_root = anc_info_w_root %>% mutate(non.con = rowSums(select(., starts_with("n.n.con") | starts_with("s.n.con")), na.rm=T))
#anc_info_w_root = anc_info_w_root %>% mutate(non.div = rowSums(select(., starts_with("n.n.div") | starts_with("s.n.div")), na.rm=T))

anc_info_w_root = anc_info_w_root %>% mutate(non.con = rowSums(select(., starts_with("n.n.con")), na.rm=T))
anc_info_w_root = anc_info_w_root %>% mutate(non.div = rowSums(select(., starts_with("n.n.div")), na.rm=T))

ns_codon_p = ggplot(anc_info_w_root, aes(x=non.div, y=non.con)) +
  geom_point(size=2, alpha=0.3, color="#333333") +
  geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
  xlab("Divergent non-synonymous codon substitutions") +
  ylab("Convergent non-synonymous codon substitutions") +
  bartheme()

print(ns_codon_p)

5.2 Non-synonymous codon convergence / synonymous codon convergence

For each pair, counts where both convergent substitutions are non-synonymous vs. both are synonymous

cncs_set = subset(anc_info_w_root, syn.con > 4)

cncs_set$cncs = cncs_set$non.con / cncs_set$syn.con
#cncs_set$cncs[is.na(cncs_set$cncs)] = NA
#cncs_set$cncs[is.infinite(cncs_set$cncs)] = NA

cncs_p = ggplot(cncs_set, aes(x=cncs)) +
  geom_histogram(fill=corecol(numcol=1, offset=1), color="#ececec") +
  scale_x_continuous(expand=c(0,0)) +
  scale_y_continuous(expand=c(0,0)) +
  xlab("cN/cS") +
  ylab("Branch pairs") +
  bartheme()

print(cncs_p)

cat("Avg. cN/cS: ", mean(cncs_set$cncs, na.rm=T), "\n")

## Avg. cN/cS:  0.2915383

cat("Median cN/cS: ", median(cncs_set$cncs, na.rm=T), "\n")

## Median cN/cS:  0.2391304

cncs_subset = select(cncs_set, node1, node2, non.con, syn.con, cncs)
cncs_subset = cncs_subset[order(-cncs_subset$cncs), ]

head(cncs_subset, n=20)

cncs_95_perc = quantile(cncs_set$cncs, 0.95, na.rm=T)

cat("cN/cS 95th percentile: ", cncs_95_perc, "\n")

## cN/cS 95th percentile:  0.7777778

5.3 Synonymous vs. non-synonymous codon convergence

Y = above 95th percentile of cN/cS

N = below 95th percentile of cN/cS

cncs_subset$cncs.95 = ifelse(cncs_subset$cncs > cncs_95_perc, "Y", "N")

s_ns_codon_p = ggplot(cncs_subset, aes(x=syn.con, y=non.con, color=cncs.95)) +
  geom_point(size=2, alpha=0.3) +
  geom_smooth(method="lm", size=2, linetype="dashed", color="#920000") +
  xlab("Convergent synonymous codon substitutions") +
  ylab("Convergent non-synonymous\ncodon substitutions") +
  bartheme()

print(s_ns_codon_p)

cs_95_perc = quantile(anc_info$cS, 0.95, na.rm=T)
cn_95_perc = quantile(anc_info$cN, 0.95, na.rm=T)

p = ggplot(anc_info, aes(x=cS, y=cN, color=node.type)) +
  geom_point(size=2, alpha=0.2) +
  geom_text_repel(aes(label=ifelse(cS>cs_95_perc | cN>cn_95_perc, as.character(node), '')), show_guide=F) +
  #(aes(label=ifelse(avg.dN>dn_95_perc, as.character(node), '')), show_guide=F) +
  geom_vline(xintercept=cs_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
  geom_hline(yintercept=cn_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
  #geom_smooth(method="lm", se=F, ) +
  xlab("cS per branch") + 
  ylab("cN per branch") + 
  bartheme() +
  theme(legend.position="bottom") +
  guides(colour = guide_legend(override.aes = list(alpha = 1)))
p = ggExtra::ggMarginal(p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=1), color="#666666")
print(p)

anc_info$cs.outlier = ifelse(anc_info$cS>cs_95_perc,anc_info$node,'')
anc_info$cn.outlier = ifelse(anc_info$cN>cn_95_perc,anc_info$node,'')

# Synonymous and non-synonymous convergent substitutions

#This results in the following distributions for number of convergent substitutions across branches (green lines indicating 95th percentile of each rate):
  
  
cs_95_perc = quantile(anc_info$cS, 0.95, na.rm=T)
cn_95_perc = quantile(anc_info$cN, 0.95, na.rm=T)

p = ggplot(anc_info, aes(x=cS, y=cN, color=node.type)) +
  geom_point(size=2, alpha=0.2) +
  geom_text_repel(aes(label=ifelse(cS>cs_95_perc | cN>cn_95_perc, as.character(node), '')), show_guide=F) +
  #(aes(label=ifelse(avg.dN>dn_95_perc, as.character(node), '')), show_guide=F) +
  geom_vline(xintercept=cs_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
  geom_hline(yintercept=cn_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
  #geom_smooth(method="lm", se=F, ) +
  xlab("cS per branch") + 
  ylab("cN per branch") + 
  bartheme() +
  theme(legend.position="bottom") +
  guides(colour = guide_legend(override.aes = list(alpha = 1)))
p = ggExtra::ggMarginal(p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=1), color="#666666")
print(p)

anc_info$cs.outlier = ifelse(anc_info$cS>cs_95_perc,anc_info$node,'')
anc_info$cn.outlier = ifelse(anc_info$cN>cn_95_perc,anc_info$node,'')

# Species tree with branches colored # of convergent substitutions to any other branch

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors

c_tree = ggtree(tree, size=0.8, ladderize=F, aes(color=anc_info_w_root$total.c)) +
  scale_color_continuous(name='Convergent subs', low=l, high=h) +
  xlim(0, xmax) +
  geom_tiplab(color="#333333", fontface='italic', size=2) +
  theme(legend.position=c(0.05,0.9))
print(c_tree)

# Ratio of non-synyonymous to synonymous convergent substitutions (cN/cS)

cncs_p = ggplot(anc_info, aes(x=cn.cs)) +
  geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1), color="#666666") +
  scale_y_continuous(expand=c(0,0)) +
  xlab("cN/cS") +
  ylab("# of branches") +
  bartheme()
print(cncs_p)
# Distribution of dN/dS when using gene trees

cncs_95_perc = quantile(anc_info$cn.cs, 0.95, na.rm=T)
anc_info$cncs.outlier = ifelse(anc_info$cn.cs>cncs_95_perc,anc_info$node,'')

# Species tree with branches colored by cN/cS

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors

cncs_tree = ggtree(tree, size=0.8, ladderize=F, aes(color=anc_info_w_root$cn.cs)) +
  scale_color_continuous(name='cN/cS', low=l, high=h) +
  xlim(0, xmax) +
  geom_tiplab(color="#333333", fontface='italic', size=2) +
  theme(legend.position=c(0.05,0.9))
print(cncs_tree)

# Pairwise convergent substitutions

# Convergent and divergent substitutions between every pair of branches in the tree


pairwise_data = read.csv("../../data/rates/")

pairwise_data$convergent = pairwise_data$con.syn + pairwise_data$con.one.each + pairwise_data$con.non.syn
pairwise_data$divergent = pairwise_data$div.syn + pairwise_data$div.one.each + pairwise_data$div.non.syn

pairwise_p = ggplot(pairwise_data, aes(x=divergent, y=convergent)) +
  geom_point(size=2, alpha=0.3, color="#920000") +
  geom_smooth(method="lm", se=F, size=2, linetype="dashed", color="#333333") +
  bartheme()
print(pairwise_p)

< Back to summary