Murine species trees & rates
::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
knitr
library(ggplot2)
library(cowplot)
library(ggbeeswarm)
library(dplyr)
library(kableExtra)
library(tidyr)
library(ggtree)
library(phytools)
library(phangorn)
library(reshape2)
library(ggExtra)
library(ggrepel)
library(vroom)
library(ggdist)
library(stringr)
library(ggsignif)
source("C:/bin/core/r/design.r")
source("C:/bin/core/r/get_tree_info.r")
#source("C:/Users/grt814/bin/core/corelib/design.r")
#source("C:/Users/grt814/bin/core/get_tree_info.r")
#htmltools::includeHTML("../html-chunks/rmd_nav.html")
1 Full coding species tree
# This chunk handles all of the main inputs and reads the tree
#tree_type = "astral"
= F
save_tree_fig # Meta options. Comment tree_type when running form generator
cat(tree_type, " tree\n")
## astral tree
cat("188 species targeted capture to mouse exons assembled with SPADES.\n")
## 188 species targeted capture to mouse exons assembled with SPADES.
cat("11,775 coding loci aligned with exonerate+mafft\n")
## 11,775 coding loci aligned with exonerate+mafft
cat("Gene trees inferred with IQtree.\n")
## Gene trees inferred with IQtree.
# Data summary
if(tree_type == "astral"){
cat("Species tree inferred with ASTRAL.\n")
cat("Branch lengths estimated by ASTRAL.\n")
= "../../data/trees/full_coding_iqtree_astral.cf.rooted.labeled.tree"
tree_file = "../../data/trees/full_coding_iqtree_astral.cf.branch.rooted"
iq_tree_labels = "../../data/trees/full_coding_iqtree_astral.cf.stat"
cf_stat_file = "../../data/trees/astral-cf-reps/"
cf_rep_dir = "../../data/trees/astral-delta.tab"
delta_outfile = "../../data/rates/full-coding-astral-slac-branch-rates.csv"
branch_rates_file = "../../data/rates/full-coding-astral-slac-branch-rates-arid.csv"
branch_rates_file_arid = "../../data/rates/full-coding-astral-slac-branch-rates-morphofacial.csv"
branch_rates_file_mf = "../../data/trees/astral-colonization-branches.txt"
col_file = "../../data/trees/astral-moprho-ou-shift-branches.txt"
morpho_file = c("Lophiomys_imhausi_UM5152", "Lophuromys_woosnami_LSUMZ37793", "<1>", "<187>", "<186>")
exclude_branches = 31
xmax else if(tree_type == "concat"){
}cat("Species tree inferred by concatenation of all loci with IQtree.\n")
= "../../data/trees/full_coding_iqtree_concat.cf.rooted.tree"
tree_file = "../../data/trees/full_coding_iqtree_concat.cf.branch.rooted"
iq_tree_labels = "../../data/trees/full_coding_iqtree_concat.cf.stat"
cf_stat_file = "../../data/trees/concat-cf-reps/"
cf_rep_dir = "../../data/trees/concat-delta.tab"
delta_outfile = "../../data/rates/full-coding-concat-slac-branch-rates.csv"
branch_rates_file = ""
col_file = c("Lophiomys_imhausi_UM5152", "Lophuromys_woosnami_LSUMZ37793", "<1>", "<187>", "<186>")
exclude_branches = 0.125
xmax }
## Species tree inferred with ASTRAL.
## Branch lengths estimated by ASTRAL.
# Tree type specific info, options, and files
= "../../data/trees/loci-labeled.treefile"
gt_file # The file containing the gene trees
= "../../data/rates/full-coding-slac.csv.gz"
gene_rates_file # File with rates calculated per gene
= read.tree(tree_file)
rodent_tree = treeToDF(rodent_tree)
tree_to_df_list = tree_to_df_list[["info"]]
tree_info # Read the tree and parse with treetoDF
if(tree_type == "astral"){
= tree_info %>% separate(label, c("tp", "astral", "gcf", "scf"), sep="/", remove=F)
tree_info # Split the label by /. tp is my treeParse label.
$astral[tree_info$node.type=="tip"] = NA
tree_info# Fill in ASTRAL support as NA for the tips
$astral = as.numeric(tree_info$astral)
tree_info$gcf = as.numeric(tree_info$gcf)
tree_info$scf = as.numeric(tree_info$scf)
tree_info# Convert all supports to numeric
else if(tree_type == "concat"){
}= tree_info %>% separate(label, c("tp", "bootstrap"), sep="/", remove=F)
tree_info # Split the label by /. tp is my treeParse label.
$bootstrap[tree_info$node.type=="tip"] = NA
tree_info# Fill in bootstrap support as NA for the tips
$bootstrap = as.numeric(tree_info$bootstrap)
tree_info$gcf = as.numeric(tree_info$gcf)
tree_info$scf = as.numeric(tree_info$scf)
tree_info# Convert all supports to numeric
}
= read.table(cf_stat_file, header=T)
cf_stats # Read the concordance factor table
# The node/branch labels in R and IQtree differ. IQtree uses a nice, logical post-ordering
# of internal nodes while R does something random and assigns labels to tips as well. This
# chunk matches those up for the delta analysis later
= read.tree(iq_tree_labels)
iq_tree = treeToDF(iq_tree)
iqtree_to_df_list = iqtree_to_df_list[["info"]]
iqtree_info # Read the IQ tree tree with branch labels in and parse with get_tree_info
#node_convert = matchNodes(tree_to_df_list[["labeled.tree"]], iqtree_to_df_list[["labeled.tree"]], method="descendants")
$iqtree.node = NA
tree_info# Add a column to the main tree table about IQ tree labels
for(i in 1:nrow(tree_info)){
= tree_info[i,]$node
cur_node = subset(iqtree_info, node==cur_node)
iqtree_row = iqtree_row$label
iqtree_label $iqtree.node = iqtree_label
tree_info[i,]
}# For every row in the main tree table, add in the IQ tree node label given that
# we've read the same tree in R and can use the node.labels
1.1 Branches colored by gCF
= corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l # Colors
= ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$gcf)) +
gcf_tree scale_color_continuous(name='gCF', low=l, high=h, limits=c(0,100)) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9))
print(gcf_tree)
if(save_tree_fig){
= gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
gcf_tree = paste("../../data/trees/", tree_type, "-gcf-tree-PARED-gcf50.pdf", sep="")
tree_outfile ggsave(tree_outfile, gcf_tree, width=8, height=16, unit="in")
}# gCF tree
1.2 Branches colored by sCF
= corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l # Colors
= ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$scf)) +
scf_tree scale_color_continuous(name='sCF', low=l, high=h, limits=c(0,100)) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9))
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(scf_tree)
#ggsave("../data/trees/scf-tree.pdf", scf_tree, width=8, height=16, unit="in")
# sCF tree
1.3 Gene vs site concordance factors colored by branch support
= corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l # Colors
if(tree_type == "astral"){
= ggplot(tree_info, aes(x=gcf, y=scf, color=astral)) +
p geom_point(size=2, alpha=0.5) +
scale_color_continuous(name='Astral support', low=l, high=h, limits=c(0.8,1))
else if(tree_type == "concat"){
}= ggplot(tree_info, aes(x=gcf, y=scf, color=bootstrap)) +
p geom_point(size=2, alpha=0.5) +
scale_color_continuous(name='Bootstrap', low=l, high=h, limits=c(0,100))
}= p + bartheme() +
p theme(legend.title=element_text(size=12))
print(p)
1.4 Concordance factors vs. branch lengths
= ggplot(tree_info, aes(x=branch.length, y=gcf)) +
bl_gcf_p geom_point(size=3, alpha=0.5) +
#geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
#geom_smooth(method="lm", se=F, ) +
xlab("Branch length") +
ylab("gCF per branch") +
bartheme()
if(tree_type=="concat"){
= bl_gcf_p + scale_x_continuous(limits=c(0,0.035))
bl_gcf_p
}#theme(legend.position="bottom") +
#guides(colour = guide_legend(override.aes = list(alpha = 1)))
= ggExtra::ggMarginal(bl_gcf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666")
bl_gcf_p
= ggplot(tree_info, aes(x=branch.length, y=scf)) +
bl_scf_p geom_point(size=3, alpha=0.5) +
#geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
#geom_smooth(method="lm", se=F, ) +
xlab("Branch length") +
ylab("sCF per branch") +
bartheme()
if(tree_type=="concat"){
= bl_scf_p + scale_x_continuous(limits=c(0,0.035))
bl_scf_p
}#theme(legend.position="bottom") +
#guides(colour = guide_legend(override.aes = list(alpha = 1)))
= ggExtra::ggMarginal(bl_scf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=5), color="#666666")
bl_scf_p
= plot_grid(bl_gcf_p, bl_scf_p, ncol=2)
p print(p)
2 Gene trees
The 11,774 gene trees contain varying numbers of taxa due to filtering on the alignment and filtering by IQtree for identical sequences.
2.1 Taxa per gene tree
= read.table(gt_file, sep="\t")
gene_trees names(gene_trees) = c("gene", "tree")
# Read the geen tree file
$gene = word(gene_trees$gene, 1, sep="-")
gene_trees$gene = word(gene_trees$gene, 2, sep="/")
gene_trees# Parse out the protein ID from the filename for each gene
= data.frame("gene"=c(), "rf"=c(), "num.tips"=c(), "rf.zeros"=c(), "num.tips.zeros"=c())
gt_data # A df to track info for each gene tree
for(i in 1:nrow(gene_trees)){
= read.tree(text=gene_trees[i,]$tree)
gt # Read the gene tree as a phylo object
= gt$tip.label
cur_tips = drop.tip(rodent_tree, rodent_tree$tip.label[-match(cur_tips, rodent_tree$tip.label)])
pruned_tree # Get a list of the tips and prune the species tree to contain only those tips
= RF.dist(pruned_tree, gt, normalize=T)
cur_rf # Calculate RF between the pruned species tree and the gene tree
if(cur_rf==0){
= rbind(gt_data, data.frame("gene"=gene_trees[i,]$gene, "rf"=cur_rf, "num.tips"=length(cur_tips), "rf.zeros"=cur_rf, "num.tips.zeros"=length(cur_tips)))
gt_data else{
}= rbind(gt_data, data.frame("gene"=gene_trees[i,]$gene, "rf"=cur_rf, "num.tips"=length(cur_tips), "rf.zeros"=NA, "num.tips.zeros"=NA))
gt_data
}# Add the gene tree info to the df, with a special case for when the RF is 0
}# Read all the gene trees
= ggplot(gt_data, aes(x=num.tips)) +
p geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1, offset=6), color="#666666") +
#geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
#geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
scale_y_continuous(expand=c(0,0)) +
xlab("# of taxa") +
ylab("# of gene trees") +
bartheme()
print(p)
#write.table(subset(gt_data, rf<=0.25), file="../../data/trees/loci-nrf-below-0.25.txt", row.names=F, quote=F, sep="\t")
# Write info for genes with low RF
2.2 Tree distance (RF) between gene trees and species tree
Since RF cannot handle missing taxa, the species tree is pruned for each gene tree to calculate Robinson-Foulds distance. We use the normalized metric since there are varying numbers of tips per gene tree.
= ggplot(gt_data, aes(x=rf)) +
p geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666") +
#geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
#geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
scale_y_continuous(expand=c(0,0)) +
xlab("Normalized RF") +
ylab("# of gene trees") +
bartheme()
print(p)
3 Delta
3.1 Introduction
For each lineage in the species tree with gCF < 95% we calculated the \(\Delta\) statistic (Huson et al. 2005). This statistic follows the same logic as the ABBA-BABA site patterns used to calculate D-statistics, but uses gene tree topologies instead of alignment sites. Briefly, a given branch in an unrooted tree is defined by a quartet of species groupings with two possible discordant topologies, \(D_1\) and \(D_2\) (see Figure 1 from Minh et al. 2020). Under assumptions that discordance is caused by ILS, both discordant topologies should be present in equal proportions. However, if introgression has occurred one discordant topology will appear more frequently than the other. \(\Delta\) is calculated for a branch as follows, using the frequency of each discordant topology (Vanderpool et al. 2020):
\[\Delta = \frac{D_1 - D_2}{D_1 + D_2}\]
This normalized \(\Delta\) calculation ensures that all values are scaled between 0 and 1, with larger values indicating a larger skew towards one topology, and a higher chance that introgression has occurred.
To test whether the observed \(\Delta\) values are skewed significantly from 0 to imply introgression, we performed concordance factor analysis on 1,000 bootstrap replicates of our inferred gene trees to generate a null distribution of \(\Delta\) values. We then calculated Z-scores and p-values and assessed significance for each branch at a threshold of 0.01.
3.2 Null and actual distributions
Nodes with p < 0.01:
$delta = (abs(cf_stats$gDF1_N - cf_stats$gDF2_N)) / (cf_stats$gDF1_N + cf_stats$gDF2_N)
cf_stats# Calculate the delta values on the actual data
= select(cf_stats, ID, delta)
iqtree_delta names(iqtree_delta) = c("iqtree.node", "delta")
= merge(x = tree_info, y = iqtree_delta, by = "iqtree.node", all.x=TRUE)
tree_info # Adding the delta values to the main tree info table
= tree_info[order(tree_info$node), ]
tree_info # Re-sort the data frame by R node order after the merge so the trees still work
= subset(cf_stats, gCF < 95)
low_cf_nodes # Get the low concordance factor nodes from the data to test with delta
= c()
delta_null for(i in 0:999){
= as.character(i)
cur_rep_str #print(cur_rep_str)
while(nchar(cur_rep_str) < 4){
= paste("0", cur_rep_str, sep="")
cur_rep_str
}# Handling the string of the rep
= paste(cf_rep_dir, "rep", cur_rep_str, ".cf.stat", sep="")
cur_cf_file = read.table(cur_cf_file, header=T, fill=T)
cf_rep # Read the current reps cf file
$delta = (abs(cf_rep$gDF1_N - cf_rep$gDF2_N)) / (cf_rep$gDF1_N + cf_rep$gDF2_N)
cf_rep= c(delta_null, cf_rep$delta)
delta_null # Calculate delta for this rep and save values in vector
}# Read concordance factors from bootstrap samples of gene trees and calculate delta to generate
# null distribution
= data.frame("delta"=delta_null, y="duh")
delta_null_df = subset(delta_null_df, !is.nan(delta))
delta_null_df # Convert the delta values to a data frame for ggplot
= mean(delta_null_df$delta, na.rm=T)
delta_mu = sd(delta_null_df$delta, na.rm=T)
delta_sd # Calculate the mean and sd of the null distribution to get z-scores and p-values
= data.frame("node"=c(), "delta"=c(), "z-score"=c(), "p-value"=c())
delta_out # Initialize output data frame
$delta.sig = F
tree_info# Add a column to the tree info table about significant delta values
for(i in 1:nrow(low_cf_nodes)){
= low_cf_nodes[i,]
row = (row$delta - delta_mu) / delta_sd
z = pnorm(-abs(z))
p if(p < 0.01){
print(paste(row$ID, row$delta, z, p, sep=" "))
$delta.sig[tree_info$iqtree.node==row$ID] = T
tree_info# Set the delta significant to TRUE in the main tree info table
}= rbind(delta_out, data.frame("node"=row$ID, "delta"=row$delta, "z-score"=z, "p-value"=p))
delta_out }
## [1] "282 0.40548137737175 2.43334742938723 0.00747996932293452"
## [1] "286 0.410672853828306 2.47825884686394 0.00660126628112862"
## [1] "288 0.627906976744186 4.3575493466097 6.57634251391119e-06"
## [1] "304 0.407725321888412 2.45275977379941 0.0070882488601971"
## [1] "307 0.511338697878566 3.34911818439904 0.00040534603511502"
## [1] "312 0.41615356754799 2.52567245337739 0.00577385448888136"
## [1] "357 0.532062391681109 3.5283986784577 0.000209040982510575"
## [1] "359 0.465612648221344 2.95354252824397 0.00157074659668177"
# Calculate z and p for each low gCF node in the species tree and save to output data frame
write.table(file=delta_outfile, delta_out, sep="\t", row.names=F)
= ggplot(delta_null_df, aes(x=delta)) +
delta_null_p geom_histogram(color="#ececec", bins=50) +
scale_x_continuous(limits=c(0,1)) +
scale_y_continuous(expand=c(0,0)) +
xlab("Delta") +
ylab("# nodes") +
bartheme()
= ggplot(low_cf_nodes, aes(x=delta)) +
delta_actual_p geom_histogram(fill="#920000", color="#ececec") +
scale_x_continuous(limits=c(0,1)) +
scale_y_continuous(expand=c(0,0)) +
xlab("Delta") +
ylab("# nodes") +
bartheme()
= plot_grid(delta_null_p, delta_actual_p, ncol=2, labels=c("Null distribution", "Actual distribution"), label_size=12)
delta_p print(delta_p)
3.3 Branches with evidence for introgression
= corecol(numcol=1, pal="wilke", offset=1)
h = corecol(numcol=1, offset=1)
l
= ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$delta.sig)) +
intro_tree scale_color_manual(name="Significant Delta", labels=c("False", "True"), values=corecol(pal="trek", numcol=2)) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9)) +
geom_label(aes(x=branch, label=ifelse(tree_info$delta.sig,as.character(tree_info$tp),'')), label.size=NA, fill="transparent", show.legend=F, vjust=-0.1)
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(intro_tree)
3.4 Concordance factors and Delta
= corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
h # Colors
= ggplot(tree_info, aes(x=gcf, y=scf, color=delta)) +
p geom_point(size=2, alpha=0.5) +
geom_text_repel(aes(label=ifelse(delta.sig,as.character(tp),'')), show_guide=F, min.segment.length=0) +
scale_color_continuous(name='Delta', low=l, high=h) +
xlab("gCF") +
ylab("sCF") +
bartheme() +
theme(legend.title=element_text(size=12))
print(p)
4 Substitution rates by gene (mg94-local)
We ran each of the 11,775 coding loci through HyPhy’s standard MG94 fit with the -local
option to estimate a rate for each branch in the input tree.
For input trees we used the gene tree estimated from each individual alignment to reduce false inferences of substitutions that result from tree misspecification.
= vroom(gene_rates_file, comment="#")
rates # Use vroom to unzip and read the gene rates file
# NOTE: vroom does a bad job cleaning up huge tmp files, so be sure to check manually
$dn = rates$N / rates$EN
ratesis.nan(rates$dn),]$dn = NA
rates[$ds = rates$S / rates$ES
ratesis.nan(rates$ds),]$ds = NA
rates[$dn.ds = rates$dn / rates$ds
rates
= group_by(rates, file) %>% summarize(dn=mean(dn, na.rm=T), ds=mean(ds, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
gene_rates # Average rates for each branch by gene
$high.dn.ds = "N"
gene_rates$dn / gene_rates$ds > 1,]$high.dn.ds = "Y" gene_rates[gene_rates
= ggplot(subset(gene_rates, ds < 0.05), aes(x=ds, y=dn, color=high.dn.ds)) +
p geom_point(size=2, alpha=0.2) +
#geom_smooth(method="lm", se=F, ) +
xlab("Avg. dS per gene") +
ylab("Avg. dN per gene") +
scale_color_manual(name="dN/dS>1", values=c("#333333", corecol(numcol=1, offset=2))) +
bartheme() +
theme(legend.position="bottom",
legend.title = element_text(size=14))
= ggExtra::ggMarginal(p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1), color="#666666")
p print(p)
= ggplot(subset(gene_rates, ds < 0.05), aes(x=ds)) +
ds_p geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1), color="#666666") +
scale_y_continuous(expand=c(0,0)) +
xlab("dS") +
ylab("# of genes") +
bartheme()
print(ds_p)
# Distribution of dS when using gene trees
= ggplot(gene_rates, aes(x=dn)) +
dn_p geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1), color="#666666") +
scale_y_continuous(expand=c(0,0)) +
xlab("dN") +
ylab("# of genes") +
bartheme()
print(dn_p)
# Distribution of dN when using gene trees
= quantile(gene_rates$ds, 0.98)
ds_filter_level #ds_filter_level = 0.03
= subset(gene_rates, ds > ds_filter_level)$file
ds_filter # Get a list of genes to filter out in subsequent analyses based on dS
print(paste("Removing", length(ds_filter), "genes with dS above", ds_filter_level, "from subsequent analyses."))
## [1] "Removing 236 genes with dS above 0.0338467469553527 from subsequent analyses."
#write.csv(ds_filter, file="../../data/rates/full-coding-mg94-local-ds-filter-0.95quant.csv", row.names=F)
5 Calculation substitution rates per branch in the presence of gene tree discordance
5.1 Branch presence
Because we used the gene trees, in order to quantify rates on species tree branches we first needed to check whether branches in the species tree exist in a given gene tree.
This resulted in three categories for a species tree branch in a given gene tree:
- Full clade: All species in the clade that descends from the branch in the species tree are present as a monophyletic split in the gene tree.
- Parital clade: Not all species in the clade that descends from the branch in the species tree are present in the gene tree, but the ones that are present form a monophyletic split.
- Discordant/missing clade: Either the species in the clade that descends from the branch in the species tree do not form a monophyletic split in this gene tree (discordant) or ALL species in this clade are missing from this gene tree (missing)
Average rates are then calculated as (for dS, for example):
\[\frac{\sum_{i=1}^{n}\text{branch dS}_i}{n}\]
Where:
- \(n = \text{# full clade genes} + \text{# partial clade genes}\) for this branch
5.1.1 Species tree branch presence/absence per gene
= read.csv(branch_rates_file, header=T)
branch_rates # Read the branch rates data
names(branch_rates)[1] = "tp"
= names(branch_rates)[5:20]
cols_to_na for(col in cols_to_na){
$tp %in% exclude_branches,][[col]] = NA
branch_rates[branch_rates
}# For branches that we want to exclude for counting, convert
# columns with counts to NA
= names(tree_info)[!(names(tree_info) %in% names(branch_rates))] # get non common names
uniq_info_cols = c(uniq_info_cols,"tp") # appending key parameter
uniq_info_cols # Get a list of columns from the tree_info df to join to the tree rates df
= branch_rates %>% left_join((tree_info %>% select(uniq_info_cols)), by="tp")
branch_rates # Select the columns from tree_info and join to tree_rates, merging by clade
# https://stackoverflow.com/a/61628157
= branch_rates[order(branch_rates$node), ]
branch_rates # Re-order the rows by the R node
= select(branch_rates, clade, node.type, num.genes.full)
full_clade $label = "Full clade"
full_cladenames(full_clade)[3] = "num.genes"
= select(branch_rates, clade, node.type, num.genes.partial)
partial_clade $label = "Partial clade"
partial_cladenames(partial_clade)[3] = "num.genes"
= select(branch_rates, clade, node.type, num.genes.descendant.counted)
descendant_counted $label = "Descendant counted"
descendant_countednames(descendant_counted)[3] = "num.genes"
= select(branch_rates, clade, node.type, num.genes.discordant)
discordant_clade $label = "Discordant clade"
discordant_cladenames(discordant_clade)[3] = "num.genes"
= select(branch_rates, clade, node.type, num.genes.missing)
no_clade $label = "Missing clade"
no_cladenames(no_clade)[3] = "num.genes"
# Subset each clade count column to add a label
= rbind(full_clade, partial_clade, descendant_counted, discordant_clade, no_clade)
clade_counts # Convert branch categories to long format
$label = factor(clade_counts$label, levels=c("Full clade", "Partial clade", "Descendant counted", "Discordant clade", "Missing clade"))
clade_counts# Factorize the labels in order
= ggplot(clade_counts, aes(x=label, y=num.genes, group=label, color=node.type)) +
branch_counts geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
ylab("# of genes") +
xlab("Species tree\nbranch classification") +
bartheme() +
theme(axis.text.x = element_text(angle=25, hjust=1)) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(branch_counts)
5.1.2 Number of genes per branch
$num.genes.present = branch_rates$num.genes.full + branch_rates$num.genes.partial
branch_rates# Sum the two columns that indicate a clade is present in a gene
= ggplot(branch_rates, aes(x=num.genes.present)) +
presence_dist geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1, offset=1), color="#ececec") +
scale_y_continuous(expand=c(0,0)) +
xlab("Genes per branch") +
ylab("# of branches") +
bartheme()
print(presence_dist)
= data.frame("Stat"=c("Branch with most genes", "Branch with fewest genes", "Median genes per branch", "Mean genes per branch"),
t_data "Branch label"=c(branch_rates$tp[branch_rates$num.genes.present==max(branch_rates$num.genes.present, na.rm=T)][3], branch_rates$tp[branch_rates$num.genes.present==min(branch_rates$num.genes.present, na.rm=T)][6],"NA", "NA"),
"Num genes"=c(branch_rates$num.genes.present[branch_rates$num.genes.present==max(branch_rates$num.genes.present, na.rm=T)][3], branch_rates$num.genes.present[branch_rates$num.genes.present==min(branch_rates$num.genes.present, na.rm=T)][6],
median(branch_rates$num.genes.present, na.rm=T),
mean(branch_rates$num.genes.present, na.rm=T)
)
)
%>% kable(caption="Branch statistics") %>% kable_styling(bootstrap_options=c("striped", "condensed", "responsive"), full_width=F) t_data
Stat | Branch.label | Num.genes |
---|---|---|
Branch with most genes | mm10 | 11793.000 |
Branch with fewest genes | <31> | 233.000 |
Median genes per branch | NA | 7506.000 |
Mean genes per branch | NA | 6756.251 |
5.1.3 gCF and branch presence
These measures are highly correlated with gene concordance factors:
$clade.perc = (branch_rates$num.genes.full + branch_rates$num.genes.partial) / (branch_rates$num.genes.full + branch_rates$num.genes.partial + branch_rates$num.genes.descendant.counted + branch_rates$num.genes.discordant + branch_rates$num.genes.missing)
branch_rates
= ggplot(branch_rates, aes(x=gcf, y=clade.perc)) +
p geom_point(size=2, alpha=0.4, color="#333333") +
geom_smooth(method="lm", se=F, linetype="dashed", color="#920000") +
xlab("gCF") +
ylab("% of genes with clade present") +
bartheme() +
theme(legend.position="none")
print(p)
5.2 Calculating branch rates
To control for the fact that different branches will be present in different numbers of genes, we can calculate average rates per branch by summing the raw counts of the number of sites and the number of substitutions across all genes in which that branch is present as a full or partial clade.
For a given branch, \(b\):
\[dN_b = \frac{\sum_{g \in G_b}N_g}{\sum_{g \in G_b}EN_g}\]
and
\[dS_b = \frac{\sum_{g \in G_b}S_g}{\sum_{g \in G_b}ES_g}\]
where \(G_b\) is the set of genes that contain branch \(b\). Then,
\[\omega_b = \frac{dN_b}{dS_b}\]
6 Substitution rates per branch (all genes)
6.1 dN and dS by branch
This results in the following distributions for average rates across branches (green lines indicating 95th percentile of each rate):
= quantile(branch_rates$dS, 0.95, na.rm=T)
ds_95_perc = quantile(branch_rates$dN, 0.95, na.rm=T)
dn_95_perc
= ggplot(branch_rates, aes(x=dS, y=dN, color=node.type)) +
p geom_point(size=2, alpha=0.2) +
geom_text_repel(aes(label=ifelse(dS>ds_95_perc | dN>dn_95_perc, as.character(node), '')), show_guide=F) +
#(aes(label=ifelse(avg.dN>dn_95_perc, as.character(node), '')), show_guide=F) +
geom_vline(xintercept=ds_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
geom_hline(yintercept=dn_95_perc, linetype="dashed", color=corecol(numcol=1, offset=6)) +
#geom_smooth(method="lm", se=F, ) +
xlab("dS per branch") +
ylab("dN per branch") +
bartheme() +
theme(legend.position="bottom") +
guides(colour = guide_legend(override.aes = list(alpha = 1)))
= ggExtra::ggMarginal(p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=1), color="#666666")
p print(p)
$ds.outlier = ifelse(branch_rates$dS>ds_95_perc,branch_rates$tp,'')
branch_rates$dn.outlier = ifelse(branch_rates$dN>dn_95_perc,branch_rates$tp,'') branch_rates
6.2 dS tree
= corecol(numcol=1, pal="wilke", offset=1)
h = corecol(numcol=1, offset=1)
l
= ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=log(branch_rates$dS))) +
rate_tree scale_color_continuous(name='log dS', low=l, high=h) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9)) +
geom_label(aes(x=branch, label=ifelse(branch_rates$dS>ds_95_perc,as.character(node),'')), label.size=NA, fill="transparent")
#geom_text(aes(label=node), hjust=-.1, color="#006ddb")
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(rate_tree)
6.3 dN tree
= corecol(numcol=1, pal="wilke", offset=1)
h = corecol(numcol=1, offset=1)
l
= ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=log(branch_rates$dN))) +
rate_tree scale_color_continuous(name='log dN', low=l, high=h) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9)) +
geom_label(aes(x=branch, label=ifelse(branch_rates$dN > dn_95_perc, as.character(node) ,'')), label.size=NA, fill="transparent")
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(rate_tree)
6.4 dN/dS distribution
= ggplot(branch_rates, aes(x=dNdS)) +
dnds_p geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1), color="#666666") +
scale_y_continuous(expand=c(0,0)) +
xlab("dN/dS") +
ylab("# of branches") +
bartheme()
print(dnds_p)
# Distribution of dN/dS when using gene trees
= quantile(branch_rates$dNdS, 0.95, na.rm=T)
dnds_95_perc $dnds.outlier = ifelse(branch_rates$dNdS>dnds_95_perc,branch_rates$node,'') branch_rates
6.5 dN/dS tree
= corecol(numcol=1, pal="wilke", offset=1)
h = corecol(numcol=1, offset=1)
l
= ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=log(branch_rates$dNdS))) +
rate_tree scale_color_continuous(name='log dN/dS', low=l, high=h) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9)) +
geom_label(aes(x=branch, label=ifelse(branch_rates$dNdS>dnds_95_perc ,as.character(node), '')), label.size=NA, fill="transparent")
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(rate_tree)
6.6 Substitution rates and discordance
6.6.1 dS vs. concordance factors
Only branches with avg. dS < 0.05
= ggplot(subset(branch_rates, node.type!="ROOT" & dS < ds_95_perc), aes(x=dS, y=gcf)) +
ds_gcf_p geom_point(size=3, alpha=0.5) +
#geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
#geom_smooth(method="lm", se=F, ) +
xlab("dS per branch") +
ylab("gCF per branch") +
bartheme()
#theme(legend.position="bottom") +
#guides(colour = guide_legend(override.aes = list(alpha = 1)))
= ggExtra::ggMarginal(ds_gcf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666")
ds_gcf_p
= ggplot(subset(branch_rates, node.type!="ROOT" & dS < ds_95_perc), aes(x=dS, y=scf)) +
ds_scf_p geom_point(size=3, alpha=0.5) +
#geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
#geom_smooth(method="lm", se=F, ) +
xlab("dS per branch") +
ylab("sCF per branch") +
bartheme()
#theme(legend.position="bottom") +
#guides(colour = guide_legend(override.aes = list(alpha = 1)))
= ggExtra::ggMarginal(ds_scf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=5), color="#666666")
ds_scf_p
= plot_grid(ds_gcf_p, ds_scf_p, ncol=2)
p print(p)
6.6.2 dN vs. concordance factors
Only branches with avg. dN < 0.01
= ggplot(subset(branch_rates, node.type!="ROOT" & dN < dn_95_perc), aes(x=dN, y=gcf)) +
dn_gcf_p geom_point(size=3, alpha=0.5) +
#geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
#geom_smooth(method="lm", se=F, ) +
xlab("dN per branch") +
ylab("gCF per branch") +
bartheme()
#theme(legend.position="bottom") +
#guides(colour = guide_legend(override.aes = list(alpha = 1)))
= ggExtra::ggMarginal(dn_gcf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666")
dn_gcf_p
= ggplot(subset(branch_rates, node.type!="ROOT" & dN < dn_95_perc), aes(x=dN, y=scf)) +
dn_scf_p geom_point(size=3, alpha=0.5) +
#geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
#geom_smooth(method="lm", se=F, ) +
xlab("dN per branch") +
ylab("sCF per branch") +
bartheme()
#theme(legend.position="bottom") +
#guides(colour = guide_legend(override.aes = list(alpha = 1)))
= ggExtra::ggMarginal(dn_scf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=5), color="#666666")
dn_scf_p
= plot_grid(dn_gcf_p, dn_scf_p, ncol=2)
p print(p)
6.6.3 dN/dS vs. concordance factors
Only branches with avg. dN/dS < 0.5
= ggplot(subset(branch_rates, node.type!="ROOT" & dNdS < 0.5), aes(x=dNdS, y=gcf)) +
dnds_gcf_p geom_point(size=3, alpha=0.5) +
#geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
#geom_smooth(method="lm", se=F, ) +
xlab("dN/dS per branch") +
ylab("gCF per branch") +
bartheme()
#theme(legend.position="bottom") +
#guides(colour = guide_legend(override.aes = list(alpha = 1)))
= ggExtra::ggMarginal(dnds_gcf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666")
dnds_gcf_p
= ggplot(subset(branch_rates, node.type!="ROOT" & dNdS < 0.5), aes(x=dNdS, y=scf)) +
dnds_scf_p geom_point(size=3, alpha=0.5) +
#geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
#geom_smooth(method="lm", se=F, ) +
xlab("dN/dS per branch") +
ylab("sCF per branch") +
bartheme()
#theme(legend.position="bottom") +
#guides(colour = guide_legend(override.aes = list(alpha = 1)))
= ggExtra::ggMarginal(dnds_scf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=5), color="#666666")
dnds_scf_p
= plot_grid(dnds_gcf_p, dnds_scf_p, ncol=2)
p print(p)
6.7 Substitution rates and colonization branches (all genes)
= read.csv(col_file, header=F, comment.char="#")
col_branches names(col_branches) = c("tp")
$col.branch = "Other"
branch_rates
$col.branch[branch_rates$tp %in% col_branches$tp] = "Colonization"
branch_rates
#tree_info$col.desc.branch = "N"
for(i in 1:nrow(branch_rates)){
if(branch_rates[i,]$node.type == "internal" && branch_rates[i,]$col.branch == "Colonization"){
= getDescendants(rodent_tree, branch_rates[i,]$node)
cur_desc $col.branch[branch_rates$node==cur_desc[1]] = "Descendant"
branch_rates$col.branch[branch_rates$node==cur_desc[2]] = "Descendant"
branch_rates
} }
6.7.1 Tree with colonization branches labeled
= corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l # Colors
= ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=branch_rates$col.branch)) +
col_tree scale_color_manual(name='Branch partition', values=corecol(pal="wilke", numcol=3)) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.15,0.9))
print(col_tree)
if(save_tree_fig){
= gcf_tree + geom_text(aes(x=branch, label=ifelse(branch_rates$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
gcf_tree = paste("../../data/trees/", tree_type, "-gcf-tree-PARED-gcf50.pdf", sep="")
tree_outfile ggsave(tree_outfile, gcf_tree, width=8, height=16, unit="in")
}# Colonization branch tree
6.7.2 dS by colonization branch
#anc_info = subset(anc_info_w_root, node.type != "root")
= subset(select(branch_rates, clade, node.type, col.branch, dS), col.branch == "Colonization")
col_ds $label = "Colonization"
col_ds#names(full_clade)[3] = "num.genes"
= subset(select(branch_rates, clade, node.type, col.branch, dS), col.branch == "Descendant")
desc_ds $label = "Descendant"
desc_ds
= subset(select(branch_rates, clade, node.type, col.branch, dS), col.branch == "Other")
other_ds $label = "Other"
other_ds
= rbind(col_ds, desc_ds, other_ds)
ds_df # Convert branch categories to long format
$label = factor(ds_df$label, levels=c("Colonization", "Descendant", "Other"))
ds_df
= list(c("Colonization", "Descendant"), c("Colonization", "Other"), c("Descendant", "Other"))
x_comps
= ggplot(ds_df, aes(x=label, y=dS, group=label, color=node.type)) +
branch_ps_counts geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
ylab("dS") +
xlab("Species tree\nbranch partition") +
bartheme() +
theme(axis.text.x = element_text(angle=25, hjust=1)) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(branch_ps_counts)
6.7.3 dN by colonization branch
#anc_info = subset(anc_info_w_root, node.type != "root")
= subset(select(branch_rates, clade, node.type, col.branch, dN), col.branch == "Colonization")
col_dn $label = "Colonization"
col_dn#names(full_clade)[3] = "num.genes"
= subset(select(branch_rates, clade, node.type, col.branch, dN), col.branch == "Descendant")
desc_dn $label = "Descendant"
desc_dn
= subset(select(branch_rates, clade, node.type, col.branch, dN), col.branch == "Other")
other_dn $label = "Other"
other_dn
= rbind(col_dn, desc_dn, other_dn)
dn_df # Convert branch categories to long format
$label = factor(dn_df$label, levels=c("Colonization", "Descendant", "Other"))
dn_df
= list(c("Colonization", "Descendant"), c("Colonization", "Other"), c("Descendant", "Other"))
x_comps
= ggplot(dn_df, aes(x=label, y=dN, group=label, color=node.type)) +
branch_ps_counts geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
ylab("dN") +
xlab("Species tree\nbranch partition") +
bartheme() +
theme(axis.text.x = element_text(angle=25, hjust=1)) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(branch_ps_counts)
6.7.4 dN/dS by colonization branch
#anc_info = subset(anc_info_w_root, node.type != "root")
= subset(select(branch_rates, clade, node.type, col.branch, dNdS), col.branch == "Colonization")
col_dnds $label = "Colonization"
col_dnds#names(full_clade)[3] = "num.genes"
= subset(select(branch_rates, clade, node.type, col.branch, dNdS), col.branch == "Descendant")
desc_dnds $label = "Descendant"
desc_dnds
= subset(select(branch_rates, clade, node.type, col.branch, dNdS), col.branch == "Other")
other_dnds $label = "Other"
other_dnds
= rbind(col_dnds, desc_dnds, other_dnds)
dnds_df # Convert branch categories to long format
$label = factor(dnds_df$label, levels=c("Colonization", "Descendant", "Other"))
dnds_df
= list(c("Colonization", "Descendant"), c("Colonization", "Other"), c("Descendant", "Other"))
x_comps
= ggplot(dnds_df, aes(x=label, y=dNdS, group=label, color=node.type)) +
branch_ps_counts geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
ylab("dN/dS") +
xlab("Species tree\nbranch partition") +
bartheme() +
theme(axis.text.x = element_text(angle=25, hjust=1)) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(branch_ps_counts)
7 Substitution rates (1072 morphofacial genes)
7.1 Branch presence
#anc_info = subset(anc_info_w_root, node.type != "root")
= read.csv(branch_rates_file_mf, header=T, comment.char="#")
branch_rates_mf
names(branch_rates_mf)[1] = "tp"
= names(branch_rates_mf)[5:20]
cols_to_na for(col in cols_to_na){
$tp %in% exclude_branches,][[col]] = NA
branch_rates_mf[branch_rates_mf
}# For branches that we want to exclude for counting, convert
# columns with counts to NA
= names(tree_info)[!(names(tree_info) %in% names(branch_rates_mf))] # get non common names
uniq_info_cols = c(uniq_info_cols,"tp") # appending key parameter
uniq_info_cols # Get a list of columns from the tree_info df to join to the tree rates df
= branch_rates_mf %>% left_join((tree_info %>% select(uniq_info_cols)), by="tp")
branch_rates_mf # Select the columns from tree_info and join to tree_rates, merging by clade
# https://stackoverflow.com/a/61628157
= branch_rates_mf[order(branch_rates_mf$node), ]
branch_rates_mf # Re-order the rows by the R node
= select(branch_rates_mf, clade, node.type, num.genes.full)
full_clade $label = "Full clade"
full_cladenames(full_clade)[3] = "num.genes"
= select(branch_rates_mf, clade, node.type, num.genes.partial)
partial_clade $label = "Partial clade"
partial_cladenames(partial_clade)[3] = "num.genes"
= select(branch_rates_mf, clade, node.type, num.genes.descendant.counted)
descendant_counted $label = "Descendant counted"
descendant_countednames(descendant_counted)[3] = "num.genes"
= select(branch_rates_mf, clade, node.type, num.genes.discordant)
discordant_clade $label = "Discordant clade"
discordant_cladenames(discordant_clade)[3] = "num.genes"
= select(branch_rates_mf, clade, node.type, num.genes.missing)
no_clade $label = "Missing clade"
no_cladenames(no_clade)[3] = "num.genes"
# Subset each clade count column to add a label
= rbind(full_clade, partial_clade, descendant_counted, discordant_clade, no_clade)
clade_counts # Convert branch categories to long format
$label = factor(clade_counts$label, levels=c("Full clade", "Partial clade", "Descendant counted", "Discordant clade", "Missing clade"))
clade_counts# Factorize the labels in order
= ggplot(clade_counts, aes(x=label, y=num.genes, group=label, color=node.type)) +
branch_counts geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
ylab("# of genes") +
xlab("Species tree\nbranch classification") +
bartheme() +
theme(axis.text.x = element_text(angle=25, hjust=1)) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(branch_counts)
7.2 Tree with OU shift branches labeled
= read.csv(morpho_file, header=F, comment.char="#")
morpho_branches names(morpho_branches) = c("tp")
$morpho.branch = "No OU shift"
branch_rates$morpho.branch[branch_rates$tp %in% morpho_branches$tp] = "OU shift"
branch_rates
$morpho.branch = "No OU shift"
branch_rates_mf$morpho.branch[branch_rates_mf$tp %in% morpho_branches$tp] = "OU shift"
branch_rates_mf
= corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l # Colors
= ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=branch_rates_mf$morpho.branch)) +
col_tree scale_color_manual(name='Branch partition', values=corecol(numcol=2)) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.15,0.9))
print(col_tree)
if(save_tree_fig){
= gcf_tree + geom_text(aes(x=branch, label=ifelse(branch_rates$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
gcf_tree = paste("../../data/trees/", tree_type, "-gcf-tree-PARED-gcf50.pdf", sep="")
tree_outfile ggsave(tree_outfile, gcf_tree, width=8, height=16, unit="in")
}# OU shift branch tree
7.3 dS
= subset(select(branch_rates_mf, clade, node.type, morpho.branch, dS), morpho.branch == "OU shift")
morpho_ds_mf $label = "OU shift (MF genes)"
morpho_ds_mf#names(full_clade)[3] = "num.genes"
= subset(select(branch_rates_mf, clade, node.type, morpho.branch, dS), morpho.branch == "No OU shift")
other_ds_mf $label = "No OU shift (MF genes)"
other_ds_mf
= subset(select(branch_rates, clade, node.type, morpho.branch, dS), morpho.branch == "OU shift")
morpho_ds_all $label = "OU shift (All genes)"
morpho_ds_all#names(full_clade)[3] = "num.genes"
= subset(select(branch_rates, clade, node.type, morpho.branch, dS), morpho.branch == "No OU shift")
other_ds_all $label = "No OU shift (All genes)"
other_ds_all
= rbind(morpho_ds_all, other_ds_all, morpho_ds_mf, other_ds_mf)
ds_df # Convert branch categories to long format
$label = factor(ds_df$label, levels=c("OU shift (All genes)", "No OU shift (All genes)", "OU shift (MF genes)", "No OU shift (MF genes)"))
ds_df
= list(c("OU shift (All genes)", "No OU shift (All genes)"), c("OU shift (MF genes)", "No OU shift (MF genes)"), c("OU shift (All genes)", "OU shift (MF genes)"))
x_comps
= ggplot(ds_df, aes(x=label, y=dS, group=label, color=node.type)) +
morpho_branch_ds geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
ylab("dS") +
xlab("Species tree\nbranch partition") +
bartheme() +
theme(axis.text.x = element_text(angle=25, hjust=1)) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(morpho_branch_ds)
7.4 dN
= subset(select(branch_rates_mf, clade, node.type, morpho.branch, dN), morpho.branch == "OU shift")
morpho_dn_mf $label = "OU shift (MF genes)"
morpho_dn_mf#names(full_clade)[3] = "num.genes"
= subset(select(branch_rates_mf, clade, node.type, morpho.branch, dN), morpho.branch == "No OU shift")
other_dn_mf $label = "No OU shift (MF genes)"
other_dn_mf
= subset(select(branch_rates, clade, node.type, morpho.branch, dN), morpho.branch == "OU shift")
morpho_dn_all $label = "OU shift (All genes)"
morpho_dn_all#names(full_clade)[3] = "num.genes"
= subset(select(branch_rates, clade, node.type, morpho.branch, dN), morpho.branch == "No OU shift")
other_dn_all $label = "No OU shift (All genes)"
other_dn_all
= rbind(morpho_dn_all, other_dn_all, morpho_dn_mf, other_dn_mf)
dn_df # Convert branch categories to long format
$label = factor(dn_df$label, levels=c("OU shift (All genes)", "No OU shift (All genes)", "OU shift (MF genes)", "No OU shift (MF genes)"))
dn_df
= list(c("OU shift (All genes)", "No OU shift (All genes)"), c("OU shift (MF genes)", "No OU shift (MF genes)"), c("OU shift (All genes)", "OU shift (MF genes)"))
x_comps
= ggplot(dn_df, aes(x=label, y=dN, group=label, color=node.type)) +
morpho_branch_dn geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
ylab("dN") +
xlab("Species tree\nbranch partition") +
bartheme() +
theme(axis.text.x = element_text(angle=25, hjust=1)) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(morpho_branch_dn)
7.5 dN/dS
= subset(select(branch_rates_mf, clade, node.type, morpho.branch, dNdS), morpho.branch == "OU shift")
morpho_dnds_mf $label = "OU shift (MF genes)"
morpho_dnds_mf#names(full_clade)[3] = "num.genes"
= subset(select(branch_rates_mf, clade, node.type, morpho.branch, dNdS), morpho.branch == "No OU shift")
other_dnds_mf $label = "No OU shift (MF genes)"
other_dnds_mf
= subset(select(branch_rates, clade, node.type, morpho.branch, dNdS), morpho.branch == "OU shift")
morpho_dnds_all $label = "OU shift (All genes)"
morpho_dnds_all#names(full_clade)[3] = "num.genes"
= subset(select(branch_rates, clade, node.type, morpho.branch, dNdS), morpho.branch == "No OU shift")
other_dnds_all $label = "No OU shift (All genes)"
other_dnds_all
= rbind(morpho_dnds_all, other_dnds_all, morpho_dnds_mf, other_dnds_mf)
dnds_df # Convert branch categories to long format
$label = factor(dnds_df$label, levels=c("OU shift (All genes)", "No OU shift (All genes)", "OU shift (MF genes)", "No OU shift (MF genes)"))
dnds_df
= list(c("OU shift (All genes)", "No OU shift (All genes)"), c("OU shift (MF genes)", "No OU shift (MF genes)"), c("OU shift (All genes)", "OU shift (MF genes)"), c("No OU shift (All genes)", "No OU shift (MF genes)"))
x_comps
= ggplot(dnds_df, aes(x=label, y=dNdS, group=label, color=node.type)) +
morpho_branch_dnds geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
ylab("dN/dS") +
xlab("Species tree\nbranch partition") +
bartheme() +
theme(axis.text.x = element_text(angle=25, hjust=1)) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(morpho_branch_dnds)
8 Substitution rates (108 arid genes)
8.1 Branch presence
#anc_info = subset(anc_info_w_root, node.type != "root")
= read.csv(branch_rates_file_arid, header=T, comment.char="#")
branch_rates_arid
names(branch_rates_arid)[1] = "tp"
= names(branch_rates_arid)[5:20]
cols_to_na for(col in cols_to_na){
$tp %in% exclude_branches,][[col]] = NA
branch_rates_arid[branch_rates_arid
}# For branches that we want to exclude for counting, convert
# columns with counts to NA
= names(tree_info)[!(names(tree_info) %in% names(branch_rates_arid))] # get non common names
uniq_info_cols = c(uniq_info_cols,"tp") # appending key parameter
uniq_info_cols # Get a list of columns from the tree_info df to join to the tree rates df
= branch_rates_arid %>% left_join((tree_info %>% select(uniq_info_cols)), by="tp")
branch_rates_arid # Select the columns from tree_info and join to tree_rates, merging by clade
# https://stackoverflow.com/a/61628157
= branch_rates_arid[order(branch_rates_arid$node), ]
branch_rates_arid # Re-order the rows by the R node
= select(branch_rates_arid, clade, node.type, num.genes.full)
full_clade $label = "Full clade"
full_cladenames(full_clade)[3] = "num.genes"
= select(branch_rates_arid, clade, node.type, num.genes.partial)
partial_clade $label = "Partial clade"
partial_cladenames(partial_clade)[3] = "num.genes"
= select(branch_rates_arid, clade, node.type, num.genes.descendant.counted)
descendant_counted $label = "Descendant counted"
descendant_countednames(descendant_counted)[3] = "num.genes"
= select(branch_rates_arid, clade, node.type, num.genes.discordant)
discordant_clade $label = "Discordant clade"
discordant_cladenames(discordant_clade)[3] = "num.genes"
= select(branch_rates_arid, clade, node.type, num.genes.missing)
no_clade $label = "Missing clade"
no_cladenames(no_clade)[3] = "num.genes"
# Subset each clade count column to add a label
= rbind(full_clade, partial_clade, descendant_counted, discordant_clade, no_clade)
clade_counts # Convert branch categories to long format
$label = factor(clade_counts$label, levels=c("Full clade", "Partial clade", "Descendant counted", "Discordant clade", "Missing clade"))
clade_counts# Factorize the labels in order
= ggplot(clade_counts, aes(x=label, y=num.genes, group=label, color=node.type)) +
branch_counts geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
ylab("# of genes") +
xlab("Species tree\nbranch classification") +
bartheme() +
theme(axis.text.x = element_text(angle=25, hjust=1)) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(branch_counts)
8.2 dS
# col_branches = read.csv(col_file, header=F, comment.char="#")
# names(col_branches) = c("tp")
$col.branch = "Other"
branch_rates_arid
$col.branch[branch_rates_arid$tp %in% col_branches$tp] = "Colonization"
branch_rates_arid
#tree_info$col.desc.branch = "N"
for(i in 1:nrow(branch_rates_arid)){
if(branch_rates[i,]$node.type == "internal" && branch_rates_arid[i,]$col.branch == "Colonization"){
= getDescendants(rodent_tree, branch_rates_arid[i,]$node)
cur_desc $col.branch[branch_rates_arid$node==cur_desc[1]] = "Descendant"
branch_rates_arid$col.branch[branch_rates_arid$node==cur_desc[2]] = "Descendant"
branch_rates_arid
}
}
= subset(select(branch_rates_arid, clade, node.type, col.branch, dS), col.branch == "Colonization")
col_ds_arid $label = "Colonization (arid genes)"
col_ds_arid#names(full_clade)[3] = "num.genes"
= subset(select(branch_rates_arid, clade, node.type, col.branch, dS), col.branch == "Descendant")
desc_ds_arid $label = "Descendant (arid genes)"
desc_ds_arid
= subset(select(branch_rates_arid, clade, node.type, col.branch, dS), col.branch == "Other")
other_ds_arid $label = "Other (arid genes)"
other_ds_arid
= subset(select(branch_rates, clade, node.type, col.branch, dS), col.branch == "Colonization")
col_ds $label = "Colonization (All genes)"
col_ds#names(full_clade)[3] = "num.genes"
= subset(select(branch_rates, clade, node.type, col.branch, dS), col.branch == "Descendant")
desc_ds $label = "Descendant (All genes)"
desc_ds
= subset(select(branch_rates, clade, node.type, col.branch, dS), col.branch == "Other")
other_ds $label = "Other (All genes)"
other_ds
= rbind(col_ds, desc_ds, other_ds, col_ds_arid, desc_ds_arid, other_ds_arid)
ds_df # Convert branch categories to long format
$label = factor(ds_df$label, levels=c("Colonization (All genes)", "Descendant (All genes)", "Other (All genes)", "Colonization (arid genes)", "Descendant (arid genes)", "Other (arid genes)"))
ds_df
= list(c("Other (All genes)", "Other (arid genes)"), c("Colonization (All genes)", "Colonization (arid genes)"), c("Descendant (All genes)", "Descendant (arid genes)"))
x_comps
= ggplot(ds_df, aes(x=label, y=dS, group=label, color=node.type)) +
branch_ps_counts geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
ylab("dS") +
xlab("Species tree\nbranch partition") +
bartheme() +
theme(axis.text.x = element_text(angle=25, hjust=1)) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(branch_ps_counts)
8.3 dN
= subset(select(branch_rates_arid, clade, node.type, col.branch, dN), col.branch == "Colonization")
col_dn_arid $label = "Colonization (arid genes)"
col_dn_arid#names(full_clade)[3] = "num.genes"
= subset(select(branch_rates_arid, clade, node.type, col.branch, dN), col.branch == "Descendant")
desc_dn_arid $label = "Descendant (arid genes)"
desc_dn_arid
= subset(select(branch_rates_arid, clade, node.type, col.branch, dN), col.branch == "Other")
other_dn_arid $label = "Other (arid genes)"
other_dn_arid
= subset(select(branch_rates, clade, node.type, col.branch, dN), col.branch == "Colonization")
col_dn $label = "Colonization (All genes)"
col_dn#names(full_clade)[3] = "num.genes"
= subset(select(branch_rates, clade, node.type, col.branch, dN), col.branch == "Descendant")
desc_dn $label = "Descendant (All genes)"
desc_dn
= subset(select(branch_rates, clade, node.type, col.branch, dN), col.branch == "Other")
other_dn $label = "Other (All genes)"
other_dn
= rbind(col_dn, desc_dn, other_dn, col_dn_arid, desc_dn_arid, other_dn_arid)
dn_df # Convert branch categories to long format
$label = factor(ds_df$label, levels=c("Colonization (All genes)", "Descendant (All genes)", "Other (All genes)", "Colonization (arid genes)", "Descendant (arid genes)", "Other (arid genes)"))
dn_df
= list(c("Other (All genes)", "Other (arid genes)"), c("Colonization (All genes)", "Colonization (arid genes)"), c("Descendant (All genes)", "Descendant (arid genes)"))
x_comps
= ggplot(dn_df, aes(x=label, y=dN, group=label, color=node.type)) +
branch_ps_counts geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
ylab("dN") +
xlab("Species tree\nbranch partition") +
bartheme() +
theme(axis.text.x = element_text(angle=25, hjust=1)) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(branch_ps_counts)
8.4 dN/dS
= subset(select(branch_rates_arid, clade, node.type, col.branch, dNdS), col.branch == "Colonization")
col_dnds_arid $label = "Colonization (arid genes)"
col_dnds_arid#names(full_clade)[3] = "num.genes"
= subset(select(branch_rates_arid, clade, node.type, col.branch, dNdS), col.branch == "Descendant")
desc_dnds_arid $label = "Descendant (arid genes)"
desc_dnds_arid
= subset(select(branch_rates_arid, clade, node.type, col.branch, dNdS), col.branch == "Other")
other_dnds_arid $label = "Other (arid genes)"
other_dnds_arid
= subset(select(branch_rates, clade, node.type, col.branch, dNdS), col.branch == "Colonization")
col_dnds $label = "Colonization (All genes)"
col_dnds#names(full_clade)[3] = "num.genes"
= subset(select(branch_rates, clade, node.type, col.branch, dNdS), col.branch == "Descendant")
desc_dnds $label = "Descendant (All genes)"
desc_dnds
= subset(select(branch_rates, clade, node.type, col.branch, dNdS), col.branch == "Other")
other_dnds $label = "Other (All genes)"
other_dnds
= rbind(col_dnds, desc_dnds, other_dnds, col_dnds_arid, desc_dnds_arid, other_dnds_arid)
dnds_df # Convert branch categories to long format
$label = factor(ds_df$label, levels=c("Colonization (All genes)", "Descendant (All genes)", "Other (All genes)", "Colonization (arid genes)", "Descendant (arid genes)", "Other (arid genes)"))
dnds_df
= list(c("Other (All genes)", "Other (arid genes)"), c("Colonization (All genes)", "Colonization (arid genes)"), c("Descendant (All genes)", "Descendant (arid genes)"))
x_comps
= ggplot(dnds_df, aes(x=label, y=dNdS, group=label, color=node.type)) +
branch_ps_counts geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
ylab("dN/dS") +
xlab("Species tree\nbranch partition") +
bartheme() +
theme(axis.text.x = element_text(angle=25, hjust=1)) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(branch_ps_counts)
9 Basic rate and phenotype correlations with tip branches
= read.csv("../../data/phenotype-data/combined-phenotype-data.csv", header=T, comment.char="#")
pheno = subset(branch_rates, node.type=="tip")
tips names(tips)[2] = "sample"
= merge(pheno, tips, by="sample")
pheno_rates
= select(pheno_rates, sample, Adult_Mass.g., Total_Length.mm., Head.Body_Length.mm., Tail_Length.mm., Hind_Foot_Length.mm., Relative_Tail_Length, Relative_Hind_Foot_Length, dN, dS, dNdS)
pheno_rates
= melt(pheno_rates, id.vars=c("sample", "dN", "dS", "dNdS"))
pheno_rates_long
#pheno_rates_long = gather(pheno_rates, sample, value, Adult_Mass.g., Total_Length.mm., Head.Body_Length.mm., Tail_Length.mm., Hind_Foot_Length.mm., Relative_Tail_Length, Relative_Hind_Foot_Length)
9.1 Avg dS per tip vs phenotype
= ggplot(pheno_rates_long, aes(x=log(value), y=log(dS))) +
p geom_point(size=2, alpha=0.2, color="#333333") +
geom_smooth(method="lm", se=F, linetype="dashed", color=corecol(numcol=1, pal="wilke", offset=2)) +
xlab("log avg. trait value per tip") +
ylab("log dS per tip") +
facet_wrap(~variable, scales="free_x") +
bartheme()
print(p)
9.2 Avg dN per tip vs phenotype
= ggplot(pheno_rates_long, aes(x=log(value), y=log(dN))) +
p geom_point(size=2, alpha=0.2, color="#333333") +
geom_smooth(method="lm", se=F, linetype="dashed", color=corecol(numcol=1, pal="wilke", offset=2)) +
xlab("log avg. trait value per tip") +
ylab("log dN per tip") +
facet_wrap(~variable, scales="free_x") +
bartheme()
print(p)
9.3 Avg dN/dS per tip vs phenotype
= ggplot(pheno_rates_long, aes(x=log(value), y=log(dNdS))) +
p geom_point(size=2, alpha=0.2, color="#333333") +
geom_smooth(method="lm", se=F, linetype="dashed", color=corecol(numcol=1, pal="wilke", offset=2)) +
xlab("log avg. trait value per tip") +
ylab("log dN/dS per tip") +
facet_wrap(~variable, scales="free_x") +
bartheme()
print(p)