Murine species trees

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

library(ggplot2)
library(cowplot)
library(ggbeeswarm)
library(dplyr)
library(kableExtra)
library(tidyr)
library(ggtree)
library(phytools)
library(phangorn)
library(reshape2)
library(ggExtra)
library(ggrepel)
library(vroom)
library(ggdist)
library(stringr)
library(ggsignif)
library(phytools)
library(castor)
library(MCMCtreeR)
library(here)
library(stringr)
library(BAMMtools)
library(nlme)
library(RPANDA)

source(here("docs", "scripts", "lib", "design.r"))
source(here("docs", "scripts", "lib", "get_tree_info.r"))

#source("C:/bin/core/r/design.r")
#source("C:/bin/core/r/get_tree_info.r")

#source("C:/Users/grt814/bin/core/corelib/design.r")
#source("C:/Users/grt814/bin/core/get_tree_info.r")

#htmltools::includeHTML("../html-chunks/rmd_nav.html")

< Back to summary

1 Full coding species tree

188 species targeted capture to mouse exons
Assembled with SPADES
11,795 coding loci aligned with exonerate+MAFFT
Gene trees inferred with IQ-Tree
Species tree inferred with ASTRAL
Branch lengths inferred with IQ-Tree using the 866 loci with a normalized RF distance to the species tree of less than 0.25.

# This chunk handles all of the main inputs and reads the tree

# cat("188 species targeted capture to mouse exons assembled with SPADES.\n")
# cat("11,775 coding loci aligned with exonerate+mafft\n")
# cat("Gene trees inferred with IQtree.\n")
# cat("Species tree inferred with ASTRAL.\n")
# cat("Branch lengths estimated by ASTRAL.\n")
# Data summary

###############

#tree_file = here("docs", "data", "trees", "astral", "concord-rooted", "full_coding_iqtree_astral.cf.labeled.tree")
tree_file = here("docs", "data", "trees", "astral", "concord-rooted-bl", "full_coding_iqtree_astral_rooted_bl.cf.labeled.tree")
# Newick tree file, with tp labels

rodent_tree = read.tree(tree_file)
tree_to_df_list = treeToDF(rodent_tree)
tree_info = tree_to_df_list[["info"]]
# Read the tree and parse with treetoDF

tree_info = tree_info %>% separate(label, c("tp", "support"), sep=">", remove=F)
tree_info$tp = paste(tree_info$tp, ">", sep="")
tree_info = tree_info %>% separate(support, c("astral", "gcf", "scf"), sep="/", remove=T)
# Split the label by /. tp is my treeParse label.

tree_info$astral[tree_info$node.type=="tip"] = NA
# Fill in ASTRAL support as NA for the tips

tree_info$astral = as.numeric(tree_info$astral)
tree_info$gcf = as.numeric(tree_info$gcf)
tree_info$scf = as.numeric(tree_info$scf)
# Convert all supports to numeric

###############

#iq_tree_labels = here("docs", "data", "trees", "astral", "concord-rooted", "full_coding_iqtree_astral.cf.branch")
iq_tree_labels = here("docs", "data", "trees", "astral", "concord-rooted-bl", "full_coding_iqtree_astral_rooted_bl.cf.branch")
# Newick tree file, with iqtree labels

###############

#cf_stat_file = here("docs", "data", "trees", "astral", "concord-rooted", "full_coding_iqtree_astral.cf.stat")
cf_stat_file = here("docs", "data", "trees", "astral", "concord-rooted-bl", "full_coding_iqtree_astral_rooted_bl.cf.stat")
cf_stats = read.table(cf_stat_file, header=T)
# Concordance factor file

###############

#cf_rep_dir = here("docs", "data", "trees", "astral", "concord-rooted", "cf-reps")
cf_rep_dir = here("docs", "data", "trees", "astral", "concord-rooted-bl", "cf-reps")
#delta_outfile = here("docs", "data", "trees", "astral", "concord-rooted", "delta.tab")
delta_outfile = here("docs", "data", "trees", "astral", "concord-rooted-bl", "delta.tab")
# CF reps for delta and delta outfile

###############

col_file = here("docs", "data", "trees", "astral", "astral-colonization-branches.txt")
exclude_branches = c("Lophiomys_imhausi_UM5152", "Lophuromys_woosnami_LSUMZ37793", "<1>", "<187>", "<186>")
# Colonization branches and branches around the root to exclude from some things

###############

gt_file = here("docs", "data", "trees", "gene-trees", "loci-labeled.treefile")
# The file containing the gene trees

###############

#xmax = 31
xmax = 0.175
# The max of the x-axis for tree figures

# The node/branch labels in R and IQtree differ. IQtree uses a nice, logical post-ordering
# of internal nodes while R does something random and assigns labels to tips as well. This 
# chunk matches those up for the delta analysis later

iq_tree = read.tree(iq_tree_labels)
iqtree_to_df_list = treeToDF(iq_tree)
iqtree_info = iqtree_to_df_list[["info"]]
# Read the IQ tree tree with branch labels in and parse with get_tree_info

#node_convert = matchNodes(tree_to_df_list[["labeled.tree"]], iqtree_to_df_list[["labeled.tree"]], method="descendants")

tree_info$iqtree.node = NA
# Add a column to the main tree table about IQ tree labels

for(i in 1:nrow(tree_info)){
  cur_node = tree_info[i,]$node
  iqtree_row = subset(iqtree_info, node==cur_node)
  iqtree_label = iqtree_row$label
  tree_info[i,]$iqtree.node = iqtree_label
}
# For every row in the main tree table, add in the IQ tree node label given that
# we've read the same tree in R and can use the node.labels

1.1 Branches colored by gCF

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors

gcf_tree = ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$gcf)) +
  scale_color_continuous(name='gCF', low=l, high=h, limits=c(0,100)) +
  xlim(0, xmax) +
  geom_tiplab(color="#333333", fontface='italic', size=2) +
  theme(legend.position=c(0.05,0.9))
#gcf_tree = gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
# Option to add branch labels to the figure
print(gcf_tree)

###############

fig_outfile = here("docs", "figs", "full-coding-astral-rooted-gcf.png")
ggsave(fig_outfile, gcf_tree, width=14, height=14, unit="in")
# Save the tree image

## gCF tree

1.2 Branches colored by sCF

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors

scf_tree = ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$scf)) +
  scale_color_continuous(name='sCF', low=l, high=h, limits=c(0,100)) +
  xlim(0, xmax) +
  geom_tiplab(color="#333333", fontface='italic', size=2) +
  theme(legend.position=c(0.05,0.9))
#gcf_tree = gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
# Option to add branch labels to the figure
print(gcf_tree)

###############

fig_outfile = here("docs", "figs", "full-coding-astral-rooted-scf.png")
ggsave(fig_outfile, scf_tree, width=14, height=14, unit="in")
# Save the tree image

## gCF tree

1.3 Gene vs site concordance factors colored by branch support

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors

p = ggplot(tree_info, aes(x=gcf, y=scf, color=astral)) + 
  geom_point(size=2, alpha=0.5) +
  scale_color_continuous(name='Astral support', low=l, high=h, limits=c(0.8,1)) +
  bartheme() +
  theme(legend.title=element_text(size=12))
print(p)

###############

fig_outfile = here("docs", "figs", "full-coding-astral-cf.png")
ggsave(fig_outfile, p, width=6, height=3, unit="in")
# Save the figure

1.4 Concordance factors vs. branch lengths

bl_gcf_p = ggplot(tree_info, aes(x=branch.length, y=gcf)) +
  geom_point(size=3, alpha=0.5) +
  #geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
  #geom_smooth(method="lm", se=F, ) +
  scale_x_continuous(limits=c(0,0.03)) +
  xlab("Branch length") + 
  ylab("gCF per branch") + 
  bartheme()
bl_gcf_p = ggExtra::ggMarginal(bl_gcf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666")

bl_scf_p = ggplot(tree_info, aes(x=branch.length, y=scf)) +
  geom_point(size=3, alpha=0.5) +
  #geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
  #geom_smooth(method="lm", se=F, ) +
  scale_x_continuous(limits=c(0,0.03)) +
  xlab("Branch length") + 
  ylab("sCF per branch") + 
  bartheme()
bl_scf_p = ggExtra::ggMarginal(bl_scf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=5), color="#666666")

p = plot_grid(bl_gcf_p, bl_scf_p, ncol=2)
print(p)

###############

fig_outfile = here("docs", "figs", "full-coding-astral-cf-bl.png")
ggsave(fig_outfile, p, width=6, height=3, unit="in")
# Save the figure

2 Species tree comparisons

2.1 ASTRAL tree vs. concatenated tree

concat_file = here("docs", "data", "trees", "concat", "full-coding-concat-r-labels.tre")
concat_tree = read.tree(concat_file)
# Read the concatenated tree

astral_concat_comp = cophylo(rodent_tree, concat_tree)

## Rotating nodes to optimize matching...
## Done.

# Generate the cophylo object

plot(astral_concat_comp, link.type="curved", link.lwd=4, link.lty="solid", link.col=make.transparent(corecol(numcol=1), 0.5), fsize=c(0.5,0.5))

# Plot the trees

#comparePhylo(rodent_tree, concat_tree, plot=T, force.rooted=T)

2.2 ASTRAL tree vs. mitochondrial tree

mito_file = here("docs", "data", "trees", "mitochondrial", "concat-concord", "full-mitochondrial-concat.cf.rooted.tree")
mito_tree = read.tree(mito_file)
# Read the mitochondrial tree

astral_mito_comp = cophylo(rodent_tree, mito_tree)

## Rotating nodes to optimize matching...
## Done.

# Generate the cophylo object

plot(astral_mito_comp, link.type="curved", link.lwd=4, link.lty="solid", link.col=make.transparent(corecol(numcol=1, offset=1), 0.5), fsize=c(0.5,0.5))

# Plot the trees

3 Gene trees

The 11,774 gene trees contain varying numbers of taxa due to filtering on the alignment and filtering by IQtree for identical sequences.

3.1 Taxa per gene tree

gene_trees = read.table(gt_file, sep="\t")
names(gene_trees) = c("gene", "tree")
# Read the geen tree file

gene_trees$gene = word(gene_trees$gene, 1, sep="-")
gene_trees$gene = word(gene_trees$gene, 2, sep="/")
# Parse out the protein ID from the filename for each gene

###############

gt_data = data.frame("gene"=c(), "rf"=c(), "num.tips"=c(), "rf.zeros"=c(), "num.tips.zeros"=c())
# A df to track info for each gene tree

for(i in 1:nrow(gene_trees)){
  gt = read.tree(text=gene_trees[i,]$tree)
  # Read the gene tree as a phylo object
  
  cur_tips = gt$tip.label
  pruned_tree = drop.tip(rodent_tree, rodent_tree$tip.label[-match(cur_tips, rodent_tree$tip.label)])
  # Get a list of the tips and prune the species tree to contain only those tips
  
  cur_rf = RF.dist(pruned_tree, gt, normalize=T)
  # Calculate RF between the pruned species tree and the gene tree

  if(cur_rf==0){
    gt_data = rbind(gt_data, data.frame("gene"=gene_trees[i,]$gene, "rf"=cur_rf, "num.tips"=length(cur_tips), "rf.zeros"=cur_rf, "num.tips.zeros"=length(cur_tips)))
  }else{
    gt_data = rbind(gt_data, data.frame("gene"=gene_trees[i,]$gene, "rf"=cur_rf, "num.tips"=length(cur_tips), "rf.zeros"=NA, "num.tips.zeros"=NA))
  }
  # Add the gene tree info to the df, with a special case for when the RF is 0
}
# Read all the gene trees

p = ggplot(gt_data, aes(x=num.tips)) +
  geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1, offset=6), color="#666666") +
  #geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
  #geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
  scale_y_continuous(expand=c(0,0)) +
  xlab("# of taxa") +
  ylab("# of gene trees") +
  bartheme()
print(p)

###############

fig_outfile = here("docs", "figs", "full-coding-gt-taxa-dist.png")
ggsave(fig_outfile, p, width=6, height=3, unit="in")
# Save the figure

#rf_file = here("docs", "data", "trees", "astral", "concord-rooted", "loci-nrf-below-0.25.txt")
#write.table(subset(gt_data, rf<=0.25), file=rf_file, row.names=F, quote=F, sep="\t")
# Write info for genes with low RF

3.2 Tree distance (RF) between gene trees and species tree

Since RF cannot handle missing taxa, the species tree is pruned for each gene tree to calculate Robinson-Foulds distance. We use the normalized metric since there are varying numbers of tips per gene tree.

p = ggplot(gt_data, aes(x=rf)) +
  geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666") +
  #geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
  #geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
  scale_y_continuous(expand=c(0,0)) +
  xlab("Normalized RF") +
  ylab("# of gene trees") +
  bartheme()
print(p)

###############

fig_outfile = here("docs", "figs", "full-coding-gt-rf-dist.png")
ggsave(fig_outfile, p, width=6, height=3, unit="in")
# Save the figure

4 Delta

4.1 Introduction

For each lineage in the species tree with gCF < 95% we calculated the \(\Delta\) statistic (Huson et al. 2005). This statistic follows the same logic as the ABBA-BABA site patterns used to calculate D-statistics, but uses gene tree topologies instead of alignment sites. Briefly, a given branch in an unrooted tree is defined by a quartet of species groupings with two possible discordant topologies, \(D_1\) and \(D_2\) (see Figure 1 from Minh et al. 2020). Under assumptions that discordance is caused by ILS, both discordant topologies should be present in equal proportions. However, if introgression has occurred one discordant topology will appear more frequently than the other. \(\Delta\) is calculated for a branch as follows, using the frequency of each discordant topology (Vanderpool et al. 2020):

\[\Delta = \frac{D_1 - D_2}{D_1 + D_2}\]

This normalized \(\Delta\) calculation ensures that all values are scaled between 0 and 1, with larger values indicating a larger skew towards one topology, and a higher chance that introgression has occurred.

To test whether the observed \(\Delta\) values are skewed significantly from 0 to imply introgression, we performed concordance factor analysis on 1,000 bootstrap replicates of our inferred gene trees to generate a null distribution of \(\Delta\) values. We then calculated Z-scores and p-values and assessed significance for each branch at a threshold of 0.01.

4.2 Null and actual distributions

Nodes with p < 0.01:

tree_info$delta.sig = F
# Add a column to the tree info table about significant delta values

###############

fig_outfile = here("docs", "figs", "full-coding-delta-dists.png")
# The image file to either generate or insert

if(!file.exists(delta_outfile)){
  cf_stats$delta = (abs(cf_stats$gDF1_N - cf_stats$gDF2_N)) / (cf_stats$gDF1_N + cf_stats$gDF2_N)
  # Calculate the delta values on the actual data
  
  iqtree_delta = select(cf_stats, ID, delta)
  names(iqtree_delta) = c("iqtree.node", "delta")
  tree_info = merge(x = tree_info, y = iqtree_delta, by = "iqtree.node", all.x=TRUE)
  # Adding the delta values to the main tree info table
  
  tree_info = tree_info[order(tree_info$node), ]
  # Re-sort the data frame by R node order after the merge so the trees still work
    
  low_cf_nodes = subset(cf_stats, gCF < 95)
  # Get the low concordance factor nodes from the data to test with delta
  
  delta_null = c()
  for(i in 0:999){
    cur_rep_str = as.character(i)
    while(nchar(cur_rep_str) < 4){
      cur_rep_str = paste("0", cur_rep_str, sep="")
    }
    # Handling the string of the rep
    
    cur_cf_file = paste(cf_rep_dir, "/rep", cur_rep_str, ".cf.stat", sep="")
    cf_rep = read.table(cur_cf_file, header=T, fill=T)
    # Read the current reps cf file
    
    cf_rep$delta = (abs(cf_rep$gDF1_N - cf_rep$gDF2_N)) / (cf_rep$gDF1_N + cf_rep$gDF2_N)
    delta_null = c(delta_null, cf_rep$delta)
    # Calculate delta for this rep and save values in vector
  }
  # Read concordance factors from bootstrap samples of gene trees and calculate delta to generate
  # null distribution
  
  ###############
  
  delta_null_df = data.frame("delta"=delta_null, y="duh")
  delta_null_df = subset(delta_null_df, !is.nan(delta))
  # Convert the delta values to a data frame for ggplot
  
  delta_mu = mean(delta_null_df$delta, na.rm=T)
  delta_sd = sd(delta_null_df$delta, na.rm=T)
  # Calculate the mean and sd of the null distribution to get z-scores and p-values
  
  ###############
  
  delta_out = data.frame("node"=c(), "delta"=c(), "z-score"=c(), "p-value"=c())
  # Initialize output data frame
  
  for(i in 1:nrow(low_cf_nodes)){
    row = low_cf_nodes[i,]
    z = (row$delta - delta_mu) / delta_sd
    p = pnorm(-abs(z))
    if(p < 0.01){
      print(paste(row$ID, row$delta, z, p, sep=" "))
      tree_info$delta.sig[tree_info$iqtree.node==row$ID] = T
      # Set the delta significant to TRUE in the main tree info table
    }
    delta_out = rbind(delta_out, data.frame("node"=row$ID, "delta"=row$delta, "z-score"=z, "p-value"=p))
  }
  # Calculate z and p for each low gCF node in the species tree and save to output data frame
  
  write.table(file=delta_outfile, delta_out, sep="\t", row.names=F)
  # Write the delta values per node to the output file
  
  ###############
  
  delta_null_p = ggplot(delta_null_df, aes(x=delta)) +
    geom_histogram(color="#ececec", bins=50) +
    scale_x_continuous(limits=c(0,1)) +
    scale_y_continuous(expand=c(0,0)) +
    xlab("Delta") + 
    ylab("# nodes") +
    bartheme()
  
  delta_actual_p = ggplot(low_cf_nodes, aes(x=delta)) +
    geom_histogram(fill="#920000", color="#ececec") +
    scale_x_continuous(limits=c(0,1)) +
    scale_y_continuous(expand=c(0,0)) +
    xlab("Delta") +
    ylab("# nodes") +
    bartheme()
  
  delta_p = plot_grid(delta_null_p, delta_actual_p, ncol=2, labels=c("Bootstrap distribution", "Actual distribution"), label_size=12)
  print(delta_p)
  
  ###############
  
  ggsave(fig_outfile, delta_p, width=10, height=4, unit="in")
  # Save the figure
  
}else{
  tree_info$delta = NA
  
  delta_out = read.table(delta_outfile, header=T)
  for(i in 1:nrow(delta_out)){
    row = delta_out[i,]
    tree_info$delta[tree_info$iqtree.node==row$node] = row$delta
    if(row$p.value < 0.01){
      print(paste(row$node, row$delta, row$z.score, row$p.value, sep=" "))
      tree_info$delta.sig[tree_info$iqtree.node==row$node] = T
      # Set the delta significant to TRUE in the main tree info table      
    }
  }
  
  knitr::include_graphics(fig_outfile)
  
}

## [1] "219 0.406732117812062 2.44516394160885 0.00723931554647087"
## [1] "222 0.407151095732411 2.44879368056455 0.00716677631309369"
## [1] "232 0.622047244094488 4.31050734912284 8.14402076760321e-06"
## [1] "260 0.416606498194946 2.53070883884166 0.00569161487327277"
## [1] "272 0.531951640759931 3.52998048773703 0.000207795157849876"
## [1] "273 0.464877663772691 2.9488972960955 0.00159454968995379"
## [1] "317 0.510204081632653 3.34157446746339 0.000416523334864317"
## [1] "324 0.407725321888412 2.45376838435342 0.00706840016380011"

4.3 Branches by delta

Branches with significant delta are labeled

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)

delta_tree = ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$delta)) +
  scale_color_continuous(name="Delta", high=h, low=l) +
  xlim(0, xmax) +
  geom_tiplab(color="#333333", fontface='italic', size=2) +
  theme(legend.position=c(0.05,0.9)) +
  geom_label(aes(x=branch, label=ifelse(tree_info$delta.sig,as.character(tree_info$tp),'')), label.size=NA, fill="transparent", show.legend=F, vjust=-0.1)
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(delta_tree)

###############

fig_outfile = here("docs", "figs", "full-coding-astral-delta.png")
ggsave(fig_outfile, delta_tree, width=14, height=14, unit="in")
# Save the tree image

4.4 Branches with significant delta

h = corecol(numcol=1, pal="wilke", offset=1)
l = corecol(numcol=1, offset=1)

intro_tree = ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$delta.sig)) +
  scale_color_manual(name="Significant Delta", labels=c("False", "True"), values=corecol(pal="trek", numcol=2)) +
  xlim(0, xmax) +
  geom_tiplab(color="#333333", fontface='italic', size=2) +
  theme(legend.position=c(0.05,0.9)) +
  geom_label(aes(x=branch, label=ifelse(tree_info$delta.sig,as.character(tree_info$tp),'')), label.size=NA, fill="transparent", show.legend=F, vjust=-0.1)
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(intro_tree)

4.5 Concordance factors and delta

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors

p = ggplot(tree_info, aes(x=gcf, y=scf, color=delta)) + 
  geom_point(size=2, alpha=0.5) +
  geom_text_repel(aes(label=ifelse(delta.sig,as.character(tp),'')), show_guide=F, min.segment.length=0) +
  scale_color_continuous(name='Delta', low=l, high=h) +
  xlab("gCF") +
  ylab("sCF") +
  bartheme() +
  theme(legend.title=element_text(size=12))

print(p)

###############

fig_outfile = here("docs", "figs", "full-coding-delta-cf.png")
ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure

4.6 Branch length and delta

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors

p = ggplot(tree_info, aes(x=branch.length, y=delta, color=delta)) + 
  geom_point(size=2, alpha=0.5) +
  geom_text_repel(aes(label=ifelse(delta.sig,as.character(tp),'')), show_guide=F, min.segment.length=0) +
  scale_x_continuous(limits=c(0,0.03)) +
  #scale_color_continuous(name='Delta', low=l, high=h) +
  xlab("Branch length") +
  ylab("Delta") +
  bartheme() +
  theme(legend.title=element_text(size=12))

print(p)

###############

fig_outfile = here("docs", "figs", "full-coding-delta-bl.png")
ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure

4.7 Species tree without tips

#no_tip_file = here("docs", "data", "trees", "astral", "concord-rooted", "full_coding_iqtree_astral.cf.labeled.notips.tree")
#no_tip_file = here("docs", "data", "trees", "astral", "concord-rooted-bl", "full_coding_iqtree_astral_rooted_bl.cf.labeled.notips.tree")

no_tip_file = here("docs", "data", "trees", "astral", "iqtree-dating", "full-coding-astral-iqtree-dating-anil-mmus-12.1.timetree.reordered.labeled.notips.tre")


tree2 = read.tree(no_tip_file)
tree_to_df_list = treeToDF(tree2)
tree2_info = tree_to_df_list[["info"]]
names(tree2_info)[5] = "tp"
# Read the tree and parse with treetoDF

tree_info_notip = subset(tree_info, tp %in% tree2_info$tp)
tree_info_notip = tree_info_notip %>% select(tp, delta, delta.sig)
tree2_info = left_join(tree2_info, tree_info_notip, by="tp")
# Add the delta values for each node to the no tip tree

tree2_tip_info = subset(tree2_info, node.type=="tip")
## No tip tree
###############

tree3 = drop.tip(tree2, "<1>")
tree_to_df_list = treeToDF(tree3)
tree3_info = tree_to_df_list[["info"]]
names(tree3_info)[5] = "tp"
# Read the tree and parse with treetoDF

tree_info_notip = subset(tree_info, tp %in% tree2_info$tp)
tree_info_notip = tree_info_notip %>% select(tp, delta, delta.sig)
tree3_info = left_join(tree3_info, tree_info_notip, by="tp")
# Add the delta values for each node to the no tip tree

tree3_tip_info = subset(tree3_info, node.type=="tip")
## No tip tree, no outgroup
###############

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)

no_tip_max = 25
#no_tip_max = 0.13

delta_tree = ggtree(tree2, size=0.8, ladderize=F, aes(color=tree2_info$delta)) +
  scale_color_continuous(name="Delta", high=h, low=l) +
  xlim(0, no_tip_max) +
  geom_tiplab(color="#333333", fontface='italic', size=5) +
  theme(legend.position=c(0.05,0.9))
  #geom_label(aes(x=branch, label=ifelse(tree_info$delta.sig,as.character(tree_info$tp),'')), label.size=NA, fill="transparent", show.legend=F, vjust=-0.1)
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(delta_tree)

###############

fig_outfile = here("docs", "figs", "full-coding-astral-delta-notips.png")
ggsave(fig_outfile, delta_tree, width=12, height=12, unit="in")
# Save the tree image

4.8 Phylogenetic signal of delta (Pagel’s lambda, Bloomberg K)

# Generally helpful: https://lukejharmon.github.io/pcm/chapter6_beyondbm/

phylosig(tree2, tree2_tip_info$delta, test=T)

## [1] "x has no names; assuming x is in the same order as tree$tip.label"
## [1] "some data in x given as 'NA', dropping corresponding species from tree"

## 
## Phylogenetic signal K : 0.222633 
## P-value (based on 1000 randomizations) : 0.211

# Bloomberg K

phylosig(tree2, tree2_tip_info$delta, method="lambda", test=T)

## [1] "x has no names; assuming x is in the same order as tree$tip.label"
## [1] "some data in x given as 'NA', dropping corresponding species from tree"

## 
## Phylogenetic signal lambda : 6.6107e-05 
## logL(lambda) : 52.31 
## LR(lambda=0) : -0.00215541 
## P-value (based on LR test) : 1

#Pagel's lambda

###############

#new_tree = drop.tip(no_tip_tree, "<1>")
#new_info = subset(new_tip_info, tp != "<1>")
#a = data.frame(x=new_info$branch.length, y=new_info$delta, names=new_info$tp, row.names=new_info$tp)
#picModel <- pic.pgls(formula = y ~ x, phy = new_tree, y = a, lambda = "ML", return.intercept.stat = FALSE)

# PGLS.. not sure how to interpret
# https://cran.r-project.org/web/packages/motmot/vignettes/motmot.html#fast-estimation-of-phylogenetic-generalised-least-squares
# Or maybe this is better: https://lukejharmon.github.io/ilhabela/instruction/2015/07/03/PGLS/
###############

4.9 Phylogenetic signal of delta (Pagel’s lambda, Bloomberg K) for clades with high speciation rates (see below)

4.9.1 Rattini

# Generally helpful: https://lukejharmon.github.io/pcm/chapter6_beyondbm/

tree2_rattini = tree2
tree2_rattini$tip.label = paste(tree2_rattini$tip.label, "/", subset(tree2_info, node.type == "tip")$delta, sep="")
tree2_rattini$node.label = paste(tree2_rattini$node.label, "/", subset(tree2_info, node.type != "tip")$delta, sep="")
# Add delta values to the labels of the tree with no tips

tree2_rattini = extract.clade(tree2_rattini, tree2_info$node[tree2_info$tp=="<169>"])
tree_to_df_list = treeToDF(tree2_rattini)
tree2_rattini_info = tree_to_df_list[["info"]]
# Get the subtree and parse to df

tree2_rattini_info = tree2_rattini_info %>% separate(label, c("tp", "delta"), sep="/", remove=F)
tree2_rattini_info$delta = as.numeric(tree2_rattini_info$delta)
# Split the label by / to get the delta value

tree2_rattini_tip_info = subset(tree2_rattini_info, node.type=="tip")
# Get only the tip info for the tests

phylosig(tree2_rattini, tree2_rattini_tip_info$delta, test=T)

## [1] "x has no names; assuming x is in the same order as tree$tip.label"

## 
## Phylogenetic signal K : 0.634626 
## P-value (based on 1000 randomizations) : 0.129

# Bloomberg K

phylosig(tree2_rattini, tree2_rattini_tip_info$delta, method="lambda", test=T)

## [1] "x has no names; assuming x is in the same order as tree$tip.label"

## 
## Phylogenetic signal lambda : 6.6107e-05 
## logL(lambda) : 12.2344 
## LR(lambda=0) : -0.00023808 
## P-value (based on LR test) : 1

#Pagel's lambda

4.9.2 Hydromyini

# Generally helpful: https://lukejharmon.github.io/pcm/chapter6_beyondbm/

tree2_rattini = tree2
tree2_rattini$tip.label = paste(tree2_rattini$tip.label, "/", subset(tree2_info, node.type == "tip")$delta, sep="")
tree2_rattini$node.label = paste(tree2_rattini$node.label, "/", subset(tree2_info, node.type != "tip")$delta, sep="")
# Add delta values to the labels of the tree with no tips

tree2_rattini = extract.clade(tree2_rattini, tree2_info$node[tree2_info$tp=="<177>"])
tree_to_df_list = treeToDF(tree2_rattini)
tree2_rattini_info = tree_to_df_list[["info"]]
# Get the subtree and parse to df

tree2_rattini_info = tree2_rattini_info %>% separate(label, c("tp", "delta"), sep="/", remove=F)
tree2_rattini_info$delta = as.numeric(tree2_rattini_info$delta)
# Split the label by / to get the delta value

tree2_rattini_tip_info = subset(tree2_rattini_info, node.type=="tip")
# Get only the tip info for the tests

phylosig(tree2_rattini, tree2_rattini_tip_info$delta, test=T)

## [1] "x has no names; assuming x is in the same order as tree$tip.label"

## 
## Phylogenetic signal K : 0.505996 
## P-value (based on 1000 randomizations) : 0.455

# Bloomberg K

phylosig(tree2_rattini, tree2_rattini_tip_info$delta, method="lambda", test=T)

## [1] "x has no names; assuming x is in the same order as tree$tip.label"

## 
## Phylogenetic signal lambda : 6.6107e-05 
## logL(lambda) : 15.8495 
## LR(lambda=0) : -0.000279124 
## P-value (based on LR test) : 1

#Pagel's lambda

4.10 Phylogenetic independent contrasts of delta

PIC are basically the expected differences in trait values between pairs of branches descending from a node in a tree corrected for expected variance, which is just the sum of the branch lengths. For example, for a given node \(N\) in a tree, the PIC will be:

\[PIC(N) = \frac{V_1 - V_2}{L_1 + L_2}\]

With a post-order traversal of the tree, PICs can be calculated for each node in the tree. Generally, for internal nodes, a trait value \(V\) is assigned after its PIC has been calculated so it can be used in calculation of the PIC of its ancestor.

\[V = \frac{(1/L_1)V_1+(1/L_2)V_2}{1/L_1+1/L_2}\]

The branch is also lengthened to account for possible error in \(V\).

Finally, the rate of trait evolution \(R\) is calculated as

\[R = \frac{\sum ^{n}{PIC_n}}{n}\]

where \(n\) is the number of internal nodes in the tree and \(PIC_n\) is the PIC calculated for that node.

However, we have no need to estimate values \(V\) at internal nodes in our tree since we have delta values for each branch. Here I compare both the contrasts themselves and the rate estimated from them when using the inferred ancestral states or the observed states in our tree.

# Helpful:
# https://bio.libretexts.org/Bookshelves/Evolutionary_Developmental_Biology/Phylogenetic_Comparative_Methods_(Harmon)/04%3A_Fitting_Brownian_Motion/4.02%3A_Estimating_Rates_using_Independent_Contrasts
# https://lukejharmon.github.io/ilhabela/instruction/2015/07/02/phylogenetic-independent-contrasts/

inferred_pic = pic(tree3_tip_info$delta, tree3, var.contrasts=T, rescaled.tree=T)
# Compute the contrasts based on the 'tip' values from the tree

inferred_rate = sum(inferred_pic$contr[,1]^2)/nrow(inferred_pic$contr)
# Compute the rate from the inferred constrasts

###############

tree2_info$pic = NA
tree2_info$inferred.pic = NA
# Columns to add to the tree info

num_contrasts = 0
# Keep track of the number of contrasts done to calculate rate later

for(i in 1:nrow(tree2_info)){
  row = tree2_info[i,]
  if(row$node.type == "tip" || row$node.type=="ROOT"){
    next
  }
  # Skip tips and the root since we can't calculate on them
  
  tree2_info$inferred.pic[tree2_info$tp==row$tp] = inferred_pic[["contr"]][,1][[row$tp]]
  # Add the inferred pic to the tree info df here
  
  desc = getDescendants(tree2, row$node)
  desc1 = tree2_info[tree2_info$node==desc[1],]
  desc2 = tree2_info[tree2_info$node==desc[2],]
  # Get the descendants of the current node
  
  pic = (desc1$delta - desc2$delta) / (desc1$branch.length + desc2$branch.length)
  # Calculate the contrast based on the descendant deltas and branch lenghts
  
  tree2_info$pic[tree2_info$node==row$node] = pic
  # Add the contrast to the tree info df
  
  num_contrasts = num_contrasts + 1
  # Increment number of contrasts
}

observed_rate = sum(tree2_info$pic^2, na.rm=t)/num_contrasts
# Compute the rate from the observed contrasts

## Compute contrasts from observed delta 
###############

plot_info = subset(tree2_info, node.type != "tip" & !is.na(delta))

#a = subset(no_tip_info, node.type != "tip")
a = select(plot_info, tp, pic, delta.sig)
a$label = "Observed"

b = select(plot_info, tp, inferred.pic, delta.sig)
names(b)[2] = "pic"
b$label = "Inferred"

z = rbind(a,b)

x_comps = list(c("Observed", "Inferred"))

p = ggplot(data=z, aes(x=label, y=pic, group=label, color=delta.sig)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.30) +
  geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1, test="ks.test") +
  #scale_y_continuous(limits=c(0,0.7)) +
  scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=6), corecol(numcol=1, pal="wilke", offset=4))) +
  xlab("") +
  ylab("PIC") +
  bartheme() +
  theme(axis.text.x=element_text(angle=15, hjust=1),
        plot.title = element_text(size=14, vjust=1),
        legend.title=element_text(size=12),
        legend.position="bottom",
        plot.margin = unit(c(1,0,0,0), "cm")) +
  guides(colour=guide_legend(override.aes=list(alpha=1)))
print(p)

cat("KS test. Note that distributions are not significantly different with Wilcox test\n")

## KS test. Note that distributions are not significantly different with Wilcox test

###############

fig_outfile = here("docs", "figs", "full-coding-delta-pic.png")
ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure

###############

rate_table = data.frame("Rate type"=c("Inferred", "Observed"), "Rate"=c(inferred_rate, observed_rate))
rate_table %>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F)

Rate.type	Rate
Inferred	0.0047570
Observed	0.0095405

# Table to display rates

4.11 Ancestral reconstruction of delta values

#new_tree = drop.tip(no_tip_tree, "<1>")
# tree_to_df_list = treeToDF(tree2_no_out)
# tree2_no_out_ = tree_to_df_list[["info"]]
# names(new_tree_info)[5] = "tp"
# Drop the outgroup and re-read the tree

#new_info = subset(new_tip_info, tp != "<1>")

###############

a = data.frame(delta=tree3_tip_info$delta, row.names=tree3_tip_info$tp)
b = as.matrix(a)[,1]
fit = fastAnc(tree3, b, vars=T,CI=T)
# Transform delta values from tips and infer ancestral states

#obj<-contMap(new_tree,b,plot=FALSE)
#plot(obj,type="fan",legend=0.7*max(nodeHeights(new_tree)),
#    fsize=c(0.7,0.9))

# See http://www.phytools.org/eqg2015/asr.html

###############

tree2_info$delta.recon = NA
tree2_info$delta.lower = NA
tree2_info$delta.upper = NA
for(i in 1:nrow(tree2_info)){
  row = tree2_info[i,]
  if(row$node.type == "tip" || row$node.type=="ROOT"){
    next
  }
 
  tp_node = row$tp
  node = row$node
  new_node = tree3_info$node[tree3_info$tp==tp_node]
  
  tree2_info$delta.recon[tree2_info$tp==row$tp] = fit[["ace"]][[as.character(new_node)]]
  tree2_info$delta.lower[tree2_info$tp==row$tp] = fit[["CI95"]][as.character(new_node),1]
  tree2_info$delta.upper[tree2_info$tp==row$tp] = fit[["CI95"]][as.character(new_node),2]
}
# Add the inferred delta values to the main data frame for the no tip tree

###############

# delta_tree = ggtree(fit$rescaled.tree, size=0.8, ladderize=F) +
#   #scale_color_continuous(name="Delta", high=h, low=l) +
#   xlim(0, 0.2) +
#   geom_tiplab(color="#333333", fontface='italic', size=5) +
#   theme(legend.position=c(0.05,0.9))
#   #geom_label(aes(x=branch, label=ifelse(tree_info$delta.sig,as.character(tree_info$tp),'')), label.size=NA, fill="transparent", show.legend=F, vjust=-0.1)
# #geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
# #geom_nodepoint(color="#666666", alpha=0.85, size=4)
# print(delta_tree)
  
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)

no_tip_max = 25
#no_tip_max = 0.13

delta_tree = ggtree(tree2, size=0.8, ladderize=F, aes(color=tree2_info$delta.recon)) +
  scale_color_continuous(name="Delta", high=h, low=l) +
  xlim(0, no_tip_max) +
  geom_tiplab(color="#333333", fontface='italic', size=5) +
  theme(legend.position=c(0.05,0.9))
  #geom_label(aes(x=branch, label=ifelse(tree_info$delta.sig,as.character(tree_info$tp),'')), label.size=NA, fill="transparent", show.legend=F, vjust=-0.1)
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(delta_tree)

# Tree with ancestral reconstructions of delta

###############

fig_outfile = here("docs", "figs", "full-coding-astral-delta-recon-notips.png")
ggsave(fig_outfile, delta_tree, width=12, height=12, unit="in")
# Save the tree image

# w = subset(tree2_info, node.type=="tip")
# w = select(w, tp, delta)
# w$label = "delta.recon"

x = subset(tree2_info, node.type != "tip")
x = select(x, tp, delta, delta.sig)
x$delta.sig = ifelse(x$delta.sig, "Yes", "No")
x$label = "Observed"

y = select(tree2_info, tp, delta.recon, delta.sig)
names(y)[2] = "delta"
y$delta.sig = ifelse(y$delta.sig, "Yes", "No")
y$label = "Simulated"

z = rbind(x,y)
# Convert to long with two categories

###############

x_comps = list(c("Observed", "Simulated"))

p = ggplot(z, aes(x=label, y=delta, group=label, color=delta.sig)) +
  geom_quasirandom(size=3, width=0.25, alpha=0.5) +
  geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  ylab("Delta") +
  xlab("") +
  scale_color_manual("Significant observed delta?", values=c(corecol(numcol=1, pal="wilke", offset=3), corecol(numcol=1, pal="wilke", offset=4))) +
  bartheme() +
  theme(plot.title = element_text(size=14, vjust=1),
        legend.title=element_text(size=12),
        legend.position="bottom",
        plot.margin = unit(c(1,0,0,0), "cm"))
print(p)

###############

fig_outfile = here("docs", "figs", "full-coding-delta-recon.png")
ggsave(fig_outfile, p, width=6, height=5, unit="in")
# Save the figure

###############

p = ggplot(tree2_info, aes(x=delta.recon, delta)) +
  geom_point(size=2, alpha=0.5) +
  geom_smooth(method="lm") +
  bartheme()
print(p)

fig_outfile = here("docs", "figs", "full-coding-delta-recon-corr.png")
ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure

###############

tree2_internal_info = subset(tree2_info, node.type=="internal" & !tp %in% exclude_branches)
p = ggplot(tree2_internal_info, aes(x=delta.recon, y=node)) +
  geom_point(aes(color="Simulated (95% CI)"), size=2, alpha=0.7) +
  geom_segment(aes(x=delta.lower, xend=delta.upper, y=node, yend=node)) +
  geom_point(aes(x=delta, y=node, color="Observed"), size=2, alpha=0.7) +
  scale_color_manual(values=c("Simulated (95% CI)"="black", "Observed"="red")) +
  xlab("Delta") +
  ylab("Node") +
  bartheme() +
  theme(legend.position="bottom")
print(p)

tree2_internal_info$within.bounds = 0
for(i in 1:nrow(tree2_internal_info)){
  if(tree2_internal_info[i,]$delta >= tree2_internal_info[i,]$delta.lower && tree2_internal_info[i,]$delta <= tree2_internal_info[i,]$delta.upper){
    tree2_internal_info[i,]$within.bounds = 1
  }
}

fig_outfile = here("docs", "figs", "full-coding-delta-recon-ci.png")
ggsave(fig_outfile, p, width=4, height=6, unit="in")
# Save the figure

4.12 Delta differences to ancestors (observed)

My own attempt at some sort of phylogenetic independent comparison.

For each branch, compute the difference in the value delta between it and its ancestor and the difference between it and a random branch in the tree. Repeat random sampling as needed.

delta_info = subset(tree2_info, !is.na(delta))
# Get info only from branches with delta values

#delta_diffs = data.frame("ref.node"=c(), "ref.delta"=c(), "query.node"=c(), "query.delta"=c(), "query.type"=c(), "delta.diff"=c())

delta_anc_diffs = data.frame("ref.node"=c(), "ref.delta"=c(), "sig.ref.delta"=c(), "query.node"=c(), "query.delta"=c(), "query.type"=c(), "delta.diff"=c())
delta_random_diffs = data.frame("ref.node"=c(), "ref.delta"=c(), "sig.ref.delta"=c(), "query.node"=c(), "query.delta"=c(), "query.type"=c(), "delta.diff"=c(), "replicate"=c())
# Data frames to save delta diffs

for(i in 1:nrow(delta_info)){
# Loop over every branch with delta
  
  node = delta_info[i,]$node
  delta = delta_info[i,]$delta
  sig_delta = delta_info[i,]$delta.sig
  sig_label = "N"
  
  if(sig_delta){
    sig_label = "Y"
  }
  # Get the delta value and other info from the branch
  
  anc = Ancestors(tree2, node, type="parent")
  if(tree2_info[tree2_info$node==node,]$tp %in% exclude_branches || tree2_info[tree2_info$node==anc,]$tp %in% exclude_branches){
    next
  }
  # Get the ancestral node, but move on if it is an excluded branch
  
  anc_delta = delta_info[delta_info$node==anc,]$delta
  if(delta_info[delta_info$node==anc,]$delta.sig){
    sig_label = "Ancestor Y"
  }
  # Get the delta info from the ancestor
  
  #delta_info[delta_info$node==node,]$delta.diff = abs(delta - anc_delta)
  
    delta_anc_diffs = rbind(delta_anc_diffs, data.frame("ref.node"=node, "ref.delta"=delta, "sig.ref.delta"=sig_label, "query.node"=anc, "query.delta"=anc_delta, "query.type"="Ancestor", "delta.diff"=abs(delta-anc_delta)))
    # Add the info of the comparison of this node to its ancestor to the anc delta diff data frame
  
  for(j in 1:100){
  # Select random branches
    
    random_anc = anc
    while(random_anc == anc || random_anc == node){
      random_row = delta_info[sample(nrow(delta_info),1),]
      random_anc = random_row$node
    }
    # Select a random branch in the tree as long as it is not this branch or its ancestor
    
    delta_random_diffs = rbind(delta_random_diffs, data.frame("ref.node"=node, "ref.delta"=delta, "sig.ref.delta"=sig_label, "query.node"=random_anc, "query.delta"=random_row$delta, "query.type"="Random branch", "delta.diff"=abs(delta-random_row$delta), "replicate"=j))
    # Add the info of the comparison of this node to the random branch to the random delta diff data frame
  }
    
  #delta_info[delta_info$node==node,]$random.diff = abs(delta - random_row$delta)

}

###############

x_comps = list(c("Ancestor", "Random branch"))
# List of comparisons to make

delta_random_single = subset(delta_random_diffs, replicate==1)
delta_random_single = select(delta_random_single, -replicate)
delta_random_single$query.type = "Random branch"
delta_diffs_single = rbind(delta_anc_diffs, delta_random_single)
# Get the first replicate, add label, and combine with anc diffs

delta_diff_single_p = ggplot(data=delta_diffs_single, aes(x=query.type, y=delta.diff, group=query.type, color=sig.ref.delta)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.30) +
  geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  scale_y_continuous(limits=c(0,0.7)) +
  scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=4), corecol(numcol=1, pal="wilke", offset=3), "#920000")) +
  ggtitle("Single random sample") +
  xlab("") +
  ylab("Delta difference") +
  bartheme() +
  theme(axis.text.x=element_text(angle=15, hjust=1),
        plot.title = element_text(size=14, vjust=1),
        legend.title=element_text(size=12),
        legend.position="bottom",
        plot.margin = unit(c(1,0,0,0), "cm")) +
  guides(colour=guide_legend(override.aes=list(alpha=1)))

print(delta_diff_single_p)

# Distribution of delta diffs to ancestor and one random branch
###############

x_comps = list(c("Ancestor", "Random branch"))
# List of comparisons to make

delta_random_meds = delta_random_diffs %>% group_by(ref.node, query.type, sig.ref.delta) %>% summarize(delta.diff=median(delta.diff))
delta_diffs_meds = rbind(select(delta_anc_diffs, ref.node, query.type, sig.ref.delta, delta.diff), delta_random_meds)
# Calculate median of all random branch diffs and combine with anc diffs

delta_diff_med_p = ggplot(data=delta_diffs_meds, aes(x=query.type, y=delta.diff, group=query.type, color=sig.ref.delta)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.30) +
  geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  scale_y_continuous(limits=c(0,0.7)) +
  scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=4), corecol(numcol=1, pal="wilke", offset=3), "#920000")) +
  #ggtitle("Difference between delta and ancestral delta / average after 100 replicates of random sampling") +
  ggtitle("Median after 100 random samples") +
  xlab("") +
  ylab("Delta difference") +
  bartheme() +
  theme(axis.text.x=element_text(angle=15, hjust=1),
        plot.title = element_text(size=14, vjust=1),
        legend.title=element_text(size=12),
        legend.position="bottom",
        plot.margin = unit(c(1,0,0,0), "cm")) +
  guides(colour=guide_legend(override.aes=list(alpha=1)))

print(delta_diff_med_p)

# Distribution of delta diffs to ancestor and median over many random branches
###############

x_comps = list(c("Ancestor", "Random branch"))
# List of comparisons to make

delta_random_avgs = delta_random_diffs %>% group_by(ref.node, query.type, sig.ref.delta) %>% summarize(delta.diff=mean(delta.diff))
delta_random_avgs$query.type = "Random branch"
delta_diffs_avgs = rbind(select(delta_anc_diffs, ref.node, query.type, sig.ref.delta, delta.diff), delta_random_avgs)
# Calculate average of all random branch diffs and combine with anc diffs

delta_diff_avg_p = ggplot(data=delta_diffs_avgs, aes(x=query.type, y=delta.diff, group=query.type, color=sig.ref.delta)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.30) +
  geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  scale_y_continuous(limits=c(0,0.7)) +
  scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=4), corecol(numcol=1, pal="wilke", offset=3), "#920000")) +
  #ggtitle("Difference between delta and ancestral delta / average after 100 replicates of random sampling") +
  ggtitle("Mean after 100 random samples") +
  xlab("") +
  ylab("Delta difference") +
  bartheme() +
  theme(axis.text.x=element_text(angle=15, hjust=1),
        plot.title = element_text(size=14, vjust=1),
        legend.title=element_text(size=12),
        legend.position="bottom",
        plot.margin = unit(c(1,0,0,0), "cm")) +
  guides(colour=guide_legend(override.aes=list(alpha=1)))

print(delta_diff_avg_p)

# Distribution of delta diffs to ancestor and average over many random branches
###############

p_legend = get_legend(delta_diff_single_p)
p_grid = plot_grid(delta_diff_single_p + theme(legend.position="none"),
                   delta_diff_med_p + theme(legend.position="none"),
                   delta_diff_avg_p + theme(legend.position="none"), 
                   ncol=3)
p = plot_grid(p_grid, p_legend, nrow=2, rel_heights=c(1,0.1), align='vh')


fig_outfile = here("docs", "figs", "full-coding-delta-anc-diffs.png")
ggsave(fig_outfile, p, width=12, height=4, unit="in")
# Save the figure

4.13 Delta differences to ancestors (reconstructed)

The same comparisons, but using values of delta reconstructed via a Brownian motion model

delta_info = subset(tree2_info, !is.na(delta.recon))
# Get info only from branches with delta values

#delta_diffs = data.frame("ref.node"=c(), "ref.delta"=c(), "query.node"=c(), "query.delta"=c(), "query.type"=c(), "delta.diff"=c())

delta_anc_diffs = data.frame("ref.node"=c(), "ref.delta"=c(), "sig.ref.delta"=c(), "query.node"=c(), "query.delta"=c(), "query.type"=c(), "delta.diff"=c())
delta_random_diffs = data.frame("ref.node"=c(), "ref.delta"=c(), "sig.ref.delta"=c(), "query.node"=c(), "query.delta"=c(), "query.type"=c(), "delta.diff"=c(), "replicate"=c())
# Data frames to save delta diffs

for(i in 1:nrow(delta_info)){
# Loop over every branch with delta
  
  node = delta_info[i,]$node
  delta = delta_info[i,]$delta.recon
  sig_delta = delta_info[i,]$delta.sig
  sig_label = "N"
  
  if(sig_delta){
    sig_label = "Y"
  }
  # Get the delta value and other info from the branch
  
  anc = Ancestors(tree2, node, type="parent")
  if(tree2_info[tree2_info$node==node,]$tp %in% exclude_branches || tree2_info[tree2_info$node==anc,]$tp %in% exclude_branches){
    next
  }
  # Get the ancestral node, but move on if it is an excluded branch
  
  anc_delta = delta_info[delta_info$node==anc,]$delta.recon
  if(delta_info[delta_info$node==anc,]$delta.sig){
    sig_label = "Ancestor Y"
  }
  # Get the delta info from the ancestor
  
  #delta_info[delta_info$node==node,]$delta.diff = abs(delta - anc_delta)
  
    delta_anc_diffs = rbind(delta_anc_diffs, data.frame("ref.node"=node, "ref.delta"=delta, "sig.ref.delta"=sig_label, "query.node"=anc, "query.delta"=anc_delta, "query.type"="Ancestor", "delta.diff"=abs(delta-anc_delta)))
    # Add the info of the comparison of this node to its ancestor to the anc delta diff data frame
  
  for(j in 1:100){
  # Select random branches
    
    random_anc = anc
    while(random_anc == anc || random_anc == node){
      random_row = delta_info[sample(nrow(delta_info),1),]
      random_anc = random_row$node
    }
    # Select a random branch in the tree as long as it is not this branch or its ancestor
    
    delta_random_diffs = rbind(delta_random_diffs, data.frame("ref.node"=node, "ref.delta"=delta, "sig.ref.delta"=sig_label, "query.node"=random_anc, "query.delta"=random_row$delta, "query.type"="Random branch", "delta.diff"=abs(delta-random_row$delta), "replicate"=j))
    # Add the info of the comparison of this node to the random branch to the random delta diff data frame
  }
    
  #delta_info[delta_info$node==node,]$random.diff = abs(delta - random_row$delta)

}

###############

x_comps = list(c("Ancestor", "Random branch"))
# List of comparisons to make

delta_random_single = subset(delta_random_diffs, replicate==1)
delta_random_single = select(delta_random_single, -replicate)
delta_random_single$query.type = "Random branch"
delta_diffs_single = rbind(delta_anc_diffs, delta_random_single)
# Get the first replicate, add label, and combine with anc diffs

delta_diff_single_p = ggplot(data=delta_diffs_single, aes(x=query.type, y=delta.diff, group=query.type, color=sig.ref.delta)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.30) +
  geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  scale_y_continuous(limits=c(0,0.7)) +
  scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=4), corecol(numcol=1, pal="wilke", offset=3), "#920000")) +
  ggtitle("Single random sample") +
  xlab("") +
  ylab("Delta difference") +
  bartheme() +
  theme(axis.text.x=element_text(angle=15, hjust=1),
        plot.title = element_text(size=14, vjust=1),
        legend.title=element_text(size=12),
        legend.position="bottom",
        plot.margin = unit(c(1,0,0,0), "cm")) +
  guides(colour=guide_legend(override.aes=list(alpha=1)))

print(delta_diff_single_p)

# Distribution of delta diffs to ancestor and one random branch
###############

x_comps = list(c("Ancestor", "Random branch"))
# List of comparisons to make

delta_random_meds = delta_random_diffs %>% group_by(ref.node, query.type, sig.ref.delta) %>% summarize(delta.diff=median(delta.diff))
delta_diffs_meds = rbind(select(delta_anc_diffs, ref.node, query.type, sig.ref.delta, delta.diff), delta_random_meds)
# Calculate median of all random branch diffs and combine with anc diffs

delta_diff_med_p = ggplot(data=delta_diffs_meds, aes(x=query.type, y=delta.diff, group=query.type, color=sig.ref.delta)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.30) +
  geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  scale_y_continuous(limits=c(0,0.7)) +
  scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=4), corecol(numcol=1, pal="wilke", offset=3), "#920000")) +
  #ggtitle("Difference between delta and ancestral delta / average after 100 replicates of random sampling") +
  ggtitle("Median after 100 random samples") +
  xlab("") +
  ylab("Delta difference") +
  bartheme() +
  theme(axis.text.x=element_text(angle=15, hjust=1),
        plot.title = element_text(size=14, vjust=1),
        legend.title=element_text(size=12),
        legend.position="bottom",
        plot.margin = unit(c(1,0,0,0), "cm")) +
  guides(colour=guide_legend(override.aes=list(alpha=1)))

print(delta_diff_med_p)

# Distribution of delta diffs to ancestor and median over many random branches
###############

x_comps = list(c("Ancestor", "Random branch"))
# List of comparisons to make

delta_random_avgs = delta_random_diffs %>% group_by(ref.node, query.type, sig.ref.delta) %>% summarize(delta.diff=mean(delta.diff))
delta_random_avgs$query.type = "Random branch"
delta_diffs_avgs = rbind(select(delta_anc_diffs, ref.node, query.type, sig.ref.delta, delta.diff), delta_random_avgs)
# Calculate average of all random branch diffs and combine with anc diffs

delta_diff_avg_p = ggplot(data=delta_diffs_avgs, aes(x=query.type, y=delta.diff, group=query.type, color=sig.ref.delta)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.30) +
  geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  scale_y_continuous(limits=c(0,0.7)) +
  scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=4), corecol(numcol=1, pal="wilke", offset=3), "#920000")) +
  #ggtitle("Difference between delta and ancestral delta / average after 100 replicates of random sampling") +
  ggtitle("Mean after 100 random samples") +
  xlab("") +
  ylab("Delta difference") +
  bartheme() +
  theme(axis.text.x=element_text(angle=15, hjust=1),
        plot.title = element_text(size=14, vjust=1),
        legend.title=element_text(size=12),
        legend.position="bottom",
        plot.margin = unit(c(1,0,0,0), "cm")) +
  guides(colour=guide_legend(override.aes=list(alpha=1)))

print(delta_diff_avg_p)

# Distribution of delta diffs to ancestor and average over many random branches
###############

p_legend = get_legend(delta_diff_single_p)
p_grid = plot_grid(delta_diff_single_p + theme(legend.position="none"),
                   delta_diff_med_p + theme(legend.position="none"),
                   delta_diff_avg_p + theme(legend.position="none"), 
                   ncol=3)
p = plot_grid(p_grid, p_legend, nrow=2, rel_heights=c(1,0.1), align='vh')


fig_outfile = here("docs", "figs", "full-coding-delta-anc-diffs-recon.png")
ggsave(fig_outfile, p, width=12, height=4, unit="in")
# Save the figure

5 Delta spatial patterns

Here, we look for evidence of introgression along the genome by calculating the average size of blocks of genes that share the same topology. For all genes that form a block of topologies on a given branch (e.g. all genes in a row that share the same topology), we subtract the starting position of the final gene from the starting position of the first gene in the block.

We then look at the sizes of blocks around each branch for each topology: 1. the concordant topology, 2. the major discordant topology (the one that is most frequent), and 3. the minor discordant topology.

Specifically, we take the difference in the average block size of the major and minor topologies to measure if the skew in gene tree topologies is observable in different areas of the genome.

topo_file = here("docs", "data", "trees", "astral", "concord-rooted-bl", "topo-counts.tab")
topo_counts = read.table(topo_file, header=T)
topo_counts = subset(topo_counts, !tp.node %in% exclude_branches)
topo_counts = subset(topo_counts, mchr != "X" & mchr != "Y")
# Read the topo counts for each gene for each branch

###############

block_outfile = here("docs", "data", "trees", "astral", "concord-rooted-bl", "blocks.tab")
# A file to save the block sizes to

###############

tree_info$depth = node.depth(rodent_tree)
#tree_info$depth.bl = get_all_node_depths(rodent_tree)
tree_info$depth.edge = NA
tree_info$depth.bl = NA

tree_info$depth.edge[189:nrow(tree_info)] = get_all_node_depths(rodent_tree, as_edge_count=T)
tree_info$depth.bl[189:nrow(tree_info)] = get_all_node_depths(rodent_tree)
# Depths of branches in the tree relative to tips

###############

topo_tree_info = subset(tree_info, node.type!="tip" & !tp %in% exclude_branches)
topo_cf_stats = cf_stats
names(topo_cf_stats)[1] = "node"
topo_cf_stats = merge(topo_cf_stats, select(topo_tree_info, node, node.type, branch.length, depth, depth.bl, depth.edge, tp, delta, delta.sig), by="node")
# Add the tree info to the cf table

topo_cf_stats$gD_prev = ifelse(topo_cf_stats$gDF1_N > topo_cf_stats$gDF2_N, "d1", "d2")
# Identify the prevalant discorant topology for each branch

if(!file.exists(block_outfile)){
  
  block_data = data.frame("tp"=c(), "num.genes.c"=c(), "num_genes_d1"=c(), "num_genes_d2"=c(), "num.genes.m"=c(),
                          "c.avg.block.size"=c(), "c.avg.block.count"=c(), "c.num.blocks"=c(),
                          "d1.avg.block.size"=c(), "d1.avg.block.count"=c(), "d1.num.blocks"=c(),
                          "d2.avg.block.size"=c(), "d2.avg.block.count"=c(), "d2.num.blocks"=c(),
                          "m.avg.block.size"=c(), "m.avg.block.count"=c(), "m.num.blocks"=c())
  # Block sizes for each topology around each branch
  
  for(node in levels(as.factor(topo_counts$tp.node))){
  # Loop over every node in the tree
    
    c_blocks = 0
    c_block_sizes = c()
    c_block_counts = c()
    d1_blocks = 0
    d1_block_sizes = c()
    d1_block_counts = c()
    d2_blocks = 0
    d2_block_sizes = c()
    d2_block_counts = c()
    m_blocks = 0
    m_block_sizes = c()
    m_block_counts = c()
    # Running tallies and lists of block sizes for each topology to be averaged
    # over all chromosomes
    
    num_genes_c = 0
    num_genes_d1 = 0
    num_genes_d2 = 0
    num_genes_m = 0
    
    for(chr in levels(as.factor(topo_counts$mchr))){
    # Loop over every mouse chromosome
      #print(paste(node, chr))
      chrdata = subset(topo_counts, tp.node==node & mchr==chr)
      # Subset topo counts from this chromosome and node
      
      chrdata = chrdata[order(chrdata$mstart), ]
      # Order the genes by their starting position on the chrome
      
      chrdata$row.num = seq_along(chrdata[,1])
      # Add the row number as a column
      
      first = T
      cur_topo = NA
      num_genes = 0
      # Initialize some things for this chromosomes
      
      for(i in 1:nrow(chrdata)){
      # Loop over every gene on this chromosomes
        
         cur_gene = chrdata[i,]
         # Get the current gene
         
        if(first){
          cur_topo = cur_gene$topo
          num_genes = 0
          block_start = cur_gene$mstart
          
          first = F
  
          next
        }
        # If this is the first gene of this chromosome/node combo, start the first block
  
        if(cur_gene$topo == "m"){
          num_genes_m = num_genes_m + 1
        }else if(cur_gene$topo == "c"){
          num_genes_c = num_genes_c + 1
        }else if(cur_gene$topo == "d1"){
          num_genes_d1 = num_genes_d1 + 1
        }else if(cur_gene$topo == "d2"){
          num_genes_d2 = num_genes_d2 + 1
        }
        # Count the topology of the current gene... I probably don't need to do this...
         
        if(!cur_gene$topo %in% cur_topo || i == nrow(chrdata)){
          block_size = cur_gene$mstart - block_start
          if(cur_topo == "m"){
            m_blocks = m_blocks + 1
            m_block_sizes = c(m_block_sizes, block_size)
            m_block_counts = c(m_block_counts, num_genes)
          }else if(cur_topo == "c"){
            c_blocks = c_blocks + 1
            c_block_sizes = c(c_block_sizes, block_size)
            c_block_counts = c(c_block_counts, num_genes)
          }else if(cur_topo == "d1"){
            d1_blocks = d1_blocks + 1
            d1_block_sizes = c(d1_block_sizes, block_size)
            d1_block_counts = c(d1_block_counts, num_genes)
          }else if(cur_topo == "d2"){
            d2_blocks = d2_blocks + 1
            d2_block_sizes = c(d2_block_sizes, block_size)
            d2_block_counts = c(d2_block_counts, num_genes)
          }
          # Get the size of the current block and add it to the correct list
          
          cur_topo = cur_gene$topo
          num_genes = 0
          block_start = cur_gene$mstart
          # Reset the block counts and positions        
        }else{
          num_genes = num_genes + 1
        }
        # If there is a decisive topology for this gene/node combo AND the topology doesn't match the
        # previous topology .. OR if this is the last gene of the chromosome, then we need to count the
        # size of the block and start the next block
        # Otherwise, just increment the number of genes
      }
      ## End gene loop
    }
    ## End chromosome loop
    
    block_data = rbind(block_data, data.frame("tp"=node, 
                        "num.genes.c"=num_genes_c, "num_genes_d1"=num_genes_d1, "num_genes_d2"=num_genes_d2, "num.genes.m"=num_genes_m,
                        "c.avg.block.size"=mean(c_block_sizes, na.rm=T), "c.avg.block.count"=mean(c_block_counts, na.rm=T), "c.num.blocks"=c_blocks,
                        "d1.avg.block.size"=mean(d1_block_sizes, na.rm=T), "d1.avg.block.count"=mean(d1_block_counts, na.rm=T), "d1.num.blocks"=d1_blocks,
                        "d2.avg.block.size"=mean(d2_block_sizes, na.rm=T), "d2.avg.block.count"=mean(d2_block_counts, na.rm=T), "d2.num.blocks"=d2_blocks,
                        "m.avg.block.size"=mean(m_block_sizes, na.rm=T), "m.avg.block.count"=mean(m_block_counts, na.rm=T), "m.num.blocks"=m_blocks))
      # Calculate average block sizes for this node over all chromosomes and add to the main df
  }
  ## End node loop
  
  write.table(file=block_outfile, block_data, sep="\t", row.names=F)
  # Write the block sizes per node to a file
}else{
  block_data = read.table(block_outfile, header=T)
}
# If the block sizes have already been calculated, just read them in


block_data_merge = merge(block_data, topo_cf_stats, by="tp")
# Merge the block counts into the main cf df

###############

block_data_merge$anc.delta.sig = "N"
block_data_merge$desc.delta.sig = "N"
for(i in 1:nrow(block_data_merge)){
# Loop over every branch with delta
  
  node = block_data_merge[i,]$node
  # Get the node of the current row
  
  anc = Ancestors(rodent_tree, node, type="parent")
  if(!anc %in% topo_tree_info$node){
    next
  }
  # Get the ancestral node, but move on if it is an excluded branch
  
  anc_delta = block_data_merge[block_data_merge$node==anc,]$delta
  if(block_data_merge[block_data_merge$node==anc,]$delta.sig == "Y"){
    block_data_merge[i,]$anc.delta.sig = "Y"
  }
  # Get the delta info from the ancestor
  
  for(desc in getDescendants(rodent_tree, node)){
    if(!desc %in% topo_tree_info$node){
      next
    }
    
    desc_delta = block_data_merge[block_data_merge$node==desc,]$delta
    if(block_data_merge[block_data_merge$node==desc,]$delta.sig == "Y"){
      block_data_merge[i,]$desc.delta.sig = "Y"
    }
    # Get the delta info from the ancestor
  }
}
## Determine whether ancestor or either descendant has a significant delta for each branch

###############

block_data_merge$major.block.size = ifelse(block_data_merge$gD_prev=="d1", block_data_merge$d1.avg.block.size, block_data_merge$d2.avg.block.size)
block_data_merge$minor.block.size = ifelse(block_data_merge$gD_prev=="d1", block_data_merge$d2.avg.block.size, block_data_merge$d1.avg.block.size)
block_data_merge$block.diff = block_data_merge$major.block.size - block_data_merge$minor.block.size
block_data_merge$block.ratio = block_data_merge$major.block.size / block_data_merge$minor.block.size
# Finally, determine the block sizes for the major and minor topologies, and calculate their differences

5.1 Distribution of block sizes for each topology around each branch

m = select(block_data_merge, tp, m.avg.block.size, delta.sig)
names(m)[2] = "block.size"
m$label = "Missing"

c = select(block_data_merge, tp, c.avg.block.size, delta.sig)
names(c)[2] = "block.size"
c$label = "Concordant"

d1 = select(block_data_merge, tp, d1.avg.block.size, delta.sig)
names(d1)[2] = "block.size"
d1$label = "D1"

d2 = select(block_data_merge, tp, d2.avg.block.size, delta.sig)
names(d2)[2] = "block.size"
d2$label = "D2"

a = rbind(c, d1, d2, m)

#x_comps = list(c("Observed", "Inferred"))

p = ggplot(a, aes(x=label, y=block.size, group=label, color=delta.sig)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.5) +
  geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
  #geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  ylab("Avg. block size") +
  xlab("") +
  scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=3), corecol(numcol=1, pal="wilke", offset=4))) +
  bartheme() +
  theme(plot.title = element_text(size=14, vjust=1),
        legend.title=element_text(size=12),
        legend.position="bottom",
        plot.margin = unit(c(1,0,0,0), "cm"))
print(p)

###############

fig_outfile = here("docs", "figs", "full-coding-delta-block-dists.png")
ggsave(fig_outfile, p, width=4, height=4, unit="in")
# Save the figure

5.2 Distribution of the difference between major and minor block sizes

Note that a negative value indicates that, on average, blocks consisting of the minor topology are larger than those of the major topology.

###############

p = ggplot(block_data_merge, aes(x=1, y=block.diff, color=delta.sig)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.5) +
  geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
  #geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  scale_x_continuous(limits=c(0,2)) +
  ylab("Block size difference") +
  xlab("") +
  scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=3), corecol(numcol=1, pal="wilke", offset=4))) +
  bartheme() +
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        plot.title = element_text(size=14, vjust=1),
        legend.title=element_text(size=12),
        legend.position="bottom",
        plot.margin = unit(c(1,0,0,0), "cm"))
print(p)

###############

fig_outfile = here("docs", "figs", "full-coding-delta-block-diff-dist.png")
ggsave(fig_outfile, p, width=4, height=4, unit="in")
# Save the figure

5.3 Block size difference and delta

# p = ggplot(block_data_merge, aes(x=block.diff, y=num.genes.na, color=delta.sig)) +
#   geom_point(size=3, alpha=0.5) +
#   geom_smooth(method="lm", se=F) +
#   geom_text_repel(aes(label=ifelse(delta.sig=="Y",as.character(tp),'')), show_guide=F, min.segment.length=0) +
#   bartheme()
# print(p)

p = ggplot(block_data_merge, aes(x=delta, y=block.diff, color=delta.sig)) +
  geom_point(size=2, alpha=0.5) +
  geom_smooth(method="lm", se=F, linetype="dashed", color="#333333") +
  xlab("Delta") +
  ylab("Block size difference") +
  scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=3), corecol(numcol=1, pal="wilke", offset=4))) +
  bartheme() +
  theme(legend.title=element_text(size=12),
    legend.position="bottom",
    plot.margin = unit(c(1,0,0,0), "cm"))
print(p)

###############

fig_outfile = here("docs", "figs", "full-coding-delta-block-diff-delta.png")
ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure

5.4 Block size difference and tree depth

Maximum number of branches from a tip

# p = ggplot(block_data_merge, aes(x=block.diff, y=num.genes.na, color=delta.sig)) +
#   geom_point(size=3, alpha=0.5) +
#   geom_smooth(method="lm", se=F) +
#   geom_text_repel(aes(label=ifelse(delta.sig=="Y",as.character(tp),'')), show_guide=F, min.segment.length=0) +
#   bartheme()
# print(p)

block_data_merge = block_data_merge[order(block_data_merge$node), ]
# Re-sort the data frame by R node order after the merge so the trees still work

p = ggplot(block_data_merge, aes(x=depth.bl, y=block.diff, color=delta.sig)) +
  geom_point(size=2, alpha=0.5) +
  geom_smooth(method="lm", se=F, linetype="dashed", color="#333333") +
  xlab("Tree depth") +
  ylab("Block size difference") +
  scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=3), corecol(numcol=1, pal="wilke", offset=4))) +
  bartheme() +
  theme(legend.title=element_text(size=12),
    legend.position="bottom",
    plot.margin = unit(c(1,0,0,0), "cm"))
print(p)

###############

fig_outfile = here("docs", "figs", "full-coding-delta-block-diff-depth.png")
ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure

6 Time tree

Tree was dated using IQ-tree’s implementation of least squares dating with the following fossil calibrations:

calibration_file = here("docs", "data", "trees", "astral", "iqtree-dating", "fossil-calibrations-iqtree.txt")
cal_dates = read.table(calibration_file, header=F)
names(cal_dates) = c("MRCA of", "dates")
# Read the fossil calibration dates

cal_dates = cal_dates %>% separate(dates, c("Min.age", "Max.age"), sep=":", remove=T)
cal_dates$Max.age = abs(as.numeric(cal_dates$Max.age))
cal_dates$Min.age = abs(as.numeric(cal_dates$Min.age))
# Parse the dates better

cal_dates %>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F)

MRCA of	Min.age	Max.age
mm10,Arvicanthus_niloticus_MNHN1999201	11.1	12.3
Pithecheir_melanurus_LSUMZ38198,Arvicanthus_niloticus_MNHN1999201	8.7	10.1
mm10,Mus_pahari_10460X1	7.3	8.3
Leggadina_forresti_WAMM62323,Pseudomys_shortridgei_Z25113	4.0	4.5

# Display calibration points as a table

###############

#dated_tree_file = here("docs", "data", "trees", "astral", "iqtree-dating", "full-coding-astral-iqtree-dating-anil-mmus-12.1.timetree.nwk")
#dated_tree = read.tree(dated_tree_file)

#tree_to_df_list = treeToDF(rodent_tree)
#dated_info = tree_to_df_list[["info"]]
# Read the tree and parse with treetoDF

#plotTreeTime(dated_tree, runif(Ntip(dated_tree)))

dated_tree_file = here("docs", "data", "trees", "astral", "iqtree-dating", "full-coding-astral-iqtree-dating-anil-mmus-12.1.timetree.nex.mod")
dated_tree_text = readLines(con=dated_tree_file, n = -1)
dated_tree = readMCMCtree(dated_tree_text, from.file = FALSE)
# Read tree as nexus file

# From: https://cran.r-project.org/web/packages/MCMCtreeR/vignettes/MCMCtree_plot.html

MCMC.tree.plot(dated_tree, cex.tips = 0.3, time.correction = 1, scale.res = c("Eon", 
    "Period", "Epoch", "Age"), plot.type = "phylogram", cex.age = 0.6, cex.labels = 0.6, 
    relative.height = 0.1, col.tree = "grey40", label.offset = 0.05, 
    node.method = "bar", no.margin = TRUE, label.timescale.names=TRUE, add.time.scale=TRUE, add.abs.time=TRUE,
    lwd.bar=2)

###############

fig_outfile = here("docs", "figs", "full-coding-timetree.png")
png(fig_outfile, width=12, height=12, units="in", res=320)
MCMC.tree.plot(dated_tree, cex.tips = 0.3, time.correction = 1, scale.res = c("Eon", 
    "Period", "Epoch", "Age"), plot.type = "phylogram", cex.age = 0.6, cex.labels = 0.6, 
    relative.height = 0.1, col.tree = "grey40", label.offset = 0.05, 
    node.method = "bar", no.margin = TRUE, label.timescale.names=TRUE, add.time.scale=TRUE, add.abs.time=TRUE,
    lwd.bar=2)
dev.off()

## png 
##   2

7 Speciation rates with BAMM

taxo_file = here("docs", "data", "full-murinae-taxonomy.csv")
taxo_data = read.csv(taxo_file, header=T, quote="\"")
# Read the taxonomy data

###############

div_comp_sums = taxo_data %>% count(Division)
names(div_comp_sums)[2] = "Full"
div_samp_sums = subset(taxo_data, Exome==1) %>% count(Division)
names(div_samp_sums)[2] = "Sampled"
div_props = merge(div_comp_sums, div_samp_sums, by="Division")
div_props$div.prop.sampled = div_props$Sampled / div_props$Full
taxo_data = merge(taxo_data, select(div_props, Division, div.prop.sampled), by="Division")
# Get Division counts and sampling rates

div_sampling_file = here("docs", "data", "bamm", "div-sampling-rates.tab")
write.table(div_props, file=div_sampling_file, row.names=F, quote=F)
# Write the counts to a file

###############

tribe_comp_sums = taxo_data %>% count(Tribe)
names(tribe_comp_sums)[2] = "Full"
tribe_samp_sums = subset(taxo_data, Exome==1) %>% count(Tribe)
names(tribe_samp_sums)[2] = "Sampled"
tribe_props = merge(tribe_comp_sums, tribe_samp_sums, by="Tribe")
tribe_props$tribe.prop.sampled = tribe_props$Sampled / tribe_props$Full
taxo_data = merge(taxo_data, select(tribe_props, Tribe, tribe.prop.sampled), by="Tribe")
# Get Tribe counts and sampling rates

tribe_sampling_file = here("docs", "data", "bamm", "tribe-sampling-rates.tab")
write.table(tribe_props, file=tribe_sampling_file, row.names=F, quote=F)
# Write the counts to a file

###############

sample_taxo_data = subset(taxo_data, Exome==1)
# Get the sampled species

###############

missing_samples = c("Lophuromys_woosnami_LSUMZ37793", "Lophiomys_imhausi_UM5152", "Mus_musculus-reference_10460X5", "Mus_musculus-domesticus_10460X15", "Mus_musculus-castaneus_10460X7", "Hydromys_sp-nov_YS391", "Genus_sp-nov_NMVZ21816", "Paruromys_dominator_JAE4870", "Mus_musculus-musculus_10460X14")
# A list of samples missing from the taxonomy file

tribe_prob_full = c("Pithecheir_melanurus_LSUMZ38198", "Vandeleuria_oleracea_ABTC116404")
tribe_prob = c("Pithecheir melanurus", "Vandeleuria oleracea")
div_prob_full = c("Coccymys_ruemmleri_ABTC49489", "Vandeleuria_oleracea_ABTC116404", "Anonymomys_mindorensis_FMNH2222010")
div_prob = c("Coccymys ruemmleri", "Vandeleuria oleracea", "Anonymomys mindorensis")
# Lists of samples with unclear taxonomy

###############

tribe_tree_file = here("docs", "data", "bamm", "full-coding-astral-timetree-tribe.tre")
time_tree = read.nexus(dated_tree_file)
tribe_tree = drop.tip(time_tree, c(missing_samples, tribe_prob_full))
write.tree(tribe_tree, file=tribe_tree_file)
# Write the tree for Tribe sampling
# NOTE: manually convert 0 branch lengths to 0.000001 (there are 5 of these for some reason)

tree_to_df_list = treeToDF(tribe_tree)
tribe_tree_info = tree_to_df_list[["info"]]
tribe_tree_tips = subset(tribe_tree_info, node.type=="tip")
# Read the tree and parse with treetoDF

tribe_tree_tips = tribe_tree_tips %>% separate(label, c("genus", "species", "sample"), sep="_", remove=F)
tribe_tree_tips$Genus.species = paste(tribe_tree_tips$genus, tribe_tree_tips$species)
tribe_tree_tips[tribe_tree_tips$label=="mm10",]$Genus.species = "Mus musculus"
tribe_tree_tips[tribe_tree_tips$label=="rnor6",]$Genus.species = "Rattus norvegicus"
# Parse the genus and species names of the remaining tips

###############

tribe_sample_file = here("docs", "data", "bamm", "tribe-sampling-rates.tab")
tribe_data = subset(sample_taxo_data, !Genus.species %in% tribe_prob)
tribe_data = merge(tribe_data, tribe_tree_tips, by="Genus.species")
tribe_data = select(tribe_data, label, Tribe, tribe.prop.sampled)
write.table(tribe_data, file=tribe_sample_file, sep="\t", row.names=F, col.names=F, quote=F)
# Write the Tribe sampling rates to a file
# NOTE: manually add 1.0 to the first line
# Format from: http://bamm-project.org/advanced.html#accounting-for-non-random-incomplete-taxon-sampling-in-diversification-studies

###############

# From: http://bamm-project.org/quickstart.html#speciation-extinction-analyses 

taxo_data_nonextinct = subset(taxo_data, !grepl("EXTINCT",Extinct.TAXONOMY))
# Remove extinct species from complete list of species

prior_file = here("docs", "data", "bamm", "tribe-bamm-priors.txt")
setBAMMpriors(tribe_tree, total.taxa=nrow(taxo_data_nonextinct), outfile=prior_file)
# Generate BAMM priors based on the tree

## TRIBE
##########################################################################################
## DIVISION

div_tree_file = here("docs", "data", "bamm", "full-coding-astral-timetree-div.tre")
time_tree = read.nexus(dated_tree_file)
div_tree = drop.tip(time_tree, c(missing_samples, div_prob_full))
write.tree(div_tree, file=div_tree_file)
# Write the tree for Tribe sampling
# NOTE: manually convert 0 branch lengths to 0.000001 (there are 5 of these for some reason)

tree_to_df_list = treeToDF(div_tree)
div_tree_info = tree_to_df_list[["info"]]
div_tree_tips = subset(div_tree_info, node.type=="tip")
# Read the tree and parse with treetoDF

div_tree_tips = div_tree_tips %>% separate(label, c("genus", "species", "sample"), sep="_", remove=F)
div_tree_tips$Genus.species = paste(div_tree_tips$genus, div_tree_tips$species)
div_tree_tips[div_tree_tips$label=="mm10",]$Genus.species = "Mus musculus"
div_tree_tips[div_tree_tips$label=="rnor6",]$Genus.species = "Rattus norvegicus"
# Parse the genus and species names of the remaining tips

###############

div_sample_file = here("docs", "data", "bamm", "div-sampling-rates.tab")
div_data = subset(sample_taxo_data, !Genus.species %in% div_prob)
div_data = merge(div_data, div_tree_tips, by="Genus.species")
div_data = select(div_data, label, Division, div.prop.sampled)
write.table(div_data, file=div_sample_file, sep="\t", row.names=F, col.names=F, quote=F)
# Write the Tribe sampling rates to a file
# NOTE: manually add 1.0 to the first line
# Format from: http://bamm-project.org/advanced.html#accounting-for-non-random-incomplete-taxon-sampling-in-diversification-studies

###############

# From: http://bamm-project.org/quickstart.html#speciation-extinction-analyses 

taxo_data_nonextinct = subset(taxo_data, !grepl("EXTINCT",Extinct.TAXONOMY))
# Remove extinct species from complete list of species

prior_file = here("docs", "data", "bamm", "div-bamm-priors.txt")
setBAMMpriors(div_tree, total.taxa=nrow(taxo_data_nonextinct), outfile=prior_file)
# Generate BAMM priors based on the tree

###############

7.1 Sampling rates based on Tribe

tribe_props %>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F)

Tribe	Full	Sampled	tribe.prop.sampled
Apodemini	26	1	0.0384615
Arvicanthini	90	22	0.2444444
Hydromyini	211	70	0.3317536
incertae sedis	11	2	0.1818182
Malacomyini	3	1	0.3333333
Murini	43	11	0.2558140
Otomyini	31	2	0.0645161
Phloeomyini	18	6	0.3333333
Praomyini	55	9	0.1636364
Rattini	193	55	0.2849741

7.2 Speciation rates based on Tribe sampling (BAMM)

# From: https://lukejharmon.github.io/ilhabela/instruction/2015/07/02/diversification-analysis-bamm-rpanda/

bamm_tree_file = here("docs", "data", "bamm", "full-coding-astral-timetree-tribe.tre")
bamm_tree = read.tree(bamm_tree_file)
# Read the tree from the BAMM run

event_data_file = here("docs", "data", "bamm", "tribe_event_data.txt")
events = read.csv(event_data_file)
# Read the event file from the BAMM run

###############

ed = getEventData(bamm_tree, events, burnin=0.25)

## Processing event data from data.frame
## 
## Discarded as burnin: GENERATIONS <  249000
## Analyzing  751  samples from posterior
## 
## Setting recursive sequence on tree...
## 
## Done with recursive sequence

best = getBestShiftConfiguration(ed, expectedNumberOfShifts=1, threshold=5)

## Processing event data from data.frame
## 
## Discarded as burnin: GENERATIONS <  0
## Analyzing  1  samples from posterior
## 
## Setting recursive sequence on tree...
## 
## Done with recursive sequence

# Get the best shift possibility (MAP)
# See: http://bamm-project.org/rateshifts.html#overall-best-shift-configuration

bamm_tree_p = plot.bammdata(best, lwd=2, labels = T, cex = 0.3)
addBAMMshifts(best, cex=2)
addBAMMlegend(bamm_tree_p, nTicks = 4, side = 4, las = 1)

# Plot the rates and shifts on the tree

###############

fig_outfile = here("docs", "figs", "full-coding-bamm-tribe.png")
png(fig_outfile, width=12, height=12, units="in", res=320)
bamm_tree_p = plot.bammdata(best, lwd=2, labels = T, cex = 0.3)
addBAMMshifts(best, cex=2)
addBAMMlegend(bamm_tree_p, nTicks = 4, side = 4, las = 1)
dev.off()

## png 
##   2

plotRateThroughTime(ed, intervalCol=corecol(numcol=1), avgCol=corecol(numcol=1), ylim=c(0,1), cex.axis=1, cex.lab=1.5)

###############

fig_outfile = here("docs", "figs", "full-coding-bamm-tribe-time.png")
png(fig_outfile, width=6, height=4, units="in", res=320)
plotRateThroughTime(ed, intervalCol=corecol(numcol=1), avgCol=corecol(numcol=1), ylim=c(0,1), cex.axis=1, cex=1.5)
dev.off()

## png 
##   2

7.3 Speciation rates based on Tribe sampling (BAMM) and delta

labeled_timetree_file = here("docs", "data", "trees", "astral", "iqtree-dating", "full-coding-astral-iqtree-dating-anil-mmus-12.1.timetree.nex")
labeled_timetree = read.nexus(labeled_timetree_file)
tree_to_df_list = treeToDF(labeled_timetree)
labeled_timetree_info = tree_to_df_list[["info"]]
# Read the labeled time tree and parse with treetoDF

tree_info_clades = select(tree_info, node, clade, tp, delta)
names(tree_info_clades)[1] = "orig.node"
# Select relevant columns from original tree df

labeled_timetree_info = merge(labeled_timetree_info, tree_info_clades, by="clade")
# Merge the original tree info with the time tree info to get the delta values

labeled_timetree_info = labeled_timetree_info[order(labeled_timetree_info$node), ]
# Re-sort the data frame by R node order after the merge so the trees still work

###############
# Since the speciation rates are missing some species, we need to prune the tree info
# to remove those species. We do this by removing those species from the tree info df
# and from any clade in any other node. At the end, for clades that are identical 
# (because their descendants have been pruned), we keep the branch at the shallowest
# depth

labeled_timetree_info$depth = node.depth(labeled_timetree)
# Get the depth of each node in the tree for pruning

tips_to_drop = c(missing_samples, tribe_prob_full)
# The species we need to remove

for(tip in tips_to_drop){
  labeled_timetree_info = subset(labeled_timetree_info, label != tip)
  labeled_timetree_info$clade = gsub(paste(";", tip, sep=""), "", labeled_timetree_info$clade)
  labeled_timetree_info$clade = gsub(paste(tip, ";", sep=""), "", labeled_timetree_info$clade)
  labeled_timetree_info$clade = gsub(tip, "", labeled_timetree_info$clade)
}
# Loop over each species to remove and remove that row from the df and that species from any
# clade in the df

labeled_timetree_info = subset(labeled_timetree_info, clade != "")
# For some branches with all descendant species removed, they will now have empty clades
# Remove them here

labeled_timetree$node.label = paste(labeled_timetree_info$label, "/", labeled_timetree_info$delta, sep="")
# Add delta to the label of the time tree

labeled_timetree_info = labeled_timetree_info[order(labeled_timetree_info[,'clade'],labeled_timetree_info[,'depth']),]
labeled_timetree_info = labeled_timetree_info[!duplicated(labeled_timetree_info$clade),]
# Sort nodes in the tree df by clade and then by depth
# Then, for duplicate clades, keep the one at the shallowest depth

labeled_timetree_info = labeled_timetree_info[order(labeled_timetree_info$node), ]
# Re-sort the data frame by R node order after the merge so the trees still work

###############

labeled_tribetree = drop.tip(labeled_timetree, c(missing_samples, tribe_prob_full))
tree_to_df_list = treeToDF(labeled_tribetree)
labeled_tribetree_info = tree_to_df_list[["info"]]
# Now, prune the tree structure and re-read the pruned tree and parse with treetoDF

labeled_tribetree_info = merge(select(labeled_timetree_info, clade, tp, delta), labeled_tribetree_info, by="clade")
# Add the info from the full tree to the pruned tribe tree by merging by clade

labeled_tribetree_info = labeled_tribetree_info[order(labeled_tribetree_info$node), ]
# Re-sort the data frame by R node order after the merge so the trees still work

###############

speciation_tree = getMeanBranchLengthTree(best)$phy
tree_to_df_list = treeToDF(speciation_tree)
speciation_tree_info = tree_to_df_list[["info"]]
# Get the tree with speciation rates as branch lengths from BAMM and parse to df

write.csv(speciation_tree_info, file=here("docs", "data", "bamm", "full-coding-astral-bamm-as-bl.csv"), row.names=F)

speciation_tree_info = merge(select(labeled_tribetree_info, clade, tp, delta), speciation_tree_info, by="clade")
# Add the info from the tribe tree to the speciation tree

speciation_tree_info = labeled_timetree_info[order(labeled_timetree_info$node), ]
# Re-sort the data frame by R node order after the merge so the trees still work

###############

p = ggplot(speciation_tree_info, aes(x=delta, y=branch.length)) +
  geom_point() +
  geom_smooth(method="lm") +
  bartheme()
#print(p)
# Simple correlation

###############

speciation_tree_internal_info = subset(speciation_tree_info, node.type != "tip")
speciation_tree$node.label = paste(speciation_tree_internal_info$label, "/", speciation_tree_internal_info$delta, sep="")
# Add delta values to the internal node labels of the speciation tree

###############
# At this point, to do PGLS without delta values on the tips, we need to again
# remove tip branches. To do this, you'd have to write.tree() and then use
# bonsai to remove the tips
#write.tree(speciation_tree)


speciation_tree2_file = here("docs", "data", "trees", "astral", "concord-rooted-bl",  "full_coding_iqtree_astral_rooted_speciation.bl.cf.labeled.notips.tree")
speciation_tree2 = read.tree(file=speciation_tree2_file)
tree_to_df_list = treeToDF(speciation_tree2)
speciation_tree2_info = tree_to_df_list[["info"]]
# Read the speciation tree without tips and parse to df

speciation_tree2_info = speciation_tree2_info %>% separate(label, c("tp", "support"), sep=">", remove=F)
speciation_tree2_info$tp = paste(speciation_tree2_info$tp, ">", sep="")
speciation_tree2_info = speciation_tree2_info %>% separate(support, c("astral", "gcf", "scf", "delta"), sep="/", remove=T)
speciation_tree2_info$delta = as.numeric(speciation_tree2_info$delta)
# Split the label by /. tp is my treeParse label.

speciation_tree2_tip_info = subset(speciation_tree2_info, node.type=="tip")
# Get only the tip info from the new tree

###############
# From: https://lukejharmon.github.io/ilhabela/instruction/2015/07/03/PGLS/

delta = speciation_tree2_tip_info$delta
speciation.rate = speciation_tree2_tip_info$branch.length
# Get the delta values and branch length values for plotting later

pglsModel = gls(delta ~ branch.length, correlation = corBrownian(phy = speciation_tree2),
    data = speciation_tree2_tip_info, method = "ML")
summary(pglsModel)

## Generalized least squares fit by maximum likelihood
##   Model: delta ~ branch.length 
##   Data: speciation_tree2_tip_info 
##         AIC      BIC   logLik
##   -70.74475 -64.6687 38.37238
## 
## Correlation Structure: corBrownian
##  Formula: ~1 
##  Parameter estimate(s):
## numeric(0)
## 
## Coefficients:
##                     Value  Std.Error    t-value p-value
## (Intercept)    0.11288103 0.08639655  1.3065455  0.1969
## branch.length -0.00390095 0.02904279 -0.1343172  0.8937
## 
##  Correlation: 
##               (Intr)
## branch.length -0.48 
## 
## Standardized residuals:
##        Min         Q1        Med         Q3        Max 
## -0.5606684 -0.4261186 -0.2044627  0.2750020  1.6042793 
## 
## Residual standard error: 0.1861634 
## Degrees of freedom: 56 total; 54 residual

coef(pglsModel)

##   (Intercept) branch.length 
##   0.112881032  -0.003900946

# Run the PGLS

plot(speciation.rate,delta)
abline(a = coef(pglsModel)[1], b = coef(pglsModel)[2])

# Plot the PGLS

###############

fig_outfile = here("docs", "figs", "full-coding-bamm-tribe-spec-delta.png")
png(fig_outfile, width=6, height=4, units="in", res=320)
plot(speciation.rate,delta)
abline(a = coef(pglsModel)[1], b = coef(pglsModel)[2])
dev.off()

## png 
##   2

# Save the figure

7.4 Sampling rates based on Division

div_props %>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F)

Division	Full	Sampled	div.prop.sampled
Aethomys	11	3	0.2727273
Apodemus	26	1	0.0384615
Arvicanthis	30	7	0.2333333
Berylmys	4	2	0.5000000
Bunomys	31	15	0.4838710
Chiropodomys	6	1	0.1666667
Chrotomys	39	5	0.1282051
Colomys	4	2	0.5000000
Conilurus	7	6	0.8571429
Dacnomys	41	7	0.1707317
Dasymys	14	2	0.1428571
Echiothrix	10	9	0.9000000
Haeromys	3	1	0.3333333
Hybomys	11	5	0.4545455
Hydromys	32	9	0.2812500
incertae sedis	13	3	0.2307692
Malacomys	3	1	0.3333333
Mallomys	11	5	0.4545455
Maxomys	22	4	0.1818182
Mus	43	11	0.2558140
Oenomys	24	5	0.2083333
Otomys	31	2	0.0645161
Phloeomys	18	6	0.3333333
Pithecheir	3	1	0.3333333
Pogonomys	16	6	0.3750000
Pseudomys	41	32	0.7804878
Rattus	84	17	0.2023810
Stenocephalemys	51	7	0.1372549
Uromys	52	4	0.0769231

7.5 Speciation rates based on Division sampling (BAMM)

# From: https://lukejharmon.github.io/ilhabela/instruction/2015/07/02/diversification-analysis-bamm-rpanda/

bamm_tree_file = here("docs", "data", "bamm", "full-coding-astral-timetree-div.tre")
bamm_tree = read.tree(bamm_tree_file)
# Read the tree from the BAMM run

event_data_file = here("docs", "data", "bamm", "div_event_data.txt")
events = read.csv(event_data_file)
# Read the event file from the BAMM run

###############

ed = getEventData(bamm_tree, events, burnin=0.25)

## Processing event data from data.frame
## 
## Discarded as burnin: GENERATIONS <  249000
## Analyzing  751  samples from posterior
## 
## Setting recursive sequence on tree...
## 
## Done with recursive sequence

best = getBestShiftConfiguration(ed, expectedNumberOfShifts=1, threshold=5)

## Processing event data from data.frame
## 
## Discarded as burnin: GENERATIONS <  0
## Analyzing  1  samples from posterior
## 
## Setting recursive sequence on tree...
## 
## Done with recursive sequence

# Get the best shift possibility (MAP)
# See: http://bamm-project.org/rateshifts.html#overall-best-shift-configuration

bamm_tree_p = plot.bammdata(best, lwd=2, labels = T, cex = 0.3)
addBAMMshifts(best, cex=2)
addBAMMlegend(bamm_tree_p, nTicks = 4, side = 4, las = 1)

# Plot the rates and shifts on the tree

###############

fig_outfile = here("docs", "figs", "full-coding-bamm-div.png")
png(fig_outfile, width=12, height=12, units="in", res=320)
bamm_tree_p = plot.bammdata(best, lwd=2, labels = T, cex = 0.3)
addBAMMshifts(best, cex=2)
addBAMMlegend(bamm_tree_p, nTicks = 4, side = 4, las = 1)
dev.off()

## png 
##   2

plotRateThroughTime(ed, intervalCol=corecol(numcol=1), avgCol=corecol(numcol=1), ylim=c(0,1), cex.axis=1, cex.lab=1.5)

###############

fig_outfile = here("docs", "figs", "full-coding-bamm-div-time.png")
png(fig_outfile, width=6, height=4, units="in", res=320)
plotRateThroughTime(ed, intervalCol=corecol(numcol=1), avgCol=corecol(numcol=1), ylim=c(0,1), cex.axis=1, cex=1.5)
dev.off()

## png 
##   2

8 Speciation rates with RPanda

8.1 Species tree colored by tribe

Following http://www.phytools.org/***SanJuan2016/ex/14/Complex-diversification-models.html

## The following rpanda sections follow http://www.phytools.org/***SanJuan2016/ex/14/Complex-diversification-models.html

labeled_timetree_file = here("docs", "data", "trees", "astral", "iqtree-dating", "full-coding-astral-iqtree-dating-anil-mmus-12.1.timetree.nex")
labeled_timetree = read.nexus(labeled_timetree_file)
labeled_timetree = drop.tip(labeled_timetree, c(missing_samples, tribe_prob_full))
tree_to_df_list = treeToDF(labeled_timetree)
labeled_timetree_info = tree_to_df_list[["info"]]
# Read the labeled time tree and parse with treetoDF

labeled_timetree_info = labeled_timetree_info[order(labeled_timetree_info$node), ]
# Re-sort the data frame by R node order after the merge so the trees still work

rpanda_tree_tips = subset(labeled_timetree_info, node.type=="tip")
rpanda_tree_tips = rpanda_tree_tips %>% separate(clade, c("genus", "species", "sample"), sep="_", remove=F)
rpanda_tree_tips$Genus.species = paste(rpanda_tree_tips$genus, rpanda_tree_tips$species)
rpanda_tree_tips[rpanda_tree_tips$label=="mm10",]$Genus.species = "Mus musculus"
rpanda_tree_tips[rpanda_tree_tips$label=="rnor6",]$Genus.species = "Rattus norvegicus"
# Parse the genus and species names of the remaining tips

labeled_timetree$tip.label = rpanda_tree_tips$Genus.species
# Replace the tip labels in the time tree with the genus and species names

labeled_timetree_info$Genus.species = NA
labeled_timetree_info[labeled_timetree_info$clade == rpanda_tree_tips$clade,]$Genus.species = rpanda_tree_tips$Genus.species
# Add the genus and species labels to the info df

labeled_timetree_info$tribe = NA
# Initialize a tribe column in the tree info

###############

rpanda_taxo_data = subset(sample_taxo_data, !Genus.species %in% tribe_prob)
# Remove the problematic tribe samples for rpanda

rpanda_taxo_data$Tribe = as.factor(rpanda_taxo_data$Tribe)
# Factorize the tribe column

tribes = c()
tribe_subtrees = list()
# Initialize a vector of tribes for all tribes with > 1 species and
# a list of tribe subtrees

for(tribe in levels(rpanda_taxo_data$Tribe)){
  #print(tribe)
  
  tribe_samples = subset(rpanda_taxo_data, Tribe==tribe)$Genus.species
  # Get all the samples from the current tribe

  if(length(tribe_samples) < 2){
    labeled_timetree_info[labeled_timetree_info$Genus.species %in% tribe_samples,]$tribe = tribe
    next
  }
  # If the tribe has only one species, add the tribe label to the tree info column for that species,
  # but then skip the rest
  
  tribes = c(tribes, tribe)
  # Add this tribe to the vector of tribes that have > 2 species
  
  n = getMRCA(labeled_timetree, tribe_samples)
  #print(n)
  # Get the mrca of all the tribe species
  
  tribe_nodes = getDescendants(labeled_timetree, n)
  # Get all the descendant nodes from the mrca
  
  labeled_timetree_info[labeled_timetree_info$node == n,]$tribe = tribe
  labeled_timetree_info[labeled_timetree_info$node %in% tribe_nodes,]$tribe = tribe
  # Add tribe info for the tribe nodes to the tree info df
  
  tribe_subtrees[[tribe]] = extract.clade(labeled_timetree, n)
  # Extract the subtree for this tribe
}

tribe_cols = c(corecol(numcol=8), corecol(numcol=2, offset=10))

xmax = 16
tribe_tree = ggtree(labeled_timetree, size=0.8, aes(color=labeled_timetree_info$tribe)) + 
    xlim(0, xmax) +
  geom_tiplab(fontface='italic', size=2, show.legend=F) +
  scale_color_manual(name="Tribe", values=tribe_cols) + 
  #theme(legend.position="bottom")
  theme(legend.position=c(0.05,0.9))
print(tribe_tree)

# A tribe tree figure

num_tribes = length(tribes)
# Variable to keep track of the number of tribes

###############

###############

lambda.cst <- function(x,y){y}
lambda.var <- function(x,y){y[1]*exp(y[2]*x)}
mu.cst <- function(x,y){y}
mu.var <- function(x,y){y[1]*exp(y[2]*x)}
# The different models to test

fit.multi.rpanda <- function(tree, par, sample_rate) {
  bcstdcst = fit_bd(tree, max(branching.times(tree)), f.lamb=lambda.cst, f.mu=mu.cst, lamb_par=par[[1]][1], mu_par=par[[1]][2], cst.lamb=TRUE, cst.mu=TRUE, cond="crown", f=sample_rate, dt=1e-3)
  bvardcst = fit_bd(tree, max(branching.times(tree)), f.lamb=lambda.var, f.mu=mu.cst, lamb_par=par[[2]][c(1,2)], mu_par=par[[2]][3], expo.lamb=TRUE, cst.mu=TRUE, cond="crown", f=sample_rate, dt=1e-3)
  bcstdvar = fit_bd(tree, max(branching.times(tree)), f.lamb=lambda.cst, f.mu=mu.var, lamb_par=par[[3]][1], mu_par=par[[3]][c(2,3)], cst.lamb=TRUE, expo.mu=TRUE, cond="crown", f=sample_rate, dt=1e-3)
  bvardvar = fit_bd(tree, max(branching.times(tree)), f.lamb=lambda.var, f.mu=mu.var, lamb_par=par[[4]][c(1,2)], mu_par=par[[4]][c(3,4)], expo.lamb=TRUE, expo.mu=TRUE, cond="crown", f=sample_rate, dt=1e-3)
  return(list("bcstdcst"=bcstdcst,"bvardcst"=bvardcst,"bcstdvar"=bcstdvar,"bvardvar"=bvardvar))
}
# A function that uses rpanda to fit various models, given a tree and some initial params

###############

init_params = list(c(0.4,0),c(0.4,-0.05,0),c(0.4,0.1,0.05),c(0.4,-0.05,0.1,0.05)) 
# We need to supply starting parameter values for optimization to RPANDA

rpanda_results = list()
# Initialize a list of results by tribe

for(tribe in tribes){
  #print(tribe)
  prop_sampled = tribe_props[tribe_props$Tribe==tribe,]$tribe.prop.sampled
  #print(prop_sampled)
  rpanda_results[[tribe]] = fit.multi.rpanda(tribe_subtrees[[tribe]], init_params, 1)
}
# Run the model fitting function on each tribe

###############

8.2 AICc results for 4 models

See: http://www.phytools.org/***SanJuan2016/ex/14/Complex-diversification-models.html

aic.table = matrix(nrow=4,ncol=num_tribes,NA)
# Initialize a matrix for the AIC table
# One column for each tribe, one row for each model

colnames(aic.table) = tribes
rownames(aic.table) = c("bcstdcst","bvardcst","bcstdvar","bvardvar")
# Rename the columns and the rows based on the tribes and the models

rpanda_best_fit = list()
# Initialize an empty list with the best fitting (lowest aicc) model for each tribe

col_i = 0
# A column counter

for(tribe in tribes) {
  col_i = col_i + 1
  # Increment the column counter to match the current tribe's column
  
  min_aic = 999999999
  min_aic_model = ""
  # Initialize the min aic and model to pick the best model
  
  row_j = 0
  # A row counter
  
  for(model in rownames(aic.table)) {
    row_j = row_j + 1
    # Increment the row counter to match the current model's row
    
    cur_aic = rpanda_results[[tribe]][[model]]$aicc
    # Get the aic for the current model
    
    if(cur_aic < min_aic){
      min_aic = cur_aic
      min_aic_model = model
    }
    # Check if the aic is lower than the lowest so far and replace if so
    
    aic.table[row_j,col_i] = cur_aic
    # Add the aic to the table
  }
  
  rpanda_best_fit[[tribe]] = min_aic_model
  # After all models have been checked for this tribe, store the best fitting one on the list
}
# Retrieve the AIC values from the rpanda results

aic.table %>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F)

	Arvicanthini	Hydromyini	Murini	Otomyini	Phloeomyini	Praomyini	Rattini
bcstdcst	110.63650	289.5277	50.78988	-8.000000	33.29397	42.77105	209.6955
bvardcst	97.15104	288.0626	54.60092	-6.000000	43.11915	41.75441	209.6553
bcstdvar	99.16857	291.7122	54.71845	-6.000000	43.12934	47.57092	211.9353
bvardvar	100.18723	290.3143	59.83902	-5.333333	73.11915	48.95441	211.9847

###############

8.3 Parameter estimates for the best fitting model for each tribe

RPanda estimates lambda (speciation rate) and mu (extinction rate)

param_table = data.frame("dummy"=c(0,0,0,0))
# Initialize the parameters table with a dummy column with one row per model tested

for(tribe in tribes){
  best_model = rpanda_best_fit[[tribe]]
  # Get the best model for the current tribe
  
  param_table[,tribe] = c(rpanda_results[[tribe]][[best_model]]$lamb_par[1:2],rpanda_results[[tribe]][[best_model]]$mu_par[1:2])
  # Add the estimated params for the best model for each tribe to the table
}

param_table = select(param_table, -dummy)
# Remove the dummy column

rownames(param_table) = c("Lambda1", "Lambda2", "Mu1", "Mu2")
# Add row names to the table for each parameter

param_table %>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F)

	Arvicanthini	Hydromyini	Murini	Otomyini	Phloeomyini	Praomyini	Rattini
Lambda1	0.0942485	0.2544161	0.2195913	0.000000	0.1151515	0.0329268	0.3090729
Lambda2	0.4567976	0.1095986	NA	NA	NA	0.6308453	0.1138635
Mu1	-0.5176374	0.0000001	-0.0000001	1.082918	-0.0000002	0.0000000	0.0000002
Mu2	NA	NA	NA	NA	NA	NA	NA

###############

8.4 Estimating total number of species over time

rpanda_plot_df = data.frame(tribe=c(), t=c(), N=c())
# A df to store results for each tribe

for(tribe in tribes){
  best_model = rpanda_results[[tribe]][[rpanda_best_fit[[tribe]]]]
  # Get the best fitting model for the current tribe
  
  max_div_time = max(branching.times(tribe_subtrees[[tribe]]))
  
  if (!inherits(best_model, "fit.bd")) { 
    stop("object \"fit.bd\" is not of class \"fit.bd\"")
  }
  # If the best fitting model isn't an rpanda bd then error out
  
  t = seq(0, max_div_time, 0.01)
  # Intervals of time to estimate N
  
  if ("f.mu" %in% attributes(best_model)$names) {
    r <- function(t) { -best_model$f.lamb(t) + best_model$f.mu(t) }
  } else {
    r <- function(t) { -best_model$f.lamb(t) }
  }
    R <- function(s) { RPANDA:::.Integrate(Vectorize(r), 0, s) }
  # Depending on the best model, functions to sample parameters at a given time
  
  num_tips = Ntip(tribe_subtrees[[tribe]])
  N = num_tips * exp(Vectorize(R)(t))
  # Estimate the number of tips at a given time
  
  rpanda_plot_df = rbind(rpanda_plot_df, data.frame(tribe=tribe, t=-t, N=N))
  # Add the data for the current tribe to the results df
}

###############

matched_tribe_cols = c(tribe_cols[2:3], tribe_cols[5:9])

rpanda_p = ggplot(rpanda_plot_df, aes(x=t, y=N, color=tribe)) +
  geom_line(size=1) +
  xlab("Time (mya)") +
  ylab("# of species") +
  scale_color_manual(values=matched_tribe_cols) +
  bartheme() +
  theme(legend.position="bottom")
print(rpanda_p)

8.5 Spectral density analysis of murine phylogeny

https://besjournals.onlinelibrary.wiley.com/doi/10.1111/2041-210X.12526

res1 = spectR(labeled_timetree)
plot_spectR(res1)

res2 = BICompare(labeled_timetree, res1$eigengap)
plot_BICompare(labeled_timetree, res2)

9 Tip introgression with RND

From https://onlinelibrary.wiley.com/doi/10.1111/mec.13610

RND is a measure of sequence divergence between a pair of species corrected for divergence from an outgroup:

\[RND = \frac{d_{XY}}{d_{out}}\]

Where: \(d_{XY}\) is the average distance (\(d_{x,y}\)) between all sequences between the two species \(X\) and \(Y\), and:

\[d_{out} = \frac{d_{XO} + d_{YO}}{2}\]

Since we have two outgroups, \(O_1\) and \(O_2\), each of \(d_{XO}\) and \(d_{XY}\) is also an average of the distances between the two outgroups, e.g.:

\[d_{XO} = \frac{d_{XO_1} + d_{XO_2}}{2}\] Low values of \(d_{XY}\) and/or \(RND\) are indicative of recent introgression.

9.1 Summary of sister pairs

#source("C:/bin/Ks_plots/fitMixEM.R")
#source("C:/bin/Ks_plots/bicWGD.R")
#source("C:/bin/Ks_plots/plotComponentExpectations.R")
# Files for mixture model testing code

rnd_dir = here("docs", "data", "trees", "tip-introgression-rnd")
# The input directory for the dxy files

rnd_sis_dir = paste0(rnd_dir, "/sister/")
sis_files = list.files(rnd_sis_dir)
sis_files = paste0(rnd_sis_dir, sis_files)
# Read the sister tip pairs

rnd_non_sis_dir = paste0(rnd_dir, "/non-sister/")
non_sis_files = list.files(rnd_non_sis_dir)
non_sis_files = paste0(rnd_non_sis_dir, non_sis_files)
# Read the non-sister tip pairs

#rnd_files = c(sis_files, non_sis_files)
#rnd_files = sis_files
#rnd_files = sample(non_sis_files, 10)

rnd_summary_sis = data.frame("pair"=c(), "mean.rnd"=c(), "median.rnd"=c())
# A df for the sister pairs

for(file in sis_files){
  cur_data = read.csv(file, header=F)
  names(cur_data) = c("gene", "cds.len", "pair.diffs", "x.out1", "x.out2", "y.out1", "y.out2")
  # Read the file for the current pair and add headers
  
  comp = tools::file_path_sans_ext(basename(file))
  cur_data$pair = comp
  # Get the pair name
  
  cur_data$d.xy = cur_data$pair.diffs / cur_data$cds.len
  # Calculate dxy between the pair
  
  cur_data$x.out = rowMeans(cur_data[,c('x.out1', 'x.out2')], na.rm=T)
  cur_data$y.out = rowMeans(cur_data[,c('y.out1', 'y.out2')], na.rm=T)
  # Calculate the average number of differences between each in the pair and both outgroups
  
  cur_data$d.out = rowMeans(cur_data[,c('x.out', 'y.out')], na.rm=T)
  # Calculate average dxy to the outgroups
  
  cur_data$rnd = cur_data$d.xy / cur_data$d.out
  # Calculate RND as dxy / d_out
  
  #bic.test.wgd(cur_data$rnd, startK = 1, maxK = 2, model = 1, nstarts = 100, outPrefix = comp)
  #x = modetest(cur_data$rnd, method="HH")
  # Mixture model and multimode testing
  
  p = ggplot(cur_data, aes(x=rnd)) +
    geom_histogram(bins=100) +
    ggtitle(basename(comp)) +
    xlab("RND") +
    ylab("# of genes") +
    scale_y_continuous(expand=c(0,0)) +
    bartheme() +
    theme(plot.title=element_text(size=12))
  #print(p)
  # Histogram of rnd for the current pair

  rnd_summary_sis = rbind(rnd_summary_sis, data.frame(pair=comp, mean.rnd=mean(cur_data$rnd, na.rm=T), median.rnd=median(cur_data$rnd, na.rm=T)))
  # Summarize the RND distribution for the current pair
}

print(p)

rnd_summary_sis$pair <- reorder(rnd_summary_sis$pair, -rnd_summary_sis$median.rnd)

p = ggplot(rnd_summary_sis, aes(x=pair, y=mean.rnd, color="Mean")) + 
  geom_point() + 
  xlab("") +
  ylab("RND") +
  geom_point(aes(y=median.rnd, color="Median")) +
  scale_color_manual(values=c("Mean"="#333333", "Median"=corecol(numcol=1))) +
  theme(legend.position="bottom") +
  coord_flip()
print(p)

9.2 Summary of non-sister pairs

Distributions of median RND values for each pair

rnd_non_sis_dir = paste0(rnd_dir, "/non-sister/")
non_sis_files = list.files(rnd_non_sis_dir)
non_sis_files = paste0(rnd_non_sis_dir, non_sis_files)
# Read the non-sister tip pairs

#non_sis_files = sample(non_sis_files, 50)

rnd_summary_non_sis = data.frame("pair"=c(), "mean.rnd"=c(), "median.rnd"=c())
# A df for the sister pairs

for(file in non_sis_files){
  cur_data = read.csv(file, header=F)
  names(cur_data) = c("gene", "cds.len", "pair.diffs", "x.out1", "x.out2", "y.out1", "y.out2")
  # Read the file for the current pair and add headers
  
  comp = tools::file_path_sans_ext(basename(file))
  cur_data$pair = comp
  # Get the pair name
  
  cur_data$d.xy = cur_data$pair.diffs / cur_data$cds.len
  # Calculate dxy between the pair
  
  cur_data$x.out = rowMeans(cur_data[,c('x.out1', 'x.out2')], na.rm=T)
  cur_data$y.out = rowMeans(cur_data[,c('y.out1', 'y.out2')], na.rm=T)
  # Calculate the average number of differences between each in the pair and both outgroups
  
  cur_data$d.out = rowMeans(cur_data[,c('x.out', 'y.out')], na.rm=T)
  # Calculate average dxy to the outgroups
  
  cur_data$rnd = cur_data$d.xy / cur_data$d.out
  # Calculate RND as dxy / d_out

  rnd_summary_non_sis = rbind(rnd_summary_non_sis, data.frame(pair=comp, mean.rnd=mean(cur_data$rnd, na.rm=T), median.rnd=median(cur_data$rnd, na.rm=T)))
  # Summarize the RND distribution for the current pair
}

rnd_summary_sis$type = "Sister pairs"
rnd_summary_non_sis$type = "Non-sister pairs"

rnd_summary = rbind(rnd_summary_sis, rnd_summary_non_sis)

p = ggplot(rnd_summary, aes(x=type, y=median.rnd, group=type, color=type)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.25) +
  geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
  xlab("") +
  ylab("Median RND") +
  scale_color_manual(values=corecol(numcol=2, pal="wilke")) +
  #geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  #scale_x_continuous(limits=c(0,2)) +
  bartheme() +
  theme(legend.position="none")
print(p)

< Back to summary