Phodopus molecular evolution

#knitr::opts_chunk$set(echo = FALSE)
#this.dir <- dirname(parent.frame(2)$ofile)
#setwd(this.dir)

library(ggtree)
library(ggplot2)
library(ggbeeswarm)
library(cowplot)
library(ggsignif)
library(dplyr)
library(colorspace)
library(kableExtra)
source("../lib/design.r")



#htmltools::includeHTML("../html-chunks/nav.html")

< Back Home

Main questions

  1. Does lack of recombination affect molecular evolution on the X chromosome in phodopus?

Mus and Phodopus species tree branch rates

Goal: compare rate distributions for TWO small sets of rodent species, including phodopus, to determine how lack of recombination on the phodopus X shapes molecular evolution on that chromosome.

pho_tree_file = "../../data/mol-evol/phodopus-5spec-iqtree-concat-rooted.tre"
pho_tree = read.tree(pho_tree_file)
pho_trait_file = "../../data/mol-evol/phodopus-5spec-concat-traits.csv"
pho_data = read.csv(pho_trait_file, header=TRUE)
pho_data = pho_data[order(pho_data$ggtree),]
pho_branch_labels = pho_data
pho_branch_labels$node[pho_branch_labels$node.type=='tip'] = NA
pho_data$clade = as.character(pho_data$clade)

pho_m1_data = read.csv("../../data/mol-evol/phodopus-5spec-m1.csv", header=T, comment.char="#")
pho_m1_data$clade = as.character(pho_m1_data$clade)
# Phodopus data

mus_tree_file = "../../data/mol-evol/mus-4spec-iqtree-concat-rooted.tre"
mus_tree = read.tree(mus_tree_file)
mus_trait_file = "../../data/mol-evol/mus-4spec-concat-traits.csv"
mus_data = read.csv(mus_trait_file, header=TRUE)
mus_data = mus_data[order(mus_data$ggtree),]
mus_branch_labels = mus_data
mus_branch_labels$node[mus_branch_labels$node.type=='tip'] = NA
mus_data$clade = as.character(mus_data$clade)

mus_m1_data = read.csv("../../data/mol-evol/mus-4spec-m1.csv", header=T, comment.char="#")
mus_m1_data$clade = as.character(mus_m1_data$clade)
# Mus data
pho_data$avg.dnds = NA
pho_data$avg.dn = NA
pho_data$avg.ds = NA
pho_data$num.monophyletic = NA
pho_data$num.monophyletic.filtered = NA

pho_clade_data = list()
pho_clade_data_f = list()
pho_ds_dists = list()
i =  1
for(c in pho_data$clade){
  if(pho_data$node.type[pho_data$clade==c]=="ROOT"){
    next
  }
  
  cur_label = as.character(pho_data$clade.label[pho_data$clade==c])
  cur_data = subset(pho_m1_data, clade==c & clade.ds <= 2)
  cur_data$clade.label = cur_label
  
  ds_dist = ggplot(cur_data, aes(x=clade.ds)) +
    geom_histogram(color="#000000", fill="#d3d3d3") +
    scale_y_continuous(expand=c(0,0)) +
    xlab("dS") +
    ylab("# of genes") +
    ggtitle(pho_data$clade.label[pho_data$clade==c]) +
    theme(axis.text.x = element_text(angle=90, hjust=1, size=8))
  
  #print(ds_dist)
  #stop()
  pho_ds_dists[[i]] = ds_dist
  i = i + 1
  
  cur_data_f = subset(cur_data, clade.ds < 0.3 & clade.dn.ds <= 2)
  
  pho_data$num.monophyletic[pho_data$clade==c] = nrow(cur_data)
  pho_data$num.monophyletic.filtered[pho_data$clade==c] = nrow(cur_data_f)
  pho_data$avg.dnds[pho_data$clade==c] = mean(cur_data_f$clade.dn.ds)
  pho_data$avg.dn[pho_data$clade==c] = mean(cur_data_f$clade.dn)
  pho_data$avg.ds[pho_data$clade==c] = mean(cur_data$clade.ds)
  
  pho_clade_data[[cur_label]] = length(cur_data[,1])
  pho_clade_data_f[[cur_label]] = cur_data_f
}
mus_data$avg.dnds = NA
mus_data$avg.dn = NA
mus_data$avg.ds = NA
mus_data$num.monophyletic = NA
mus_data$num.monophyletic.filtered = NA

mus_clade_data = list()
mus_clade_data_f = list()
mus_ds_dists = list()
i =  1
for(c in mus_data$clade){
  if(mus_data$node.type[mus_data$clade==c]=="ROOT"){
    next
  }
  
  cur_label = as.character(mus_data$clade.label[mus_data$clade==c])
  cur_data = subset(mus_m1_data, clade==c & clade.ds <= 2)
  cur_data$clade.label = cur_label
  
  ds_dist = ggplot(cur_data, aes(x=clade.ds)) +
    geom_histogram(color="#000000", fill="#d3d3d3") +
    scale_y_continuous(expand=c(0,0)) +
    xlab("dS") +
    ylab("# of genes") +
    ggtitle(mus_data$clade.label[mus_data$clade==c]) +
    theme(axis.text.x = element_text(angle=90, hjust=1, size=8))
  
  #print(ds_dist)
  #stop()
  mus_ds_dists[[i]] = ds_dist
  i = i + 1
  
  cur_data_f = subset(cur_data, clade.ds < 0.3 & clade.dn.ds <= 2)
  
  mus_data$num.monophyletic[mus_data$clade==c] = nrow(cur_data)
  mus_data$num.monophyletic.filtered[mus_data$clade==c] = nrow(cur_data_f)
  mus_data$avg.dnds[mus_data$clade==c] = mean(cur_data_f$clade.dn.ds)
  mus_data$avg.dn[mus_data$clade==c] = mean(cur_data_f$clade.dn)
  mus_data$avg.ds[mus_data$clade==c] = mean(cur_data$clade.ds)
  
  mus_clade_data[[cur_label]] = length(cur_data[,1])
  mus_clade_data_f[[cur_label]] = cur_data_f
}

Methods

Step 0: Genome assembly and annotation (expand later)

Phodopus sungorous was de novo assembled with Dovetails, and campbelli and robovorski were assembled based on iterative mapping to the sungorus assembly. Annotation was done using the Maker pipeline (Seb’s first attempt).

Step 1: Select orthologs between 5 hamster species and 3 mouse+rat species separately

We selected 2 previously sequenced hamster species to estimate and compare rates of synonymous and non-synonymous substitutions with our set of newly sequenced Phodopus genomes. Separately, we selected 3 mouse species + rat to compare rates between phodopus and mus (Table 1). The CDS sequences for these species were downloaded from Ensembl (release 100).

t1_data_pho = subset(pho_data, node.type=='tip')
t1_data_pho = subset(t1_data_pho, select=c(species, common, source))
t1_data_pho$set = "Phodopus"
names(t1_data_pho) = c("Species", "Common name", "Source", "Dataset")
rownames(t1_data_pho) = c()
# Phodopus data

t1_data_mus = subset(mus_data, node.type=='tip')
t1_data_mus = subset(t1_data_mus, select=c(species, common, source))
t1_data_mus$set = "Mus"
names(t1_data_mus) = c("Species", "Common name", "Source", "Dataset")
rownames(t1_data_mus) = c()
# Mus data

t1_data = rbind(t1_data_pho, t1_data_mus)

t1_data %>% kable(caption="Selected species for comparative molecular evolution analysis") %>% kable_styling(bootstrap_options=c("striped", "condensed", "responsive"), full_width=F)
Selected species for comparative molecular evolution analysis
Species Common name Source Dataset
Cricetulus_griseus Chinese hamster Ensembl Phodopus
Mesocricetus_auratus Golden hamster Ensembl Phodopus
Phodopus_campbelli Campbell’s dwarf hamster This study Phodopus
Phodopus_sungorus Djungarian hamster This study Phodopus
Phodopus_robovorski Roborvorski hamster This study Phodopus
Rattus_norvegicus Brown rat Ensembl Mus
Mus_musculus House mouse Ensembl Mus
Mus_caroli Ryukyu mouse Ensembl Mus
Mus_pahari Gairdner’s shrewmouse Ensembl Mus

Phylogenetic and comparative analyses often rely on selection of a single sequence from each species to assess relatedness of genes. However because genes can have multiple transcripts due to alternative splicing of exons and because genes can be duplicated or lost over time this requires selection of sequences in each species at two levels: 1) selection of a single transcript within a gene and 2) selection of genes without paralogs (i.e. those that are single copy across all species). The most common method for selection of transcripts is to simply pick the longest transcript in each species.

Mus transcript selection:

  1. Total Mus transcripts: 142,447
  2. Longest transcripts: 21,875
  3. Single-copy between 4 species: 15,772
  4. X-genes before alignments: 551
  5. X-genes after alignments: 526

Phodopus transcript selection:

  1. Total transcripts predicted from Maker: 27,988
  2. Longest BLAST match to CGRI transcripts: 14,768
  3. Also single-copy in MAUR: 12,413
  4. X-genes before alignments: 493
  5. X-genes after alignments: 413

Step 2: Align orthologs

We aligned the coding sequences from each single-copy ortholog using a suite of programs in the MACSE software as follows:

  1. Trim non-homologous regions with MACSE’s trimNonHomologousFragments program
  2. Codon aware alignments with MACSE’s alignSequences program
  3. Trim gappy edges from alignment with MACSE’s trimAlignment program
  4. Removed alignments with premature stop codons

Step 3: Construct species and gene trees

We constructed maximum likelihood gene trees and a concatenated species tree using IQ-tree aligned coding sequences. We used IQ-trees concordance factors to assess the confidence of the species tree as well as the consistency of the gene trees. We find that most branches in the species tree are consistently present in a high proportion of gene trees (>90% for all branches).

node_check = F

##########
# Node label checking
if(node_check){
  node_test = ggtree(pho_tree, size=2, ladderize=F) +
    ggplot2::xlim(0, 0.05) +
    geom_tiplab(color="#333333", fontface='italic', size=5) +
    geom_text(aes(label=node), hjust=-.3, vjust=-.3, color="#ff6db6") +
    geom_nodepoint(color="#666666", alpha=0.85, size=4)
  
  #node_test = node_test %>% rotate(25) %>% rotate(26) %>% rotate(27) %>% rotate(28) %>% rotate(29) %>% rotate(30) %>% rotate(32) %>%
  #  rotate(33) %>% rotate(35) %>% rotate(36) %>% rotate(39) %>% rotate(41) %>% rotate(44)
  
  print(node_test)  
  stop("Node check ok")
}
# Node label checking
##########

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)

clade_cols = corecol(numcol=3)
clade_labs = c("pho", "cs")

pho_tree_fig = ggtree(pho_tree, size=2, ladderize=F, aes(color=pho_data$gcf)) +
  scale_color_continuous(name='gCF', low=l, high=h, limits=c(0,100)) +
  ggplot2::xlim(0, 0.07) +
  geom_tiplab(aes(label=pho_data$species), color="#333333", fontface='italic', size=5) +
  #geom_cladelabel(node=13, size=2, label="pho", color="green", offset = 0.07, align=T) +
  #geom_text(aes(label=rodent_data$label), hjust=-.3, vjust=-.3, color="#ff6db6") +
  geom_label(aes(x=branch, label=pho_branch_labels$branch), color="#d3d3d3", fill=pho_branch_labels$clade.col) + 
  #labs(caption="Figure 1: 5 species hamster phylogeny constructed from concatenation of XX single-copy orthologs.\nGene concordance factors (gCF) are indicated by branch colors.") +
  theme(legend.position=c(0.9,0.25),
        plot.caption=element_text(hjust=0))

mus_tree_fig = ggtree(mus_tree, size=2, ladderize=F, aes(color=mus_data$gcf)) +
  scale_color_continuous(name='gCF', low=l, high=h, limits=c(0,100)) +
  ggplot2::xlim(0, 0.07) +
  geom_tiplab(aes(label=mus_data$species), color="#333333", fontface='italic', size=5) +
  #geom_cladelabel(node=13, size=2, label="pho", color="green", offset = 0.07, align=T) +
  #geom_text(aes(label=rodent_data$label), hjust=-.3, vjust=-.3, color="#ff6db6") +
  geom_label(aes(x=branch, label=mus_branch_labels$branch), color="#d3d3d3", fill=mus_branch_labels$clade.col) + 
  #labs(caption="Figure 1: 5 species hamster phylogeny constructed from concatenation of XX single-copy orthologs.\nGene concordance factors (gCF) are indicated by branch colors.") +
  theme(legend.position=c(0.9,0.25),
        plot.caption=element_text(hjust=0))

#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)

#print(mus_tree_fig)

tree_fig = plot_grid(pho_tree_fig, mus_tree_fig, nrow=1, align='vh')
cap = "Figure 1: (left) 5 species hamster phylogeny constructed from concatenation of 10,300 single-copy orthologs.\n(right) 4 species mouse phylogeny constructed from concatenation of 15,427 single-copy orthologes\nGene concordance factors (gCF) are indicated by branch colors."
pcap = ggdraw() + draw_label(cap, size=10, fontface="bold", x=0, hjust=0) + theme(plot.margin=margin(0,0,0,7))
tree_p = plot_grid(tree_fig, pcap, ncol=1, rel_heights=c(1, 0.1))
print(tree_p)

Step 4: Estimation of synonymous (dS) and nonysynonymous (dN) substitution rates

We used codeml within the PAML program to estimate rates of synonymous (dS) and non-synonymous (dN) substitutions using a free-ratio model (model=1, NSsites=0) to get an estimate of rate for each branch in our phylogeny. Relationships among genes may differ amongst themselves and relative to the species relationships due to biological factors such as incomplete lineage sorting and introgression. In cases where a gene tree and the species tree disagree, inferring substitutions using the species tree (the wrong tree in this case) can lead to spurious inferences of substitutions and affect rate estimates. To mitigate this, we used the corresponding gene tree for each alignment when estimating rates. When compiling rate estimates, we only counted rates for a branch in the species tree if it existed in the gene tree (Table 2).

# t2_data = data.frame("Branch"=c(), "Gene trees"=c(), "Genes after dS filter"=c())
# for(clade_label in pho_data$clade.label){
#   if(pho_data$node.type[pho_data$clade.label==clade_label]!="internal"){
#     next
#   }
# 
#   t2_data = rbind(t2_data, data.frame("Branch"=clade_label, "Gene trees"=clade_data[[clade_label]], "Genes after dS filter"=length(clade_data_f[[clade_label]][,1])))
# }
# 
# rownames(t2_data) = c()
# 
# t2_data %>% kable(col.names=c("Branch","Monophyletic gene trees", "Genes after dS filter"), caption="Number of gene trees containing branch before and after dS filtering", align="l") %>% kable_styling(bootstrap_options=c("striped", "condensed", "responsive"), full_width=F)# %>%
#   #column_spec(1, width="7em") %>%
#   #column_spec(2, width="13em")

Filtering based on dS

As another check on alignment quality, we assessed distributions of dS along each branch and only counted genes for each branch where dS fell below 0.3. (QUESTION: should this dS filter step have a different threshold for each branch?) We also omitted genes for a branch where dN/dS was greater than 2.

ds_plots = plot_grid(plotlist=pho_ds_dists, nrow=3)
print(ds_plots)

ds_plots = plot_grid(plotlist=mus_ds_dists, nrow=3)
print(ds_plots)

Post filter dS distributions per branch

pho_m1_data_f = do.call("rbind", pho_clade_data_f)
mus_m1_data_f = do.call("rbind", mus_clade_data_f)
m1_data_f = rbind(pho_m1_data_f, mus_m1_data_f)

#clade_cols = as.character(pho_data$clade.col)
#clade_labs = as.character(pho_data$clade.label)

x_comps = list(c("mus", "pho"))

clade_ds_p = ggplot(m1_data_f, aes(clade.label, clade.ds)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5, fill="#333333") +
  #geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Branch") +
  ylab("dS") +
  #scale_fill_manual(limits=clade_labs, values=clade_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x=element_text(angle=25, hjust=1, size=8))

print(clade_ds_p)

Post filter dN distributions per branch

clade_cols = as.character(pho_data$clade.col)
clade_labs = as.character(pho_data$clade.label)

#x_comps = list(c("mus", "pho"))

clade_dn_p = ggplot(m1_data_f, aes(clade.label, clade.dn)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5, fill="#333333") +
  #geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Branch") +
  ylab("dN") +
  scale_fill_manual(limits=clade_labs, values=clade_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=25, hjust=1, size=8)
        )

print(clade_dn_p)

Post filter dN/dS distributions per branch

clade_cols = as.character(pho_data$clade.col)
clade_labs = as.character(pho_data$clade.label)

x_comps = list(c("mus", "pho"))

clade_dnds_p = ggplot(m1_data_f, aes(clade.label, clade.dn.ds)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5, fill="#333333") +
  #geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Branch") +
  ylab("dN/dS") +
  #scale_fill_manual(limits=clade_labs, values=clade_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x=element_text(angle=25, hjust=1, size=8))

print(clade_dnds_p)

Results (M1 branch leading to Phodopus and Mus only)

1. X vs. autosome dS distributions in Mus and Phodopus

pho_branches = c("pho")
pho_a = data.frame()
pho_x = data.frame()
for(b in pho_branches){
  pho_a = rbind(pho_a, subset(pho_clade_data_f[[b]], chr!="ScmyWZ3_7747_HRSCAF_7900"))
  pho_x = rbind(pho_x, subset(pho_clade_data_f[[b]], chr=="ScmyWZ3_7747_HRSCAF_7900"))
}
pho_x$label = "Phodopus X"
pho_a$label = "Phodopus Auto"

mus_branches = c("mus")
mus_a = data.frame()
mus_x = data.frame()
for(b in mus_branches){
  mus_a = rbind(mus_a, subset(mus_clade_data_f[[b]], chr!="X"))
  mus_x = rbind(mus_x, subset(mus_clade_data_f[[b]], chr=="X"))
}
mus_x$label = "Mus X"
mus_a$label = "Mus Auto"


chrome_data = rbind(pho_x, pho_a, mus_x, mus_a)

# t = as.data.frame(table(chrome_data$chr))
# test = ggplot(t, aes(x=Var1, y=Freq)) +
#   geom_bar(stat="identity") +
#   scale_y_continuous(expand=c(0,0)) +
#   bartheme() +
#   theme()
# print(test)


#clade_cols = as.character(rodent_data$clade.col)
#clade_labs = as.character(rodent_data$clade.label)

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))

chrome_ds_p = ggplot(chrome_data, aes(label, clade.ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("ds") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")

print(chrome_ds_p)

2. X vs. autosome dN distributions in Mus and Phodopus

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))

chrome_dn_p = ggplot(chrome_data, aes(label, clade.dn, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")

print(chrome_dn_p)

3. X vs. autosome dN/dS distributions in Mus and Phodopus

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))

chrome_dnds_p = ggplot(chrome_data, aes(label, clade.dn.ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN/dS") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")

print(chrome_dnds_p)

t3_data = data.frame("chrome"=c("Mus auto", "Mus X", "Phodopus auto", "Phodopus X"), 
                     "genes"=c(length(mus_a[,1]), length(mus_x[,1]), length(pho_a[,1]), length(pho_x[,1])), 
                     "avg.ds"=c(mean(mus_a$clade.ds), mean(mus_x$clade.ds), mean(pho_a$clade.ds), mean(pho_x$clade.ds)), 
                     "avg.dn"=c(mean(mus_a$clade.dn), mean(mus_x$clade.dn), mean(pho_a$clade.dn), mean(pho_x$clade.dn)), 
                     "avg.dnds"=c(mean(mus_a$clade.dn.ds), mean(mus_x$clade.dn.ds), mean(pho_a$clade.dn.ds), mean(pho_x$clade.dn.ds)))

rownames(t3_data) = c()

t3_data %>% kable(col.names=c("Chromosome","Number of genes", "Avg. dS", "Avg. dN", "Avg. dN/dS"), caption="Substitution rates for Mus and Phodopus branches", align="l") %>% kable_styling(bootstrap_options=c("striped", "condensed", "responsive"), full_width=F)# %>%
Substitution rates for Mus and Phodopus branches
Chromosome Number of genes Avg. dS Avg. dN Avg. dN/dS
Mus auto 14695 0.1476831 0.0214826 0.1511614
Mus X 517 0.1167536 0.0223234 0.1857207
Phodopus auto 9581 0.0604657 0.0095896 0.1749161
Phodopus X 399 0.0463514 0.0099188 0.2132867
  #column_spec(1, width="7em") %>%
  #column_spec(2, width="13em")

4. X vs. autosome dS distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions

        # if scaff == "ScmyWZ3_7747_HRSCAF_7900":
        #     if int(end) < 35225145:
        #         scaff = "ScmyWZ3_7747_HRSCAF_7900_R";
        #         chrome = "chrX_R";
        #     elif int(start) >= 35225145:
        #         scaff = "ScmyWZ3_7747_HRSCAF_7900_NR"
        #         chrome = "chrX_NR";
        #     else:
        #         chrome = "chrX";
        # elif scaff in scaff_to_chr:
        #     chrome =  scaff_to_chr[scaff];
        # else:
        #     chrome = "NA";

pho_xr = subset(pho_x, start < 35225145)
pho_xr$label = "Phodopus X recombining"
pho_xnr = subset(pho_x, start >= 35225145)
pho_xnr$label = "Phodopus X non-recombining"

chrome_data = rbind(mus_a, mus_x, pho_a, pho_xr, pho_xnr)

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

chrome_ds_p = ggplot(chrome_data, aes(label, clade.ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("ds") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10)
        )

print(chrome_ds_p)

5. X vs. autosome dN distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

chrome_dn_p = ggplot(chrome_data, aes(label, clade.dn, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10)
        )

print(chrome_dn_p)

6. X vs. autosome dN/dS distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

chrome_dnds_p = ggplot(chrome_data, aes(label, clade.dn.ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN/dS") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10)
        )

print(chrome_dnds_p)

7. M1 summary stats

t4_data = data.frame("chrome"=c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X non-recombining", "Phodopus X recombining"), 
                     "genes"=c(length(mus_a[,1]), length(mus_x[,1]), length(pho_a[,1]), length(pho_xnr[,1]), length(pho_xr[,1])), 
                     "avg.ds"=c(mean(mus_a$clade.ds), mean(mus_x$clade.ds), mean(pho_a$clade.ds), mean(pho_xnr$clade.ds), mean(pho_xr$clade.ds)), 
                     "avg.dn"=c(mean(mus_a$clade.dn), mean(mus_x$clade.dn), mean(pho_a$clade.dn), mean(pho_xnr$clade.dn), mean(pho_xr$clade.dn)), 
                     "avg.dnds"=c(mean(mus_a$clade.dn.ds), mean(mus_x$clade.dn.ds), mean(pho_a$clade.dn.ds), mean(pho_xnr$clade.dn.ds), mean(pho_xr$clade.dn.ds))
                     )

rownames(t4_data) = c()

t4_data %>% kable(col.names=c("Chromosome","Number of genes", "Avg. dS", "Avg. dN", "Avg. dN/dS"), caption="Substitution rates for Mus and Phodopus branches", align="l") %>% kable_styling(bootstrap_options=c("striped", "condensed", "responsive"), full_width=F)# %>%
Substitution rates for Mus and Phodopus branches
Chromosome Number of genes Avg. dS Avg. dN Avg. dN/dS
Mus Auto 14695 0.1476831 0.0214826 0.1511614
Mus X 517 0.1167536 0.0223234 0.1857207
Phodopus Auto 9581 0.0604657 0.0095896 0.1749161
Phodopus X non-recombining 257 0.0442354 0.0098794 0.2236335
Phodopus X recombining 142 0.0501810 0.0099901 0.1945606
  #column_spec(1, width="7em") %>%
  #column_spec(2, width="13em")

Results (Clade model C; not useful?)

cmc_data = read.csv("../../data/mol-evol/phodopus-5spec-cmc.csv", header=TRUE, comment.char="#")

cmc_a = subset(cmc_data, chr!="ScmyWZ3_7747_HRSCAF_7900")
cmc_a$label = "Phodopus Auto"

cmc_a_1_b = subset(cmc_a, select=c(label, background.clade.omega.1))
cmc_a_1_b$class = "Background omega Auto 1"
names(cmc_a_1_b)[2] = "omega"
cmc_a_2_b = subset(cmc_a, select=c(label, background.clade.omega.2))
cmc_a_2_b$class = "Background omega Auto 2"
names(cmc_a_2_b)[2] = "omega"
cmc_a_3_b = subset(cmc_a, select=c(label, background.clade.omega.3))
cmc_a_3_b$class = "Background omega Auto 3"
names(cmc_a_3_b)[2] = "omega"

cmc_a_1_t= subset(cmc_a, select=c(label, target.clade.omega.1))
cmc_a_1_t$class = "Target omega Auto 1"
names(cmc_a_1_t)[2] = "omega"
cmc_a_2_t = subset(cmc_a, select=c(label, target.clade.omega.2))
cmc_a_2_t$class = "Target omega Auto 2"
names(cmc_a_2_t)[2] = "omega"
cmc_a_3_t = subset(cmc_a, select=c(label, target.clade.omega.3))
cmc_a_3_t$class = "Target omega Auto 3"
names(cmc_a_3_t)[2] = "omega"

cmc_x = subset(cmc_data, chr!="ScmyWZ3_7747_HRSCAF_7900")
cmc_x$label = "Phodopus X"

cmc_x_1_b = subset(cmc_x, select=c(label, background.clade.omega.1))
cmc_x_1_b$class = "Background omega X 1"
names(cmc_x_1_b)[2] = "omega"
cmc_x_2_b = subset(cmc_x, select=c(label, background.clade.omega.2))
cmc_x_2_b$class = "Background omega X 2"
names(cmc_x_2_b)[2] = "omega"
cmc_x_3_b = subset(cmc_x, select=c(label, background.clade.omega.3))
cmc_x_3_b$class = "Background omega X 3"
names(cmc_x_3_b)[2] = "omega"

cmc_x_1_t= subset(cmc_x, select=c(label, target.clade.omega.1))
cmc_x_1_t$class = "Target omega X 1"
names(cmc_x_1_t)[2] = "omega"
cmc_x_2_t = subset(cmc_x, select=c(label, target.clade.omega.2))
cmc_x_2_t$class = "Target omega X 2"
names(cmc_x_2_t)[2] = "omega"
cmc_x_3_t = subset(cmc_x, select=c(label, target.clade.omega.3))
cmc_x_3_t$class = "Target omega X 3"
names(cmc_x_3_t)[2] = "omega"

cmc_data_long = rbind(cmc_a_1_b, cmc_a_2_b, cmc_a_3_b, cmc_a_1_t, cmc_a_2_t, cmc_a_3_t,
                      cmc_x_1_b, cmc_x_2_b, cmc_x_3_b, cmc_x_1_t, cmc_x_2_t, cmc_x_3_t)

cmc_data_long = subset(cmc_data_long, omega < 2)

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Target omega X 3", "Target omega Auto 3"))

cmc_dnds_p = ggplot(cmc_data_long, aes(class, omega, fill=class)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN/dS") +
  #scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=25, hjust=1, size=10)
        )

print(cmc_dnds_p)

Results (M2 when setting all phodopus branches and all mus branches as targets)

pho_m2_data = read.csv("../../data/mol-evol/phodopus-5spec-m2.csv", header=T, comment.char="#")
pho_m2_data$clade = as.character(pho_m2_data$clade)
# Phodopus data

mus_m2_data = read.csv("../../data/mol-evol/mus-4spec-m2.csv", header=T, comment.char="#")
mus_m2_data$clade = as.character(mus_m2_data$clade)
# Mus data

0. dS distributions by branch for filtering

pho_data$avg.dnds = NA
pho_data$avg.dn = NA
pho_data$avg.ds = NA
pho_data$num.monophyletic = NA
pho_data$num.monophyletic.filtered = NA

pho_clade_data = list()
pho_clade_data_f = list()
pho_ds_dists = list()
i =  1
for(c in pho_data$clade){
  if(pho_data$node.type[pho_data$clade==c]=="ROOT"){
    next
  }
  
  cur_label = as.character(pho_data$clade.label[pho_data$clade==c])
  cur_data = subset(pho_m2_data, clade==c & clade.ds <= 2)
  cur_data$clade.label = cur_label
  
  ds_dist = ggplot(cur_data, aes(x=clade.ds)) +
    geom_histogram(color="#000000", fill="#d3d3d3") +
    scale_y_continuous(expand=c(0,0)) +
    xlab("dS") +
    ylab("# of genes") +
    ggtitle(pho_data$clade.label[pho_data$clade==c]) +
    theme(axis.text.x = element_text(angle=90, hjust=1, size=8))
  
  #print(ds_dist)
  #stop()
  pho_ds_dists[[i]] = ds_dist
  i = i + 1
  
  cur_data_f = subset(cur_data, clade.ds < 0.3 & clade.dn.ds <= 2)
  
  pho_data$num.monophyletic[pho_data$clade==c] = nrow(cur_data)
  pho_data$num.monophyletic.filtered[pho_data$clade==c] = nrow(cur_data_f)
  pho_data$avg.dnds[pho_data$clade==c] = mean(cur_data_f$clade.dn.ds)
  pho_data$avg.dn[pho_data$clade==c] = mean(cur_data_f$clade.dn)
  pho_data$avg.ds[pho_data$clade==c] = mean(cur_data$clade.ds)
  
  pho_clade_data[[cur_label]] = length(cur_data[,1])
  pho_clade_data_f[[cur_label]] = cur_data_f
}
mus_data$avg.dnds = NA
mus_data$avg.dn = NA
mus_data$avg.ds = NA
mus_data$num.monophyletic = NA
mus_data$num.monophyletic.filtered = NA

mus_clade_data = list()
mus_clade_data_f = list()
mus_ds_dists = list()
i =  1
for(c in mus_data$clade){
  if(mus_data$node.type[mus_data$clade==c]=="ROOT"){
    next
  }

  cur_label = as.character(mus_data$clade.label[mus_data$clade==c])
  cur_data = subset(mus_m2_data, clade==c & clade.ds <= 2)
  cur_data$clade.label = cur_label

  ds_dist = ggplot(cur_data, aes(x=clade.ds)) +
    geom_histogram(color="#000000", fill="#d3d3d3") +
    scale_y_continuous(expand=c(0,0)) +
    xlab("dS") +
    ylab("# of genes") +
    ggtitle(mus_data$clade.label[mus_data$clade==c]) +
    theme(axis.text.x = element_text(angle=90, hjust=1, size=8))

  #print(ds_dist)
  #stop()
  mus_ds_dists[[i]] = ds_dist
  i = i + 1

  cur_data_f = subset(cur_data, clade.ds < 0.3 & clade.dn.ds <= 2)

  mus_data$num.monophyletic[mus_data$clade==c] = nrow(cur_data)
  mus_data$num.monophyletic.filtered[mus_data$clade==c] = nrow(cur_data_f)
  mus_data$avg.dnds[mus_data$clade==c] = mean(cur_data_f$clade.dn.ds)
  mus_data$avg.dn[mus_data$clade==c] = mean(cur_data_f$clade.dn)
  mus_data$avg.ds[mus_data$clade==c] = mean(cur_data$clade.ds)

  mus_clade_data[[cur_label]] = length(cur_data[,1])
  mus_clade_data_f[[cur_label]] = cur_data_f
}
ds_plots = plot_grid(plotlist=pho_ds_dists, nrow=3)
print(ds_plots)

ds_plots = plot_grid(plotlist=mus_ds_dists, nrow=3)
print(ds_plots)

1. X vs. autosome dS distributions in Mus and Phodopus

  • Estimates are averaged across all branches in clade.
pho_branches = c("cs", "psun", "pcam", "prob")
pho_a = data.frame()
pho_x = data.frame()
for(b in pho_branches){
  pho_a = rbind(pho_a, subset(pho_clade_data_f[[b]], chr!="ScmyWZ3_7747_HRSCAF_7900"))
  pho_x = rbind(pho_x, subset(pho_clade_data_f[[b]], chr=="ScmyWZ3_7747_HRSCAF_7900"))
}
pho_x = pho_x %>% group_by(id, chr, start, end) %>% 
  summarize(ds=mean(clade.ds, na.rm=T), dn=mean(clade.dn, na.rm=T), dn.ds=mean(clade.dn.ds, na.rm=T))
## `summarise()` has grouped output by 'id', 'chr', 'start'. You can override using the `.groups` argument.
pho_x$label = "Phodopus X"
pho_a = pho_a %>% group_by(id, chr, start, end) %>% 
  summarize(ds=mean(clade.ds, na.rm=T), dn=mean(clade.dn, na.rm=T), dn.ds=mean(clade.dn.ds, na.rm=T))
## `summarise()` has grouped output by 'id', 'chr', 'start'. You can override using the `.groups` argument.
pho_a$label = "Phodopus Auto"

mus_branches = c("cm", "mpah", "mcar", "mmus")
mus_a = data.frame()
mus_x = data.frame()
for(b in mus_branches){
  mus_a = rbind(mus_a, subset(mus_clade_data_f[[b]], chr!="X"))
  mus_x = rbind(mus_x, subset(mus_clade_data_f[[b]], chr=="X"))
}
mus_x = mus_x %>% group_by(id, chr, start, end) %>% 
  summarize(ds=mean(clade.ds, na.rm=T), dn=mean(clade.dn, na.rm=T), dn.ds=mean(clade.dn.ds, na.rm=T))
## `summarise()` has grouped output by 'id', 'chr', 'start'. You can override using the `.groups` argument.
mus_x$label = "Mus X"
mus_a = mus_a %>% group_by(id, chr, start, end) %>% 
  summarize(ds=mean(clade.ds, na.rm=T), dn=mean(clade.dn, na.rm=T), dn.ds=mean(clade.dn.ds, na.rm=T))
## `summarise()` has grouped output by 'id', 'chr', 'start'. You can override using the `.groups` argument.
mus_a$label = "Mus Auto"

chrome_data = rbind(pho_x, pho_a, mus_x, mus_a)

# t = as.data.frame(table(chrome_data$chr))
# test = ggplot(t, aes(x=Var1, y=Freq)) +
#   geom_bar(stat="identity") +
#   scale_y_continuous(expand=c(0,0)) +
#   bartheme() +
#   theme()
# print(test)


#clade_cols = as.character(rodent_data$clade.col)
#clade_labs = as.character(rodent_data$clade.label)

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))

chrome_ds_p = ggplot(chrome_data, aes(label, ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("ds") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")

print(chrome_ds_p)

2. X vs. autosome dN distributions in Mus and Phodopus

  • Estimates are averaged across all branches in clade.
chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))

chrome_dn_p = ggplot(chrome_data, aes(label, dn, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")

print(chrome_dn_p)

3. X vs. autosome dN/dS distributions in Mus and Phodopus

  • Single estimate for whole clade.
chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))

chrome_dnds_p = ggplot(chrome_data, aes(label, dn.ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN/dS") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")

print(chrome_dnds_p)

4. X vs. autosome dS distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions

  • Estimates are averaged across all branches in clade.
pho_xr = subset(pho_x, start < 35225145)
pho_xr$label = "Phodopus X recombining"
pho_xnr = subset(pho_x, start >= 35225145)
pho_xnr$label = "Phodopus X non-recombining"

chrome_data = rbind(mus_a, mus_x, pho_a, pho_xr, pho_xnr)

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

chrome_ds_p = ggplot(chrome_data, aes(label, ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("ds") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10)
        )

print(chrome_ds_p)

5. X vs. autosome dN distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions

  • Estimates are averaged across all branches in clade.
chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

chrome_dn_p = ggplot(chrome_data, aes(label, dn, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10)
        )

print(chrome_dn_p)

6. X vs. autosome dN/dS distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions

  • Single estimate for whole clade.
chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

chrome_dnds_p = ggplot(chrome_data, aes(label, dn.ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN/dS") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10)
        )

print(chrome_dnds_p)

7. M2 summary stats

t5_data = data.frame("chrome"=c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X non-recombining", "Phodopus X recombining"), 
                     "genes"=c(length(mus_a[,1]), length(mus_x[,1]), length(pho_a[,1]), length(pho_xnr[,1]), length(pho_xr[,1])), 
                     "avg.ds"=c(mean(mus_a$ds), mean(mus_x$ds), mean(pho_a$ds), mean(pho_xnr$ds), mean(pho_xr$ds)), 
                     "avg.dn"=c(mean(mus_a$dn), mean(mus_x$dn), mean(pho_a$dn), mean(pho_xnr$dn), mean(pho_xr$dn)), 
                     "avg.dnds"=c(mean(mus_a$dn.ds), mean(mus_x$dn.ds), mean(pho_a$dn.ds), mean(pho_xnr$dn.ds), mean(pho_xr$dn.ds))
                     )

rownames(t5_data) = c()

t5_data %>% kable(col.names=c("Chromosome","Number of genes", "Avg. dS", "Avg. dN", "Avg. dN/dS"), caption="Substitution rates for Mus and Phodopus branches", align="l") %>% kable_styling(bootstrap_options=c("striped", "condensed", "responsive"), full_width=F)# %>%
Substitution rates for Mus and Phodopus branches
Chromosome Number of genes Avg. dS Avg. dN Avg. dN/dS
Mus Auto 1 0.0305418 0.0042116 0.1474844
Mus X 1 0.0233840 0.0044003 0.1906650
Phodopus Auto 1 0.0217890 0.0034842 0.1745578
Phodopus X non-recombining 1 0.0155352 0.0026421 0.1825200
Phodopus X recombining 1 0.0165241 0.0036164 0.2217289
  #column_spec(1, width="7em") %>%
  #column_spec(2, width="13em")

Results (Hyphy free ratio model [FitMG94], branch leading to phodopus and mus only)

pho_mg_data = read.csv("../../data/mol-evol/phodopus-5spec-mg94-local.csv", header=T, comment.char="#")
pho_mg_data$clade = as.character(pho_mg_data$clade)
pho_mg_data$clade[pho_mg_data$clade=="psun;pcam"] = "pcam;psun"
pho_mg_data$clade[pho_mg_data$clade=="psun;pcam;prob"] = "pcam;prob;psun"
pho_mg_data$clade[pho_mg_data$clade=="psun;prob;pcam"] = "pcam;prob;psun"
pho_mg_data$clade[pho_mg_data$clade=="pcam;psun;prob"] = "pcam;prob;psun"
pho_mg_data$clade[pho_mg_data$clade=="prob;psun;pcam"] = "pcam;prob;psun"
pho_mg_data$clade[pho_mg_data$clade=="prob;pcam;psun"] = "pcam;prob;psun"
# Phodopus data

mus_mg_data = read.csv("../../data/mol-evol/mus-4spec-mg94-local.csv", header=T, comment.char="#")
mus_mg_data$clade = as.character(mus_mg_data$clade)
mus_mg_data$clade[mus_mg_data$clade=="mmus;mcar"] = "mcar;mmus"
mus_mg_data$clade[mus_mg_data$clade=="mmus;mcar;mpah"] = "mcar;mmus;mpah"
mus_mg_data$clade[mus_mg_data$clade=="mmus;mpah;mcar"] = "mcar;mmus;mpah"
mus_mg_data$clade[mus_mg_data$clade=="mpah;mmus;mcar"] = "mcar;mmus;mpah"
mus_mg_data$clade[mus_mg_data$clade=="mpah;mcar;mmus"] = "mcar;mmus;mpah"
mus_mg_data$clade[mus_mg_data$clade=="mcar;mpah;mmus"] = "mcar;mmus;mpah"
# Mus data

0. dS distributions by branch for filtering

pho_data$avg.dnds = NA
pho_data$avg.dn = NA
pho_data$avg.ds = NA
pho_data$num.monophyletic = NA
pho_data$num.monophyletic.filtered = NA

pho_clade_data = list()
pho_clade_data_f = list()
pho_ds_dists = list()
i =  1
for(c in pho_data$clade){
  if(pho_data$node.type[pho_data$clade==c]=="ROOT"){
    next
  }
  
  cur_label = as.character(pho_data$clade.label[pho_data$clade==c])
  cur_data = subset(pho_mg_data, clade==c & ds <= 2)
  cur_data$clade.label = cur_label
  
  ds_dist = ggplot(cur_data, aes(x=ds)) +
    geom_histogram(color="#000000", fill="#d3d3d3") +
    scale_y_continuous(expand=c(0,0)) +
    xlab("dS") +
    ylab("# of genes") +
    ggtitle(pho_data$clade.label[pho_data$clade==c]) +
    theme(axis.text.x = element_text(angle=90, hjust=1, size=8))
  
  #print(ds_dist)
  #stop()
  pho_ds_dists[[i]] = ds_dist
  i = i + 1
  
  cur_data_f = subset(cur_data, ds < 0.3 & dn.ds <= 2)
  
  pho_data$num.monophyletic[pho_data$clade==c] = nrow(cur_data)
  pho_data$num.monophyletic.filtered[pho_data$clade==c] = nrow(cur_data_f)
  pho_data$avg.dnds[pho_data$clade==c] = mean(cur_data_f$dn.ds)
  pho_data$avg.dn[pho_data$clade==c] = mean(cur_data_f$dn)
  pho_data$avg.ds[pho_data$clade==c] = mean(cur_data$ds)
  
  pho_clade_data[[cur_label]] = length(cur_data[,1])
  pho_clade_data_f[[cur_label]] = cur_data_f
}
mus_data$avg.dnds = NA
mus_data$avg.dn = NA
mus_data$avg.ds = NA
mus_data$num.monophyletic = NA
mus_data$num.monophyletic.filtered = NA

mus_clade_data = list()
mus_clade_data_f = list()
mus_ds_dists = list()
i =  1
for(c in mus_data$clade){
  if(mus_data$node.type[mus_data$clade==c]=="ROOT"){
    next
  }
  print(c)
  cur_label = as.character(mus_data$clade.label[mus_data$clade==c])
  cur_data = subset(mus_mg_data, clade==c & ds <= 2)
  cur_data$clade.label = cur_label

  ds_dist = ggplot(cur_data, aes(x=ds)) +
    geom_histogram(color="#000000", fill="#d3d3d3") +
    scale_y_continuous(expand=c(0,0)) +
    xlab("dS") +
    ylab("# of genes") +
    ggtitle(mus_data$clade.label[mus_data$clade==c]) +
    theme(axis.text.x = element_text(angle=90, hjust=1, size=8))

  #print(ds_dist)
  #stop()
  mus_ds_dists[[i]] = ds_dist
  i = i + 1

  cur_data_f = subset(cur_data, ds < 0.3 & dn.ds <= 2)

  mus_data$num.monophyletic[mus_data$clade==c] = nrow(cur_data)
  mus_data$num.monophyletic.filtered[mus_data$clade==c] = nrow(cur_data_f)
  mus_data$avg.dnds[mus_data$clade==c] = mean(cur_data_f$dn.ds)
  mus_data$avg.dn[mus_data$clade==c] = mean(cur_data_f$dn)
  mus_data$avg.ds[mus_data$clade==c] = mean(cur_data$ds)

  mus_clade_data[[cur_label]] = length(cur_data[,1])
  mus_clade_data_f[[cur_label]] = cur_data_f
}
## [1] "rnor"
## [1] "mmus"
## [1] "mcar"
## [1] "mpah"
## [1] "mcar;mmus;mpah"
## [1] "mcar;mmus"
ds_plots = plot_grid(plotlist=pho_ds_dists, nrow=3)
print(ds_plots)

ds_plots = plot_grid(plotlist=mus_ds_dists, nrow=3)
print(ds_plots)

1. X vs. autosome dS distributions in Mus and Phodopus

pho_branches = c("pho")
pho_a = data.frame()
pho_x = data.frame()
for(b in pho_branches){
  pho_a = rbind(pho_a, subset(pho_clade_data_f[[b]], chr!="ScmyWZ3_7747_HRSCAF_7900"))
  pho_x = rbind(pho_x, subset(pho_clade_data_f[[b]], chr=="ScmyWZ3_7747_HRSCAF_7900"))
}
#pho_x = pho_x %>% group_by(id, chr, start, end) %>% 
#  summarize(ds=mean(ds, na.rm=T), dn=mean(dn, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
pho_x$label = "Phodopus X"
#pho_a = pho_a %>% group_by(id, chr, start, end) %>% 
#  summarize(ds=mean(ds, na.rm=T), dn=mean(dn, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
pho_a$label = "Phodopus Auto"

mus_branches = c("mus")
mus_a = data.frame()
mus_x = data.frame()
for(b in mus_branches){
  mus_a = rbind(mus_a, subset(mus_clade_data_f[[b]], chr!="X"))
  mus_x = rbind(mus_x, subset(mus_clade_data_f[[b]], chr=="X"))
}
#mus_x = mus_x %>% group_by(id, chr, start, end) %>% 
#  summarize(ds=mean(ds, na.rm=T), dn=mean(dn, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
mus_x$label = "Mus X"
#mus_a = mus_a %>% group_by(id, chr, start, end) %>% 
#  summarize(ds=mean(ds, na.rm=T), dn=mean(dn, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
mus_a$label = "Mus Auto"

chrome_data = rbind(pho_x, pho_a, mus_x, mus_a)

# t = as.data.frame(table(chrome_data$chr))
# test = ggplot(t, aes(x=Var1, y=Freq)) +
#   geom_bar(stat="identity") +
#   scale_y_continuous(expand=c(0,0)) +
#   bartheme() +
#   theme()
# print(test)


#clade_cols = as.character(rodent_data$clade.col)
#clade_labs = as.character(rodent_data$clade.label)

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))

chrome_ds_p = ggplot(chrome_data, aes(label, ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("ds") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")

print(chrome_ds_p)

2. X vs. autosome dN distributions in Mus and Phodopus

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))

chrome_dn_p = ggplot(chrome_data, aes(label, dn, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")

print(chrome_dn_p)

3. X vs. autosome dN/dS distributions in Mus and Phodopus

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))

chrome_dnds_p = ggplot(chrome_data, aes(label, dn.ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN/dS") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")

print(chrome_dnds_p)

4. X vs. autosome dS distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions

  • Estimates are averaged across all branches in clade.
pho_xr = subset(pho_x, start < 35225145)
pho_xr$label = "Phodopus X recombining"
pho_xnr = subset(pho_x, start >= 35225145)
pho_xnr$label = "Phodopus X non-recombining"

chrome_data = rbind(mus_a, mus_x, pho_a, pho_xr, pho_xnr)

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

chrome_ds_p = ggplot(chrome_data, aes(label, ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("ds") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10)
        )

print(chrome_ds_p)

5. X vs. autosome dN distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions

  • Estimates are averaged across all branches in clade.
chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

chrome_dn_p = ggplot(chrome_data, aes(label, dn, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10)
        )

print(chrome_dn_p)

6. X vs. autosome dN/dS distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions

  • Single estimate for whole clade.
chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

chrome_dnds_p = ggplot(chrome_data, aes(label, dn.ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN/dS") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10)
        )

print(chrome_dnds_p)

7. HyPhy MG94 summary stats

t6_data = data.frame("chrome"=c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X non-recombining", "Phodopus X recombining"), 
                     "genes"=c(length(mus_a[,1]), length(mus_x[,1]), length(pho_a[,1]), length(pho_xnr[,1]), length(pho_xr[,1])), 
                     "avg.ds"=c(mean(mus_a$ds), mean(mus_x$ds), mean(pho_a$ds), mean(pho_xnr$ds), mean(pho_xr$ds)), 
                     "avg.dn"=c(mean(mus_a$dn), mean(mus_x$dn), mean(pho_a$dn), mean(pho_xnr$dn), mean(pho_xr$dn)), 
                     "avg.dnds"=c(mean(mus_a$dn.ds), mean(mus_x$dn.ds), mean(pho_a$dn.ds), mean(pho_xnr$dn.ds), mean(pho_xr$dn.ds))
                     )

rownames(t6_data) = c()

t6_data %>% kable(col.names=c("Chromosome","Number of genes", "Avg. dS", "Avg. dN", "Avg. dN/dS"), caption="Substitution rates for Mus and Phodopus branches", align="l") %>% kable_styling(bootstrap_options=c("striped", "condensed", "responsive"), full_width=F)# %>%
Substitution rates for Mus and Phodopus branches
Chromosome Number of genes Avg. dS Avg. dN Avg. dN/dS
Mus Auto 14656 0.1582118 0.0201877 0.1572215
Mus X 515 0.1300482 0.0206157 0.1905945
Phodopus Auto 9566 0.0651789 0.0090807 0.1806505
Phodopus X non-recombining 256 0.0499538 0.0091739 0.2187543
Phodopus X recombining 142 0.0555683 0.0094026 0.2030216
  #column_spec(1, width="7em") %>%
  #column_spec(2, width="13em")

Results (Hyphy free ratio local model [FitMG94], branch leading to phodopus only, only Cgri as outgroup)

pho_tree_file = "../../data/mol-evol/phodopus-4spec-iqtree-concat-rooted.tre"
pho_tree = read.tree(pho_tree_file)
pho_trait_file = "../../data/mol-evol/phodopus-4spec-concat-traits.csv"
pho_data = read.csv(pho_trait_file, header=TRUE)
pho_data = pho_data[order(pho_data$ggtree),]
pho_branch_labels = pho_data
pho_branch_labels$node[pho_branch_labels$node.type=='tip'] = NA
pho_data$clade = as.character(pho_data$clade)
# Phodopus data
pho_mg_data = read.csv("../../data/mol-evol/phodopus-4spec-mg94-local.csv", header=T, comment.char="#")
pho_mg_data$clade = as.character(pho_mg_data$clade)
#names(pho_mg_data)[8] = "n1"
#names(pho_mg_data)[9] = "n2"
#names(pho_mg_data)[11] = "dn"
#names(pho_mg_data)[12] = "ds"
#pho_mg_data = subset(pho_mg_data, select=-c(n1, n2))
#pho_mg_data = subset(pho_mg_data, select=-c(nonsynonymous.bl, synonymous.bl))
# Phodopus data

0. dS distributions by branch for filtering

pho_data$avg.dnds = NA
pho_data$avg.dn = NA
pho_data$avg.ds = NA
pho_data$num.monophyletic = NA
pho_data$num.monophyletic.filtered = NA

pho_clade_data = list()
pho_clade_data_f = list()
pho_ds_dists = list()
i =  1
for(c in pho_data$clade){
  if(pho_data$node.type[pho_data$clade==c]=="ROOT"){
    next
  }
  
  cur_label = as.character(pho_data$clade.label[pho_data$clade==c])
  cur_data = subset(pho_mg_data, clade==c & ds <= 2)
  cur_data$clade.label = cur_label
  
  ds_dist = ggplot(cur_data, aes(x=ds)) +
    geom_histogram(color="#000000", fill="#d3d3d3") +
    scale_y_continuous(expand=c(0,0)) +
    xlab("dS") +
    ylab("# of genes") +
    ggtitle(pho_data$clade.label[pho_data$clade==c]) +
    theme(axis.text.x = element_text(angle=90, hjust=1, size=8))
  
  #print(ds_dist)
  #stop()
  pho_ds_dists[[i]] = ds_dist
  i = i + 1
  
  cur_data_f = subset(cur_data, ds < 0.3 & dn.ds <= 2)
  
  pho_data$num.monophyletic[pho_data$clade==c] = nrow(cur_data)
  pho_data$num.monophyletic.filtered[pho_data$clade==c] = nrow(cur_data_f)
  pho_data$avg.dnds[pho_data$clade==c] = mean(cur_data_f$dn.ds)
  pho_data$avg.dn[pho_data$clade==c] = mean(cur_data_f$dn)
  pho_data$avg.ds[pho_data$clade==c] = mean(cur_data$ds)
  
  pho_clade_data[[cur_label]] = length(cur_data[,1])
  pho_clade_data_f[[cur_label]] = cur_data_f
}
ds_plots = plot_grid(plotlist=pho_ds_dists, nrow=3)
print(ds_plots)

1. X vs. autosome dS distributions in Mus and Phodopus

pho_branches = c("pho")
pho_a = data.frame()
pho_x = data.frame()
for(b in pho_branches){
  pho_a = rbind(pho_a, subset(pho_clade_data_f[[b]], chr!="ScmyWZ3_7747_HRSCAF_7900"))
  pho_x = rbind(pho_x, subset(pho_clade_data_f[[b]], chr=="ScmyWZ3_7747_HRSCAF_7900"))
}
#pho_x = pho_x %>% group_by(id, chr, start, end) %>% 
#  summarize(ds=mean(ds, na.rm=T), dn=mean(dn, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
pho_x$label = "Phodopus X"
#pho_a = pho_a %>% group_by(id, chr, start, end) %>% 
#  summarize(ds=mean(ds, na.rm=T), dn=mean(dn, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
pho_a$label = "Phodopus Auto"

mus_branches = c("mus")
mus_a = data.frame()
mus_x = data.frame()
for(b in mus_branches){
  mus_a = rbind(mus_a, subset(mus_clade_data_f[[b]], chr!="X"))
  mus_x = rbind(mus_x, subset(mus_clade_data_f[[b]], chr=="X"))
}
#mus_x = mus_x %>% group_by(id, chr, start, end) %>% 
#  summarize(ds=mean(ds, na.rm=T), dn=mean(dn, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
mus_x$label = "Mus X"
#mus_a = mus_a %>% group_by(id, chr, start, end) %>% 
#  summarize(ds=mean(ds, na.rm=T), dn=mean(dn, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
mus_a$label = "Mus Auto"

chrome_data = rbind(pho_x, pho_a, mus_x, mus_a)

# t = as.data.frame(table(chrome_data$chr))
# test = ggplot(t, aes(x=Var1, y=Freq)) +
#   geom_bar(stat="identity") +
#   scale_y_continuous(expand=c(0,0)) +
#   bartheme() +
#   theme()
# print(test)


#clade_cols = as.character(rodent_data$clade.col)
#clade_labs = as.character(rodent_data$clade.label)

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))

chrome_ds_p = ggplot(chrome_data, aes(label, ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("ds") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")

print(chrome_ds_p)

2. X vs. autosome dN distributions in Mus and Phodopus

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))

chrome_dn_p = ggplot(chrome_data, aes(label, dn, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")

print(chrome_dn_p)

3. X vs. autosome dN/dS distributions in Mus and Phodopus

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))

chrome_dnds_p = ggplot(chrome_data, aes(label, dn.ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN/dS") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")

print(chrome_dnds_p)

4. X vs. autosome dS distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions

  • Estimates are averaged across all branches in clade.
pho_xr = subset(pho_x, start < 35225145)
pho_xr$label = "Phodopus X recombining"
pho_xnr = subset(pho_x, start >= 35225145)
pho_xnr$label = "Phodopus X non-recombining"

chrome_data = rbind(mus_a, mus_x, pho_a, pho_xr, pho_xnr)

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

chrome_ds_p = ggplot(chrome_data, aes(label, ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("ds") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10)
        )

print(chrome_ds_p)

5. X vs. autosome dN distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions

  • Estimates are averaged across all branches in clade.
chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

chrome_dn_p = ggplot(chrome_data, aes(label, dn, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10)
        )

print(chrome_dn_p)

6. X vs. autosome dN/dS distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions

  • Single estimate for whole clade.
chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

chrome_dnds_p = ggplot(chrome_data, aes(label, dn.ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN/dS") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10)
        )

print(chrome_dnds_p)

7. HyPhy MG94 local summary stats

t6_data = data.frame("chrome"=c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X non-recombining", "Phodopus X recombining"), 
                     "genes"=c(length(mus_a[,1]), length(mus_x[,1]), length(pho_a[,1]), length(pho_xnr[,1]), length(pho_xr[,1])), 
                     "avg.ds"=c(mean(mus_a$ds), mean(mus_x$ds), mean(pho_a$ds), mean(pho_xnr$ds), mean(pho_xr$ds)), 
                     "avg.dn"=c(mean(mus_a$dn), mean(mus_x$dn), mean(pho_a$dn), mean(pho_xnr$dn), mean(pho_xr$dn)), 
                     "avg.dnds"=c(mean(mus_a$dn.ds), mean(mus_x$dn.ds), mean(pho_a$dn.ds), mean(pho_xnr$dn.ds), mean(pho_xr$dn.ds))
                     )

rownames(t6_data) = c()

t6_data %>% kable(col.names=c("Chromosome","Number of genes", "Avg. dS", "Avg. dN", "Avg. dN/dS"), caption="Substitution rates for Mus and Phodopus branches", align="l") %>% kable_styling(bootstrap_options=c("striped", "condensed", "responsive"), full_width=F)# %>%
Substitution rates for Mus and Phodopus branches
Chromosome Number of genes Avg. dS Avg. dN Avg. dN/dS
Mus Auto 14656 0.1582118 0.0201877 0.1572215
Mus X 515 0.1300482 0.0206157 0.1905945
Phodopus Auto 12886 0.1243969 0.0176781 0.1796804
Phodopus X non-recombining 306 0.1005724 0.0189231 0.2208170
Phodopus X recombining 166 0.1098163 0.0212593 0.2361039
  #column_spec(1, width="7em") %>%
  #column_spec(2, width="13em")

Amino acid profile change

Ancestral reconstructions with M1 free-ratio model, branches leading to Mus clade and Phodopus Clade

pho_aa_data = read.csv("../../data/mol-evol/phodopus-5spec-m1-anc.csv", header=T, comment.char="#")
pho_aa_data = subset(pho_aa_data, clade=="pcam;prob;psun")

pho_aa_x = subset(pho_aa_data, chr=="ScmyWZ3_7747_HRSCAF_7900")
pho_aa_x$label = "Phodopus X"

pho_aa_auto = subset(pho_aa_data, chr!="ScmyWZ3_7747_HRSCAF_7900")
pho_aa_auto$label = "Phodopus Auto"


mus_aa_data = read.csv("../../data/mol-evol/mus-4spec-m1-anc.csv", header=T, comment.char="#")
mus_aa_data = subset(mus_aa_data, clade=="mcar;mmus;mpah")

mus_aa_x = subset(mus_aa_data, chr=="X")
mus_aa_x$label = "Mus X"

mus_aa_auto = subset(mus_aa_data, chr!="X")
mus_aa_auto$label = "Mus Auto"

aa_data = rbind(pho_aa_x, pho_aa_auto, mus_aa_x, mus_aa_auto)
x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"), c("Mus Auto", "Phodopus Auto"), c("Mus X", "Phodopus X"))


chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

aa_p = ggplot(aa_data, aes(label, avg.sub.score, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Branch") +
  ylab("Avg. BLOSUM score per gene") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")
print(aa_p)

pho_aa_xr = subset(pho_aa_x, start < 35225145)
pho_aa_xr$label = "Phodopus X recombining"

pho_aa_xn = subset(pho_aa_x, start >= 35225145)
pho_aa_xn$label = "Phodopus X non-recombining"

aa_data = rbind(pho_aa_xr, pho_aa_xn, pho_aa_auto, mus_aa_x, mus_aa_auto)

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

aa_p = ggplot(aa_data, aes(label, avg.sub.score, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Branch") +
  ylab("Avg. BLOSUM score per gene") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10))
print(aa_p)

Results summary

  1. Lower dS on X in both Mus and Phodopus
  2. Faster dN/dS on X in branches leading to Mus and Phodopus (M1), Mus clade (M2), but NOT phodopus clade (M2). Results consistent between PAML’s M1 model and HyPhy’s FitMG94.
  3. No difference between recombining and non-recombining portions of the X in phodopus.
  4. No big difference between amino acid profile changes between Mus and Phodopus or Autosomes and X.

< Back Home

cat("Page last updated:", format(Sys.time(), "%m/%d/%Y %H:%M:%S %Z"))
## Page last updated: 07/22/2021 15:02:03 MDT
htmltools::includeHTML("../html-chunks/rmd_footer.html")