Phodopus molecular evolution

#knitr::opts_chunk$set(echo = FALSE)
#this.dir <- dirname(parent.frame(2)$ofile)
#setwd(this.dir)

library(ggtree)
library(ggplot2)
library(ggbeeswarm)
library(cowplot)
library(ggsignif)
library(dplyr)
library(colorspace)
library(kableExtra)
source("../lib/design.r")



#htmltools::includeHTML("../html-chunks/nav.html")

< Back Home

Main questions

Does lack of recombination affect molecular evolution on the X chromosome in phodopus?

Mus and Phodopus species tree branch rates

Goal: compare rate distributions for TWO small sets of rodent species, including phodopus, to determine how lack of recombination on the phodopus X shapes molecular evolution on that chromosome.

pho_tree_file = "../../data/mol-evol/phodopus-5spec-iqtree-concat-rooted.tre"
pho_tree = read.tree(pho_tree_file)
pho_trait_file = "../../data/mol-evol/phodopus-5spec-concat-traits.csv"
pho_data = read.csv(pho_trait_file, header=TRUE)
pho_data = pho_data[order(pho_data$ggtree),]
pho_branch_labels = pho_data
pho_branch_labels$node[pho_branch_labels$node.type=='tip'] = NA
pho_data$clade = as.character(pho_data$clade)

pho_m1_data = read.csv("../../data/mol-evol/phodopus-5spec-m1.csv", header=T, comment.char="#")
pho_m1_data$clade = as.character(pho_m1_data$clade)
# Phodopus data

mus_tree_file = "../../data/mol-evol/mus-4spec-iqtree-concat-rooted.tre"
mus_tree = read.tree(mus_tree_file)
mus_trait_file = "../../data/mol-evol/mus-4spec-concat-traits.csv"
mus_data = read.csv(mus_trait_file, header=TRUE)
mus_data = mus_data[order(mus_data$ggtree),]
mus_branch_labels = mus_data
mus_branch_labels$node[mus_branch_labels$node.type=='tip'] = NA
mus_data$clade = as.character(mus_data$clade)

mus_m1_data = read.csv("../../data/mol-evol/mus-4spec-m1.csv", header=T, comment.char="#")
mus_m1_data$clade = as.character(mus_m1_data$clade)
# Mus data

pho_data$avg.dnds = NA
pho_data$avg.dn = NA
pho_data$avg.ds = NA
pho_data$num.monophyletic = NA
pho_data$num.monophyletic.filtered = NA

pho_clade_data = list()
pho_clade_data_f = list()
pho_ds_dists = list()
i =  1
for(c in pho_data$clade){
  if(pho_data$node.type[pho_data$clade==c]=="ROOT"){
    next
  }
  
  cur_label = as.character(pho_data$clade.label[pho_data$clade==c])
  cur_data = subset(pho_m1_data, clade==c & clade.ds <= 2)
  cur_data$clade.label = cur_label
  
  ds_dist = ggplot(cur_data, aes(x=clade.ds)) +
    geom_histogram(color="#000000", fill="#d3d3d3") +
    scale_y_continuous(expand=c(0,0)) +
    xlab("dS") +
    ylab("# of genes") +
    ggtitle(pho_data$clade.label[pho_data$clade==c]) +
    theme(axis.text.x = element_text(angle=90, hjust=1, size=8))
  
  #print(ds_dist)
  #stop()
  pho_ds_dists[[i]] = ds_dist
  i = i + 1
  
  cur_data_f = subset(cur_data, clade.ds < 0.3 & clade.dn.ds <= 2)
  
  pho_data$num.monophyletic[pho_data$clade==c] = nrow(cur_data)
  pho_data$num.monophyletic.filtered[pho_data$clade==c] = nrow(cur_data_f)
  pho_data$avg.dnds[pho_data$clade==c] = mean(cur_data_f$clade.dn.ds)
  pho_data$avg.dn[pho_data$clade==c] = mean(cur_data_f$clade.dn)
  pho_data$avg.ds[pho_data$clade==c] = mean(cur_data$clade.ds)
  
  pho_clade_data[[cur_label]] = length(cur_data[,1])
  pho_clade_data_f[[cur_label]] = cur_data_f
}

mus_data$avg.dnds = NA
mus_data$avg.dn = NA
mus_data$avg.ds = NA
mus_data$num.monophyletic = NA
mus_data$num.monophyletic.filtered = NA

mus_clade_data = list()
mus_clade_data_f = list()
mus_ds_dists = list()
i =  1
for(c in mus_data$clade){
  if(mus_data$node.type[mus_data$clade==c]=="ROOT"){
    next
  }
  
  cur_label = as.character(mus_data$clade.label[mus_data$clade==c])
  cur_data = subset(mus_m1_data, clade==c & clade.ds <= 2)
  cur_data$clade.label = cur_label
  
  ds_dist = ggplot(cur_data, aes(x=clade.ds)) +
    geom_histogram(color="#000000", fill="#d3d3d3") +
    scale_y_continuous(expand=c(0,0)) +
    xlab("dS") +
    ylab("# of genes") +
    ggtitle(mus_data$clade.label[mus_data$clade==c]) +
    theme(axis.text.x = element_text(angle=90, hjust=1, size=8))
  
  #print(ds_dist)
  #stop()
  mus_ds_dists[[i]] = ds_dist
  i = i + 1
  
  cur_data_f = subset(cur_data, clade.ds < 0.3 & clade.dn.ds <= 2)
  
  mus_data$num.monophyletic[mus_data$clade==c] = nrow(cur_data)
  mus_data$num.monophyletic.filtered[mus_data$clade==c] = nrow(cur_data_f)
  mus_data$avg.dnds[mus_data$clade==c] = mean(cur_data_f$clade.dn.ds)
  mus_data$avg.dn[mus_data$clade==c] = mean(cur_data_f$clade.dn)
  mus_data$avg.ds[mus_data$clade==c] = mean(cur_data$clade.ds)
  
  mus_clade_data[[cur_label]] = length(cur_data[,1])
  mus_clade_data_f[[cur_label]] = cur_data_f
}

Methods

Step 0: Genome assembly and annotation (expand later)

Phodopus sungorous was de novo assembled with Dovetails, and campbelli and robovorski were assembled based on iterative mapping to the sungorus assembly. Annotation was done using the Maker pipeline (Seb’s first attempt).

Step 1: Select orthologs between 5 hamster species and 3 mouse+rat species separately

We selected 2 previously sequenced hamster species to estimate and compare rates of synonymous and non-synonymous substitutions with our set of newly sequenced Phodopus genomes. Separately, we selected 3 mouse species + rat to compare rates between phodopus and mus (Table 1). The CDS sequences for these species were downloaded from Ensembl (release 100).

t1_data_pho = subset(pho_data, node.type=='tip')
t1_data_pho = subset(t1_data_pho, select=c(species, common, source))
t1_data_pho$set = "Phodopus"
names(t1_data_pho) = c("Species", "Common name", "Source", "Dataset")
rownames(t1_data_pho) = c()
# Phodopus data

t1_data_mus = subset(mus_data, node.type=='tip')
t1_data_mus = subset(t1_data_mus, select=c(species, common, source))
t1_data_mus$set = "Mus"
names(t1_data_mus) = c("Species", "Common name", "Source", "Dataset")
rownames(t1_data_mus) = c()
# Mus data

t1_data = rbind(t1_data_pho, t1_data_mus)

t1_data %>% kable(caption="Selected species for comparative molecular evolution analysis") %>% kable_styling(bootstrap_options=c("striped", "condensed", "responsive"), full_width=F)

Selected species for comparative molecular evolution analysis
Species	Common name	Source	Dataset
Cricetulus_griseus	Chinese hamster	Ensembl	Phodopus
Mesocricetus_auratus	Golden hamster	Ensembl	Phodopus
Phodopus_campbelli	Campbell’s dwarf hamster	This study	Phodopus
Phodopus_sungorus	Djungarian hamster	This study	Phodopus
Phodopus_robovorski	Roborvorski hamster	This study	Phodopus
Rattus_norvegicus	Brown rat	Ensembl	Mus
Mus_musculus	House mouse	Ensembl	Mus
Mus_caroli	Ryukyu mouse	Ensembl	Mus
Mus_pahari	Gairdner’s shrewmouse	Ensembl	Mus

Phylogenetic and comparative analyses often rely on selection of a single sequence from each species to assess relatedness of genes. However because genes can have multiple transcripts due to alternative splicing of exons and because genes can be duplicated or lost over time this requires selection of sequences in each species at two levels: 1) selection of a single transcript within a gene and 2) selection of genes without paralogs (i.e. those that are single copy across all species). The most common method for selection of transcripts is to simply pick the longest transcript in each species.

Mus transcript selection:

Total Mus transcripts: 142,447
Longest transcripts: 21,875
Single-copy between 4 species: 15,772
X-genes before alignments: 551
X-genes after alignments: 526

Phodopus transcript selection:

Total transcripts predicted from Maker: 27,988
Longest BLAST match to CGRI transcripts: 14,768
Also single-copy in MAUR: 12,413
X-genes before alignments: 493
X-genes after alignments: 413

Step 2: Align orthologs

We aligned the coding sequences from each single-copy ortholog using a suite of programs in the MACSE software as follows:

Trim non-homologous regions with MACSE’s trimNonHomologousFragments program
Codon aware alignments with MACSE’s alignSequences program
Trim gappy edges from alignment with MACSE’s trimAlignment program
Removed alignments with premature stop codons

Step 3: Construct species and gene trees

We constructed maximum likelihood gene trees and a concatenated species tree using IQ-tree aligned coding sequences. We used IQ-trees concordance factors to assess the confidence of the species tree as well as the consistency of the gene trees. We find that most branches in the species tree are consistently present in a high proportion of gene trees (>90% for all branches).

node_check = F

##########
# Node label checking
if(node_check){
  node_test = ggtree(pho_tree, size=2, ladderize=F) +
    ggplot2::xlim(0, 0.05) +
    geom_tiplab(color="#333333", fontface='italic', size=5) +
    geom_text(aes(label=node), hjust=-.3, vjust=-.3, color="#ff6db6") +
    geom_nodepoint(color="#666666", alpha=0.85, size=4)
  
  #node_test = node_test %>% rotate(25) %>% rotate(26) %>% rotate(27) %>% rotate(28) %>% rotate(29) %>% rotate(30) %>% rotate(32) %>%
  #  rotate(33) %>% rotate(35) %>% rotate(36) %>% rotate(39) %>% rotate(41) %>% rotate(44)
  
  print(node_test)  
  stop("Node check ok")
}
# Node label checking
##########

h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)

clade_cols = corecol(numcol=3)
clade_labs = c("pho", "cs")

pho_tree_fig = ggtree(pho_tree, size=2, ladderize=F, aes(color=pho_data$gcf)) +
  scale_color_continuous(name='gCF', low=l, high=h, limits=c(0,100)) +
  ggplot2::xlim(0, 0.07) +
  geom_tiplab(aes(label=pho_data$species), color="#333333", fontface='italic', size=5) +
  #geom_cladelabel(node=13, size=2, label="pho", color="green", offset = 0.07, align=T) +
  #geom_text(aes(label=rodent_data$label), hjust=-.3, vjust=-.3, color="#ff6db6") +
  geom_label(aes(x=branch, label=pho_branch_labels$branch), color="#d3d3d3", fill=pho_branch_labels$clade.col) + 
  #labs(caption="Figure 1: 5 species hamster phylogeny constructed from concatenation of XX single-copy orthologs.\nGene concordance factors (gCF) are indicated by branch colors.") +
  theme(legend.position=c(0.9,0.25),
        plot.caption=element_text(hjust=0))

mus_tree_fig = ggtree(mus_tree, size=2, ladderize=F, aes(color=mus_data$gcf)) +
  scale_color_continuous(name='gCF', low=l, high=h, limits=c(0,100)) +
  ggplot2::xlim(0, 0.07) +
  geom_tiplab(aes(label=mus_data$species), color="#333333", fontface='italic', size=5) +
  #geom_cladelabel(node=13, size=2, label="pho", color="green", offset = 0.07, align=T) +
  #geom_text(aes(label=rodent_data$label), hjust=-.3, vjust=-.3, color="#ff6db6") +
  geom_label(aes(x=branch, label=mus_branch_labels$branch), color="#d3d3d3", fill=mus_branch_labels$clade.col) + 
  #labs(caption="Figure 1: 5 species hamster phylogeny constructed from concatenation of XX single-copy orthologs.\nGene concordance factors (gCF) are indicated by branch colors.") +
  theme(legend.position=c(0.9,0.25),
        plot.caption=element_text(hjust=0))

#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)

#print(mus_tree_fig)

tree_fig = plot_grid(pho_tree_fig, mus_tree_fig, nrow=1, align='vh')
cap = "Figure 1: (left) 5 species hamster phylogeny constructed from concatenation of 10,300 single-copy orthologs.\n(right) 4 species mouse phylogeny constructed from concatenation of 15,427 single-copy orthologes\nGene concordance factors (gCF) are indicated by branch colors."
pcap = ggdraw() + draw_label(cap, size=10, fontface="bold", x=0, hjust=0) + theme(plot.margin=margin(0,0,0,7))
tree_p = plot_grid(tree_fig, pcap, ncol=1, rel_heights=c(1, 0.1))
print(tree_p)

Step 4: Estimation of synonymous (dS) and nonysynonymous (dN) substitution rates

We used codeml within the PAML program to estimate rates of synonymous (dS) and non-synonymous (dN) substitutions using a free-ratio model (model=1, NSsites=0) to get an estimate of rate for each branch in our phylogeny. Relationships among genes may differ amongst themselves and relative to the species relationships due to biological factors such as incomplete lineage sorting and introgression. In cases where a gene tree and the species tree disagree, inferring substitutions using the species tree (the wrong tree in this case) can lead to spurious inferences of substitutions and affect rate estimates. To mitigate this, we used the corresponding gene tree for each alignment when estimating rates. When compiling rate estimates, we only counted rates for a branch in the species tree if it existed in the gene tree (Table 2).

# t2_data = data.frame("Branch"=c(), "Gene trees"=c(), "Genes after dS filter"=c())
# for(clade_label in pho_data$clade.label){
#   if(pho_data$node.type[pho_data$clade.label==clade_label]!="internal"){
#     next
#   }
# 
#   t2_data = rbind(t2_data, data.frame("Branch"=clade_label, "Gene trees"=clade_data[[clade_label]], "Genes after dS filter"=length(clade_data_f[[clade_label]][,1])))
# }
# 
# rownames(t2_data) = c()
# 
# t2_data %>% kable(col.names=c("Branch","Monophyletic gene trees", "Genes after dS filter"), caption="Number of gene trees containing branch before and after dS filtering", align="l") %>% kable_styling(bootstrap_options=c("striped", "condensed", "responsive"), full_width=F)# %>%
#   #column_spec(1, width="7em") %>%
#   #column_spec(2, width="13em")

Filtering based on dS

As another check on alignment quality, we assessed distributions of dS along each branch and only counted genes for each branch where dS fell below 0.3. (QUESTION: should this dS filter step have a different threshold for each branch?) We also omitted genes for a branch where dN/dS was greater than 2.

ds_plots = plot_grid(plotlist=pho_ds_dists, nrow=3)
print(ds_plots)

ds_plots = plot_grid(plotlist=mus_ds_dists, nrow=3)
print(ds_plots)

Post filter dS distributions per branch

pho_m1_data_f = do.call("rbind", pho_clade_data_f)
mus_m1_data_f = do.call("rbind", mus_clade_data_f)
m1_data_f = rbind(pho_m1_data_f, mus_m1_data_f)

#clade_cols = as.character(pho_data$clade.col)
#clade_labs = as.character(pho_data$clade.label)

x_comps = list(c("mus", "pho"))

clade_ds_p = ggplot(m1_data_f, aes(clade.label, clade.ds)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5, fill="#333333") +
  #geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Branch") +
  ylab("dS") +
  #scale_fill_manual(limits=clade_labs, values=clade_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x=element_text(angle=25, hjust=1, size=8))

print(clade_ds_p)

Post filter dN distributions per branch

clade_cols = as.character(pho_data$clade.col)
clade_labs = as.character(pho_data$clade.label)

#x_comps = list(c("mus", "pho"))

clade_dn_p = ggplot(m1_data_f, aes(clade.label, clade.dn)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5, fill="#333333") +
  #geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Branch") +
  ylab("dN") +
  scale_fill_manual(limits=clade_labs, values=clade_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=25, hjust=1, size=8)
        )

print(clade_dn_p)

Post filter dN/dS distributions per branch

clade_cols = as.character(pho_data$clade.col)
clade_labs = as.character(pho_data$clade.label)

x_comps = list(c("mus", "pho"))

clade_dnds_p = ggplot(m1_data_f, aes(clade.label, clade.dn.ds)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5, fill="#333333") +
  #geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Branch") +
  ylab("dN/dS") +
  #scale_fill_manual(limits=clade_labs, values=clade_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x=element_text(angle=25, hjust=1, size=8))

print(clade_dnds_p)

Results (M1 branch leading to Phodopus and Mus only)

1. X vs. autosome dS distributions in Mus and Phodopus

pho_branches = c("pho")
pho_a = data.frame()
pho_x = data.frame()
for(b in pho_branches){
  pho_a = rbind(pho_a, subset(pho_clade_data_f[[b]], chr!="ScmyWZ3_7747_HRSCAF_7900"))
  pho_x = rbind(pho_x, subset(pho_clade_data_f[[b]], chr=="ScmyWZ3_7747_HRSCAF_7900"))
}
pho_x$label = "Phodopus X"
pho_a$label = "Phodopus Auto"

mus_branches = c("mus")
mus_a = data.frame()
mus_x = data.frame()
for(b in mus_branches){
  mus_a = rbind(mus_a, subset(mus_clade_data_f[[b]], chr!="X"))
  mus_x = rbind(mus_x, subset(mus_clade_data_f[[b]], chr=="X"))
}
mus_x$label = "Mus X"
mus_a$label = "Mus Auto"


chrome_data = rbind(pho_x, pho_a, mus_x, mus_a)

# t = as.data.frame(table(chrome_data$chr))
# test = ggplot(t, aes(x=Var1, y=Freq)) +
#   geom_bar(stat="identity") +
#   scale_y_continuous(expand=c(0,0)) +
#   bartheme() +
#   theme()
# print(test)


#clade_cols = as.character(rodent_data$clade.col)
#clade_labs = as.character(rodent_data$clade.label)

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))

chrome_ds_p = ggplot(chrome_data, aes(label, clade.ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("ds") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")

print(chrome_ds_p)

2. X vs. autosome dN distributions in Mus and Phodopus

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))

chrome_dn_p = ggplot(chrome_data, aes(label, clade.dn, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")

print(chrome_dn_p)

3. X vs. autosome dN/dS distributions in Mus and Phodopus

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))

chrome_dnds_p = ggplot(chrome_data, aes(label, clade.dn.ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN/dS") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")

print(chrome_dnds_p)

t3_data = data.frame("chrome"=c("Mus auto", "Mus X", "Phodopus auto", "Phodopus X"), 
                     "genes"=c(length(mus_a[,1]), length(mus_x[,1]), length(pho_a[,1]), length(pho_x[,1])), 
                     "avg.ds"=c(mean(mus_a$clade.ds), mean(mus_x$clade.ds), mean(pho_a$clade.ds), mean(pho_x$clade.ds)), 
                     "avg.dn"=c(mean(mus_a$clade.dn), mean(mus_x$clade.dn), mean(pho_a$clade.dn), mean(pho_x$clade.dn)), 
                     "avg.dnds"=c(mean(mus_a$clade.dn.ds), mean(mus_x$clade.dn.ds), mean(pho_a$clade.dn.ds), mean(pho_x$clade.dn.ds)))

rownames(t3_data) = c()

t3_data %>% kable(col.names=c("Chromosome","Number of genes", "Avg. dS", "Avg. dN", "Avg. dN/dS"), caption="Substitution rates for Mus and Phodopus branches", align="l") %>% kable_styling(bootstrap_options=c("striped", "condensed", "responsive"), full_width=F)# %>%

Substitution rates for Mus and Phodopus branches
Chromosome	Number of genes	Avg. dS	Avg. dN	Avg. dN/dS
Mus auto	14695	0.1476831	0.0214826	0.1511614
Mus X	517	0.1167536	0.0223234	0.1857207
Phodopus auto	9581	0.0604657	0.0095896	0.1749161
Phodopus X	399	0.0463514	0.0099188	0.2132867

  #column_spec(1, width="7em") %>%
  #column_spec(2, width="13em")

4. X vs. autosome dS distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions

        # if scaff == "ScmyWZ3_7747_HRSCAF_7900":
        #     if int(end) < 35225145:
        #         scaff = "ScmyWZ3_7747_HRSCAF_7900_R";
        #         chrome = "chrX_R";
        #     elif int(start) >= 35225145:
        #         scaff = "ScmyWZ3_7747_HRSCAF_7900_NR"
        #         chrome = "chrX_NR";
        #     else:
        #         chrome = "chrX";
        # elif scaff in scaff_to_chr:
        #     chrome =  scaff_to_chr[scaff];
        # else:
        #     chrome = "NA";

pho_xr = subset(pho_x, start < 35225145)
pho_xr$label = "Phodopus X recombining"
pho_xnr = subset(pho_x, start >= 35225145)
pho_xnr$label = "Phodopus X non-recombining"

chrome_data = rbind(mus_a, mus_x, pho_a, pho_xr, pho_xnr)

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

chrome_ds_p = ggplot(chrome_data, aes(label, clade.ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("ds") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10)
        )

print(chrome_ds_p)

5. X vs. autosome dN distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

chrome_dn_p = ggplot(chrome_data, aes(label, clade.dn, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10)
        )

print(chrome_dn_p)

6. X vs. autosome dN/dS distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

chrome_dnds_p = ggplot(chrome_data, aes(label, clade.dn.ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN/dS") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10)
        )

print(chrome_dnds_p)

7. M1 summary stats

t4_data = data.frame("chrome"=c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X non-recombining", "Phodopus X recombining"), 
                     "genes"=c(length(mus_a[,1]), length(mus_x[,1]), length(pho_a[,1]), length(pho_xnr[,1]), length(pho_xr[,1])), 
                     "avg.ds"=c(mean(mus_a$clade.ds), mean(mus_x$clade.ds), mean(pho_a$clade.ds), mean(pho_xnr$clade.ds), mean(pho_xr$clade.ds)), 
                     "avg.dn"=c(mean(mus_a$clade.dn), mean(mus_x$clade.dn), mean(pho_a$clade.dn), mean(pho_xnr$clade.dn), mean(pho_xr$clade.dn)), 
                     "avg.dnds"=c(mean(mus_a$clade.dn.ds), mean(mus_x$clade.dn.ds), mean(pho_a$clade.dn.ds), mean(pho_xnr$clade.dn.ds), mean(pho_xr$clade.dn.ds))
                     )

rownames(t4_data) = c()

t4_data %>% kable(col.names=c("Chromosome","Number of genes", "Avg. dS", "Avg. dN", "Avg. dN/dS"), caption="Substitution rates for Mus and Phodopus branches", align="l") %>% kable_styling(bootstrap_options=c("striped", "condensed", "responsive"), full_width=F)# %>%

Substitution rates for Mus and Phodopus branches
Chromosome	Number of genes	Avg. dS	Avg. dN	Avg. dN/dS
Mus Auto	14695	0.1476831	0.0214826	0.1511614
Mus X	517	0.1167536	0.0223234	0.1857207
Phodopus Auto	9581	0.0604657	0.0095896	0.1749161
Phodopus X non-recombining	257	0.0442354	0.0098794	0.2236335
Phodopus X recombining	142	0.0501810	0.0099901	0.1945606

  #column_spec(1, width="7em") %>%
  #column_spec(2, width="13em")

Results (Clade model C; not useful?)

cmc_data = read.csv("../../data/mol-evol/phodopus-5spec-cmc.csv", header=TRUE, comment.char="#")

cmc_a = subset(cmc_data, chr!="ScmyWZ3_7747_HRSCAF_7900")
cmc_a$label = "Phodopus Auto"

cmc_a_1_b = subset(cmc_a, select=c(label, background.clade.omega.1))
cmc_a_1_b$class = "Background omega Auto 1"
names(cmc_a_1_b)[2] = "omega"
cmc_a_2_b = subset(cmc_a, select=c(label, background.clade.omega.2))
cmc_a_2_b$class = "Background omega Auto 2"
names(cmc_a_2_b)[2] = "omega"
cmc_a_3_b = subset(cmc_a, select=c(label, background.clade.omega.3))
cmc_a_3_b$class = "Background omega Auto 3"
names(cmc_a_3_b)[2] = "omega"

cmc_a_1_t= subset(cmc_a, select=c(label, target.clade.omega.1))
cmc_a_1_t$class = "Target omega Auto 1"
names(cmc_a_1_t)[2] = "omega"
cmc_a_2_t = subset(cmc_a, select=c(label, target.clade.omega.2))
cmc_a_2_t$class = "Target omega Auto 2"
names(cmc_a_2_t)[2] = "omega"
cmc_a_3_t = subset(cmc_a, select=c(label, target.clade.omega.3))
cmc_a_3_t$class = "Target omega Auto 3"
names(cmc_a_3_t)[2] = "omega"

cmc_x = subset(cmc_data, chr!="ScmyWZ3_7747_HRSCAF_7900")
cmc_x$label = "Phodopus X"

cmc_x_1_b = subset(cmc_x, select=c(label, background.clade.omega.1))
cmc_x_1_b$class = "Background omega X 1"
names(cmc_x_1_b)[2] = "omega"
cmc_x_2_b = subset(cmc_x, select=c(label, background.clade.omega.2))
cmc_x_2_b$class = "Background omega X 2"
names(cmc_x_2_b)[2] = "omega"
cmc_x_3_b = subset(cmc_x, select=c(label, background.clade.omega.3))
cmc_x_3_b$class = "Background omega X 3"
names(cmc_x_3_b)[2] = "omega"

cmc_x_1_t= subset(cmc_x, select=c(label, target.clade.omega.1))
cmc_x_1_t$class = "Target omega X 1"
names(cmc_x_1_t)[2] = "omega"
cmc_x_2_t = subset(cmc_x, select=c(label, target.clade.omega.2))
cmc_x_2_t$class = "Target omega X 2"
names(cmc_x_2_t)[2] = "omega"
cmc_x_3_t = subset(cmc_x, select=c(label, target.clade.omega.3))
cmc_x_3_t$class = "Target omega X 3"
names(cmc_x_3_t)[2] = "omega"

cmc_data_long = rbind(cmc_a_1_b, cmc_a_2_b, cmc_a_3_b, cmc_a_1_t, cmc_a_2_t, cmc_a_3_t,
                      cmc_x_1_b, cmc_x_2_b, cmc_x_3_b, cmc_x_1_t, cmc_x_2_t, cmc_x_3_t)

cmc_data_long = subset(cmc_data_long, omega < 2)

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Target omega X 3", "Target omega Auto 3"))

cmc_dnds_p = ggplot(cmc_data_long, aes(class, omega, fill=class)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN/dS") +
  #scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=25, hjust=1, size=10)
        )

print(cmc_dnds_p)

Results (M2 when setting all phodopus branches and all mus branches as targets)

pho_m2_data = read.csv("../../data/mol-evol/phodopus-5spec-m2.csv", header=T, comment.char="#")
pho_m2_data$clade = as.character(pho_m2_data$clade)
# Phodopus data

mus_m2_data = read.csv("../../data/mol-evol/mus-4spec-m2.csv", header=T, comment.char="#")
mus_m2_data$clade = as.character(mus_m2_data$clade)
# Mus data

0. dS distributions by branch for filtering

pho_data$avg.dnds = NA
pho_data$avg.dn = NA
pho_data$avg.ds = NA
pho_data$num.monophyletic = NA
pho_data$num.monophyletic.filtered = NA

pho_clade_data = list()
pho_clade_data_f = list()
pho_ds_dists = list()
i =  1
for(c in pho_data$clade){
  if(pho_data$node.type[pho_data$clade==c]=="ROOT"){
    next
  }
  
  cur_label = as.character(pho_data$clade.label[pho_data$clade==c])
  cur_data = subset(pho_m2_data, clade==c & clade.ds <= 2)
  cur_data$clade.label = cur_label
  
  ds_dist = ggplot(cur_data, aes(x=clade.ds)) +
    geom_histogram(color="#000000", fill="#d3d3d3") +
    scale_y_continuous(expand=c(0,0)) +
    xlab("dS") +
    ylab("# of genes") +
    ggtitle(pho_data$clade.label[pho_data$clade==c]) +
    theme(axis.text.x = element_text(angle=90, hjust=1, size=8))
  
  #print(ds_dist)
  #stop()
  pho_ds_dists[[i]] = ds_dist
  i = i + 1
  
  cur_data_f = subset(cur_data, clade.ds < 0.3 & clade.dn.ds <= 2)
  
  pho_data$num.monophyletic[pho_data$clade==c] = nrow(cur_data)
  pho_data$num.monophyletic.filtered[pho_data$clade==c] = nrow(cur_data_f)
  pho_data$avg.dnds[pho_data$clade==c] = mean(cur_data_f$clade.dn.ds)
  pho_data$avg.dn[pho_data$clade==c] = mean(cur_data_f$clade.dn)
  pho_data$avg.ds[pho_data$clade==c] = mean(cur_data$clade.ds)
  
  pho_clade_data[[cur_label]] = length(cur_data[,1])
  pho_clade_data_f[[cur_label]] = cur_data_f
}

mus_data$avg.dnds = NA
mus_data$avg.dn = NA
mus_data$avg.ds = NA
mus_data$num.monophyletic = NA
mus_data$num.monophyletic.filtered = NA

mus_clade_data = list()
mus_clade_data_f = list()
mus_ds_dists = list()
i =  1
for(c in mus_data$clade){
  if(mus_data$node.type[mus_data$clade==c]=="ROOT"){
    next
  }

  cur_label = as.character(mus_data$clade.label[mus_data$clade==c])
  cur_data = subset(mus_m2_data, clade==c & clade.ds <= 2)
  cur_data$clade.label = cur_label

  ds_dist = ggplot(cur_data, aes(x=clade.ds)) +
    geom_histogram(color="#000000", fill="#d3d3d3") +
    scale_y_continuous(expand=c(0,0)) +
    xlab("dS") +
    ylab("# of genes") +
    ggtitle(mus_data$clade.label[mus_data$clade==c]) +
    theme(axis.text.x = element_text(angle=90, hjust=1, size=8))

  #print(ds_dist)
  #stop()
  mus_ds_dists[[i]] = ds_dist
  i = i + 1

  cur_data_f = subset(cur_data, clade.ds < 0.3 & clade.dn.ds <= 2)

  mus_data$num.monophyletic[mus_data$clade==c] = nrow(cur_data)
  mus_data$num.monophyletic.filtered[mus_data$clade==c] = nrow(cur_data_f)
  mus_data$avg.dnds[mus_data$clade==c] = mean(cur_data_f$clade.dn.ds)
  mus_data$avg.dn[mus_data$clade==c] = mean(cur_data_f$clade.dn)
  mus_data$avg.ds[mus_data$clade==c] = mean(cur_data$clade.ds)

  mus_clade_data[[cur_label]] = length(cur_data[,1])
  mus_clade_data_f[[cur_label]] = cur_data_f
}

ds_plots = plot_grid(plotlist=pho_ds_dists, nrow=3)
print(ds_plots)

ds_plots = plot_grid(plotlist=mus_ds_dists, nrow=3)
print(ds_plots)

1. X vs. autosome dS distributions in Mus and Phodopus

Estimates are averaged across all branches in clade.

pho_branches = c("cs", "psun", "pcam", "prob")
pho_a = data.frame()
pho_x = data.frame()
for(b in pho_branches){
  pho_a = rbind(pho_a, subset(pho_clade_data_f[[b]], chr!="ScmyWZ3_7747_HRSCAF_7900"))
  pho_x = rbind(pho_x, subset(pho_clade_data_f[[b]], chr=="ScmyWZ3_7747_HRSCAF_7900"))
}
pho_x = pho_x %>% group_by(id, chr, start, end) %>% 
  summarize(ds=mean(clade.ds, na.rm=T), dn=mean(clade.dn, na.rm=T), dn.ds=mean(clade.dn.ds, na.rm=T))

## `summarise()` has grouped output by 'id', 'chr', 'start'. You can override using the `.groups` argument.

pho_x$label = "Phodopus X"
pho_a = pho_a %>% group_by(id, chr, start, end) %>% 
  summarize(ds=mean(clade.ds, na.rm=T), dn=mean(clade.dn, na.rm=T), dn.ds=mean(clade.dn.ds, na.rm=T))

## `summarise()` has grouped output by 'id', 'chr', 'start'. You can override using the `.groups` argument.

pho_a$label = "Phodopus Auto"

mus_branches = c("cm", "mpah", "mcar", "mmus")
mus_a = data.frame()
mus_x = data.frame()
for(b in mus_branches){
  mus_a = rbind(mus_a, subset(mus_clade_data_f[[b]], chr!="X"))
  mus_x = rbind(mus_x, subset(mus_clade_data_f[[b]], chr=="X"))
}
mus_x = mus_x %>% group_by(id, chr, start, end) %>% 
  summarize(ds=mean(clade.ds, na.rm=T), dn=mean(clade.dn, na.rm=T), dn.ds=mean(clade.dn.ds, na.rm=T))

## `summarise()` has grouped output by 'id', 'chr', 'start'. You can override using the `.groups` argument.

mus_x$label = "Mus X"
mus_a = mus_a %>% group_by(id, chr, start, end) %>% 
  summarize(ds=mean(clade.ds, na.rm=T), dn=mean(clade.dn, na.rm=T), dn.ds=mean(clade.dn.ds, na.rm=T))

## `summarise()` has grouped output by 'id', 'chr', 'start'. You can override using the `.groups` argument.

mus_a$label = "Mus Auto"

chrome_data = rbind(pho_x, pho_a, mus_x, mus_a)

# t = as.data.frame(table(chrome_data$chr))
# test = ggplot(t, aes(x=Var1, y=Freq)) +
#   geom_bar(stat="identity") +
#   scale_y_continuous(expand=c(0,0)) +
#   bartheme() +
#   theme()
# print(test)


#clade_cols = as.character(rodent_data$clade.col)
#clade_labs = as.character(rodent_data$clade.label)

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))

chrome_ds_p = ggplot(chrome_data, aes(label, ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("ds") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")

print(chrome_ds_p)

2. X vs. autosome dN distributions in Mus and Phodopus

Estimates are averaged across all branches in clade.

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))

chrome_dn_p = ggplot(chrome_data, aes(label, dn, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")

print(chrome_dn_p)

3. X vs. autosome dN/dS distributions in Mus and Phodopus

Single estimate for whole clade.

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))

chrome_dnds_p = ggplot(chrome_data, aes(label, dn.ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN/dS") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")

print(chrome_dnds_p)

4. X vs. autosome dS distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions

Estimates are averaged across all branches in clade.

pho_xr = subset(pho_x, start < 35225145)
pho_xr$label = "Phodopus X recombining"
pho_xnr = subset(pho_x, start >= 35225145)
pho_xnr$label = "Phodopus X non-recombining"

chrome_data = rbind(mus_a, mus_x, pho_a, pho_xr, pho_xnr)

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

chrome_ds_p = ggplot(chrome_data, aes(label, ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("ds") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10)
        )

print(chrome_ds_p)

5. X vs. autosome dN distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions

Estimates are averaged across all branches in clade.

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

chrome_dn_p = ggplot(chrome_data, aes(label, dn, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10)
        )

print(chrome_dn_p)

6. X vs. autosome dN/dS distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions

Single estimate for whole clade.

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

chrome_dnds_p = ggplot(chrome_data, aes(label, dn.ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN/dS") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10)
        )

print(chrome_dnds_p)

7. M2 summary stats

t5_data = data.frame("chrome"=c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X non-recombining", "Phodopus X recombining"), 
                     "genes"=c(length(mus_a[,1]), length(mus_x[,1]), length(pho_a[,1]), length(pho_xnr[,1]), length(pho_xr[,1])), 
                     "avg.ds"=c(mean(mus_a$ds), mean(mus_x$ds), mean(pho_a$ds), mean(pho_xnr$ds), mean(pho_xr$ds)), 
                     "avg.dn"=c(mean(mus_a$dn), mean(mus_x$dn), mean(pho_a$dn), mean(pho_xnr$dn), mean(pho_xr$dn)), 
                     "avg.dnds"=c(mean(mus_a$dn.ds), mean(mus_x$dn.ds), mean(pho_a$dn.ds), mean(pho_xnr$dn.ds), mean(pho_xr$dn.ds))
                     )

rownames(t5_data) = c()

t5_data %>% kable(col.names=c("Chromosome","Number of genes", "Avg. dS", "Avg. dN", "Avg. dN/dS"), caption="Substitution rates for Mus and Phodopus branches", align="l") %>% kable_styling(bootstrap_options=c("striped", "condensed", "responsive"), full_width=F)# %>%

Substitution rates for Mus and Phodopus branches
Chromosome	Number of genes	Avg. dS	Avg. dN	Avg. dN/dS
Mus Auto	1	0.0305418	0.0042116	0.1474844
Mus X	1	0.0233840	0.0044003	0.1906650
Phodopus Auto	1	0.0217890	0.0034842	0.1745578
Phodopus X non-recombining	1	0.0155352	0.0026421	0.1825200
Phodopus X recombining	1	0.0165241	0.0036164	0.2217289

  #column_spec(1, width="7em") %>%
  #column_spec(2, width="13em")

Results (Hyphy free ratio model [FitMG94], branch leading to phodopus and mus only)

pho_mg_data = read.csv("../../data/mol-evol/phodopus-5spec-mg94-local.csv", header=T, comment.char="#")
pho_mg_data$clade = as.character(pho_mg_data$clade)
pho_mg_data$clade[pho_mg_data$clade=="psun;pcam"] = "pcam;psun"
pho_mg_data$clade[pho_mg_data$clade=="psun;pcam;prob"] = "pcam;prob;psun"
pho_mg_data$clade[pho_mg_data$clade=="psun;prob;pcam"] = "pcam;prob;psun"
pho_mg_data$clade[pho_mg_data$clade=="pcam;psun;prob"] = "pcam;prob;psun"
pho_mg_data$clade[pho_mg_data$clade=="prob;psun;pcam"] = "pcam;prob;psun"
pho_mg_data$clade[pho_mg_data$clade=="prob;pcam;psun"] = "pcam;prob;psun"
# Phodopus data

mus_mg_data = read.csv("../../data/mol-evol/mus-4spec-mg94-local.csv", header=T, comment.char="#")
mus_mg_data$clade = as.character(mus_mg_data$clade)
mus_mg_data$clade[mus_mg_data$clade=="mmus;mcar"] = "mcar;mmus"
mus_mg_data$clade[mus_mg_data$clade=="mmus;mcar;mpah"] = "mcar;mmus;mpah"
mus_mg_data$clade[mus_mg_data$clade=="mmus;mpah;mcar"] = "mcar;mmus;mpah"
mus_mg_data$clade[mus_mg_data$clade=="mpah;mmus;mcar"] = "mcar;mmus;mpah"
mus_mg_data$clade[mus_mg_data$clade=="mpah;mcar;mmus"] = "mcar;mmus;mpah"
mus_mg_data$clade[mus_mg_data$clade=="mcar;mpah;mmus"] = "mcar;mmus;mpah"
# Mus data

0. dS distributions by branch for filtering

pho_data$avg.dnds = NA
pho_data$avg.dn = NA
pho_data$avg.ds = NA
pho_data$num.monophyletic = NA
pho_data$num.monophyletic.filtered = NA

pho_clade_data = list()
pho_clade_data_f = list()
pho_ds_dists = list()
i =  1
for(c in pho_data$clade){
  if(pho_data$node.type[pho_data$clade==c]=="ROOT"){
    next
  }
  
  cur_label = as.character(pho_data$clade.label[pho_data$clade==c])
  cur_data = subset(pho_mg_data, clade==c & ds <= 2)
  cur_data$clade.label = cur_label
  
  ds_dist = ggplot(cur_data, aes(x=ds)) +
    geom_histogram(color="#000000", fill="#d3d3d3") +
    scale_y_continuous(expand=c(0,0)) +
    xlab("dS") +
    ylab("# of genes") +
    ggtitle(pho_data$clade.label[pho_data$clade==c]) +
    theme(axis.text.x = element_text(angle=90, hjust=1, size=8))
  
  #print(ds_dist)
  #stop()
  pho_ds_dists[[i]] = ds_dist
  i = i + 1
  
  cur_data_f = subset(cur_data, ds < 0.3 & dn.ds <= 2)
  
  pho_data$num.monophyletic[pho_data$clade==c] = nrow(cur_data)
  pho_data$num.monophyletic.filtered[pho_data$clade==c] = nrow(cur_data_f)
  pho_data$avg.dnds[pho_data$clade==c] = mean(cur_data_f$dn.ds)
  pho_data$avg.dn[pho_data$clade==c] = mean(cur_data_f$dn)
  pho_data$avg.ds[pho_data$clade==c] = mean(cur_data$ds)
  
  pho_clade_data[[cur_label]] = length(cur_data[,1])
  pho_clade_data_f[[cur_label]] = cur_data_f
}

mus_data$avg.dnds = NA
mus_data$avg.dn = NA
mus_data$avg.ds = NA
mus_data$num.monophyletic = NA
mus_data$num.monophyletic.filtered = NA

mus_clade_data = list()
mus_clade_data_f = list()
mus_ds_dists = list()
i =  1
for(c in mus_data$clade){
  if(mus_data$node.type[mus_data$clade==c]=="ROOT"){
    next
  }
  print(c)
  cur_label = as.character(mus_data$clade.label[mus_data$clade==c])
  cur_data = subset(mus_mg_data, clade==c & ds <= 2)
  cur_data$clade.label = cur_label

  ds_dist = ggplot(cur_data, aes(x=ds)) +
    geom_histogram(color="#000000", fill="#d3d3d3") +
    scale_y_continuous(expand=c(0,0)) +
    xlab("dS") +
    ylab("# of genes") +
    ggtitle(mus_data$clade.label[mus_data$clade==c]) +
    theme(axis.text.x = element_text(angle=90, hjust=1, size=8))

  #print(ds_dist)
  #stop()
  mus_ds_dists[[i]] = ds_dist
  i = i + 1

  cur_data_f = subset(cur_data, ds < 0.3 & dn.ds <= 2)

  mus_data$num.monophyletic[mus_data$clade==c] = nrow(cur_data)
  mus_data$num.monophyletic.filtered[mus_data$clade==c] = nrow(cur_data_f)
  mus_data$avg.dnds[mus_data$clade==c] = mean(cur_data_f$dn.ds)
  mus_data$avg.dn[mus_data$clade==c] = mean(cur_data_f$dn)
  mus_data$avg.ds[mus_data$clade==c] = mean(cur_data$ds)

  mus_clade_data[[cur_label]] = length(cur_data[,1])
  mus_clade_data_f[[cur_label]] = cur_data_f
}

## [1] "rnor"
## [1] "mmus"
## [1] "mcar"
## [1] "mpah"
## [1] "mcar;mmus;mpah"
## [1] "mcar;mmus"

ds_plots = plot_grid(plotlist=pho_ds_dists, nrow=3)
print(ds_plots)

ds_plots = plot_grid(plotlist=mus_ds_dists, nrow=3)
print(ds_plots)

1. X vs. autosome dS distributions in Mus and Phodopus

pho_branches = c("pho")
pho_a = data.frame()
pho_x = data.frame()
for(b in pho_branches){
  pho_a = rbind(pho_a, subset(pho_clade_data_f[[b]], chr!="ScmyWZ3_7747_HRSCAF_7900"))
  pho_x = rbind(pho_x, subset(pho_clade_data_f[[b]], chr=="ScmyWZ3_7747_HRSCAF_7900"))
}
#pho_x = pho_x %>% group_by(id, chr, start, end) %>% 
#  summarize(ds=mean(ds, na.rm=T), dn=mean(dn, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
pho_x$label = "Phodopus X"
#pho_a = pho_a %>% group_by(id, chr, start, end) %>% 
#  summarize(ds=mean(ds, na.rm=T), dn=mean(dn, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
pho_a$label = "Phodopus Auto"

mus_branches = c("mus")
mus_a = data.frame()
mus_x = data.frame()
for(b in mus_branches){
  mus_a = rbind(mus_a, subset(mus_clade_data_f[[b]], chr!="X"))
  mus_x = rbind(mus_x, subset(mus_clade_data_f[[b]], chr=="X"))
}
#mus_x = mus_x %>% group_by(id, chr, start, end) %>% 
#  summarize(ds=mean(ds, na.rm=T), dn=mean(dn, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
mus_x$label = "Mus X"
#mus_a = mus_a %>% group_by(id, chr, start, end) %>% 
#  summarize(ds=mean(ds, na.rm=T), dn=mean(dn, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
mus_a$label = "Mus Auto"

chrome_data = rbind(pho_x, pho_a, mus_x, mus_a)

# t = as.data.frame(table(chrome_data$chr))
# test = ggplot(t, aes(x=Var1, y=Freq)) +
#   geom_bar(stat="identity") +
#   scale_y_continuous(expand=c(0,0)) +
#   bartheme() +
#   theme()
# print(test)


#clade_cols = as.character(rodent_data$clade.col)
#clade_labs = as.character(rodent_data$clade.label)

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))

chrome_ds_p = ggplot(chrome_data, aes(label, ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("ds") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")

print(chrome_ds_p)

2. X vs. autosome dN distributions in Mus and Phodopus

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))

chrome_dn_p = ggplot(chrome_data, aes(label, dn, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")

print(chrome_dn_p)

3. X vs. autosome dN/dS distributions in Mus and Phodopus

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))

chrome_dnds_p = ggplot(chrome_data, aes(label, dn.ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN/dS") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")

print(chrome_dnds_p)

4. X vs. autosome dS distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions

Estimates are averaged across all branches in clade.

pho_xr = subset(pho_x, start < 35225145)
pho_xr$label = "Phodopus X recombining"
pho_xnr = subset(pho_x, start >= 35225145)
pho_xnr$label = "Phodopus X non-recombining"

chrome_data = rbind(mus_a, mus_x, pho_a, pho_xr, pho_xnr)

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

chrome_ds_p = ggplot(chrome_data, aes(label, ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("ds") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10)
        )

print(chrome_ds_p)

5. X vs. autosome dN distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions

Estimates are averaged across all branches in clade.

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

chrome_dn_p = ggplot(chrome_data, aes(label, dn, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10)
        )

print(chrome_dn_p)

6. X vs. autosome dN/dS distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions

Single estimate for whole clade.

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

chrome_dnds_p = ggplot(chrome_data, aes(label, dn.ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN/dS") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10)
        )

print(chrome_dnds_p)

7. HyPhy MG94 summary stats

t6_data = data.frame("chrome"=c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X non-recombining", "Phodopus X recombining"), 
                     "genes"=c(length(mus_a[,1]), length(mus_x[,1]), length(pho_a[,1]), length(pho_xnr[,1]), length(pho_xr[,1])), 
                     "avg.ds"=c(mean(mus_a$ds), mean(mus_x$ds), mean(pho_a$ds), mean(pho_xnr$ds), mean(pho_xr$ds)), 
                     "avg.dn"=c(mean(mus_a$dn), mean(mus_x$dn), mean(pho_a$dn), mean(pho_xnr$dn), mean(pho_xr$dn)), 
                     "avg.dnds"=c(mean(mus_a$dn.ds), mean(mus_x$dn.ds), mean(pho_a$dn.ds), mean(pho_xnr$dn.ds), mean(pho_xr$dn.ds))
                     )

rownames(t6_data) = c()

t6_data %>% kable(col.names=c("Chromosome","Number of genes", "Avg. dS", "Avg. dN", "Avg. dN/dS"), caption="Substitution rates for Mus and Phodopus branches", align="l") %>% kable_styling(bootstrap_options=c("striped", "condensed", "responsive"), full_width=F)# %>%

Substitution rates for Mus and Phodopus branches
Chromosome	Number of genes	Avg. dS	Avg. dN	Avg. dN/dS
Mus Auto	14656	0.1582118	0.0201877	0.1572215
Mus X	515	0.1300482	0.0206157	0.1905945
Phodopus Auto	9566	0.0651789	0.0090807	0.1806505
Phodopus X non-recombining	256	0.0499538	0.0091739	0.2187543
Phodopus X recombining	142	0.0555683	0.0094026	0.2030216

  #column_spec(1, width="7em") %>%
  #column_spec(2, width="13em")

Results (Hyphy free ratio local model [FitMG94], branch leading to phodopus only, only Cgri as outgroup)

pho_tree_file = "../../data/mol-evol/phodopus-4spec-iqtree-concat-rooted.tre"
pho_tree = read.tree(pho_tree_file)
pho_trait_file = "../../data/mol-evol/phodopus-4spec-concat-traits.csv"
pho_data = read.csv(pho_trait_file, header=TRUE)
pho_data = pho_data[order(pho_data$ggtree),]
pho_branch_labels = pho_data
pho_branch_labels$node[pho_branch_labels$node.type=='tip'] = NA
pho_data$clade = as.character(pho_data$clade)
# Phodopus data

pho_mg_data = read.csv("../../data/mol-evol/phodopus-4spec-mg94-local.csv", header=T, comment.char="#")
pho_mg_data$clade = as.character(pho_mg_data$clade)
#names(pho_mg_data)[8] = "n1"
#names(pho_mg_data)[9] = "n2"
#names(pho_mg_data)[11] = "dn"
#names(pho_mg_data)[12] = "ds"
#pho_mg_data = subset(pho_mg_data, select=-c(n1, n2))
#pho_mg_data = subset(pho_mg_data, select=-c(nonsynonymous.bl, synonymous.bl))
# Phodopus data

0. dS distributions by branch for filtering

pho_data$avg.dnds = NA
pho_data$avg.dn = NA
pho_data$avg.ds = NA
pho_data$num.monophyletic = NA
pho_data$num.monophyletic.filtered = NA

pho_clade_data = list()
pho_clade_data_f = list()
pho_ds_dists = list()
i =  1
for(c in pho_data$clade){
  if(pho_data$node.type[pho_data$clade==c]=="ROOT"){
    next
  }
  
  cur_label = as.character(pho_data$clade.label[pho_data$clade==c])
  cur_data = subset(pho_mg_data, clade==c & ds <= 2)
  cur_data$clade.label = cur_label
  
  ds_dist = ggplot(cur_data, aes(x=ds)) +
    geom_histogram(color="#000000", fill="#d3d3d3") +
    scale_y_continuous(expand=c(0,0)) +
    xlab("dS") +
    ylab("# of genes") +
    ggtitle(pho_data$clade.label[pho_data$clade==c]) +
    theme(axis.text.x = element_text(angle=90, hjust=1, size=8))
  
  #print(ds_dist)
  #stop()
  pho_ds_dists[[i]] = ds_dist
  i = i + 1
  
  cur_data_f = subset(cur_data, ds < 0.3 & dn.ds <= 2)
  
  pho_data$num.monophyletic[pho_data$clade==c] = nrow(cur_data)
  pho_data$num.monophyletic.filtered[pho_data$clade==c] = nrow(cur_data_f)
  pho_data$avg.dnds[pho_data$clade==c] = mean(cur_data_f$dn.ds)
  pho_data$avg.dn[pho_data$clade==c] = mean(cur_data_f$dn)
  pho_data$avg.ds[pho_data$clade==c] = mean(cur_data$ds)
  
  pho_clade_data[[cur_label]] = length(cur_data[,1])
  pho_clade_data_f[[cur_label]] = cur_data_f
}

ds_plots = plot_grid(plotlist=pho_ds_dists, nrow=3)
print(ds_plots)

1. X vs. autosome dS distributions in Mus and Phodopus

pho_branches = c("pho")
pho_a = data.frame()
pho_x = data.frame()
for(b in pho_branches){
  pho_a = rbind(pho_a, subset(pho_clade_data_f[[b]], chr!="ScmyWZ3_7747_HRSCAF_7900"))
  pho_x = rbind(pho_x, subset(pho_clade_data_f[[b]], chr=="ScmyWZ3_7747_HRSCAF_7900"))
}
#pho_x = pho_x %>% group_by(id, chr, start, end) %>% 
#  summarize(ds=mean(ds, na.rm=T), dn=mean(dn, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
pho_x$label = "Phodopus X"
#pho_a = pho_a %>% group_by(id, chr, start, end) %>% 
#  summarize(ds=mean(ds, na.rm=T), dn=mean(dn, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
pho_a$label = "Phodopus Auto"

mus_branches = c("mus")
mus_a = data.frame()
mus_x = data.frame()
for(b in mus_branches){
  mus_a = rbind(mus_a, subset(mus_clade_data_f[[b]], chr!="X"))
  mus_x = rbind(mus_x, subset(mus_clade_data_f[[b]], chr=="X"))
}
#mus_x = mus_x %>% group_by(id, chr, start, end) %>% 
#  summarize(ds=mean(ds, na.rm=T), dn=mean(dn, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
mus_x$label = "Mus X"
#mus_a = mus_a %>% group_by(id, chr, start, end) %>% 
#  summarize(ds=mean(ds, na.rm=T), dn=mean(dn, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
mus_a$label = "Mus Auto"

chrome_data = rbind(pho_x, pho_a, mus_x, mus_a)

# t = as.data.frame(table(chrome_data$chr))
# test = ggplot(t, aes(x=Var1, y=Freq)) +
#   geom_bar(stat="identity") +
#   scale_y_continuous(expand=c(0,0)) +
#   bartheme() +
#   theme()
# print(test)


#clade_cols = as.character(rodent_data$clade.col)
#clade_labs = as.character(rodent_data$clade.label)

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))

chrome_ds_p = ggplot(chrome_data, aes(label, ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("ds") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")

print(chrome_ds_p)

2. X vs. autosome dN distributions in Mus and Phodopus

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))

chrome_dn_p = ggplot(chrome_data, aes(label, dn, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")

print(chrome_dn_p)

3. X vs. autosome dN/dS distributions in Mus and Phodopus

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))

chrome_dnds_p = ggplot(chrome_data, aes(label, dn.ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN/dS") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")

print(chrome_dnds_p)

4. X vs. autosome dS distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions

Estimates are averaged across all branches in clade.

pho_xr = subset(pho_x, start < 35225145)
pho_xr$label = "Phodopus X recombining"
pho_xnr = subset(pho_x, start >= 35225145)
pho_xnr$label = "Phodopus X non-recombining"

chrome_data = rbind(mus_a, mus_x, pho_a, pho_xr, pho_xnr)

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

chrome_ds_p = ggplot(chrome_data, aes(label, ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("ds") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10)
        )

print(chrome_ds_p)

5. X vs. autosome dN distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions

Estimates are averaged across all branches in clade.

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

chrome_dn_p = ggplot(chrome_data, aes(label, dn, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10)
        )

print(chrome_dn_p)

6. X vs. autosome dN/dS distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions

Single estimate for whole clade.

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

chrome_dnds_p = ggplot(chrome_data, aes(label, dn.ds, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosomes") +
  ylab("dN/dS") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10)
        )

print(chrome_dnds_p)

7. HyPhy MG94 local summary stats

t6_data = data.frame("chrome"=c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X non-recombining", "Phodopus X recombining"), 
                     "genes"=c(length(mus_a[,1]), length(mus_x[,1]), length(pho_a[,1]), length(pho_xnr[,1]), length(pho_xr[,1])), 
                     "avg.ds"=c(mean(mus_a$ds), mean(mus_x$ds), mean(pho_a$ds), mean(pho_xnr$ds), mean(pho_xr$ds)), 
                     "avg.dn"=c(mean(mus_a$dn), mean(mus_x$dn), mean(pho_a$dn), mean(pho_xnr$dn), mean(pho_xr$dn)), 
                     "avg.dnds"=c(mean(mus_a$dn.ds), mean(mus_x$dn.ds), mean(pho_a$dn.ds), mean(pho_xnr$dn.ds), mean(pho_xr$dn.ds))
                     )

rownames(t6_data) = c()

t6_data %>% kable(col.names=c("Chromosome","Number of genes", "Avg. dS", "Avg. dN", "Avg. dN/dS"), caption="Substitution rates for Mus and Phodopus branches", align="l") %>% kable_styling(bootstrap_options=c("striped", "condensed", "responsive"), full_width=F)# %>%

Substitution rates for Mus and Phodopus branches
Chromosome	Number of genes	Avg. dS	Avg. dN	Avg. dN/dS
Mus Auto	14656	0.1582118	0.0201877	0.1572215
Mus X	515	0.1300482	0.0206157	0.1905945
Phodopus Auto	12886	0.1243969	0.0176781	0.1796804
Phodopus X non-recombining	306	0.1005724	0.0189231	0.2208170
Phodopus X recombining	166	0.1098163	0.0212593	0.2361039

  #column_spec(1, width="7em") %>%
  #column_spec(2, width="13em")

Amino acid profile change

Ancestral reconstructions with M1 free-ratio model, branches leading to Mus clade and Phodopus Clade

pho_aa_data = read.csv("../../data/mol-evol/phodopus-5spec-m1-anc.csv", header=T, comment.char="#")
pho_aa_data = subset(pho_aa_data, clade=="pcam;prob;psun")

pho_aa_x = subset(pho_aa_data, chr=="ScmyWZ3_7747_HRSCAF_7900")
pho_aa_x$label = "Phodopus X"

pho_aa_auto = subset(pho_aa_data, chr!="ScmyWZ3_7747_HRSCAF_7900")
pho_aa_auto$label = "Phodopus Auto"


mus_aa_data = read.csv("../../data/mol-evol/mus-4spec-m1-anc.csv", header=T, comment.char="#")
mus_aa_data = subset(mus_aa_data, clade=="mcar;mmus;mpah")

mus_aa_x = subset(mus_aa_data, chr=="X")
mus_aa_x$label = "Mus X"

mus_aa_auto = subset(mus_aa_data, chr!="X")
mus_aa_auto$label = "Mus Auto"

aa_data = rbind(pho_aa_x, pho_aa_auto, mus_aa_x, mus_aa_auto)
x_comps = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"), c("Mus Auto", "Phodopus Auto"), c("Mus X", "Phodopus X"))


chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")

aa_p = ggplot(aa_data, aes(label, avg.sub.score, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Branch") +
  ylab("Avg. BLOSUM score per gene") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none")
print(aa_p)

pho_aa_xr = subset(pho_aa_x, start < 35225145)
pho_aa_xr$label = "Phodopus X recombining"

pho_aa_xn = subset(pho_aa_x, start >= 35225145)
pho_aa_xn$label = "Phodopus X non-recombining"

aa_data = rbind(pho_aa_xr, pho_aa_xn, pho_aa_auto, mus_aa_x, mus_aa_auto)

chrome_cols = c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_labs = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")

x_comps = list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))

aa_p = ggplot(aa_data, aes(label, avg.sub.score, fill=label)) +
  geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Branch") +
  ylab("Avg. BLOSUM score per gene") +
  scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=10, hjust=1, size=10))
print(aa_p)

Results summary

Lower dS on X in both Mus and Phodopus
Faster dN/dS on X in branches leading to Mus and Phodopus (M1), Mus clade (M2), but NOT phodopus clade (M2). Results consistent between PAML’s M1 model and HyPhy’s FitMG94.
No difference between recombining and non-recombining portions of the X in phodopus.
No big difference between amino acid profile changes between Mus and Phodopus or Autosomes and X.

< Back Home

cat("Page last updated:", format(Sys.time(), "%m/%d/%Y %H:%M:%S %Z"))

## Page last updated: 07/22/2021 15:02:03 MDT

htmltools::includeHTML("../html-chunks/rmd_footer.html")