Phodopus molecular evolution
#knitr::opts_chunk$set(echo = FALSE)
#this.dir <- dirname(parent.frame(2)$ofile)
#setwd(this.dir)
library(ggtree)
library(ggplot2)
library(ggbeeswarm)
library(cowplot)
library(ggsignif)
library(dplyr)
library(colorspace)
library(kableExtra)
source("../lib/design.r")
#htmltools::includeHTML("../html-chunks/nav.html")
Main questions
- Does lack of recombination affect molecular evolution on the X chromosome in phodopus?
Mus and Phodopus species tree branch rates
Goal: compare rate distributions for TWO small sets of rodent species, including phodopus, to determine how lack of recombination on the phodopus X shapes molecular evolution on that chromosome.
= "../../data/mol-evol/phodopus-5spec-iqtree-concat-rooted.tre"
pho_tree_file = read.tree(pho_tree_file)
pho_tree = "../../data/mol-evol/phodopus-5spec-concat-traits.csv"
pho_trait_file = read.csv(pho_trait_file, header=TRUE)
pho_data = pho_data[order(pho_data$ggtree),]
pho_data = pho_data
pho_branch_labels $node[pho_branch_labels$node.type=='tip'] = NA
pho_branch_labels$clade = as.character(pho_data$clade)
pho_data
= read.csv("../../data/mol-evol/phodopus-5spec-m1.csv", header=T, comment.char="#")
pho_m1_data $clade = as.character(pho_m1_data$clade)
pho_m1_data# Phodopus data
= "../../data/mol-evol/mus-4spec-iqtree-concat-rooted.tre"
mus_tree_file = read.tree(mus_tree_file)
mus_tree = "../../data/mol-evol/mus-4spec-concat-traits.csv"
mus_trait_file = read.csv(mus_trait_file, header=TRUE)
mus_data = mus_data[order(mus_data$ggtree),]
mus_data = mus_data
mus_branch_labels $node[mus_branch_labels$node.type=='tip'] = NA
mus_branch_labels$clade = as.character(mus_data$clade)
mus_data
= read.csv("../../data/mol-evol/mus-4spec-m1.csv", header=T, comment.char="#")
mus_m1_data $clade = as.character(mus_m1_data$clade)
mus_m1_data# Mus data
$avg.dnds = NA
pho_data$avg.dn = NA
pho_data$avg.ds = NA
pho_data$num.monophyletic = NA
pho_data$num.monophyletic.filtered = NA
pho_data
= list()
pho_clade_data = list()
pho_clade_data_f = list()
pho_ds_dists = 1
i for(c in pho_data$clade){
if(pho_data$node.type[pho_data$clade==c]=="ROOT"){
next
}
= as.character(pho_data$clade.label[pho_data$clade==c])
cur_label = subset(pho_m1_data, clade==c & clade.ds <= 2)
cur_data $clade.label = cur_label
cur_data
= ggplot(cur_data, aes(x=clade.ds)) +
ds_dist geom_histogram(color="#000000", fill="#d3d3d3") +
scale_y_continuous(expand=c(0,0)) +
xlab("dS") +
ylab("# of genes") +
ggtitle(pho_data$clade.label[pho_data$clade==c]) +
theme(axis.text.x = element_text(angle=90, hjust=1, size=8))
#print(ds_dist)
#stop()
= ds_dist
pho_ds_dists[[i]] = i + 1
i
= subset(cur_data, clade.ds < 0.3 & clade.dn.ds <= 2)
cur_data_f
$num.monophyletic[pho_data$clade==c] = nrow(cur_data)
pho_data$num.monophyletic.filtered[pho_data$clade==c] = nrow(cur_data_f)
pho_data$avg.dnds[pho_data$clade==c] = mean(cur_data_f$clade.dn.ds)
pho_data$avg.dn[pho_data$clade==c] = mean(cur_data_f$clade.dn)
pho_data$avg.ds[pho_data$clade==c] = mean(cur_data$clade.ds)
pho_data
= length(cur_data[,1])
pho_clade_data[[cur_label]] = cur_data_f
pho_clade_data_f[[cur_label]] }
$avg.dnds = NA
mus_data$avg.dn = NA
mus_data$avg.ds = NA
mus_data$num.monophyletic = NA
mus_data$num.monophyletic.filtered = NA
mus_data
= list()
mus_clade_data = list()
mus_clade_data_f = list()
mus_ds_dists = 1
i for(c in mus_data$clade){
if(mus_data$node.type[mus_data$clade==c]=="ROOT"){
next
}
= as.character(mus_data$clade.label[mus_data$clade==c])
cur_label = subset(mus_m1_data, clade==c & clade.ds <= 2)
cur_data $clade.label = cur_label
cur_data
= ggplot(cur_data, aes(x=clade.ds)) +
ds_dist geom_histogram(color="#000000", fill="#d3d3d3") +
scale_y_continuous(expand=c(0,0)) +
xlab("dS") +
ylab("# of genes") +
ggtitle(mus_data$clade.label[mus_data$clade==c]) +
theme(axis.text.x = element_text(angle=90, hjust=1, size=8))
#print(ds_dist)
#stop()
= ds_dist
mus_ds_dists[[i]] = i + 1
i
= subset(cur_data, clade.ds < 0.3 & clade.dn.ds <= 2)
cur_data_f
$num.monophyletic[mus_data$clade==c] = nrow(cur_data)
mus_data$num.monophyletic.filtered[mus_data$clade==c] = nrow(cur_data_f)
mus_data$avg.dnds[mus_data$clade==c] = mean(cur_data_f$clade.dn.ds)
mus_data$avg.dn[mus_data$clade==c] = mean(cur_data_f$clade.dn)
mus_data$avg.ds[mus_data$clade==c] = mean(cur_data$clade.ds)
mus_data
= length(cur_data[,1])
mus_clade_data[[cur_label]] = cur_data_f
mus_clade_data_f[[cur_label]] }
Methods
Step 0: Genome assembly and annotation (expand later)
Phodopus sungorous was de novo assembled with Dovetails, and campbelli and robovorski were assembled based on iterative mapping to the sungorus assembly. Annotation was done using the Maker pipeline (Seb’s first attempt).
Step 1: Select orthologs between 5 hamster species and 3 mouse+rat species separately
We selected 2 previously sequenced hamster species to estimate and compare rates of synonymous and non-synonymous substitutions with our set of newly sequenced Phodopus genomes. Separately, we selected 3 mouse species + rat to compare rates between phodopus and mus (Table 1). The CDS sequences for these species were downloaded from Ensembl (release 100).
= subset(pho_data, node.type=='tip')
t1_data_pho = subset(t1_data_pho, select=c(species, common, source))
t1_data_pho $set = "Phodopus"
t1_data_phonames(t1_data_pho) = c("Species", "Common name", "Source", "Dataset")
rownames(t1_data_pho) = c()
# Phodopus data
= subset(mus_data, node.type=='tip')
t1_data_mus = subset(t1_data_mus, select=c(species, common, source))
t1_data_mus $set = "Mus"
t1_data_musnames(t1_data_mus) = c("Species", "Common name", "Source", "Dataset")
rownames(t1_data_mus) = c()
# Mus data
= rbind(t1_data_pho, t1_data_mus)
t1_data
%>% kable(caption="Selected species for comparative molecular evolution analysis") %>% kable_styling(bootstrap_options=c("striped", "condensed", "responsive"), full_width=F) t1_data
Species | Common name | Source | Dataset |
---|---|---|---|
Cricetulus_griseus | Chinese hamster | Ensembl | Phodopus |
Mesocricetus_auratus | Golden hamster | Ensembl | Phodopus |
Phodopus_campbelli | Campbell’s dwarf hamster | This study | Phodopus |
Phodopus_sungorus | Djungarian hamster | This study | Phodopus |
Phodopus_robovorski | Roborvorski hamster | This study | Phodopus |
Rattus_norvegicus | Brown rat | Ensembl | Mus |
Mus_musculus | House mouse | Ensembl | Mus |
Mus_caroli | Ryukyu mouse | Ensembl | Mus |
Mus_pahari | Gairdner’s shrewmouse | Ensembl | Mus |
Phylogenetic and comparative analyses often rely on selection of a single sequence from each species to assess relatedness of genes. However because genes can have multiple transcripts due to alternative splicing of exons and because genes can be duplicated or lost over time this requires selection of sequences in each species at two levels: 1) selection of a single transcript within a gene and 2) selection of genes without paralogs (i.e. those that are single copy across all species). The most common method for selection of transcripts is to simply pick the longest transcript in each species.
Mus transcript selection:
- Total Mus transcripts: 142,447
- Longest transcripts: 21,875
- Single-copy between 4 species: 15,772
- X-genes before alignments: 551
- X-genes after alignments: 526
Phodopus transcript selection:
- Total transcripts predicted from Maker: 27,988
- Longest BLAST match to CGRI transcripts: 14,768
- Also single-copy in MAUR: 12,413
- X-genes before alignments: 493
- X-genes after alignments: 413
Step 2: Align orthologs
We aligned the coding sequences from each single-copy ortholog using a suite of programs in the MACSE software as follows:
- Trim non-homologous regions with MACSE’s trimNonHomologousFragments program
- Codon aware alignments with MACSE’s alignSequences program
- Trim gappy edges from alignment with MACSE’s trimAlignment program
- Removed alignments with premature stop codons
Step 3: Construct species and gene trees
We constructed maximum likelihood gene trees and a concatenated species tree using IQ-tree aligned coding sequences. We used IQ-trees concordance factors to assess the confidence of the species tree as well as the consistency of the gene trees. We find that most branches in the species tree are consistently present in a high proportion of gene trees (>90% for all branches).
= F
node_check
##########
# Node label checking
if(node_check){
= ggtree(pho_tree, size=2, ladderize=F) +
node_test ::xlim(0, 0.05) +
ggplot2geom_tiplab(color="#333333", fontface='italic', size=5) +
geom_text(aes(label=node), hjust=-.3, vjust=-.3, color="#ff6db6") +
geom_nodepoint(color="#666666", alpha=0.85, size=4)
#node_test = node_test %>% rotate(25) %>% rotate(26) %>% rotate(27) %>% rotate(28) %>% rotate(29) %>% rotate(30) %>% rotate(32) %>%
# rotate(33) %>% rotate(35) %>% rotate(36) %>% rotate(39) %>% rotate(41) %>% rotate(44)
print(node_test)
stop("Node check ok")
}# Node label checking
##########
= corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l
= corecol(numcol=3)
clade_cols = c("pho", "cs")
clade_labs
= ggtree(pho_tree, size=2, ladderize=F, aes(color=pho_data$gcf)) +
pho_tree_fig scale_color_continuous(name='gCF', low=l, high=h, limits=c(0,100)) +
::xlim(0, 0.07) +
ggplot2geom_tiplab(aes(label=pho_data$species), color="#333333", fontface='italic', size=5) +
#geom_cladelabel(node=13, size=2, label="pho", color="green", offset = 0.07, align=T) +
#geom_text(aes(label=rodent_data$label), hjust=-.3, vjust=-.3, color="#ff6db6") +
geom_label(aes(x=branch, label=pho_branch_labels$branch), color="#d3d3d3", fill=pho_branch_labels$clade.col) +
#labs(caption="Figure 1: 5 species hamster phylogeny constructed from concatenation of XX single-copy orthologs.\nGene concordance factors (gCF) are indicated by branch colors.") +
theme(legend.position=c(0.9,0.25),
plot.caption=element_text(hjust=0))
= ggtree(mus_tree, size=2, ladderize=F, aes(color=mus_data$gcf)) +
mus_tree_fig scale_color_continuous(name='gCF', low=l, high=h, limits=c(0,100)) +
::xlim(0, 0.07) +
ggplot2geom_tiplab(aes(label=mus_data$species), color="#333333", fontface='italic', size=5) +
#geom_cladelabel(node=13, size=2, label="pho", color="green", offset = 0.07, align=T) +
#geom_text(aes(label=rodent_data$label), hjust=-.3, vjust=-.3, color="#ff6db6") +
geom_label(aes(x=branch, label=mus_branch_labels$branch), color="#d3d3d3", fill=mus_branch_labels$clade.col) +
#labs(caption="Figure 1: 5 species hamster phylogeny constructed from concatenation of XX single-copy orthologs.\nGene concordance factors (gCF) are indicated by branch colors.") +
theme(legend.position=c(0.9,0.25),
plot.caption=element_text(hjust=0))
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
#print(mus_tree_fig)
= plot_grid(pho_tree_fig, mus_tree_fig, nrow=1, align='vh')
tree_fig = "Figure 1: (left) 5 species hamster phylogeny constructed from concatenation of 10,300 single-copy orthologs.\n(right) 4 species mouse phylogeny constructed from concatenation of 15,427 single-copy orthologes\nGene concordance factors (gCF) are indicated by branch colors."
cap = ggdraw() + draw_label(cap, size=10, fontface="bold", x=0, hjust=0) + theme(plot.margin=margin(0,0,0,7))
pcap = plot_grid(tree_fig, pcap, ncol=1, rel_heights=c(1, 0.1))
tree_p print(tree_p)
Step 4: Estimation of synonymous (dS) and nonysynonymous (dN) substitution rates
We used codeml within the PAML program to estimate rates of synonymous (dS) and non-synonymous (dN) substitutions using a free-ratio model (model=1, NSsites=0) to get an estimate of rate for each branch in our phylogeny. Relationships among genes may differ amongst themselves and relative to the species relationships due to biological factors such as incomplete lineage sorting and introgression. In cases where a gene tree and the species tree disagree, inferring substitutions using the species tree (the wrong tree in this case) can lead to spurious inferences of substitutions and affect rate estimates. To mitigate this, we used the corresponding gene tree for each alignment when estimating rates. When compiling rate estimates, we only counted rates for a branch in the species tree if it existed in the gene tree (Table 2).
# t2_data = data.frame("Branch"=c(), "Gene trees"=c(), "Genes after dS filter"=c())
# for(clade_label in pho_data$clade.label){
# if(pho_data$node.type[pho_data$clade.label==clade_label]!="internal"){
# next
# }
#
# t2_data = rbind(t2_data, data.frame("Branch"=clade_label, "Gene trees"=clade_data[[clade_label]], "Genes after dS filter"=length(clade_data_f[[clade_label]][,1])))
# }
#
# rownames(t2_data) = c()
#
# t2_data %>% kable(col.names=c("Branch","Monophyletic gene trees", "Genes after dS filter"), caption="Number of gene trees containing branch before and after dS filtering", align="l") %>% kable_styling(bootstrap_options=c("striped", "condensed", "responsive"), full_width=F)# %>%
# #column_spec(1, width="7em") %>%
# #column_spec(2, width="13em")
Filtering based on dS
As another check on alignment quality, we assessed distributions of dS along each branch and only counted genes for each branch where dS fell below 0.3. (QUESTION: should this dS filter step have a different threshold for each branch?) We also omitted genes for a branch where dN/dS was greater than 2.
= plot_grid(plotlist=pho_ds_dists, nrow=3)
ds_plots print(ds_plots)
= plot_grid(plotlist=mus_ds_dists, nrow=3)
ds_plots print(ds_plots)
Post filter dS distributions per branch
= do.call("rbind", pho_clade_data_f)
pho_m1_data_f = do.call("rbind", mus_clade_data_f)
mus_m1_data_f = rbind(pho_m1_data_f, mus_m1_data_f)
m1_data_f
#clade_cols = as.character(pho_data$clade.col)
#clade_labs = as.character(pho_data$clade.label)
= list(c("mus", "pho"))
x_comps
= ggplot(m1_data_f, aes(clade.label, clade.ds)) +
clade_ds_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5, fill="#333333") +
#geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Branch") +
ylab("dS") +
#scale_fill_manual(limits=clade_labs, values=clade_cols) +
bartheme() +
theme(legend.position="none",
axis.text.x=element_text(angle=25, hjust=1, size=8))
print(clade_ds_p)
Post filter dN distributions per branch
= as.character(pho_data$clade.col)
clade_cols = as.character(pho_data$clade.label)
clade_labs
#x_comps = list(c("mus", "pho"))
= ggplot(m1_data_f, aes(clade.label, clade.dn)) +
clade_dn_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5, fill="#333333") +
#geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Branch") +
ylab("dN") +
scale_fill_manual(limits=clade_labs, values=clade_cols) +
bartheme() +
theme(legend.position="none",
axis.text.x = element_text(angle=25, hjust=1, size=8)
)
print(clade_dn_p)
Post filter dN/dS distributions per branch
= as.character(pho_data$clade.col)
clade_cols = as.character(pho_data$clade.label)
clade_labs
= list(c("mus", "pho"))
x_comps
= ggplot(m1_data_f, aes(clade.label, clade.dn.ds)) +
clade_dnds_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5, fill="#333333") +
#geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Branch") +
ylab("dN/dS") +
#scale_fill_manual(limits=clade_labs, values=clade_cols) +
bartheme() +
theme(legend.position="none",
axis.text.x=element_text(angle=25, hjust=1, size=8))
print(clade_dnds_p)
Results (M1 branch leading to Phodopus and Mus only)
1. X vs. autosome dS distributions in Mus and Phodopus
= c("pho")
pho_branches = data.frame()
pho_a = data.frame()
pho_x for(b in pho_branches){
= rbind(pho_a, subset(pho_clade_data_f[[b]], chr!="ScmyWZ3_7747_HRSCAF_7900"))
pho_a = rbind(pho_x, subset(pho_clade_data_f[[b]], chr=="ScmyWZ3_7747_HRSCAF_7900"))
pho_x
}$label = "Phodopus X"
pho_x$label = "Phodopus Auto"
pho_a
= c("mus")
mus_branches = data.frame()
mus_a = data.frame()
mus_x for(b in mus_branches){
= rbind(mus_a, subset(mus_clade_data_f[[b]], chr!="X"))
mus_a = rbind(mus_x, subset(mus_clade_data_f[[b]], chr=="X"))
mus_x
}$label = "Mus X"
mus_x$label = "Mus Auto"
mus_a
= rbind(pho_x, pho_a, mus_x, mus_a)
chrome_data
# t = as.data.frame(table(chrome_data$chr))
# test = ggplot(t, aes(x=Var1, y=Freq)) +
# geom_bar(stat="identity") +
# scale_y_continuous(expand=c(0,0)) +
# bartheme() +
# theme()
# print(test)
#clade_cols = as.character(rodent_data$clade.col)
#clade_labs = as.character(rodent_data$clade.label)
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")
chrome_labs
= list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))
x_comps
= ggplot(chrome_data, aes(label, clade.ds, fill=label)) +
chrome_ds_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("ds") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none")
print(chrome_ds_p)
2. X vs. autosome dN distributions in Mus and Phodopus
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")
chrome_labs
= list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))
x_comps
= ggplot(chrome_data, aes(label, clade.dn, fill=label)) +
chrome_dn_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("dN") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none")
print(chrome_dn_p)
3. X vs. autosome dN/dS distributions in Mus and Phodopus
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")
chrome_labs
= list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))
x_comps
= ggplot(chrome_data, aes(label, clade.dn.ds, fill=label)) +
chrome_dnds_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("dN/dS") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none")
print(chrome_dnds_p)
= data.frame("chrome"=c("Mus auto", "Mus X", "Phodopus auto", "Phodopus X"),
t3_data "genes"=c(length(mus_a[,1]), length(mus_x[,1]), length(pho_a[,1]), length(pho_x[,1])),
"avg.ds"=c(mean(mus_a$clade.ds), mean(mus_x$clade.ds), mean(pho_a$clade.ds), mean(pho_x$clade.ds)),
"avg.dn"=c(mean(mus_a$clade.dn), mean(mus_x$clade.dn), mean(pho_a$clade.dn), mean(pho_x$clade.dn)),
"avg.dnds"=c(mean(mus_a$clade.dn.ds), mean(mus_x$clade.dn.ds), mean(pho_a$clade.dn.ds), mean(pho_x$clade.dn.ds)))
rownames(t3_data) = c()
%>% kable(col.names=c("Chromosome","Number of genes", "Avg. dS", "Avg. dN", "Avg. dN/dS"), caption="Substitution rates for Mus and Phodopus branches", align="l") %>% kable_styling(bootstrap_options=c("striped", "condensed", "responsive"), full_width=F)# %>% t3_data
Chromosome | Number of genes | Avg. dS | Avg. dN | Avg. dN/dS |
---|---|---|---|---|
Mus auto | 14695 | 0.1476831 | 0.0214826 | 0.1511614 |
Mus X | 517 | 0.1167536 | 0.0223234 | 0.1857207 |
Phodopus auto | 9581 | 0.0604657 | 0.0095896 | 0.1749161 |
Phodopus X | 399 | 0.0463514 | 0.0099188 | 0.2132867 |
#column_spec(1, width="7em") %>%
#column_spec(2, width="13em")
4. X vs. autosome dS distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions
# if scaff == "ScmyWZ3_7747_HRSCAF_7900":
# if int(end) < 35225145:
# scaff = "ScmyWZ3_7747_HRSCAF_7900_R";
# chrome = "chrX_R";
# elif int(start) >= 35225145:
# scaff = "ScmyWZ3_7747_HRSCAF_7900_NR"
# chrome = "chrX_NR";
# else:
# chrome = "chrX";
# elif scaff in scaff_to_chr:
# chrome = scaff_to_chr[scaff];
# else:
# chrome = "NA";
= subset(pho_x, start < 35225145)
pho_xr $label = "Phodopus X recombining"
pho_xr= subset(pho_x, start >= 35225145)
pho_xnr $label = "Phodopus X non-recombining"
pho_xnr
= rbind(mus_a, mus_x, pho_a, pho_xr, pho_xnr)
chrome_data
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")
chrome_labs
= list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))
x_comps
= ggplot(chrome_data, aes(label, clade.ds, fill=label)) +
chrome_ds_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("ds") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none",
axis.text.x = element_text(angle=10, hjust=1, size=10)
)
print(chrome_ds_p)
5. X vs. autosome dN distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")
chrome_labs
= list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))
x_comps
= ggplot(chrome_data, aes(label, clade.dn, fill=label)) +
chrome_dn_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("dN") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none",
axis.text.x = element_text(angle=10, hjust=1, size=10)
)
print(chrome_dn_p)
6. X vs. autosome dN/dS distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")
chrome_labs
= list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))
x_comps
= ggplot(chrome_data, aes(label, clade.dn.ds, fill=label)) +
chrome_dnds_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("dN/dS") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none",
axis.text.x = element_text(angle=10, hjust=1, size=10)
)
print(chrome_dnds_p)
7. M1 summary stats
= data.frame("chrome"=c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X non-recombining", "Phodopus X recombining"),
t4_data "genes"=c(length(mus_a[,1]), length(mus_x[,1]), length(pho_a[,1]), length(pho_xnr[,1]), length(pho_xr[,1])),
"avg.ds"=c(mean(mus_a$clade.ds), mean(mus_x$clade.ds), mean(pho_a$clade.ds), mean(pho_xnr$clade.ds), mean(pho_xr$clade.ds)),
"avg.dn"=c(mean(mus_a$clade.dn), mean(mus_x$clade.dn), mean(pho_a$clade.dn), mean(pho_xnr$clade.dn), mean(pho_xr$clade.dn)),
"avg.dnds"=c(mean(mus_a$clade.dn.ds), mean(mus_x$clade.dn.ds), mean(pho_a$clade.dn.ds), mean(pho_xnr$clade.dn.ds), mean(pho_xr$clade.dn.ds))
)
rownames(t4_data) = c()
%>% kable(col.names=c("Chromosome","Number of genes", "Avg. dS", "Avg. dN", "Avg. dN/dS"), caption="Substitution rates for Mus and Phodopus branches", align="l") %>% kable_styling(bootstrap_options=c("striped", "condensed", "responsive"), full_width=F)# %>% t4_data
Chromosome | Number of genes | Avg. dS | Avg. dN | Avg. dN/dS |
---|---|---|---|---|
Mus Auto | 14695 | 0.1476831 | 0.0214826 | 0.1511614 |
Mus X | 517 | 0.1167536 | 0.0223234 | 0.1857207 |
Phodopus Auto | 9581 | 0.0604657 | 0.0095896 | 0.1749161 |
Phodopus X non-recombining | 257 | 0.0442354 | 0.0098794 | 0.2236335 |
Phodopus X recombining | 142 | 0.0501810 | 0.0099901 | 0.1945606 |
#column_spec(1, width="7em") %>%
#column_spec(2, width="13em")
Results (Clade model C; not useful?)
= read.csv("../../data/mol-evol/phodopus-5spec-cmc.csv", header=TRUE, comment.char="#")
cmc_data
= subset(cmc_data, chr!="ScmyWZ3_7747_HRSCAF_7900")
cmc_a $label = "Phodopus Auto"
cmc_a
= subset(cmc_a, select=c(label, background.clade.omega.1))
cmc_a_1_b $class = "Background omega Auto 1"
cmc_a_1_bnames(cmc_a_1_b)[2] = "omega"
= subset(cmc_a, select=c(label, background.clade.omega.2))
cmc_a_2_b $class = "Background omega Auto 2"
cmc_a_2_bnames(cmc_a_2_b)[2] = "omega"
= subset(cmc_a, select=c(label, background.clade.omega.3))
cmc_a_3_b $class = "Background omega Auto 3"
cmc_a_3_bnames(cmc_a_3_b)[2] = "omega"
= subset(cmc_a, select=c(label, target.clade.omega.1))
cmc_a_1_t$class = "Target omega Auto 1"
cmc_a_1_tnames(cmc_a_1_t)[2] = "omega"
= subset(cmc_a, select=c(label, target.clade.omega.2))
cmc_a_2_t $class = "Target omega Auto 2"
cmc_a_2_tnames(cmc_a_2_t)[2] = "omega"
= subset(cmc_a, select=c(label, target.clade.omega.3))
cmc_a_3_t $class = "Target omega Auto 3"
cmc_a_3_tnames(cmc_a_3_t)[2] = "omega"
= subset(cmc_data, chr!="ScmyWZ3_7747_HRSCAF_7900")
cmc_x $label = "Phodopus X"
cmc_x
= subset(cmc_x, select=c(label, background.clade.omega.1))
cmc_x_1_b $class = "Background omega X 1"
cmc_x_1_bnames(cmc_x_1_b)[2] = "omega"
= subset(cmc_x, select=c(label, background.clade.omega.2))
cmc_x_2_b $class = "Background omega X 2"
cmc_x_2_bnames(cmc_x_2_b)[2] = "omega"
= subset(cmc_x, select=c(label, background.clade.omega.3))
cmc_x_3_b $class = "Background omega X 3"
cmc_x_3_bnames(cmc_x_3_b)[2] = "omega"
= subset(cmc_x, select=c(label, target.clade.omega.1))
cmc_x_1_t$class = "Target omega X 1"
cmc_x_1_tnames(cmc_x_1_t)[2] = "omega"
= subset(cmc_x, select=c(label, target.clade.omega.2))
cmc_x_2_t $class = "Target omega X 2"
cmc_x_2_tnames(cmc_x_2_t)[2] = "omega"
= subset(cmc_x, select=c(label, target.clade.omega.3))
cmc_x_3_t $class = "Target omega X 3"
cmc_x_3_tnames(cmc_x_3_t)[2] = "omega"
= rbind(cmc_a_1_b, cmc_a_2_b, cmc_a_3_b, cmc_a_1_t, cmc_a_2_t, cmc_a_3_t,
cmc_data_long
cmc_x_1_b, cmc_x_2_b, cmc_x_3_b, cmc_x_1_t, cmc_x_2_t, cmc_x_3_t)
= subset(cmc_data_long, omega < 2)
cmc_data_long
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")
chrome_labs
= list(c("Target omega X 3", "Target omega Auto 3"))
x_comps
= ggplot(cmc_data_long, aes(class, omega, fill=class)) +
cmc_dnds_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("dN/dS") +
#scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none",
axis.text.x = element_text(angle=25, hjust=1, size=10)
)
print(cmc_dnds_p)
Results (M2 when setting all phodopus branches and all mus branches as targets)
= read.csv("../../data/mol-evol/phodopus-5spec-m2.csv", header=T, comment.char="#")
pho_m2_data $clade = as.character(pho_m2_data$clade)
pho_m2_data# Phodopus data
= read.csv("../../data/mol-evol/mus-4spec-m2.csv", header=T, comment.char="#")
mus_m2_data $clade = as.character(mus_m2_data$clade)
mus_m2_data# Mus data
0. dS distributions by branch for filtering
$avg.dnds = NA
pho_data$avg.dn = NA
pho_data$avg.ds = NA
pho_data$num.monophyletic = NA
pho_data$num.monophyletic.filtered = NA
pho_data
= list()
pho_clade_data = list()
pho_clade_data_f = list()
pho_ds_dists = 1
i for(c in pho_data$clade){
if(pho_data$node.type[pho_data$clade==c]=="ROOT"){
next
}
= as.character(pho_data$clade.label[pho_data$clade==c])
cur_label = subset(pho_m2_data, clade==c & clade.ds <= 2)
cur_data $clade.label = cur_label
cur_data
= ggplot(cur_data, aes(x=clade.ds)) +
ds_dist geom_histogram(color="#000000", fill="#d3d3d3") +
scale_y_continuous(expand=c(0,0)) +
xlab("dS") +
ylab("# of genes") +
ggtitle(pho_data$clade.label[pho_data$clade==c]) +
theme(axis.text.x = element_text(angle=90, hjust=1, size=8))
#print(ds_dist)
#stop()
= ds_dist
pho_ds_dists[[i]] = i + 1
i
= subset(cur_data, clade.ds < 0.3 & clade.dn.ds <= 2)
cur_data_f
$num.monophyletic[pho_data$clade==c] = nrow(cur_data)
pho_data$num.monophyletic.filtered[pho_data$clade==c] = nrow(cur_data_f)
pho_data$avg.dnds[pho_data$clade==c] = mean(cur_data_f$clade.dn.ds)
pho_data$avg.dn[pho_data$clade==c] = mean(cur_data_f$clade.dn)
pho_data$avg.ds[pho_data$clade==c] = mean(cur_data$clade.ds)
pho_data
= length(cur_data[,1])
pho_clade_data[[cur_label]] = cur_data_f
pho_clade_data_f[[cur_label]] }
$avg.dnds = NA
mus_data$avg.dn = NA
mus_data$avg.ds = NA
mus_data$num.monophyletic = NA
mus_data$num.monophyletic.filtered = NA
mus_data
= list()
mus_clade_data = list()
mus_clade_data_f = list()
mus_ds_dists = 1
i for(c in mus_data$clade){
if(mus_data$node.type[mus_data$clade==c]=="ROOT"){
next
}
= as.character(mus_data$clade.label[mus_data$clade==c])
cur_label = subset(mus_m2_data, clade==c & clade.ds <= 2)
cur_data $clade.label = cur_label
cur_data
= ggplot(cur_data, aes(x=clade.ds)) +
ds_dist geom_histogram(color="#000000", fill="#d3d3d3") +
scale_y_continuous(expand=c(0,0)) +
xlab("dS") +
ylab("# of genes") +
ggtitle(mus_data$clade.label[mus_data$clade==c]) +
theme(axis.text.x = element_text(angle=90, hjust=1, size=8))
#print(ds_dist)
#stop()
= ds_dist
mus_ds_dists[[i]] = i + 1
i
= subset(cur_data, clade.ds < 0.3 & clade.dn.ds <= 2)
cur_data_f
$num.monophyletic[mus_data$clade==c] = nrow(cur_data)
mus_data$num.monophyletic.filtered[mus_data$clade==c] = nrow(cur_data_f)
mus_data$avg.dnds[mus_data$clade==c] = mean(cur_data_f$clade.dn.ds)
mus_data$avg.dn[mus_data$clade==c] = mean(cur_data_f$clade.dn)
mus_data$avg.ds[mus_data$clade==c] = mean(cur_data$clade.ds)
mus_data
= length(cur_data[,1])
mus_clade_data[[cur_label]] = cur_data_f
mus_clade_data_f[[cur_label]] }
= plot_grid(plotlist=pho_ds_dists, nrow=3)
ds_plots print(ds_plots)
= plot_grid(plotlist=mus_ds_dists, nrow=3)
ds_plots print(ds_plots)
1. X vs. autosome dS distributions in Mus and Phodopus
- Estimates are averaged across all branches in clade.
= c("cs", "psun", "pcam", "prob")
pho_branches = data.frame()
pho_a = data.frame()
pho_x for(b in pho_branches){
= rbind(pho_a, subset(pho_clade_data_f[[b]], chr!="ScmyWZ3_7747_HRSCAF_7900"))
pho_a = rbind(pho_x, subset(pho_clade_data_f[[b]], chr=="ScmyWZ3_7747_HRSCAF_7900"))
pho_x
}= pho_x %>% group_by(id, chr, start, end) %>%
pho_x summarize(ds=mean(clade.ds, na.rm=T), dn=mean(clade.dn, na.rm=T), dn.ds=mean(clade.dn.ds, na.rm=T))
## `summarise()` has grouped output by 'id', 'chr', 'start'. You can override using the `.groups` argument.
$label = "Phodopus X"
pho_x= pho_a %>% group_by(id, chr, start, end) %>%
pho_a summarize(ds=mean(clade.ds, na.rm=T), dn=mean(clade.dn, na.rm=T), dn.ds=mean(clade.dn.ds, na.rm=T))
## `summarise()` has grouped output by 'id', 'chr', 'start'. You can override using the `.groups` argument.
$label = "Phodopus Auto"
pho_a
= c("cm", "mpah", "mcar", "mmus")
mus_branches = data.frame()
mus_a = data.frame()
mus_x for(b in mus_branches){
= rbind(mus_a, subset(mus_clade_data_f[[b]], chr!="X"))
mus_a = rbind(mus_x, subset(mus_clade_data_f[[b]], chr=="X"))
mus_x
}= mus_x %>% group_by(id, chr, start, end) %>%
mus_x summarize(ds=mean(clade.ds, na.rm=T), dn=mean(clade.dn, na.rm=T), dn.ds=mean(clade.dn.ds, na.rm=T))
## `summarise()` has grouped output by 'id', 'chr', 'start'. You can override using the `.groups` argument.
$label = "Mus X"
mus_x= mus_a %>% group_by(id, chr, start, end) %>%
mus_a summarize(ds=mean(clade.ds, na.rm=T), dn=mean(clade.dn, na.rm=T), dn.ds=mean(clade.dn.ds, na.rm=T))
## `summarise()` has grouped output by 'id', 'chr', 'start'. You can override using the `.groups` argument.
$label = "Mus Auto"
mus_a
= rbind(pho_x, pho_a, mus_x, mus_a)
chrome_data
# t = as.data.frame(table(chrome_data$chr))
# test = ggplot(t, aes(x=Var1, y=Freq)) +
# geom_bar(stat="identity") +
# scale_y_continuous(expand=c(0,0)) +
# bartheme() +
# theme()
# print(test)
#clade_cols = as.character(rodent_data$clade.col)
#clade_labs = as.character(rodent_data$clade.label)
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")
chrome_labs
= list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))
x_comps
= ggplot(chrome_data, aes(label, ds, fill=label)) +
chrome_ds_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("ds") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none")
print(chrome_ds_p)
2. X vs. autosome dN distributions in Mus and Phodopus
- Estimates are averaged across all branches in clade.
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")
chrome_labs
= list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))
x_comps
= ggplot(chrome_data, aes(label, dn, fill=label)) +
chrome_dn_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("dN") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none")
print(chrome_dn_p)
3. X vs. autosome dN/dS distributions in Mus and Phodopus
- Single estimate for whole clade.
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")
chrome_labs
= list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))
x_comps
= ggplot(chrome_data, aes(label, dn.ds, fill=label)) +
chrome_dnds_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("dN/dS") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none")
print(chrome_dnds_p)
4. X vs. autosome dS distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions
- Estimates are averaged across all branches in clade.
= subset(pho_x, start < 35225145)
pho_xr $label = "Phodopus X recombining"
pho_xr= subset(pho_x, start >= 35225145)
pho_xnr $label = "Phodopus X non-recombining"
pho_xnr
= rbind(mus_a, mus_x, pho_a, pho_xr, pho_xnr)
chrome_data
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")
chrome_labs
= list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))
x_comps
= ggplot(chrome_data, aes(label, ds, fill=label)) +
chrome_ds_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("ds") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none",
axis.text.x = element_text(angle=10, hjust=1, size=10)
)
print(chrome_ds_p)
5. X vs. autosome dN distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions
- Estimates are averaged across all branches in clade.
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")
chrome_labs
= list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))
x_comps
= ggplot(chrome_data, aes(label, dn, fill=label)) +
chrome_dn_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("dN") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none",
axis.text.x = element_text(angle=10, hjust=1, size=10)
)
print(chrome_dn_p)
6. X vs. autosome dN/dS distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions
- Single estimate for whole clade.
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")
chrome_labs
= list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))
x_comps
= ggplot(chrome_data, aes(label, dn.ds, fill=label)) +
chrome_dnds_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("dN/dS") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none",
axis.text.x = element_text(angle=10, hjust=1, size=10)
)
print(chrome_dnds_p)
7. M2 summary stats
= data.frame("chrome"=c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X non-recombining", "Phodopus X recombining"),
t5_data "genes"=c(length(mus_a[,1]), length(mus_x[,1]), length(pho_a[,1]), length(pho_xnr[,1]), length(pho_xr[,1])),
"avg.ds"=c(mean(mus_a$ds), mean(mus_x$ds), mean(pho_a$ds), mean(pho_xnr$ds), mean(pho_xr$ds)),
"avg.dn"=c(mean(mus_a$dn), mean(mus_x$dn), mean(pho_a$dn), mean(pho_xnr$dn), mean(pho_xr$dn)),
"avg.dnds"=c(mean(mus_a$dn.ds), mean(mus_x$dn.ds), mean(pho_a$dn.ds), mean(pho_xnr$dn.ds), mean(pho_xr$dn.ds))
)
rownames(t5_data) = c()
%>% kable(col.names=c("Chromosome","Number of genes", "Avg. dS", "Avg. dN", "Avg. dN/dS"), caption="Substitution rates for Mus and Phodopus branches", align="l") %>% kable_styling(bootstrap_options=c("striped", "condensed", "responsive"), full_width=F)# %>% t5_data
Chromosome | Number of genes | Avg. dS | Avg. dN | Avg. dN/dS |
---|---|---|---|---|
Mus Auto | 1 | 0.0305418 | 0.0042116 | 0.1474844 |
Mus X | 1 | 0.0233840 | 0.0044003 | 0.1906650 |
Phodopus Auto | 1 | 0.0217890 | 0.0034842 | 0.1745578 |
Phodopus X non-recombining | 1 | 0.0155352 | 0.0026421 | 0.1825200 |
Phodopus X recombining | 1 | 0.0165241 | 0.0036164 | 0.2217289 |
#column_spec(1, width="7em") %>%
#column_spec(2, width="13em")
Results (Hyphy free ratio model [FitMG94], branch leading to phodopus and mus only)
= read.csv("../../data/mol-evol/phodopus-5spec-mg94-local.csv", header=T, comment.char="#")
pho_mg_data $clade = as.character(pho_mg_data$clade)
pho_mg_data$clade[pho_mg_data$clade=="psun;pcam"] = "pcam;psun"
pho_mg_data$clade[pho_mg_data$clade=="psun;pcam;prob"] = "pcam;prob;psun"
pho_mg_data$clade[pho_mg_data$clade=="psun;prob;pcam"] = "pcam;prob;psun"
pho_mg_data$clade[pho_mg_data$clade=="pcam;psun;prob"] = "pcam;prob;psun"
pho_mg_data$clade[pho_mg_data$clade=="prob;psun;pcam"] = "pcam;prob;psun"
pho_mg_data$clade[pho_mg_data$clade=="prob;pcam;psun"] = "pcam;prob;psun"
pho_mg_data# Phodopus data
= read.csv("../../data/mol-evol/mus-4spec-mg94-local.csv", header=T, comment.char="#")
mus_mg_data $clade = as.character(mus_mg_data$clade)
mus_mg_data$clade[mus_mg_data$clade=="mmus;mcar"] = "mcar;mmus"
mus_mg_data$clade[mus_mg_data$clade=="mmus;mcar;mpah"] = "mcar;mmus;mpah"
mus_mg_data$clade[mus_mg_data$clade=="mmus;mpah;mcar"] = "mcar;mmus;mpah"
mus_mg_data$clade[mus_mg_data$clade=="mpah;mmus;mcar"] = "mcar;mmus;mpah"
mus_mg_data$clade[mus_mg_data$clade=="mpah;mcar;mmus"] = "mcar;mmus;mpah"
mus_mg_data$clade[mus_mg_data$clade=="mcar;mpah;mmus"] = "mcar;mmus;mpah"
mus_mg_data# Mus data
0. dS distributions by branch for filtering
$avg.dnds = NA
pho_data$avg.dn = NA
pho_data$avg.ds = NA
pho_data$num.monophyletic = NA
pho_data$num.monophyletic.filtered = NA
pho_data
= list()
pho_clade_data = list()
pho_clade_data_f = list()
pho_ds_dists = 1
i for(c in pho_data$clade){
if(pho_data$node.type[pho_data$clade==c]=="ROOT"){
next
}
= as.character(pho_data$clade.label[pho_data$clade==c])
cur_label = subset(pho_mg_data, clade==c & ds <= 2)
cur_data $clade.label = cur_label
cur_data
= ggplot(cur_data, aes(x=ds)) +
ds_dist geom_histogram(color="#000000", fill="#d3d3d3") +
scale_y_continuous(expand=c(0,0)) +
xlab("dS") +
ylab("# of genes") +
ggtitle(pho_data$clade.label[pho_data$clade==c]) +
theme(axis.text.x = element_text(angle=90, hjust=1, size=8))
#print(ds_dist)
#stop()
= ds_dist
pho_ds_dists[[i]] = i + 1
i
= subset(cur_data, ds < 0.3 & dn.ds <= 2)
cur_data_f
$num.monophyletic[pho_data$clade==c] = nrow(cur_data)
pho_data$num.monophyletic.filtered[pho_data$clade==c] = nrow(cur_data_f)
pho_data$avg.dnds[pho_data$clade==c] = mean(cur_data_f$dn.ds)
pho_data$avg.dn[pho_data$clade==c] = mean(cur_data_f$dn)
pho_data$avg.ds[pho_data$clade==c] = mean(cur_data$ds)
pho_data
= length(cur_data[,1])
pho_clade_data[[cur_label]] = cur_data_f
pho_clade_data_f[[cur_label]] }
$avg.dnds = NA
mus_data$avg.dn = NA
mus_data$avg.ds = NA
mus_data$num.monophyletic = NA
mus_data$num.monophyletic.filtered = NA
mus_data
= list()
mus_clade_data = list()
mus_clade_data_f = list()
mus_ds_dists = 1
i for(c in mus_data$clade){
if(mus_data$node.type[mus_data$clade==c]=="ROOT"){
next
}print(c)
= as.character(mus_data$clade.label[mus_data$clade==c])
cur_label = subset(mus_mg_data, clade==c & ds <= 2)
cur_data $clade.label = cur_label
cur_data
= ggplot(cur_data, aes(x=ds)) +
ds_dist geom_histogram(color="#000000", fill="#d3d3d3") +
scale_y_continuous(expand=c(0,0)) +
xlab("dS") +
ylab("# of genes") +
ggtitle(mus_data$clade.label[mus_data$clade==c]) +
theme(axis.text.x = element_text(angle=90, hjust=1, size=8))
#print(ds_dist)
#stop()
= ds_dist
mus_ds_dists[[i]] = i + 1
i
= subset(cur_data, ds < 0.3 & dn.ds <= 2)
cur_data_f
$num.monophyletic[mus_data$clade==c] = nrow(cur_data)
mus_data$num.monophyletic.filtered[mus_data$clade==c] = nrow(cur_data_f)
mus_data$avg.dnds[mus_data$clade==c] = mean(cur_data_f$dn.ds)
mus_data$avg.dn[mus_data$clade==c] = mean(cur_data_f$dn)
mus_data$avg.ds[mus_data$clade==c] = mean(cur_data$ds)
mus_data
= length(cur_data[,1])
mus_clade_data[[cur_label]] = cur_data_f
mus_clade_data_f[[cur_label]] }
## [1] "rnor"
## [1] "mmus"
## [1] "mcar"
## [1] "mpah"
## [1] "mcar;mmus;mpah"
## [1] "mcar;mmus"
= plot_grid(plotlist=pho_ds_dists, nrow=3)
ds_plots print(ds_plots)
= plot_grid(plotlist=mus_ds_dists, nrow=3)
ds_plots print(ds_plots)
1. X vs. autosome dS distributions in Mus and Phodopus
= c("pho")
pho_branches = data.frame()
pho_a = data.frame()
pho_x for(b in pho_branches){
= rbind(pho_a, subset(pho_clade_data_f[[b]], chr!="ScmyWZ3_7747_HRSCAF_7900"))
pho_a = rbind(pho_x, subset(pho_clade_data_f[[b]], chr=="ScmyWZ3_7747_HRSCAF_7900"))
pho_x
}#pho_x = pho_x %>% group_by(id, chr, start, end) %>%
# summarize(ds=mean(ds, na.rm=T), dn=mean(dn, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
$label = "Phodopus X"
pho_x#pho_a = pho_a %>% group_by(id, chr, start, end) %>%
# summarize(ds=mean(ds, na.rm=T), dn=mean(dn, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
$label = "Phodopus Auto"
pho_a
= c("mus")
mus_branches = data.frame()
mus_a = data.frame()
mus_x for(b in mus_branches){
= rbind(mus_a, subset(mus_clade_data_f[[b]], chr!="X"))
mus_a = rbind(mus_x, subset(mus_clade_data_f[[b]], chr=="X"))
mus_x
}#mus_x = mus_x %>% group_by(id, chr, start, end) %>%
# summarize(ds=mean(ds, na.rm=T), dn=mean(dn, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
$label = "Mus X"
mus_x#mus_a = mus_a %>% group_by(id, chr, start, end) %>%
# summarize(ds=mean(ds, na.rm=T), dn=mean(dn, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
$label = "Mus Auto"
mus_a
= rbind(pho_x, pho_a, mus_x, mus_a)
chrome_data
# t = as.data.frame(table(chrome_data$chr))
# test = ggplot(t, aes(x=Var1, y=Freq)) +
# geom_bar(stat="identity") +
# scale_y_continuous(expand=c(0,0)) +
# bartheme() +
# theme()
# print(test)
#clade_cols = as.character(rodent_data$clade.col)
#clade_labs = as.character(rodent_data$clade.label)
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")
chrome_labs
= list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))
x_comps
= ggplot(chrome_data, aes(label, ds, fill=label)) +
chrome_ds_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("ds") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none")
print(chrome_ds_p)
2. X vs. autosome dN distributions in Mus and Phodopus
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")
chrome_labs
= list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))
x_comps
= ggplot(chrome_data, aes(label, dn, fill=label)) +
chrome_dn_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("dN") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none")
print(chrome_dn_p)
3. X vs. autosome dN/dS distributions in Mus and Phodopus
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")
chrome_labs
= list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))
x_comps
= ggplot(chrome_data, aes(label, dn.ds, fill=label)) +
chrome_dnds_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("dN/dS") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none")
print(chrome_dnds_p)
4. X vs. autosome dS distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions
- Estimates are averaged across all branches in clade.
= subset(pho_x, start < 35225145)
pho_xr $label = "Phodopus X recombining"
pho_xr= subset(pho_x, start >= 35225145)
pho_xnr $label = "Phodopus X non-recombining"
pho_xnr
= rbind(mus_a, mus_x, pho_a, pho_xr, pho_xnr)
chrome_data
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")
chrome_labs
= list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))
x_comps
= ggplot(chrome_data, aes(label, ds, fill=label)) +
chrome_ds_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("ds") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none",
axis.text.x = element_text(angle=10, hjust=1, size=10)
)
print(chrome_ds_p)
5. X vs. autosome dN distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions
- Estimates are averaged across all branches in clade.
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")
chrome_labs
= list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))
x_comps
= ggplot(chrome_data, aes(label, dn, fill=label)) +
chrome_dn_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("dN") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none",
axis.text.x = element_text(angle=10, hjust=1, size=10)
)
print(chrome_dn_p)
6. X vs. autosome dN/dS distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions
- Single estimate for whole clade.
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")
chrome_labs
= list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))
x_comps
= ggplot(chrome_data, aes(label, dn.ds, fill=label)) +
chrome_dnds_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("dN/dS") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none",
axis.text.x = element_text(angle=10, hjust=1, size=10)
)
print(chrome_dnds_p)
7. HyPhy MG94 summary stats
= data.frame("chrome"=c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X non-recombining", "Phodopus X recombining"),
t6_data "genes"=c(length(mus_a[,1]), length(mus_x[,1]), length(pho_a[,1]), length(pho_xnr[,1]), length(pho_xr[,1])),
"avg.ds"=c(mean(mus_a$ds), mean(mus_x$ds), mean(pho_a$ds), mean(pho_xnr$ds), mean(pho_xr$ds)),
"avg.dn"=c(mean(mus_a$dn), mean(mus_x$dn), mean(pho_a$dn), mean(pho_xnr$dn), mean(pho_xr$dn)),
"avg.dnds"=c(mean(mus_a$dn.ds), mean(mus_x$dn.ds), mean(pho_a$dn.ds), mean(pho_xnr$dn.ds), mean(pho_xr$dn.ds))
)
rownames(t6_data) = c()
%>% kable(col.names=c("Chromosome","Number of genes", "Avg. dS", "Avg. dN", "Avg. dN/dS"), caption="Substitution rates for Mus and Phodopus branches", align="l") %>% kable_styling(bootstrap_options=c("striped", "condensed", "responsive"), full_width=F)# %>% t6_data
Chromosome | Number of genes | Avg. dS | Avg. dN | Avg. dN/dS |
---|---|---|---|---|
Mus Auto | 14656 | 0.1582118 | 0.0201877 | 0.1572215 |
Mus X | 515 | 0.1300482 | 0.0206157 | 0.1905945 |
Phodopus Auto | 9566 | 0.0651789 | 0.0090807 | 0.1806505 |
Phodopus X non-recombining | 256 | 0.0499538 | 0.0091739 | 0.2187543 |
Phodopus X recombining | 142 | 0.0555683 | 0.0094026 | 0.2030216 |
#column_spec(1, width="7em") %>%
#column_spec(2, width="13em")
Results (Hyphy free ratio local model [FitMG94], branch leading to phodopus only, only Cgri as outgroup)
= "../../data/mol-evol/phodopus-4spec-iqtree-concat-rooted.tre"
pho_tree_file = read.tree(pho_tree_file)
pho_tree = "../../data/mol-evol/phodopus-4spec-concat-traits.csv"
pho_trait_file = read.csv(pho_trait_file, header=TRUE)
pho_data = pho_data[order(pho_data$ggtree),]
pho_data = pho_data
pho_branch_labels $node[pho_branch_labels$node.type=='tip'] = NA
pho_branch_labels$clade = as.character(pho_data$clade)
pho_data# Phodopus data
= read.csv("../../data/mol-evol/phodopus-4spec-mg94-local.csv", header=T, comment.char="#")
pho_mg_data $clade = as.character(pho_mg_data$clade)
pho_mg_data#names(pho_mg_data)[8] = "n1"
#names(pho_mg_data)[9] = "n2"
#names(pho_mg_data)[11] = "dn"
#names(pho_mg_data)[12] = "ds"
#pho_mg_data = subset(pho_mg_data, select=-c(n1, n2))
#pho_mg_data = subset(pho_mg_data, select=-c(nonsynonymous.bl, synonymous.bl))
# Phodopus data
0. dS distributions by branch for filtering
$avg.dnds = NA
pho_data$avg.dn = NA
pho_data$avg.ds = NA
pho_data$num.monophyletic = NA
pho_data$num.monophyletic.filtered = NA
pho_data
= list()
pho_clade_data = list()
pho_clade_data_f = list()
pho_ds_dists = 1
i for(c in pho_data$clade){
if(pho_data$node.type[pho_data$clade==c]=="ROOT"){
next
}
= as.character(pho_data$clade.label[pho_data$clade==c])
cur_label = subset(pho_mg_data, clade==c & ds <= 2)
cur_data $clade.label = cur_label
cur_data
= ggplot(cur_data, aes(x=ds)) +
ds_dist geom_histogram(color="#000000", fill="#d3d3d3") +
scale_y_continuous(expand=c(0,0)) +
xlab("dS") +
ylab("# of genes") +
ggtitle(pho_data$clade.label[pho_data$clade==c]) +
theme(axis.text.x = element_text(angle=90, hjust=1, size=8))
#print(ds_dist)
#stop()
= ds_dist
pho_ds_dists[[i]] = i + 1
i
= subset(cur_data, ds < 0.3 & dn.ds <= 2)
cur_data_f
$num.monophyletic[pho_data$clade==c] = nrow(cur_data)
pho_data$num.monophyletic.filtered[pho_data$clade==c] = nrow(cur_data_f)
pho_data$avg.dnds[pho_data$clade==c] = mean(cur_data_f$dn.ds)
pho_data$avg.dn[pho_data$clade==c] = mean(cur_data_f$dn)
pho_data$avg.ds[pho_data$clade==c] = mean(cur_data$ds)
pho_data
= length(cur_data[,1])
pho_clade_data[[cur_label]] = cur_data_f
pho_clade_data_f[[cur_label]] }
= plot_grid(plotlist=pho_ds_dists, nrow=3)
ds_plots print(ds_plots)
1. X vs. autosome dS distributions in Mus and Phodopus
= c("pho")
pho_branches = data.frame()
pho_a = data.frame()
pho_x for(b in pho_branches){
= rbind(pho_a, subset(pho_clade_data_f[[b]], chr!="ScmyWZ3_7747_HRSCAF_7900"))
pho_a = rbind(pho_x, subset(pho_clade_data_f[[b]], chr=="ScmyWZ3_7747_HRSCAF_7900"))
pho_x
}#pho_x = pho_x %>% group_by(id, chr, start, end) %>%
# summarize(ds=mean(ds, na.rm=T), dn=mean(dn, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
$label = "Phodopus X"
pho_x#pho_a = pho_a %>% group_by(id, chr, start, end) %>%
# summarize(ds=mean(ds, na.rm=T), dn=mean(dn, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
$label = "Phodopus Auto"
pho_a
= c("mus")
mus_branches = data.frame()
mus_a = data.frame()
mus_x for(b in mus_branches){
= rbind(mus_a, subset(mus_clade_data_f[[b]], chr!="X"))
mus_a = rbind(mus_x, subset(mus_clade_data_f[[b]], chr=="X"))
mus_x
}#mus_x = mus_x %>% group_by(id, chr, start, end) %>%
# summarize(ds=mean(ds, na.rm=T), dn=mean(dn, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
$label = "Mus X"
mus_x#mus_a = mus_a %>% group_by(id, chr, start, end) %>%
# summarize(ds=mean(ds, na.rm=T), dn=mean(dn, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
$label = "Mus Auto"
mus_a
= rbind(pho_x, pho_a, mus_x, mus_a)
chrome_data
# t = as.data.frame(table(chrome_data$chr))
# test = ggplot(t, aes(x=Var1, y=Freq)) +
# geom_bar(stat="identity") +
# scale_y_continuous(expand=c(0,0)) +
# bartheme() +
# theme()
# print(test)
#clade_cols = as.character(rodent_data$clade.col)
#clade_labs = as.character(rodent_data$clade.label)
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")
chrome_labs
= list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))
x_comps
= ggplot(chrome_data, aes(label, ds, fill=label)) +
chrome_ds_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("ds") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none")
print(chrome_ds_p)
2. X vs. autosome dN distributions in Mus and Phodopus
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")
chrome_labs
= list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))
x_comps
= ggplot(chrome_data, aes(label, dn, fill=label)) +
chrome_dn_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("dN") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none")
print(chrome_dn_p)
3. X vs. autosome dN/dS distributions in Mus and Phodopus
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")
chrome_labs
= list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"))
x_comps
= ggplot(chrome_data, aes(label, dn.ds, fill=label)) +
chrome_dnds_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("dN/dS") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none")
print(chrome_dnds_p)
4. X vs. autosome dS distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions
- Estimates are averaged across all branches in clade.
= subset(pho_x, start < 35225145)
pho_xr $label = "Phodopus X recombining"
pho_xr= subset(pho_x, start >= 35225145)
pho_xnr $label = "Phodopus X non-recombining"
pho_xnr
= rbind(mus_a, mus_x, pho_a, pho_xr, pho_xnr)
chrome_data
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")
chrome_labs
= list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))
x_comps
= ggplot(chrome_data, aes(label, ds, fill=label)) +
chrome_ds_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("ds") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none",
axis.text.x = element_text(angle=10, hjust=1, size=10)
)
print(chrome_ds_p)
5. X vs. autosome dN distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions
- Estimates are averaged across all branches in clade.
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")
chrome_labs
= list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))
x_comps
= ggplot(chrome_data, aes(label, dn, fill=label)) +
chrome_dn_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("dN") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none",
axis.text.x = element_text(angle=10, hjust=1, size=10)
)
print(chrome_dn_p)
6. X vs. autosome dN/dS distributions in Mus and Phodopus, partitioning Phodopus X between recombining and non-recombining regions
- Single estimate for whole clade.
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")
chrome_labs
= list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))
x_comps
= ggplot(chrome_data, aes(label, dn.ds, fill=label)) +
chrome_dnds_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Chromosomes") +
ylab("dN/dS") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none",
axis.text.x = element_text(angle=10, hjust=1, size=10)
)
print(chrome_dnds_p)
7. HyPhy MG94 local summary stats
= data.frame("chrome"=c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X non-recombining", "Phodopus X recombining"),
t6_data "genes"=c(length(mus_a[,1]), length(mus_x[,1]), length(pho_a[,1]), length(pho_xnr[,1]), length(pho_xr[,1])),
"avg.ds"=c(mean(mus_a$ds), mean(mus_x$ds), mean(pho_a$ds), mean(pho_xnr$ds), mean(pho_xr$ds)),
"avg.dn"=c(mean(mus_a$dn), mean(mus_x$dn), mean(pho_a$dn), mean(pho_xnr$dn), mean(pho_xr$dn)),
"avg.dnds"=c(mean(mus_a$dn.ds), mean(mus_x$dn.ds), mean(pho_a$dn.ds), mean(pho_xnr$dn.ds), mean(pho_xr$dn.ds))
)
rownames(t6_data) = c()
%>% kable(col.names=c("Chromosome","Number of genes", "Avg. dS", "Avg. dN", "Avg. dN/dS"), caption="Substitution rates for Mus and Phodopus branches", align="l") %>% kable_styling(bootstrap_options=c("striped", "condensed", "responsive"), full_width=F)# %>% t6_data
Chromosome | Number of genes | Avg. dS | Avg. dN | Avg. dN/dS |
---|---|---|---|---|
Mus Auto | 14656 | 0.1582118 | 0.0201877 | 0.1572215 |
Mus X | 515 | 0.1300482 | 0.0206157 | 0.1905945 |
Phodopus Auto | 12886 | 0.1243969 | 0.0176781 | 0.1796804 |
Phodopus X non-recombining | 306 | 0.1005724 | 0.0189231 | 0.2208170 |
Phodopus X recombining | 166 | 0.1098163 | 0.0212593 | 0.2361039 |
#column_spec(1, width="7em") %>%
#column_spec(2, width="13em")
Amino acid profile change
Ancestral reconstructions with M1 free-ratio model, branches leading to Mus clade and Phodopus Clade
= read.csv("../../data/mol-evol/phodopus-5spec-m1-anc.csv", header=T, comment.char="#")
pho_aa_data = subset(pho_aa_data, clade=="pcam;prob;psun")
pho_aa_data
= subset(pho_aa_data, chr=="ScmyWZ3_7747_HRSCAF_7900")
pho_aa_x $label = "Phodopus X"
pho_aa_x
= subset(pho_aa_data, chr!="ScmyWZ3_7747_HRSCAF_7900")
pho_aa_auto $label = "Phodopus Auto"
pho_aa_auto
= read.csv("../../data/mol-evol/mus-4spec-m1-anc.csv", header=T, comment.char="#")
mus_aa_data = subset(mus_aa_data, clade=="mcar;mmus;mpah")
mus_aa_data
= subset(mus_aa_data, chr=="X")
mus_aa_x $label = "Mus X"
mus_aa_x
= subset(mus_aa_data, chr!="X")
mus_aa_auto $label = "Mus Auto"
mus_aa_auto
= rbind(pho_aa_x, pho_aa_auto, mus_aa_x, mus_aa_auto)
aa_data = list(c("Phodopus Auto", "Phodopus X"), c("Mus Auto", "Mus X"), c("Mus Auto", "Phodopus Auto"), c("Mus X", "Phodopus X"))
x_comps
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X")
chrome_labs
= ggplot(aa_data, aes(label, avg.sub.score, fill=label)) +
aa_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Branch") +
ylab("Avg. BLOSUM score per gene") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none")
print(aa_p)
= subset(pho_aa_x, start < 35225145)
pho_aa_xr $label = "Phodopus X recombining"
pho_aa_xr
= subset(pho_aa_x, start >= 35225145)
pho_aa_xn $label = "Phodopus X non-recombining"
pho_aa_xn
= rbind(pho_aa_xr, pho_aa_xn, pho_aa_auto, mus_aa_x, mus_aa_auto)
aa_data
= c("#006ddb", "#006ddb", "#db6d00", "#db6d00", "#db6d00")
chrome_cols = c("Mus Auto", "Mus X", "Phodopus Auto", "Phodopus X recombining", "Phodopus X non-recombining")
chrome_labs
= list(c("Phodopus X recombining", "Phodopus X non-recombining"), c("Phodopus Auto", "Phodopus X recombining"), c("Phodopus Auto", "Phodopus X non-recombining"))
x_comps
= ggplot(aa_data, aes(label, avg.sub.score, fill=label)) +
aa_p geom_quasirandom(size=0.5, alpha=0.3, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Branch") +
ylab("Avg. BLOSUM score per gene") +
scale_fill_manual(limits=chrome_labs, values=chrome_cols) +
bartheme() +
theme(legend.position="none",
axis.text.x = element_text(angle=10, hjust=1, size=10))
print(aa_p)
Results summary
- Lower dS on X in both Mus and Phodopus
- Faster dN/dS on X in branches leading to Mus and Phodopus (M1), Mus clade (M2), but NOT phodopus clade (M2). Results consistent between PAML’s M1 model and HyPhy’s FitMG94.
- No difference between recombining and non-recombining portions of the X in phodopus.
- No big difference between amino acid profile changes between Mus and Phodopus or Autosomes and X.
cat("Page last updated:", format(Sys.time(), "%m/%d/%Y %H:%M:%S %Z"))
## Page last updated: 07/22/2021 15:02:03 MDT
::includeHTML("../html-chunks/rmd_footer.html") htmltools