Murine positive selection
::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
knitr
library(ggplot2)
library(cowplot)
library(ggbeeswarm)
library(dplyr)
library(kableExtra)
library(tidyr)
library(ggtree)
library(phytools)
library(phangorn)
library(reshape2)
library(ggExtra)
library(ggrepel)
#library(vroom)
library(ggdist)
#library(stringr)
library(ggsignif)
source("../lib/design.r")
source("../lib/get_tree_info.r")
#tree_type = "astral"
= F
save_tree_fig
cat(tree_type, " tree\n")
## astral tree
cat("188 species targeted capture to mouse exons assembled with SPADES.\n")
## 188 species targeted capture to mouse exons assembled with SPADES.
cat("Pared to 109 species based on paring algorithm")
## Pared to 109 species based on paring algorithm
cat("11,775 coding loci aligned with exonerate+mafft\n")
## 11,775 coding loci aligned with exonerate+mafft
cat("Gene trees inferred with IQtree.\n")
## Gene trees inferred with IQtree.
if(tree_type == "astral"){
cat("Species tree inferred with ASTRAL.\n")
cat("Branch lengths estimated by ASTRAL.\n")
= "../../data/trees/paring/min-spec-paring-astral-exempt-2/iter-8-pared.tre"
tree_file #gene_ps_file = "../../data/ps/full-coding-absrel-pared.csv"
= "../../data/ps/full-coding-busted-pared.csv"
gene_ps_file = "../../data/ps/full-coding-absrel-pared-branch-ps.csv"
branch_ps_file = "../../data/ps/full-coding-absrel-pared-branch-ps-priori.csv"
priori_branch_ps_file = "../../data/ps/full-coding-fel-pared.csv"
site_ps_file = "../../data/ps/full-coding-absrel-pared-branch-ps-morphofacial.csv"
morpho_branch_ps_file = "../../data/trees/paring/astral-pared-2-colonization-branches.txt"
col_file = "../../data/trees/astral-pared-morpho-ou-shift-branches.txt"
morpho_file = "../../data/datasets/subset_morphofacial_genes.txt"
morpho_genes_file = "../../data/datasets/arid-genes-giorello-2018-pid.txt"
arid_genes_file = c("Lophiomys_imhausi_UM5152", "Lophuromys_woosnami_LSUMZ37793", "<1>", "<108>", "<107>")
exclude_branches = 22
xmax else if(tree_type == "concat"){
}cat("Species tree inferred by concatenation of all loci with IQtree.\n")
= "../../data/trees/full_coding_iqtree_concat.cf.rooted.tree"
tree_file = ""
gene_ps_file = ""
branch_ps_file = ""
site_ps_file = ""
col_file = c("Lophiomys_imhausi_UM5152", "Lophuromys_woosnami_LSUMZ37793", "<1>", "<187>", "<186>")
exclude_branches = 0.125
xmax }
## Species tree inferred with ASTRAL.
## Branch lengths estimated by ASTRAL.
= read.tree(tree_file)
tree = treeToDF(tree)
tree_to_df_list = tree_to_df_list[["info"]]
tree_info
if(tree_type == "astral"){
= tree_info %>% separate(label, c("tp", "gcf"), sep=">_", remove=F)
tree_info
for(i in 1:nrow(tree_info)){
if(tree_info[i,]$node.type == "internal"){
if(substring(tree_info[i,]$tp, nchar(tree_info[i,]$tp)) != ">"){
$tp = paste0(tree_info[i,]$tp, ">")
tree_info[i,]
}
}
}
$astral[tree_info$node.type=="tip"] = NA
tree_info$astral = as.numeric(tree_info$astral)
tree_info$gcf = as.numeric(tree_info$gcf)
tree_infoelse if(tree_type == "concat"){
}= tree_info %>% separate(label, c("tp", "bootstrap"), sep="/", remove=F)
tree_info $bootstrap[tree_info$node.type=="tip"] = NA
tree_info$bootstrap = as.numeric(tree_info$bootstrap)
tree_info$gcf = as.numeric(tree_info$gcf)
tree_info$scf = as.numeric(tree_info$scf)
tree_info }
= read.csv(col_file, header=F)
col_branches names(col_branches) = c("tp", "region")
$col.branch = "Other"
tree_info
$col.branch[tree_info$tp %in% col_branches$tp] = "Colonization"
tree_info
#tree_info$col.desc.branch = "N"
for(i in 1:nrow(tree_info)){
if(tree_info[i,]$node.type == "internal" && tree_info[i,]$col.branch == "Colonization"){
= getDescendants(tree, tree_info[i,]$node)
cur_desc $col.branch[tree_info$node==cur_desc[1]] = "Descendant"
tree_info$col.branch[tree_info$node==cur_desc[2]] = "Descendant"
tree_info
} }
1 Positive selection by gene (all genes)
= read.csv(gene_ps_file, header=T, comment.char="#")
gene_ps $pval.adj = p.adjust(gene_ps$pval, "fdr")
gene_ps
= gene_ps %>% separate(file, c("pid", "aln", "ext"), sep="-", remove=F)
gene_ps
= read.csv(morpho_genes_file, header=F)
morpho_genes names(morpho_genes)[1] = "pid"
= read.csv(arid_genes_file, header=F)
arid_genes names(arid_genes)[1] = "pid"
BUSTED was run on all genes testing all branches. BUSTED fits a model with three omega classes such that \(\omega_1 \leq \omega_2 \leq 1 \leq \omega_3\). It is run once while restricting \(\omega_3 = 1\) and once while allowing \(\omega_3\) to vary freely. It performs a likelihood ratio test to calculate p-values to detect whether the full model (allowing positive selection on the test branches) fits better than the constrained model.
We correct these p-values for false-discovery rate and select genes with adjusted p-values less than 0.01.
$label = "Non-PS"
gene_ps$pval.adj < 0.01, ]$label = "PS"
gene_ps[gene_ps
= nrow(subset(gene_ps, pval.adj < 0.01))
num_ps_genes = nrow(subset(gene_ps, pval.adj >= 0.01))
num_non_ps_genes = nrow(gene_ps)
total_genes
= data.frame("prop.genes"=c(num_ps_genes/total_genes, num_non_ps_genes/total_genes), "num.genes"=c(num_ps_genes, num_non_ps_genes), "type"=c("Evidence for selection\nin at least one branch", "No evidence for selection"))
ps_all_df $label = "All genes"
ps_all_df
= nrow(subset(gene_ps, pval.adj < 0.01 & pid %in% morpho_genes$pid))
num_mf_ps_genes = nrow(subset(gene_ps, pval.adj >= 0.01 & pid %in% morpho_genes$pid))
num_mf_non_ps_genes = num_mf_ps_genes + num_mf_non_ps_genes
total_mf_genes
= data.frame("prop.genes"=c(num_mf_ps_genes/total_mf_genes, num_mf_non_ps_genes/total_mf_genes), "num.genes"=c(num_mf_ps_genes, num_mf_non_ps_genes), "type"=c("Evidence for selection\nin at least one branch", "No evidence for selection"))
ps_mf_df $label = "Morphofacial genes"
ps_mf_df
= nrow(subset(gene_ps, pval.adj < 0.01 & pid %in% arid_genes$pid))
num_arid_ps_genes = nrow(subset(gene_ps, pval.adj >= 0.01 & pid %in% arid_genes$pid))
num_arid_non_ps_genes = num_arid_ps_genes + num_arid_non_ps_genes
total_arid_genes
= data.frame("prop.genes"=c(num_arid_ps_genes/total_arid_genes, num_arid_non_ps_genes/total_arid_genes), "num.genes"=c(num_arid_ps_genes, num_arid_non_ps_genes), "type"=c("Evidence for selection\nin at least one branch", "No evidence for selection"))
ps_arid_df $label = "Arid genes"
ps_arid_df
= rbind(ps_all_df, ps_mf_df, ps_arid_df)
ps_df
= ggplot(ps_df, aes(x=label, y=prop.genes, fill=type)) +
gene_ps_counts geom_bar(stat="identity") +
geom_text(aes(label=num.genes), size=6, position=position_stack(vjust=0.5), color="#f2f2f2") +
# geom_text(size=2.3, position=position_stack(vjust=0.5), color="#f2f2f2") +
# geom_text(aes(label=text2), size=2.3, color="#f2f2f2", nudge_y=0.05) +
# geom_text(aes(label=text3), size=2.3, color="#f2f2f2", nudge_y=-0.05) +
# geom_text(aes(label=text4), size=2.3, color="#f2f2f2", nudge_y=0.82) +
xlab("") +
ylab("Proportion of genes of genes") +
#scale_y_continuous(expand=c(0,0), limits=c(0,1)) +
#scale_x_discrete(limits=unique(comps$label)) +
#scale_x_continuous(limits=c(-0.5,3)) +
#scale_fill_manual(guide=guide_legend(reverse=T), values=cols) +
scale_fill_manual(values=corecol(pal="wilke", numcol=2), guide=guide_legend(reverse=T)) +
#scale_fill_discrete(, values=cols) +
bartheme() +
theme(legend.position="bottom",
axis.title.y=element_blank()) +
coord_flip()
print(gene_ps_counts)
= matrix(c(num_ps_genes,num_mf_ps_genes,num_non_ps_genes,num_mf_non_ps_genes), nrow = 2)
morpho_fisher fisher.test(morpho_fisher)
##
## Fisher's Exact Test for Count Data
##
## data: morpho_fisher
## p-value = 3.696e-09
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.6024040 0.7770671
## sample estimates:
## odds ratio
## 0.6841711
= matrix(c(num_ps_genes,num_arid_ps_genes,num_non_ps_genes,num_arid_non_ps_genes), nrow = 2)
arid_fisher fisher.test(arid_fisher)
##
## Fisher's Exact Test for Count Data
##
## data: arid_fisher
## p-value = 0.06404
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.4492452 1.0291151
## sample estimates:
## odds ratio
## 0.6796711
= subset(gene_ps, pval.adj < 0.01)$pid
ps_genes
#write.csv(ps_genes, file="../../data/ps/busted-full-gene-list.txt", quote=F, row.names=F, col.names=F)
write(ps_genes, file="../../data/ps/busted-full-gene-list.txt")
2 Positive selection by branch (all genes)
aBSREL was run on all genes. aBSREL optimizes the number of rate categories per branch and then runs both a free-rate model and a model where it restricts the max rate to be 1 (no positive selection). It compares these models with a likelihood ratio test and computes a p-value using a mixture of chi-squared distributions. All branches are tested, and aBSREL corrects the p-values for multiple testing within a gene using the Holm-Bonferroni method. After correcting p-values, branches are said to have evidence for positive selection if p < 0.05.
Subsequently, we correct for multiple testing again between genes by adjusting our siginficance level using the Bonferroni method from 0.01 to (0.01 / the number of genes tested).
# ## Percent of branches with significant p-value per gene
# Since each gene may have a different number of branches tested, we report the percent of genes with evidence for positive selection per gene.
# ps_genes = read.csv(gene_ps_file, header=T, comment.char="#")
# ps_genes$perc.br.ps = ps_genes$num.branches.pval.less.than.alpha / ps_genes$num.branches
#
# gene_p = ggplot(ps_genes, aes(x=1, y=perc.br.ps)) +
# geom_quasirandom(size=2, width=0.25, alpha=0.25, color=corecol(numcol=1, offset=2)) +
# geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666", width=0.25) +
# scale_x_continuous(limits=c(0.5,1.5)) +
# xlab("") +
# ylab("% of branches with evidence\nfor positive selection per gene") +
# bartheme() +
# theme(axis.title.x=element_blank(),
# axis.text.x=element_blank(),
# axis.ticks.x=element_blank())
# print(gene_p)
= read.csv(branch_ps_file, header=T, comment.char="#")
ps_branches names(ps_branches)[1] = "tp"
= names(ps_branches)[5:11]
cols_to_na
for(col in cols_to_na){
$tp %in% exclude_branches,][[col]] = NA
ps_branches[ps_branches
}# This loop changes all the counts for the branches descending from the root and the outgroup to NA to exclude them
$perc.ps.genes = ps_branches$num.ps.genes / (ps_branches$num.genes.full + ps_branches$num.genes.partial) ps_branches
2.1 Species tree branch presence/absence per gene
Following the same procedure as before, we count multi-nucleotdie substitutions per species tree branch only in genes that contain that branch (full clade or partial clade).
NOTE: These distributions are for the PARED tree.
#anc_info = subset(anc_info_w_root, node.type != "root")
= select(ps_branches, clade, node.type, num.genes.full)
full_clade $label = "Full clade"
full_cladenames(full_clade)[3] = "num.genes"
= select(ps_branches, clade, node.type, num.genes.partial)
partial_clade $label = "Partial clade"
partial_cladenames(partial_clade)[3] = "num.genes"
= select(ps_branches, clade, node.type, num.genes.descendant.counted)
descendant_counted $label = "Descendant counted"
descendant_countednames(descendant_counted)[3] = "num.genes"
= select(ps_branches, clade, node.type, num.genes.discordant)
discordant_clade $label = "Discordant clade"
discordant_cladenames(discordant_clade)[3] = "num.genes"
= select(ps_branches, clade, node.type, num.genes.missing)
no_clade $label = "Missing clade"
no_cladenames(no_clade)[3] = "num.genes"
= rbind(full_clade, partial_clade, descendant_counted, discordant_clade, no_clade)
clade_counts # Convert branch categories to long format
$label = factor(clade_counts$label, levels=c("Full clade", "Partial clade", "Descendant counted", "Discordant clade", "Missing clade"))
clade_counts
= ggplot(clade_counts, aes(x=label, y=num.genes, group=label, color=node.type)) +
branch_counts geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
ylab("# of genes") +
xlab("Species tree\nbranch classification") +
bartheme() +
theme(axis.text.x = element_text(angle=25, hjust=1)) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(branch_counts)
= names(ps_branches)[!(names(ps_branches) %in% names(tree_info))]
uniq_info_cols # # Get non common names
= c(uniq_info_cols, "tp") # appending key parameter
uniq_info_cols # Get a list of columns from the tree_info df to join to the tree rates df
= tree_info %>% left_join((ps_branches %>% select(uniq_info_cols)), by="tp")
tree_info # Select the columns from tree_info and join to tree_rates, merging by clade
# https://stackoverflow.com/a/61628157
= tree_info[order(tree_info$node), ]
tree_info # Re-order the rows by the R node
2.2 Tree with colonization branches labeled
= corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l # Colors
= ggtree(tree, size=0.8, ladderize=F, aes(color=tree_info$col.branch)) +
col_tree scale_color_manual(name='Branch partition', values=corecol(pal="wilke", numcol=3)) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.15,0.9))
print(col_tree)
if(save_tree_fig){
= gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
gcf_tree = paste("../../data/trees/", tree_type, "-gcf-tree-PARED-gcf50.pdf", sep="")
tree_outfile ggsave(tree_outfile, gcf_tree, width=8, height=16, unit="in")
}# Colonization branch tree
2.3 Branches under positive selection by colonization partition
= subset(select(tree_info, clade, node.type, col.branch, perc.ps.genes), col.branch == "Colonization")
col_ps $label = "Colonization"
col_ps#names(full_clade)[3] = "num.genes"
= subset(select(tree_info, clade, node.type, col.branch, perc.ps.genes), col.branch == "Descendant")
desc_ps $label = "Descendant"
desc_ps
= subset(select(tree_info, clade, node.type, col.branch, perc.ps.genes), col.branch == "Other")
other_ps $label = "Other"
other_ps
= rbind(col_ps, desc_ps, other_ps)
ps_counts # Convert branch categories to long format
$label = factor(ps_counts$label, levels=c("Colonization", "Descendant", "Other"))
ps_counts
= list(c("Colonization", "Descendant"), c("Colonization", "Other"), c("Descendant", "Other"))
x_comps
= ggplot(ps_counts, aes(x=label, y=perc.ps.genes, group=label, color=node.type)) +
branch_ps_counts geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
ylab("Proportion of genes\nwith evidence for PS") +
xlab("Species tree\nbranch partition") +
bartheme() +
theme(axis.text.x = element_text(angle=25, hjust=1)) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(branch_ps_counts)
3 Positive selection by branch (4659 BUSTED genes)
= read.csv(priori_branch_ps_file, header=T, comment.char="#")
ps_branches_priori names(ps_branches_priori)[1] = "tp"
= names(ps_branches_priori)[5:11]
cols_to_na
for(col in cols_to_na){
$tp %in% exclude_branches,][[col]] = NA
ps_branches_priori[ps_branches_priori
}# This loop changes all the counts for the branches descending from the root and the outgroup to NA to exclude them
$perc.ps.genes = ps_branches_priori$num.ps.genes / (ps_branches_priori$num.genes.full + ps_branches_priori$num.genes.partial) ps_branches_priori
3.1 Branch presence
= select(ps_branches_priori, clade, node.type, num.genes.full)
full_clade $label = "Full clade"
full_cladenames(full_clade)[3] = "num.genes"
= select(ps_branches_priori, clade, node.type, num.genes.partial)
partial_clade $label = "Partial clade"
partial_cladenames(partial_clade)[3] = "num.genes"
= select(ps_branches_priori, clade, node.type, num.genes.descendant.counted)
descendant_counted $label = "Descendant counted"
descendant_countednames(descendant_counted)[3] = "num.genes"
= select(ps_branches_priori, clade, node.type, num.genes.discordant)
discordant_clade $label = "Discordant clade"
discordant_cladenames(discordant_clade)[3] = "num.genes"
= select(ps_branches_priori, clade, node.type, num.genes.missing)
no_clade $label = "Missing clade"
no_cladenames(no_clade)[3] = "num.genes"
# Subset each clade count column to add a label
= rbind(full_clade, partial_clade, descendant_counted, discordant_clade, no_clade)
clade_counts # Convert branch categories to long format
$label = factor(clade_counts$label, levels=c("Full clade", "Partial clade", "Descendant counted", "Discordant clade", "Missing clade"))
clade_counts# Factorize the labels in order
= ggplot(clade_counts, aes(x=label, y=num.genes, group=label, color=node.type)) +
branch_counts geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
ylab("# of genes") +
xlab("Species tree\nbranch classification") +
bartheme() +
theme(axis.text.x = element_text(angle=25, hjust=1)) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(branch_counts)
3.2 Branches under positive selection by colonization partition
= select(ps_branches_priori, tp, perc.ps.genes)
priori_ps_counts names(priori_ps_counts)[2] = "perc.ps.genes.priori"
= merge(tree_info, priori_ps_counts, by="tp")
tree_info
= tree_info[order(tree_info$node), ]
tree_info # Re-order the rows by the R node
= subset(select(tree_info, clade, node.type, col.branch, perc.ps.genes.priori), col.branch == "Colonization")
col_ps $label = "Colonization"
col_ps#names(full_clade)[3] = "num.genes"
= subset(select(tree_info, clade, node.type, col.branch, perc.ps.genes.priori), col.branch == "Descendant")
desc_ps $label = "Descendant"
desc_ps
= subset(select(tree_info, clade, node.type, col.branch, perc.ps.genes.priori), col.branch == "Other")
other_ps $label = "Other"
other_ps
= rbind(col_ps, desc_ps, other_ps)
ps_counts # Convert branch categories to long format
$label = factor(ps_counts$label, levels=c("Colonization", "Descendant", "Other"))
ps_counts
= list(c("Colonization", "Descendant"), c("Colonization", "Other"), c("Descendant", "Other"))
x_comps
= ggplot(ps_counts, aes(x=label, y=perc.ps.genes.priori, group=label, color=node.type)) +
branch_ps_counts geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
ylab("Proportion of genes\nwith evidence for PS") +
xlab("Species tree\nbranch partition") +
bartheme() +
theme(axis.text.x = element_text(angle=25, hjust=1)) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(branch_ps_counts)
4 Positive selection by branch (1072 morphofacial genes)
= read.csv(morpho_branch_ps_file, header=T, comment.char="#")
ps_branches_mf names(ps_branches_mf)[1] = "tp"
= names(ps_branches_mf)[5:11]
cols_to_na
for(col in cols_to_na){
$tp %in% exclude_branches,][[col]] = NA
ps_branches_mf[ps_branches_mf
}# This loop changes all the counts for the branches descending from the root and the outgroup to NA to exclude them
$perc.ps.genes = ps_branches_mf$num.ps.genes / (ps_branches_mf$num.genes.full + ps_branches_mf$num.genes.partial) ps_branches_mf
4.1 Branch presence
= select(ps_branches_mf, clade, node.type, num.genes.full)
full_clade $label = "Full clade"
full_cladenames(full_clade)[3] = "num.genes"
= select(ps_branches_mf, clade, node.type, num.genes.partial)
partial_clade $label = "Partial clade"
partial_cladenames(partial_clade)[3] = "num.genes"
= select(ps_branches_mf, clade, node.type, num.genes.descendant.counted)
descendant_counted $label = "Descendant counted"
descendant_countednames(descendant_counted)[3] = "num.genes"
= select(ps_branches_mf, clade, node.type, num.genes.discordant)
discordant_clade $label = "Discordant clade"
discordant_cladenames(discordant_clade)[3] = "num.genes"
= select(ps_branches_mf, clade, node.type, num.genes.missing)
no_clade $label = "Missing clade"
no_cladenames(no_clade)[3] = "num.genes"
# Subset each clade count column to add a label
= rbind(full_clade, partial_clade, descendant_counted, discordant_clade, no_clade)
clade_counts # Convert branch categories to long format
$label = factor(clade_counts$label, levels=c("Full clade", "Partial clade", "Descendant counted", "Discordant clade", "Missing clade"))
clade_counts# Factorize the labels in order
= ggplot(clade_counts, aes(x=label, y=num.genes, group=label, color=node.type)) +
branch_counts geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
ylab("# of genes") +
xlab("Species tree\nbranch classification") +
bartheme() +
theme(axis.text.x = element_text(angle=25, hjust=1)) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(branch_counts)
4.2 Tree with OU shift branches labeled
= read.csv(morpho_file, header=F, comment.char="#")
morpho_branches names(morpho_branches) = c("tp")
$morpho.branch = "No OU shift"
tree_info$morpho.branch[tree_info$tp %in% morpho_branches$tp] = "OU shift"
tree_info
= corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l # Colors
= ggtree(tree, size=0.8, ladderize=F, aes(color=tree_info$morpho.branch)) +
col_tree scale_color_manual(name='Branch partition', values=corecol(numcol=2)) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.15,0.9))
print(col_tree)
if(save_tree_fig){
= gcf_tree + geom_text(aes(x=branch, label=ifelse(branch_rates$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
gcf_tree = paste("../../data/trees/", tree_type, "-gcf-tree-PARED-gcf50.pdf", sep="")
tree_outfile ggsave(tree_outfile, gcf_tree, width=8, height=16, unit="in")
}# OU shift branch tree
4.3 Branches under positive selection by OU shift partition
= select(ps_branches_mf, tp, perc.ps.genes)
morpho_ps_counts names(morpho_ps_counts)[2] = "perc.ps.genes.mf"
= merge(tree_info, morpho_ps_counts, by="tp")
tree_info
= tree_info[order(tree_info$node), ]
tree_info # Re-order the rows by the R node
= subset(select(tree_info, clade, node.type, morpho.branch, perc.ps.genes.mf), morpho.branch == "OU shift")
morpho_ps_mf names(morpho_ps_mf)[4] = "perc.ps.genes"
$label = "OU shift (MF genes)"
morpho_ps_mf
= subset(select(tree_info, clade, node.type, morpho.branch, perc.ps.genes.mf), morpho.branch == "No OU shift")
other_ps_mf names(other_ps_mf)[4] = "perc.ps.genes"
$label = "No OU shift (MF genes)"
other_ps_mf
= subset(select(tree_info, clade, node.type, morpho.branch, perc.ps.genes), morpho.branch == "OU shift")
morpho_ps_all $label = "OU shift (All genes)"
morpho_ps_all#names(full_clade)[3] = "num.genes"
= subset(select(tree_info, clade, node.type, morpho.branch, perc.ps.genes), morpho.branch == "No OU shift")
other_ps_all $label = "No OU shift (All genes)"
other_ps_all
= rbind(morpho_ps_all, other_ps_all, morpho_ps_mf, other_ps_mf)
ps_df # Convert branch categories to long format
$label = factor(ps_df$label, levels=c("OU shift (All genes)", "No OU shift (All genes)", "OU shift (MF genes)", "No OU shift (MF genes)"))
ps_df
= list(c("OU shift (All genes)", "No OU shift (All genes)"), c("OU shift (MF genes)", "No OU shift (MF genes)"), c("OU shift (All genes)", "OU shift (MF genes)"), c("No OU shift (All genes)", "No OU shift (MF genes)"))
x_comps
= ggplot(ps_df, aes(x=label, y=perc.ps.genes, group=label, color=node.type)) +
morpho_branch_ps geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1, color="#333333") +
ylab("Proportion of genes\nwith evidence for PS") +
xlab("Species tree\nbranch partition") +
bartheme() +
theme(axis.text.x = element_text(angle=25, hjust=1)) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(morpho_branch_ps)
5 Positive selection by site
FEL was run on all genes. FEL estimates dN and dS per site, but assumes these rates are constant across branches. It compares dN and dS with a likelihood ratio test at each site and computes a p-value using a chi-squared distribution. A site is said to be under selection then if its p-value falls below an initial level of 0.01, corrected using the Bonferroni method across all sites.
= read.csv(site_ps_file, header=T, comment.char="#")
site_ps $prop.sites.ps = site_ps$num.sites.ps / site_ps$num.sites
site_ps= site_ps %>% separate(file, c("pid", "aln", "ext"), sep="-", remove=F)
site_ps
# Other files read above
5.1 Proportion of sites under selection (all genes)
= ggplot(subset(site_ps, num.sites.ps > 0), aes(x=prop.sites.ps)) +
site_ps_counts geom_histogram(bins=100, color="#ececec") +
#geom_quasirandom(size=2, width=0.25, alpha=0.25) +
#geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
#geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
scale_y_continuous(expand=c(0,0)) +
ylab("# of genes") +
xlab("Porportion of sites with evidence for selection") +
bartheme()
#theme(axis.text.x = element_text(angle=25, hjust=1)) +
#guides(colour=guide_legend(override.aes=list(alpha=1)))
print(site_ps_counts)
## Proportion of sites under selection (all genes vs. morpho-facial genes vs. arid genes)
= subset(site_ps, pid %in% ps_genes)
site_ps_priori = select(site_ps_priori, pid, prop.sites.ps)
site_ps_priori_sub $label = "BUSTED genes"
site_ps_priori_sub
= subset(site_ps, pid %in% morpho_genes$pid)
site_ps_mf = select(site_ps_mf, pid, prop.sites.ps)
site_ps_mf_sub $label = "MF genes"
site_ps_mf_sub
= subset(site_ps, pid %in% arid_genes$pid)
site_ps_arid = select(site_ps_arid, pid, prop.sites.ps)
site_ps_arid_sub $label = "Arid genes"
site_ps_arid_sub
= select(site_ps, pid, prop.sites.ps)
site_ps_sub $label = "All genes"
site_ps_sub
= rbind(site_ps_sub, site_ps_priori_sub, site_ps_mf_sub, site_ps_arid_sub)
site_ps_cats
$label = factor(site_ps_cats$label, levels=c("All genes", "BUSTED genes", "MF genes", "Arid genes"))
site_ps_cats
= list(c("All genes", "BUSTED genes"), c("All genes", "MF genes"), c("All genes", "Arid genes"))
x_comps
= ggplot(site_ps_cats, aes(x=label, y=prop.sites.ps, group=label)) +
site_ps_comps geom_quasirandom(size=2, width=0.25, alpha=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
ylab("Proportion of sites\nwith evidence for PS") +
xlab("Gene partition") +
bartheme() +
theme(axis.text.x = element_text(angle=25, hjust=1)) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(site_ps_comps)
= data.frame("Gene set"=c("All genes", "BUSTED genes", "Morphofacial genes", "Arid genes"),
site_table "Avg. proportion of sites under positive selection"=c(mean(site_ps_sub$prop.sites.ps, na.rm=T), mean(site_ps_priori_sub$prop.sites.ps, na.rm=T), mean(site_ps_mf_sub$prop.sites.ps, na.rm=T), mean(site_ps_arid_sub$prop.sites.ps, na.rm=T)),
"Median proportion of sites under positive selection"=c(median(site_ps_sub$prop.sites.ps, na.rm=T), median(site_ps_priori_sub$prop.sites.ps, na.rm=T), median(site_ps_mf_sub$prop.sites.ps, na.rm=T), median(site_ps_arid_sub$prop.sites.ps, na.rm=T)))
%>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F) site_table
Gene.set | Avg..proportion.of.sites.under.positive.selection | Median.proportion.of.sites.under.positive.selection |
---|---|---|
All genes | 0.0036334 | 0.0000000 |
BUSTED genes | 0.0048887 | 0.0023502 |
Morphofacial genes | 0.0043709 | 0.0016340 |
Arid genes | 0.0050106 | 0.0021517 |