Positive selection tree comparisons
#knitr::opts_chunk$set(echo = FALSE)
#this.dir <- dirname(parent.frame(2)$ofile)
#setwd(this.dir)
::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
knitr
library(ggtree)
library(ggplot2)
library(ggbeeswarm)
library(cowplot)
library(ggsignif)
library(dplyr)
library(tidyr)
library(kableExtra)
library(ggVennDiagram)
library(eulerr)
source("C:/Users/Gregg/.RProfile")
1 Coding alignments
Coding sequences from mm10 were lifted over to the 6 new rodent genomes from the longest mouse transcript. These sequences were trimmed and aligned with MACSE for in-frame alignments. These alignments were filtered as follows:
= read.csv("../../data/wgs-ps-comps/04_penn_7spec_filter_cds.log", sep="\t", comment.char="#", stringsAsFactors=F)
aln_stats = subset(aln_stats, Short.seq == FALSE & Percent.ident.seqs.high == FALSE & Percent.no.info.high == FALSE & Premature.stop.codons == 0)
aln_stats print(paste("Number of alignments: ", nrow(aln_stats)))
## [1] "Number of alignments: 16773"
## Read the alignment filter data data
# CDS alignment filter
# PYTHON VERSION: 2.7.5
# Script call: /home/gt156213e/bin/core/generators/cds-aln/04_codon_filter.py -i ../aln/03_penn-7spec-trim/ -o ../aln/04_penn-7spec-filter/ -n 04_penn_7spec_filter_cds -e 7
# Runtime: 11/23/2020 14:18:01
# ----------------
# IO OPTIONS
# Input CDS directory: /mnt/beegfs/gt156213e/penn-genomes/cds/aln/03_penn-7spec-trim
# Input sequence type: codon
# Job name: 04_penn_7spec_filter_cds
# Output directory: /mnt/beegfs/gt156213e/penn-genomes/cds/aln/04_penn-7spec-filter
# Creating output directory.
# Log file: logs/04_penn_7spec_filter_cds.log
# ----------------
# Total aligns 21733
# Files skipped: 0
# Aligns written 16773.0
# Aligns shorter than 100bp 62.0
# Aligns with >50% identical sequences 1758.0
# Aligns with >20% of gappy/Ny sites 0.0
# Aligns with at least one premature stop: 3214.0
# Aligns with >20% of seqs with premature stops 962.0
# ----------------
# Spec Number >20% gappy Number premature stop
# >mnat 122 884
# >pdel 115 808
# >gdol 127 752
# >mmus 12 9
# >hall 127 780
# >rsor 130 767
# >rdil 83 608
# ----------------
1.1 Alignment lengths
= subset(aln_stats, Seq.length > 5000)
aln_len_filter = subset(aln_stats, Seq.length <= 5000)
aln_stats print(paste("Removing ", nrow(aln_len_filter), " alignments over 5000bp"))
## [1] "Removing 599 alignments over 5000bp"
= ggplot(aln_stats, aes(x=Seq.length)) +
raw_len_p geom_histogram(bins=50, color="#999999") +
scale_y_continuous(expand=c(0,0)) +
bartheme()
print(raw_len_p)
# Simple distribution of alignment lengths
2 Gene and species trees
Gene and species trees for this set of species were constructed with IQtree and ASTRAL. Both a concatenated and ASTRAL species tree were inferred The same topology was recovered with both methods. Below is the concatenated topology and branch lengths with gene and site concordance factors calculated with IQtree.
= "../../data/wgs-ps-comps/05_penn_7spec_iqtree.cf.rooted.tree"
tree_file = read.tree(tree_file)
concat_tree
= treeToDF(concat_tree)
tree_info_concat = tree_info_concat %>% separate(label, c("astral", "gcf", "scf"), sep="/", remove=F)
tree_info_concat $bootstrap[tree_info_concat$node.type=="tip"] = NA
tree_info_concat$bootstrap = as.numeric(tree_info_concat$astral)
tree_info_concat$gcf = as.numeric(tree_info_concat$gcf)
tree_info_concat$scf = as.numeric(tree_info_concat$scf)
tree_info_concat# Read astral tree data
$species = c("Rhynchomys soricoides", "Grammomys dolichurus", "Rhabdomys dilectus", "Hylomyscus alleni", "Mastomys natalensis", "Praomys delectorum", "Mus musculus", NA, NA, NA, NA, NA, NA) tree_info_concat
2.1 Species tree with branches colored by gene concordance factor
= corecol(numcol=1, pal="wilke", offset=3)
h = corecol(numcol=1, offset=3)
l # Colors
= ggtree(concat_tree, size=2, ladderize=F, aes(color=tree_info_concat$gcf)) +
gcf_tree scale_color_continuous(name='gCF', low=l, high=h, limits=c(0,100)) +
xlim(0, 0.055) +
geom_tiplab(aes(label=tree_info_concat$species), color="#333333", fontface='italic', size=5) +
theme(legend.position="right")
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(gcf_tree)
# gCF tree
2.2 Species tree with branches colored by site concordance factor
= ggtree(concat_tree, size=2, ladderize=F, aes(color=tree_info_concat$scf)) +
scf_tree scale_color_continuous(name='sCF', low=l, high=h, limits=c(0,100)) +
xlim(0, 0.055) +
geom_tiplab(aes(label=tree_info_concat$species), color="#333333", fontface='italic', size=5) +
theme(legend.position="right")
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(scf_tree)
# sCF tree
2.3 Discordance among gene trees and the species tree
Only 14% of gene trees are completely concordant with the species topology. Most gene trees do not match the topology of the species tree.
= read.csv("../../data/wgs-ps-comps/mm10-cds-windows.csv", header=T, comment.char="#", stringsAsFactors=F)
transcript_windows # The 10kb windows that overlap with mouse transcripts
= read.csv("../../data/wgs-ps-comps/mm10-transcripts-longest.tab", sep="\t", header=T, comment.char="#", stringsAsFactors=F)
longest_transcripts # The transcripts used for gene trees and dN/dS/
= subset(transcript_windows, Transcript.ID %in% longest_transcripts$feature.id)
twin # Only take windows that have one of the transcripts we used in them.
= subset(twin, Gene.tree.match.concat.tree==TRUE)
concordant_gt = unique(select(concordant_gt, Gene.ID, chr, CDS.start, CDS.end))
concordant_gt # Get the concordant gene trees
= subset(twin, Gene.tree.match.concat.tree==FALSE)
discordant_gt = unique(select(discordant_gt, Gene.ID, chr, CDS.start, CDS.end))
discordant_gt # Get the discordant gene trees
## Read the windowed phylogeny and trancript data
#twin_uniq = unique(select(twin, Gene.ID, chr, CDS.start, CDS.end, Gene.tree.match.concat.tree))
# Get only a single row from each gene
= data.frame("type"=c("Concordant", "Discordant"), "count"=c(nrow(concordant_gt), nrow(discordant_gt)))
concordance # Combine the concordant and discordant gene trees into a single table
= ggplot(concordance, aes(x=1, y=count, fill=type)) +
concordant_p geom_bar(stat="identity") +
ylab("# of gene trees") +
scale_y_continuous(expand=c(0,0)) +
scale_x_continuous(limits=c(0,2)) +
scale_fill_manual(values=corecol(numcol=2)) +
bartheme() +
theme(legend.position="bottom",
axis.line.y=element_blank(),
axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank()) +
coord_flip()
print(concordant_p)
# Simple barplot of the proportion of concordant and discordant genes
3 Genes and 10kb window overlap
Most coding regions of genes only overlap a few 10kb windows.
$cds.len = (twin$CDS.end - twin$CDS.start) / 1000000
twin= mean(twin$cds.len)
mean_cds_len # Calculate CDS length in Mb
= unique(select(twin, Transcript.ID, CDS.start, CDS.end, cds.len))
cds_len # Take only one window entry per CDS
#ts_counts = count(twin$Transcript.ID)
= as.data.frame(table(twin$Transcript.ID))
ts_counts # Count the number of times a transcript overlaps a window.
= ggplot(subset(ts_counts, Freq<=30), aes(x=Freq)) +
cds_win_p geom_histogram(bins=30, fill=corecol(pal="trek", numcol=1, offset=2), color="#ececec") +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
xlab("# of 10kb windows overlapped by CDS") +
ylab("# of genes") +
bartheme()
print(cds_win_p)
print(paste("Average distance from start to end of CDS: ", mean(cds_len$cds.len) * 1000000))
## [1] "Average distance from start to end of CDS: 43025.1631971369"
print(paste("Median distance from start to end of CDS: ", median(cds_len$cds.len) * 1000000))
## [1] "Median distance from start to end of CDS: 13295"
4 Rate estimates and tree misspecification
4.1 MG94-local model fit
First, a basic fit with HyPhy’s MG94 model to estimate dN and dS per gene.
= read.csv("../../data/wgs-ps-comps/penn-7spec-mg94-local-concat.csv", header=T, comment.char="#", stringsAsFactors=F)
concat_mg_local #concat_mg_local$id = as.string(concat_mg_local$id)
= group_by(concat_mg_local, id) %>% summarize(dn=mean(dn, na.rm=T), ds=mean(ds, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
concat_mg_local_gene # Average dn and ds across all branches for each gene
= read.csv("../../data/wgs-ps-comps/penn-7spec-mg94-local-gt.csv", header=T, comment.char="#", stringsAsFactors=F)
gt_mg_local = group_by(gt_mg_local, id) %>% summarize(dn=mean(dn, na.rm=T), ds=mean(ds, na.rm=T), dn.ds=mean(dn.ds, na.rm=T))
gt_mg_local_gene # Average dn and ds across all branches for each gene
## Read the results from the LOCAL MG94 model (dN and dS estimated for each branch)
Average dS distributions are similar to both the concatenated tree and gene trees
4.1.1 Distributions of avg dS per gene
= seq(0,0.2,by=0.001)
bins = seq(0.001,0.2,by=0.001)
bin_labels = hist(concat_mg_local_gene$ds, breaks=bins, include.lowest=T, plot=F)$counts
concat_counts = data.frame("bin"=bin_labels, "count"=concat_counts, filter="To concat tree")
concat_binned
= hist(gt_mg_local_gene$ds, breaks=bins, include.lowest=T, plot=F)$counts
gt_counts = data.frame("bin"=bin_labels, "count"=gt_counts, filter="To gene trees")
gt_binned
= bind_rows(concat_binned, gt_binned)
binned_data = corecol(numcol=2)
col = c("To concat tree", "To gene trees")
lab #ylabels = c("1200", as.character(seq(from=0, to=7000, by=1000)))
= as.character(seq(from=-1000, to=1000, by=100))
ylabels
= ggplot(binned_data, aes(x = bin, y = ifelse(filter == "To gene trees",-1, 1)*count, fill = filter)) +
dualp geom_col(width=0.001) +
geom_hline(yintercept=0) +
scale_x_continuous(name = "Avg. dS", limits = c(0, 0.2), expand = c(0, 0)) +
scale_y_continuous(name = "# genes", limits=c(-1000,1000), breaks = seq(from=-1000, to=1000, by=100), labels=ylabels) +
scale_fill_manual(name="", limits=lab, values=col) +
bartheme()
print(dualp)
# Distributions of dS
= 0.0575
ds_filter_level print(paste("Removing genes with dS above ", ds_filter_level, " from subsequent analyses."))
## [1] "Removing genes with dS above 0.0575 from subsequent analyses."
= subset(concat_mg_local_gene, ds > ds_filter_level)
concat_ds_filter = subset(gt_mg_local_gene, ds > ds_filter_level)
gt_ds_filter = intersect(concat_ds_filter$id, gt_ds_filter$id)
ds_filter # Get a list of genes to filter out in subsequent analyses based on dS
4.2 BUSTED
Next, each gene was run through HyPhy’s BUSTED test to see whether “a gene has experienced positive selection at at least one site on at least one branch.” No target branches were specified.
BUSTED estimates 3 omega categories, with one allowed to be larger than 1. It then estimates the same model but restricting the third omega to equal 1. These two models are compared with a likelihood ratio test and evidence ratios for each site.
Below, I plot the number of genes the where the p-value for the LRT is less than 0.01, adjusting pvalues with FDR.
4.2.1 BUSTED genes under positive selection with concatenation vs gene trees
= read.csv("../../data/wgs-ps-comps/penn-7spec-busted-concat.csv", header=T, comment.char="#", stringsAsFactors=F)
concat_busted = concat_busted[!concat_busted$id %in% ds_filter,]
concat_busted = read.csv("../../data/wgs-ps-comps/penn-7spec-busted-gt.csv", header=T, comment.char="#", stringsAsFactors=F)
gt_busted = gt_busted[!gt_busted$id %in% ds_filter,]
gt_busted
#concat_busted_ps = subset(concat_busted, pval < 1 - ( 1-0.01)^(1/nrow(concat_busted)))
#gt_busted_ps = subset(gt_busted, pval < 1 - ( 1-0.01)^(1/nrow(gt_busted)))
# Dunn-Sidak correction: Genes with p-value less than corrected threshold inferred as PS
$pval = p.adjust(concat_busted$pval, "fdr")
concat_busted$pval = p.adjust(gt_busted$pval, "fdr")
gt_busted# Adjust p-values for multiple testing
= subset(concat_busted, pval < 0.01)
concat_busted_ps = subset(gt_busted, pval < 0.01)
gt_busted_ps # No multi-test correction: Genes with p-values less than 0.01 inferred as PS
## Read the BUSTED data for both concat and gene trees.
= list("gt.ids"=gt_busted_ps$id, "concat.ids"=concat_busted_ps$id)
busted_comp #ggVennDiagram(busted_comp)
plot(euler(busted_comp, shape = "ellipse"), quantities=TRUE, fill=corecol(numcol=2, pal="wilke"), legend=list(labels=c("Gene trees", "Concat tree")))
## Venn diagrams for genes under PS in the concat and gt analyses
Here, we find more positively selected genes when using the correct gene tree. This is opposite to our expectation, indicating that this test is prone to false negatives when using the wrong (concatenated) tree.
4.2.2 CDS length for BUSTED PS categories
There is no difference in length of coding regons between categories.
= merge(x=concat_busted_ps, y=twin, by.x="id", by.y="Gene.ID")
concat_busted_ps_m $cds.len = concat_busted_ps_m$CDS.end - concat_busted_ps_m$CDS.start
concat_busted_ps_m= merge(x=gt_busted_ps, y=twin, by.x="id", by.y="Gene.ID")
gt_busted_ps_m $cds.len = gt_busted_ps_m$CDS.end - gt_busted_ps_m$CDS.start
gt_busted_ps_m## Get CDS lengths for both concat and gt genes
= subset(concat_busted_ps_m, !id %in% gt_busted_ps_m$id)
concat_uniq = unique(select(concat_uniq, id, lrt, pval, cds.len))
concat_uniq $type = "Concat only"
concat_uniq# Get genes unique to the concat dataset and label
## Concat genes
= subset(gt_busted_ps_m, !id %in% concat_busted_ps_m$id)
gt_uniq = unique(select(gt_uniq, id, lrt, pval, cds.len))
gt_uniq $type = "GT only"
gt_uniq# Get genes unique to the gt dataset and label
## Gene tree genes
= subset(gt_busted_ps_m, id %in% concat_busted_ps_m$id)
gt_concat_shared = unique(select(gt_concat_shared, id, lrt, pval, cds.len))
gt_concat_shared $type = "Shared"
gt_concat_shared# Get genes shared between the two datasets.
= rbind(gt_uniq, concat_uniq, gt_concat_shared)
ps_lens = subset(ps_lens, cds.len < 50000)
ps_lens # Combine data and filter for length
= list(c("Concat only", "Shared"), c("Concat only", "GT only"))
x_comps # Set up comparisons for significance testing with Wilcox
= ggplot(ps_lens, aes(x=type, y=cds.len)) +
ps_len_p geom_quasirandom(size=2, width=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.3, width=0.5, fill="transparent", color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("") +
ylab("CDS length (bp)") +
bartheme() +
theme(legend.position="none")
print(ps_len_p)
# Plot CDS lengths of each category
## CDS length distributions of positively selected genes
4.2.3 dS for BUSTED PS categories
There is a slight increase in dS for concatenated genes compared to those found in both analyses, but power may be an issue.
= merge(x=concat_busted_ps, y=concat_mg_local_gene, by.x="id", by.y="id")
concat_busted_ps_mg = merge(x=gt_busted_ps, y=gt_mg_local_gene, by.x="id", by.y="id")
gt_busted_ps_mg # Get dS estimates from the MG94 local fit for each gene in the BUSTED analysis
= subset(concat_busted_ps_mg, !id %in% gt_busted_ps_mg$id)
concat_busted_ps_mg_u = unique(select(concat_busted_ps_mg_u, id, dn.ds, dn, ds))
concat_busted_ps_mg_u $type = "Concat only"
concat_busted_ps_mg_u# Select the genes with evidence for PS in concatenated analysis only
= subset(gt_busted_ps_mg, !id %in% concat_busted_ps_mg$id)
gt_busted_ps_mg_u = unique(select(gt_busted_ps_mg_u, id, dn.ds, dn, ds))
gt_busted_ps_mg_u $type = "GT only"
gt_busted_ps_mg_u# Select the genes with evidence for PS in gene tree analysis only
= subset(gt_busted_ps_mg, id %in% concat_busted_ps_mg$id)
gt_concat_shared_u = unique(select(gt_concat_shared_u, id, dn.ds, dn, ds))
gt_concat_shared_u $type = "Shared"
gt_concat_shared_u# Get genes shared between the two datasets.
= rbind(gt_busted_ps_mg_u, concat_busted_ps_mg_u, gt_concat_shared_u)
ps_dnds = subset(ps_dnds, dn.ds < 10)
ps_dnds # Combine data and filter for length
= list(c("Concat only", "Shared"), c("GT only", "Shared"), c("Concat only", "GT only"))
x_comps # Set up comparisons for significance testing with Wilcox
= ggplot(ps_dnds, aes(x=type, y=ds)) +
dnds_p geom_quasirandom(size=2, width=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.3, width=0.5, fill="transparent", color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("") +
ylab("dS") +
bartheme() +
theme(legend.position="none")
print(dnds_p)
# Plot dS of each category
## CDS length distributions of positively selected genes
4.2.4 CDS length for BUSTED genes with gene trees
For the gene tree analysis, coding regions from genes under positive selection are slightly longer than those not under positive selection.
$cds.len = twin$CDS.end - twin$CDS.start
twin= unique(select(twin, Gene.ID, cds.len))
twin_uniq = subset(twin_uniq, !Gene.ID %in% ps_lens$id)
twin_uniq $type2 = "All other CDS"
twin_uniqnames(twin_uniq)[1] = "id"
# Get lengths of all CDS not under positive selection
= subset(ps_lens, type!="Concat only")
ps_lens_sub $type2 = "PS CDS"
ps_lens_sub# Add another label tot he PS genes
= rbind(twin_uniq, select(ps_lens_sub, id, cds.len, type2))
all_cds = subset(all_cds, cds.len < 50000)
all_cds # Combine data and filter for length
= list(c("All other CDS", "PS CDS"))
x_comps = ggplot(all_cds, aes(x=type2, y=cds.len)) +
all_len_p geom_quasirandom(size=2, width=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.3, width=0.5, fill="transparent", color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("") +
ylab("CDS length (bp)") +
bartheme() +
theme(legend.position="none")
print(all_len_p)
# Plot CDS lengths of each category
print(paste("Average distance from start to end of PS CDS : ", mean(ps_lens$cds.len)))
## [1] "Average distance from start to end of PS CDS : 14587.5440414508"
print(paste("Average distance from start to end of non-PS CDS: ", mean(all_cds$cds.len)))
## [1] "Average distance from start to end of non-PS CDS: 12950.9640647133"
## CDS length distributions of positively selected genes vs non-positively selected genes
4.3 aBSREL
Next, we did the same analysis using HyPhy’s aBSREL test. aBSREL is a “branch-site” model, but does not test for selection at specific sites. “Instead, aBSREL will test, for each branch (or branch of interest) in the phylogeny, whether a proportion of sites have evolved under positive selection.” aBSREL infers the number of omega classes for each branch and uses a LRT against a model where branches are not allowed to have omega classes > 1. Below are the results without setting a foreground branch (testing the whole phylogeny per gene). P-values are corrected for multiple testing by aBSREL for multiple tests per gene, and I probably need to correct again for multiple gene tests with FDR.
= read.csv("../../data/wgs-ps-comps/penn-7spec-absrel-concat.csv", header=T, comment.char="#", stringsAsFactors=F)
concat_absrel = concat_absrel[!concat_absrel$id %in% ds_filter,]
concat_absrel
= read.csv("../../data/wgs-ps-comps/penn-7spec-absrel-gt.csv", header=T, comment.char="#", stringsAsFactors=F)
gt_absrel = gt_absrel[!gt_absrel$id %in% ds_filter,]
gt_absrel
= subset(concat_absrel, num.ps.branches > 0)
concat_absrel_ps = subset(gt_absrel, num.ps.branches > 0)
gt_absrel_ps ## Read the aBSREL data
= list("gt.ids"=gt_absrel_ps$id, "concat.ids"=concat_absrel_ps$id)
absrel_comp #ggVennDiagram(absrel_comp)
plot(euler(absrel_comp, shape = "ellipse"), quantities=TRUE, fill=corecol(numcol=2, pal="wilke"), legend=list(labels=c("Gene trees", "Concat tree")))
## Venn diagrams for genes under PS in the concat and gt analyses
With this test we see very similar results whether using concatenated or gene trees.
4.3.1 dS for aBSREL PS categories
We see no difference in dS between categories.
= merge(x=concat_absrel_ps, y=concat_mg_local_gene, by.x="id", by.y="id")
concat_absrel_ps_mg = merge(x=gt_absrel_ps, y=gt_mg_local_gene, by.x="id", by.y="id")
gt_absrel_ps_mg
= subset(concat_absrel_ps_mg, !id %in% gt_absrel_ps_mg$id)
concat_absrel_ps_mg_u = unique(select(concat_absrel_ps_mg_u, id, dn.ds, dn, ds))
concat_absrel_ps_mg_u $type = "Concat only"
concat_absrel_ps_mg_u
= subset(gt_absrel_ps_mg, !id %in% concat_absrel_ps_mg$id)
gt_absrel_ps_mg_u = unique(select(gt_absrel_ps_mg_u, id, dn.ds, dn, ds))
gt_absrel_ps_mg_u $type = "GT only"
gt_absrel_ps_mg_u
= subset(gt_absrel_ps_mg, id %in% concat_absrel_ps_mg$id)
gt_concat_shared_u = unique(select(gt_concat_shared_u, id, dn.ds, dn, ds))
gt_concat_shared_u $type = "Shared"
gt_concat_shared_u# Get genes shared between the two datasets.
= rbind(gt_absrel_ps_mg_u, concat_absrel_ps_mg_u, gt_concat_shared_u)
ps_dnds = subset(ps_dnds, dn.ds < 10)
ps_dnds # Combine data and filter for length
= list(c("Concat only", "Shared"), c("GT only", "Shared"), c("Concat only", "GT only"))
x_comps # Set up comparisons for significance testing with Wilcox
= ggplot(ps_dnds, aes(x=type, y=ds)) +
dnds_p geom_quasirandom(size=2, width=0.25) +
geom_boxplot(outlier.shape=NA, alpha=0.3, width=0.5, fill="transparent", color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("") +
ylab("dS") +
bartheme() +
theme(legend.position="none")
print(dnds_p)
# Plot CDS lengths of each category
## CDS length distributions of positively selected genes
4.4 PAML’s M1a vs M2a
Finally, we ran PAML’s M1a vs M2a. This is a site model, meaning omega can vary among sites. M1a allows for 2 omega categories while M2a allows a third category to be estimated > 1. These two models are compared with a LRT and the p-values have been corrected with FDR.
= read.csv("../../data/wgs-ps-comps/penn-7spec-m1a-concat.csv", header=T, comment.char="#", stringsAsFactors=F)
concat_m1a = unique(select(concat_m1a, id, chr, start, end, lnl, k))
concat_m1a = concat_m1a[!concat_m1a$id %in% ds_filter,]
concat_m1a = read.csv("../../data/wgs-ps-comps//penn-7spec-m2a-concat.csv", header=T, comment.char="#", stringsAsFactors=F)
concat_m2a = unique(select(concat_m2a, id, chr, start, end, lnl, k))
concat_m2a = concat_m2a[!concat_m2a$id %in% ds_filter,]
concat_m2a # Read and filter the data
= merge(x=concat_m1a, y=concat_m2a, by="id")
concat_paml $lrt = 2 * (concat_paml$lnl.y - concat_paml$lnl.x)
concat_paml$pval = pchisq(concat_paml$lrt, 2, lower.tail=F)
concat_paml$pval = p.adjust(concat_paml$pval, "fdr")
concat_paml= subset(concat_paml, concat_paml$pval < 0.01)
concat_paml_ps # Perform the likelihood ratio test and p-value adjustment
## CONCAT DATA
= read.csv("../../data/wgs-ps-comps/penn-7spec-m1a-gt.csv", header=T, comment.char="#", stringsAsFactors=F)
gt_m1a = unique(select(gt_m1a, id, chr, start, end, lnl, k))
gt_m1a = gt_m1a[!gt_m1a$id %in% ds_filter,]
gt_m1a = read.csv("../../data/wgs-ps-comps/penn-7spec-m2a-gt.csv", header=T, comment.char="#", stringsAsFactors=F)
gt_m2a = unique(select(gt_m2a, id, chr, start, end, lnl, k))
gt_m2a = gt_m2a[!gt_m2a$id %in% ds_filter,]
gt_m2a # Read and filter the data
= merge(x=gt_m1a, y=gt_m2a, by="id")
gt_paml $lrt = 2 * (gt_paml$lnl.y - gt_paml$lnl.x)
gt_paml$pval = pchisq(gt_paml$lrt, 2, lower.tail=F)
gt_paml$pval = p.adjust(gt_paml$pval, "fdr")
gt_paml= subset(gt_paml, gt_paml$pval < 0.01)
gt_paml_ps # Perform the likelihood ratio test and p-value adjustment
## GT DATA
= list("gt.ids"=gt_paml_ps$id, "concat.ids"=concat_paml_ps$id)
paml_comp #ggVennDiagram(paml_comp)
plot(euler(paml_comp, shape = "ellipse"), quantities=TRUE, fill=corecol(numcol=2, pal="wilke"), legend=list(labels=c("Gene trees", "Concat tree")))
## Venn diagrams for genes under PS in the concat and gt analyses
Here we find a large excess of genes inferred to be under positive selection when using the concatenated tree.