Murine exome alignments
<- function(df, column, max_bin){
binData # Function to pre-bin SVs by length
= data.frame("bin"=seq(from=0,to=max_bin,by=10), count=0)
binned_df for(i in 1:nrow(df)) {
<- df[i,]
row = floor(row[[column]]/10)*10
bin $bin==bin,]$count = binned_df[binned_df$bin==bin,]$count + 1
binned_df[binned_df
}return(binned_df)
}
::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
knitr
library(ggplot2)
library(cowplot)
library(ggbeeswarm)
library(ggpubr)
#library(plyr)
library(dplyr)
library(kableExtra)
source("../lib/design.r")
#htmltools::includeHTML("../html-chunks/rmd_nav.html")
Alignment and filtering steps:
1. BLAST mouse exons to all assembled contigs to identify exon-contig homology.
2. Exonerate mouse exons (as trimmed AA sequences) to matching contig for each sample.
3. Remove exons that have fewer than 175 samples with a matching BLAST hit.
4. Align amino acid exonerate hits with MAFFT.
5. Backtranslate to nucleotide sequences.
6. For each alignment, remove sequences that are >20% gaps.
7. For each alignment, remove 3-codon windows in which 2 or more codons have 2 or more gaps in >50% of sequences.
8. Remove any samples with pre-mature stop codons.
1 Coding alignments
1.1 Exon stats
1.1.1 Exons vs BLAST hits
= read.csv("../../data/aln-stats/sample-hits-exons.csv", header=T)
blast_data
= ggplot(blast_data, aes(x=total.exons, y=blast.hits)) +
exons_v_bhits geom_abline(aes(slope=1, intercept=0, color="1:1"), size=1, linetype="dashed", show.legend=F) +
geom_point(size=2, alpha=0.1, color=corecol(pal="wilke", numcol=1, offset=6)) +
geom_smooth(aes(color="Fit"), method="lm", linetype="dashed", se=F) +
ylab("# of BLAST hits per sample") +
xlab("# of mouse exons") +
scale_color_manual(values=c("1:1"="#999999", "Fit"=corecol(pal="wilke", numcol=1, offset=6))) +
bartheme() +
theme(legend.position="right")
print(exons_v_bhits)
1.1.2 BLAST hits vs exonerate hits (no hit threshold)
= read.csv("../../data/aln-stats/targets-exonerate-f0-TEST.csv", header=T)
targets_exonerate
= ggplot(targets_exonerate, aes(x=init.sample.hits, y=exonerate.sample.hits)) +
bhits_v_ehits geom_abline(aes(slope=1, intercept=0, color="1:1"), size=1, linetype="dashed", show.legend=F) +
geom_point(size=2, alpha=0.1, color=corecol(pal="wilke", numcol=1, offset=4)) +
geom_smooth(aes(color="Fit"), method="lm", linetype="dashed", se=F) +
xlab("# of samples with BLAST hits") +
ylab("# of samples with\nexonerate hits") +
scale_color_manual(values=c("1:1"="#999999", "Fit"=corecol(pal="wilke", numcol=1, offset=4))) +
bartheme() +
theme(legend.position="right")
print(bhits_v_ehits)
1.1.3 Mouse exons vs exonerate hits (no hit threshold)
= read.csv("../../data/aln-stats/sample-targets-exonerate-f0-TEST.csv", header=T)
sample_targets_exonerate ## HEADERS ARE OFF
= ggplot(sample_targets_exonerate, aes(x=pid.1, y=mm.exons)) +
exons_v_hits geom_abline(aes(slope=1, intercept=0, color="1:1"), size=1, linetype="dashed", show.legend=F) +
geom_point(size=2, alpha=0.1, color=corecol(pal="wilke", numcol=1, offset=5)) +
geom_smooth(aes(color="Fit"), method="lm", linetype="dashed", se=F) +
xlab("# of exons in mouse protein") +
ylab("# of exonerate hits\nper sample per protein") +
scale_color_manual(values=c("1:1"="#999999", "Fit"="#000000")) +
bartheme() +
theme(legend.position="right")
print(exons_v_hits)
1.2 Alignment stats
1.2.1 Number of sequences per alignment
= read.csv("../../data/aln-stats/aln-stats-f150-seq20-site50.tab", header=T, sep="\t", comment.char="#")
aln_data_150 = read.csv("../../data/aln-stats/aln-stats-f175-seq20-site50.tab", header=T, sep="\t", comment.char="#")
aln_data_175 = read.csv("../../data/aln-stats/aln-stats-mclennan-f0-seq20-site50.tab", header=T, sep="\t", comment.char="#")
aln_data_mcl = read.csv("../../data/aln-stats/aln-stats-aus-full-f0-seq20-site50.tab", header=T, sep="\t", comment.char="#")
aln_data_aus = read.csv("../../data/aln-stats/roycroft-aln-stats.tab", header=T, sep="\t", comment.char="#")
gbe_data
= corecol(numcol=10)
filter_col
= select(aln_data_150, align, pre.num.seqs)
pre_seqs_150 $label = "Pre-filter (F150)"
pre_seqs_150names(pre_seqs_150)[2] = "num.seqs"
= select(aln_data_150, align, post.num.seqs)
post_seqs_150 $label = "Post-filter (F150)"
post_seqs_150names(post_seqs_150)[2] = "num.seqs"
= select(aln_data_175, align, pre.num.seqs)
pre_seqs_175 $label = "Pre-filter (F175)"
pre_seqs_175names(pre_seqs_175)[2] = "num.seqs"
= select(aln_data_175, align, post.num.seqs)
post_seqs_175 $label = "Post-filter (F175)"
post_seqs_175names(post_seqs_175)[2] = "num.seqs"
= select(aln_data_mcl, align, pre.num.seqs)
pre_seqs_mcl $label = "Pre-filter (McL F0)"
pre_seqs_mclnames(pre_seqs_mcl)[2] = "num.seqs"
= select(aln_data_mcl, align, post.num.seqs)
post_seqs_mcl $label = "Post-filter (McL F0)"
post_seqs_mclnames(post_seqs_mcl)[2] = "num.seqs"
= select(aln_data_aus, align, pre.num.seqs)
pre_seqs_aus $label = "Pre-filter (Aus F0)"
pre_seqs_ausnames(pre_seqs_aus)[2] = "num.seqs"
= select(aln_data_aus, align, post.num.seqs)
post_seqs_aus $label = "Post-filter (Aus F0)"
post_seqs_ausnames(post_seqs_aus)[2] = "num.seqs"
= rbind(pre_seqs_150, post_seqs_150, pre_seqs_175, post_seqs_175, pre_seqs_mcl, post_seqs_mcl, pre_seqs_aus, post_seqs_aus)
num_seqs
$label = factor(num_seqs$label, levels=c("Pre-filter (F150)", "Post-filter (F150)", "Pre-filter (F175)", "Post-filter (F175)", "Pre-filter (McL F0)", "Post-filter (McL F0)", "Pre-filter (Aus F0)", "Post-filter (Aus F0)"))
num_seqs
= ggplot(num_seqs, aes(x=label, y=num.seqs, fill=label)) +
aln_seqs geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
geom_boxplot(outlier.shape=NA, alpha=0.5, width=0.5, color="#666666") +
ylab("# of aligned sequences\nper protein") +
xlab("") +
scale_fill_manual(labels=c("Pre-filter (F150)", "Post-filter (F150)", "Pre-filter (F175)", "Post-filter (F175)", "Pre-filter (McL F0)", "Post-filter (McL F0)", "Pre-filter (Aus F0)", "Post-filter (Aus F0)"), values=filter_col) +
scale_x_discrete(labels=c("Pre-filter (F150)", "Post-filter (F150)", "Pre-filter (F175)", "Post-filter (F175)", "Pre-filter (McL F0)", "Post-filter (McL F0)", "Pre-filter (Aus F0)", "Post-filter (Aus F0)")) +
#scale_y_continuous(expand=c(0,0)) +
bartheme() +
theme(legend.position="none",
axis.text.x=element_text(angle=45, hjust=1))
print(aln_seqs)
1.2.2 Length of alignment
= select(aln_data_150, align, pre.codon.aln.length)
pre_seqs_150 $label = "Pre-filter (F150)"
pre_seqs_150names(pre_seqs_150)[2] = "aln.len"
= select(aln_data_150, align, post.codon.aln.length)
post_seqs_150 $label = "Post-filter (F150)"
post_seqs_150names(post_seqs_150)[2] = "aln.len"
= select(aln_data_175, align, pre.codon.aln.length)
pre_seqs_175 $label = "Pre-filter (F175)"
pre_seqs_175names(pre_seqs_175)[2] = "aln.len"
= select(aln_data_175, align, post.codon.aln.length)
post_seqs_175 $label = "Post-filter (F175)"
post_seqs_175names(post_seqs_175)[2] = "aln.len"
= select(aln_data_mcl, align, pre.codon.aln.length)
pre_seqs_mcl $label = "Pre-filter (McL F0)"
pre_seqs_mclnames(pre_seqs_mcl)[2] = "aln.len"
= select(aln_data_mcl, align, post.codon.aln.length)
post_seqs_mcl $label = "Post-filter (McL F0)"
post_seqs_mclnames(post_seqs_mcl)[2] = "aln.len"
= select(aln_data_aus, align, pre.codon.aln.length)
pre_seqs_aus $label = "Pre-filter (Aus F0)"
pre_seqs_ausnames(pre_seqs_aus)[2] = "aln.len"
= select(aln_data_aus, align, post.codon.aln.length)
post_seqs_aus $label = "Post-filter (Aus F0)"
post_seqs_ausnames(post_seqs_aus)[2] = "aln.len"
= select(gbe_data, align, pre.codon.aln.length)
gbe_seqs $label = "GBE"
gbe_seqsnames(gbe_seqs)[2] = "aln.len"
= subset(gbe_seqs, aln.len < 10000)
gbe_seqs
= rbind(pre_seqs_150, post_seqs_150, pre_seqs_175, post_seqs_175, pre_seqs_mcl, post_seqs_mcl, pre_seqs_aus, post_seqs_aus, gbe_seqs)
num_seqs = subset(num_seqs, aln.len <= 1000)
num_seqs
$label = factor(num_seqs$label, levels=c("Pre-filter (F150)", "Post-filter (F150)", "Pre-filter (F175)", "Post-filter (F175)", "Pre-filter (McL F0)", "Post-filter (McL F0)", "Pre-filter (Aus F0)", "Post-filter (Aus F0)", "GBE"))
num_seqs
= ggplot(num_seqs, aes(x=label, y=aln.len, fill=label)) +
aln_len geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
geom_boxplot(outlier.shape=NA, alpha=0.5, width=0.5, color="#666666") +
ylab("Alignment length (codons)") +
xlab("") +
scale_fill_manual(labels=c("Pre-filter (F150)", "Post-filter (F150)", "Pre-filter (F175)", "Post-filter (F175)", "Pre-filter (McL F0)", "Post-filter (McL F0)", "Pre-filter (Aus F0)", "Post-filter (Aus F0)", "GBE"), values=filter_col) +
scale_x_discrete(labels=c("Pre-filter (F150)", "Post-filter (F150)", "Pre-filter (F175)", "Post-filter (F175)", "Pre-filter (McL F0)", "Post-filter (McL F0)", "Pre-filter (Aus F0)", "Post-filter (Aus F0)", "GBE")) +
#scale_y_continuous(expand=c(0,0)) +
bartheme() +
theme(legend.position="none",
axis.text.x=element_text(angle=45, hjust=1))
print(aln_len)
# bin_width = 100
#
# bins = seq(0,6000,by=bin_width)
# bin_labels = seq(1,6000,by=bin_width)
# pre_counts = hist(aln_data$pre.codon.aln.length, breaks=bins, include.lowest=T, plot=F)$counts
# pre_binned = data.frame("bin"=bin_labels, "count"=pre_counts, filter="Pre-filter")
#
# filter_counts = hist(aln_data$post.codon.aln.length, breaks=bins, include.lowest=T, plot=F)$counts
# filter_binned = data.frame("bin"=bin_labels, "count"=filter_counts, filter="Post-filter")
#
# binned_data = rbind.fill(pre_binned, filter_binned)
# col = corecol(numcol=2)
# lab = c("Pre-filter", "Post-filter")
# #ylabels = c("1200", as.character(seq(from=0, to=7000, by=1000)))
# ylabels = as.character(seq(from=-200, to=200, by=100))
#
# dualp = ggplot(binned_data, aes(x = bin, y = ifelse(filter == "Post-filter",-1, 1)*count, fill = filter)) +
# geom_col(width=bin_width) +
# geom_hline(yintercept=0) +
# scale_x_continuous(name = "dS", limits = c(-bin_width, 7000), expand = c(0, 0)) +
# scale_y_continuous(name = "# Transcripts", limits=c(-200,200), breaks = 100*(-2:2), labels=ylabels) +
# scale_fill_manual(name="", limits=lab, values=col) +
# bartheme()
# print(dualp)
1.2.3 Avg non-gapped sequence length per protein
= read.csv("../../data/Mus-selected-sequences_metadata_samp-counts_ratids.csv", header=TRUE)
mus_data = subset(mus_data, feature_type=="CDS")
mus_data = select(mus_data, transcript_id, exon_length)
mus_lens = mus_lens %>% group_by(transcript_id) %>% summarize(nongap.len=sum(exon_length))
mus_lens $nongap.len = mus_lens$nongap.len / 3
mus_lens$label = "Mus CDS"
mus_lensnames(mus_lens)[1] = "align"
= subset(mus_lens, nongap.len < 5000)
mus_lens
= select(aln_data_150, align, pre.avg.nongap.length)
pre_seqs_150 $label = "Pre-filter (F150)"
pre_seqs_150names(pre_seqs_150)[2] = "nongap.len"
= select(aln_data_150, align, post.avg.nongap.length)
post_seqs_150 $label = "Post-filter (F150)"
post_seqs_150names(post_seqs_150)[2] = "nongap.len"
= select(aln_data_175, align, pre.avg.nongap.length)
pre_seqs_175 $label = "Pre-filter (F175)"
pre_seqs_175names(pre_seqs_175)[2] = "nongap.len"
= select(aln_data_175, align, post.avg.nongap.length)
post_seqs_175 $label = "Post-filter (F175)"
post_seqs_175names(post_seqs_175)[2] = "nongap.len"
= select(aln_data_mcl, align, pre.avg.nongap.length)
pre_seqs_mcl $label = "Pre-filter (McL F0)"
pre_seqs_mclnames(pre_seqs_mcl)[2] = "nongap.len"
= select(aln_data_mcl, align, post.avg.nongap.length)
post_seqs_mcl $label = "Post-filter (McL F0)"
post_seqs_mclnames(post_seqs_mcl)[2] = "nongap.len"
= select(aln_data_aus, align, pre.avg.nongap.length)
pre_seqs_aus $label = "Pre-filter (Aus F0)"
pre_seqs_ausnames(pre_seqs_aus)[2] = "nongap.len"
= select(aln_data_aus, align, post.avg.nongap.length)
post_seqs_aus $label = "Post-filter (Aus F0)"
post_seqs_ausnames(post_seqs_aus)[2] = "nongap.len"
= select(gbe_data, align, pre.avg.nongap.length)
gbe_seqs $label = "GBE"
gbe_seqsnames(gbe_seqs)[2] = "nongap.len"
= subset(gbe_seqs, nongap.len < 10000)
gbe_seqs
= rbind(pre_seqs_150, post_seqs_150, pre_seqs_175, post_seqs_175, pre_seqs_mcl, post_seqs_mcl, pre_seqs_aus, post_seqs_aus, gbe_seqs, mus_lens)
num_seqs = subset(num_seqs, nongap.len <= 1000)
num_seqs
$label = factor(num_seqs$label, levels=c("Pre-filter (F150)", "Post-filter (F150)", "Pre-filter (F175)", "Post-filter (F175)", "Pre-filter (McL F0)", "Post-filter (McL F0)", "Pre-filter (Aus F0)", "Post-filter (Aus F0)", "GBE", "Mus CDS"))
num_seqs
= ggplot(num_seqs, aes(x=label, y=nongap.len, fill=label)) +
nongap_len geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
geom_boxplot(outlier.shape=NA, alpha=0.5, width=0.5, color="#666666") +
ylab("Avg. ungapped\nsequence length (codons)") +
xlab("") +
scale_fill_manual(labels=c("Pre-filter (150)", "Post-filter (150)", "Pre-filter (175)", "Post-filter (175)", "Pre-filter (McL F0)", "Post-filter (McL F0)", "Pre-filter (Aus F0)", "Post-filter (Aus F0)", "GBE", "Mus CDS"), values=filter_col) +
scale_x_discrete(labels=c("Pre-filter (150)", "Post-filter (150)", "Pre-filter (175)", "Post-filter (175)", "Pre-filter (McL F0)", "Post-filter (McL F0)", "Pre-filter (Aus F0)", "Post-filter (Aus F0)", "GBE", "Mus CDS")) +
#scale_y_continuous(expand=c(0,0)) +
bartheme() +
theme(legend.position="none",
axis.text.x=element_text(angle=45, hjust=1))
print(nongap_len)
1.2.4 Number of sequences per alignment that are >20% gap (pre-filter)
$pre.gappy.seqs.perc = (aln_data_150$pre.gappy.seqs / aln_data_150$pre.num.seqs) * 100
aln_data_150= seq(0,100,by=1)
bins = seq(1,100,by=1)
bin_labels = hist(aln_data_150$pre.gappy.seqs.perc, breaks=bins, include.lowest=T, plot=F)$counts
gappy_counts_150 = data.frame("bin"=bin_labels, "count"=gappy_counts_150, filter="F150")
gappy_binned_150
$pre.gappy.seqs.perc = (aln_data_175$pre.gappy.seqs / aln_data_175$pre.num.seqs) * 100
aln_data_175= seq(0,100,by=1)
bins = seq(1,100,by=1)
bin_labels = hist(aln_data_175$pre.gappy.seqs.perc, breaks=bins, include.lowest=T, plot=F)$counts
gappy_counts_175 = data.frame("bin"=bin_labels, "count"=gappy_counts_175, filter="F175")
gappy_binned_175
= bind_rows(gappy_binned_150, gappy_binned_175)
binned_data = corecol(numcol=2)
col = c("F150", "F175")
lab = as.character(abs(seq(from=-700, to=700, by=100)))
ylabels
= ggplot(binned_data, aes(x = bin, y = ifelse(filter == "F175",-1, 1)*count, fill = filter)) +
dualp geom_col(width=1) +
geom_hline(yintercept=0) +
scale_x_continuous(name = "% of sequences that are\n>20% gap", limits = c(0, 100), expand = c(0, 0)) +
scale_y_continuous(name = "# of alignments", limits=c(-700,700), breaks = 100*(-7:7), labels=ylabels) +
scale_fill_manual(name="", limits=lab, values=col) +
bartheme()
print(dualp)
# aln_data$pre.gappy.seqs = (aln_data$pre.gappy.seqs / aln_data$pre.num.seqs) * 100
# aln_20_gaps_seqs = ggplot(aln_data, aes(x=pre.gappy.seqs)) +
# geom_histogram(fill=filter_col[1], color="#333333", bins=50) +
# scale_x_continuous(expand=c(0,0)) +
# scale_y_continuous(expand=c(0,0)) +
# xlab("% of sequences that are\n>20% gap") +
# ylab("# of alignments") +
# bartheme()
# print(aln_20_gaps_seqs)
1.2.5 Number of sites that are >20% gap (pre-filter)
= select(aln_data_150, align, pre.gappy.sites)
pre_sites_150 $label = "Pre-filter (F150)"
pre_sites_150names(pre_sites_150)[2] = "perc.sites"
= select(aln_data_150, align, post.gappy.sites)
post_sites_150 $label = "Post-filter (F150)"
post_sites_150names(post_sites_150)[2] = "perc.sites"
= select(aln_data_175, align, pre.gappy.sites)
pre_sites_175 $label = "Pre-filter (F175)"
pre_sites_175names(pre_sites_175)[2] = "perc.sites"
= select(aln_data_175, align, post.gappy.sites)
post_sites_175 $label = "Post-filter (F175)"
post_sites_175names(post_sites_175)[2] = "perc.sites"
= select(aln_data_mcl, align, pre.gappy.sites)
pre_sites_mcl $label = "Pre-filter (McL F0)"
pre_sites_mclnames(pre_sites_mcl)[2] = "perc.sites"
= select(aln_data_mcl, align, post.gappy.sites)
post_sites_mcl $label = "Post-filter (McL F0)"
post_sites_mclnames(post_sites_mcl)[2] = "perc.sites"
= select(aln_data_aus, align, pre.gappy.sites)
pre_sites_aus $label = "Pre-filter (Aus F0)"
pre_sites_ausnames(pre_sites_aus)[2] = "perc.sites"
= select(aln_data_aus, align, post.gappy.sites)
post_sites_aus $label = "Post-filter (Aus F0)"
post_sites_ausnames(post_sites_aus)[2] = "perc.sites"
= rbind(pre_sites_150, post_sites_150, pre_sites_175, post_sites_175, pre_sites_mcl, post_sites_mcl, pre_sites_aus, post_sites_aus)
num_sites = subset(num_sites, perc.sites < 1000)
num_sites $label = factor(num_sites$label, levels=c("Pre-filter (F150)", "Post-filter (F150)", "Pre-filter (F175)", "Post-filter (F175)", "Pre-filter (McL F0)", "Post-filter (McL F0)", "Pre-filter (Aus F0)", "Post-filter (Aus F0)"))
num_sites
= ggplot(num_sites, aes(x=label, y=perc.sites, fill=label)) +
aln_20_gap_sites geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
ylab("# of sites ") +
xlab("") +
scale_fill_manual(labels=c("Pre-filter (F150)", "Post-filter (F150)", "Pre-filter (F175)", "Post-filter (F175)", "Pre-filter (McL F0)", "Post-filter (McL F0)", "Pre-filter (Aus F0)", "Post-filter (Aus F0)"), values=filter_col) +
scale_x_discrete(labels=c("Pre-filter (F150)", "Post-filter (F150)", "Pre-filter (F175)", "Post-filter (F175)", "Pre-filter (McL F0)", "Post-filter (McL F0)", "Pre-filter (Aus F0)", "Post-filter (Aus F0)")) +
#scale_y_continuous(expand=c(0,0)) +
bartheme() +
theme(legend.position="none",
axis.text.x=element_text(angle=45, hjust=1))
print(aln_20_gap_sites)
# aln_20_gap_sites = ggplot(aln_data, aes(x=1, y=pre.percent.sites.20.gap)) +
# geom_quasirandom(size=2, width=0.25) +
# geom_boxplot(outlier.shape=NA, alpha=0.3, width=0.5, fill="transparent", color="#666666") +
# geom_quasirandom(data=aln_data, aes(x=2, y=post.percent.sites.20.gap), size=2, width=0.25) +
# geom_boxplot(data=aln_data, aes(x=2, y=post.percent.sites.20.gap), outlier.shape=NA, alpha=0.3, width=0.5, fill="transparent", color="#666666") +
# #scale_y_continuous(expand=c(0,0)) +
# bartheme()
# print(aln_20_gap_sites)
2 Coding alignment filters
A look at the effects of varying all three filters:
- Filtering the required number of samples present for exonerate. Targets with fewer than [0, 50, 100, 150, 175, 200] sample hits will be filtered. This is the presence filter denoted “P”.
- Filtering sequences that are more than [20, 40, 60, 80, 100]% gaps. This is the sequence filter denoted as “S”.
- Filtering codon sites that, among sliding windows of 3 codons, have at least 2 gaps in at least 2 codons in [25, 50, 75, 100]% of samples. This is the codon site filter denoted as “C”.
= read.csv("../../data/aln-filters/aln-stats-f0-seq20-site50.tab", header=T, comment.char="#", sep="\t")
p0s20c50 $label = "P0S20C50"
p0s20c50
= read.csv("../../data/aln-filters/aln-stats-f0-seq40-site50.tab", header=T, comment.char="#", sep="\t")
p0s40c50 $label = "P0S40C50"
p0s40c50
= read.csv("../../data/aln-filters/aln-stats-f0-seq60-site50.tab", header=T, comment.char="#", sep="\t")
p0s60c50 $label = "P0S60C50"
p0s60c50
= read.csv("../../data/aln-filters/aln-stats-f0-seq80-site50.tab", header=T, comment.char="#", sep="\t")
p0s80c50 $label = "P0S80C50"
p0s80c50
= read.csv("../../data/aln-filters/aln-stats-f0-seq100-site50.tab", header=T, comment.char="#", sep="\t")
p0s100c50 $label = "P0S100C50"
p0s100c50
= read.csv("../../data/aln-filters/aln-stats-f0-seq20-site25.tab", header=T, comment.char="#", sep="\t")
p0s20c25 $label = "P0S20C25"
p0s20c25
= read.csv("../../data/aln-filters/aln-stats-f0-seq20-site75.tab", header=T, comment.char="#", sep="\t")
p0s20c75 $label = "P0S20C75"
p0s20c75
= read.csv("../../data/aln-filters/aln-stats-f0-seq20-site100.tab", header=T, comment.char="#", sep="\t")
p0s20c100 $label = "P0S20C100"
p0s20c100
= read.csv("../../data/aln-filters/aln-stats-f50-seq20-site50.tab", header=T, comment.char="#", sep="\t")
p50s20c50 $label = "P50S20C50"
p50s20c50
= read.csv("../../data/aln-filters/aln-stats-f100-seq20-site50.tab", header=T, comment.char="#", sep="\t")
p100s20c50 $label = "P100S20C50"
p100s20c50
= read.csv("../../data/aln-filters/aln-stats-f150-seq20-site50.tab", header=T, comment.char="#", sep="\t")
p150s20c50 $label = "P150S20C50"
p150s20c50
= read.csv("../../data/aln-filters/aln-stats-f175-seq20-site50.tab", header=T, comment.char="#", sep="\t")
p175s20c50 $label = "P175S20C50"
p175s20c50
= read.csv("../../data/aln-filters/aln-stats-f175-seq20-site50.tab", header=T, comment.char="#", sep="\t")
p175s40c50 $label = "P175S40C50"
p175s40c50
= read.csv("../../data/aln-filters/aln-stats-f175-seq20-site50.tab", header=T, comment.char="#", sep="\t")
p175s60c50 $label = "P175S60C50"
p175s60c50
= read.csv("../../data/aln-filters/aln-stats-f175-seq20-site50.tab", header=T, comment.char="#", sep="\t")
p175s80c50 $label = "P175S80C50"
p175s80c50
= read.csv("../../data/aln-filters/aln-stats-f175-seq20-site50.tab", header=T, comment.char="#", sep="\t")
p175s100c50 $label = "P175S100C50"
p175s100c50
= read.csv("../../data/aln-filters/aln-stats-f175-seq20-site25.tab", header=T, comment.char="#", sep="\t")
p175s20c25 $label = "P175S20C25"
p175s20c25
= read.csv("../../data/aln-filters/aln-stats-f175-seq20-site75.tab", header=T, comment.char="#", sep="\t")
p175s20c75 $label = "P175S20C75"
p175s20c75
= read.csv("../../data/aln-filters/aln-stats-f175-seq20-site100.tab", header=T, comment.char="#", sep="\t")
p175s20c100 $label = "P175S20C100"
p175s20c100
= read.csv("../../data/aln-filters/aln-stats-f200-seq20-site50.tab", header=T, comment.char="#", sep="\t")
p200s20c50 $label = "P200S20C50" p200s20c50
2.1 Varying the Presence (P) filter
Filtering targets with exonerate matches in less than [0, 50, 100, 150, 175, 200] samples
2.1.1 Alignment length (codons)
#gbe_seqs = select(gbe_data, align, pre.codon.aln.length)
$label = "GBE"
gbe_datanames(gbe_data)[3] = "post.codon.aln.length"
= select(gbe_data, align, label, post.codon.aln.length)
gbe_lens = subset(gbe_lens, post.codon.aln.length < 10000)
gbe_lens
= rbind(p0s20c50, p50s20c50, p100s20c50, p150s20c50, p175s20c50, p200s20c50)
p_data = select(p_data, align, label, post.codon.aln.length)
p_data
= rbind(p_data, gbe_lens)
p_data
= select(p_data, label, post.codon.aln.length)
p_means = p_means %>% group_by(label) %>% summarize(post.codon.aln.length=mean(post.codon.aln.length, na.rm=T))
p_means
$label = factor(p_data$label, levels=c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P175S20C50", "P200S20C50", "GBE"), ordered=T)
p_data#levels(p_data$label) = c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P175S20C50", "P200S20C50", "GBE"), ordered=T)
$label = factor(p_means$label, levels=c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P175S20C50", "P200S20C50", "GBE"), ordered=T)
p_means#p_means$label = as.factor(p_means$label)
#levels(p_means$label) = c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P175S20C50", "P200S20C50")
= ggplot(p_data, aes(x=label, y=post.codon.aln.length, group=label)) +
p_lens geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
geom_point(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="red") +
geom_line(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="blue") +
ylab("Alignment length (codons)") +
xlab("") +
#scale_fill_manual(labels=c("Pre-filter", "Post-filter"), values=filter_col) +
#scale_x_discrete(labels=c("Pre-filter", "Post-filter")) +
#scale_y_continuous(expand=c(0,0)) +
bartheme() +
theme(legend.position="none", axis.text.x = element_text(angle=45, hjust=1))
print(p_lens)
%>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F) p_means
label | post.codon.aln.length |
---|---|
GBE | 372.6928 |
P0S20C50 | 613.5621 |
P100S20C50 | 557.2141 |
P150S20C50 | 476.9085 |
P175S20C50 | 423.1905 |
P200S20C50 | 284.0106 |
P50S20C50 | 593.0098 |
2.1.2 Ungapped alignment length (codons)
#gbe_seqs = select(gbe_data, align, pre.codon.aln.length)
$label = "GBE"
gbe_datanames(gbe_data)[4] = "post.avg.nongap.length"
= select(gbe_data, align, label, post.avg.nongap.length)
gbe_lens = subset(gbe_lens, post.avg.nongap.length < 10000)
gbe_lens
= select(mus_data, transcript_id, exon_length)
mus_lens = mus_lens %>% group_by(transcript_id) %>% summarize(post.avg.nongap.length=sum(exon_length))
mus_lens $post.avg.nongap.length = mus_lens$post.avg.nongap.length / 3
mus_lens$label = "Mus CDS"
mus_lensnames(mus_lens)[1] = "align"
= subset(mus_lens, post.avg.nongap.length < 5000)
mus_lens
= rbind(p0s20c50, p50s20c50, p100s20c50, p150s20c50, p175s20c50, p200s20c50)
p_data = select(p_data, align, label, post.avg.nongap.length)
p_data
= rbind(p_data, gbe_lens, mus_lens)
p_data
= select(p_data, label, post.avg.nongap.length)
p_means = p_means %>% group_by(label) %>% summarize(post.avg.nongap.length=mean(post.avg.nongap.length, na.rm=T))
p_means
$label = factor(p_data$label, levels=c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P175S20C50", "P200S20C50", "GBE", "Mus CDS"), ordered=T)
p_data#levels(p_data$label) = c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P175S20C50", "P200S20C50", "GBE"), ordered=T)
$label = factor(p_means$label, levels=c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P175S20C50", "P200S20C50", "GBE", "Mus CDS"), ordered=T)
p_means#p_means$label = as.factor(p_means$label)
#levels(p_means$label) = c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P175S20C50", "P200S20C50")
= ggplot(p_data, aes(x=label, y=post.avg.nongap.length, group=label)) +
p_lens geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
geom_point(data=p_means, aes(x=label, y=post.avg.nongap.length), size=2, color="red") +
geom_line(data=p_means, aes(x=label, y=post.avg.nongap.length), size=2, color="blue") +
ylab("Avg. ungapped sequence\nlength (codons)") +
xlab("") +
#scale_fill_manual(labels=c("Pre-filter", "Post-filter"), values=filter_col) +
#scale_x_discrete(labels=c("Pre-filter", "Post-filter")) +
#scale_y_continuous(expand=c(0,0)) +
bartheme() +
theme(legend.position="none", axis.text.x = element_text(angle=45, hjust=1))
print(p_lens)
%>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F) p_means
label | post.avg.nongap.length |
---|---|
GBE | 332.1052 |
Mus CDS | 586.0939 |
P0S20C50 | 550.2048 |
P100S20C50 | 510.1966 |
P150S20C50 | 448.7321 |
P175S20C50 | 403.9664 |
P200S20C50 | 276.3557 |
P50S20C50 | 535.6017 |
2.2 Varying the gappy sequence (S) filter
Filtering sequences with % gaps larger than [20, 40, 60, 80, 100] samples
2.2.1 Alignment length (codons)
#gbe_seqs = select(gbe_data, align, pre.codon.aln.length)
$label = "GBE"
gbe_datanames(gbe_data)[3] = "post.codon.aln.length"
= select(gbe_data, align, label, post.codon.aln.length)
gbe_lens = subset(gbe_lens, post.codon.aln.length < 10000)
gbe_lens
= rbind(p175s20c50, p175s40c50, p175s60c50, p175s80c50, p175s100c50)
p_data = select(p_data, align, label, post.codon.aln.length)
p_data
= rbind(p_data, gbe_lens)
p_data
= select(p_data, label, post.codon.aln.length)
p_means = p_means %>% group_by(label) %>% summarize(post.codon.aln.length=mean(post.codon.aln.length, na.rm=T))
p_means
$label = factor(p_data$label, levels=c("P175S20C50", "P175S40C50", "P175S60C50", "P175S80C50", "P175S100C50", "GBE"), ordered=T)
p_data#levels(p_data$label) = c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P0S20C50", "P200S20C50", "GBE"), ordered=T)
$label = factor(p_means$label, levels=c("P175S20C50", "P175S40C50", "P175S60C50", "P175S80C50", "P175S100C50", "GBE"), ordered=T)
p_means#p_means$label = as.factor(p_means$label)
#levels(p_means$label) = c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P0S20C50", "P200S20C50")
= ggplot(p_data, aes(x=label, y=post.codon.aln.length, group=label)) +
p_lens geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
geom_point(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="red") +
geom_line(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="blue") +
ylab("Alignment length (codons)") +
xlab("") +
#scale_fill_manual(labels=c("Pre-filter", "Post-filter"), values=filter_col) +
#scale_x_discrete(labels=c("Pre-filter", "Post-filter")) +
#scale_y_continuous(expand=c(0,0)) +
bartheme() +
theme(legend.position="none", axis.text.x = element_text(angle=45, hjust=1))
print(p_lens)
%>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F) p_means
label | post.codon.aln.length |
---|---|
GBE | 372.6928 |
P175S100C50 | 423.1905 |
P175S20C50 | 423.1905 |
P175S40C50 | 423.1905 |
P175S60C50 | 423.1905 |
P175S80C50 | 423.1905 |
2.2.2 Ungapped alignment length (codons)
#gbe_seqs = select(gbe_data, align, pre.codon.aln.length)
$label = "GBE"
gbe_datanames(gbe_data)[4] = "post.avg.nongap.length"
= select(gbe_data, align, label, post.avg.nongap.length)
gbe_lens = subset(gbe_lens, post.avg.nongap.length < 10000)
gbe_lens
= select(mus_data, transcript_id, exon_length)
mus_lens = mus_lens %>% group_by(transcript_id) %>% summarize(post.avg.nongap.length=sum(exon_length))
mus_lens $post.avg.nongap.length = mus_lens$post.avg.nongap.length / 3
mus_lens$label = "Mus CDS"
mus_lensnames(mus_lens)[1] = "align"
= subset(mus_lens, post.avg.nongap.length < 5000)
mus_lens
= rbind(p175s20c50, p175s40c50, p175s60c50, p175s80c50, p175s100c50)
p_data = select(p_data, align, label, post.avg.nongap.length)
p_data
= rbind(p_data, gbe_lens, mus_lens)
p_data
= select(p_data, label, post.avg.nongap.length)
p_means = p_means %>% group_by(label) %>% summarize(post.avg.nongap.length=mean(post.avg.nongap.length, na.rm=T))
p_means
$label = factor(p_data$label, levels=c("P175S20C50", "P175S40C50", "P175S60C50", "P175S80C50", "P175S100C50", "GBE", "Mus CDS"), ordered=T)
p_data#levels(p_data$label) = c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P175S20C50", "P200S20C50", "GBE"), ordered=T)
$label = factor(p_means$label, levels=c("P175S20C50", "P175S40C50", "P175S60C50", "P175S80C50", "P175S100C50", "GBE", "Mus CDS"), ordered=T)
p_means#p_means$label = as.factor(p_means$label)
#levels(p_means$label) = c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P175S20C50", "P200S20C50")
= ggplot(p_data, aes(x=label, y=post.avg.nongap.length, group=label)) +
p_lens geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
geom_point(data=p_means, aes(x=label, y=post.avg.nongap.length), size=2, color="red") +
geom_line(data=p_means, aes(x=label, y=post.avg.nongap.length), size=2, color="blue") +
ylab("Avg. ungapped sequence\nlength (codons)") +
xlab("") +
#scale_fill_manual(labels=c("Pre-filter", "Post-filter"), values=filter_col) +
#scale_x_discrete(labels=c("Pre-filter", "Post-filter")) +
#scale_y_continuous(expand=c(0,0)) +
bartheme() +
theme(legend.position="none", axis.text.x = element_text(angle=45, hjust=1))
print(p_lens)
%>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F) p_means
label | post.avg.nongap.length |
---|---|
GBE | 332.1052 |
Mus CDS | 586.0939 |
P175S100C50 | 403.9664 |
P175S20C50 | 403.9664 |
P175S40C50 | 403.9664 |
P175S60C50 | 403.9664 |
P175S80C50 | 403.9664 |
2.3 Varying the codon site (C) filter
Filtering three-codon windows where at least 2 codons have at least 2 gaps in greater than [25, 50, 75, 100]% of samples
2.3.1 Alignment length (codons)
#gbe_seqs = select(gbe_data, align, pre.codon.aln.length)
$label = "GBE"
gbe_datanames(gbe_data)[3] = "post.codon.aln.length"
= select(gbe_data, align, label, post.codon.aln.length)
gbe_lens = subset(gbe_lens, post.codon.aln.length < 10000)
gbe_lens
= rbind(p175s20c25, p175s20c50, p175s20c75, p175s20c100)
p_data = select(p_data, align, label, post.codon.aln.length)
p_data
= rbind(p_data, gbe_lens)
p_data
= select(p_data, label, post.codon.aln.length)
p_means = p_means %>% group_by(label) %>% summarize(post.codon.aln.length=mean(post.codon.aln.length, na.rm=T))
p_means
$label = factor(p_data$label, levels=c("P175S20C25", "P175S20C50", "P175S20C75", "P175S20C100", "GBE"), ordered=T)
p_data#levels(p_data$label) = c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P0S20C50", "P200S20C50", "GBE"), ordered=T)
$label = factor(p_means$label, levels=c("P175S20C25", "P175S20C50", "P175S20C75", "P175S20C100", "GBE"), ordered=T)
p_means#p_means$label = as.factor(p_means$label)
#levels(p_means$label) = c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P0S20C50", "P200S20C50")
= ggplot(p_data, aes(x=label, y=post.codon.aln.length, group=label)) +
p_lens geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
geom_point(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="red") +
geom_line(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="blue") +
ylab("Alignment length (codons)") +
xlab("") +
#scale_fill_manual(labels=c("Pre-filter", "Post-filter"), values=filter_col) +
#scale_x_discrete(labels=c("Pre-filter", "Post-filter")) +
#scale_y_continuous(expand=c(0,0)) +
bartheme() +
theme(legend.position="none", axis.text.x = element_text(angle=45, hjust=1))
print(p_lens)
%>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F) p_means
label | post.codon.aln.length |
---|---|
GBE | 372.6928 |
P175S20C100 | 434.4938 |
P175S20C25 | 414.0011 |
P175S20C50 | 423.1905 |
P175S20C75 | 427.6122 |
2.3.2 Ungapped alignment length (codons)
#gbe_seqs = select(gbe_data, align, pre.codon.aln.length)
$label = "GBE"
gbe_datanames(gbe_data)[4] = "post.avg.nongap.length"
= select(gbe_data, align, label, post.avg.nongap.length)
gbe_lens = subset(gbe_lens, post.avg.nongap.length < 10000)
gbe_lens
= select(mus_data, transcript_id, exon_length)
mus_lens = mus_lens %>% group_by(transcript_id) %>% summarize(post.avg.nongap.length=sum(exon_length))
mus_lens $post.avg.nongap.length = mus_lens$post.avg.nongap.length / 3
mus_lens$label = "Mus CDS"
mus_lensnames(mus_lens)[1] = "align"
= subset(mus_lens, post.avg.nongap.length < 5000)
mus_lens
= rbind(p175s20c25, p175s20c50, p175s20c75, p175s20c100)
p_data = select(p_data, align, label, post.avg.nongap.length)
p_data
= rbind(p_data, gbe_lens, mus_lens)
p_data
= select(p_data, label, post.avg.nongap.length)
p_means = p_means %>% group_by(label) %>% summarize(post.avg.nongap.length=mean(post.avg.nongap.length, na.rm=T))
p_means
$label = factor(p_data$label, levels=c("P175S20C25", "P175S20C50", "P175S20C75", "P175S20C100", "GBE", "Mus CDS"), ordered=T)
p_data#levels(p_data$label) = c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P175S20C50", "P200S20C50", "GBE"), ordered=T)
$label = factor(p_means$label, levels=c("P175S20C25", "P175S20C50", "P175S20C75", "P175S20C100", "GBE", "Mus CDS"), ordered=T)
p_means#p_means$label = as.factor(p_means$label)
#levels(p_means$label) = c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P175S20C50", "P200S20C50")
= ggplot(p_data, aes(x=label, y=post.avg.nongap.length, group=label)) +
p_lens geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
geom_point(data=p_means, aes(x=label, y=post.avg.nongap.length), size=2, color="red") +
geom_line(data=p_means, aes(x=label, y=post.avg.nongap.length), size=2, color="blue") +
ylab("Avg. ungapped sequence\nlength (codons)") +
xlab("") +
#scale_fill_manual(labels=c("Pre-filter", "Post-filter"), values=filter_col) +
#scale_x_discrete(labels=c("Pre-filter", "Post-filter")) +
#scale_y_continuous(expand=c(0,0)) +
bartheme() +
theme(legend.position="none", axis.text.x = element_text(angle=45, hjust=1))
print(p_lens)
%>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F) p_means
label | post.avg.nongap.length |
---|---|
GBE | 332.1052 |
Mus CDS | 586.0939 |
P175S20C100 | 406.3721 |
P175S20C25 | 399.1845 |
P175S20C50 | 403.9664 |
P175S20C75 | 404.9436 |
3 Noncoding+coding alignments
= read.csv("../../data/noncoding-aln-stats/full-all-alnlens-norefs.csv", header=F)
noref_full_alns names(noref_full_alns) = c("file", "sample", "aln.len")
= noref_full_alns %>% group_by(file) %>% summarize(aln.len=mean(aln.len, na.rm=T))
noref_full_alns $label = "No ref"
noref_full_alns
= read.csv("../../data/noncoding-aln-stats/full-all-alnlens-500.csv", header=F)
f500_full_alns names(f500_full_alns) = c("file", "sample", "aln.len")
= f500_full_alns %>% group_by(file) %>% summarize(aln.len=mean(aln.len, na.rm=T))
f500_full_alns $label = "500bp flank refs"
f500_full_alns
= read.csv("../../data/noncoding-aln-stats/full-all-alnlens-500.csv", header=F)
f200_full_alns names(f200_full_alns) = c("file", "sample", "aln.len")
= f200_full_alns %>% group_by(file) %>% summarize(aln.len=mean(aln.len, na.rm=T))
f200_full_alns $label = "200bp flank refs"
f200_full_alns
##
= read.csv("../../data/noncoding-aln-stats/reproductive-all-alnlens-500.csv", header=F)
f500_rep_alns names(f500_rep_alns) = c("file", "sample", "aln.len")
= f500_rep_alns %>% group_by(file) %>% summarize(aln.len=mean(aln.len, na.rm=T))
f500_rep_alns $label = "500bp flank refs"
f500_rep_alns
= read.csv("../../data/noncoding-aln-stats/reproductive-all-alnlens-500.csv", header=F)
f200_rep_alns names(f200_rep_alns) = c("file", "sample", "aln.len")
= f200_rep_alns %>% group_by(file) %>% summarize(aln.len=mean(aln.len, na.rm=T))
f200_rep_alns $label = "200bp flank refs"
f200_rep_alns
##
#noref_full_seqs = read.csv("../../data/noncoding-aln-stats/full-all-seqlens-norefs.csv", header=F)
#names(noref_full_seqs) = c("file", "sample", "aln.len")
#noref_full_seqs = noref_full_seqs %>% group_by(file) %>% summarize(aln.len=mean(aln.len, na.rm=T))
#noref_full_seqs$label = "No ref"
= read.csv("../../data/noncoding-aln-stats/full-all-seqlens-500.csv", header=F)
f500_full_seqs names(f500_full_seqs) = c("file", "sample", "seq.len")
= read.csv("../../data/noncoding-aln-stats/full-all-seqlens-200.csv", header=F)
f200_full_seqs names(f200_full_seqs) = c("file", "sample", "seq.len")
##
= read.csv("../../data/noncoding-aln-stats/reproductive-all-seqlens-500.csv", header=F)
f500_rep_seqs names(f500_rep_seqs) = c("file", "sample", "seq.len")
= read.csv("../../data/noncoding-aln-stats/reproductive-all-seqlens-200.csv", header=F)
f200_rep_seqs names(f200_rep_seqs) = c("file", "sample", "seq.len")
##
= read.csv("../../data/noncoding-aln-stats/mm10-exons-lens-200.csv", header=F)
f200_mm_seqs names(f200_mm_seqs) = c("sample", "avg.seq.len")
$label = "mm10 200bp flank"
f200_mm_seqs
= read.csv("../../data/noncoding-aln-stats/mm10-exons-lens-500.csv", header=F)
f500_mm_seqs names(f500_mm_seqs) = c("sample", "avg.seq.len")
$label = "mm10 500bp flank"
f500_mm_seqs
= read.csv("../../data/noncoding-aln-stats/mm10-exons-lens-noflank.csv", header=F)
mm_seqs names(mm_seqs) = c("sample", "avg.seq.len")
$label = "mm10 no flank" mm_seqs
3.0.1 Unaligned sequence length (Full dataset + mouse and rat reference seqs)
= subset(f500_full_seqs, sample==">mm10" | sample==">rnor6")
s1 = s1 %>% group_by(file) %>% summarize(avg.seq.len=mean(seq.len))
s1 $label = "Ref seqs 500bp flank"
s1= subset(f200_full_seqs, sample==">mm10" | sample==">rnor6")
s2 = s2 %>% group_by(file) %>% summarize(avg.seq.len=mean(seq.len))
s2 $label = "Ref seqs 200bp flank"
s2= subset(f500_full_seqs, sample!=">mm10" & sample!=">rnor6")
s3 = s3 %>% group_by(file) %>% summarize(avg.seq.len=mean(seq.len))
s3 $label = "Non-ref seqs"
s3= rbind(s1, s2, s3)
full_seq_lens = select(full_seq_lens, avg.seq.len, label)
full_seq_lens = rbind(full_seq_lens, select(f200_mm_seqs, avg.seq.len, label),
full_seq_lens select(f500_mm_seqs, avg.seq.len, label), select(mm_seqs, avg.seq.len, label))
#full_seq_lens = rbind(full_seq_lens, s3)
= ggplot(subset(full_seq_lens, avg.seq.len < 2000), aes(x=label, y=avg.seq.len, group=label)) +
seq_lens geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
#geom_point(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="red") +
#geom_line(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="blue") +
ylab("Sequence length (bp)") +
xlab("") +
bartheme() +
theme(legend.position="none", axis.text.x = element_text(angle=45, hjust=1))
print(seq_lens)
# full_aln_lens = rbind(noref_full_alns, f500_full_alns, f200_full_alns)
# full_lens = ggplot(subset(full_aln_lens, aln.len <= 4000), aes(x=label, y=aln.len, group=label)) +
# geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
# geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
# #geom_point(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="red") +
# #geom_line(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="blue") +
# ylab("Alignment length (bp)") +
# xlab("") +
# bartheme() +
# theme(legend.position="none", axis.text.x = element_text(angle=45, hjust=1))
# print(full_lens)
3.0.2 Alignment length (full dataset) while varying flank size of reference exons
= rbind(noref_full_alns, f500_full_alns, f200_full_alns)
full_aln_lens
= ggplot(subset(full_aln_lens, aln.len <= 4000), aes(x=label, y=aln.len, group=label)) +
full_lens geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
#geom_point(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="red") +
#geom_line(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="blue") +
ylab("Alignment length (bp)") +
xlab("") +
bartheme() +
theme(legend.position="none", axis.text.x = element_text(angle=45, hjust=1))
print(full_lens)
3.0.3 Alignment length (reproductive dataset) while varying flank size of reference exons
= rbind(f500_rep_alns, f200_rep_alns)
rep_aln_lens
= ggplot(subset(rep_aln_lens, aln.len <= 4000), aes(x=label, y=aln.len, group=label)) +
rep_lens geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
#geom_point(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="red") +
#geom_line(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="blue") +
ylab("Alignment length (bp)") +
xlab("") +
bartheme() +
theme(legend.position="none", axis.text.x = element_text(angle=45, hjust=1))
print(rep_lens)