Murine exome alignments

binData <- function(df, column, max_bin){
# Function to pre-bin SVs by length
  binned_df = data.frame("bin"=seq(from=0,to=max_bin,by=10), count=0)
  for(i in 1:nrow(df)) {
    row <- df[i,]
    bin = floor(row[[column]]/10)*10
    binned_df[binned_df$bin==bin,]$count = binned_df[binned_df$bin==bin,]$count + 1
  }
  return(binned_df) 
}

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

library(ggplot2)
library(cowplot)
library(ggbeeswarm)
library(ggpubr)
#library(plyr)
library(dplyr)
library(kableExtra)
source("../lib/design.r")

#htmltools::includeHTML("../html-chunks/rmd_nav.html")
Alignment and filtering steps:
1. BLAST mouse exons to all assembled contigs to identify exon-contig homology.
2. Exonerate mouse exons (as trimmed AA sequences) to matching contig for each sample.
3. Remove exons that have fewer than 175 samples with a matching BLAST hit.
4. Align amino acid exonerate hits with MAFFT.
5. Backtranslate to nucleotide sequences.
6. For each alignment, remove sequences that are >20% gaps.
7. For each alignment, remove 3-codon windows in which 2 or more codons have 2 or more gaps in >50% of sequences.
8. Remove any samples with pre-mature stop codons.

1 Coding alignments

1.1 Exon stats

1.1.1 Exons vs BLAST hits

blast_data = read.csv("../../data/aln-stats/sample-hits-exons.csv", header=T)

exons_v_bhits = ggplot(blast_data, aes(x=total.exons, y=blast.hits)) +
  geom_abline(aes(slope=1, intercept=0, color="1:1"), size=1, linetype="dashed", show.legend=F) +
  geom_point(size=2, alpha=0.1, color=corecol(pal="wilke", numcol=1, offset=6)) +
  geom_smooth(aes(color="Fit"), method="lm", linetype="dashed", se=F) +
  ylab("# of BLAST hits per sample") + 
  xlab("# of mouse exons") +
  scale_color_manual(values=c("1:1"="#999999", "Fit"=corecol(pal="wilke", numcol=1, offset=6))) +
  bartheme() +
  theme(legend.position="right")
print(exons_v_bhits)

1.1.2 BLAST hits vs exonerate hits (no hit threshold)

targets_exonerate = read.csv("../../data/aln-stats/targets-exonerate-f0-TEST.csv", header=T)

bhits_v_ehits = ggplot(targets_exonerate, aes(x=init.sample.hits, y=exonerate.sample.hits)) +
  geom_abline(aes(slope=1, intercept=0, color="1:1"), size=1, linetype="dashed", show.legend=F) +
  geom_point(size=2, alpha=0.1, color=corecol(pal="wilke", numcol=1, offset=4)) +
  geom_smooth(aes(color="Fit"), method="lm", linetype="dashed", se=F) +
  xlab("# of samples with BLAST hits") + 
  ylab("# of samples with\nexonerate hits") +
  scale_color_manual(values=c("1:1"="#999999", "Fit"=corecol(pal="wilke", numcol=1, offset=4))) +
  bartheme() +
  theme(legend.position="right")
print(bhits_v_ehits)

1.1.3 Mouse exons vs exonerate hits (no hit threshold)

sample_targets_exonerate = read.csv("../../data/aln-stats/sample-targets-exonerate-f0-TEST.csv", header=T)
## HEADERS ARE OFF


exons_v_hits = ggplot(sample_targets_exonerate, aes(x=pid.1, y=mm.exons)) +
  geom_abline(aes(slope=1, intercept=0, color="1:1"), size=1, linetype="dashed", show.legend=F) +
  geom_point(size=2, alpha=0.1, color=corecol(pal="wilke", numcol=1, offset=5)) +
  geom_smooth(aes(color="Fit"), method="lm", linetype="dashed", se=F) +
  xlab("# of exons in mouse protein") + 
  ylab("# of exonerate hits\nper sample per protein") +
  scale_color_manual(values=c("1:1"="#999999", "Fit"="#000000")) +
  bartheme() + 
  theme(legend.position="right")
print(exons_v_hits)

1.2 Alignment stats

1.2.1 Number of sequences per alignment

aln_data_150 = read.csv("../../data/aln-stats/aln-stats-f150-seq20-site50.tab", header=T, sep="\t", comment.char="#")
aln_data_175 = read.csv("../../data/aln-stats/aln-stats-f175-seq20-site50.tab", header=T, sep="\t", comment.char="#")
aln_data_mcl = read.csv("../../data/aln-stats/aln-stats-mclennan-f0-seq20-site50.tab", header=T, sep="\t", comment.char="#")
aln_data_aus = read.csv("../../data/aln-stats/aln-stats-aus-full-f0-seq20-site50.tab", header=T, sep="\t", comment.char="#")
gbe_data = read.csv("../../data/aln-stats/roycroft-aln-stats.tab", header=T, sep="\t", comment.char="#")

filter_col = corecol(numcol=10)

pre_seqs_150 = select(aln_data_150, align, pre.num.seqs)
pre_seqs_150$label = "Pre-filter (F150)"
names(pre_seqs_150)[2] = "num.seqs"

post_seqs_150 = select(aln_data_150, align, post.num.seqs)
post_seqs_150$label = "Post-filter (F150)"
names(post_seqs_150)[2] = "num.seqs"

pre_seqs_175 = select(aln_data_175, align, pre.num.seqs)
pre_seqs_175$label = "Pre-filter (F175)"
names(pre_seqs_175)[2] = "num.seqs"

post_seqs_175 = select(aln_data_175, align, post.num.seqs)
post_seqs_175$label = "Post-filter (F175)"
names(post_seqs_175)[2] = "num.seqs"

pre_seqs_mcl = select(aln_data_mcl, align, pre.num.seqs)
pre_seqs_mcl$label = "Pre-filter (McL F0)"
names(pre_seqs_mcl)[2] = "num.seqs"

post_seqs_mcl = select(aln_data_mcl, align, post.num.seqs)
post_seqs_mcl$label = "Post-filter (McL F0)"
names(post_seqs_mcl)[2] = "num.seqs"

pre_seqs_aus = select(aln_data_aus, align, pre.num.seqs)
pre_seqs_aus$label = "Pre-filter (Aus F0)"
names(pre_seqs_aus)[2] = "num.seqs"

post_seqs_aus = select(aln_data_aus, align, post.num.seqs)
post_seqs_aus$label = "Post-filter (Aus F0)"
names(post_seqs_aus)[2] = "num.seqs"

num_seqs = rbind(pre_seqs_150, post_seqs_150, pre_seqs_175, post_seqs_175, pre_seqs_mcl, post_seqs_mcl, pre_seqs_aus, post_seqs_aus)

num_seqs$label = factor(num_seqs$label, levels=c("Pre-filter (F150)", "Post-filter (F150)", "Pre-filter (F175)", "Post-filter (F175)", "Pre-filter (McL F0)", "Post-filter (McL F0)", "Pre-filter (Aus F0)", "Post-filter (Aus F0)"))

aln_seqs = ggplot(num_seqs, aes(x=label, y=num.seqs, fill=label)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
  geom_boxplot(outlier.shape=NA, alpha=0.5, width=0.5, color="#666666") +
  ylab("# of aligned sequences\nper protein") +
  xlab("") +
  scale_fill_manual(labels=c("Pre-filter (F150)", "Post-filter (F150)", "Pre-filter (F175)", "Post-filter (F175)", "Pre-filter (McL F0)", "Post-filter (McL F0)", "Pre-filter (Aus F0)", "Post-filter (Aus F0)"), values=filter_col) +
  scale_x_discrete(labels=c("Pre-filter (F150)", "Post-filter (F150)", "Pre-filter (F175)", "Post-filter (F175)", "Pre-filter (McL F0)", "Post-filter (McL F0)", "Pre-filter (Aus F0)", "Post-filter (Aus F0)")) +
  #scale_y_continuous(expand=c(0,0)) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x=element_text(angle=45, hjust=1))
print(aln_seqs)

1.2.2 Length of alignment

pre_seqs_150 = select(aln_data_150, align, pre.codon.aln.length)
pre_seqs_150$label = "Pre-filter (F150)"
names(pre_seqs_150)[2] = "aln.len"

post_seqs_150 = select(aln_data_150, align, post.codon.aln.length)
post_seqs_150$label = "Post-filter (F150)"
names(post_seqs_150)[2] = "aln.len"

pre_seqs_175 = select(aln_data_175, align, pre.codon.aln.length)
pre_seqs_175$label = "Pre-filter (F175)"
names(pre_seqs_175)[2] = "aln.len"

post_seqs_175 = select(aln_data_175, align, post.codon.aln.length)
post_seqs_175$label = "Post-filter (F175)"
names(post_seqs_175)[2] = "aln.len"

pre_seqs_mcl = select(aln_data_mcl, align, pre.codon.aln.length)
pre_seqs_mcl$label = "Pre-filter (McL F0)"
names(pre_seqs_mcl)[2] = "aln.len"

post_seqs_mcl = select(aln_data_mcl, align, post.codon.aln.length)
post_seqs_mcl$label = "Post-filter (McL F0)"
names(post_seqs_mcl)[2] = "aln.len"

pre_seqs_aus = select(aln_data_aus, align, pre.codon.aln.length)
pre_seqs_aus$label = "Pre-filter (Aus F0)"
names(pre_seqs_aus)[2] = "aln.len"

post_seqs_aus = select(aln_data_aus, align, post.codon.aln.length)
post_seqs_aus$label = "Post-filter (Aus F0)"
names(post_seqs_aus)[2] = "aln.len"

gbe_seqs = select(gbe_data, align, pre.codon.aln.length)
gbe_seqs$label = "GBE"
names(gbe_seqs)[2] = "aln.len"
gbe_seqs = subset(gbe_seqs, aln.len < 10000)


num_seqs = rbind(pre_seqs_150, post_seqs_150, pre_seqs_175, post_seqs_175, pre_seqs_mcl, post_seqs_mcl, pre_seqs_aus, post_seqs_aus, gbe_seqs)
num_seqs = subset(num_seqs, aln.len <= 1000)

num_seqs$label = factor(num_seqs$label, levels=c("Pre-filter (F150)", "Post-filter (F150)", "Pre-filter (F175)", "Post-filter (F175)", "Pre-filter (McL F0)", "Post-filter (McL F0)", "Pre-filter (Aus F0)", "Post-filter (Aus F0)", "GBE"))

aln_len = ggplot(num_seqs, aes(x=label, y=aln.len, fill=label)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
  geom_boxplot(outlier.shape=NA, alpha=0.5, width=0.5, color="#666666") +
  ylab("Alignment length (codons)") +
  xlab("") +
  scale_fill_manual(labels=c("Pre-filter (F150)", "Post-filter (F150)", "Pre-filter (F175)", "Post-filter (F175)", "Pre-filter (McL F0)", "Post-filter (McL F0)", "Pre-filter (Aus F0)", "Post-filter (Aus F0)", "GBE"), values=filter_col) +
  scale_x_discrete(labels=c("Pre-filter (F150)", "Post-filter (F150)", "Pre-filter (F175)", "Post-filter (F175)", "Pre-filter (McL F0)", "Post-filter (McL F0)", "Pre-filter (Aus F0)", "Post-filter (Aus F0)", "GBE")) +
  #scale_y_continuous(expand=c(0,0)) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x=element_text(angle=45, hjust=1))
print(aln_len)

# bin_width = 100
# 
# bins = seq(0,6000,by=bin_width)
# bin_labels = seq(1,6000,by=bin_width)
# pre_counts = hist(aln_data$pre.codon.aln.length, breaks=bins, include.lowest=T, plot=F)$counts
# pre_binned = data.frame("bin"=bin_labels, "count"=pre_counts, filter="Pre-filter")
# 
# filter_counts = hist(aln_data$post.codon.aln.length, breaks=bins, include.lowest=T, plot=F)$counts
# filter_binned = data.frame("bin"=bin_labels, "count"=filter_counts, filter="Post-filter")
# 
# binned_data = rbind.fill(pre_binned, filter_binned)
# col = corecol(numcol=2)
# lab = c("Pre-filter", "Post-filter")
# #ylabels = c("1200", as.character(seq(from=0, to=7000, by=1000)))
# ylabels = as.character(seq(from=-200, to=200, by=100))
# 
# dualp = ggplot(binned_data, aes(x = bin, y = ifelse(filter == "Post-filter",-1, 1)*count, fill = filter)) + 
#   geom_col(width=bin_width) +
#   geom_hline(yintercept=0) +
#   scale_x_continuous(name = "dS", limits = c(-bin_width, 7000), expand = c(0, 0)) +
#   scale_y_continuous(name = "# Transcripts", limits=c(-200,200), breaks = 100*(-2:2), labels=ylabels) +
#   scale_fill_manual(name="", limits=lab, values=col) +
#   bartheme()
# print(dualp)

1.2.3 Avg non-gapped sequence length per protein

mus_data = read.csv("../../data/Mus-selected-sequences_metadata_samp-counts_ratids.csv", header=TRUE)
mus_data = subset(mus_data, feature_type=="CDS")
mus_lens = select(mus_data, transcript_id, exon_length)
mus_lens = mus_lens %>% group_by(transcript_id) %>% summarize(nongap.len=sum(exon_length))
mus_lens$nongap.len = mus_lens$nongap.len / 3
mus_lens$label = "Mus CDS"
names(mus_lens)[1] = "align"
mus_lens = subset(mus_lens, nongap.len < 5000)

pre_seqs_150 = select(aln_data_150, align, pre.avg.nongap.length)
pre_seqs_150$label = "Pre-filter (F150)"
names(pre_seqs_150)[2] = "nongap.len"

post_seqs_150 = select(aln_data_150, align, post.avg.nongap.length)
post_seqs_150$label = "Post-filter (F150)"
names(post_seqs_150)[2] = "nongap.len"

pre_seqs_175 = select(aln_data_175, align, pre.avg.nongap.length)
pre_seqs_175$label = "Pre-filter (F175)"
names(pre_seqs_175)[2] = "nongap.len"

post_seqs_175 = select(aln_data_175, align, post.avg.nongap.length)
post_seqs_175$label = "Post-filter (F175)"
names(post_seqs_175)[2] = "nongap.len"

pre_seqs_mcl = select(aln_data_mcl, align, pre.avg.nongap.length)
pre_seqs_mcl$label = "Pre-filter (McL F0)"
names(pre_seqs_mcl)[2] = "nongap.len"

post_seqs_mcl = select(aln_data_mcl, align, post.avg.nongap.length)
post_seqs_mcl$label = "Post-filter (McL F0)"
names(post_seqs_mcl)[2] = "nongap.len"

pre_seqs_aus = select(aln_data_aus, align, pre.avg.nongap.length)
pre_seqs_aus$label = "Pre-filter (Aus F0)"
names(pre_seqs_aus)[2] = "nongap.len"

post_seqs_aus = select(aln_data_aus, align, post.avg.nongap.length)
post_seqs_aus$label = "Post-filter (Aus F0)"
names(post_seqs_aus)[2] = "nongap.len"

gbe_seqs = select(gbe_data, align, pre.avg.nongap.length)
gbe_seqs$label = "GBE"
names(gbe_seqs)[2] = "nongap.len"
gbe_seqs = subset(gbe_seqs, nongap.len < 10000)


num_seqs = rbind(pre_seqs_150, post_seqs_150, pre_seqs_175, post_seqs_175, pre_seqs_mcl, post_seqs_mcl, pre_seqs_aus, post_seqs_aus, gbe_seqs, mus_lens)
num_seqs = subset(num_seqs, nongap.len <= 1000)

num_seqs$label = factor(num_seqs$label, levels=c("Pre-filter (F150)", "Post-filter (F150)", "Pre-filter (F175)", "Post-filter (F175)", "Pre-filter (McL F0)", "Post-filter (McL F0)", "Pre-filter (Aus F0)", "Post-filter (Aus F0)", "GBE", "Mus CDS"))

nongap_len = ggplot(num_seqs, aes(x=label, y=nongap.len, fill=label)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
  geom_boxplot(outlier.shape=NA, alpha=0.5, width=0.5, color="#666666") +
  ylab("Avg. ungapped\nsequence length (codons)") +
  xlab("") +
  scale_fill_manual(labels=c("Pre-filter (150)", "Post-filter (150)", "Pre-filter (175)", "Post-filter (175)", "Pre-filter (McL F0)", "Post-filter (McL F0)", "Pre-filter (Aus F0)", "Post-filter (Aus F0)", "GBE", "Mus CDS"), values=filter_col) +
  scale_x_discrete(labels=c("Pre-filter (150)", "Post-filter (150)", "Pre-filter (175)", "Post-filter (175)", "Pre-filter (McL F0)", "Post-filter (McL F0)", "Pre-filter (Aus F0)", "Post-filter (Aus F0)", "GBE", "Mus CDS")) +
  #scale_y_continuous(expand=c(0,0)) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x=element_text(angle=45, hjust=1))
print(nongap_len)

1.2.4 Number of sequences per alignment that are >20% gap (pre-filter)

aln_data_150$pre.gappy.seqs.perc = (aln_data_150$pre.gappy.seqs / aln_data_150$pre.num.seqs) * 100
bins = seq(0,100,by=1)
bin_labels = seq(1,100,by=1)
gappy_counts_150 = hist(aln_data_150$pre.gappy.seqs.perc, breaks=bins, include.lowest=T, plot=F)$counts
gappy_binned_150 = data.frame("bin"=bin_labels, "count"=gappy_counts_150, filter="F150")

aln_data_175$pre.gappy.seqs.perc = (aln_data_175$pre.gappy.seqs / aln_data_175$pre.num.seqs) * 100
bins = seq(0,100,by=1)
bin_labels = seq(1,100,by=1)
gappy_counts_175 = hist(aln_data_175$pre.gappy.seqs.perc, breaks=bins, include.lowest=T, plot=F)$counts
gappy_binned_175 = data.frame("bin"=bin_labels, "count"=gappy_counts_175, filter="F175")

binned_data = bind_rows(gappy_binned_150, gappy_binned_175)
col = corecol(numcol=2)
lab = c("F150", "F175")
ylabels = as.character(abs(seq(from=-700, to=700, by=100)))

dualp = ggplot(binned_data, aes(x = bin, y = ifelse(filter == "F175",-1, 1)*count, fill = filter)) + 
  geom_col(width=1) +
  geom_hline(yintercept=0) +
  scale_x_continuous(name = "% of sequences that are\n>20% gap", limits = c(0, 100), expand = c(0, 0)) +
  scale_y_continuous(name = "# of alignments", limits=c(-700,700), breaks = 100*(-7:7), labels=ylabels) +
  scale_fill_manual(name="", limits=lab, values=col) +
  bartheme()
print(dualp)

# aln_data$pre.gappy.seqs = (aln_data$pre.gappy.seqs / aln_data$pre.num.seqs) * 100
# aln_20_gaps_seqs = ggplot(aln_data, aes(x=pre.gappy.seqs)) +
#   geom_histogram(fill=filter_col[1], color="#333333", bins=50) +
#   scale_x_continuous(expand=c(0,0)) +
#   scale_y_continuous(expand=c(0,0)) +
#   xlab("% of sequences that are\n>20% gap") +
#   ylab("# of alignments") +
#   bartheme()
# print(aln_20_gaps_seqs)

1.2.5 Number of sites that are >20% gap (pre-filter)

pre_sites_150 = select(aln_data_150, align, pre.gappy.sites)
pre_sites_150$label = "Pre-filter (F150)"
names(pre_sites_150)[2] = "perc.sites"

post_sites_150 = select(aln_data_150, align, post.gappy.sites)
post_sites_150$label = "Post-filter (F150)"
names(post_sites_150)[2] = "perc.sites"

pre_sites_175 = select(aln_data_175, align, pre.gappy.sites)
pre_sites_175$label = "Pre-filter (F175)"
names(pre_sites_175)[2] = "perc.sites"

post_sites_175 = select(aln_data_175, align, post.gappy.sites)
post_sites_175$label = "Post-filter (F175)"
names(post_sites_175)[2] = "perc.sites"

pre_sites_mcl = select(aln_data_mcl, align, pre.gappy.sites)
pre_sites_mcl$label = "Pre-filter (McL F0)"
names(pre_sites_mcl)[2] = "perc.sites"

post_sites_mcl = select(aln_data_mcl, align, post.gappy.sites)
post_sites_mcl$label = "Post-filter (McL F0)"
names(post_sites_mcl)[2] = "perc.sites"

pre_sites_aus = select(aln_data_aus, align, pre.gappy.sites)
pre_sites_aus$label = "Pre-filter (Aus F0)"
names(pre_sites_aus)[2] = "perc.sites"

post_sites_aus = select(aln_data_aus, align, post.gappy.sites)
post_sites_aus$label = "Post-filter (Aus F0)"
names(post_sites_aus)[2] = "perc.sites"

num_sites = rbind(pre_sites_150, post_sites_150, pre_sites_175, post_sites_175, pre_sites_mcl, post_sites_mcl, pre_sites_aus, post_sites_aus)
num_sites = subset(num_sites, perc.sites < 1000)
num_sites$label = factor(num_sites$label, levels=c("Pre-filter (F150)", "Post-filter (F150)", "Pre-filter (F175)", "Post-filter (F175)", "Pre-filter (McL F0)", "Post-filter (McL F0)", "Pre-filter (Aus F0)", "Post-filter (Aus F0)"))


aln_20_gap_sites = ggplot(num_sites, aes(x=label, y=perc.sites, fill=label)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
  geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
  ylab("# of sites ") +
  xlab("") +
  scale_fill_manual(labels=c("Pre-filter (F150)", "Post-filter (F150)", "Pre-filter (F175)", "Post-filter (F175)", "Pre-filter (McL F0)", "Post-filter (McL F0)", "Pre-filter (Aus F0)", "Post-filter (Aus F0)"), values=filter_col) +
  scale_x_discrete(labels=c("Pre-filter (F150)", "Post-filter (F150)", "Pre-filter (F175)", "Post-filter (F175)", "Pre-filter (McL F0)", "Post-filter (McL F0)", "Pre-filter (Aus F0)", "Post-filter (Aus F0)")) +
  #scale_y_continuous(expand=c(0,0)) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x=element_text(angle=45, hjust=1))
print(aln_20_gap_sites)

# aln_20_gap_sites = ggplot(aln_data, aes(x=1, y=pre.percent.sites.20.gap)) +
#   geom_quasirandom(size=2, width=0.25) +
#   geom_boxplot(outlier.shape=NA, alpha=0.3, width=0.5, fill="transparent", color="#666666") +
#   geom_quasirandom(data=aln_data, aes(x=2, y=post.percent.sites.20.gap), size=2, width=0.25) +
#   geom_boxplot(data=aln_data, aes(x=2, y=post.percent.sites.20.gap), outlier.shape=NA, alpha=0.3, width=0.5, fill="transparent", color="#666666") +
#   #scale_y_continuous(expand=c(0,0)) +
#   bartheme()
# print(aln_20_gap_sites)

2 Coding alignment filters

A look at the effects of varying all three filters:

  1. Filtering the required number of samples present for exonerate. Targets with fewer than [0, 50, 100, 150, 175, 200] sample hits will be filtered. This is the presence filter denoted “P”.
  2. Filtering sequences that are more than [20, 40, 60, 80, 100]% gaps. This is the sequence filter denoted as “S”.
  3. Filtering codon sites that, among sliding windows of 3 codons, have at least 2 gaps in at least 2 codons in [25, 50, 75, 100]% of samples. This is the codon site filter denoted as “C”.
p0s20c50 = read.csv("../../data/aln-filters/aln-stats-f0-seq20-site50.tab", header=T, comment.char="#", sep="\t")
p0s20c50$label = "P0S20C50"

p0s40c50 = read.csv("../../data/aln-filters/aln-stats-f0-seq40-site50.tab", header=T, comment.char="#", sep="\t")
p0s40c50$label = "P0S40C50"

p0s60c50 = read.csv("../../data/aln-filters/aln-stats-f0-seq60-site50.tab", header=T, comment.char="#", sep="\t")
p0s60c50$label = "P0S60C50"

p0s80c50 = read.csv("../../data/aln-filters/aln-stats-f0-seq80-site50.tab", header=T, comment.char="#", sep="\t")
p0s80c50$label = "P0S80C50"

p0s100c50 = read.csv("../../data/aln-filters/aln-stats-f0-seq100-site50.tab", header=T, comment.char="#", sep="\t")
p0s100c50$label = "P0S100C50"


p0s20c25 = read.csv("../../data/aln-filters/aln-stats-f0-seq20-site25.tab", header=T, comment.char="#", sep="\t")
p0s20c25$label = "P0S20C25"

p0s20c75 = read.csv("../../data/aln-filters/aln-stats-f0-seq20-site75.tab", header=T, comment.char="#", sep="\t")
p0s20c75$label = "P0S20C75"

p0s20c100 = read.csv("../../data/aln-filters/aln-stats-f0-seq20-site100.tab", header=T, comment.char="#", sep="\t")
p0s20c100$label = "P0S20C100"


p50s20c50 = read.csv("../../data/aln-filters/aln-stats-f50-seq20-site50.tab", header=T, comment.char="#", sep="\t")
p50s20c50$label = "P50S20C50"

p100s20c50 = read.csv("../../data/aln-filters/aln-stats-f100-seq20-site50.tab", header=T, comment.char="#", sep="\t")
p100s20c50$label = "P100S20C50"

p150s20c50 = read.csv("../../data/aln-filters/aln-stats-f150-seq20-site50.tab", header=T, comment.char="#", sep="\t")
p150s20c50$label = "P150S20C50"


p175s20c50 = read.csv("../../data/aln-filters/aln-stats-f175-seq20-site50.tab", header=T, comment.char="#", sep="\t")
p175s20c50$label = "P175S20C50"

p175s40c50 = read.csv("../../data/aln-filters/aln-stats-f175-seq20-site50.tab", header=T, comment.char="#", sep="\t")
p175s40c50$label = "P175S40C50"

p175s60c50 = read.csv("../../data/aln-filters/aln-stats-f175-seq20-site50.tab", header=T, comment.char="#", sep="\t")
p175s60c50$label = "P175S60C50"

p175s80c50 = read.csv("../../data/aln-filters/aln-stats-f175-seq20-site50.tab", header=T, comment.char="#", sep="\t")
p175s80c50$label = "P175S80C50"

p175s100c50 = read.csv("../../data/aln-filters/aln-stats-f175-seq20-site50.tab", header=T, comment.char="#", sep="\t")
p175s100c50$label = "P175S100C50"


p175s20c25 = read.csv("../../data/aln-filters/aln-stats-f175-seq20-site25.tab", header=T, comment.char="#", sep="\t")
p175s20c25$label = "P175S20C25"

p175s20c75 = read.csv("../../data/aln-filters/aln-stats-f175-seq20-site75.tab", header=T, comment.char="#", sep="\t")
p175s20c75$label = "P175S20C75"

p175s20c100 = read.csv("../../data/aln-filters/aln-stats-f175-seq20-site100.tab", header=T, comment.char="#", sep="\t")
p175s20c100$label = "P175S20C100"


p200s20c50 = read.csv("../../data/aln-filters/aln-stats-f200-seq20-site50.tab", header=T, comment.char="#", sep="\t")
p200s20c50$label = "P200S20C50"

2.1 Varying the Presence (P) filter

Filtering targets with exonerate matches in less than [0, 50, 100, 150, 175, 200] samples

2.1.1 Alignment length (codons)

#gbe_seqs = select(gbe_data, align, pre.codon.aln.length)
gbe_data$label = "GBE"
names(gbe_data)[3] = "post.codon.aln.length"
gbe_lens = select(gbe_data, align, label, post.codon.aln.length)
gbe_lens = subset(gbe_lens, post.codon.aln.length < 10000)


p_data = rbind(p0s20c50, p50s20c50, p100s20c50, p150s20c50, p175s20c50, p200s20c50)
p_data = select(p_data, align, label, post.codon.aln.length)

p_data = rbind(p_data, gbe_lens)

p_means = select(p_data, label, post.codon.aln.length)
p_means = p_means %>% group_by(label) %>% summarize(post.codon.aln.length=mean(post.codon.aln.length, na.rm=T))

p_data$label = factor(p_data$label, levels=c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P175S20C50", "P200S20C50", "GBE"), ordered=T)
#levels(p_data$label) = c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P175S20C50", "P200S20C50", "GBE"), ordered=T)


p_means$label = factor(p_means$label, levels=c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P175S20C50", "P200S20C50", "GBE"), ordered=T)
#p_means$label = as.factor(p_means$label)
#levels(p_means$label) = c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P175S20C50", "P200S20C50")

p_lens = ggplot(p_data, aes(x=label, y=post.codon.aln.length, group=label)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
  geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
  geom_point(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="red") +
  geom_line(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="blue") +
  ylab("Alignment length (codons)") +
  xlab("") +
  #scale_fill_manual(labels=c("Pre-filter", "Post-filter"), values=filter_col) +
  #scale_x_discrete(labels=c("Pre-filter", "Post-filter")) +
  #scale_y_continuous(expand=c(0,0)) +
  bartheme() +
  theme(legend.position="none", axis.text.x = element_text(angle=45, hjust=1))
print(p_lens)

p_means %>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F)
label post.codon.aln.length
GBE 372.6928
P0S20C50 613.5621
P100S20C50 557.2141
P150S20C50 476.9085
P175S20C50 423.1905
P200S20C50 284.0106
P50S20C50 593.0098

2.1.2 Ungapped alignment length (codons)

#gbe_seqs = select(gbe_data, align, pre.codon.aln.length)
gbe_data$label = "GBE"
names(gbe_data)[4] = "post.avg.nongap.length"
gbe_lens = select(gbe_data, align, label, post.avg.nongap.length)
gbe_lens = subset(gbe_lens, post.avg.nongap.length < 10000)

mus_lens = select(mus_data, transcript_id, exon_length)
mus_lens = mus_lens %>% group_by(transcript_id) %>% summarize(post.avg.nongap.length=sum(exon_length))
mus_lens$post.avg.nongap.length = mus_lens$post.avg.nongap.length / 3
mus_lens$label = "Mus CDS"
names(mus_lens)[1] = "align"
mus_lens = subset(mus_lens, post.avg.nongap.length < 5000)


p_data = rbind(p0s20c50, p50s20c50, p100s20c50, p150s20c50, p175s20c50, p200s20c50)
p_data = select(p_data, align, label, post.avg.nongap.length)

p_data = rbind(p_data, gbe_lens, mus_lens)

p_means = select(p_data, label, post.avg.nongap.length)
p_means = p_means %>% group_by(label) %>% summarize(post.avg.nongap.length=mean(post.avg.nongap.length, na.rm=T))

p_data$label = factor(p_data$label, levels=c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P175S20C50", "P200S20C50", "GBE", "Mus CDS"), ordered=T)
#levels(p_data$label) = c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P175S20C50", "P200S20C50", "GBE"), ordered=T)


p_means$label = factor(p_means$label, levels=c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P175S20C50", "P200S20C50", "GBE", "Mus CDS"), ordered=T)
#p_means$label = as.factor(p_means$label)
#levels(p_means$label) = c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P175S20C50", "P200S20C50")

p_lens = ggplot(p_data, aes(x=label, y=post.avg.nongap.length, group=label)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
  geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
  geom_point(data=p_means, aes(x=label, y=post.avg.nongap.length), size=2, color="red") +
  geom_line(data=p_means, aes(x=label, y=post.avg.nongap.length), size=2, color="blue") +
  ylab("Avg. ungapped sequence\nlength (codons)") +
  xlab("") +
  #scale_fill_manual(labels=c("Pre-filter", "Post-filter"), values=filter_col) +
  #scale_x_discrete(labels=c("Pre-filter", "Post-filter")) +
  #scale_y_continuous(expand=c(0,0)) +
  bartheme() +
  theme(legend.position="none", axis.text.x = element_text(angle=45, hjust=1))
print(p_lens)

p_means %>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F)
label post.avg.nongap.length
GBE 332.1052
Mus CDS 586.0939
P0S20C50 550.2048
P100S20C50 510.1966
P150S20C50 448.7321
P175S20C50 403.9664
P200S20C50 276.3557
P50S20C50 535.6017

2.2 Varying the gappy sequence (S) filter

Filtering sequences with % gaps larger than [20, 40, 60, 80, 100] samples

2.2.1 Alignment length (codons)

#gbe_seqs = select(gbe_data, align, pre.codon.aln.length)
gbe_data$label = "GBE"
names(gbe_data)[3] = "post.codon.aln.length"
gbe_lens = select(gbe_data, align, label, post.codon.aln.length)
gbe_lens = subset(gbe_lens, post.codon.aln.length < 10000)


p_data = rbind(p175s20c50, p175s40c50, p175s60c50, p175s80c50, p175s100c50)
p_data = select(p_data, align, label, post.codon.aln.length)

p_data = rbind(p_data, gbe_lens)

p_means = select(p_data, label, post.codon.aln.length)
p_means = p_means %>% group_by(label) %>% summarize(post.codon.aln.length=mean(post.codon.aln.length, na.rm=T))

p_data$label = factor(p_data$label, levels=c("P175S20C50", "P175S40C50", "P175S60C50", "P175S80C50", "P175S100C50", "GBE"), ordered=T)
#levels(p_data$label) = c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P0S20C50", "P200S20C50", "GBE"), ordered=T)


p_means$label = factor(p_means$label, levels=c("P175S20C50", "P175S40C50", "P175S60C50", "P175S80C50", "P175S100C50", "GBE"), ordered=T)
#p_means$label = as.factor(p_means$label)
#levels(p_means$label) = c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P0S20C50", "P200S20C50")

p_lens = ggplot(p_data, aes(x=label, y=post.codon.aln.length, group=label)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
  geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
  geom_point(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="red") +
  geom_line(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="blue") +
  ylab("Alignment length (codons)") +
  xlab("") +
  #scale_fill_manual(labels=c("Pre-filter", "Post-filter"), values=filter_col) +
  #scale_x_discrete(labels=c("Pre-filter", "Post-filter")) +
  #scale_y_continuous(expand=c(0,0)) +
  bartheme() +
  theme(legend.position="none", axis.text.x = element_text(angle=45, hjust=1))
print(p_lens)

p_means %>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F)
label post.codon.aln.length
GBE 372.6928
P175S100C50 423.1905
P175S20C50 423.1905
P175S40C50 423.1905
P175S60C50 423.1905
P175S80C50 423.1905

2.2.2 Ungapped alignment length (codons)

#gbe_seqs = select(gbe_data, align, pre.codon.aln.length)
gbe_data$label = "GBE"
names(gbe_data)[4] = "post.avg.nongap.length"
gbe_lens = select(gbe_data, align, label, post.avg.nongap.length)
gbe_lens = subset(gbe_lens, post.avg.nongap.length < 10000)

mus_lens = select(mus_data, transcript_id, exon_length)
mus_lens = mus_lens %>% group_by(transcript_id) %>% summarize(post.avg.nongap.length=sum(exon_length))
mus_lens$post.avg.nongap.length = mus_lens$post.avg.nongap.length / 3
mus_lens$label = "Mus CDS"
names(mus_lens)[1] = "align"
mus_lens = subset(mus_lens, post.avg.nongap.length < 5000)


p_data = rbind(p175s20c50, p175s40c50, p175s60c50, p175s80c50, p175s100c50)
p_data = select(p_data, align, label, post.avg.nongap.length)

p_data = rbind(p_data, gbe_lens, mus_lens)

p_means = select(p_data, label, post.avg.nongap.length)
p_means = p_means %>% group_by(label) %>% summarize(post.avg.nongap.length=mean(post.avg.nongap.length, na.rm=T))

p_data$label = factor(p_data$label, levels=c("P175S20C50", "P175S40C50", "P175S60C50", "P175S80C50", "P175S100C50", "GBE", "Mus CDS"), ordered=T)
#levels(p_data$label) = c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P175S20C50", "P200S20C50", "GBE"), ordered=T)


p_means$label = factor(p_means$label, levels=c("P175S20C50", "P175S40C50", "P175S60C50", "P175S80C50", "P175S100C50", "GBE", "Mus CDS"), ordered=T)
#p_means$label = as.factor(p_means$label)
#levels(p_means$label) = c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P175S20C50", "P200S20C50")

p_lens = ggplot(p_data, aes(x=label, y=post.avg.nongap.length, group=label)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
  geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
  geom_point(data=p_means, aes(x=label, y=post.avg.nongap.length), size=2, color="red") +
  geom_line(data=p_means, aes(x=label, y=post.avg.nongap.length), size=2, color="blue") +
  ylab("Avg. ungapped sequence\nlength (codons)") +
  xlab("") +
  #scale_fill_manual(labels=c("Pre-filter", "Post-filter"), values=filter_col) +
  #scale_x_discrete(labels=c("Pre-filter", "Post-filter")) +
  #scale_y_continuous(expand=c(0,0)) +
  bartheme() +
  theme(legend.position="none", axis.text.x = element_text(angle=45, hjust=1))
print(p_lens)

p_means %>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F)
label post.avg.nongap.length
GBE 332.1052
Mus CDS 586.0939
P175S100C50 403.9664
P175S20C50 403.9664
P175S40C50 403.9664
P175S60C50 403.9664
P175S80C50 403.9664

2.3 Varying the codon site (C) filter

Filtering three-codon windows where at least 2 codons have at least 2 gaps in greater than [25, 50, 75, 100]% of samples

2.3.1 Alignment length (codons)

#gbe_seqs = select(gbe_data, align, pre.codon.aln.length)
gbe_data$label = "GBE"
names(gbe_data)[3] = "post.codon.aln.length"
gbe_lens = select(gbe_data, align, label, post.codon.aln.length)
gbe_lens = subset(gbe_lens, post.codon.aln.length < 10000)


p_data = rbind(p175s20c25, p175s20c50, p175s20c75, p175s20c100)
p_data = select(p_data, align, label, post.codon.aln.length)

p_data = rbind(p_data, gbe_lens)

p_means = select(p_data, label, post.codon.aln.length)
p_means = p_means %>% group_by(label) %>% summarize(post.codon.aln.length=mean(post.codon.aln.length, na.rm=T))

p_data$label = factor(p_data$label, levels=c("P175S20C25", "P175S20C50", "P175S20C75", "P175S20C100", "GBE"), ordered=T)
#levels(p_data$label) = c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P0S20C50", "P200S20C50", "GBE"), ordered=T)


p_means$label = factor(p_means$label, levels=c("P175S20C25", "P175S20C50", "P175S20C75", "P175S20C100", "GBE"), ordered=T)
#p_means$label = as.factor(p_means$label)
#levels(p_means$label) = c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P0S20C50", "P200S20C50")

p_lens = ggplot(p_data, aes(x=label, y=post.codon.aln.length, group=label)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
  geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
  geom_point(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="red") +
  geom_line(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="blue") +
  ylab("Alignment length (codons)") +
  xlab("") +
  #scale_fill_manual(labels=c("Pre-filter", "Post-filter"), values=filter_col) +
  #scale_x_discrete(labels=c("Pre-filter", "Post-filter")) +
  #scale_y_continuous(expand=c(0,0)) +
  bartheme() +
  theme(legend.position="none", axis.text.x = element_text(angle=45, hjust=1))
print(p_lens)

p_means %>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F)
label post.codon.aln.length
GBE 372.6928
P175S20C100 434.4938
P175S20C25 414.0011
P175S20C50 423.1905
P175S20C75 427.6122

2.3.2 Ungapped alignment length (codons)

#gbe_seqs = select(gbe_data, align, pre.codon.aln.length)
gbe_data$label = "GBE"
names(gbe_data)[4] = "post.avg.nongap.length"
gbe_lens = select(gbe_data, align, label, post.avg.nongap.length)
gbe_lens = subset(gbe_lens, post.avg.nongap.length < 10000)

mus_lens = select(mus_data, transcript_id, exon_length)
mus_lens = mus_lens %>% group_by(transcript_id) %>% summarize(post.avg.nongap.length=sum(exon_length))
mus_lens$post.avg.nongap.length = mus_lens$post.avg.nongap.length / 3
mus_lens$label = "Mus CDS"
names(mus_lens)[1] = "align"
mus_lens = subset(mus_lens, post.avg.nongap.length < 5000)


p_data = rbind(p175s20c25, p175s20c50, p175s20c75, p175s20c100)
p_data = select(p_data, align, label, post.avg.nongap.length)

p_data = rbind(p_data, gbe_lens, mus_lens)

p_means = select(p_data, label, post.avg.nongap.length)
p_means = p_means %>% group_by(label) %>% summarize(post.avg.nongap.length=mean(post.avg.nongap.length, na.rm=T))

p_data$label = factor(p_data$label, levels=c("P175S20C25", "P175S20C50", "P175S20C75", "P175S20C100", "GBE", "Mus CDS"), ordered=T)
#levels(p_data$label) = c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P175S20C50", "P200S20C50", "GBE"), ordered=T)


p_means$label = factor(p_means$label, levels=c("P175S20C25", "P175S20C50", "P175S20C75", "P175S20C100", "GBE", "Mus CDS"), ordered=T)
#p_means$label = as.factor(p_means$label)
#levels(p_means$label) = c("P0S20C50", "P50S20C50", "P100S20C50", "P150S20C50", "P175S20C50", "P200S20C50")

p_lens = ggplot(p_data, aes(x=label, y=post.avg.nongap.length, group=label)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
  geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
  geom_point(data=p_means, aes(x=label, y=post.avg.nongap.length), size=2, color="red") +
  geom_line(data=p_means, aes(x=label, y=post.avg.nongap.length), size=2, color="blue") +
  ylab("Avg. ungapped sequence\nlength (codons)") +
  xlab("") +
  #scale_fill_manual(labels=c("Pre-filter", "Post-filter"), values=filter_col) +
  #scale_x_discrete(labels=c("Pre-filter", "Post-filter")) +
  #scale_y_continuous(expand=c(0,0)) +
  bartheme() +
  theme(legend.position="none", axis.text.x = element_text(angle=45, hjust=1))
print(p_lens)

p_means %>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F)
label post.avg.nongap.length
GBE 332.1052
Mus CDS 586.0939
P175S20C100 406.3721
P175S20C25 399.1845
P175S20C50 403.9664
P175S20C75 404.9436

3 Noncoding+coding alignments

noref_full_alns = read.csv("../../data/noncoding-aln-stats/full-all-alnlens-norefs.csv", header=F)
names(noref_full_alns) = c("file", "sample", "aln.len")
noref_full_alns = noref_full_alns %>% group_by(file) %>% summarize(aln.len=mean(aln.len, na.rm=T))
noref_full_alns$label = "No ref"

f500_full_alns = read.csv("../../data/noncoding-aln-stats/full-all-alnlens-500.csv", header=F)
names(f500_full_alns) = c("file", "sample", "aln.len")
f500_full_alns = f500_full_alns %>% group_by(file) %>% summarize(aln.len=mean(aln.len, na.rm=T))
f500_full_alns$label = "500bp flank refs"

f200_full_alns = read.csv("../../data/noncoding-aln-stats/full-all-alnlens-500.csv", header=F)
names(f200_full_alns) = c("file", "sample", "aln.len")
f200_full_alns = f200_full_alns %>% group_by(file) %>% summarize(aln.len=mean(aln.len, na.rm=T))
f200_full_alns$label = "200bp flank refs"

##

f500_rep_alns = read.csv("../../data/noncoding-aln-stats/reproductive-all-alnlens-500.csv", header=F)
names(f500_rep_alns) = c("file", "sample", "aln.len")
f500_rep_alns = f500_rep_alns %>% group_by(file) %>% summarize(aln.len=mean(aln.len, na.rm=T))
f500_rep_alns$label = "500bp flank refs"

f200_rep_alns = read.csv("../../data/noncoding-aln-stats/reproductive-all-alnlens-500.csv", header=F)
names(f200_rep_alns) = c("file", "sample", "aln.len")
f200_rep_alns = f200_rep_alns %>% group_by(file) %>% summarize(aln.len=mean(aln.len, na.rm=T))
f200_rep_alns$label = "200bp flank refs"

##

#noref_full_seqs = read.csv("../../data/noncoding-aln-stats/full-all-seqlens-norefs.csv", header=F)
#names(noref_full_seqs) = c("file", "sample", "aln.len")
#noref_full_seqs = noref_full_seqs %>% group_by(file) %>% summarize(aln.len=mean(aln.len, na.rm=T))
#noref_full_seqs$label = "No ref"

f500_full_seqs = read.csv("../../data/noncoding-aln-stats/full-all-seqlens-500.csv", header=F)
names(f500_full_seqs) = c("file", "sample", "seq.len")

f200_full_seqs = read.csv("../../data/noncoding-aln-stats/full-all-seqlens-200.csv", header=F)
names(f200_full_seqs) = c("file", "sample", "seq.len")

##

f500_rep_seqs = read.csv("../../data/noncoding-aln-stats/reproductive-all-seqlens-500.csv", header=F)
names(f500_rep_seqs) = c("file", "sample", "seq.len")


f200_rep_seqs = read.csv("../../data/noncoding-aln-stats/reproductive-all-seqlens-200.csv", header=F)
names(f200_rep_seqs) = c("file", "sample", "seq.len")

##

f200_mm_seqs = read.csv("../../data/noncoding-aln-stats/mm10-exons-lens-200.csv", header=F)
names(f200_mm_seqs) = c("sample", "avg.seq.len")
f200_mm_seqs$label = "mm10 200bp flank"

f500_mm_seqs = read.csv("../../data/noncoding-aln-stats/mm10-exons-lens-500.csv", header=F)
names(f500_mm_seqs) = c("sample", "avg.seq.len")
f500_mm_seqs$label = "mm10 500bp flank"

mm_seqs = read.csv("../../data/noncoding-aln-stats/mm10-exons-lens-noflank.csv", header=F)
names(mm_seqs) = c("sample", "avg.seq.len")
mm_seqs$label = "mm10 no flank"

3.0.1 Unaligned sequence length (Full dataset + mouse and rat reference seqs)

s1 = subset(f500_full_seqs, sample==">mm10" | sample==">rnor6")
s1 = s1 %>% group_by(file) %>% summarize(avg.seq.len=mean(seq.len))
s1$label = "Ref seqs 500bp flank"
s2 = subset(f200_full_seqs, sample==">mm10" | sample==">rnor6")
s2 = s2 %>% group_by(file) %>% summarize(avg.seq.len=mean(seq.len))
s2$label = "Ref seqs 200bp flank"
s3 = subset(f500_full_seqs, sample!=">mm10" & sample!=">rnor6")
s3 = s3 %>% group_by(file) %>% summarize(avg.seq.len=mean(seq.len))
s3$label = "Non-ref seqs"
full_seq_lens = rbind(s1, s2, s3)
full_seq_lens = select(full_seq_lens, avg.seq.len, label)
full_seq_lens = rbind(full_seq_lens, select(f200_mm_seqs, avg.seq.len, label), 
                      select(f500_mm_seqs, avg.seq.len, label), select(mm_seqs, avg.seq.len, label))
#full_seq_lens = rbind(full_seq_lens, s3)

seq_lens = ggplot(subset(full_seq_lens, avg.seq.len < 2000), aes(x=label, y=avg.seq.len, group=label)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
  geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
  #geom_point(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="red") +
  #geom_line(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="blue") +
  ylab("Sequence length (bp)") +
  xlab("") +
  bartheme() +
  theme(legend.position="none", axis.text.x = element_text(angle=45, hjust=1))
print(seq_lens)

# full_aln_lens = rbind(noref_full_alns, f500_full_alns, f200_full_alns)

# full_lens = ggplot(subset(full_aln_lens, aln.len <= 4000), aes(x=label, y=aln.len, group=label)) +
#   geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
#   geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
#   #geom_point(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="red") +
#   #geom_line(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="blue") +
#   ylab("Alignment length (bp)") +
#   xlab("") +
#   bartheme() +
#   theme(legend.position="none", axis.text.x = element_text(angle=45, hjust=1))
# print(full_lens)

3.0.2 Alignment length (full dataset) while varying flank size of reference exons

full_aln_lens = rbind(noref_full_alns, f500_full_alns, f200_full_alns)

full_lens = ggplot(subset(full_aln_lens, aln.len <= 4000), aes(x=label, y=aln.len, group=label)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
  geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
  #geom_point(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="red") +
  #geom_line(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="blue") +
  ylab("Alignment length (bp)") +
  xlab("") +
  bartheme() +
  theme(legend.position="none", axis.text.x = element_text(angle=45, hjust=1))
print(full_lens)

3.0.3 Alignment length (reproductive dataset) while varying flank size of reference exons

rep_aln_lens = rbind(f500_rep_alns, f200_rep_alns)

rep_lens = ggplot(subset(rep_aln_lens, aln.len <= 4000), aes(x=label, y=aln.len, group=label)) +
  geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
  geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
  #geom_point(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="red") +
  #geom_line(data=p_means, aes(x=label, y=post.codon.aln.length), size=2, color="blue") +
  ylab("Alignment length (bp)") +
  xlab("") +
  bartheme() +
  theme(legend.position="none", axis.text.x = element_text(angle=45, hjust=1))
print(rep_lens)