Exome transcript selection
<- function(df, column, max_bin){
binData # Function to pre-bin SVs by length
= data.frame("bin"=seq(from=0,to=max_bin,by=0.01), count=0)
binned_df for(i in 1:nrow(df)) {
<- df[i,]
row = floor(row[[column]]*100)/100
bin $bin==bin,]$count = binned_df[binned_df$bin==bin,]$count + 1
binned_df[binned_df
}return(binned_df)
}
::opts_chunk$set(echo = TRUE)
knitr
library(ggplot2)
#library(plyr)
library(tidyr)
library(dplyr)
library(eulerr)
source("../lib/design.r")
#htmltools::includeHTML("../html-chunks/nav.html")
1 Exome transcript selection
1.1 Filtering criteria
Two possible strategies arise for selecting transcripts if multiple isoforms exist after other filtering steps:
- Select the longest remaining transcript.
- Select the transcript with the most probe target hits.
1.1.1 Comparisons of selection criteria
= scan("../../data/transcript-selection/selected-transcripts-list.txt", what=character())
len_transcripts = scan("../../data/transcript-selection/selected-transcripts-target-check-list.txt", what=character())
target_transcripts = list("Length"=len_transcripts, "Targets"=target_transcripts)
transcript_lists
plot(euler(transcript_lists, shape="ellipse"), quantities=T, fill=corecol(numcol=2, pal="wilke"),
main="Last step of transcript selection based on max...")
Because we do not lose too many transcripts when selecting by max number of prode target hits, we proceed with that as the final filter criteria.
These are the transcript filtering steps:
1. Transcript must be present in both mouse and rat.
2. Transcript must have one-to-one relationship between mouse and rat.
3. Orthology confidence must be high (1).
4. Transcript must have dS below 0.5 (see distibutions below).
5. If multiple transcripts from a gene pass all above filters, the one with the max number of probe targets is selected.
1.2 Filter script log
# Rodent exomes -- select mouse trancsripts
# PYTHON VERSION: 3.8.8
# Script call: select_transcripts.py
# Runtime: 03/17/2021 09:58:48
# ----------------
# Mouse GTF file: ../Reference-genomes/mm10/Mus_musculus.GRCm38.99.gtf.gz
# Rat GTF file: ../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.99.gtf.gz
# Ensembl ortholog file: ../02-Annotation-data/mouse-rat-orths-ens99.txt
# Target overlaps file: ../Targets/bed/mm10-targets-to-exons-0.9.bed
# Output file: ../02-Annotation-data/selected-transcripts-targets.txt
# --------------
# dS threshold: 0.5
# --------------
# 03.17.2021 | 09:58:48 Reading target overlaps...
# 03.17.2021 | 09:58:50 Reading mouse feature lengths...
# Mouse transcripts read: 142647
# Rat transcripts read: 41078
# --------------
# 03.17.2021 | 09:59:02 Reading orthologs...
# Genes read: 56289
# Transcipts read: 170971
# --------------
# 03.17.2021 | 09:59:02 Filtering transcripts...
# Transcripts that do not overlap any targets: 36075
# Transcripts with no rat ortholog: 6960
# Transcripts with no one2one rat ortholog: 15496
# Transcripts with no high confidence rat ortholog: 9487
# Transcripts with no rat ortholog below dS threshold: 545
# Transcripts filtered for not having max target hits within gene: 55369
# --------------
# Genes with no passed transcripts: 7084
# Genes with selected transcript: 13130
# --------------
1.3 Mouse-rat dS for orthologous transcripts
All data from Ensembl release 99.
#####################
# Mouse-rat dS for transcripts
#####################\
# pre_binned = binData(pre_data, "dS.with.Rat", 1)
# pre_binned$filter = "Pre-filter"
#
# filter_binned = binData(filter_data, "dS.with.Rat", 1)
# filter_binned$filter = "Post-filter"
= seq(0,1,by=0.01)
bins = seq(0.01,1,by=0.01)
bin_labels = hist(pre_data$dS.with.Rat, breaks=bins, include.lowest=T, plot=F)$counts
pre_counts = data.frame("bin"=bin_labels, "count"=pre_counts, filter="Pre-filter")
pre_binned
= hist(filter_data$dS.with.Rat, breaks=bins, include.lowest=T, plot=F)$counts
filter_counts = data.frame("bin"=bin_labels, "count"=filter_counts, filter="Post-filter")
filter_binned
= bind_rows(pre_binned, filter_binned)
binned_data = corecol(numcol=2)
col = c("Pre-filter", "Post-filter")
lab #ylabels = c("1200", as.character(seq(from=0, to=7000, by=1000)))
= as.character(seq(from=-1000, to=7000, by=1000))
ylabels
= ggplot(binned_data, aes(x = bin, y = ifelse(filter == "Post-filter",-1, 1)*count, fill = filter)) +
dualp geom_col(width=0.01) +
geom_hline(yintercept=0) +
scale_x_continuous(name = "dS", limits = c(-0.02, 1), expand = c(0, 0)) +
scale_y_continuous(name = "# Transcripts", limits=c(-1000,7000), breaks = 1000*(-1:7), labels=ylabels) +
scale_fill_manual(name="", limits=lab, values=col) +
bartheme()
print(dualp)
1.4 Mouse-rat dN for orthologous transcripts
All data from Ensembl release 99.
#####################
# Mouse-rat dN for transcripts
#####################\
# pre_binned = binData(pre_data, "dN.with.Rat", 1)
# pre_binned$filter = "Pre-filter"
#
# filter_binned = binData(filter_data, "dN.with.Rat", 1)
# filter_binned$filter = "Post-filter"
= seq(0,1,by=0.01)
bins = seq(0.01,1,by=0.01)
bin_labels = hist(pre_data$dN.with.Rat, breaks=bins, include.lowest=T, plot=F)$counts
pre_counts = data.frame("bin"=bin_labels, "count"=pre_counts, filter="Pre-filter")
pre_binned
= hist(filter_data$dN.with.Rat, breaks=bins, include.lowest=T, plot=F)$counts
filter_counts = data.frame("bin"=bin_labels, "count"=filter_counts, filter="Post-filter")
filter_binned
= bind_rows(pre_binned, filter_binned)
binned_data = corecol(numcol=2)
col = c("Pre-filter", "Post-filter")
lab = c("5000", as.character(seq(from=0, to=30000, by=5000)))
ylabels
= ggplot(binned_data, aes(x = bin, y = ifelse(filter == "Post-filter",-1, 1)*count, fill = filter)) +
dualp geom_col(width=0.01) +
geom_hline(yintercept=0) +
scale_x_continuous(name = "dN", limits = c(-0.02, 1), expand = c(0, 0)) +
scale_y_continuous(name = "# Transcripts", limits=c(-5000,30000), breaks = 5000*(-1:6), labels=ylabels) +
scale_fill_manual(name="", limits=lab, values=col) +
bartheme()
print(dualp)
1.5 Mouse-rat dN/dS for orthologous transcripts
All data from Ensembl release 99.
#####################
# Mouse-rat dN/dS for transcripts
#####################
# pre_data = subset(pre_data, !is.na(dnds))
# pre_binned = binData(pre_data, "dnds", 2)
# pre_binned$filter = "Pre-filter"
#
# filter_data = subset(filter_data, !is.na(dnds))
# filter_binned = binData(filter_data, "dnds", 2)
# filter_binned$filter = "Post-filter"
= subset(pre_data, dnds <= 2)
pre_data_f
= seq(0,2,by=0.01)
bins = seq(0.01,2,by=0.01)
bin_labels = hist(pre_data_f$dnds, breaks=bins, include.lowest=T, plot=F)$counts
pre_counts = data.frame("bin"=bin_labels, "count"=pre_counts, filter="Pre-filter")
pre_binned
= subset(filter_data, dnds <= 2)
filter_data_f
= hist(filter_data_f$dnds, breaks=bins, include.lowest=T, plot=F)$counts
filter_counts = data.frame("bin"=bin_labels, "count"=filter_counts, filter="Post-filter")
filter_binned
= bind_rows(pre_binned, filter_binned)
binned_data = corecol(numcol=2)
col = c("Pre-filter", "Post-filter")
lab = c(as.character(seq(from=-1000, to=6000, by=1000)))
ylabels
= ggplot(binned_data, aes(x = bin, y = ifelse(filter == "Post-filter",-1, 1)*count, fill = filter)) +
dualp geom_col(width=0.01) +
geom_hline(yintercept=0) +
scale_x_continuous(name = "dN/dS", limits = c(-0.02, 2), expand = c(0, 0)) +
scale_y_continuous(name = "# Transcripts", limits=c(-1000,6000), breaks = 1000*(-1:6), labels=ylabels) +
scale_fill_manual(name="", limits=lab, values=col) +
bartheme()
print(dualp)
1.6 Total transcript length vs total probe target length
Sum length of all probe targets in a given transcript
= read.csv("../../data/Mus-selected-sequences_metadata_samp-counts_ratids.csv", header=T)
ann_data = unique(select(ann_data, transcript_id, transcript_length))
ts_lens names(ts_lens) = c("tid", "ts.len")
= read.csv("../../data/target-bed/mm10-targets-to-exons-0.9.bed", sep="\t", header=F)
target_data names(target_data) = c("target.chrome", "target.start", "target.end", "target.id", "exon.chrome", "exon.start", "exon.end", "exon.ids", "exon.score", "exon.strand", "overlap")
$target.len = target_data$target.end - target_data$target.start
target_data$exon.len = target_data$exon.end - target_data$exon.start
target_data
= target_data %>% separate(exon.ids, c("gid", "tid", "eid"), ";")
target_data = subset(target_data, !is.na(eid))
target_data = subset(target_data, tid %in% ts_lens$tid)
target_data $tid = as.factor(target_data$tid)
target_data#target_data_final = target_data %>% select(tid, target.len) %>% group_by(tid) #%>% summarize(target.len=sum(target.len))
= target_data %>% group_by(tid) %>% summarize(target.len=sum(target.len))
target_data_final #mus_lens = mus_lens %>% group_by(transcript_id) %>% summarize(nongap.len=sum(target_length))
= merge(ts_lens, target_data_final, by="tid")
ts_target_lens = subset(ts_target_lens, ts.len < 10000 & target.len < 10000)
ts_target_lens = ggplot(ts_target_lens, aes(x=ts.len, y=target.len)) +
ts_targ_len_p geom_point(alpha=0.1, color=corecol(pal="wilke", numcol=1, offset=2)) +
geom_smooth(method="lm", se=F, linetype="dashed", color="#333333") +
xlab("Total transcript length") +
ylab("Sum of length of all probe targets\nin transcript") +
bartheme()
print(ts_targ_len_p)
1.7 Number of samples matching each coding exon
= subset(ann_data, protein_coding=="TRUE")
ann_data
= ggplot(ann_data, aes(x=matching_sample_count)) +
exon_count_dist geom_histogram(identity="count", fill=corecol(numcol=1, offset=1)) +
xlab("# of samples with BLAST hit") +
ylab("# of targets") +
scale_y_continuous(expand=c(0,0)) +
bartheme()
print(exon_count_dist)
1.8 Assembly read depth vs number of BLAST hits to probe targets
= read.csv("../../data/exome-stats.csv", header=T)
asm_data = subset(asm_data, referee.avg.depth < 60)
asm_data
= ts_targ_len_p = ggplot(asm_data, aes(x=referee.avg.depth, y=blast.hits.to.targets)) +
targs_depth_p geom_point(alpha=0.5, color=corecol(pal="wilke", numcol=1, offset=4)) +
geom_smooth(method="lm", se=F, linetype="dashed", color="#333333") +
xlab("Avg. read depth") +
ylab("# of BLAST hits") +
bartheme()
print(targs_depth_p)