Exome transcript selection

binData <- function(df, column, max_bin){
# Function to pre-bin SVs by length
  binned_df = data.frame("bin"=seq(from=0,to=max_bin,by=0.01), count=0)
  for(i in 1:nrow(df)) {
    row <- df[i,]
    bin = floor(row[[column]]*100)/100
    binned_df[binned_df$bin==bin,]$count = binned_df[binned_df$bin==bin,]$count + 1
  }
  return(binned_df) 
}

knitr::opts_chunk$set(echo = TRUE)

library(ggplot2)
#library(plyr)
library(tidyr)
library(dplyr)
library(eulerr)
source("../lib/design.r")

#htmltools::includeHTML("../html-chunks/nav.html")

1 Exome transcript selection

1.1 Filtering criteria

Two possible strategies arise for selecting transcripts if multiple isoforms exist after other filtering steps:

  1. Select the longest remaining transcript.
  2. Select the transcript with the most probe target hits.

1.1.1 Comparisons of selection criteria

len_transcripts = scan("../../data/transcript-selection/selected-transcripts-list.txt", what=character())
target_transcripts = scan("../../data/transcript-selection/selected-transcripts-target-check-list.txt", what=character())
transcript_lists = list("Length"=len_transcripts, "Targets"=target_transcripts)

plot(euler(transcript_lists, shape="ellipse"), quantities=T, fill=corecol(numcol=2, pal="wilke"), 
     main="Last step of transcript selection based on max...")

Because we do not lose too many transcripts when selecting by max number of prode target hits, we proceed with that as the final filter criteria.

These are the transcript filtering steps:

1. Transcript must be present in both mouse and rat.
2. Transcript must have one-to-one relationship between mouse and rat.
3. Orthology confidence must be high (1).
4. Transcript must have dS below 0.5 (see distibutions below).
5. If multiple transcripts from a gene pass all above filters, the one with the max number of probe targets is selected.

1.2 Filter script log

# Rodent exomes -- select mouse trancsripts
# PYTHON VERSION: 3.8.8
# Script call:    select_transcripts.py
# Runtime:        03/17/2021 09:58:48
# ----------------
# Mouse GTF file:        ../Reference-genomes/mm10/Mus_musculus.GRCm38.99.gtf.gz
# Rat GTF file:          ../Reference-genomes/Rnor6/Rattus_norvegicus.Rnor_6.0.99.gtf.gz
# Ensembl ortholog file: ../02-Annotation-data/mouse-rat-orths-ens99.txt
# Target overlaps file:  ../Targets/bed/mm10-targets-to-exons-0.9.bed
# Output file:           ../02-Annotation-data/selected-transcripts-targets.txt
# --------------
# dS threshold:          0.5
# --------------
# 03.17.2021 | 09:58:48 Reading target overlaps...
# 03.17.2021 | 09:58:50 Reading mouse feature lengths...
# Mouse transcripts read: 142647
# Rat transcripts read:   41078
# --------------
# 03.17.2021 | 09:59:02 Reading orthologs...
# Genes read:      56289
# Transcipts read: 170971
# --------------
# 03.17.2021 | 09:59:02 Filtering transcripts...
# Transcripts that do not overlap any targets:                                 36075
# Transcripts with no rat ortholog:                                            6960
# Transcripts with no one2one rat ortholog:                                    15496
# Transcripts with no high confidence rat ortholog:                            9487
# Transcripts with no rat ortholog below dS threshold:                         545
# Transcripts filtered for not having max target hits within gene:             55369
# --------------
# Genes with no passed transcripts:                                            7084
# Genes with selected transcript:                                              13130
# --------------

1.3 Mouse-rat dS for orthologous transcripts

All data from Ensembl release 99.

#####################
# Mouse-rat dS for transcripts
#####################\
# pre_binned = binData(pre_data, "dS.with.Rat", 1)
# pre_binned$filter = "Pre-filter"
# 
# filter_binned = binData(filter_data, "dS.with.Rat", 1)
# filter_binned$filter = "Post-filter"

bins = seq(0,1,by=0.01)
bin_labels = seq(0.01,1,by=0.01)
pre_counts = hist(pre_data$dS.with.Rat, breaks=bins, include.lowest=T, plot=F)$counts
pre_binned = data.frame("bin"=bin_labels, "count"=pre_counts, filter="Pre-filter")

filter_counts = hist(filter_data$dS.with.Rat, breaks=bins, include.lowest=T, plot=F)$counts
filter_binned = data.frame("bin"=bin_labels, "count"=filter_counts, filter="Post-filter")

binned_data = bind_rows(pre_binned, filter_binned)
col = corecol(numcol=2)
lab = c("Pre-filter", "Post-filter")
#ylabels = c("1200", as.character(seq(from=0, to=7000, by=1000)))
ylabels = as.character(seq(from=-1000, to=7000, by=1000))

dualp = ggplot(binned_data, aes(x = bin, y = ifelse(filter == "Post-filter",-1, 1)*count, fill = filter)) + 
  geom_col(width=0.01) +
  geom_hline(yintercept=0) +
  scale_x_continuous(name = "dS", limits = c(-0.02, 1), expand = c(0, 0)) +
  scale_y_continuous(name = "# Transcripts", limits=c(-1000,7000), breaks = 1000*(-1:7), labels=ylabels) +
  scale_fill_manual(name="", limits=lab, values=col) +
  bartheme()
print(dualp)

1.4 Mouse-rat dN for orthologous transcripts

All data from Ensembl release 99.

#####################
# Mouse-rat dN for transcripts
#####################\
# pre_binned = binData(pre_data, "dN.with.Rat", 1)
# pre_binned$filter = "Pre-filter"
# 
# filter_binned = binData(filter_data, "dN.with.Rat", 1)
# filter_binned$filter = "Post-filter"

bins = seq(0,1,by=0.01)
bin_labels = seq(0.01,1,by=0.01)
pre_counts = hist(pre_data$dN.with.Rat, breaks=bins, include.lowest=T, plot=F)$counts
pre_binned = data.frame("bin"=bin_labels, "count"=pre_counts, filter="Pre-filter")

filter_counts = hist(filter_data$dN.with.Rat, breaks=bins, include.lowest=T, plot=F)$counts
filter_binned = data.frame("bin"=bin_labels, "count"=filter_counts, filter="Post-filter")

binned_data = bind_rows(pre_binned, filter_binned)
col = corecol(numcol=2)
lab = c("Pre-filter", "Post-filter")
ylabels = c("5000", as.character(seq(from=0, to=30000, by=5000)))

dualp = ggplot(binned_data, aes(x = bin, y = ifelse(filter == "Post-filter",-1, 1)*count, fill = filter)) + 
  geom_col(width=0.01) +
  geom_hline(yintercept=0) +
  scale_x_continuous(name = "dN", limits = c(-0.02, 1), expand = c(0, 0)) +
  scale_y_continuous(name = "# Transcripts", limits=c(-5000,30000), breaks = 5000*(-1:6), labels=ylabels) +
  scale_fill_manual(name="", limits=lab, values=col) +
  bartheme()
print(dualp)

1.5 Mouse-rat dN/dS for orthologous transcripts

All data from Ensembl release 99.

#####################
# Mouse-rat dN/dS for transcripts
#####################
# pre_data = subset(pre_data, !is.na(dnds))
# pre_binned = binData(pre_data, "dnds", 2)
# pre_binned$filter = "Pre-filter"
# 
# filter_data = subset(filter_data, !is.na(dnds))
# filter_binned = binData(filter_data, "dnds", 2)
# filter_binned$filter = "Post-filter"

pre_data_f = subset(pre_data, dnds <= 2)

bins = seq(0,2,by=0.01)
bin_labels = seq(0.01,2,by=0.01)
pre_counts = hist(pre_data_f$dnds, breaks=bins, include.lowest=T, plot=F)$counts
pre_binned = data.frame("bin"=bin_labels, "count"=pre_counts, filter="Pre-filter")

filter_data_f = subset(filter_data, dnds <= 2)

filter_counts = hist(filter_data_f$dnds, breaks=bins, include.lowest=T, plot=F)$counts
filter_binned = data.frame("bin"=bin_labels, "count"=filter_counts, filter="Post-filter")

binned_data = bind_rows(pre_binned, filter_binned)
col = corecol(numcol=2)
lab = c("Pre-filter", "Post-filter")
ylabels = c(as.character(seq(from=-1000, to=6000, by=1000)))

dualp = ggplot(binned_data, aes(x = bin, y = ifelse(filter == "Post-filter",-1, 1)*count, fill = filter)) + 
  geom_col(width=0.01) +
  geom_hline(yintercept=0) +
  scale_x_continuous(name = "dN/dS", limits = c(-0.02, 2), expand = c(0, 0)) +
  scale_y_continuous(name = "# Transcripts", limits=c(-1000,6000), breaks = 1000*(-1:6), labels=ylabels) +
  scale_fill_manual(name="", limits=lab, values=col) +
  bartheme()
print(dualp)

1.6 Total transcript length vs total probe target length

Sum length of all probe targets in a given transcript

ann_data = read.csv("../../data/Mus-selected-sequences_metadata_samp-counts_ratids.csv", header=T)
ts_lens = unique(select(ann_data, transcript_id, transcript_length))
names(ts_lens) = c("tid", "ts.len")

target_data = read.csv("../../data/target-bed/mm10-targets-to-exons-0.9.bed", sep="\t", header=F)
names(target_data) = c("target.chrome", "target.start", "target.end", "target.id", "exon.chrome", "exon.start", "exon.end", "exon.ids", "exon.score", "exon.strand", "overlap")
target_data$target.len = target_data$target.end - target_data$target.start
target_data$exon.len = target_data$exon.end - target_data$exon.start

target_data = target_data %>% separate(exon.ids, c("gid", "tid", "eid"), ";")
target_data = subset(target_data, !is.na(eid))
target_data = subset(target_data, tid %in% ts_lens$tid)
target_data$tid = as.factor(target_data$tid)
#target_data_final = target_data %>% select(tid, target.len) %>% group_by(tid) #%>% summarize(target.len=sum(target.len))
target_data_final = target_data %>% group_by(tid) %>% summarize(target.len=sum(target.len))
#mus_lens = mus_lens %>% group_by(transcript_id) %>% summarize(nongap.len=sum(target_length))

ts_target_lens = merge(ts_lens, target_data_final, by="tid")
ts_target_lens = subset(ts_target_lens, ts.len < 10000 & target.len < 10000)
ts_targ_len_p = ggplot(ts_target_lens, aes(x=ts.len, y=target.len)) +
  geom_point(alpha=0.1, color=corecol(pal="wilke", numcol=1, offset=2)) +
  geom_smooth(method="lm", se=F, linetype="dashed", color="#333333") +
  xlab("Total transcript length") +
  ylab("Sum of length of all probe targets\nin transcript") +
  bartheme()
print(ts_targ_len_p)

1.7 Number of samples matching each coding exon

ann_data = subset(ann_data, protein_coding=="TRUE")

exon_count_dist = ggplot(ann_data, aes(x=matching_sample_count)) +
  geom_histogram(identity="count", fill=corecol(numcol=1, offset=1)) +
  xlab("# of samples with BLAST hit") +
  ylab("# of targets") +
  scale_y_continuous(expand=c(0,0)) +
  bartheme()
print(exon_count_dist)

1.8 Assembly read depth vs number of BLAST hits to probe targets

asm_data = read.csv("../../data/exome-stats.csv", header=T)
asm_data = subset(asm_data, referee.avg.depth < 60)

targs_depth_p = ts_targ_len_p = ggplot(asm_data, aes(x=referee.avg.depth, y=blast.hits.to.targets)) +
  geom_point(alpha=0.5, color=corecol(pal="wilke", numcol=1, offset=4)) +
  geom_smooth(method="lm", se=F, linetype="dashed", color="#333333") +
  xlab("Avg. read depth") +
  ylab("# of BLAST hits") +
  bartheme()
print(targs_depth_p)