Exome assembly

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

library(ggplot2)
library(cowplot)
library(ggbeeswarm)
library(ggpubr)
source("../lib/design.r")

targets = 203198
target_avg_len = 267.5448577249776
target_min_len = 100
target_total_len = 54364580

tiles = 470359
tile_avg_len = 109.8747
tile_min_len = 57
tile_total_len = 51679441
# Target and tile info

#htmltools::includeHTML("../html-chunks/nav.html")

1 Exome assembly stats with Spades

1.1 Number of scaffolds by batch

spades_data = read.csv("../../data/exome-stats.csv", header=TRUE)
#spades_data = subset(spades_data, "Mus pos_ctrl" !%in% Species)
#spades_data$batch = as.character(spades_data$batch)
spades_data$Batch = factor(spades_data$Batch, levels=c("1", "2", "3", "4", "5", "6"))

####################
# Spades contig count distributions by batch
scaff_batch_p = ggplot(spades_data, aes(x=Batch, y=num.scaffs, fill=Batch)) +
  geom_hline(yintercept=targets, linetype=2, size=1, color="#666666") +
  geom_hline(yintercept=tiles, linetype=2, size=1, color="#9494b8") +
  geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  scale_fill_manual(labels=c("Original 40", "Rattus 8", "Mus 14", "New 70", "Aussie 57", "New 21"),
                     values=c("#920000", "#333333", "#006ddb", "#db6d00", "#b66dff", "#db6d00")) +
  annotate("text", x=1.1, y=targets-10000, label="# targets", color="#666666", size=4) +
  annotate("text", x=0.9, y=tiles-10000, label="# tiles", color="#9494b8", size=4) +
  #ggtitle("Spades assembly counts") +
  labs(x="Sequencing batch", y="# Scaffolds") +
  scale_x_discrete(labels=c("Original 40", "Rattus 8", "Mus 14", "New 70", "Aussie 57", "New 21")) +
  bartheme() + 
  theme(legend.position="none",
        axis.text.x = element_text(angle=25, hjust=1))

print(scaff_batch_p)

1.2 Scaffold length by batch

####################
# Spades length distributions, by batch
len_batch_p2 = ggplot(spades_data, aes(x=Batch, y=avg.scaff.len, fill=Batch)) +
  geom_hline(yintercept=target_avg_len, linetype=2, size=1, color="#666666") +
  geom_hline(yintercept=target_min_len, linetype=2, size=1, color="#666666") +
  geom_hline(yintercept=tile_avg_len, linetype=2, size=1, color="#9494b8") +
  geom_hline(yintercept=tile_min_len, linetype=2, size=1, color="#9494b8") +
  geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  scale_fill_manual(labels=c("Original 40", "Rattus 8", "Mus 14", "New 70", "Aussie 46", "New 27"),
                     values=c("#920000", "#333333", "#006ddb", "#db6d00", "#b66dff", "#db6d00")) +
  annotate("text", x=1.4, y=target_avg_len-25, label="Avg. target len", color="#666666", size=4) +
  annotate("text", x=1.4, y=target_min_len-20, label="Min. target len", color="#666666", size=4) +
  annotate("text", x=5.5, y=tile_avg_len+35, label="Avg. tile len", color="#9494b8", size=4) +
  annotate("text", x=5.5, y=tile_min_len-15, label="Min. tile len", color="#9494b8", size=4) +
  #ggtitle("Spades assembly counts") +
  labs(x="Sequencing batch", y="Avg. scaffold length") +
  scale_x_discrete(labels=c("Original 40", "Rattus 8", "Mus 14", "New 70", "Aussie 46", "New 27")) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=25, hjust=1))

print(len_batch_p2)

1.3 Total assembly length

####################
# Spades length distributions, by batch
asm_batch_p2 = ggplot(spades_data, aes(x=Batch, y=asm.len, fill=Batch)) +
  geom_hline(yintercept=target_total_len, linetype=2, size=1, color="#666666") +
  geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  scale_fill_manual(labels=c("Original 40", "Rattus 8", "Mus 14", "New 70", "Aussie 46", "New 27"),
                     values=c("#920000", "#333333", "#006ddb", "#db6d00", "#b66dff", "#db6d00")) +
  annotate("text", x=1., y=target_total_len-5000000, label="Total target len", color="#666666", size=4) +
  #ggtitle("Spades assembly counts") +
  labs(x="Sequencing batch", y="Total assembly length") +
  scale_x_discrete(labels=c("Original 40", "Rattus 8", "Mus 14", "New 70", "Aussie 46", "New 27")) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=25, hjust=1))

print(asm_batch_p2)

short_asm = subset(spades_data, asm.len<target_total_len)
print("Assemblies shorter than total target length:")
## [1] "Assemblies shorter than total target length:"
print(as.character(short_asm$Label))
## [1] "Notomys-amplus"        "Notomys-longicaudatus" "Notomys-macrotis"     
## [4] "Pseudomys-auritus"     "Pseudomys-gouldii"

1.4 Read depth to assembly by batch (no flanking regions)

####################
# Spades length distributions, by batch
#spades_data = subset(spades_data, Label != "Pseudomys-auritus")

depth_p = ggplot(spades_data, aes(x=Batch, y=asm.avg.mid.depth, fill=Batch)) +
  geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  scale_fill_manual(labels=c("Original 40", "Rattus 8", "Mus 14", "New 70", "Aussie 46", "New 27"),
                     values=c("#920000", "#333333", "#006ddb", "#db6d00", "#b66dff", "#db6d00")) +
  #ggtitle("Spades assembly counts") +
  labs(x="Sequencing batch", y="Avg. depth to assembly") +
  scale_x_discrete(labels=c("Original 40", "Rattus 8", "Mus 14", "New 70", "Aussie 46", "New 27")) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=25, hjust=1))

print(depth_p)

high_depth = subset(spades_data, asm.avg.mid.depth > 70)
print("Assemblies with avg. depth > 70:")
## [1] "Assemblies with avg. depth > 70:"
print(as.character(high_depth$Label))
## [1] "Conilurus-albipes"         "Leporillus-apicalis"      
## [3] "Notomys-amplus"            "Notomys-longicaudatus"    
## [5] "Notomys-macrotis"          "Pseudomys-auritus"        
## [7] "Pseudomys-gouldii"         "Pseudomys-novaehollandiae"

1.5 Read depth to mouse by batch

####################
# Spades length distributions, by batch
depth_p = ggplot(spades_data, aes(x=Batch, y=mouse.target.avg.depth, fill=Batch)) +
  geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  scale_fill_manual(labels=c("Original 40", "Rattus 8", "Mus 14", "New 70", "Aussie 46", "New 27"),
                     values=c("#920000", "#333333", "#006ddb", "#db6d00", "#b66dff", "#db6d00")) +
  #ggtitle("Spades assembly counts") +
  labs(x="Sequencing batch", y="Avg. depth to mouse") +
  scale_x_discrete(labels=c("Original 40", "Rattus 8", "Mus 14", "New 70", "Aussie 46", "New 27")) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=25, hjust=1))

print(depth_p)

1.6 Comparison of read depth mapped to assembly vs to mouse

####################
# Read depth

asm_depth_mid = subset(spades_data, select=c(Label, asm.avg.mid.depth))
names(asm_depth_mid)[2] = "Depth"
asm_depth_mid$Mapping = "Asm mid-regions"
asm_avg_mid = signif(mean(asm_depth_mid$Depth, na.rm=T), 3)

asm_depth_flank = subset(spades_data, select=c(Label, asm.avg.start.depth, asm.avg.end.depth))
asm_depth_flank$Depth = asm_depth_flank$asm.avg.start.depth + asm_depth_flank$asm.avg.end.depth
asm_depth_flank$Mapping = "Asm flanking regions"
asm_depth_flank = subset(asm_depth_flank, select=c(Label, Depth, Mapping))
asm_avg_flank = signif(mean(asm_depth_flank$Depth, na.rm=T), 3)

mouse_depth = subset(spades_data, select=c(Label, mouse.target.avg.depth))
names(mouse_depth)[2] = "Depth"
mouse_depth$Mapping = "Mouse targets"
mouse_avg = signif(mean(mouse_depth$Depth, na.rm=T), 3)

paired_depth = rbind(asm_depth_flank, asm_depth_mid, mouse_depth)

# depth_p = ggpaired(paired_depth, x="Mapping", y="Depth",
#                  fill="Mapping", palette=corecol(pal="trek", numcol=2), color="#333333",
#                  line.color="#b3b3b3", line.size = 0.4,
#                  width=0.4, point.size=2,
#                  xlab="Mapping", ylab="Average read depth") +
#   stat_compare_means(paired=TRUE, label.x=1) +
#   bartheme() +
#   theme(legend.position="none")
# 
# print(depth_p)

cols = corecol(pal="trek", numcol=3)

depth_p = ggplot(paired_depth, aes(x=Mapping, y=Depth, fill=Mapping)) +
  geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  xlab("Mapping to...") +
  ylab("Average read depth") +
  scale_fill_manual(limits=c("Asm flanking regions", "Asm mid-regions", "Mouse targets"), values=cols) +
  geom_hline(yintercept=asm_avg_flank, linetype=2, size=1, color=cols[1]) +
  annotate("text", x=3.5, y=asm_avg_flank-3, label=asm_avg_flank, color=cols[1], size=4) +
  geom_hline(yintercept=asm_avg_mid, linetype=2, size=1, color=cols[2]) +
  annotate("text", x=3.5, y=asm_avg_mid-3, label=asm_avg_mid, color=cols[2], size=4) +
  geom_hline(yintercept=mouse_avg, linetype=2, size=1, color=cols[3]) +
  annotate("text", x=3.5, y=mouse_avg+7, label=mouse_avg, color=cols[3], size=4) +
  bartheme() +
  theme(legend.position="none")
print(depth_p)

1.7 Comparison of # of reads mapped to assembly vs to mouse

####################
# Read depth

asm_reads = subset(spades_data, select=c(Label, asm.reads.mapped))
names(asm_reads)[2] = "Num.reads"
asm_reads$Mapping = "Assembly"
asm_avg_reads = signif(mean(asm_reads$Num.reads, na.rm=T), 3)

mouse_reads = subset(spades_data, select=c(Label, mouse.target.reads.mapped))
names(mouse_reads)[2] = "Num.reads"
mouse_reads$Mapping = "Mouse targets"
mouse_avg_reads = signif(mean(mouse_reads$Num.reads, na.rm=T), 3)

paired_reads = rbind(asm_reads, mouse_reads)

cols = corecol(pal="trek", numcol=2, offset=1)

reads_p = ggplot(paired_reads, aes(x=Mapping, y=Num.reads, fill=Mapping)) +
  geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  xlab("Mapping to...") +
  ylab("# reads mapped") +
  scale_fill_manual(limits=c("Assembly", "Mouse targets"), values=cols) +
  geom_hline(yintercept=asm_avg_reads, linetype=2, size=1, color=cols[1]) +
  annotate("text", x=2.4, y=asm_avg_reads-2000000, label=asm_avg_reads, color=cols[1], size=4) +
  geom_hline(yintercept=mouse_avg_reads, linetype=2, size=1, color=cols[2]) +
  annotate("text", x=2.4, y=mouse_avg_reads+3000000, label=mouse_avg_reads, color=cols[2], size=4) +
  bartheme() +
  theme(legend.position="none")
print(reads_p)

high_reads = subset(spades_data, asm.reads.mapped > 80000000)
print("Assemblies with reads mapped > 80000000:")
## [1] "Assemblies with reads mapped > 80000000:"
print(as.character(high_reads$Label))
## [1] "Rattus-hoffmanni"          "Conilurus-albipes"        
## [3] "Leporillus-apicalis"       "Notomys-amplus"           
## [5] "Pseudomys-gouldii"         "Pseudomys-novaehollandiae"

1.8 Comparison of % of reads mapped to assembly vs to mouse

####################
# Read depth

asm_perc_reads = subset(spades_data, select=c(Label, asm.perc.reads.mapped))
names(asm_perc_reads)[2] = "Perc.reads"
asm_perc_reads$Mapping = "Assembly"
asm_avg_perc = signif(mean(asm_perc_reads$Perc.reads, na.rm=T), 3)

mouse_perc_reads = subset(spades_data, select=c(Label, mouse.target.perc.paired.mapped))
names(mouse_perc_reads)[2] = "Perc.reads"
mouse_perc_reads$Mapping = "Mouse targets"
mouse_avg_perc = signif(mean(mouse_perc_reads$Perc.reads, na.rm=T), 3)

paired_perc = rbind(asm_perc_reads, mouse_perc_reads)

cols = corecol(pal="trek", numcol=2, offset=1)

perc_p = ggplot(paired_perc, aes(x=Mapping, y=Perc.reads, fill=Mapping)) +
  geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  xlab("Mapping to...") +
  ylab("% reads mapped") +
  scale_fill_manual(limits=c("Assembly", "Mouse targets"), values=cols) +
  geom_hline(yintercept=asm_avg_perc, linetype=2, size=1, color=cols[1]) +
  annotate("text", x=2.4, y=asm_avg_perc-1, label=asm_avg_perc, color=cols[1], size=4) +
  geom_hline(yintercept=mouse_avg_perc, linetype=2, size=1, color=cols[2]) +
  annotate("text", x=2.4, y=mouse_avg_perc+1, label=mouse_avg_perc, color=cols[2], size=4) +
  bartheme() +
  theme(legend.position="none")
print(perc_p)

low_perc_reads = subset(spades_data, asm.perc.reads.mapped < 75) 
print("Assemblies with < 75% reads mapped:")
## [1] "Assemblies with < 75% reads mapped:"
print(as.character(low_perc_reads$Label))
## [1] "Notomys-amplus"        "Notomys-longicaudatus" "Notomys-macrotis"     
## [4] "Pseudomys-gouldii"

1.9 Read stats

####################
# Read depth

raw_reads = subset(spades_data, select=c(Label, raw.reads))
names(raw_reads)[2] = "num.reads"
raw_reads$step = "Raw reads"

fastp_reads = subset(spades_data, select=c(Label, fastp.reads))
names(fastp_reads)[2] = "num.reads"
fastp_reads$step = "Fastp reads"

decon_reads = subset(spades_data, select=c(Label, decon.reads))
names(decon_reads)[2] = "num.reads"
decon_reads$step = "Decon reads"

asm_reads = subset(spades_data, select=c(Label, asm.reads.mapped))
names(asm_reads)[2] = "num.reads"
asm_reads$step = "Asm reads mapped"

asm_dups = subset(spades_data, select=c(Label, asm.duplicate.reads))
names(asm_dups)[2] = "num.reads"
asm_dups$step = "Asm duplicate reads"

mouse_reads = subset(spades_data, select=c(Label, mouse.reads.mapped))
names(mouse_reads)[2] = "num.reads"
mouse_reads$step = "Mouse reads mapped"

mouse_dups = subset(spades_data, select=c(Label, mouse.duplicate.reads))
names(mouse_dups)[2] = "num.reads"
mouse_dups$step = "Mouse duplicate reads"

ref_reads = subset(spades_data, select=c(Label, referee.reads.mapped))
names(ref_reads)[2] = "num.reads"
ref_reads$step = "Referee reads mapped"

ref_dups = subset(spades_data, select=c(Label, referee.duplicate.reads))
names(ref_dups)[2] = "num.reads"
ref_dups$step = "Referee duplicate reads" 

reads_data = rbind(raw_reads, fastp_reads, decon_reads, asm_reads, asm_dups, mouse_reads, mouse_dups, ref_reads, ref_dups)
cols = corecol(numcol=9)
reads_data$step = factor(reads_data$step, levels=c("Raw reads", "Fastp reads", "Decon reads", "Asm reads mapped", "Asm duplicate reads", "Mouse reads mapped", "Mouse duplicate reads", "Referee reads mapped", "Referee duplicate reads"))



reads_p = ggplot(reads_data, aes(x=step, y=num.reads, fill=step)) +
  geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  xlab("Step") +
  ylab("# reads") +
  scale_fill_manual(values=cols) +
  bartheme() +
  theme(legend.position="none",
        axis.text.x = element_text(angle=45, hjust=1, size=8))
print(reads_p)

1.10 Referee stats

####################
# Referee
# Low scores are -1, -2, -3, probably mostly unmapped (-2)
# Errors corrected matches 0 scores

corrected_p = ggplot(spades_data, aes(x="Errors corrected", y=referee.errors.corrected)) +
  geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5, fill=corecol(pal="trekdark", numcol=1)) +
  xlab("") +
  ylab("# positions") +
  scale_fill_manual(values=cols) +
  bartheme() +
  theme(legend.position="none",
        axis.title.y=element_blank(),
        axis.ticks.x=element_blank(),
        axis.text.y=element_text(angle=90, hjust=0.5))
#print(corrected_p)

rate_p = ggplot(spades_data, aes(x="Errors per base", y=referee.error.rate)) +
  geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5, fill=corecol(pal="trekdark", numcol=1, offset=1)) +
  xlab("") +
  ylab("Errors per base") +
  scale_fill_manual(values=cols) +
  bartheme() +
  theme(legend.position="none",
        axis.title.y=element_blank(),
        axis.ticks.x=element_blank(),
        axis.text.y=element_text(angle=90, hjust=0.5))
#print(rate_p)

low_p = ggplot(spades_data, aes(x="Low quality positions", y=referee.low.scores)) +
  geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5, fill=corecol(pal="trekdark", numcol=1, offset=2)) +
  xlab("") +
  ylab("# positions") +
  scale_fill_manual(values=cols) +
  bartheme() +
  theme(legend.position="none",
        axis.title.y=element_blank(),
        axis.ticks.x=element_blank(),
        axis.text.y=element_text(angle=90, hjust=0.5))
#print(low_p)

unmapped_p = ggplot(spades_data, aes(x="Unmapped positions", y=referee.unmapped.pos)) +
  geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5, fill=corecol(pal="trekdark", numcol=1, offset=3)) +
  xlab("") +
  ylab("# positions") +
  scale_fill_manual(values=cols) +
  bartheme() +
  theme(legend.position="none",
        axis.title.y=element_blank(),
        axis.ticks.x=element_blank(),
        axis.text.y=element_text(angle=90, hjust=0.5))
#print(unmapped_p)

p_title = ggdraw() + draw_label("Referee stats", size=18, fontface="bold", x=0, hjust=0) + theme(plot.margin=margin(0,0,0,7))
p_row = plot_grid(corrected_p, rate_p, low_p, unmapped_p, ncol=4)
p = plot_grid(p_title, p_row, ncol=1, rel_heights=c(0.05,1))
print(p)

1.11 Other stats

contig_data = read.csv("../../data/count-ns.csv", header=T)
#contig_data$Ns.per.site = contig_data$Ns / contig_data$length

n_p = ggplot(contig_data, aes(x=Ns)) +
  geom_histogram() +
  scale_y_continuous(expand=c(0,0)) +
  bartheme()
print(n_p)


contig_data$hets.per.site = contig_data$hets / contig_data$length
hets_p = ggplot(contig_data, aes(x=hets.per.site)) +
  geom_histogram(stat=) +
  scale_y_continuous(expand=c(0,0)) +
  bartheme()
print(hets_p)