Exome assembly
::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
knitr
library(ggplot2)
library(cowplot)
library(ggbeeswarm)
library(ggpubr)
source("../lib/design.r")
= 203198
targets = 267.5448577249776
target_avg_len = 100
target_min_len = 54364580
target_total_len
= 470359
tiles = 109.8747
tile_avg_len = 57
tile_min_len = 51679441
tile_total_len # Target and tile info
#htmltools::includeHTML("../html-chunks/nav.html")
1 Exome assembly stats with Spades
1.1 Number of scaffolds by batch
= read.csv("../../data/exome-stats.csv", header=TRUE)
spades_data #spades_data = subset(spades_data, "Mus pos_ctrl" !%in% Species)
#spades_data$batch = as.character(spades_data$batch)
$Batch = factor(spades_data$Batch, levels=c("1", "2", "3", "4", "5", "6"))
spades_data
####################
# Spades contig count distributions by batch
= ggplot(spades_data, aes(x=Batch, y=num.scaffs, fill=Batch)) +
scaff_batch_p geom_hline(yintercept=targets, linetype=2, size=1, color="#666666") +
geom_hline(yintercept=tiles, linetype=2, size=1, color="#9494b8") +
geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
scale_fill_manual(labels=c("Original 40", "Rattus 8", "Mus 14", "New 70", "Aussie 57", "New 21"),
values=c("#920000", "#333333", "#006ddb", "#db6d00", "#b66dff", "#db6d00")) +
annotate("text", x=1.1, y=targets-10000, label="# targets", color="#666666", size=4) +
annotate("text", x=0.9, y=tiles-10000, label="# tiles", color="#9494b8", size=4) +
#ggtitle("Spades assembly counts") +
labs(x="Sequencing batch", y="# Scaffolds") +
scale_x_discrete(labels=c("Original 40", "Rattus 8", "Mus 14", "New 70", "Aussie 57", "New 21")) +
bartheme() +
theme(legend.position="none",
axis.text.x = element_text(angle=25, hjust=1))
print(scaff_batch_p)
1.2 Scaffold length by batch
####################
# Spades length distributions, by batch
= ggplot(spades_data, aes(x=Batch, y=avg.scaff.len, fill=Batch)) +
len_batch_p2 geom_hline(yintercept=target_avg_len, linetype=2, size=1, color="#666666") +
geom_hline(yintercept=target_min_len, linetype=2, size=1, color="#666666") +
geom_hline(yintercept=tile_avg_len, linetype=2, size=1, color="#9494b8") +
geom_hline(yintercept=tile_min_len, linetype=2, size=1, color="#9494b8") +
geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
scale_fill_manual(labels=c("Original 40", "Rattus 8", "Mus 14", "New 70", "Aussie 46", "New 27"),
values=c("#920000", "#333333", "#006ddb", "#db6d00", "#b66dff", "#db6d00")) +
annotate("text", x=1.4, y=target_avg_len-25, label="Avg. target len", color="#666666", size=4) +
annotate("text", x=1.4, y=target_min_len-20, label="Min. target len", color="#666666", size=4) +
annotate("text", x=5.5, y=tile_avg_len+35, label="Avg. tile len", color="#9494b8", size=4) +
annotate("text", x=5.5, y=tile_min_len-15, label="Min. tile len", color="#9494b8", size=4) +
#ggtitle("Spades assembly counts") +
labs(x="Sequencing batch", y="Avg. scaffold length") +
scale_x_discrete(labels=c("Original 40", "Rattus 8", "Mus 14", "New 70", "Aussie 46", "New 27")) +
bartheme() +
theme(legend.position="none",
axis.text.x = element_text(angle=25, hjust=1))
print(len_batch_p2)
1.3 Total assembly length
####################
# Spades length distributions, by batch
= ggplot(spades_data, aes(x=Batch, y=asm.len, fill=Batch)) +
asm_batch_p2 geom_hline(yintercept=target_total_len, linetype=2, size=1, color="#666666") +
geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
scale_fill_manual(labels=c("Original 40", "Rattus 8", "Mus 14", "New 70", "Aussie 46", "New 27"),
values=c("#920000", "#333333", "#006ddb", "#db6d00", "#b66dff", "#db6d00")) +
annotate("text", x=1., y=target_total_len-5000000, label="Total target len", color="#666666", size=4) +
#ggtitle("Spades assembly counts") +
labs(x="Sequencing batch", y="Total assembly length") +
scale_x_discrete(labels=c("Original 40", "Rattus 8", "Mus 14", "New 70", "Aussie 46", "New 27")) +
bartheme() +
theme(legend.position="none",
axis.text.x = element_text(angle=25, hjust=1))
print(asm_batch_p2)
= subset(spades_data, asm.len<target_total_len)
short_asm print("Assemblies shorter than total target length:")
## [1] "Assemblies shorter than total target length:"
print(as.character(short_asm$Label))
## [1] "Notomys-amplus" "Notomys-longicaudatus" "Notomys-macrotis"
## [4] "Pseudomys-auritus" "Pseudomys-gouldii"
1.4 Read depth to assembly by batch (no flanking regions)
####################
# Spades length distributions, by batch
#spades_data = subset(spades_data, Label != "Pseudomys-auritus")
= ggplot(spades_data, aes(x=Batch, y=asm.avg.mid.depth, fill=Batch)) +
depth_p geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
scale_fill_manual(labels=c("Original 40", "Rattus 8", "Mus 14", "New 70", "Aussie 46", "New 27"),
values=c("#920000", "#333333", "#006ddb", "#db6d00", "#b66dff", "#db6d00")) +
#ggtitle("Spades assembly counts") +
labs(x="Sequencing batch", y="Avg. depth to assembly") +
scale_x_discrete(labels=c("Original 40", "Rattus 8", "Mus 14", "New 70", "Aussie 46", "New 27")) +
bartheme() +
theme(legend.position="none",
axis.text.x = element_text(angle=25, hjust=1))
print(depth_p)
= subset(spades_data, asm.avg.mid.depth > 70)
high_depth print("Assemblies with avg. depth > 70:")
## [1] "Assemblies with avg. depth > 70:"
print(as.character(high_depth$Label))
## [1] "Conilurus-albipes" "Leporillus-apicalis"
## [3] "Notomys-amplus" "Notomys-longicaudatus"
## [5] "Notomys-macrotis" "Pseudomys-auritus"
## [7] "Pseudomys-gouldii" "Pseudomys-novaehollandiae"
1.5 Read depth to mouse by batch
####################
# Spades length distributions, by batch
= ggplot(spades_data, aes(x=Batch, y=mouse.target.avg.depth, fill=Batch)) +
depth_p geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
scale_fill_manual(labels=c("Original 40", "Rattus 8", "Mus 14", "New 70", "Aussie 46", "New 27"),
values=c("#920000", "#333333", "#006ddb", "#db6d00", "#b66dff", "#db6d00")) +
#ggtitle("Spades assembly counts") +
labs(x="Sequencing batch", y="Avg. depth to mouse") +
scale_x_discrete(labels=c("Original 40", "Rattus 8", "Mus 14", "New 70", "Aussie 46", "New 27")) +
bartheme() +
theme(legend.position="none",
axis.text.x = element_text(angle=25, hjust=1))
print(depth_p)
1.6 Comparison of read depth mapped to assembly vs to mouse
####################
# Read depth
= subset(spades_data, select=c(Label, asm.avg.mid.depth))
asm_depth_mid names(asm_depth_mid)[2] = "Depth"
$Mapping = "Asm mid-regions"
asm_depth_mid= signif(mean(asm_depth_mid$Depth, na.rm=T), 3)
asm_avg_mid
= subset(spades_data, select=c(Label, asm.avg.start.depth, asm.avg.end.depth))
asm_depth_flank $Depth = asm_depth_flank$asm.avg.start.depth + asm_depth_flank$asm.avg.end.depth
asm_depth_flank$Mapping = "Asm flanking regions"
asm_depth_flank= subset(asm_depth_flank, select=c(Label, Depth, Mapping))
asm_depth_flank = signif(mean(asm_depth_flank$Depth, na.rm=T), 3)
asm_avg_flank
= subset(spades_data, select=c(Label, mouse.target.avg.depth))
mouse_depth names(mouse_depth)[2] = "Depth"
$Mapping = "Mouse targets"
mouse_depth= signif(mean(mouse_depth$Depth, na.rm=T), 3)
mouse_avg
= rbind(asm_depth_flank, asm_depth_mid, mouse_depth)
paired_depth
# depth_p = ggpaired(paired_depth, x="Mapping", y="Depth",
# fill="Mapping", palette=corecol(pal="trek", numcol=2), color="#333333",
# line.color="#b3b3b3", line.size = 0.4,
# width=0.4, point.size=2,
# xlab="Mapping", ylab="Average read depth") +
# stat_compare_means(paired=TRUE, label.x=1) +
# bartheme() +
# theme(legend.position="none")
#
# print(depth_p)
= corecol(pal="trek", numcol=3)
cols
= ggplot(paired_depth, aes(x=Mapping, y=Depth, fill=Mapping)) +
depth_p geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
xlab("Mapping to...") +
ylab("Average read depth") +
scale_fill_manual(limits=c("Asm flanking regions", "Asm mid-regions", "Mouse targets"), values=cols) +
geom_hline(yintercept=asm_avg_flank, linetype=2, size=1, color=cols[1]) +
annotate("text", x=3.5, y=asm_avg_flank-3, label=asm_avg_flank, color=cols[1], size=4) +
geom_hline(yintercept=asm_avg_mid, linetype=2, size=1, color=cols[2]) +
annotate("text", x=3.5, y=asm_avg_mid-3, label=asm_avg_mid, color=cols[2], size=4) +
geom_hline(yintercept=mouse_avg, linetype=2, size=1, color=cols[3]) +
annotate("text", x=3.5, y=mouse_avg+7, label=mouse_avg, color=cols[3], size=4) +
bartheme() +
theme(legend.position="none")
print(depth_p)
1.7 Comparison of # of reads mapped to assembly vs to mouse
####################
# Read depth
= subset(spades_data, select=c(Label, asm.reads.mapped))
asm_reads names(asm_reads)[2] = "Num.reads"
$Mapping = "Assembly"
asm_reads= signif(mean(asm_reads$Num.reads, na.rm=T), 3)
asm_avg_reads
= subset(spades_data, select=c(Label, mouse.target.reads.mapped))
mouse_reads names(mouse_reads)[2] = "Num.reads"
$Mapping = "Mouse targets"
mouse_reads= signif(mean(mouse_reads$Num.reads, na.rm=T), 3)
mouse_avg_reads
= rbind(asm_reads, mouse_reads)
paired_reads
= corecol(pal="trek", numcol=2, offset=1)
cols
= ggplot(paired_reads, aes(x=Mapping, y=Num.reads, fill=Mapping)) +
reads_p geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
xlab("Mapping to...") +
ylab("# reads mapped") +
scale_fill_manual(limits=c("Assembly", "Mouse targets"), values=cols) +
geom_hline(yintercept=asm_avg_reads, linetype=2, size=1, color=cols[1]) +
annotate("text", x=2.4, y=asm_avg_reads-2000000, label=asm_avg_reads, color=cols[1], size=4) +
geom_hline(yintercept=mouse_avg_reads, linetype=2, size=1, color=cols[2]) +
annotate("text", x=2.4, y=mouse_avg_reads+3000000, label=mouse_avg_reads, color=cols[2], size=4) +
bartheme() +
theme(legend.position="none")
print(reads_p)
= subset(spades_data, asm.reads.mapped > 80000000)
high_reads print("Assemblies with reads mapped > 80000000:")
## [1] "Assemblies with reads mapped > 80000000:"
print(as.character(high_reads$Label))
## [1] "Rattus-hoffmanni" "Conilurus-albipes"
## [3] "Leporillus-apicalis" "Notomys-amplus"
## [5] "Pseudomys-gouldii" "Pseudomys-novaehollandiae"
1.8 Comparison of % of reads mapped to assembly vs to mouse
####################
# Read depth
= subset(spades_data, select=c(Label, asm.perc.reads.mapped))
asm_perc_reads names(asm_perc_reads)[2] = "Perc.reads"
$Mapping = "Assembly"
asm_perc_reads= signif(mean(asm_perc_reads$Perc.reads, na.rm=T), 3)
asm_avg_perc
= subset(spades_data, select=c(Label, mouse.target.perc.paired.mapped))
mouse_perc_reads names(mouse_perc_reads)[2] = "Perc.reads"
$Mapping = "Mouse targets"
mouse_perc_reads= signif(mean(mouse_perc_reads$Perc.reads, na.rm=T), 3)
mouse_avg_perc
= rbind(asm_perc_reads, mouse_perc_reads)
paired_perc
= corecol(pal="trek", numcol=2, offset=1)
cols
= ggplot(paired_perc, aes(x=Mapping, y=Perc.reads, fill=Mapping)) +
perc_p geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
xlab("Mapping to...") +
ylab("% reads mapped") +
scale_fill_manual(limits=c("Assembly", "Mouse targets"), values=cols) +
geom_hline(yintercept=asm_avg_perc, linetype=2, size=1, color=cols[1]) +
annotate("text", x=2.4, y=asm_avg_perc-1, label=asm_avg_perc, color=cols[1], size=4) +
geom_hline(yintercept=mouse_avg_perc, linetype=2, size=1, color=cols[2]) +
annotate("text", x=2.4, y=mouse_avg_perc+1, label=mouse_avg_perc, color=cols[2], size=4) +
bartheme() +
theme(legend.position="none")
print(perc_p)
= subset(spades_data, asm.perc.reads.mapped < 75)
low_perc_reads print("Assemblies with < 75% reads mapped:")
## [1] "Assemblies with < 75% reads mapped:"
print(as.character(low_perc_reads$Label))
## [1] "Notomys-amplus" "Notomys-longicaudatus" "Notomys-macrotis"
## [4] "Pseudomys-gouldii"
1.9 Read stats
####################
# Read depth
= subset(spades_data, select=c(Label, raw.reads))
raw_reads names(raw_reads)[2] = "num.reads"
$step = "Raw reads"
raw_reads
= subset(spades_data, select=c(Label, fastp.reads))
fastp_reads names(fastp_reads)[2] = "num.reads"
$step = "Fastp reads"
fastp_reads
= subset(spades_data, select=c(Label, decon.reads))
decon_reads names(decon_reads)[2] = "num.reads"
$step = "Decon reads"
decon_reads
= subset(spades_data, select=c(Label, asm.reads.mapped))
asm_reads names(asm_reads)[2] = "num.reads"
$step = "Asm reads mapped"
asm_reads
= subset(spades_data, select=c(Label, asm.duplicate.reads))
asm_dups names(asm_dups)[2] = "num.reads"
$step = "Asm duplicate reads"
asm_dups
= subset(spades_data, select=c(Label, mouse.reads.mapped))
mouse_reads names(mouse_reads)[2] = "num.reads"
$step = "Mouse reads mapped"
mouse_reads
= subset(spades_data, select=c(Label, mouse.duplicate.reads))
mouse_dups names(mouse_dups)[2] = "num.reads"
$step = "Mouse duplicate reads"
mouse_dups
= subset(spades_data, select=c(Label, referee.reads.mapped))
ref_reads names(ref_reads)[2] = "num.reads"
$step = "Referee reads mapped"
ref_reads
= subset(spades_data, select=c(Label, referee.duplicate.reads))
ref_dups names(ref_dups)[2] = "num.reads"
$step = "Referee duplicate reads"
ref_dups
= rbind(raw_reads, fastp_reads, decon_reads, asm_reads, asm_dups, mouse_reads, mouse_dups, ref_reads, ref_dups)
reads_data = corecol(numcol=9)
cols $step = factor(reads_data$step, levels=c("Raw reads", "Fastp reads", "Decon reads", "Asm reads mapped", "Asm duplicate reads", "Mouse reads mapped", "Mouse duplicate reads", "Referee reads mapped", "Referee duplicate reads"))
reads_data
= ggplot(reads_data, aes(x=step, y=num.reads, fill=step)) +
reads_p geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
xlab("Step") +
ylab("# reads") +
scale_fill_manual(values=cols) +
bartheme() +
theme(legend.position="none",
axis.text.x = element_text(angle=45, hjust=1, size=8))
print(reads_p)
1.10 Referee stats
####################
# Referee
# Low scores are -1, -2, -3, probably mostly unmapped (-2)
# Errors corrected matches 0 scores
= ggplot(spades_data, aes(x="Errors corrected", y=referee.errors.corrected)) +
corrected_p geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5, fill=corecol(pal="trekdark", numcol=1)) +
xlab("") +
ylab("# positions") +
scale_fill_manual(values=cols) +
bartheme() +
theme(legend.position="none",
axis.title.y=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_text(angle=90, hjust=0.5))
#print(corrected_p)
= ggplot(spades_data, aes(x="Errors per base", y=referee.error.rate)) +
rate_p geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5, fill=corecol(pal="trekdark", numcol=1, offset=1)) +
xlab("") +
ylab("Errors per base") +
scale_fill_manual(values=cols) +
bartheme() +
theme(legend.position="none",
axis.title.y=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_text(angle=90, hjust=0.5))
#print(rate_p)
= ggplot(spades_data, aes(x="Low quality positions", y=referee.low.scores)) +
low_p geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5, fill=corecol(pal="trekdark", numcol=1, offset=2)) +
xlab("") +
ylab("# positions") +
scale_fill_manual(values=cols) +
bartheme() +
theme(legend.position="none",
axis.title.y=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_text(angle=90, hjust=0.5))
#print(low_p)
= ggplot(spades_data, aes(x="Unmapped positions", y=referee.unmapped.pos)) +
unmapped_p geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5, fill=corecol(pal="trekdark", numcol=1, offset=3)) +
xlab("") +
ylab("# positions") +
scale_fill_manual(values=cols) +
bartheme() +
theme(legend.position="none",
axis.title.y=element_blank(),
axis.ticks.x=element_blank(),
axis.text.y=element_text(angle=90, hjust=0.5))
#print(unmapped_p)
= ggdraw() + draw_label("Referee stats", size=18, fontface="bold", x=0, hjust=0) + theme(plot.margin=margin(0,0,0,7))
p_title = plot_grid(corrected_p, rate_p, low_p, unmapped_p, ncol=4)
p_row = plot_grid(p_title, p_row, ncol=1, rel_heights=c(0.05,1))
p print(p)
1.11 Other stats
= read.csv("../../data/count-ns.csv", header=T)
contig_data #contig_data$Ns.per.site = contig_data$Ns / contig_data$length
= ggplot(contig_data, aes(x=Ns)) +
n_p geom_histogram() +
scale_y_continuous(expand=c(0,0)) +
bartheme()
print(n_p)
$hets.per.site = contig_data$hets / contig_data$length
contig_data= ggplot(contig_data, aes(x=hets.per.site)) +
hets_p geom_histogram(stat=) +
scale_y_continuous(expand=c(0,0)) +
bartheme()
print(hets_p)