Phyllotis xanthopygus annotation stats
#knitr::opts_chunk$set(echo = FALSE)
#this.dir <- dirname(parent.frame(2)$ofile)
#setwd(this.dir)
library(ggplot2)
library(ggbeeswarm)
library(cowplot)
library(ggrepel)
library(ggsignif)
library(kableExtra)
library(dplyr)
#library(plyr)
#library(colorspace)
#library(kableExtra)
source("../lib/design.r")
#htmltools::includeHTML("../html-chunks/nav.html")
All summaries from file: final_annot_from_nrvertebrata.gff
All comparisons to hifiasm assembly: tom-uni1803-mb-hirise-6427t_01-19-2021__final_assembly.fasta
Feature counts
= "../../data/pxan-feature-counts.csv"
countfile
= read.csv(countfile, header=T, comment.char="#")
counts
%>% kable(caption="Feature counts for P. xanthopygus") %>% kable_styling(bootstrap_options=c("striped", "condensed", "responsive"), full_width=F) counts
feature.type | count | avg.length |
---|---|---|
mRNA | 19179 | 10965.0447 |
exon | 127488 | 233.5878 |
CDS | 120731 | 151.8119 |
five_prime_UTR | 18909 | 119.4861 |
three_prime_UTR | 13378 | 685.1802 |
= ggplot(counts, aes(x=feature.type, y=count)) +
fcounts_p geom_bar(position="dodge", stat="identity", fill=corecol(numcol=1, pal="wilke", offset=4)) +
scale_y_continuous(expand=c(0,0)) +
xlab("Feature type") +
ylab("# features") +
bartheme() +
theme(axis.text.x=element_text(angle=25, hjust=1))
print(fcounts_p)
Transcript lengths (< 100000bp)
= "../../data/pxan-transcripts.tab"
trancsripts_file
= read.csv(trancsripts_file, header=T, sep="\t", comment.char="#")
transcripts $length = transcripts$end - transcripts$start
transcripts
= subset(transcripts, length<100000)
transcripts
= ggplot(transcripts, aes(x=length)) +
transcript_len_p geom_histogram(fill=corecol(numcol=1, pal="wilke", offset=5), color="#ececec", bins=100) +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
bartheme()
print(transcript_len_p)
Transcript lengths (< 100000bp) vs. scaffold lengths
= read.table("../../data/tom-uni1803-mb-hirise-6427t_01-19-2021__final_assembly.fasta.fai")
asm_stats names(asm_stats) = c("scaffold", "length", "offset", "linebases", "linewidth")
= c("Scaffold_1", "Scaffold_2", "Scaffold_3", "Scaffold_4", "Scaffold_5", "Scaffold_6", "Scaffold_7", "Scaffold_8", "Scaffold_9", "Scaffold_10", "Scaffold_11", "Scaffold_12", "Scaffold_13", "Scaffold_14", "Scaffold_15", "Scaffold_16", "Scaffold_17", "Scaffold_18", "Scaffold_19", "Scaffold_20", "Scaffold_22")
scaffs_over_10mb_ordered
#asm_stats = subset(asm_stats, length >= 10000000)
= transcripts %>% group_by(chr) %>% summarize(num.transcripts=n(), num.bases=sum(length))
transcripts_scaff_counts names(transcripts_scaff_counts)[1] = "scaffold"
= merge(asm_stats, transcripts_scaff_counts, by="scaffold")
transcripts_per_scaff
= ggplot(transcripts_per_scaff, aes(x=length, y=num.transcripts)) +
transcripts_scaff_p geom_point(size=4, color=corecol(numcol=1), alpha=0.7) +
geom_smooth(method="lm", se=F, linetype="dashed", size=1, color="#999999") +
#geom_text_repel(aes(label=scaffold),hjust=0,vjust=0) +
xlab("Scaffold length") +
ylab("# of transcripts") +
bartheme() +
theme(axis.text.x=element_text(angle=25, hjust=1))
#print(transcripts_scaff_p)
= ggplot(transcripts_per_scaff, aes(x=length, y=num.bases)) +
bases_scaff_p geom_point(size=4, color=corecol(numcol=1), alpha=0.7) +
geom_smooth(method="lm", se=F, linetype="dashed", size=1, color="#999999") +
#geom_text_repel(aes(label=chrome),hjust=0,vjust=0) +
xlab("Scaffold length") +
ylab("# of bases in transcripts") +
bartheme() +
theme(axis.text.x=element_text(angle=25, hjust=1))
#print(bases_scaff_p)
= plot_grid(transcripts_scaff_p, bases_scaff_p, ncol=2, labels=c("A","B"), label_size=16, align='vh')
p print(p)
% of bases in transcripts (< 100000bp) by scaffold
For the 22 scaffolds longer than 10Mb Scaffold_21 has no transcripts.
$perc.bases = transcripts_per_scaff$num.bases / transcripts_per_scaff$length * 100
transcripts_per_scaff= subset(transcripts_per_scaff, length >= 10000000)
transcripts_per_scaff $scaffold = gsub("(.+?)(\\__.*)", "\\1", transcripts_per_scaff$scaffold)
transcripts_per_scaff
$scaffold = factor(transcripts_per_scaff$scaffold, levels=scaffs_over_10mb_ordered)
transcripts_per_scaff
= ggplot(transcripts_per_scaff, aes(x=scaffold, y=perc.bases)) +
avg_p geom_hline(yintercept=mean(transcripts_per_scaff$perc.bases), size=1.5, linetype="dashed", color="#d3d3d3")
#geom_hline(yintercept=median(in_data$X..bases.in.genes.1), size=1.5, linetype="dashed", color="#999999")
# Start plot of avg. distance to random wRF per chrome
= 1
k for(scaff in levels(transcripts_per_scaff$scaffold)){
#print(chrome)
#print(k)
= avg_p + geom_segment(x=k, y=0, xend=k, yend=transcripts_per_scaff$perc.bases[transcripts_per_scaff$scaffold==scaff], color="#666666", linetype="dotted")
avg_p = k + 1
k
}# Add the dotted lines for each chrome
= avg_p + geom_point(size=4, color="#920000") +
avg_p #geom_text(data=blah, aes(x=20.4, y=avg_nonsig_chr-1, label=paste("Avg. = ", avg_nonsig_chr, "Mb", sep="")), size=4, color="#333333") +
#geom_text(data=blah, aes(x=20.4, y=med_nonsig_chr+1, label=paste("Median = ", med_nonsig_chr, "Mb", sep="")), size=4, color="#333333") +
xlab("Chromosome") +
ylab("% of bases in a gene") +
scale_y_continuous(expand=c(0,0), limits=c(0,100)) +
bartheme() +
theme(axis.text.x=element_text(angle=45, hjust=1))
print(avg_p)
Transcript length (< 100000bp) distributions by scaffold
For the 22 scaffolds longer than 10Mb Scaffold_21 has no transcripts.
= subset(transcripts, length <= 100000)
transcripts names(transcripts)[3] = "scaffold"
$scaffold = gsub("(.+?)(\\__.*)", "\\1", transcripts$scaffold)
transcripts= subset(transcripts, scaffold %in% scaffs_over_10mb_ordered)
transcripts $scaffold = factor(transcripts$scaffold, levels=scaffs_over_10mb_ordered)
transcripts
= ggplot(transcripts, aes(x=scaffold, y=length, fill=scaffold)) +
len_p geom_quasirandom(size=1, alpha=0.7, width=0.25, color="#d3d3d3") +
geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
#geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
xlab("Scaffold") +
ylab("Transcript length (bp)") +
bartheme() +
theme(legend.position="none",
axis.text.x=element_text(angle=45, hjust=1))
print(len_p)
cat("Page last updated:", format(Sys.time(), "%m/%d/%Y %H:%M:%S %Z"))
## Page last updated: 06/22/2021 12:07:50 MDT
::includeHTML("../html-chunks/rmd_footer.html") htmltools