#knitr::opts_chunk$set(echo = FALSE)
#this.dir <- dirname(parent.frame(2)$ofile)
#setwd(this.dir)

library(ggplot2)
library(ggbeeswarm)
library(cowplot)
library(ggrepel)
library(ggsignif)
#library(plyr)
#library(colorspace)
#library(kableExtra)
source("../lib/design.r")



#htmltools::includeHTML("../html-chunks/nav.html")

< Back Home

Feature counts

newfile = "../../data/annotation/psun-feature-counts-new.csv"
oldfile = "../../data/annotation/psun-feature-counts-old.csv"

old = read.csv(oldfile, header=T, comment.char="#")
new = read.csv(newfile, header=T, comment.char="#")

old$cat = "Old annotation"
new$cat = "New annotation"

all = rbind(old, new)
all$cat = factor(all$cat, levels=c("Old annotation", "New annotation"))

all = subset(all, feature.type != "match" & feature.type != "match_part")
all = subset(all, feature.type != "expressed_sequence_match" & feature.type != "protein_match")
comp_sub_p = ggplot(all, aes(x=feature.type, y=count, fill=cat)) +
  geom_bar(position="dodge", stat="identity") +
  scale_fill_manual(values=corecol(numcol=2, pal="wilke", offset=4)) +
  scale_y_continuous(expand=c(0,0)) +
  xlab("Feature type") +
  ylab("# features") +
  bartheme() +
  theme(axis.text.x=element_text(angle=25, hjust=1))

print(comp_sub_p)

Gene lengths (< 100000bp)

newfile = "../../data/annotation/psun-genes.tab"
oldfile = "../../data/annotation/psun-genes-old.tab"

old = read.csv(oldfile, header=T, sep="\t", comment.char="#")
new = read.csv(newfile, header=T, sep="\t", comment.char="#")

old$length = old$end - old$start
new$length = new$end - new$start

old$cat = "Old annotation"
new$cat = "New annotation"

all = rbind(old, new)
all$cat = factor(all$cat, levels=c("Old annotation", "New annotation"))

all$chr.cat = "Auto"
all$chr.cat[all$chr=="ScmyWZ3_7747_HRSCAF_7900"] = "X"

all = subset(all, length<100000)

len_p_all = ggplot(all, aes(x=cat, y=length, fill=cat)) +
  geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  ylab("Gene length (bp)") +
  xlab("") +
  ggtitle("All genes") + 
  scale_fill_manual(values=corecol(numcol=2, pal="wilke", offset=4)) +
  bartheme() +
  theme(legend.position="none")
print(len_p_all)

xgenes = subset(all, chr=="ScmyWZ3_7747_HRSCAF_7900")

len_p_x = ggplot(xgenes, aes(x=cat, y=length, fill=cat)) +
  geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  ylab("Gene length (bp)") +
  xlab("") +
  ggtitle("X genes") +
  scale_fill_manual(values=corecol(numcol=2, pal="wilke", offset=4)) +
  bartheme() +
  theme(legend.position="none")
print(len_p_x)

Gene lengths (< 100000bp) vs. chromosome lengths

in_data = read.csv("../../data/annotation/psun-gene-counts.csv", header=T)

genes_p = ggplot(in_data, aes(x=length, y=X..genes)) +
  geom_point(size=4, color=corecol(numcol=1), alpha=0.7) +
  geom_smooth(method="lm", se=F, linetype="dashed", size=1, color="#999999") +
  geom_text_repel(aes(label=chrome),hjust=0,vjust=0) +
  xlab("Scaffold length") +
  ylab("# of genes") +
  bartheme() + 
  theme(axis.text.x=element_text(angle=25, hjust=1))

#print(genes_p)

bases_p = ggplot(in_data, aes(x=length, y=X..bases.in.genes)) +
  geom_point(size=4, color=corecol(numcol=1), alpha=0.7) +
  geom_smooth(method="lm", se=F, linetype="dashed", size=1, color="#999999") +
  geom_text_repel(aes(label=chrome),hjust=0,vjust=0) +
  xlab("Scaffold length") +
  ylab("# of bases in genes") +
  bartheme() +
  theme(axis.text.x=element_text(angle=25, hjust=1))

#print(bases_p)

perc_p = ggplot(in_data, aes(x=1, y=X..bases.in.genes.1)) +
  geom_quasirandom(size=4, alpha=0.7, width=0, color=corecol(numcol=1)) +
  #geom_boxplot(outlier.shape=NA, alpha=0, width=0.1) +
  scale_x_continuous(limits=c(0.9,1.1)) +
  scale_y_continuous(limits=c(0,1)) +
  xlab("") +
  ylab("% of bases in genes") +
  bartheme() +
  theme(axis.line.y=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank()) +
  coord_flip()

#print(perc_p)
                 
p = plot_grid(genes_p, bases_p, ncol=2, labels=c("A","B"), label_size=16, align='vh')
print(p)

% of bases in genes (< 100000bp) by chromosome

in_data$X..bases.in.genes.1 = in_data$X..bases.in.genes.1 * 100

avg_p = ggplot(in_data, aes(x=chrome, y=X..bases.in.genes.1)) +
  geom_hline(yintercept=mean(in_data$X..bases.in.genes.1), size=1.5, linetype="dashed", color="#d3d3d3")
  #geom_hline(yintercept=median(in_data$X..bases.in.genes.1), size=1.5, linetype="dashed", color="#999999")
# Start plot of avg. distance to random wRF per chrome

k = 1
for(chrome in levels(in_data$chrome)){
  #print(chrome)
  #print(k)
  avg_p = avg_p + geom_segment(x=k, y=0, xend=k, yend=in_data$X..bases.in.genes.1[in_data$chrome==chrome], color="#666666", linetype="dotted")
  k = k + 1
}
# Add the dotted lines for each chrome

avg_p = avg_p + geom_point(size=4, color="#920000") +
  #geom_text(data=blah, aes(x=20.4, y=avg_nonsig_chr-1, label=paste("Avg. = ", avg_nonsig_chr, "Mb", sep="")),  size=4, color="#333333") +
  #geom_text(data=blah, aes(x=20.4, y=med_nonsig_chr+1, label=paste("Median = ", med_nonsig_chr, "Mb", sep="")),  size=4, color="#333333") +
  xlab("Chromosome") +
  ylab("% of bases in a gene") +
  scale_y_continuous(expand=c(0,0), limits=c(0,100)) +
  bartheme() +
  theme(axis.text.x=element_text(angle=15, hjust=1))
print(avg_p)

Gene length (< 100000bp) distributions by chromosome

len_data = read.csv("../../data/annotation/psun-gene-lens.csv", header=T)
len_data = subset(len_data, gene.len < 100000)

x_comps = list(c("X_R", "X_NR"))
len_p = ggplot(len_data, aes(x=chrome, y=gene.len, fill=chrome)) +
  geom_quasirandom(size=2, alpha=0.7, width=0.25, color="#d3d3d3") +
  geom_boxplot(outlier.shape=NA, alpha=0.7, width=0.5) +
  geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
  xlab("Chromosome") +
  ylab("Gene length (bp)") +
  bartheme() +
  theme(legend.position="none",
        axis.text.x=element_text(angle=25, hjust=1))

print(len_p)

< Back Home

cat("Page last updated:", format(Sys.time(), "%m/%d/%Y %H:%M:%S %Z"))
## Page last updated: 11/15/2020 15:52:20 MST
htmltools::includeHTML("../html-chunks/rmd_footer.html")