---
title: "BUSCO plots"
author: "zagor"
date: " 09 10 2019"
output: html_document
editor_options:
  chunk_output_type: console
---


# Document-wide knitr chunk defaults: echo the source of every chunk and
# render figures with width 12 and height 9.
knitr::opts_chunk$set(
  echo = TRUE,
  fig.width = 12,
  fig.height = 9
)


# Reader for the BUSCO summary tables used here: tab-separated, no header
# line, no quote handling, strings kept as character, short rows padded.
read_busco_tsv <- function(path) {
  read.table(
    file = path,
    header = FALSE,
    sep = "\t",
    quote = "",
    stringsAsFactors = FALSE,
    fill = TRUE
  )
}

myTable <- read_busco_tsv("../intermediate/BUSCOall_embryophyta_odb9.tsv")

# Preview of the BUSCOall table before the evg_out rows are added.
head(myTable)

# The evg_out rows are stacked on top, so they become the first rows of the
# combined table.
myTable2 <- read_busco_tsv("../intermediate/evg_out.tsv")
myTable <- rbind(myTable2, myTable)

# Total number of raw rows after stacking.
nrow(myTable)

# Flag rows without a usable BUSCO record with the "/" placeholder so they can
# be filtered out afterwards: an empty class label (column 3) or a missing
# count (column 2). `%in%` is used instead of `==` so that an NA class label
# cannot turn into an NA subscript, which a data-frame assignment rejects.
myTable[myTable[, 3] %in% "", 3] <- "/"
myTable[is.na(myTable[, 2]), 2] <- "/"

# Fill the data-set name (column 1) downwards: every blank cell inherits the
# nearest non-blank name above it. Blank cells before the first name stay "".
# This is vectorised: cumsum() over the "has a name" mask counts how many
# names have been seen so far, which indexes straight into the vector of
# names. Unlike a `for (i in 2:nrow(...))` loop it does not run backwards on
# a one-row table, and NA cells are treated as blank instead of breaking the
# comparison.
dataset_names <- myTable[, 1]
has_name <- !is.na(dataset_names) & dataset_names != ""
myTable[, 1] <- c("", dataset_names[has_name])[cumsum(has_name) + 1L]

# Strip the BUSCO summary-file decorations from the data-set names. The
# patterns are literal substrings, so they are matched with fixed = TRUE;
# read as regular expressions, the leading "." of the two suffixes would
# match any character. The removal order is kept as in the original script.
for (decoration in c("short_summary_BUSCO_",
                     ".fasta_embryophyta_odb9",
                     ".aa_embryophyta_odb9",
                     "stCuSTr-")) {
  myTable[, 1] <- gsub(decoration, "", myTable[, 1], fixed = TRUE)
}

# Rows 16..105 are relabelled purely by position. Assigning beyond the last
# row of a data frame would silently append rows, so fail loudly instead if
# the table is shorter than the layout assumed here.
stopifnot(nrow(myTable) >= 105L)

# Every label family covers 15 consecutive rows: five rows for each of the
# prefixes D, P and R (expanded to Desiree / PW363 / Rywal later in this
# file).
sample_prefix <- rep(c("D", "P", "R"), each = 5)

myTable[16:30, 1] <- paste0(sample_prefix, "_pre_cdhit-2d_rep+alt")
myTable[31:45, 1] <- paste0(sample_prefix, "_utrorf_separated")
myTable[46:60, 1] <- paste0(sample_prefix, "_eliminated_alt")

# Three consecutive 15-row families: rep+alt, then rep, then alt.
myTable[61:105, 1] <- paste0(
  rep(sample_prefix, times = 3),
  "_post_cdhit-2d_",
  rep(c("rep+alt", "rep", "alt"), each = 15)
)

# Keep only rows that carry a real BUSCO record, i.e. drop every row that has
# the "/" placeholder in the count or the class column. A logical mask is
# used instead of negative indices because `myTable[-ind, ]` returns zero
# rows when `ind` is empty (nothing to drop would drop everything).
is_placeholder <- myTable[, 2] %in% "/" | myTable[, 3] %in% "/"
data <- myTable[!is_placeholder, ]
colnames(data) <- c("DataSet", "BUSCOs", "BUSCOclass")

# Ordered factors whose levels follow the order of first appearance in the
# table.
data$BUSCOclass <- ordered(data$BUSCOclass, levels = unique(data$BUSCOclass))
data$DataSet <- ordered(data$DataSet, levels = unique(data$DataSet))

# Counts arrive as text because the column held the "/" placeholder.
data$BUSCOs <- as.numeric(data$BUSCOs)

# Count expressed as a percentage of 1440, formatted as text.
# NOTE(review): 1440 is presumably the total number of BUSCO groups in the
# embryophyta_odb9 set -- confirm.
data$perc <- format(data$BUSCOs / 1440 * 100, digits = 0, nsmall = 1)


library(plyr)

# Index vector that reverses every consecutive group of four rows
# (4,3,2,1, 8,7,6,5, ...). It is derived from nrow(data) rather than a
# hard-coded 84 so it follows the table size, as the stPanTr section of this
# file already does, and it is built in one vectorised step instead of
# growing a vector inside a loop.
stopifnot(nrow(data) %% 4L == 0L)
group_starts <- (seq_len(nrow(data) %/% 4L) - 1L) * 4L + 1L
yy <- rep(group_starts, each = 4) + rep(3:0, times = length(group_starts))

# Counts and classes in reversed within-group order; the running sum of the
# reversed counts per data set is the y position of each text label.
# NOTE(review): the reversal presumably mirrors the order in which geom_bar
# stacks the classes -- confirm against the rendered plot.
data$BUSCOs_y <- data$BUSCOs[yy]
data$BUSCOclass_y <- data$BUSCOclass[yy]
data <- ddply(data, "DataSet", transform, label_ypos = cumsum(BUSCOs_y))

# Label text: the reversed count as a percentage of 1440.
data$perc <- format(as.numeric(data$BUSCOs_y) / 1440 * 100,
                    digits = 0, nsmall = 1)



library(ggplot2)

# Stacked bar chart of BUSCO counts for every data set, one bar per data set
# and one colour per BUSCO class, with the percentage labels drawn in white.
# The former `ylab = ` and `font.axis = ` arguments were dropped: ggplot()
# does not use extra arguments, so they never had an effect. The y-axis title
# is set explicitly with ylab() instead.
ggplot(data = data, aes(x = DataSet, y = BUSCOs, fill = BUSCOclass)) +
  geom_bar(stat = "identity") +
  geom_text(aes(y = label_ypos, label = perc),
            vjust = 1.0, color = "white", size = 3.5) +
  ylab("BUSCOs") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = -0.3, size = 10),
    legend.position = "bottom"
  )



# Subset for a focused plot: every data set whose name contains
# "_evigene_initial", followed by rows 13:24 and 49:60 of `data`.
# The name is matched as a literal substring; the former pattern
# "*_evigene_initial_*" was a shell-style glob, which only behaves as
# intended when the regex engine tolerates a leading "*".
# NOTE(review): the row ranges are positional and depend on the row order of
# `data` -- confirm they still address the intended data sets if the input
# changes.
selection <- data[c(grep("_evigene_initial", data[, 1], fixed = TRUE),
                    13:24,
                    49:60), ]

# The subset is handed to ggplot() directly, so `data` never has to be
# swapped out and restored. `ylab = ` / `font.axis = ` were dropped from the
# ggplot() call because ggplot() ignores them.
ggplot(data = selection, aes(x = DataSet, y = BUSCOs, fill = BUSCOclass)) +
  geom_bar(stat = "identity") +
  geom_text(aes(y = label_ypos, label = perc),
            vjust = 1.0, color = "white", size = 3.5) +
  ylab("BUSCOs") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = -0.3, size = 10),
    legend.position = "right"
  )


# Larger subset: the "_evigene_initial" data sets, then rows 13:24, 49:60 and
# 25:48 of `data`, in that order. The name is matched as a literal substring
# (the former "*_evigene_initial_*" was a shell-style glob, not a proper
# regular expression).
selection <- data[c(grep("_evigene_initial", data[, 1], fixed = TRUE),
                    13:24,
                    49:60,
                    25:48), ]

# Re-level the data sets so the bars follow the row order of the subset.
selection$DataSet <- ordered(selection$DataSet,
                             levels = unique(selection$DataSet))

# Plot the subset directly instead of temporarily overwriting `data`.
# `ylab = ` / `font.axis = ` were dropped from ggplot(): it ignores them.
ggplot(data = selection, aes(x = DataSet, y = BUSCOs, fill = BUSCOclass)) +
  geom_bar(stat = "identity") +
  geom_text(aes(y = label_ypos, label = perc),
            vjust = 1.0, color = "white", size = 3.5) +
  ylab("BUSCOs") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = -0.3, size = 10),
    legend.position = "right"
  )


# Same subset as the previous plot: the "_evigene_initial" data sets, then
# rows 13:24, 49:60 and 25:48 of `data`. The name is matched as a literal
# substring (the former "*_evigene_initial_*" was a shell-style glob).
selection <- data[c(grep("_evigene_initial", data[, 1], fixed = TRUE),
                    13:24,
                    49:60,
                    25:48), ]
selection$DataSet <- ordered(selection$DataSet,
                             levels = unique(selection$DataSet))

# Snapshot of the complete table; it is read again further down after `data`
# has been rebuilt from another input file.
data.0 <- data

# Unlabelled variant of the stacked bar chart with the legend on top. The
# subset is passed to ggplot() directly, so `data` is left untouched.
# `ylab = ` / `font.axis = ` were dropped from ggplot(): it ignores them.
ggplot(data = selection, aes(x = DataSet, y = BUSCOs, fill = BUSCOclass)) +
  geom_bar(stat = "identity") +
  ylab("BUSCOs") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = -0.3, size = 10),
    legend.position = "top"
  )



# Third input: the stPanTr BUSCO table, read with the same settings as the
# other tables (tab-separated, no header, no quoting, short rows padded).
myTable3 <- read.table(
  file = "../intermediate/BUSCO_stPanTr_embryophyta_odb9.tsv",
  header = FALSE,
  sep = "\t",
  quote = "",
  stringsAsFactors = FALSE,
  fill = TRUE
)
names(myTable3) <- c("DataSet", "BUSCOs", "BUSCOclass")

# Discard the first five rows, then name the leading row of each remaining
# five-row block.
myTable3 <- myTable3[-seq_len(5), ]
myTable3$DataSet[c(1, 6, 11)] <- c("stPanTr_alt",
                                   "stPanTr_rep",
                                   "stPanTr_rep+alt")

# Reverse the order of the three blocks: third block first, first block last.
block_order <- c(11:15, 6:10, 1:5)
myTable3 <- myTable3[block_order, ]

# Flag rows without a usable BUSCO record with the "/" placeholder: an empty
# class label (column 3) or a missing count (column 2). `%in%` is used
# instead of `==` so that an NA class label cannot turn into an NA subscript,
# which a data-frame assignment rejects.
myTable3[myTable3[, 3] %in% "", 3] <- "/"
myTable3[is.na(myTable3[, 2]), 2] <- "/"

# Fill the data-set name (column 1) downwards: every blank cell inherits the
# nearest non-blank name above it. Blank cells before the first name stay "".
# This is vectorised: cumsum() over the "has a name" mask counts the names
# seen so far, which indexes straight into the vector of names. Unlike a
# `for (i in 2:nrow(...))` loop it does not run backwards on a one-row table,
# and NA cells are treated as blank instead of breaking the comparison.
dataset_names <- myTable3[, 1]
has_name <- !is.na(dataset_names) & dataset_names != ""
myTable3[, 1] <- c("", dataset_names[has_name])[cumsum(has_name) + 1L]

# Keep only rows that carry a real BUSCO record, i.e. drop every row that has
# the "/" placeholder in the count or the class column. A logical mask is
# used instead of negative indices because `myTable3[-ind, ]` returns zero
# rows when `ind` is empty (nothing to drop would drop everything).
is_placeholder <- myTable3[, 2] %in% "/" | myTable3[, 3] %in% "/"
data <- myTable3[!is_placeholder, ]

colnames(data) <- c("DataSet", "BUSCOs", "BUSCOclass")

# Ordered factors whose levels follow the order of first appearance.
data$BUSCOclass <- ordered(data$BUSCOclass, levels = unique(data$BUSCOclass))
data$DataSet <- ordered(data$DataSet, levels = unique(data$DataSet))

# Counts arrive as text because the column held the "/" placeholder.
data$BUSCOs <- as.numeric(data$BUSCOs)

# Count expressed as a percentage of 1440, formatted as text.
# NOTE(review): 1440 is presumably the total number of BUSCO groups in the
# embryophyta_odb9 set -- confirm.
data$perc <- format(data$BUSCOs / 1440 * 100, digits = 0, nsmall = 1)


# Index vector that flips every run of four consecutive rows
# (4,3,2,1, 8,7,6,5, ...), built from the first row of each run.
block_starts <- seq(1, nrow(data), 4)
yy <- rep(block_starts, each = 4) + rep(3:0, times = length(block_starts))

# Counts and classes in flipped order; their running sum per data set gives
# the y position of each text label.
data$BUSCOs_y <- data$BUSCOs[yy]
data$BUSCOclass_y <- data$BUSCOclass[yy]
data <- ddply(data, "DataSet", transform, label_ypos = cumsum(BUSCOs_y))

# Label text: the flipped count as a percentage of 1440.
data$perc <- format(as.numeric(data$BUSCOs_y) / 1440 * 100,
                    digits = 0, nsmall = 1)




# Stacked bar chart of BUSCO counts for the data sets currently in `data`,
# with the percentage labels drawn in white. The former `ylab = ` and
# `font.axis = ` arguments were dropped: ggplot() does not use extra
# arguments, so they never had an effect. The y-axis title is set explicitly
# with ylab() instead.
ggplot(data = data, aes(x = DataSet, y = BUSCOs, fill = BUSCOclass)) +
  geom_bar(stat = "identity") +
  geom_text(aes(y = label_ypos, label = perc),
            vjust = 1.0, color = "white", size = 3.5) +
  ylab("BUSCOs") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = -0.3, size = 10),
    legend.position = "bottom"
  )


# For the article


# Start from rows 1:24 and 49:60 of the full table saved earlier in `data.0`.
data.all <- data.0[c(1:24, 49:60), ]

# Spell out the one-letter prefixes, applied in this order. The replacement
# turns the DataSet column into plain character strings.
full_names <- c("D_" = "Desiree_", "P_" = "PW363_", "R_" = "Rywal_")
for (short_prefix in names(full_names)) {
  data.all$DataSet <- gsub(short_prefix, full_names[[short_prefix]],
                           data.all$DataSet)
}

# Regroup the rows: everything matching Desiree first, then PW363, then
# Rywal.
grouped_rows <- unlist(lapply(c("Desiree", "PW363", "Rywal"),
                              grep, x = data.all$DataSet))
data.all <- data.all[grouped_rows, ]

# Append the table currently held in `data` and continue with the combined
# frame.
data.all <- rbind(data.all, data)
data <- as.data.frame(data.all)

# Rename the filtering stages in the labels.
data$DataSet <- gsub("pre_cdhit-2d", "1stFiltering", data$DataSet)
data$DataSet <- gsub("post_cdhit-2d", "2ndFiltering", data$DataSet)

# Bars are drawn in order of first appearance.
data$DataSet <- ordered(data$DataSet, levels = unique(data$DataSet))

library(ggthemes)
# 1440 = (2*2*2*2*2*3*3*5)

# Build one article-style stacked bar chart of BUSCO counts per data set.
#
# The three figures below differed only in where the legend sits and how many
# columns it has, so the shared recipe lives here. Compared with the former
# copies, the `ylab = ` / `font.axis = ` arguments to ggplot() are gone
# (ggplot() ignores them; the y-axis title is set with ylab()), and the stray
# trailing comma inside element_text() is removed. A geom_rangeframe() layer
# from ggthemes was commented out in the original and is still not added.
#
# plot_data        data frame with the columns DataSet, BUSCOs, BUSCOclass,
#                  label_ypos and BUSCOs_y.
# legend_position  value handed to theme(legend.position = ).
# legend_columns   number of columns of the fill legend.
#
# Returns the ggplot object; printing it draws the figure.
plot_busco_article <- function(plot_data, legend_position, legend_columns) {
  ggplot(data = plot_data,
         aes(x = DataSet, y = BUSCOs, fill = BUSCOclass)) +
    theme_bw() +
    coord_cartesian(ylim = c(0, 1440)) +
    scale_y_continuous(breaks = seq(0, 1440, 240)) +
    geom_bar(stat = "identity", alpha = 0.9, width = 0.66) +
    geom_text(aes(y = label_ypos, label = BUSCOs_y),
              vjust = 1.0, color = "white", size = 3.5) +
    ylab("BUSCOs") +
    # All tweaks come after theme_bw() so that they override its defaults.
    theme(
      aspect.ratio = 1.0,
      axis.text.x = element_text(angle = 90, hjust = 1, vjust = -0.3,
                                 size = 10),
      legend.position = legend_position,
      panel.border = element_blank(),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      axis.title.x = element_text(color = "blue", size = 14, face = "bold",
                                  vjust = -1.25),
      axis.title.y = element_text(color = "#993333", size = 14, face = "bold")
    ) +
    guides(fill = guide_legend(ncol = legend_columns))
}

# Legend on the right, single column.
plot_busco_article(data, legend_position = "right", legend_columns = 1)

# Legend below the plot, two columns.
plot_busco_article(data, legend_position = "bottom", legend_columns = 2)

# Legend above the plot, two columns.
plot_busco_article(data, legend_position = "top", legend_columns = 2)



# Supplementary


# i = c(grep("_utrorf_separated", data.0$DataSet), grep("_eliminated_", data.0$DataSet))
# data = rbind(data,data.0[i,])
# data$DataSet = gsub("D_", "Desiree_", data$DataSet)
# data$DataSet = gsub("P_", "PW363_", data$DataSet)
# data$DataSet = gsub("R_", "Rywal_", data$DataSet)
# data$DataSet = gsub("_all", "_rep+alt", data$DataSet)