Testing SEEK

title: "merge all wc" author: "zagor" date: "9 October 2019" output: html_document

knitr::opts_chunk$set(echo = TRUE)


library(plyr)
library(dplyr)
library(data.table)


stCusTr = read.table(file = "../../../_S_03_stCuSTr/_A_03.2_components/output/3cv_cdhit-2d_weak-components.txt", 
                     header = TRUE, 
                     sep = "\t", 
                     quote = NULL,
                     dec = ".", 
                     stringsAsFactors = FALSE,
                     na.strings = "NA", fill = TRUE,
                     comment.char = "#")

stCusTr = stCusTr[, c(6,2,4,7)]
colnames(stCusTr) = c("wcID1", "cvID1", "geneID", "classID1")
ind = c(which(stCusTr$classID == "representative"), which(stCusTr$classID == "alternative"))
stCusTr = stCusTr[ind , ]

stPanTr = read.table(file = "../../_A_02_components_1_3cvs-gffmerged/intermediate_2_prepare-network/pantr_evigene-header-network_unfiltered_weak-components_perWC_with-rep.txt", 
                     header = FALSE, 
                     sep = "\t", 
                     quote = NULL,
                     dec = ".", 
                     stringsAsFactors = FALSE,
                     na.strings = "NA", fill = TRUE,
                     comment.char = "#")
stPanTr = stPanTr[, c(1,2,4,5)]
colnames(stPanTr) = c("wcID2", "cvID2", "geneID", "classID2")


nrow(stCusTr) + length(grep("Sotub", stPanTr$geneID)) + length(grep("PGSC", stPanTr$geneID))

combo = NULL


stPanTr.1 <- data.table(stPanTr[, c(3,1,2,4)], key = "geneID") 
stCusTr.1 <- data.table(stCusTr[, c(3,1,2,4)], key = "geneID")

combo = join(stPanTr.1, stCusTr.1, type = "full", match = "all", by = "geneID")

nrow(combo)
head(combo)

ind = which(combo$classID1 == "alternative")
head(ind)
combo[head(ind), ]

combo$classID2[is.na(combo$classID2)] = "cv_alternative"

ind1 = grep("Sotub", combo$geneID)
ind2 = grep("PGSC", combo$geneID)
gffmerge.wcID2 = combo$wcID2[c(ind1, ind2)]
gffmerge.cvID2 = combo$cvID2[c(ind1, ind2)]


comboTranslate = NULL

translate = cbind(combo$wcID1, combo$wcID2)
colnames(translate) = c("wcID1", "wcID2.1")
translate = data.table(translate, key = "wcID1") 
translate = translate[!is.na(translate$wcID1),]
translate = translate[!is.na(translate$wcID2.1),]
nrow(translate) == nrow(stPanTr) - length(grep("Sotub", stPanTr$geneID)) - length(grep("PGSC", stPanTr$geneID))
comboTranslate = join(combo, translate, type = "full", match = "all", by = "wcID1")
length(which((is.na(comboTranslate$wcID2.1)))) == length(grep("Sotub", stPanTr$geneID)) + length(grep("PGSC", stPanTr$geneID))

head(comboTranslate)
combo$wcID2 = comboTranslate$wcID2.1
combo$wcID2[c(ind1, ind2)] = gffmerge.wcID2
any(is.na(combo$wcID2))


comboTranslate = NULL

translate = cbind(combo$wcID1, combo$cvID2)
colnames(translate) = c("wcID1", "cvID2.1")
translate = data.table(translate, key = "wcID1") 
translate = translate[!is.na(translate$wcID1),]
translate = translate[!is.na(translate$cvID2.1),]
nrow(translate) == nrow(stPanTr) - length(grep("Sotub", stPanTr$geneID)) - length(grep("PGSC", stPanTr$geneID))
comboTranslate = join(combo, translate, type = "full", match = "all", by = "wcID1")
length(which((is.na(comboTranslate$cvID2.1)))) == length(grep("Sotub", stPanTr$geneID)) + length(grep("PGSC", stPanTr$geneID))

combo$cvID2 = comboTranslate$cvID2.1
combo$cvID2[c(ind1, ind2)] = gffmerge.cvID2
any(is.na(combo$wcID2))


ind = grep("Sotub", combo$geneID)
combo$cvID1[ind] = "Sotub"
combo$wcID1[ind] = "gffmerge"
combo$classID1[ind] = "gffmerge"
ind = grep("PGSC", combo$geneID)
combo$cvID1[ind] = "PGSC"
combo$wcID1[ind] = "gffmerge"
combo$classID1[ind] = "gffmerge"
unique(combo$cvID1)

unique(combo$cvID2)

combo = combo[with(combo, order(wcID2, wcID1, geneID)), ]

comboTranslate = setDT(combo)
comboTranslate$geneCount = 1
comboTranslate = comboTranslate[, list(sum(geneCount, na.rm = FALSE)), by = wcID2]
dim(comboTranslate)
colnames(comboTranslate) = c("wcID2", "geneCount")
tmp = combo[, c(1:2)]
tmp = join(tmp, comboTranslate, type = "full", match = "all", by = "wcID2")
head(tmp)
nrow(tmp) == nrow(combo)
ind = match(tmp$geneID, combo$geneID)
any(ind != seq(1,nrow(combo),1))
combo$geneCount = tmp$geneCount

head(combo)


write.table(x = combo, 
            file = "../output/5cv_weak-components.txt", 
            append = FALSE, 
            quote = FALSE, 
            sep = "\t",
            eol = "\n", na = "NA", dec = ".", 
            row.names = FALSE,
            col.names = TRUE)


combo$classID2[combo$classID2 == "cv_alternative"] = "alternative1"
combo$classID2[combo$classID2 == "alternative"] = "alternative2"

interesting = combo[which(combo$cvID2 == "D|||P|||R|||g"),]
range(interesting$geneCount)
interesting = interesting[which(interesting$geneCount %in% seq(8, 16)), ]
length(unique(interesting$wcID2))

dir.create("../output/D_P_R_g")

myWC = sort(unique(interesting$wcID2))
for (i in myWC) {
  subset = interesting[which( i == interesting$wcID2),]
  subset = arrange(subset, desc(classID2), desc(cvID1), desc(classID1))
  fname = i
  write.table(x = subset$geneID,
            file = paste0("../output/D_P_R_g/", i, ".txt"), 
            append = FALSE, 
            quote = FALSE, 
            sep = "\t",
            eol = "\n", na = "NA", dec = ".", 
            row.names = FALSE,
            col.names = FALSE)
}


count = read.table(file = "../output/5cv_weak-components.txt", 
                     header = TRUE, 
                     sep = "\t", 
                     quote = NULL,
                     dec = ".", 
                     stringsAsFactors = FALSE,
                     na.strings = "NA", fill = TRUE,
                     comment.char = "#")

count = count[count$classID2 == "representative" | count$classID2 == "alternative",]


library(VennDiagram)
A <- count$geneID[grep("D", count$cvID2)]
B <- count$geneID[grep("P", count$cvID2)]
C <- count$geneID[grep("R", count$cvID2)]
D <- count$geneID[grep("g", count$cvID2)]
DD = count[grep("g", count$cvID2), ]
table(DD$cvID2)


venn.plot <- venn.diagram(
	x = list(
	  Phureja = D,
		Desiree = A,
		PW363 = B,
		Rywal = C
		),
	filename = "../reports/stPanTr_geneCount_weak-components.tiff",
	col = "transparent",
	fill = c("cornflowerblue", "green", "yellow", "darkorchid1"),
	alpha = 0.50,
	label.col = c("orange", "white", "darkorchid4", "white", 
	"white", "white", "white", "white", "darkblue", "white", 
	"white", "white", "white", "darkgreen", "white"),
	cex = 1.25,
	fontfamily = "serif",
	fontface = "bold",
	cat.col = c("darkblue", "darkgreen", "orange", "darkorchid4"),
	cat.cex = 1.5,
	cat.pos = 15,# 30,
	cat.dist = 0.05,# 0.07,
	cat.fontfamily = "serif",
	rotation.degree = 180,
	margin = 0.1
	)


sessionInfo()