title: "reshape_merge_aggregate_filter" author: "zagor" date: "18 julij 2019" output: html_document

# Set the default chunk option so that the source code of every chunk is
# echoed in the rendered document.
knitr::opts_chunk$set(echo = TRUE)

No E-value filter (cut-off relaxed to 1.0 instead of 1.0e-5)

4 columns


if (!require(data.table)) install.packages("data.table")
library(data.table)

################################################################################
# ---- Rywal: filter and aggregate annotation hits per protein ID -------------
# Reads the two header-less TSV tables (columns are therefore V1, V2, ...),
# stacks them, keeps rows whose V9 value is <= 1, drops hits from the ignored
# analyses, and writes one row per V1 holding the unique, sorted, "; "-joined
# values of V4, V5, V12 and V13.
# NOTE(review): with fill = TRUE, read.delim() sizes the table from the first
# five lines only; if those are shorter than later lines, V12/V13 may be
# missing or long rows may wrap -- consider passing col.names explicitly.
DT1 <- read.delim(file = "../intermediate/Rywal_IPS/all.okay.aa.tsv",
                  header = FALSE, sep = "\t", quote = NULL,
                  dec = ".",
                  stringsAsFactors = FALSE,
                  na.strings = "NA", fill = TRUE)
DT2 <- read.delim(file = "../intermediate/Rywal_IPS/all.okalt.aa.tsv",
                  header = FALSE, sep = "\t", quote = NULL,
                  dec = ".",
                  stringsAsFactors = FALSE,
                  na.strings = "NA", fill = TRUE)

dim(DT1)
dim(DT2)

DT <- rbind(DT1, DT2)

# Empty annotation fields get the "-" placeholder so they stay visible after
# the values are pasted together.
DT$V12[DT$V12 == ""] <- "-"
DT$V13[DT$V13 == ""] <- "-"
DT$V4[DT$V4 == ""] <- "-"
DT$V5[DT$V5 == ""] <- "-"

dim(DT)
head(DT)

# Rows whose V9 is "-" receive a huge sentinel value so that they fail the
# threshold below; afterwards the column is converted to numeric.
DT$V9[DT$V9 == "-"] <- 1.0e+10
DT$V9 <- as.numeric(DT$V9)

head(DT)

# which() skips rows where V9 is NA (text that is not a number); plain logical
# indexing of a data.frame would turn each such row into an all-NA row.
DT3 <- DT[which(DT$V9 <= 1.0e-0), ] # 1.0e-5,]
max(DT3$V9, na.rm = TRUE)

setDT(DT3)
# Alternative aggregations, kept for reference:
# DT4 = DT3[ , .(X = paste(paste0(V4, ":", V5, ", ", V12, ":", V13), 
#                                collapse="; ")), by = V1]

# DT4 = DT3[ , .(X = paste(paste0(V4, ":", V5),
#                                collapse="; ")), by = V1]

# unique(DT3$V4)
ignoreDB <- c('Coils', 'Gene3D', 'MobiDBLite', 'SUPERFAMILY')
# Logical mask instead of DT3[-ind, ]: a negative index built from an empty
# which() result selects zero rows and would silently discard every hit.
DT3 <- DT3[!(DT3$V4 %in% ignoreDB), ]


# Stage 1: one "; "-joined string per protein ID (V1) and annotation column.
DT4 <- DT3[, .(X = paste(V4, collapse = "; ")), by = V1]
DT5 <- DT3[, .(Y = paste(V5, collapse = "; ")), by = V1]
DT6 <- DT3[, .(Q = paste(V12, collapse = "; ")), by = V1]
DT7 <- DT3[, .(P = paste(V13, collapse = "; ")), by = V1]

head(DT4)
head(DT5)
head(DT6)
head(DT7)

# Stage 2: tidy the joined strings. Each string is split back into tokens,
# which are trimmed, de-duplicated and sorted; the "-" placeholder is removed
# whenever at least one other token is present, and the tokens are re-joined.
# Dropping the placeholder as a token (rather than with gsub("-; ", "")) keeps
# tokens that merely end in "-" intact and does not depend on where "-" lands
# in the locale's sort order. Vectorised over x, so zero-row tables are fine.
clean_collapsed <- function(x, sep = "; ", placeholder = "-") {
  vapply(
    strsplit(x, split = sep, fixed = TRUE),
    function(tokens) {
      tokens <- sort(unique(trimws(tokens)))
      if (any(tokens != placeholder)) {
        tokens <- tokens[tokens != placeholder]
      }
      paste(tokens, collapse = sep)
    },
    character(1),
    USE.NAMES = FALSE
  )
}

DT4$X <- clean_collapsed(DT4$X)
DT5$Y <- clean_collapsed(DT5$Y)
DT6$Q <- clean_collapsed(DT6$Q)
DT7$P <- clean_collapsed(DT7$P)


head(DT4)
head(DT5)
head(DT6)
head(DT7)
dim(DT4)

# Join the four per-ID tables back into one table keyed on V1.
total <- merge(DT4, DT5, by = "V1")
total <- merge(total, DT6, by = "V1")
total <- merge(total, DT7, by = "V1")


# see: https://github.com/ebi-pf-team/interproscan/wiki/InterProScan5OutputFormats
colnames(total) <- c("aa_ID", "Analysis", "Signature_Accession", "IPR_annotations_accession", "IPR_annotations_description")

write.table(total, file = "../output/Rywal_IPS_filtered_aggregated_filtered.tsv", 
            append = FALSE, quote = FALSE, sep = "\t",
            eol = "\n", na = "NA", dec = ".", row.names = FALSE,
            col.names = TRUE)

################################################################################



# ---- PW363: filter and aggregate annotation hits per protein ID -------------
# Same pipeline as the other genotype sections: read the two header-less TSV
# tables (columns V1, V2, ...), stack them, keep rows whose V9 value is <= 1,
# drop hits from the analyses listed in `ignoreDB`, and write one row per V1
# holding the unique, sorted, "; "-joined values of V4, V5, V12 and V13.
# NOTE(review): with fill = TRUE, read.delim() sizes the table from the first
# five lines only; if those are shorter than later lines, V12/V13 may be
# missing or long rows may wrap -- consider passing col.names explicitly.
DT1 <- read.delim(file = "../intermediate/PW363_IPS/all.okay.aa.tsv",
                  header = FALSE, sep = "\t", quote = NULL,
                  dec = ".",
                  stringsAsFactors = FALSE,
                  na.strings = "NA", fill = TRUE)
DT2 <- read.delim(file = "../intermediate/PW363_IPS/all.okalt.aa.tsv",
                  header = FALSE, sep = "\t", quote = NULL,
                  dec = ".",
                  stringsAsFactors = FALSE,
                  na.strings = "NA", fill = TRUE)

dim(DT1)
dim(DT2)

DT <- rbind(DT1, DT2)

# Empty annotation fields get the "-" placeholder so they stay visible after
# the values are pasted together.
DT$V12[DT$V12 == ""] <- "-"
DT$V13[DT$V13 == ""] <- "-"
DT$V4[DT$V4 == ""] <- "-"
DT$V5[DT$V5 == ""] <- "-"

dim(DT)
head(DT)

# Rows whose V9 is "-" receive a huge sentinel value so that they fail the
# threshold below; afterwards the column is converted to numeric.
DT$V9[DT$V9 == "-"] <- 1.0e+10
DT$V9 <- as.numeric(DT$V9)

head(DT)

# which() skips rows where V9 is NA (text that is not a number); plain logical
# indexing of a data.frame would turn each such row into an all-NA row.
DT3 <- DT[which(DT$V9 <= 1.0e-0), ]
max(DT3$V9, na.rm = TRUE)


setDT(DT3)

# `ignoreDB` is the vector of analysis names defined earlier in this script.
# Logical mask instead of DT3[-ind, ]: a negative index built from an empty
# which() result selects zero rows and would silently discard every hit.
DT3 <- DT3[!(DT3$V4 %in% ignoreDB), ]

# Stage 1: one "; "-joined string per protein ID (V1) and annotation column.
DT4 <- DT3[, .(X = paste(V4, collapse = "; ")), by = V1]
DT5 <- DT3[, .(Y = paste(V5, collapse = "; ")), by = V1]
DT6 <- DT3[, .(Q = paste(V12, collapse = "; ")), by = V1]
DT7 <- DT3[, .(P = paste(V13, collapse = "; ")), by = V1]

# Stage 2: tidy the joined strings. Each string is split back into tokens,
# which are trimmed, de-duplicated and sorted; the "-" placeholder is removed
# whenever at least one other token is present, and the tokens are re-joined.
# Dropping the placeholder as a token (rather than with gsub("-; ", "")) keeps
# tokens that merely end in "-" intact and does not depend on where "-" lands
# in the locale's sort order. Vectorised over x, so zero-row tables are fine.
clean_collapsed <- function(x, sep = "; ", placeholder = "-") {
  vapply(
    strsplit(x, split = sep, fixed = TRUE),
    function(tokens) {
      tokens <- sort(unique(trimws(tokens)))
      if (any(tokens != placeholder)) {
        tokens <- tokens[tokens != placeholder]
      }
      paste(tokens, collapse = sep)
    },
    character(1),
    USE.NAMES = FALSE
  )
}

DT4$X <- clean_collapsed(DT4$X)
DT5$Y <- clean_collapsed(DT5$Y)
DT6$Q <- clean_collapsed(DT6$Q)
DT7$P <- clean_collapsed(DT7$P)

# Join the four per-ID tables back into one table keyed on V1.
total <- merge(DT4, DT5, by = "V1")
total <- merge(total, DT6, by = "V1")
total <- merge(total, DT7, by = "V1")

colnames(total) <- c("aa_ID", "Analysis", "Signature_Accession", "IPR_annotations_accession", "IPR_annotations_description")

write.table(total, file = "../output/PW363_IPS_filtered_aggregated_filtered.tsv", 
            append = FALSE, quote = FALSE, sep = "\t",
            eol = "\n", na = "NA", dec = ".", row.names = FALSE,
            col.names = TRUE)


################################################################################



# ---- Desiree: filter and aggregate annotation hits per protein ID -----------
# Same pipeline as the other genotype sections: read the two header-less TSV
# tables (columns V1, V2, ...), stack them, keep rows whose V9 value is <= 1,
# drop hits from the analyses listed in `ignoreDB`, and write one row per V1
# holding the unique, sorted, "; "-joined values of V4, V5, V12 and V13.
# NOTE(review): with fill = TRUE, read.delim() sizes the table from the first
# five lines only; if those are shorter than later lines, V12/V13 may be
# missing or long rows may wrap -- consider passing col.names explicitly.
DT1 <- read.delim(file = "../intermediate/Desiree_IPS/all.okay.aa.tsv",
                  header = FALSE, sep = "\t", quote = NULL,
                  dec = ".",
                  stringsAsFactors = FALSE,
                  na.strings = "NA", fill = TRUE)
DT2 <- read.delim(file = "../intermediate/Desiree_IPS/all.okalt.aa.tsv",
                  header = FALSE, sep = "\t", quote = NULL,
                  dec = ".",
                  stringsAsFactors = FALSE,
                  na.strings = "NA", fill = TRUE)

dim(DT1)
dim(DT2)

DT <- rbind(DT1, DT2)

# Empty annotation fields get the "-" placeholder so they stay visible after
# the values are pasted together.
DT$V12[DT$V12 == ""] <- "-"
DT$V13[DT$V13 == ""] <- "-"
DT$V4[DT$V4 == ""] <- "-"
DT$V5[DT$V5 == ""] <- "-"

dim(DT)
head(DT)

# Rows whose V9 is "-" receive a huge sentinel value so that they fail the
# threshold below; afterwards the column is converted to numeric.
DT$V9[DT$V9 == "-"] <- 1.0e+10
DT$V9 <- as.numeric(DT$V9)

head(DT)

# which() skips rows where V9 is NA (text that is not a number); plain logical
# indexing of a data.frame would turn each such row into an all-NA row.
DT3 <- DT[which(DT$V9 <= 1.0e-0), ]
max(DT3$V9, na.rm = TRUE)


setDT(DT3)


# Alternative aggregation, kept for reference:
# DT4 = DT3[ , .(X = paste(paste0(V4, ":", V5),
#                          collapse="; ")), by = V1]
# DT5 = DT3[ , .(Y = paste(paste0(V12, ":", V13),
#                          collapse="; ")), by = V1]
# 
# total <- merge(DT4,DT5,by="V1")

# `ignoreDB` is the vector of analysis names defined earlier in this script.
# Logical mask instead of DT3[-ind, ]: a negative index built from an empty
# which() result selects zero rows and would silently discard every hit.
DT3 <- DT3[!(DT3$V4 %in% ignoreDB), ]

# Stage 1: one "; "-joined string per protein ID (V1) and annotation column.
DT4 <- DT3[, .(X = paste(V4, collapse = "; ")), by = V1]
DT5 <- DT3[, .(Y = paste(V5, collapse = "; ")), by = V1]
DT6 <- DT3[, .(Q = paste(V12, collapse = "; ")), by = V1]
DT7 <- DT3[, .(P = paste(V13, collapse = "; ")), by = V1]

# Stage 2: tidy the joined strings. Each string is split back into tokens,
# which are trimmed, de-duplicated and sorted; the "-" placeholder is removed
# whenever at least one other token is present, and the tokens are re-joined.
# Dropping the placeholder as a token (rather than with gsub("-; ", "")) keeps
# tokens that merely end in "-" intact and does not depend on where "-" lands
# in the locale's sort order. Vectorised over x, so zero-row tables are fine.
clean_collapsed <- function(x, sep = "; ", placeholder = "-") {
  vapply(
    strsplit(x, split = sep, fixed = TRUE),
    function(tokens) {
      tokens <- sort(unique(trimws(tokens)))
      if (any(tokens != placeholder)) {
        tokens <- tokens[tokens != placeholder]
      }
      paste(tokens, collapse = sep)
    },
    character(1),
    USE.NAMES = FALSE
  )
}

DT4$X <- clean_collapsed(DT4$X)
DT5$Y <- clean_collapsed(DT5$Y)
DT6$Q <- clean_collapsed(DT6$Q)
DT7$P <- clean_collapsed(DT7$P)

# Join the four per-ID tables back into one table keyed on V1.
total <- merge(DT4, DT5, by = "V1")
total <- merge(total, DT6, by = "V1")
total <- merge(total, DT7, by = "V1")

colnames(total) <- c("aa_ID", "Analysis", "Signature_Accession", "IPR_annotations_accession", "IPR_annotations_description")

write.table(total, file = "../output/Desiree_IPS_filtered_aggregated_filtered.tsv", 
            append = FALSE, quote = FALSE, sep = "\t",
            eol = "\n", na = "NA", dec = ".", row.names = FALSE,
            col.names = TRUE)




# Print the R version and the attached/loaded packages so the run can be
# reproduced.
sessionInfo()