# Set the knitr chunk option `echo` to TRUE as the default for all chunks.
knitr::opts_chunk$set(echo = TRUE)
# No E-value filter
# 4 columns
# Install data.table if it is not available yet, then attach it.
# requireNamespace() only checks availability (it does not attach the package
# or merely warn on failure); library() then errors loudly if the package is
# still missing after the installation attempt.
if (!requireNamespace("data.table", quietly = TRUE)) {
  install.packages("data.table")
}
library(data.table)
################################################################################
# Aggregate InterProScan (IPS) hits per protein ID, one output table per
# cultivar. Column positions follow the InterProScan TSV layout,
# see: https://github.com/ebi-pf-team/interproscan/wiki/InterProScan5OutputFormats
# Columns used: V1 (written as aa_ID), V4 (Analysis), V5 (Signature_Accession),
# V9 (score compared against the cutoff), V12 / V13 (IPR annotation accession
# and description).
#
# An earlier variant paired the values as "V4:V5" / "V12:V13"; the current
# output keeps each of the four annotation columns separate.

# Analyses (values of V4) that are excluded from the aggregated tables.
ignoreDB <- c("Coils", "Gene3D", "MobiDBLite", "SUPERFAMILY")

# Read one headerless, tab-separated InterProScan result file.
#
# path: file to read.
# Returns a data.frame with columns V1, V2, ...
read_ips_tsv <- function(path) {
  read.delim(
    file = path, header = FALSE, sep = "\t", quote = NULL,
    dec = ".",
    stringsAsFactors = FALSE,
    na.strings = "NA", fill = TRUE
  )
}

# Collapse the values of one group into a sorted, de-duplicated "; " list.
#
# values: vector of annotation values belonging to a single protein ID.
# Returns a length-one character vector. Empty and NA entries count as the
# placeholder "-", which is only reported when the group has no other value.
collapse_unique <- function(values) {
  values <- trimws(as.character(values))
  values[is.na(values) | values == ""] <- "-"
  values <- sort(unique(values))
  # Drop the placeholder by exact match instead of gsub("-; ", ""), which also
  # mangled real values ending in "-" and depended on "-" sorting first.
  annotated <- values[values != "-"]
  if (length(annotated) == 0L) {
    return("-")
  }
  paste(annotated, collapse = "; ")
}

# Read, filter and aggregate the InterProScan results of one cultivar and
# write the aggregated table to disk.
#
# cultivar:   name used in "<in_dir>/<cultivar>_IPS/" and in the output file.
# in_dir:     directory holding the per-cultivar IPS folders.
# out_dir:    directory the aggregated TSV is written to.
# max_evalue: rows with V9 above this value are dropped. The default of 1
#             is effectively "no e-value filter"; use e.g. 1.0e-5 to filter.
# ignore:     analyses (V4) to exclude.
# Returns (invisibly) the aggregated data.table, sorted by aa_ID.
aggregate_ips <- function(cultivar,
                          in_dir = "../intermediate",
                          out_dir = "../output",
                          max_evalue = 1.0e-0,
                          ignore = ignoreDB) {
  ips_dir <- file.path(in_dir, paste0(cultivar, "_IPS"))
  okay <- read_ips_tsv(file.path(ips_dir, "all.okay.aa.tsv"))
  okalt <- read_ips_tsv(file.path(ips_dir, "all.okalt.aa.tsv"))
  hits <- rbind(okay, okalt)

  cat("Cultivar:", cultivar, "\n")
  print(dim(okay))
  print(dim(okalt))
  print(dim(hits))

  required <- c("V1", "V4", "V5", "V9", "V12", "V13")
  absent <- setdiff(required, names(hits))
  if (length(absent) > 0) {
    stop(
      "InterProScan table for ", cultivar, " lacks column(s): ",
      paste(absent, collapse = ", "),
      call. = FALSE
    )
  }

  # A score of "-" means no value was reported; such rows never pass the
  # cutoff (the original mapped them to 1.0e+10 for the same effect).
  evalue <- hits$V9
  if (!is.numeric(evalue)) {
    evalue[evalue %in% "-"] <- NA
    evalue <- as.numeric(evalue)
  }

  # One logical mask instead of DT[DT$V9 <= cutoff, ] followed by DT[-ind, ]:
  # - NA scores no longer produce all-NA rows in the result;
  # - an empty set of ignored rows no longer risks dropping every row, as
  #   negative indexing with a zero-length index does.
  keep <- !is.na(evalue) & evalue <= max_evalue & !(hits$V4 %in% ignore)
  if (any(keep)) {
    print(max(evalue[keep]))
  }

  kept <- hits[keep, , drop = FALSE]
  setDT(kept)

  # keyby sorts the result by protein ID, matching the ordering produced by
  # the original chain of merge(..., by = "V1") calls.
  total <- kept[, .(
    Analysis = collapse_unique(V4),
    Signature_Accession = collapse_unique(V5),
    IPR_annotations_accession = collapse_unique(V12),
    IPR_annotations_description = collapse_unique(V13)
  ), keyby = .(aa_ID = V1)]

  print(dim(total))
  print(head(total))

  write.table(
    total,
    file = file.path(
      out_dir,
      paste0(cultivar, "_IPS_filtered_aggregated_filtered.tsv")
    ),
    append = FALSE, quote = FALSE, sep = "\t",
    eol = "\n", na = "NA", dec = ".", row.names = FALSE,
    col.names = TRUE
  )

  invisible(total)
}

################################################################################
# Process every cultivar with the same pipeline. `total` is left holding the
# table of the last cultivar, as in the original script.
for (cultivar in c("Rywal", "PW363", "Desiree")) {
  total <- aggregate_ips(cultivar)
}

sessionInfo()