######################## # Supp. File 1. Characterization of Nested Complexes in PINs --R code # Programming language: R v.2.15.2 # Programmer: Antonio Mora, antoniocmora@gmail.com # Jan.-Feb., 2013 ######################## ####################### # 0. Needed Libraries and Files: ####################### setwd("**Introduce your working directory here**") library(iRefR) # iRefR can be installed from CRAN. irefindex_curr_human = get_irefindex("9606", "current", getwd()) # 401140 records # Assumption1: Only human-human interactions: human_human_list = data.frame(irefindex_curr_human$taxa, irefindex_curr_human$taxb) tmp = do.call(`paste`, c(unname(human_human_list), list(sep="."))) irefindex_curr_hh = irefindex_curr_human[tmp=="taxid:9606(Homo sapiens).taxid:9606(Homo sapiens)" | tmp=="-.taxid:9606(Homo sapiens)",] # 361059 records # Assumption2: Removing intra-protein interactions: irefindex_curr_hh = irefindex_curr_hh[-which(irefindex_curr_hh$edgetype=="X" & (as.character(irefindex_curr_hh[,1])==as.character(irefindex_curr_hh[,2]))==TRUE),] # 347753 records # Full PIN: Canonical, all confidence, multigraph: edgeList_full = convert_MITAB_to_edgeList(irefindex_curr_hh, "default", "spoke", node_names="uids", multi_edge="yes") PIN_full = convert_edgeList_to_graph(edgeList_full) PIN_full # 19482 nodes, 162254 edges # Assumption3: Remove predicted data (exclude OPHID): irefindex_curr_hh_exp = select_database("ophid", irefindex_curr_hh, "except_this_database") # 289451 records # Subset experimental: Canonical, all confidence, multigraph: edgeList_exp = convert_MITAB_to_edgeList(irefindex_curr_hh_exp, "default", "spoke", node_names="uids", multi_edge="yes") PIN_exp = convert_edgeList_to_graph(edgeList_exp) PIN_exp # 18383 nodes, 133783 edges ####################### # 1. Generating complex data structures: ####################### # Needed functions (additional to iRefR): generate_known_complexes_uids = function(iRef_C) { # Generating complexList from known complexes: complex_name = list(); complexList = list() irigids = unique(iRef_C$irigid) for (i in irigids) { this_complex = sort(unique(iRef_C[which(iRef_C$irigid==i), "uidB"])) if (length(this_complex) > 1) { complex_name[[i]] = i complexList[[i]] = paste(this_complex, collapse=",", sep="") } } complexName_table = do.call(rbind, complex_name) complexList_table = do.call(rbind, complexList) output = cbind(complexName_table, complexList_table) } # Generating complex data structures: iRef_C = select_interaction_type("complex", irefindex_curr_hh) dim(iRef_C) # 58854 x 54 complexList_C = generate_known_complexes_uids(iRef_C) dim(complexList_C) # 5816x2, 10' edgeList_C = convert_MITAB_to_edgeList(iRef_C, canonical_rep="no", node_names="uids", multi_edge="yes") # 2' dim(edgeList_C) # 37075x5 edges_one_subunit = NULL for (i in unique(edgeList_C[,3])) { tmp = which(edgeList_C[,3]==i) if (length(tmp)==1) { edges_one_subunit = c(edges_one_subunit, tmp) } } edgeList_C = edgeList_C[-edges_one_subunit,] dim(edgeList_C) # 36703x5 # Note: Complexes with one only protein were removed. PIN_C = convert_edgeList_to_graph(edgeList_C) PIN_C # 7397 nodes, 36703 edges iRef_B = select_interaction_type("binary", irefindex_curr_hh) dim(iRef_B) # 287959 x 54 # Removing repeated complexes: remove_repeated_complexes = function(complexes_table_tmp) { complexes_table = NULL each_complex = unique(complexes_table_tmp[,2]) for (i in 1:length(each_complex)) { tmp = complexes_table_tmp[which(complexes_table_tmp[,2] == each_complex[i]),] if (is.vector(tmp)==T) { complexes_table = rbind(complexes_table, tmp) } else { tmp2 = cbind(paste(tmp[,1], collapse="|", sep=""), tmp[1,2]) complexes_table = rbind(complexes_table, tmp2) } } rownames(complexes_table) = NULL complexes_table } complexList_C_unique = remove_repeated_complexes(complexList_C) dim(complexList_C_unique) # 5681 complexes ####################### # 2. Quantifying overlap and nesting in iRefIndex complexes: ####################### # Jaccard Matrix (Overlap quantification) and Meet-Min Matrix (Nesting quantification): generate_overlap_nesting_matrices = function(complexList) { overlap = matrix(0,nrow=length(complexList), ncol=length(complexList)); jaccard_index = matrix(0,nrow=length(complexList), ncol=length(complexList)); meetmin_index = matrix(0,nrow=length(complexList), ncol=length(complexList)) for (i in 1:(length(complexList)-1)) { complex_i = strsplit(complexList[i], ",", fixed=TRUE)[[1]] for (j in (i+1):length(complexList)) { complex_j = strsplit(complexList[j], ",", fixed=TRUE)[[1]] overlap[i,j] = length(intersect(complex_i, complex_j)) jaccard_index[i,j] = overlap[i,j] / length(unique(complex_i, complex_j)) meetmin_index[i,j] = overlap[i,j] / min(length(complex_i), length(complex_j)) } } result = list(overlap=overlap, jaccard_index=jaccard_index, meetmin_index=meetmin_index) } tmp = generate_overlap_nesting_matrices(complexList_C_unique[,2]) # TIME: 1hr 10' overlap_complex_complex = tmp$overlap jaccard_complex_complex = tmp$jaccard_index meetmin_complex_complex = tmp$meetmin_index # Quantify all significant in half the matrix: # TIME: 9' count_over = 0; count_meetmin = 0; all_pairs = 0; positive_overlap = NULL; positive_meetmin = NULL for (i in 1:(length(complexList_C_unique[,2])-1)) { for (j in (i+1):length(complexList_C_unique[,2])) { all_pairs = all_pairs + 1 if (overlap_complex_complex[i,j] > 0) { positive_overlap = c(positive_overlap, overlap_complex_complex[i,j]) count_over = count_over + 1 positive_meetmin = c(positive_meetmin, meetmin_complex_complex[i,j]) if (meetmin_complex_complex[i,j] == 1) { count_meetmin = count_meetmin + 1 } } } } all_pairs # 16134040 count_over # 213626 count_meetmin # 6275 # Histograms: postscript("fig3.eps", horizontal=F) par(mfrow=c(2,1)) hist(positive_overlap, breaks=100, main="(a)") hist(positive_meetmin, main="(b)") dev.off() ####################### # 3. Graphical representation of complex overlap and nesting: ####################### # Detect some interesting nesting complexes from the edgeList: edgeList_tmp = convert_MITAB_to_edgeList(iRef_C, multi_edge="no") #, node_names="uids" chosen_int = 1 rigids_multi_edge = edgeList_tmp[grep("\\|", edgeList_tmp[,3])[[chosen_int]], 3] rigids_multi_edge = strsplit(rigids_multi_edge, "|", fixed="TRUE")[[1]] rigids_multi_edge index_interesting_multi_edge = unlist(lapply(rigids_multi_edge, grep, edgeList_C[,3])) edgeList_interesting_complex = edgeList_C[index_interesting_multi_edge,] edgeList_interesting_complex # Option 0: Simple merge: Information on nesting is lost: graph_interesting = convert_edgeList_to_graph(edgeList_interesting_complex) jpeg("nested_example0.jpg", width = 1024, height = 512) plot(simplify(graph_interesting, edge.attr.comb="ignore"), vertex.label=V(graph_interesting)$name, vertex.size=13, main="Hidden Nested complexes") dev.off() # Plotting option 1: Multi-edge plot: plot_nest_example = function(edgeList_interesting_complex) { graph_interesting = convert_edgeList_to_graph(edgeList_interesting_complex) list_edge_colors = NULL list_edge_colors[which(edgeList_interesting_complex[,3]== "1024442")] = "red" list_edge_colors[which(edgeList_interesting_complex[,3]== "617191")] = "blue" list_edge_colors[which(edgeList_interesting_complex[,3]== "707306")] = "green" graph_interesting <- set.edge.attribute(graph_interesting, "color", value=list_edge_colors) } graph_interesting = plot_nest_example(edgeList_interesting_complex) jpeg("nested_example1.jpg", width = 1024, height = 512) plot(graph_interesting, layout=layout.kamada.kawai, vertex.label=V(graph_interesting)$name, vertex.size=13, main="Nested complexes -Multi-edge view") legend(.7, -1.1, c("complex_ID_1024442", "complex_ID_617191", "complex_ID_707306"), fill=c("red", "blue", "green")) dev.off() # Plotting option 2: Multi-node plot: multiplicate_nodes = function(edgeList_interesting_complex, list_nodes, list_rigids) { for (i in 1:length(list_nodes)) { n = 0 for (j in 1:length(list_rigids)) { tmp = which(edgeList_interesting_complex[,3] == list_rigids[j]) if(edgeList_interesting_complex[tmp[1],1]==list_nodes[i]) { n = n + 1 edgeList_interesting_complex[tmp,1] = paste(edgeList_interesting_complex[tmp[1],1], n, sep="_") } if(edgeList_interesting_complex[tmp[1],2]==list_nodes[i]) { n = n + 1 edgeList_interesting_complex[tmp,2] = paste(edgeList_interesting_complex[tmp[1],2], n, sep="_") } } } edgeList_interesting_complex } # Multiplicating only hub node: edgeList_interesting_case2 = multiplicate_nodes(edgeList_interesting_complex, "uniprotkb:Q9Y468", c("1024442", "617191", "707306")) graph_interesting2 = plot_nest_example(edgeList_interesting_case2) jpeg("nested_example2.jpg", width = 1024, height = 512) plot(layout.sugiyama(graph_interesting2, attributes="all")$extd_graph, vertex.label=V(graph_interesting2)$name, vertex.size=13, main="Nested complexes -Multi-hub view") legend(.7, -1.1, c("complex_ID_1024442", "complex_ID_617191", "complex_ID_707306"), fill=c("red", "blue", "green")) dev.off() # Multiplicating all nodes in the graph: multiplicate_all_nodes = function(edgeList_interesting_complex) { list_rigids = unique(edgeList_interesting_complex[,3]) for (i in 1:length(list_rigids)) { tmp = which(edgeList_interesting_complex[,3] == list_rigids[i]) edgeList_interesting_complex[tmp,1] = paste(edgeList_interesting_complex[tmp,1], i, sep="_") edgeList_interesting_complex[tmp,2] = paste(edgeList_interesting_complex[tmp[1],2], i, sep="_") } edgeList_interesting_complex } edgeList_interesting_case3 = multiplicate_all_nodes(edgeList_interesting_complex) graph_interesting3 = plot_nest_example(edgeList_interesting_case3) jpeg("nested_example3.jpg", width = 1024, height = 512) plot(graph_interesting3, vertex.label=V(graph_interesting3)$name, vertex.size=13, main="Nested complexes -Multi-node view") legend(.7, -1.1, c("complex_ID_1024442", "complex_ID_617191", "complex_ID_707306"), fill=c("red", "blue", "green")) dev.off() # Plotting option 3: Pie nodes: jpeg("nested_example4.jpg", width = 1024, height = 512) plot(simplify(graph_interesting, edge.attr.comb="ignore"), layout=layout.kamada.kawai, vertex.shape="pie", vertex.pie=list(c(.5,0,.5), c(.33,.33,.33),c(0,.5,.5),c(.33,.33,.33)), vertex.pie.color=list(c("red", "blue", "green")), vertex.label=V(graph_interesting)$name, vertex.size=24, main="Nested complexes -Pie nodes view") legend(.7, -1.1, c("complex_ID_1024442", "complex_ID_617191", "complex_ID_707306"), fill=c("red", "blue", "green")) dev.off() postscript("fig4.eps", horizontal=F, fonts="serif") par(mfrow=c(3,2)) plot(simplify(graph_interesting, edge.attr.comb="ignore"), vertex.label=gsub("uniprotkb:", "", V(graph_interesting)$name), vertex.size=50, vertex.color=NULL, vertex.label.cex=1, vertex.label.color="black", main="(a)") plot(graph_interesting, layout=layout.kamada.kawai, vertex.label=gsub("uniprotkb:", "", V(graph_interesting)$name), vertex.size=50, vertex.color=NULL, vertex.label.cex=1, vertex.label.color="black", main="(b)") #legend(.7, -1.1, c("complex_ID_1024442", "complex_ID_617191", "complex_ID_707306"), fill=c("red", "blue", "green")) plot(layout.sugiyama(graph_interesting2, attributes="all")$extd_graph, vertex.label=gsub("uniprotkb:", "", V(graph_interesting2)$name), vertex.size=50, vertex.color=NULL, vertex.label.cex=1, vertex.label.color="black", main="(c)") #legend(.7, -1.1, c("complex_ID_1024442", "complex_ID_617191", "complex_ID_707306"), fill=c("red", "blue", "green")) plot(graph_interesting3, vertex.label=gsub("uniprotkb:", "", V(graph_interesting3)$name), vertex.size=32, vertex.color=NULL, vertex.label.cex=0.6, vertex.label.color="black", main="(d)") #legend(.7, -1.1, c("complex_ID_1024442", "complex_ID_617191", "complex_ID_707306"), fill=c("red", "blue", "green")) plot(simplify(graph_interesting, edge.attr.comb="ignore"), layout=layout.kamada.kawai, vertex.shape="pie", vertex.pie=list(c(.5,0,.5), c(.33,.33,.33),c(0,.5,.5),c(.33,.33,.33)), vertex.pie.color=list(c("red", "blue", "green")), vertex.label=gsub("uniprotkb:", "", V(graph_interesting)$name), vertex.size=50, vertex.label.cex=1.4, vertex.label.color="black", main="(e)") legend(.4, -1.1, c("complex_ID_1024442", "complex_ID_617191", "complex_ID_707306"), fill=c("red", "blue", "green")) dev.off() ####################### # 4. Reconstruct list of nested cases ####################### # Generate a table for each nested pair, using complexList format, and adding overlap and meetmin info: # TIME: 2' create_table_nested_complexList = function(complexList_C_unique, overlap_complex_complex, jaccard_complex_complex, meetmin_complex_complex) { table_nested_complexList = NULL for (i in 1:(dim(overlap_complex_complex)[1]-1)) { for (j in (i+1):dim(overlap_complex_complex)[1]) { if (meetmin_complex_complex[i,j] == 1) { longest_set = which.max(c(length(strsplit(complexList_C_unique[i,2], ",", fixed=T)[[1]]), length(strsplit(complexList_C_unique[j,2], ",", fixed=T)[[1]]))) if (longest_set==1) { new_row = c(complexList_C_unique[i,1], complexList_C_unique[j,1], complexList_C_unique[i,2], complexList_C_unique[j,2], overlap_complex_complex[i,j], jaccard_complex_complex[i,j], meetmin_complex_complex[i,j]) } if (longest_set==2) { new_row = c(complexList_C_unique[j,1], complexList_C_unique[i,1], complexList_C_unique[j,2], complexList_C_unique[i,2], overlap_complex_complex[i,j], jaccard_complex_complex[i,j], meetmin_complex_complex[i,j]) } table_nested_complexList = rbind(table_nested_complexList, new_row) } } } rownames(table_nested_complexList) = NULL colnames(table_nested_complexList) = c("Macro complex -ID", "Nested complex -ID", "Macro complex -subunits", "Nested complex -subunits", "Overlap", "Jaccard", "Meet-Min") dim(table_nested_complexList) # 6275x7 # Grooming the table: V2 = NULL for (i in 1:length(table_nested_complexList[,2])) { tmp = strsplit(table_nested_complexList[i,2], "|", fixed=T)[[1]] V2 = c(V2, sort(tmp)[1]) } table_nested_complexList_final = as.data.frame(cbind(table_nested_complexList[,1], V2, table_nested_complexList[,3:4], table_nested_complexList[,6:7], weight=table_nested_complexList[,5])) # Note: Choosing one ID for complex and leaving overlap as weight. } table_nested_complexList_final = create_table_nested_complexList(complexList_C_unique, overlap_complex_complex, jaccard_complex_complex, meetmin_complex_complex) # Levels of nesting --Summary of nested groups: make_nested_groups = function(table_nested_complexList_final) { # DFS: graph_all_nesting = graph.data.frame(table_nested_complexList_final, directed="T") n=0; table_nested_groups = NULL for (i in unique(V(graph_all_nesting)$name)) { n=n+1 nested_id = paste("NID", n, sep="_") tmp <- unique(graph.dfs(graph_all_nesting, root=which(V(graph_all_nesting)$name==i), "out", unreachable="F")$order) tmp2 = length(tmp) complex_group = V(graph_all_nesting)$name[tmp[-tmp2]] length_group = length(complex_group) nested_group = paste(complex_group, collapse=",", sep="") new_row = c(nested_id, nested_group, length_group) table_nested_groups = rbind(table_nested_groups, new_row) } # Remove subsets of others: complexList = table_nested_groups[,2] meetmin_index = matrix(0,nrow=length(complexList), ncol=length(complexList)) complex_groups_to_remove = NULL for (i in 1:(length(complexList)-1)) { complex_i = strsplit(complexList[i], ",", fixed=TRUE)[[1]] for (j in (i+1):length(complexList)) { complex_j = strsplit(complexList[j], ",", fixed=TRUE)[[1]] overlap = length(intersect(complex_i, complex_j)) meetmin_index = overlap / min(length(complex_i), length(complex_j)) if (meetmin_index == 1) { if (length(complex_i) < length(complex_j)) { complex_groups_to_remove = c(complex_groups_to_remove, i) } else { complex_groups_to_remove = c(complex_groups_to_remove, j) } } } } table_nested_groups = table_nested_groups[-unique(complex_groups_to_remove),] new_ids = paste("NID", 1:dim(table_nested_groups)[1], sep="_") table_nested_groups[,1]= new_ids rownames(table_nested_groups) = NULL colnames(table_nested_groups) = c("nested_group", "nested_complexes", "group_size") # Generating graph objects for each group: graph_nested_comp = list() for (i in 1:dim(table_nested_groups)[1]) { tmp = strsplit(table_nested_groups[i,2], ",", fixed=T)[[1]] edgeList = table_nested_complexList_final[which((table_nested_complexList_final[,1] %in% tmp) & (table_nested_complexList_final[,2] %in% tmp)),] graph_nested_comp[[i]] = graph.data.frame(unique(edgeList), directed="T") } # Adding subunit info: a = c(as.character(table_nested_complexList_final[,1]), as.character(table_nested_complexList_final[,2])) b = c(as.character(table_nested_complexList_final[,3]), as.character(table_nested_complexList_final[,4])) c = unique(cbind(a, b)) complex_to_prot = cbind(c[,1], gsub("uniprotkb:", "", c[,2])) # 3073x2 all_subunits = NULL; size_subunits = NULL for (i in 1:dim(table_nested_groups)[1]) { this_complex = strsplit(table_nested_groups[i,2], ",", fixed=T)[[1]] these_subunits = NULL; this_size = NULL for (j in this_complex) { tmp = complex_to_prot[which(complex_to_prot[,1]==j),2] these_subunits = c(these_subunits, tmp) this_size = c(this_size, length(strsplit(tmp, ",", fixed=T)[[1]])) } all_subunits = c(all_subunits, paste(these_subunits, collapse="|", sep="")) size_subunits = c(size_subunits, paste(this_size, collapse="|", sep="")) } table_nested_groups2 = cbind(table_nested_groups, all_subunits, size_subunits) output = list(table_nested_groups2, graph_nested_comp) } tmp = make_nested_groups(table_nested_complexList_final) # TIME: 5'13'' table_nested_groups2 = tmp[[1]] dim(table_nested_groups2) # 1427x5 graph_nested_comp = tmp[[2]] length(graph_nested_comp) # 1427 hist(as.integer(table_nested_groups2[,3]),breaks=100) # Plot and find motifs per nested group: plot_nested_groups = function(nested_group_ID, table_nested_groups, graph_nested_comp, plotit="F", from="irefindex") { # complex-complex plot: if (plotit=="T") { if (from=="corum") { the_label = as.character(CORUM_2012[which(as.character(CORUM_2012[,1]) %in% V(graph_nested_comp[[nested_group_ID]])$name),2]) postscript("fig2.eps", horizontal = F, fonts = "serif") plot(graph_nested_comp[[nested_group_ID]], vertex.label=the_label, vertex.size=40, vertex.label.color="black", vertex.label.cex=1.2) #jpeg(paste("nested_group_", nested_group_ID, "a.jpg", sep=""), width = 1024, height = 512) #plot(graph_nested_comp[[nested_group_ID]], vertex.label=the_label, vertex.size=24, main=paste("CORUM nested complexes, group", nested_group_ID, sep=" ")) dev.off() } else { the_label = V(graph_nested_comp[[nested_group_ID]])$name jpeg(paste("nested_group_", nested_group_ID, "a.jpg", sep=""), width = 1024, height = 512) plot(graph_nested_comp[[nested_group_ID]], vertex.label=the_label, vertex.size=24, main=paste("iRefIndex nested complexes, group", nested_group_ID, sep=" ")) dev.off() } } # prot-prot plot: if (from=="corum") { tmp = gsub("[|]", ",", table_nested_groups_CORUM[nested_group_ID, 4]) tmp2 = unique(strsplit(tmp, ",", fixed=T)[[1]]) list_proteins = paste("uniprotkb:", tmp2, sep="") g = convert_edgeList_to_graph(convert_MITAB_to_edgeList(iRef_C, node_names="uids", multi_edge="yes")) the_graph <- induced.subgraph(g, which(V(g)$name %in% list_proteins)) V(the_graph)$name = gsub("uniprotkb:", "", V(the_graph)$name) jpeg(paste("nested_group_", nested_group_ID, "b.jpg", sep=""), width = 1024, height = 512) plot(the_graph, layout=layout.kamada.kawai, vertex.label=V(the_graph)$name, vertex.size=24, main=paste("CORUM nested complexes, group", nested_group_ID, sep=" ")) dev.off() } else { list_complexes = strsplit(table_nested_groups[nested_group_ID, 2], ",", fixed=T)[[1]] this_edgeList_C = edgeList_C[which(edgeList_C[,3] %in% list_complexes),] if (is.vector(this_edgeList_C)==T) {this_edgeList_C = t(this_edgeList_C)} the_graph = convert_edgeList_to_graph(this_edgeList_C) V(the_graph)$name = gsub("uniprotkb:", "", V(the_graph)$name) } if (plotit=="T" & from=="irefindex") { jpeg(paste("nested_group_", nested_group_ID, "b.jpg", sep=""), width = 1024, height = 512) #plot_nest_group = function(the_graph) { list_colors_R = c("red", "blue", "green", "black", "orange", "yellow", "purple", "salmon", "grey", "pink", "brown", "violet", "navy", "khaki", "cyan", "magenta", "plum", "aquamarine", "chocolate", "steelblue", "tomato", "tan", "orchid", "ivory", "gold", "darkmagenta", "turquoise", "mistyrose", "skyblue", "palegreen", "palevioletred", "mintcream", "moccasin", "sienna", "maroon", "greenyellow", "darkred", "gainsboro", "coral", "bisque") n=0; list_edge_colors = NULL list_complexes = strsplit(table_nested_groups[nested_group_ID, 2], ",", fixed=T)[[1]] this_edgeList_C = edgeList_C[which(edgeList_C[,3] %in% list_complexes),] for (i in list_complexes) { n=n+1 list_edge_colors[which(this_edgeList_C[,3] == i)] = list_colors_R[n] } the_graph <- set.edge.attribute(the_graph, "color", value=list_edge_colors) #} plot(the_graph, layout=layout.kamada.kawai, vertex.label=V(the_graph)$name, vertex.size=24, main=paste("iRefIndex nested complexes, group", nested_group_ID, sep=" ")) legends = paste("complex_ID", list_complexes, sep="_") legend(.7, -1, legends, fill=list_colors_R[1:n]) dev.off() } output = list(graph_nested_complexes = graph_nested_comp[[nested_group_ID]], graph_nested_prots = the_graph) } tmp = plot_nested_groups(421, table_nested_groups2, graph_nested_comp, "T") tmp tmp = plot_nested_groups(570, table_nested_groups2, graph_nested_comp, "T") tmp postscript("fig1.eps", horizontal=F, fonts="serif") par(mfrow=c(2,2)) tmp = plot_nested_groups(421, table_nested_groups2, graph_nested_comp, "T") plot(tmp$graph_nested_complexes, layout=layout.kamada.kawai, vertex.label=gsub("uniprotkb:", "", V(tmp$graph_nested_complexes)$name), vertex.size=50, vertex.color=NULL, vertex.label.cex=1, vertex.label.color="black", main="(a)") plot(tmp$graph_nested_prots, layout=layout.kamada.kawai, vertex.label=gsub("uniprotkb:", "", V(tmp$graph_nested_prots)$name), vertex.size=50, vertex.color=NULL, vertex.label.cex=1, vertex.label.color="black", main="(b)") tmp = plot_nested_groups(570, table_nested_groups2, graph_nested_comp, "T") plot(tmp$graph_nested_complexes, layout=layout.kamada.kawai, vertex.label=gsub("uniprotkb:", "", V(tmp$graph_nested_complexes)$name), vertex.size=50, vertex.color=NULL, vertex.label.cex=1, vertex.label.color="black", main="(c)") plot(tmp$graph_nested_prots, layout=layout.kamada.kawai, vertex.label=gsub("uniprotkb:", "", V(tmp$graph_nested_prots)$name), vertex.size=40, vertex.color=NULL, vertex.label.cex=1, vertex.label.color="black", main="(d)") dev.off() ####################### # 5. Stating the nesting problem: ####################### # Procedence of complexes of a nested group: table_procedence = NULL for (i in 1:dim(table_nested_groups2)[1]) { tmp = strsplit(table_nested_groups2[i,2], ",", fixed=T)[[1]] pmids=NULL; methods=NULL; dbs=NULL for (j in tmp) { this_pmid = paste(unique(iRef_C[which(iRef_C$irigid == j), "pmids"]), collapse="|") this_method = paste(unique(iRef_C[which(iRef_C$irigid == j), "method"]), collapse="|") this_db = paste(unique(iRef_C[which(iRef_C$irigid == j), "sourcedb"]), collapse="|") pmids = c(pmids, this_pmid) methods = c(methods, this_method) dbs = c(dbs, this_db) } new_row = c(table_nested_groups2[i,1:2], pmids=paste(pmids, collapse=","), methods=paste(methods, collapse=","), dbs=paste(dbs, collapse=","), length(unique(pmids))==1, length(unique(methods))==1, length(unique(dbs))==1) table_procedence = rbind(table_procedence, new_row) } rownames(table_procedence) = NULL dim(table_procedence) # 1427x8 TIME: 16' # Analysis of procedence: grep("x", table_procedence[,4]) # X-ray cases in 190 nested groups; mostly AP groups_same_procedence = table_procedence[which(table_procedence[,6]==TRUE & table_procedence[,7]==TRUE & table_procedence[,8]==TRUE),] # 189 cases where group comes from same paper-db unique(groups_same_procedence[,7]) # 53 methods in these groups of nested complexes from same study tmp = table_procedence[grep("CORUM", table_procedence[,5]),] # 685 cases with CORUM info which(tmp[,8]==TRUE) # 71 cases with ONLY corum info # Examples of biological validation: CORUM_2012 = read.csv("allComplexes.csv", header=T, comment.char="", sep=';', quote="") # http://mips.helmholtz-muenchen.de/genre/proj/corum/allComplexes.csv count_overlap = function(overlap_complex_complex, jaccard_complex_complex, meetmin_complex_complex) { # Quantify all significant in half the matrix: count_over = 0; count_meetmin = 0; all_pairs = 0; positive_overlap = NULL; positive_meetmin = NULL for (i in 1:(dim(overlap_complex_complex)[1]-1)) { for (j in (i+1):dim(overlap_complex_complex)[1]) { all_pairs = all_pairs + 1 if (overlap_complex_complex[i,j] > 0) { positive_overlap = c(positive_overlap, overlap_complex_complex[i,j]) count_over = count_over + 1 positive_meetmin = c(positive_meetmin, meetmin_complex_complex[i,j]) if (meetmin_complex_complex[i,j] == 1) { count_meetmin = count_meetmin + 1 } } } } result = list(all_pairs=all_pairs, count_over=count_over, count_meetmin=count_meetmin) } list_complexes = as.character(CORUM_2012[,5]) # 2867 list_complexes[which(list_complexes=="")] <- 0.0 CORUM_matrices = generate_overlap_nesting_matrices(list_complexes) CORUM_overlap_complex = CORUM_matrices$overlap CORUM_jaccard_complex = CORUM_matrices$jaccard_index CORUM_meetmin_complex = CORUM_matrices$meetmin_index tmp = count_overlap(CORUM_overlap_complex, CORUM_jaccard_complex, CORUM_meetmin_complex) CORUM_all_pairs = tmp$all_pairs CORUM_all_pairs # 4108411 CORUM_count_over = tmp$count_over CORUM_count_over # 25532 CORUM_count_meetmin = tmp$count_meetmin CORUM_count_meetmin # 3053 new_list_complexes = cbind(as.character(CORUM_2012[,1]), list_complexes, as.character(CORUM_2012[,2]), as.character(CORUM_2012[,10]), as.character(CORUM_2012[,11])) table_nested_complexList_CORUM = create_table_nested_complexList(new_list_complexes, CORUM_overlap_complex, CORUM_jaccard_complex, CORUM_meetmin_complex) # note: corum accepts repeated complexes, see this table tmp = make_nested_groups(table_nested_complexList_CORUM) # TIME: 2' table_nested_groups_CORUM = tmp[[1]] dim(table_nested_groups_CORUM) # 750x5 graph_nested_comp_CORUM = tmp[[2]] length(graph_nested_comp_CORUM) # 750 # Adding CORUM annotation info: add_annotation = function(table_nested_groups_CORUM, new_list_complexes) { all_names = NULL; all_functional_annotation = NULL; all_diseases = NULL for (i in 1:dim(table_nested_groups_CORUM)[1]) { this_complex = strsplit(table_nested_groups_CORUM[i,2], ",", fixed=T)[[1]] this_name=NULL; this_functional_annotation=NULL; this_disease=NULL for (j in this_complex) { tmp = new_list_complexes[which(new_list_complexes[,1]==j),] this_name = c(this_name, tmp[3]) this_functional_annotation = c(this_functional_annotation, tmp[4]) this_disease = c(this_disease, tmp[5]) } all_names = c(all_names, paste(this_name, collapse="|", sep="")) all_functional_annotation = c(all_functional_annotation, paste(this_functional_annotation, collapse="|", sep="")) all_diseases = c(all_diseases, paste(this_disease, collapse="|", sep="")) } table_annotated_CORUM = cbind(table_nested_groups_CORUM, all_names, all_functional_annotation, all_diseases) } table_annotated_CORUM = add_annotation(table_nested_groups_CORUM, new_list_complexes) dim(table_annotated_CORUM) # 750x8 # Case studies: Nuclear pore complex, DNA repair, cancer and biggest nested group: pore_nested_group = table_annotated_CORUM[grep("pore", table_annotated_CORUM[,6]),c(1,6)] pore_nested_group cancer_nested_groups = table_annotated_CORUM[unique(c(grep("cancer", table_annotated_CORUM[,7]), grep("cancer", table_annotated_CORUM[,8]))),c(1,5,6,7,8)] dim(cancer_nested_groups) # 32x5 repair_nested_groups = table_annotated_CORUM[unique(c(grep("repair", table_annotated_CORUM[,6]), grep("repair", table_annotated_CORUM[,7]))),c(1,5,6,7)] dim(repair_nested_groups) # 27x4 biggest_nested_group = table_annotated_CORUM[which.max(table_annotated_CORUM[,3]),c(1,3,6,7)] plot_pore = plot_nested_groups(43, table_nested_groups_CORUM, graph_nested_comp_CORUM, "T", from="corum") plot_pore plot_cancer1 = plot_nested_groups(150, table_nested_groups_CORUM, graph_nested_comp_CORUM, "T", from="corum") plot_cancer1 plot_cancer2 = plot_nested_groups(14, table_nested_groups_CORUM, graph_nested_comp_CORUM, "T", from="corum") plot_cancer2 plot_cancer3 = plot_nested_groups(15, table_nested_groups_CORUM, graph_nested_comp_CORUM, "T", from="corum") plot_cancer3 plot_cancer4 = plot_nested_groups(24, table_nested_groups_CORUM, graph_nested_comp_CORUM, "T", from="corum") plot_cancer4 plot_repair1 = plot_nested_groups(88, table_nested_groups_CORUM, graph_nested_comp_CORUM, "T", from="corum") plot_repair1 plot_repair2 = plot_nested_groups(118, table_nested_groups_CORUM, graph_nested_comp_CORUM, "T", from="corum") plot_repair2 plot_repair3 = plot_nested_groups(150, table_nested_groups_CORUM, graph_nested_comp_CORUM, "T", from="corum") plot_repair3 plot_repair4 = plot_nested_groups(159, table_nested_groups_CORUM, graph_nested_comp_CORUM, "T", from="corum") plot_repair4 plot_biggest = plot_nested_groups(30, table_nested_groups_CORUM, graph_nested_comp_CORUM, "T", from="corum") plot_biggest save.image("Project1.RData") ######################## # Part B ######################## ####################### # 0. Needed Libraries and Files: ####################### load("Project1.RData") ####################### # 1. Intuitive/naive criteria: ####################### # 1.1. High Jaccard index is related to artifacts (mistakes): # High Jaccard in Corum less than high Jaccard in all nested groups (suggest integration problem) find_groups_two_complex_high_Jaccard = function(table_nested_groups2) { groups_two_complex = table_nested_groups2[which(table_nested_groups2[,3]==2),] jaccard = NULL for (i in 1:dim(groups_two_complex)[1]) { tmp = strsplit(groups_two_complex[i,4], "|", fixed=T)[[1]] vector1 = strsplit(tmp[1], ",", fixed=T)[[1]] vector2 = strsplit(tmp[2], ",", fixed=T)[[1]] jaccard = c(jaccard, length(intersect(vector1, vector2)) / length(unique(c(vector1, vector2)))) } groups_two_complex = data.frame(groups_two_complex, as.numeric(jaccard)) } groups_two_complex = find_groups_two_complex_high_Jaccard(table_nested_groups2) length(which(groups_two_complex[,6] > 0.75)) # 55 cases of only one subcomplex and high Jaccard groups_two_complex_CORUM = find_groups_two_complex_high_Jaccard(table_nested_groups_CORUM) length(which(groups_two_complex_CORUM[,6] > 0.75)) # 51 cases of only one subcomplex and high Jaccard high_Jaccard_CORUM = groups_two_complex_CORUM[which(groups_two_complex_CORUM[,6] > 0.75),] # Almost all cases are in CORUM, disproving hypothesis. # High Jaccard come from two different PMIDs or databases (suggest integration problem) x = NULL; y = NULL; z = NULL for (i in 50:100) { j = i/100 high_Jaccard = groups_two_complex[which(groups_two_complex[,6] > j),] if (is.vector(high_Jaccard)==F) { high_Jaccard_diff_procedence = high_Jaccard[which(high_Jaccard[,1] %in% table_procedence[which(table_procedence[,6]==F | table_procedence[,8]==F), 1]),] size_Jaccard = dim(high_Jaccard)[1] size_diff_procedence = dim(high_Jaccard_diff_procedence)[1] } else { if (as.character(high_Jaccard[1]) %in% table_procedence[which(table_procedence[,6]==F | table_procedence[,8]==F), 1]) { size_diff_procedence = 1 } else { size_diff_procedence = 0 } if (length(high_Jaccard)>0) { size_Jaccard = 1 } else { size_Jaccard = 0 } } x = c(x, j) y = c(y, size_diff_procedence) z = c(z, size_Jaccard) } jpeg("Jaccard_analysis.jpg", width = 1024, height = 512) plot(x, y, xlab="Jaccard index", ylab="Number of size-2 nested groups", col="red") lines(x, z, col="blue") dev.off() postscript("fig5.eps", horizontal = F, fonts = "serif") plot(x, y, xlab="Jaccard index", ylab="Number of size-2 nested groups", col="red") lines(x, z, col="blue") dev.off() # High number of subcomplexes (possibly overlapping) is related to real subcomplexes: # High subcomplexes in gold standard (CORUM) compared to all nested groups groups_multiple_complex = table_nested_groups2[which(table_nested_groups2[,3]>3),] groups_multiple_complex_CORUM = table_nested_groups_CORUM[which(table_nested_groups_CORUM[,3]>3),] population = dim(table_nested_groups2)[1] # 1427 success_population = dim(groups_multiple_complex)[1] # 398 sample = dim(table_nested_groups_CORUM)[1] # 750 success_sample = dim(groups_multiple_complex_CORUM)[1] # 170 # 170 out of 750 (22.7%) vs 398 out of 1427 (27.9%) (CORUM is less enriched in multiple) pvalue_numbers = phyper(length(success_sample)-1, length(success_population), length(population) - length(success_population), length(sample), lower.tail=FALSE) # 1 (not-significant) # High number of subcomplexes is not a feature of gold standard and, therefore, may not be related to real subcomplexes. # High number subcomplexes come from same PMID?? high_number_diff_procedence = groups_multiple_complex[which(groups_multiple_complex[,1] %in% table_procedence[which(table_procedence[,6]==F | table_procedence[,8]==F), 1]),] # 374 out of 398 # High number subcomplexes come from diff PMID. This is argument pro-mistake, not pro-real nest. ####################### # 1. Analysis of degree, centralities, clustering coefficient and cancer-related status, in nested groups: ####################### # Compute BC of the whole metwork. Then, per pair maximal complex-subcomplex, find BC of the shared and non-shared proteins inside the whole network. Make statistics tests to see if nested or non-nested proteins happen to have different BC. I will get a conclusion per maximal-subcomplex pair, and then I might make a general distribution to conclude if it is really relevant. edgeList_PIN = convert_MITAB_to_edgeList(irefindex_curr_hh, canonical_rep="no", node_names="uids", multi_edge="yes") dim(edgeList_PIN) # 162254x5 # Note: Complexes with one only protein were not removed. PIN_full = convert_edgeList_to_graph(edgeList_PIN) PIN_full # 19482 nodes, 162254 edges network_analysis_nested_group = function(PIN_full, table_nested_complexList_final, cancer_related_prots) { bc_full = betweenness(PIN_full); pr_full = page.rank(PIN_full)$vector deg_full = igraph::degree(PIN_full) pvalue_bc = NULL; pvalue_pr = NULL; pvalue_deg = NULL; pvalue_cc = NULL; hg_score_cancer = NULL for (i in 1:dim(table_nested_complexList_final)[1]) { vector_complex = strsplit(as.character(table_nested_complexList_final[i,3]), ",", fixed=T)[[1]] vector_subcomplex = strsplit(as.character(table_nested_complexList_final[i,4]), ",", fixed=T)[[1]] shared_prots = intersect(vector_complex, vector_subcomplex) non_shared_prots = setdiff(vector_complex, vector_subcomplex) vector_degree_shared = deg_full[which(names(deg_full) %in% shared_prots)] vector_degree_non_shared = deg_full[which(names(deg_full) %in% non_shared_prots)] wtest_deg = wilcox.test(vector_degree_shared, vector_degree_non_shared) pvalue_deg = c(pvalue_deg, wtest_deg$p.value) # (<0.05 different, >0.05 the same) vector_betweenness_shared = bc_full[which(names(bc_full) %in% shared_prots)] vector_betweenness_non_shared = bc_full[which(names(bc_full) %in% non_shared_prots)] wtest_bc = wilcox.test(vector_betweenness_shared, vector_betweenness_non_shared) pvalue_bc = c(pvalue_bc, wtest_bc$p.value) # (<0.05 different, >0.05 the same) vector_prank_shared = pr_full[which(names(pr_full) %in% shared_prots)] vector_prank_non_shared = pr_full[which(names(pr_full) %in% non_shared_prots)] wtest_pr = wilcox.test(vector_prank_shared, vector_prank_non_shared) pvalue_pr = c(pvalue_pr, wtest_pr$p.value) # (<0.05 different, >0.05 the same) vector_cc_shared = transitivity(PIN_full, type="local", vids=which(V(PIN_full)$name %in% shared_prots)) vector_cc_non_shared = transitivity(PIN_full, type="local", vids=which(V(PIN_full)$name %in% non_shared_prots)) index_nan_shared = which(vector_cc_shared=="NaN") index_nan_non_shared = which(vector_cc_non_shared=="NaN") vector_cc_shared[index_nan_shared] = rep(0, length(index_nan_shared)) vector_cc_non_shared[index_nan_non_shared] = rep(0, length(index_nan_non_shared)) wtest_cc = wilcox.test(vector_cc_shared, vector_cc_non_shared) pvalue_cc = c(pvalue_cc, wtest_cc$p.value) # (<0.05 different, >0.05 the same) all_prots = unique(c(shared_prots, non_shared_prots)) cancer_shared_prots = shared_prots %in% cancer_related_prots cancer_all_prots = all_prots %in% cancer_related_prots #total_number_prots = population = N #cancer_all_prots = success_population = m #shared_prots = sample = n #cancer_shared_prots = success_sample = k hg_score_cancer = c(hg_score_cancer, phyper(length(cancer_shared_prots)-1, length(cancer_all_prots), length(all_prots) - length(cancer_all_prots), length(shared_prots), lower.tail=FALSE)) } table_analysis_nested_groups = cbind(table_nested_complexList_final, pvalue_deg, pvalue_bc, pvalue_pr, pvalue_cc, hg_score_cancer) } table_analysis_nested_groups = network_analysis_nested_group(PIN_full, table_nested_complexList_final, cancer_related_prots) #hist(table_analysis_nested_groups[,5]) #hist(table_analysis_nested_groups[,6]) jpeg("histogram_P2_1.jpg", width = 1024, height = 512) hist(table_analysis_nested_groups[,8], breaks=100, xlab="p-value", main="p-values Degree") dev.off() jpeg("histogram_P2_2.jpg", width = 1024, height = 512) hist(table_analysis_nested_groups[,9], breaks=100, xlab="p-value", main="p-values BC") dev.off() jpeg("histogram_P2_3.jpg", width = 1024, height = 512) hist(table_analysis_nested_groups[,10], breaks=100, xlab="p-value", main="p-values PR centrality") dev.off() jpeg("histogram_P2_4.jpg", width = 1024, height = 512) hist(table_analysis_nested_groups[,11], breaks=100, xlab="p-value", main="p-values Clust.Coeff.") dev.off() postscript("fig6.eps", horizontal=F, fonts="serif") par(mfrow=c(2,2)) hist(table_analysis_nested_groups[,8], breaks=100, xlab="p-value -degree", main="(a)") hist(table_analysis_nested_groups[,9], breaks=100, xlab="p-value -BC", main="(b)") hist(table_analysis_nested_groups[,10], breaks=100, xlab="p-value -PRC", main="(c)") hist(table_analysis_nested_groups[,11], breaks=100, xlab="p-value -CC", main="(d)") dev.off() ####################### # 2. Adding motif information: ####################### # Adding motif (NeMo) information: # 2-vertex motif: trivial_motifs = table_nested_groups2[which(table_nested_groups2[,3]=="2"),] dim(trivial_motifs) # 669x5 # Select 3,4 and 5-vertex cases to be analyzed by NeMo: table_nested_groups_motif = table_nested_groups2[-which(table_nested_groups2[,3]=="2" | as.integer(table_nested_groups2[,3])>5),] unique(table_nested_groups_motif[,3]) dim(table_nested_groups_motif) # 510x5 # Find motifs: add_motifs = function(table_nested_groups_motif, edgeList_C, plot_type="complex") { library(NeMo) table_motifs = NULL for (i in 1:dim(table_nested_groups_motif)[1]) { list_complexes = strsplit(table_nested_groups_motif[i, 2], ",", fixed=T)[[1]] this_edgeList_C = edgeList_C[which(edgeList_C[,3] %in% list_complexes),] if (dim(this_edgeList_C)[1]>0) { file_name = paste("edgeList", table_nested_groups_motif[i, 1], "txt", sep=".") write.table(this_edgeList_C[,1:2], file=file_name, sep=" ", row.names=F) # Compute the p-value of all motifs (by default EDD model) for (j in 3:5) { pv <- getExceptMotif(file_name, j) count_found_motifs = pv$counts[pv$counts > 0] index_found_motifs = which(pv$counts > 0) index_significant_motifs = which(pv$P.value < 0.05) count_significant_motifs = length(index_significant_motifs) n=0; all_motifs=NULL for (k in index_found_motifs) { n = n+1 motif_matrix = pv$motifs[k][[1]] motif_index = which(motif_matrix==1, arr.ind=T) the_motif = paste("motif_", n, " = ", paste(paste(motif_index[,1], motif_index[,2], sep=","), collapse="|"), sep="") # remove duplicated all_motifs = c(all_motifs, the_motif) tmp = cbind(NID=table_nested_groups_motif[i,1], motif_size=j, all_motifs, count_found_motifs, count_significant_motifs) table_motifs = rbind(table_motifs, tmp) #graph_found_motif <- graph.adjacency(pv$motifs[[k]], mode="directed") } } } } rownames(table_motifs)=NULL table_motifs } table_motifs = add_motifs(table_nested_groups_motif, edgeList_C) # WARNINGS: Problems with Polya parameters. dim(table_motifs) # 64027x5 table_motifs_significant0 = table_motifs[which(as.integer(table_motifs[,5]) > 0),] dim(table_motifs_significant0) # 46088 (72%) #table_motifs_significant = table_motifs[which(as.integer(table_motifs[,5]) > 15),] #dim(table_motifs_significant) # 18806x5 (29.4%) jpeg("histogram_P2_5.jpg", width = 1024, height = 512) hist(as.integer(table_motifs[,5]), breaks=15) # slightly bimodal around 0 and 20 dev.off() hist(as.integer(table_motifs[,2])) # 5-motifs, then 4-motifs, then 3-motifs hist(as.integer(table_motifs_significant[,5]), breaks=6) table(table_motifs_significant[,2]) # all common enriched are 5-motifs extract_table_motifs = function(table_motifs) { the_table = table(table_motifs[,3]) the_names = names(the_table) edited_names = gsub("motif_[0-9]+ = ", "", the_names) new_edited_names = NULL for (i in edited_names) { tmp = strsplit(i, "|", fixed=T)[[1]] new_tmp = NULL for (j in tmp) { tmp2 = sort(strsplit(j, ",", fixed=T)[[1]]) new_tmp = c(new_tmp, paste(tmp2, collapse=",")) } new_edited_names = c(new_edited_names, paste(unique(new_tmp), collapse="|")) } names(the_table) = new_edited_names sort(the_table, decreasing=T) } extract_table_motifs(table_motifs) # imperfect 5-cliques are the most common motif extract_table_motifs(table_motifs_significant0) # imperfect 5-cliques are the most common motif; examples: # 1,2|1,3|1,4|1,5|2,3|2,4|3,4|3,5|4,5| imperfect clique (missing edge 2,5): 2320 # 1,2|1,3|1,4|1,5|2,3|2,4|3,4|3,5| imperfect clique (missing edges 2,5 and 4,5): 980 # 1,2|1,3|1,4|1,5|2,3|2,4|2,5|3,4|3,5|4,5| full clique: 348 # This is evidence that nesting can be related to the concept of co-graphs, such as overlaps. # 669+510=1179 studied cases (82.6% of all cases; the top 17.4% was not studied). # Checking some over-represented motifs: table_motifs[which(table_motifs[,1]=="NID_22"),] tmp = plot_nested_groups(22, table_nested_groups_motif, graph_nested_comp, "T") tmp plot(tmp$graph_nested_complexes) plot(tmp$graph_nested_prots) table_motifs[which(table_motifs[,1]=="NID_22"),] ###################### # 3. Modularity measures: ###################### create_graph_lists = function(table_nested_groups2, graph_nested_comp, PIN_full) { vector_list_prots = NULL; tmp1 = list(); tmp2 = list() for (i in 1:dim(table_nested_groups2)[1]) { tmp = plot_nested_groups(i, table_nested_groups2, graph_nested_comp, "F") tmp1[[i]] = tmp$graph_nested_prots list_prots = strsplit(table_nested_groups2[i,4], "|", fixed=T)[[1]][1] vector_list_prots = c(vector_list_prots, list_prots) ids = which(gsub("uniprotkb:", "", V(PIN_full)$name) %in% strsplit(list_prots, ",", fixed=T)[[1]]) tmp2[[i]] = induced.subgraph(PIN_full, ids) } output = list(tmp1, tmp2, vector_list_prots) } tmp = create_graph_lists(table_nested_groups2, graph_nested_comp, PIN_full) tmp1 = tmp[[1]] tmp2 = tmp[[2]] vector_list_prots = tmp[[3]] modularity_analysis = function(table_nested_groups2, tmp1, tmp2) { vector_eb_mod_C = NULL; vector_walk_mod_C = NULL; vector_label_mod_C = NULL; vector_eigen_mod_C = NULL vector_eb_number_C = NULL; vector_walk_number_C = NULL; vector_label_number_C = NULL; vector_eigen_number_C = NULL vector_eb_mod_all = NULL; vector_walk_mod_all = NULL; vector_label_mod_all = NULL; vector_eigen_mod_all = NULL vector_eb_number_all = NULL; vector_walk_number_all = NULL; vector_label_number_all = NULL; vector_eigen_number_all = NULL for (i in 1:dim(table_nested_groups2)[1]) { if (length(V(tmp1[[i]]))>0) { eb_community = edge.betweenness.community(tmp1[[i]]) vector_eb_number_C = c(vector_eb_number_C, max(eb_community$membership)) vector_eb_mod_C = c(vector_eb_mod_C, max(eb_community$modularity)) walk_community = walktrap.community(tmp1[[i]]) vector_walk_number_C = c(vector_walk_number_C, max(walk_community$membership)) vector_walk_mod_C = c(vector_walk_mod_C, max(walk_community$modularity)) label_community = label.propagation.community(tmp1[[i]]) vector_label_number_C = c(vector_label_number_C, max(label_community$membership)) vector_label_mod_C = c(vector_label_mod_C, max(label_community$modularity)) # eigen_community = tryCatch(leading.eigenvector.community(tmp1[[i]]), error=function(e) NULL ) # vector_eigen_number_C = c(vector_eigen_number_C, max(eigen_community$membership)) # vector_eigen_mod_C = c(vector_eigen_mod_C, max(eigen_community$modularity)) eb_community = edge.betweenness.community(tmp2[[i]]) vector_eb_number_all = c(vector_eb_number_all, max(eb_community$membership)) vector_eb_mod_all = c(vector_eb_mod_all, max(eb_community$modularity)) walk_community = walktrap.community(tmp2[[i]]) vector_walk_number_all = c(vector_walk_number_all, max(walk_community$membership)) vector_walk_mod_all = c(vector_walk_mod_all, max(walk_community$modularity)) label_community = label.propagation.community(tmp2[[i]]) vector_label_number_all = c(vector_label_number_all, max(label_community$membership)) vector_label_mod_all = c(vector_label_mod_all, max(label_community$modularity)) # eigen_community = leading.eigenvector.community(tmp2[[i]]) # vector_eigen_number_all = c(vector_eigen_number_all, max(eigen_community$membership)) # vector_eigen_mod_all = c(vector_eigen_mod_all, max(eigen_community$modularity)) } else { vector_eb_number_C = c(vector_eb_number_C, "NA") vector_eb_mod_C = c(vector_eb_mod_C, "NA") vector_walk_number_C = c(vector_walk_number_C, "NA") vector_walk_mod_C = c(vector_walk_mod_C, "NA") vector_label_number_C = c(vector_label_number_C, "NA") vector_label_mod_C = c(vector_label_mod_C, "NA") vector_eb_number_all = c(vector_eb_number_all, "NA") vector_eb_mod_all = c(vector_eb_mod_all, "NA") vector_walk_number_all = c(vector_walk_number_all, "NA") vector_walk_mod_all = c(vector_walk_mod_all, "NA") vector_label_number_all = c(vector_label_number_all, "NA") vector_label_mod_all = c(vector_label_mod_all, "NA") } } table_modularity = cbind(table_nested_groups2[,1], vector_list_prots, vector_eb_number_C, vector_eb_number_all, vector_eb_mod_C, vector_eb_mod_all, vector_walk_number_C, vector_walk_number_all, vector_walk_mod_C, vector_walk_mod_all, vector_label_number_C, vector_label_number_all, vector_label_mod_C, vector_label_mod_all) colnames(table_modularity) = c("Group_ID", "Proteins", "#Comms -edge_betw_C", "#Comms -edge_betw_all", "Modul -edge_betw_C", "Modul -edge_betw_all", "#Comms -walk_C", "#Comms -walk_all", "Modul -walk_C", "Modul -walk_all", "#Comms -label_C", "#Comms -label_all", "Modul -label_C", "Modul -label_all") table_modularity } # TIME: 4' table_modularity = modularity_analysis(table_nested_groups2, tmp1, tmp2) # all give results, not C table_modularity_all = table_modularity[,c(1,2,4,6,8,10,12,14)] jpeg("histogram_P2_6.jpg", width = 1024, height = 512) hist(as.numeric(table_modularity_all[,4]), main="Mod -edge betweenness method", xlab="Modularity", breaks=100) dev.off() jpeg("histogram_P2_7.jpg", width = 1024, height = 512) hist(as.numeric(table_modularity_all[,6]), main="Mod -walk trap method", xlab="Modularity", breaks=100) dev.off() jpeg("histogram_P2_8.jpg", width = 1024, height = 512) hist(as.numeric(table_modularity_all[,8]), main="Mod -label propagation method", xlab="Modularity", breaks=100) dev.off() # Overlap between communities and subcomplexes: tmp = table_modularity_all[which(table_modularity_all[,4] != "NA"),] eb_group = tmp[which(as.numeric(tmp[,4]) > 0.5),] tmp = table_modularity_all[which(table_modularity_all[,6] != "NA"),] walk_group = tmp[which(as.numeric(tmp[,6]) > 0.5),] tmp = table_modularity_all[which(table_modularity_all[,8] != "NA"),] label_group = tmp[which(as.numeric(tmp[,8]) > 0.5),] # 3 cases of good modularity: Nested groups 291, 771, 876. # They fail to detect modules. overlap_community_subcomplex = function(tmp2, group_id) { list_a = list(); list_a2 = list() a = membership(label.propagation.community(tmp2[[group_id]])) # table(a) for (i in 1:max(a)) { tmp = a[a==i] list_a[[i]] = paste(sort(gsub("uniprotkb:","",names(tmp))), collapse=",", sep="") } list_a2 = as.list(strsplit(table_nested_groups2[group_id,4], "|", fixed=T)[[1]]) list_a2 = list_a2[-1] coincidences = NULL for (i in 1:length(list_a2)) { for (j in 1:length(list_a)) { #jaccard = intersect() / unique(c()) if (list_a2[[i]]==list_a[[j]]) {coincidences = c(coincidences, list_a2[i])} } } TP = length(coincidences) FP = length(list_a)-length(coincidences) FN = length(list_a2)-length(coincidences) output = list(communities=length(list_a), subcomplexes=length(list_a2), coincidences=coincidences, TP=TP, FP=FP, FN=FN, PRECISION=TP/(TP+FP), FDR=FP/(FP+TP)) } table_overlap_community_subcomplex = NULL for (i in 1:length(tmp2)) { example = overlap_community_subcomplex(tmp2, i) if (length(example)>0) { new_row = c(table_nested_groups2[i,1], example$communities, example$subcomplexes, example$TP, example$PRECISION, example$FDR) } else { new_row = c(table_nested_groups2[i,1], NA, NA, NA, NA, NA) } table_overlap_community_subcomplex = rbind(table_overlap_community_subcomplex, new_row) } rownames(table_overlap_community_subcomplex) = NULL colnames(table_overlap_community_subcomplex) = c("nested_group", "communities", "subcomplexes", "TP", "Precision", "FDR") jpeg("histogram_P2_9.jpg", width = 1024, height = 512) hist(as.numeric(table_overlap_community_subcomplex[,5]), main="Precision -label propagation method", xlab="Precision", breaks=100) dev.off() x11(); hist(as.numeric(table_overlap_community_subcomplex[,6]), breaks=100) # Conclusion: Very few cases of good prediction (precision and FDR). tmp = table_modularity_all[which(table_modularity_all[,4] != "NA"),] table_overlap_community_subcomplex_mod1 = table_overlap_community_subcomplex[which(as.numeric(tmp[,4]) > 0.5),] table_overlap_community_subcomplex_mod2 = table_overlap_community_subcomplex[which(as.numeric(tmp[,3]) > 4 & as.numeric(tmp[,4]) > 0.4),] table_overlap_community_subcomplex_mod3 = table_overlap_community_subcomplex[which(as.numeric(tmp[,3]) > 12 & as.numeric(tmp[,4]) > 0.4),] table_overlap_community_subcomplex_mod4 = table_overlap_community_subcomplex[which(as.numeric(tmp[,4]) > 0.2 & as.numeric(tmp[,4]) < 0.3),] library(plotrix) multhist(list(as.numeric(table_overlap_community_subcomplex_mod2[,5]), as.numeric(table_overlap_community_subcomplex_mod3[,5]), as.numeric(table_overlap_community_subcomplex_mod4[,5])), beside=T) # In all cases (more modular or not), precision is always more bad than good. Conclusion: Not related to quality or quantity of modules. Just uniformly bad. # Some examples: ex1 = overlap_community_subcomplex(tmp2, 291) # 1 out of 3 subcomplexes detected; additional 9 over-predicted ex2 = overlap_community_subcomplex(tmp2, 771) # 0 ex3 = overlap_community_subcomplex(tmp2, 876) # 1 tmp = table_modularity_all[which(table_modularity_all[,4] != "NA"),] eb_group = tmp[which(as.numeric(tmp[,3]) > 4 & as.numeric(tmp[,4]) > 0.4),] # 103,291,876,959,1129,1387 tmp = table_modularity_all[which(table_modularity_all[,6] != "NA"),] walk_group = tmp[which(as.numeric(tmp[,5]) > 4 & as.numeric(tmp[,6]) > 0.4),] # 291,560,650,771,789,1116,1129,1403 tmp = table_modularity_all[which(table_modularity_all[,8] != "NA"),] label_group = tmp[which(as.numeric(tmp[,7]) > 4 & as.numeric(tmp[,8]) > 0.4),] # 291,560,650,771,1116,1129,1249,1403 ex4 = overlap_community_subcomplex(tmp2, 103) # 1 ex5 = overlap_community_subcomplex(tmp2, 560) # 1 ex6 = overlap_community_subcomplex(tmp2, 650) # 0 ex7 = overlap_community_subcomplex(tmp2, 789) # 1 ex8 = overlap_community_subcomplex(tmp2, 959) # 0 ex9 = overlap_community_subcomplex(tmp2, 1116) # 0 ex10 = overlap_community_subcomplex(tmp2, 1129) # 0 ex11 = overlap_community_subcomplex(tmp2, 1249) # 1 ex12 = overlap_community_subcomplex(tmp2, 1387) # 1 ex13 = overlap_community_subcomplex(tmp2, 1403) # 1 ###################### # 4. GO analysis: ###################### # Use GO comparisons such as co-localization: library(org.Hs.eg.db) library(GO.db) library(annotate) load("eg2uid.RData") x <- org.Hs.egGO mapped_genes <- mappedkeys(x) # Get the entrez gene identifiers that are mapped to a GO ID xx <- as.list(x[mapped_genes]) # Convert to a list all_shared_prots = unique(strsplit(paste(table_nested_complexList_final[,4], collapse=",", sep=""), ",", fixed=T)[[1]]) all_non_shared_prots = setdiff(V(PIN_full)$name, all_shared_prots) length_shared = length(all_shared_prots) # 2239 length_non_shared = length(all_non_shared_prots) # 17243 GO_enrichment_uids = function(uids, toplength=20) { geneids = extended_table[which(extended_table[,2] %in% uids), 1] if (length(geneids)>0) { GO_MF = NULL; GO_BP = NULL; GO_CC = NULL for (i in geneids) { GO_MF = c(GO_MF, getOntology(xx[[i]], "MF")) GO_BP = c(GO_BP, getOntology(xx[[i]], "BP")) GO_CC = c(GO_CC, getOntology(xx[[i]], "CC")) } if (length(GO_MF)>0) { All_GO_MF_Terms = as.matrix(Term(unique(GO_MF))) Top_GO_MF_Terms = sort(table(GO_MF), decreasing=T) #[1:toplength] names_MF = Term(names(Top_GO_MF_Terms)) rownames(Top_GO_MF_Terms)=NULL GO_enrichment_MF = cbind(names_MF, Top_GO_MF_Terms) } else { GO_enrichment_MF = "No ontology found for geneID" } if (length(GO_BP)>0) { All_GO_BP_Terms = as.matrix(Term(unique(GO_BP))) Top_GO_BP_Terms = sort(table(GO_BP), decreasing=T) #[1:toplength] names_BP = Term(names(Top_GO_BP_Terms)) rownames(Top_GO_BP_Terms)=NULL GO_enrichment_BP = cbind(names_BP, Top_GO_BP_Terms) } else { GO_enrichment_BP = "No ontology found for geneID" } if (length(GO_CC)>0) { All_GO_CC_Terms = as.matrix(Term(unique(GO_CC))) Top_GO_CC_Terms = sort(table(GO_CC), decreasing=T) #[1:toplength] names_CC = Term(names(Top_GO_CC_Terms)) rownames(Top_GO_CC_Terms)=NULL GO_enrichment_CC = cbind(names_CC, Top_GO_CC_Terms) } else { GO_enrichment_CC = "No ontology found for geneID" } } else { GO_enrichment_MF = "GeneID not found" GO_enrichment_BP = "GeneID not found" GO_enrichment_CC = "GeneID not found" } result = list(GO_enrichment_BP, GO_enrichment_MF, GO_enrichment_CC) } tmp = GO_enrichment_uids(all_shared_prots) BP_enrichment_shared = cbind(tmp[[1]][,1], tmp[[1]][,2]) MF_enrichment_shared = cbind(tmp[[2]][,1], tmp[[2]][,2]) CC_enrichment_shared = cbind(tmp[[3]][,1], tmp[[3]][,2]) BP_enrichment_shared[1:20,] # Indeed, enriched in regulatory terms. length(grep("regulat", BP_enrichment_shared[1:20,]))/20 # 25% of top 20 is related to regulation BP_sum = 0; MF_sum = 0; CC_sum = 0 for (i in 1:100) { tmp = GO_enrichment_uids(sample(all_non_shared_prots, length_shared, replace=F)) BP_enrichment_non_shared = cbind(tmp[[1]][,1], tmp[[1]][,2]) MF_enrichment_non_shared = cbind(tmp[[2]][,1], tmp[[2]][,2]) CC_enrichment_non_shared = cbind(tmp[[3]][,1], tmp[[3]][,2]) BP_sum = BP_sum + dim(BP_enrichment_non_shared)[1] MF_sum = MF_sum + dim(MF_enrichment_non_shared)[1] CC_sum = CC_sum + dim(CC_enrichment_non_shared)[1] } BP_enrichment_non_shared = BP_sum / 100 MF_enrichment_non_shared = MF_sum / 100 CC_enrichment_non_shared = CC_sum / 100 GO_enrichment_analysis = rbind(c(dim(BP_enrichment_shared)[1], BP_enrichment_non_shared), c(dim(MF_enrichment_shared)[1], MF_enrichment_non_shared), c(dim(CC_enrichment_shared)[1], CC_enrichment_non_shared)) colnames(GO_enrichment_analysis) = c("Shared prots", "Non-shared prots") rownames(GO_enrichment_analysis) = c("BP", "MF", "CC") GO_enrichment_analysis # Conclusion: Slightly more terms in shared BP and CC.